(import (scheme base)) ; Regular expression AST representation ; Regex representing the empty language (define-record-type (empty-regex) empty-regex?) ; A string literal to be matched literally, the empty string represents the null regex (define-record-type (lit-regex c) lit-regex? (c lit-value)) (define-record-type (cons-regex lhs rhs) cons-regex? (lhs cons-lhs) (rhs cons-rhs)) (define-record-type (alt-regex lhs rhs) alt-regex? (lhs alt-lhs) (rhs alt-rhs)) (define-record-type (star-regex body) star-regex? (body star-body)) ; Combinators for defining regular expressions (define (regex-box-string x) (if (string? x) (lit-regex x) x)) (define rgx-null (lit-regex "")) (define (rgx-cons . rs) (if (null? rs) rgx-null (cons-regex (regex-box-string (car rs)) (apply rgx-cons (cdr rs))))) (define (rgx-alt . rs) (if (null? rs) (empty-regex) (alt-regex (regex-box-string (car rs)) (apply rgx-alt (cdr rs))))) (define (rgx-star r) (star-regex (regex-box-string r))) (define (string-null? s) (= 0 (string-length s))) ; Check if a regex matches the null string (define (nullable? r) (cond ((empty-regex? r) #f) ((lit-regex? r) (string-null? (lit-value r))) ((cons-regex? r) (and (nullable? (cons-lhs r)) (nullable? (cons-rhs r)))) ((alt-regex? r) (or (nullable? (alt-lhs r)) (nullable? (alt-rhs r)))) ((star-regex? r) #t))) (define (unsatisfiable? r) (cond ((empty-regex? r) #t) ((lit-regex? r) #f) ((cons-regex? r) (or (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r)))) ((alt-regex? r) (and (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r)))) ((star-regex? r) #f))) ; Now given a regex AST we can define its derivative with respect to a char ; Definition: ; D_c({}) = {} ; D_c(Lit(s)) = ok ; D_c(r + b) = D_c(r) + D_c(b) ; D_c(r s) = D_c(r) s + d(r) D_c(s) ; D_c(r*) = D_c(r) r* ; TODO: Lazily create the resulting regex (define (regex-derive c r) (cond ((empty-regex? r) (empty-regex)) ((lit-regex? r) (cond ((string-null? (lit-value r)) (empty-regex)) ((eq? c (string-ref (lit-value r) 0)) (lit-regex (string-copy (lit-value r) 1))) (else (empty-regex)))) ((alt-regex? r) (alt-regex (regex-derive c (alt-lhs r)) (regex-derive c (alt-rhs r)))) ((star-regex? r) (cons-regex (regex-derive c (star-body r)) r)) ((cons-regex? r) (if (nullable? (cons-lhs r)) (alt-regex (cons-regex (regex-derive c (cons-lhs r)) (cons-rhs r)) (regex-derive c (cons-rhs r))) (cons-regex (regex-derive c (cons-lhs r)) (cons-rhs r)))))) ; Let us leverage the regex derivative algorithm to define a regex matcher (define (match-regex str regex) (define (match-regex-list str regex) (if (null? str) (nullable? regex) (match-regex-list (cdr str) (regex-derive (car str) regex)))) (match-regex-list (string->list str) regex)) ; As a tiny variation, let us define a grep-like function ; The first step is finding where there are matches for the regex in the string ; Given a string and a regex we want to find all the matches ; A match is a range in the string (define-record-type (make-match start end) match? (start match-start) (end match-end)) ; regex is the regex to match ; str the string upon which to match ; start is where to start reading from, end where to stop! ; matches is the list of matches to which we must cons our current result ; Try to get the longest matching value (define (find-match regex str start pos end matches) (if (nullable? regex) (cons (make-match start pos) matches) (find-match (regex-derive (string-ref str pos) start (+ pos 1) end)) ())) ; Let us represent (a OR b)*cde (define my-regex (rgx-cons (rgx-star (rgx-alt "a" "b")) "cde")) (match-regex "abababababababcde" my-regex) ; ==> #t