109 lines
4.0 KiB
Scheme
109 lines
4.0 KiB
Scheme
(import (scheme base))
|
|
; Regular expression AST representation
|
|
|
|
; Regex representing the empty language
|
|
(define-record-type <empty-regex> (empty-regex) empty-regex?)
|
|
|
|
; A string literal to be matched literally, the empty string represents the null regex
|
|
(define-record-type <lit-regex> (lit-regex c) lit-regex?
|
|
(c lit-value))
|
|
|
|
(define-record-type <cons-regex> (cons-regex lhs rhs) cons-regex?
|
|
(lhs cons-lhs)
|
|
(rhs cons-rhs))
|
|
|
|
(define-record-type <alt-regex> (alt-regex lhs rhs) alt-regex?
|
|
(lhs alt-lhs)
|
|
(rhs alt-rhs))
|
|
|
|
(define-record-type <star-regex> (star-regex body) star-regex?
|
|
(body star-body))
|
|
|
|
; Combinators for defining regular expressions
|
|
(define (regex-box-string x)
|
|
(if (string? x) (lit-regex x) x))
|
|
|
|
(define rgx-null (lit-regex ""))
|
|
(define (rgx-cons . rs)
|
|
(if (null? rs)
|
|
rgx-null
|
|
(cons-regex (regex-box-string (car rs)) (apply rgx-cons (cdr rs)))))
|
|
|
|
(define (rgx-alt . rs)
|
|
(if (null? rs)
|
|
(empty-regex)
|
|
(alt-regex (regex-box-string (car rs)) (apply rgx-alt (cdr rs)))))
|
|
|
|
(define (rgx-star r) (star-regex (regex-box-string r)))
|
|
|
|
(define (string-null? s) (= 0 (string-length s)))
|
|
|
|
; Check if a regex matches the null string
|
|
(define (nullable? r)
|
|
(cond ((empty-regex? r) #f)
|
|
((lit-regex? r) (string-null? (lit-value r)))
|
|
((cons-regex? r) (and (nullable? (cons-lhs r)) (nullable? (cons-rhs r))))
|
|
((alt-regex? r) (or (nullable? (alt-lhs r)) (nullable? (alt-rhs r))))
|
|
((star-regex? r) #t)))
|
|
|
|
(define (unsatisfiable? r)
|
|
(cond ((empty-regex? r) #t)
|
|
((lit-regex? r) #f)
|
|
((cons-regex? r) (or (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r))))
|
|
((alt-regex? r) (and (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r))))
|
|
((star-regex? r) #f)))
|
|
|
|
; Now given a regex AST we can define its derivative with respect to a char
|
|
; Definition:
|
|
; D_c({}) = {}
|
|
; D_c(Lit(s)) = ok
|
|
; D_c(r + b) = D_c(r) + D_c(b)
|
|
; D_c(r s) = D_c(r) s + d(r) D_c(s)
|
|
; D_c(r*) = D_c(r) r*
|
|
; TODO: Lazily create the resulting regex
|
|
(define (regex-derive c r)
|
|
(cond ((empty-regex? r) (empty-regex))
|
|
((lit-regex? r) (cond ((string-null? (lit-value r)) (empty-regex))
|
|
((eq? c (string-ref (lit-value r) 0)) (lit-regex (string-copy (lit-value r) 1)))
|
|
(else (empty-regex))))
|
|
((alt-regex? r) (alt-regex (regex-derive c (alt-lhs r)) (regex-derive c (alt-rhs r))))
|
|
((star-regex? r) (cons-regex (regex-derive c (star-body r)) r))
|
|
((cons-regex? r) (if (nullable? (cons-lhs r))
|
|
(alt-regex (cons-regex (regex-derive c (cons-lhs r))
|
|
(cons-rhs r))
|
|
(regex-derive c (cons-rhs r)))
|
|
(cons-regex (regex-derive c (cons-lhs r))
|
|
(cons-rhs r))))))
|
|
|
|
|
|
; Let us leverage the regex derivative algorithm to define a regex matcher
|
|
(define (match-regex str regex)
|
|
(define (match-regex-list str regex)
|
|
(if (null? str)
|
|
(nullable? regex)
|
|
(match-regex-list (cdr str) (regex-derive (car str) regex))))
|
|
|
|
(match-regex-list (string->list str) regex))
|
|
|
|
; As a tiny variation, let us define a grep-like function
|
|
; The first step is finding where there are matches for the regex in the string
|
|
|
|
; Given a string and a regex we want to find all the matches
|
|
; A match is a range in the string
|
|
(define-record-type <match> (make-match start end) match? (start match-start) (end match-end))
|
|
|
|
; regex is the regex to match
|
|
; str the string upon which to match
|
|
; start is where to start reading from, end where to stop!
|
|
; matches is the list of matches to which we must cons our current result
|
|
; Try to get the longest matching value
|
|
(define (find-match regex str start pos end matches)
|
|
(if (nullable? regex)
|
|
(cons (make-match start pos) matches)
|
|
(find-match (regex-derive (string-ref str pos) start (+ pos 1) end))
|
|
()))
|
|
|
|
; Let us represent (a OR b)*cde
|
|
(define my-regex (rgx-cons (rgx-star (rgx-alt "a" "b")) "cde"))
|
|
(match-regex "abababababababcde" my-regex) ; ==> #t
|