From 4dffe1d7b4d3950da32b25df1846c292427a9a77 Mon Sep 17 00:00:00 2001 From: Francesco Magliocca Date: Wed, 25 May 2022 13:41:41 +0200 Subject: [PATCH] Implement parser --- .gitignore | 3 + Makefile | 23 +++ examples/example.scm | 1 + examples/hard_example.scm | 108 +++++++++++ include/parser.h | 10 + include/sexp.h | 47 +++++ include/string_table.h | 22 +++ include/syntax_checker.h | 12 ++ src/main.c | 23 +++ src/parser.c | 398 ++++++++++++++++++++++++++++++++++++++ src/sexp.c | 96 +++++++++ src/string_table.c | 97 ++++++++++ src/syntax_checker.c | 251 ++++++++++++++++++++++++ 13 files changed, 1091 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 examples/example.scm create mode 100644 examples/hard_example.scm create mode 100644 include/parser.h create mode 100644 include/sexp.h create mode 100644 include/string_table.h create mode 100644 include/syntax_checker.h create mode 100644 src/main.c create mode 100644 src/parser.c create mode 100644 src/sexp.c create mode 100644 src/string_table.c create mode 100644 src/syntax_checker.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b4d2acc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts +build/ +leao diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..89b2f23 --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +CFLAGS = -std=c11 -g -Wpedantic -Iinclude/ + +# Directory containing object files and build artifacts +obj_dir = build + +objects = main.o sexp.o parser.o syntax_checker.o +object_paths = $(addprefix $(obj_dir)/, $(objects)) + +leao: $(object_paths) + $(CC) -o $@ $(CLFAGS) $(object_paths) + +$(obj_dir)/%.o: src/%.c | $(obj_dir) + $(CC) $(CFLAGS) -o $@ -c $< + +$(obj_dir): + mkdir -p $(obj_dir) + +clean: + rm -f leao + rm -rf $(obj_dir) + +.PHONY: clean + diff --git a/examples/example.scm b/examples/example.scm new file mode 100644 index 0000000..4617fae --- /dev/null +++ b/examples/example.scm @@ -0,0 +1 @@ +(define name 123) diff --git a/examples/hard_example.scm b/examples/hard_example.scm new file mode 100644 index 0000000..a30c54d --- /dev/null +++ b/examples/hard_example.scm @@ -0,0 +1,108 @@ +(import (scheme base)) +; Regular expression AST representation + +; Regex representing the empty language +(define-record-type (empty-regex) empty-regex?) + +; A string literal to be matched literally, the empty string represents the null regex +(define-record-type (lit-regex c) lit-regex? + (c lit-value)) + +(define-record-type (cons-regex lhs rhs) cons-regex? + (lhs cons-lhs) + (rhs cons-rhs)) + +(define-record-type (alt-regex lhs rhs) alt-regex? + (lhs alt-lhs) + (rhs alt-rhs)) + +(define-record-type (star-regex body) star-regex? + (body star-body)) + +; Combinators for defining regular expressions +(define (regex-box-string x) + (if (string? x) (lit-regex x) x)) + +(define rgx-null (lit-regex "")) +(define (rgx-cons . rs) + (if (null? rs) + rgx-null + (cons-regex (regex-box-string (car rs)) (apply rgx-cons (cdr rs))))) + +(define (rgx-alt . rs) + (if (null? rs) + (empty-regex) + (alt-regex (regex-box-string (car rs)) (apply rgx-alt (cdr rs))))) + +(define (rgx-star r) (star-regex (regex-box-string r))) + +(define (string-null? s) (= 0 (string-length s))) + +; Check if a regex matches the null string +(define (nullable? r) + (cond ((empty-regex? r) #f) + ((lit-regex? r) (string-null? (lit-value r))) + ((cons-regex? r) (and (nullable? (cons-lhs r)) (nullable? (cons-rhs r)))) + ((alt-regex? r) (or (nullable? (alt-lhs r)) (nullable? (alt-rhs r)))) + ((star-regex? r) #t))) + +(define (unsatisfiable? r) + (cond ((empty-regex? r) #t) + ((lit-regex? r) #f) + ((cons-regex? r) (or (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r)))) + ((alt-regex? r) (and (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r)))) + ((star-regex? r) #f))) + +; Now given a regex AST we can define its derivative with respect to a char +; Definition: +; D_c({}) = {} +; D_c(Lit(s)) = ok +; D_c(r + b) = D_c(r) + D_c(b) +; D_c(r s) = D_c(r) s + d(r) D_c(s) +; D_c(r*) = D_c(r) r* +; TODO: Lazily create the resulting regex +(define (regex-derive c r) + (cond ((empty-regex? r) (empty-regex)) + ((lit-regex? r) (cond ((string-null? (lit-value r)) (empty-regex)) + ((eq? c (string-ref (lit-value r) 0)) (lit-regex (string-copy (lit-value r) 1))) + (else (empty-regex)))) + ((alt-regex? r) (alt-regex (regex-derive c (alt-lhs r)) (regex-derive c (alt-rhs r)))) + ((star-regex? r) (cons-regex (regex-derive c (star-body r)) r)) + ((cons-regex? r) (if (nullable? (cons-lhs r)) + (alt-regex (cons-regex (regex-derive c (cons-lhs r)) + (cons-rhs r)) + (regex-derive c (cons-rhs r))) + (cons-regex (regex-derive c (cons-lhs r)) + (cons-rhs r)))))) + + +; Let us leverage the regex derivative algorithm to define a regex matcher +(define (match-regex str regex) + (define (match-regex-list str regex) + (if (null? str) + (nullable? regex) + (match-regex-list (cdr str) (regex-derive (car str) regex)))) + + (match-regex-list (string->list str) regex)) + +; As a tiny variation, let us define a grep-like function +; The first step is finding where there are matches for the regex in the string + +; Given a string and a regex we want to find all the matches +; A match is a range in the string +(define-record-type (make-match start end) match? (start match-start) (end match-end)) + +; regex is the regex to match +; str the string upon which to match +; start is where to start reading from, end where to stop! +; matches is the list of matches to which we must cons our current result +; Try to get the longest matching value +(define (find-match regex str start pos end matches) + (if (nullable? regex) + (cons (make-match start pos) matches) + (find-match (regex-derive (string-ref str pos) start (+ pos 1) end)) + ())) + +; Let us represent (a OR b)*cde +(define my-regex (rgx-cons (rgx-star (rgx-alt "a" "b")) "cde")) +(match-regex "abababababababcde" my-regex) ; ==> #t diff --git a/include/parser.h b/include/parser.h new file mode 100644 index 0000000..3197874 --- /dev/null +++ b/include/parser.h @@ -0,0 +1,10 @@ +#include +#include + +struct sexp_list; + +/* Parse a single s-expression from the `in` stream. + * If the parse is successful, The output is put in out. + * Otherwise the program is aborted. + */ +void parse_program(FILE *in, struct sexp_list **out); diff --git a/include/sexp.h b/include/sexp.h new file mode 100644 index 0000000..708c38e --- /dev/null +++ b/include/sexp.h @@ -0,0 +1,47 @@ +#include + +/* Datastructures representing s-exprs */ + +struct sexp_list; + +enum sexp_form { + sexp_bool, + sexp_int, + sexp_string, + sexp_symbol, + sexp_list, +}; + +struct sexp { + enum sexp_form form; + union { + bool bool_lit; + /* Integer literals are not directly converted to numbers */ + char *lexeme; + struct sexp_list *list; + }; +}; + +struct sexp_list { + struct sexp *elem; + struct sexp_list *next; +}; + +/* Helper to create a list by appending at its end in O(1) time */ +struct sexp_list_builder { + struct sexp_list *head; + struct sexp_list *last; +}; + +/* Returns false if allocation fails */ +bool sexp_list_append(struct sexp_list_builder *b, struct sexp *elem); + +/* If allocation fails, these functions return NULL */ +struct sexp *sexp_make_bool(bool v); +struct sexp *sexp_make_int(char *lexeme); +struct sexp *sexp_make_string(char *lexeme); +struct sexp *sexp_make_symbol(char *lexeme); +struct sexp *sexp_make_list(struct sexp_list *list); + +void free_sexp_list(struct sexp_list *head); +void free_sexp(struct sexp *s); diff --git a/include/string_table.h b/include/string_table.h new file mode 100644 index 0000000..6f893e0 --- /dev/null +++ b/include/string_table.h @@ -0,0 +1,22 @@ +#include + +/* Hash table with linear probing implementation. + * The keys are always strings, the values can be anything. + * The table does not own any data. + */ + +struct string_table; + +struct string_table *new_table(void); +void free_table(struct string_table *t); + +/* Inserts a new entry in the table. If the key is already present, the new value + * substitutes the old one + */ +void string_table_insert(struct string_table *t, const char *key, void *value); + +/* Remove, if it's present, the entry for the given key */ +void string_table_remove(struct string_table *t, const char *key); + +/* Lookup a key, returns its value if found, otherwise NULL */ +void *string_table_lookup(struct string_table *t, const char *key); diff --git a/include/syntax_checker.h b/include/syntax_checker.h new file mode 100644 index 0000000..58cc3cc --- /dev/null +++ b/include/syntax_checker.h @@ -0,0 +1,12 @@ +struct sexp; +struct sexp_list; + +/* The Syntax checker is in charge + * of checking that all syntactic forms are correctly adoperated. + * TODO: In the future this will be part of the macro expander. + * + * If there is any syntax error, an error message is printed to stderr the program is aborted. + */ +void check_syntax(struct sexp *s); + +void check_program(struct sexp_list *prog); diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..9c56ce0 --- /dev/null +++ b/src/main.c @@ -0,0 +1,23 @@ +#include +#include "parser.h" +#include "sexp.h" +#include "syntax_checker.h" + +int main(int argc, char *argv[]) { + if (argc != 2) { + printf("Usage %s file.scm\n", argv[0]); + return 1; + } + FILE *in = fopen(argv[1], "r"); + if (!in) { + printf("Could not open file %s.\n", argv[1]); + return 1; + } + struct sexp_list *prog; + parse_program(in, &prog); + check_program(prog); + free_sexp_list(prog); + fclose(in); + printf("AC Milan campione d'italia 2021/2022.\n"); + return 0; +} diff --git a/src/parser.c b/src/parser.c new file mode 100644 index 0000000..e5d0ff0 --- /dev/null +++ b/src/parser.c @@ -0,0 +1,398 @@ +#include "parser.h" +#include "sexp.h" +#include +#include +#include +#include +#include +#include +#include + +/* + * S-expressions grammar + * + * comment := ; any character except until newline + * space := [\n\t ] + * comments and spaces and newlines and tabs are ignored + * + * sexpr := list | atom | quoted_expr + * + * quoted_expr := ' sexpr + * + * list := LPAREN sexpr * RPAREN + * + * LPAREN := ( | [ | { + * RPAREN := ) | ] | } + * + * atom := int_lit | bool_lit | string_lit | ident + * + * int_lit := [0-9]+ + * bool_lit := #t | #f + * + * string_lit := " (string_elem | escaped_elem)* " + * string_elem := ASCII printable except \ and " + * escaped_elem := \ [\\nt"] + * + * ident := ident_start ident_cont * + * special_start := [!$%&*+-/.:<=>?@^_~] + * ident_start := [a-zA-Z] | special_start + * ident_cont := ident_start | [0-9] + * + */ + +static inline bool is_ident_start(const char c) { + switch (c) { + case '!': case '$': case '%': + case '&': case '*': case '+': + case '-': case '/': case '.': + case ':': case '<': case '=': + case '>': case '?': case '@': + case '^': case '_': case '~': + return true; + + default: + return isalpha(c); + } +} + +static inline bool is_ident_cont(const char c) { + return is_ident_start(c) || isdigit(c); +} + +/* In case of memory shortage or IO failure, the program is brutally aborted. */ +/* In case of syntax errors, the program is brutally aborted. + * TODO: Try to make a more compromising parser + */ + +/* Print a message to stderr and abort the program */ +static noreturn void abort_program(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + exit(EXIT_FAILURE); +} + +static noreturn void parse_error_abort(void) { + abort_program("Parse error.\n"); +} + +static noreturn void out_of_memory_abort(void) { + abort_program("Out of memory.\n"); +} + +struct parser { + /* Input stream */ + FILE *in; + /* Current lookahead symbol */ + char curr_char; + + /* Internal buffer */ + char *buf; + size_t buf_len; + size_t buf_cap; + + /* Location info */ + size_t line; + size_t col; +}; + +/* Helper function to report error messages with the attached location */ +static void report_error(struct parser *p, const char *fmt, ...) { + va_list args; + fprintf(stderr, "Error at %d:%d ", p->line, p->col); + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); +} + +/* read next input char and update location info */ +static inline void consume(struct parser *p) { + /* First update location info */ + if (p->curr_char == '\n') { + ++p->line; + p->col = 1; + } + else { + ++p->col; + } + + p->curr_char = fgetc(p->in); + if (ferror(p->in)) { + abort_program("Error while reading from input stream.\n"); + } +} + +/* Store a character in the internal buffer */ +static inline void store_char(struct parser *p, const char c) { + /* If the buffer has not already been initialized, do so now */ + if (p->buf_cap == 0) { + p->buf_cap = 256; + p->buf = malloc(p->buf_cap); + if (!p->buf) { + out_of_memory_abort(); + } + } + /* Otherwise check if it's large enough to contain another character */ + else if (p->buf_cap - 1 < p->buf_len) { + /* We double the internal buffer capacity each time, + * but we must pay attention to integer overflow, + * so let us first check if we can double the size of the buffer + * with no problem + */ + if (SIZE_MAX / 2 <= p->buf_cap) { + report_error(p, "Lexeme is too long.\n"); + abort_program("Buffer overflow, can't allocate more space, lexeme is too large.\n"); + } + /* If the initial buffer capacity is set to zero, + * we set it to 256, as initial capacity + */ + p->buf_cap = p->buf_cap * 2; + + p->buf = realloc(p->buf, p->buf_cap); + if (!p->buf) { + out_of_memory_abort(); + } + } + /* Now we can safely store the given character in the internal buffer */ + p->buf[p->buf_len] = c; + ++p->buf_len; +} + +/* Create a null terminated copy of the internal buffer, + * and reset the internal buffer, so that in can be used for the next lexeme */ +static inline char *lexeme_recognized(struct parser *p) { + char *res = malloc(p->buf_len + 1); + if (!res) { + out_of_memory_abort(); + } + memcpy(res, p->buf, p->buf_len); + res[p->buf_len] = 0; + + /* Reset internal buffer */ + p->buf_len = 0; + return res; +} + +/* Reset internal buffer */ +static inline void reset_buffer(struct parser *p) { + p->buf_len = 0; +} + +static void skip_ws_comments(struct parser *p) { + bool keep_ignoring = true; + while (keep_ignoring) { + switch (p->curr_char) { + case ' ': case '\n': case '\t': + consume(p); + break; + case ';': + /* Ignore everything until a newline */ + while (p->curr_char != '\n') { + consume(p); + } + break; + default: + keep_ignoring = false; + } + } +} + +static void parse_int_lit(struct parser *p, struct sexp **out) { + store_char(p, p->curr_char); + while (isdigit(p->curr_char)) { + store_char(p, p->curr_char); + consume(p); + } + *out = sexp_make_int(lexeme_recognized(p)); + if (!*out) out_of_memory_abort(); +} + +static inline void parse_bool_lit(struct parser *p, struct sexp **out) { + consume(p); + switch (p->curr_char) { + case 't': + *out = sexp_make_bool(true); + break; + case 'f': + *out = sexp_make_bool(false); + break; + default: + report_error(p, "Expected bool literal, invalid lexeme.\n"); + parse_error_abort(); + } + + if (!*out) out_of_memory_abort(); +} + +static void parse_string_lit(struct parser *p, struct sexp **out) { + consume(p); + while (p->curr_char != EOF && p->curr_char != '"') { + if (p->curr_char == '\\') { + consume(p); + /* Read escape sequence, and store the meaning of the escape + * sequence in the internal buffer + */ + switch (p->curr_char) { + case '"': + store_char(p, '"'); + break; + case 'n': + store_char(p, '\n'); + break; + case 't': + store_char(p, '\t'); + break; + default: + report_error(p, "Invalid escape sequence.\n"); + parse_error_abort(); + } + consume(p); + } + else if (isprint(p->curr_char)) { + store_char(p, p->curr_char); + consume(p); + } + else { + report_error(p, "Invalid character in string literal.\n"); + parse_error_abort(); + } + } + if (p->curr_char != '"') { + report_error(p, "Unexpected end of file. expected closing quote in string literal.\n"); + parse_error_abort(); + } + consume(p); + *out = sexp_make_string(lexeme_recognized(p)); + if (!*out) out_of_memory_abort(); +} + +static void parse_ident(struct parser *p, struct sexp **out) { + if (!is_ident_start(p->curr_char)) { + report_error(p, "Invalid character found.\n"); + parse_error_abort(); + } + store_char(p, p->curr_char); + consume(p); + while (is_ident_cont(p->curr_char)) { + store_char(p, p->curr_char); + consume(p); + } + *out = sexp_make_symbol(lexeme_recognized(p)); + if (!*out) out_of_memory_abort(); +} + + +static void parse_sexp(struct parser *p, struct sexp **out); + +static void parse_quoted_sexp(struct parser *p, struct sexp **out) { + consume(p); + struct sexp *quoted; + parse_sexp(p, "ed); + struct sexp_list_builder quote = { NULL }; + + /* Quoted s-expressions are a shortcut for (quote sexp), so we + * just expand the meaning + */ + const char quote_kw[] = "quote"; + + char *quote_lexeme = malloc(sizeof(quote_kw)); + if (!quote_lexeme) out_of_memory_abort(); + memcpy(quote_lexeme, quote_kw, sizeof(quote_kw)); + + struct sexp *quote_symbol = sexp_make_symbol(quote_lexeme); + if (!quote_symbol) out_of_memory_abort(); + + if (!sexp_list_append("e, quote_symbol)) out_of_memory_abort(); + if (!sexp_list_append("e, quoted)) out_of_memory_abort(); + + *out = sexp_make_list(quote.head); + if (!*out) out_of_memory_abort(); +} + +static void parse_list(struct parser *p, struct sexp **out, char delim) { + consume(p); + skip_ws_comments(p); + struct sexp_list_builder builder = { NULL }; + struct sexp *curr; + while (p->curr_char != EOF && p->curr_char != delim) { + parse_sexp(p, &curr); + if (!sexp_list_append(&builder, curr)) out_of_memory_abort(); + skip_ws_comments(p); + } + if (p->curr_char != delim) { + report_error(p, "Unbalanced parens. Expected closing `%c` in list expression.\n", delim); + parse_error_abort(); + } + consume(p); + *out = sexp_make_list(builder.head); + if (!*out) out_of_memory_abort(); +} + +static void parse_sexp(struct parser *p, struct sexp **out) { + switch (p->curr_char) { + case EOF: + report_error(p, "Expected s-expression, found end of file.\n"); + parse_error_abort(); + break; + case '\'': + parse_quoted_sexp(p, out); + break; + case '"': + parse_string_lit(p, out); + break; + case '(': + parse_list(p, out, ')'); + break; + case '[': + parse_list(p, out, ']'); + break; + case '{': + parse_list(p, out, ']'); + break; + case '#': + parse_bool_lit(p, out); + break; + case '0': case '1': case '2': + case '3': case '4': case '5': + case '6': case '7': case '8': + case '9': + parse_int_lit(p, out); + break; + default: + parse_ident(p, out); + break; + } +} + +void parse_program(FILE *in, struct sexp_list **out) { + char curr_char = fgetc(in); + if (ferror(in)) { + abort_program("Error while reading from input stream.\n"); + } + + struct parser p = { + .in = in, + .curr_char = curr_char, + .buf = NULL, + .buf_len = 0, + .buf_cap = 0, + .line = 1, + .col = 1, + }; + + skip_ws_comments(&p); + struct sexp_list_builder builder = { NULL }; + struct sexp *s; + while (p.curr_char != EOF) { + parse_sexp(&p, &s); + if (!sexp_list_append(&builder, s)) + out_of_memory_abort(); + + skip_ws_comments(&p); + } + + *out = builder.head; + /* Free internal buffer, before quitting */ + free(p.buf); +} diff --git a/src/sexp.c b/src/sexp.c new file mode 100644 index 0000000..d5abade --- /dev/null +++ b/src/sexp.c @@ -0,0 +1,96 @@ +#include "sexp.h" +#include + +bool sexp_list_append(struct sexp_list_builder *b, struct sexp *elem) { + struct sexp_list *node = malloc(sizeof(struct sexp_list)); + if (!node) return false; + node->elem = elem; + node->next = NULL; + + if (b->last == NULL) { + b->head = node; + b->last = node; + } + else { + b->last->next = node; + b->last = b->last->next; + } + return true; +} + +struct sexp *sexp_make_bool(bool v) { + struct sexp *res = malloc(sizeof(struct sexp)); + if (!res) return NULL; + + res->form = sexp_bool; + res->bool_lit = v; + return res; +} + +struct sexp *sexp_make_int(char *lexeme) { + struct sexp *res = malloc(sizeof(struct sexp)); + if (!res) return NULL; + + res->form = sexp_int; + res->lexeme = lexeme; + return res; +} + +struct sexp *sexp_make_string(char *lexeme) { + struct sexp *res = malloc(sizeof(struct sexp)); + if (!res) return NULL; + + res->form = sexp_string; + res->lexeme = lexeme; + return res; +} + +struct sexp *sexp_make_symbol(char *lexeme) { + struct sexp *res = malloc(sizeof(struct sexp)); + if (!res) return NULL; + + res->form = sexp_symbol; + res->lexeme = lexeme; + return res; +} + +struct sexp *sexp_make_list(struct sexp_list *list) { + struct sexp *res = malloc(sizeof(struct sexp)); + if (!res) return NULL; + + res->form = sexp_list; + res->list = list; + return res; +} + +void free_sexp_list(struct sexp_list *head) { + struct sexp_list *tmp; + while (head) { + tmp = head; + head = head->next; + free_sexp(tmp->elem); + free(tmp); + } +} + +void free_sexp(struct sexp *s) { + if (!s) return; + + switch (s->form) { + case sexp_bool: + break; + case sexp_int: + free(s->lexeme); + break; + case sexp_string: + free(s->lexeme); + break; + case sexp_symbol: + free(s->lexeme); + break; + case sexp_list: + free_sexp_list(s->list); + break; + } + free(s); +} diff --git a/src/string_table.c b/src/string_table.c new file mode 100644 index 0000000..4f849e3 --- /dev/null +++ b/src/string_table.c @@ -0,0 +1,97 @@ +#include "string_table.h" +#include +#include + +/* For now we use the simple djb2 hashing algorithm, + * but TODO: implement SipHash-2-4 + */ +static inline size_t hash_string(const char *str) { + size_t hash = 5381; + for (; *str != 0; ++str) { + hash = hash * 33 + *str; + } + return hash; +} + +struct bucket { + /* This flag distinguishes between empty + * and used buckets + */ + bool used; + const char *key; + void *value; +}; + +struct string_table { + struct bucket *entries; + size_t capacity; + /* Elements count in the table */ + size_t size; +}; + +static void rehash(struct string_table *t) { + /* We double the size, but what's a better strategy? */ + size_t new_capacity = t->capacity * 2; + struct bucket *new_entries = calloc(new_capacity, sizeof(struct bucket)); + size_t i; + size_t new_hash; + size_t new_pos; + for (i = 0; i < t->capacity; ++i) { + if (t->entries[i].used) { + new_hash = hash_string(t->entries[i].key); + new_pos = new_hash % new_capacity; + new_entries[new_pos] = t->entries[i]; + } + } + t->capacity = new_capacity; + free(t->entries); + t->entries = new_entries; +} + +void string_table_insert(struct string_table *t, const char *key, void *value) { + /* First check if we need to rehash, we do this + * when the table is full at 75%, this should prevent + * some collisions + */ + if (t->size * 4 > t->capacity * 3) { + rehash(t); + } + size_t pos = hash_string(key) % t->capacity; + while (t->entries[pos].used) { + if (!strcmp(key, t->entries[pos].key)) { + /* If the key is already been inserted, + * just update its corresponding value + */ + t->entries[pos].value = value; + break; + } + ++pos; + } + /* There must be an empty pos, so pos < t->capacity surely */ + t->entries[pos] = (struct bucket) { .key = key, .value = value, .used = true }; + ++t->size; +} + +void string_table_remove(struct string_table *t, const char *key) { + size_t pos = hash_string(key) % t->capacity; + while (t->entries[pos].used) { + if (!strcmp(key, t->entries[pos].key)) { + /* Remove this bucket */ + t->entries[pos].used = false; + break; + } + ++pos; + } +} + +void *string_table_lookup(struct string_table *t, const char *key) { + size_t pos = hash_string(key) % t->capacity; + while (pos < t->capacity && t->entries[pos].used) { + if (!strcmp(key, t->entries[pos].key)) { + return t->entries[pos].value; + } + + ++pos; + } + return NULL; +} diff --git a/src/syntax_checker.c b/src/syntax_checker.c new file mode 100644 index 0000000..20437f3 --- /dev/null +++ b/src/syntax_checker.c @@ -0,0 +1,251 @@ +#include "syntax_checker.h" +#include "sexp.h" +#include +#include +#include +#include +#include +#include + +/* Small DSL to perform pattern matching on a list */ +struct list_checker { + const char *syntax_form; + /* Cursor inside the list */ + struct sexp_list *curr; +}; + +/* TODO: Make error messages more informative */ +static noreturn void report_check_error(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + exit(EXIT_FAILURE); +} + +static const char *syn_form_descr(enum sexp_form f) { + switch (f) { + case sexp_bool: + return "bool literal"; + case sexp_int: + return "int literal"; + case sexp_string: + return "string literal"; + case sexp_symbol: + return "symbol"; + case sexp_list: + return "list"; + } +} + +static inline enum sexp_form curr_form(struct list_checker *c) { + return c->curr->elem->form; +} + +static inline struct sexp *curr_elem(struct list_checker *c) { + return c->curr->elem; +} + +static inline void consume(struct list_checker *c) { + c->curr = c->curr->next; +} + +static inline void expect_form(struct list_checker *c, enum sexp_form form) { + if (c->curr->elem->form != form) { + report_check_error("Syntax form %s: Expected `%s` but got `%s` inside list.\n", c->syntax_form, syn_form_descr(form), syn_form_descr(c->curr->elem->form)); + } + consume(c); +} + +static inline void expect_any(struct list_checker *c) { + if (c->curr == NULL) report_check_error("Syntax form %s: Too few arguments, unexpected end of list.\n", c->syntax_form); + consume(c); +} + +static inline void expect_list_end(struct list_checker *c) { + if (c-> curr != NULL) report_check_error("Syntax form %s: Too many arguments, expected end of list.\n", c->syntax_form); +} + +static inline bool has_more_elems(struct list_checker *c) { + return c->curr != NULL; +} + +/* Syntax Checkers */ + +typedef void (*syntax_checker)(struct list_checker *c); + +void begin_syntax_checker(struct list_checker *c) { + /* (begin expr expr...) */ + /* We require at least one expr in the begin body */ + expect_any(c); +} + +void check_define_with_formals(struct sexp *formals) { + /* (ident formals) + * (ident . formal) + */ + struct list_checker c = { "define formals", formals->list }; + expect_form(&c, sexp_symbol); + if (curr_form(&c) == sexp_symbol && !strcmp(curr_elem(&c)->lexeme, ".")) { + consume(&c); + expect_form(&c, sexp_symbol); + expect_list_end(&c); + } + else { + while (has_more_elems(&c)) { + expect_form(&c, sexp_symbol); + } + } +} + +void define_syntax_checker(struct list_checker *c) { + /* (define ident expr expr...) + * (define (ident formals) expr expr...) + * (define (ident . formal) expr expr...) + */ + switch (curr_form(c)) { + case sexp_symbol: + consume(c); + /* At least one expr */ + expect_any(c); + break; + case sexp_list: + check_define_with_formals(curr_elem(c)); + consume(c); + /* At least one expr */ + expect_any(c); + break; + default: + report_check_error("Invalid define form, expected either symbol or list, but got `%s`\n", syn_form_descr(curr_form(c))); + } +} + +void if_syntax_checker(struct list_checker *c) { + /* (if expr expr expr) */ + expect_any(c); + expect_any(c); + expect_any(c); + expect_list_end(c); +} + +void check_lambda_formals(struct sexp *l) { + /* + * (ident ...) + * (ident ident ... . ident) + */ + struct list_checker c = { "lambda formals", l->list }; + if (has_more_elems(&c)) { + if (curr_form(&c) != sexp_symbol) report_check_error("Expected symbol in lambda formal.\n"); + /* First element can't be a dot */ + if (!strcmp(curr_elem(&c)->lexeme, ".")) report_check_error("Expected at least one param before `.` in lambda formal.\n"); + consume(&c); + while (has_more_elems(&c)) { + /* All elements of the list must be symbols */ + if (curr_form(&c) != sexp_symbol) report_check_error("Invalid formal. All formal parameters must be symbols.\n"); + /* If the current element is a dot, + * then it must be followed by exactly one variable + */ + if (strcmp(curr_elem(&c)->lexeme, ".")) { + consume(&c); + expect_form(&c, sexp_symbol); + expect_list_end(&c); + break; + } + consume(&c); + } + } +} + +void lambda_syntax_checker(struct list_checker *c) { + /* (lambda ident exp exp...) + * (lambda (formals) exp exp...) + */ + switch (curr_form(c)) { + case sexp_symbol: + consume(c); + break; + case sexp_list: + check_lambda_formals(curr_elem(c)); + consume(c); + break; + } + /* At least one more element */ + expect_any(c); +} + +void check_let_defs(struct sexp *s) { + /* (name exp) */ + struct list_checker c = { "let definition list", s->list }; + struct list_checker def; + while (has_more_elems(&c)) { + if (curr_form(&c) != sexp_list) report_check_error("Let definition must be a list.\n"); + def = (struct list_checker) { "let definition", curr_elem(&c)->list }; + expect_form(&def, sexp_symbol); + expect_any(&def); + expect_list_end(&def); + consume(&c); + } +} + +void let_syntax_checker(struct list_checker *c) { + /* (let (def ..) exp exp..) */ + if (curr_form(c) != sexp_list) report_check_error("Let definitions must be in a list.\n"); + check_let_defs(curr_elem(c)); + /* At least one more element */ + expect_any(c); +} + +void quote_syntax_checker(struct list_checker *c) { + /* (quote exp) */ + expect_any(c); + expect_list_end(c); +} + +struct syntax { + const char *name; + syntax_checker checker; +}; + +/* This must be alpha sorted with respect to syntax name to allow binary search */ +struct syntax syntaxes[] = { + { "begin", begin_syntax_checker }, + { "define", define_syntax_checker }, + { "if", if_syntax_checker }, + { "lambda", lambda_syntax_checker }, + { "let", let_syntax_checker }, + /* let and letrec share the same syntax */ + { "letrec", let_syntax_checker }, + { "quote", quote_syntax_checker }, +}; + +int form_cmp(const void *lhs, const void *rhs) { + const char *key = lhs; + const struct syntax *syn = rhs; + return strcmp(key, syn->name); +} + +void check_sexp(struct sexp *s) { + if (s->form == sexp_list && s->list != NULL) { + struct sexp *head = s->list->elem; + if (head->form == sexp_symbol) { + const char *form = head->lexeme; + struct syntax *res = bsearch(form, + syntaxes, + sizeof(syntaxes)/sizeof(struct syntax), + sizeof(struct syntax), + form_cmp); + + if (res) { + struct list_checker c = { form, s->list->next }; + res->checker(&c); + } + } + } +} + +void check_program(struct sexp_list *l) { + while (l) { + check_sexp(l->elem); + l = l->next; + } +}