Implement parser

Update readme
2022-05-25 13:41:41 +02:00 · 2022-05-25 13:41:29 +02:00
14 changed files with 1094 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+# Build artifacts
+build/
+leao
--- a/23
+++ b/23
@ -0,0 +1,23 @@
+CFLAGS = -std=c11 -g -Wpedantic -Iinclude/
+
+# Directory containing object files and build artifacts
+obj_dir = build
+
+objects = main.o sexp.o parser.o syntax_checker.o
+object_paths = $(addprefix $(obj_dir)/, $(objects))
+
+leao: $(object_paths)
+	$(CC) -o $@ $(CLFAGS) $(object_paths)
+
+$(obj_dir)/%.o: src/%.c | $(obj_dir)
+	$(CC) $(CFLAGS) -o $@ -c $<
+
+$(obj_dir):
+	mkdir -p $(obj_dir)
+
+clean:
+	rm -f leao
+	rm -rf $(obj_dir)
+
+.PHONY: clean
+
--- a/README.md
+++ b/README.md
@ -3,6 +3,8 @@
 minimal scheme dialect implementation written in ISO C11 meant to be used
 to write interpreters and compilers (that can therefore be easily bootstrapped).

+This implementation is only an hobby and should be not taken seriously at all.
+
 ## How to build

 This project uses **GNU Make**, in order to build it, run in your shell:
@ -13,7 +15,7 @@ This project uses **GNU Make**, in order to build it, run in your shell:

 Here is a list of features that are planned:

-* Input source code can be encoded in UTF-8, but identifiers are very limited (cfr. `src/parser.c`)
+* Only ASCII is supported (for the full grammar check `src/parser.c`)
 * There are only integers (probably int64_t) and integer arithmetic
 * **box** primitive datatype, being the only one that allows mutation, all the other values are immutable
 * First class functions
--- a/examples/example.scm
+++ b/examples/example.scm
@ -0,0 +1 @@
+(define name 123)
--- a/examples/hard_example.scm
+++ b/examples/hard_example.scm
@ -0,0 +1,108 @@
+(import (scheme base))
+; Regular expression AST representation
+
+; Regex representing the empty language
+(define-record-type <empty-regex> (empty-regex) empty-regex?)
+
+; A string literal to be matched literally, the empty string represents the null regex
+(define-record-type <lit-regex> (lit-regex c) lit-regex?
+  (c lit-value))
+
+(define-record-type <cons-regex> (cons-regex lhs rhs) cons-regex?
+  (lhs cons-lhs)
+  (rhs cons-rhs))
+
+(define-record-type <alt-regex> (alt-regex lhs rhs) alt-regex?
+  (lhs alt-lhs)
+  (rhs alt-rhs))
+
+(define-record-type <star-regex> (star-regex body) star-regex?
+  (body star-body))
+
+; Combinators for defining regular expressions
+(define (regex-box-string x)
+  (if (string? x) (lit-regex x) x))
+
+(define rgx-null (lit-regex ""))
+(define (rgx-cons . rs)
+  (if (null? rs)
+      rgx-null
+      (cons-regex (regex-box-string (car rs)) (apply rgx-cons (cdr rs)))))
+
+(define (rgx-alt . rs)
+  (if (null? rs)
+      (empty-regex)
+      (alt-regex (regex-box-string (car rs)) (apply rgx-alt (cdr rs)))))
+
+(define (rgx-star r) (star-regex (regex-box-string r)))
+
+(define (string-null? s) (= 0 (string-length s)))
+
+; Check if a regex matches the null string
+(define (nullable? r)
+  (cond ((empty-regex? r) #f)
+        ((lit-regex? r) (string-null? (lit-value r)))
+        ((cons-regex? r) (and (nullable? (cons-lhs r)) (nullable? (cons-rhs r))))
+        ((alt-regex? r) (or (nullable? (alt-lhs r)) (nullable? (alt-rhs r))))
+        ((star-regex? r) #t)))
+
+(define (unsatisfiable? r)
+  (cond ((empty-regex? r) #t)
+        ((lit-regex? r) #f)
+        ((cons-regex? r) (or (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r))))
+        ((alt-regex? r) (and (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r))))
+        ((star-regex? r) #f)))
+
+; Now given a regex AST we can define its derivative with respect to a char
+; Definition:
+; D_c({}) = {}
+; D_c(Lit(s)) = ok
+; D_c(r + b) = D_c(r) + D_c(b)
+; D_c(r s) = D_c(r) s + d(r) D_c(s)
+; D_c(r*) = D_c(r) r*
+; TODO: Lazily create the resulting regex
+(define (regex-derive c r)
+  (cond ((empty-regex? r) (empty-regex))
+        ((lit-regex? r) (cond ((string-null? (lit-value r)) (empty-regex))
+                              ((eq? c (string-ref (lit-value r) 0)) (lit-regex (string-copy (lit-value r) 1)))
+                              (else (empty-regex))))
+        ((alt-regex? r) (alt-regex (regex-derive c (alt-lhs r)) (regex-derive c (alt-rhs r))))
+        ((star-regex? r) (cons-regex (regex-derive c (star-body r)) r))
+        ((cons-regex? r) (if (nullable? (cons-lhs r))
+                             (alt-regex (cons-regex (regex-derive c (cons-lhs r))
+                                                    (cons-rhs r))
+                                        (regex-derive c (cons-rhs r)))
+                             (cons-regex (regex-derive c (cons-lhs r))
+                                         (cons-rhs r))))))
+
+
+; Let us leverage the regex derivative algorithm to define a regex matcher
+(define (match-regex str regex)
+  (define (match-regex-list str regex)
+    (if (null? str)
+        (nullable? regex)
+        (match-regex-list (cdr str) (regex-derive (car str) regex))))
+
+  (match-regex-list (string->list str) regex))
+
+; As a tiny variation, let us define a grep-like function
+; The first step is finding where there are matches for the regex in the string
+
+; Given a string and a regex we want to find all the matches
+; A match is a range in the string
+(define-record-type <match> (make-match start end) match? (start match-start) (end match-end))
+
+; regex is the regex to match
+; str the string upon which to match
+; start is where to start reading from, end where to stop!
+; matches is the list of matches to which we must cons our current result
+; Try to get the longest matching value
+(define (find-match regex str start pos end matches)
+  (if (nullable? regex)
+      (cons (make-match start pos) matches)
+      (find-match (regex-derive (string-ref str pos) start (+ pos 1) end))
+      ()))
+
+; Let us represent (a OR b)*cde
+(define my-regex (rgx-cons (rgx-star (rgx-alt "a" "b")) "cde"))
+(match-regex "abababababababcde" my-regex) ; ==> #t
--- a/include/parser.h
+++ b/include/parser.h
@ -0,0 +1,10 @@
+#include <stdio.h>
+#include <stdbool.h>
+
+struct sexp_list;
+
+/* Parse a single s-expression from the `in` stream.
+ * If the parse is successful, The output is put in out.
+ * Otherwise the program is aborted.
+ */
+void parse_program(FILE *in, struct sexp_list **out);
--- a/include/sexp.h
+++ b/include/sexp.h
@ -0,0 +1,47 @@
+#include <stdbool.h>
+
+/* Datastructures representing s-exprs */
+
+struct sexp_list;
+
+enum sexp_form {
+    sexp_bool,
+    sexp_int,
+    sexp_string,
+    sexp_symbol,
+    sexp_list,
+};
+
+struct sexp {
+    enum sexp_form form;
+    union {
+        bool bool_lit;
+        /* Integer literals are not directly converted to numbers */
+        char *lexeme;
+        struct sexp_list *list;
+    };
+};
+
+struct sexp_list {
+    struct sexp *elem;
+    struct sexp_list *next;
+};
+
+/* Helper to create a list by appending at its end in O(1) time */
+struct sexp_list_builder {
+    struct sexp_list *head;
+    struct sexp_list *last;
+};
+
+/* Returns false if allocation fails */
+bool sexp_list_append(struct sexp_list_builder *b, struct sexp *elem);
+
+/* If allocation fails, these functions return NULL */
+struct sexp *sexp_make_bool(bool v);
+struct sexp *sexp_make_int(char *lexeme);
+struct sexp *sexp_make_string(char *lexeme);
+struct sexp *sexp_make_symbol(char *lexeme);
+struct sexp *sexp_make_list(struct sexp_list *list);
+
+void free_sexp_list(struct sexp_list *head);
+void free_sexp(struct sexp *s);
--- a/include/string_table.h
+++ b/include/string_table.h
@ -0,0 +1,22 @@
+#include <stdbool.h>
+
+/* Hash table with linear probing implementation.
+ * The keys are always strings, the values can be anything.
+ * The table does not own any data.
+ */
+
+struct string_table;
+
+struct string_table *new_table(void);
+void free_table(struct string_table *t);
+
+/* Inserts a new entry in the table. If the key is already present, the new value
+ * substitutes the old one
+ */
+void string_table_insert(struct string_table *t, const char *key, void *value);
+
+/* Remove, if it's present, the entry for the given key */
+void string_table_remove(struct string_table *t, const char *key);
+
+/* Lookup a key, returns its value if found, otherwise NULL */
+void *string_table_lookup(struct string_table *t, const char *key);
--- a/include/syntax_checker.h
+++ b/include/syntax_checker.h
@ -0,0 +1,12 @@
+struct sexp;
+struct sexp_list;
+
+/* The Syntax checker is in charge
+ * of checking that all syntactic forms are correctly adoperated.
+ * TODO: In the future this will be part of the macro expander.
+ * 
+ * If there is any syntax error, an error message is printed to stderr the program is aborted.
+ */
+void check_syntax(struct sexp *s);
+
+void check_program(struct sexp_list *prog);
--- a/src/main.c
+++ b/src/main.c
@ -0,0 +1,23 @@
+#include <stdio.h>
+#include "parser.h"
+#include "sexp.h"
+#include "syntax_checker.h"
+
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        printf("Usage %s file.scm\n", argv[0]);
+        return 1;
+    }
+    FILE *in = fopen(argv[1], "r");
+    if (!in) {
+        printf("Could not open file %s.\n", argv[1]);
+        return 1;
+    }
+    struct sexp_list *prog;
+    parse_program(in, &prog);
+    check_program(prog);
+    free_sexp_list(prog);
+    fclose(in);
+    printf("AC Milan campione d'italia 2021/2022.\n");
+    return 0;
+}
--- a/src/parser.c
+++ b/src/parser.c
@ -0,0 +1,398 @@
+#include "parser.h"
+#include "sexp.h"
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdnoreturn.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <string.h>
+
+/*
+ * S-expressions grammar
+ * 
+ * comment := ; any character except until newline
+ * space   := [\n\t ]
+ * comments and spaces and newlines and tabs are ignored
+ *
+ * sexpr := list | atom | quoted_expr
+ * 
+ * quoted_expr := ' sexpr
+ * 
+ * list := LPAREN sexpr * RPAREN
+ * 
+ * LPAREN := ( | [ | {
+ * RPAREN := ) | ] | }
+ * 
+ * atom := int_lit | bool_lit | string_lit | ident
+ * 
+ * int_lit := [0-9]+
+ * bool_lit := #t | #f
+ * 
+ * string_lit := " (string_elem | escaped_elem)* "
+ * string_elem := ASCII printable except \ and "
+ * escaped_elem := \ [\\nt"]
+ * 
+ * ident := ident_start ident_cont *
+ * special_start := [!$%&*+-/.:<=>?@^_~]
+ * ident_start := [a-zA-Z] | special_start
+ * ident_cont := ident_start | [0-9]
+ * 
+ */
+
+static inline bool is_ident_start(const char c) {
+    switch (c) {
+    case '!': case '$': case '%':
+    case '&': case '*': case '+':
+    case '-': case '/': case '.':
+    case ':': case '<': case '=':
+    case '>': case '?': case '@':
+    case '^': case '_': case '~':
+        return true;
+
+    default:
+        return isalpha(c);
+    }
+}
+
+static inline bool is_ident_cont(const char c) {
+    return is_ident_start(c) || isdigit(c);
+}
+
+/* In case of memory shortage or IO failure, the program is brutally aborted. */
+/* In case of syntax errors, the program is brutally aborted.
+ * TODO: Try to make a more compromising parser
+ */
+
+/* Print a message to stderr and abort the program */
+static noreturn void abort_program(const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+    exit(EXIT_FAILURE);
+}
+
+static noreturn void parse_error_abort(void) {
+    abort_program("Parse error.\n");
+}
+
+static noreturn void out_of_memory_abort(void) {
+    abort_program("Out of memory.\n");
+}
+
+struct parser {
+    /* Input stream */
+    FILE *in;
+    /* Current lookahead symbol */
+    char curr_char;
+
+    /* Internal buffer */
+    char *buf;
+    size_t buf_len;
+    size_t buf_cap;
+
+    /* Location info */
+    size_t line;
+    size_t col;
+};
+
+/* Helper function to report error messages with the attached location */
+static void report_error(struct parser *p, const char *fmt, ...) {
+    va_list args;
+    fprintf(stderr, "Error at %d:%d ", p->line, p->col);
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+}
+
+/* read next input char and update location info */
+static inline void consume(struct parser *p) {
+    /* First update location info */
+    if (p->curr_char == '\n') {
+        ++p->line;
+        p->col = 1;
+    }
+    else {
+        ++p->col;
+    }
+
+    p->curr_char = fgetc(p->in);
+    if (ferror(p->in)) {
+        abort_program("Error while reading from input stream.\n");
+    }
+}
+
+/* Store a character in the internal buffer */
+static inline void store_char(struct parser *p, const char c) {
+    /* If the buffer has not already been initialized, do so now */
+    if (p->buf_cap == 0) {
+        p->buf_cap = 256;
+        p->buf = malloc(p->buf_cap);
+        if (!p->buf) {
+            out_of_memory_abort();
+        }
+    }
+    /* Otherwise check if it's large enough to contain another character */
+    else if (p->buf_cap - 1 < p->buf_len) {
+        /* We double the internal buffer capacity each time,
+         * but we must pay attention to integer overflow,
+         * so let us first check if we can double the size of the buffer
+         * with no problem
+         */
+         if (SIZE_MAX / 2 <= p->buf_cap) {
+             report_error(p, "Lexeme is too long.\n");
+             abort_program("Buffer overflow, can't allocate more space, lexeme is too large.\n");
+         }
+         /* If the initial buffer capacity is set to zero,
+          * we set it to 256, as initial capacity
+          */
+         p->buf_cap = p->buf_cap * 2;
+
+         p->buf = realloc(p->buf, p->buf_cap);
+         if (!p->buf) {
+             out_of_memory_abort();
+         }
+    }
+    /* Now we can safely store the given character in the internal buffer */
+    p->buf[p->buf_len] = c;
+    ++p->buf_len;
+}
+
+/* Create a null terminated copy of the internal buffer,
+ * and reset the internal buffer, so that in can be used for the next lexeme */
+static inline char *lexeme_recognized(struct parser *p) {
+    char *res = malloc(p->buf_len + 1);
+    if (!res) {
+        out_of_memory_abort();
+    }
+    memcpy(res, p->buf, p->buf_len);
+    res[p->buf_len] = 0;
+
+    /* Reset internal buffer */
+    p->buf_len = 0;
+    return res;
+}
+
+/* Reset internal buffer */
+static inline void reset_buffer(struct parser *p) {
+    p->buf_len = 0;
+}
+
+static void skip_ws_comments(struct parser *p) {
+    bool keep_ignoring = true;
+    while (keep_ignoring) {
+        switch (p->curr_char) {
+        case ' ': case '\n': case '\t':
+            consume(p);
+            break;
+        case ';':
+            /* Ignore everything until a newline */
+            while (p->curr_char != '\n') {
+                consume(p);
+            }
+            break;
+        default:
+            keep_ignoring = false;
+        }
+    }
+}
+
+static void parse_int_lit(struct parser *p, struct sexp **out) {
+    store_char(p, p->curr_char);
+    while (isdigit(p->curr_char)) {
+        store_char(p, p->curr_char);
+        consume(p);
+    }
+    *out = sexp_make_int(lexeme_recognized(p));
+    if (!*out) out_of_memory_abort();
+}
+
+static inline void parse_bool_lit(struct parser *p, struct sexp **out) {
+    consume(p);
+    switch (p->curr_char) {
+    case 't':
+        *out = sexp_make_bool(true);
+        break;
+    case 'f':
+        *out = sexp_make_bool(false);
+        break;
+    default:
+        report_error(p, "Expected bool literal, invalid lexeme.\n");
+        parse_error_abort();
+    }
+
+    if (!*out) out_of_memory_abort();
+}
+
+static void parse_string_lit(struct parser *p, struct sexp **out) {
+    consume(p);
+    while (p->curr_char != EOF && p->curr_char != '"') {
+        if (p->curr_char == '\\') {
+            consume(p);
+            /* Read escape sequence, and store the meaning of the escape
+             * sequence in the internal buffer
+             */
+            switch (p->curr_char) {
+            case '"':
+                store_char(p, '"');
+                break;
+            case 'n':
+                store_char(p, '\n');
+                break;
+            case 't':
+                store_char(p, '\t');
+                break;
+            default:
+                report_error(p, "Invalid escape sequence.\n");
+                parse_error_abort();
+            }
+            consume(p);
+        }
+        else if (isprint(p->curr_char)) {
+            store_char(p, p->curr_char);
+            consume(p);
+        }
+        else {
+            report_error(p, "Invalid character in string literal.\n");
+            parse_error_abort();
+        }
+    }
+    if (p->curr_char != '"') {
+        report_error(p, "Unexpected end of file. expected closing quote in string literal.\n");
+        parse_error_abort();
+    }
+    consume(p);
+    *out = sexp_make_string(lexeme_recognized(p));
+    if (!*out) out_of_memory_abort();
+}
+
+static void parse_ident(struct parser *p, struct sexp **out) {
+    if (!is_ident_start(p->curr_char)) {
+        report_error(p, "Invalid character found.\n");
+        parse_error_abort();
+    }
+    store_char(p, p->curr_char);
+    consume(p);
+    while (is_ident_cont(p->curr_char)) {
+        store_char(p, p->curr_char);
+        consume(p);
+    }
+    *out = sexp_make_symbol(lexeme_recognized(p));
+    if (!*out) out_of_memory_abort();
+}
+
+
+static void parse_sexp(struct parser *p, struct sexp **out);
+
+static void parse_quoted_sexp(struct parser *p, struct sexp **out) {
+    consume(p);
+    struct sexp *quoted;
+    parse_sexp(p, &quoted);
+    struct sexp_list_builder quote = { NULL };
+
+    /* Quoted s-expressions are a shortcut for (quote sexp), so we
+     * just expand the meaning
+     */
+    const char quote_kw[] = "quote";
+
+    char *quote_lexeme = malloc(sizeof(quote_kw));
+    if (!quote_lexeme) out_of_memory_abort();
+    memcpy(quote_lexeme, quote_kw, sizeof(quote_kw));
+
+    struct sexp *quote_symbol = sexp_make_symbol(quote_lexeme);
+    if (!quote_symbol) out_of_memory_abort();
+
+    if (!sexp_list_append(&quote, quote_symbol)) out_of_memory_abort();
+    if (!sexp_list_append(&quote, quoted)) out_of_memory_abort();
+
+    *out = sexp_make_list(quote.head);
+    if (!*out) out_of_memory_abort();
+}
+
+static void parse_list(struct parser *p, struct sexp **out, char delim) {
+    consume(p);
+    skip_ws_comments(p);
+    struct sexp_list_builder builder = { NULL };
+    struct sexp *curr;
+    while (p->curr_char != EOF && p->curr_char != delim) {
+        parse_sexp(p, &curr);
+        if (!sexp_list_append(&builder, curr)) out_of_memory_abort();
+        skip_ws_comments(p);
+    }
+    if (p->curr_char != delim) {
+        report_error(p, "Unbalanced parens. Expected closing `%c` in list expression.\n", delim);
+        parse_error_abort();
+    }
+    consume(p);
+    *out = sexp_make_list(builder.head);
+    if (!*out) out_of_memory_abort();
+}
+
+static void parse_sexp(struct parser *p, struct sexp **out) {
+    switch (p->curr_char) {
+    case EOF:
+        report_error(p, "Expected s-expression, found end of file.\n");
+        parse_error_abort();
+        break;
+    case '\'':
+        parse_quoted_sexp(p, out);
+        break;
+    case '"':
+        parse_string_lit(p, out);
+        break;
+    case '(':
+        parse_list(p, out, ')');
+        break;
+    case '[':
+        parse_list(p, out, ']');
+        break;
+    case '{':
+        parse_list(p, out, ']');
+        break;
+    case '#':
+        parse_bool_lit(p, out);
+        break;
+    case '0': case '1': case '2':
+    case '3': case '4': case '5':
+    case '6': case '7': case '8':
+    case '9':
+        parse_int_lit(p, out);
+        break;
+    default:
+        parse_ident(p, out);
+        break;
+    }
+}
+
+void parse_program(FILE *in, struct sexp_list **out) {
+    char curr_char = fgetc(in);
+    if (ferror(in)) {
+        abort_program("Error while reading from input stream.\n");
+    }
+
+    struct parser p = {
+        .in = in,
+        .curr_char = curr_char,
+        .buf = NULL,
+        .buf_len = 0,
+        .buf_cap = 0,
+        .line = 1,
+        .col = 1,
+    };
+
+    skip_ws_comments(&p);
+    struct sexp_list_builder builder = { NULL };
+    struct sexp *s;
+    while (p.curr_char != EOF) {
+        parse_sexp(&p, &s);
+        if (!sexp_list_append(&builder, s))
+            out_of_memory_abort();
+
+        skip_ws_comments(&p);
+    }
+
+    *out = builder.head;
+    /* Free internal buffer, before quitting */
+    free(p.buf);
+}
--- a/src/sexp.c
+++ b/src/sexp.c
@ -0,0 +1,96 @@
+#include "sexp.h"
+#include <stdlib.h>
+
+bool sexp_list_append(struct sexp_list_builder *b, struct sexp *elem) {
+    struct sexp_list *node = malloc(sizeof(struct sexp_list));
+    if (!node) return false;
+    node->elem = elem;
+    node->next = NULL;
+
+    if (b->last == NULL) {
+        b->head = node;
+        b->last = node;
+    }
+    else {
+        b->last->next = node;
+        b->last = b->last->next;
+    }
+    return true;
+}
+
+struct sexp *sexp_make_bool(bool v) {
+    struct sexp *res = malloc(sizeof(struct sexp));
+    if (!res) return NULL;
+
+    res->form = sexp_bool;
+    res->bool_lit = v;
+    return res;
+}
+
+struct sexp *sexp_make_int(char *lexeme) {
+    struct sexp *res = malloc(sizeof(struct sexp));
+    if (!res) return NULL;
+
+    res->form = sexp_int;
+    res->lexeme = lexeme;
+    return res;
+}
+
+struct sexp *sexp_make_string(char *lexeme) {
+    struct sexp *res = malloc(sizeof(struct sexp));
+    if (!res) return NULL;
+
+    res->form = sexp_string;
+    res->lexeme = lexeme;
+    return res;
+}
+
+struct sexp *sexp_make_symbol(char *lexeme) {
+    struct sexp *res = malloc(sizeof(struct sexp));
+    if (!res) return NULL;
+
+    res->form = sexp_symbol;
+    res->lexeme = lexeme;
+    return res;
+}
+
+struct sexp *sexp_make_list(struct sexp_list *list) {
+    struct sexp *res = malloc(sizeof(struct sexp));
+    if (!res) return NULL;
+
+    res->form = sexp_list;
+    res->list = list;
+    return res;
+}
+
+void free_sexp_list(struct sexp_list *head) {
+    struct sexp_list *tmp;
+    while (head) {
+        tmp = head;
+        head = head->next;
+        free_sexp(tmp->elem);
+        free(tmp);
+    }
+}
+
+void free_sexp(struct sexp *s) {
+    if (!s) return;
+
+    switch (s->form) {
+    case sexp_bool:
+        break;
+    case sexp_int:
+        free(s->lexeme);
+        break;
+    case sexp_string:
+        free(s->lexeme);
+        break;
+    case sexp_symbol:
+        free(s->lexeme);
+        break;
+    case sexp_list:
+        free_sexp_list(s->list);
+        break;
+    }
+    free(s);
+}
--- a/src/string_table.c
+++ b/src/string_table.c
@ -0,0 +1,97 @@
+#include "string_table.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* For now we use the simple djb2 hashing algorithm,
+ * but TODO: implement SipHash-2-4
+ */
+static inline size_t hash_string(const char *str) {
+    size_t hash = 5381;
+    for (; *str != 0; ++str) {
+        hash = hash * 33 + *str;
+    }
+    return hash;
+}
+
+struct bucket {
+    /* This flag distinguishes between empty
+     * and used buckets
+     */
+    bool used;
+    const char *key;
+    void *value;
+};
+
+struct string_table {
+    struct bucket *entries;
+    size_t capacity;
+    /* Elements count in the table */
+    size_t size;
+};
+
+static void rehash(struct string_table *t) {
+    /* We double the size, but what's a better strategy? */
+    size_t new_capacity = t->capacity * 2;
+    struct bucket *new_entries = calloc(new_capacity, sizeof(struct bucket));
+    size_t i;
+    size_t new_hash;
+    size_t new_pos;
+    for (i = 0; i < t->capacity; ++i) {
+        if (t->entries[i].used) {
+            new_hash = hash_string(t->entries[i].key);
+            new_pos = new_hash % new_capacity;
+            new_entries[new_pos] = t->entries[i];
+        }
+    }
+    t->capacity = new_capacity;
+    free(t->entries);
+    t->entries = new_entries;
+}
+
+void string_table_insert(struct string_table *t, const char *key, void *value) {
+    /* First check if we need to rehash, we do this
+     * when the table is full at 75%, this should prevent
+     * some collisions
+     */
+    if (t->size * 4 > t->capacity * 3) {
+        rehash(t);
+    }
+    size_t pos = hash_string(key)  % t->capacity;
+    while (t->entries[pos].used) {
+        if (!strcmp(key, t->entries[pos].key)) {
+            /* If the key is already been inserted,
+             * just update its corresponding value
+             */
+            t->entries[pos].value = value;
+            break;
+        }
+        ++pos;
+    }
+    /* There must be an empty pos, so pos < t->capacity surely */
+    t->entries[pos] = (struct bucket) { .key = key, .value = value, .used = true };
+    ++t->size;
+}
+
+void string_table_remove(struct string_table *t, const char *key) {
+    size_t pos = hash_string(key) % t->capacity;
+    while (t->entries[pos].used) {
+        if (!strcmp(key, t->entries[pos].key)) {
+            /* Remove this bucket */
+            t->entries[pos].used = false;
+            break;
+        }
+        ++pos;
+    }
+}
+
+void *string_table_lookup(struct string_table *t, const char *key) {
+    size_t pos = hash_string(key) % t->capacity;
+    while (pos < t->capacity && t->entries[pos].used) {
+        if (!strcmp(key, t->entries[pos].key)) {
+            return t->entries[pos].value;
+        }
+
+        ++pos;
+    }
+    return NULL;
+}
--- a/src/syntax_checker.c
+++ b/src/syntax_checker.c
@ -0,0 +1,251 @@
+#include "syntax_checker.h"
+#include "sexp.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <stdnoreturn.h>
+
+/* Small DSL to perform pattern matching on a list */
+struct list_checker {
+    const char *syntax_form;
+    /* Cursor inside the list */
+    struct sexp_list *curr;
+};
+
+/* TODO: Make error messages more informative */
+static noreturn void report_check_error(const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+    exit(EXIT_FAILURE);
+}
+
+static const char *syn_form_descr(enum sexp_form f) {
+    switch (f) {
+    case sexp_bool:
+        return "bool literal";
+    case sexp_int:
+        return "int literal";
+    case sexp_string:
+        return "string literal";
+    case sexp_symbol:
+        return "symbol";
+    case sexp_list:
+        return "list";
+    }
+}
+
+static inline enum sexp_form curr_form(struct list_checker *c) {
+    return c->curr->elem->form;
+}
+
+static inline struct sexp *curr_elem(struct list_checker *c) {
+    return c->curr->elem;
+}
+
+static inline void consume(struct list_checker *c) {
+    c->curr = c->curr->next;
+}
+
+static inline void expect_form(struct list_checker *c, enum sexp_form form) {
+    if (c->curr->elem->form != form) {
+        report_check_error("Syntax form %s: Expected `%s` but got `%s` inside list.\n", c->syntax_form, syn_form_descr(form), syn_form_descr(c->curr->elem->form));
+    }
+    consume(c);
+}
+
+static inline void expect_any(struct list_checker *c) {
+    if (c->curr == NULL) report_check_error("Syntax form %s: Too few arguments, unexpected end of list.\n", c->syntax_form);
+    consume(c);
+}
+
+static inline void expect_list_end(struct list_checker *c) {
+    if (c-> curr != NULL) report_check_error("Syntax form %s: Too many arguments, expected end of list.\n", c->syntax_form);
+}
+
+static inline bool has_more_elems(struct list_checker *c) {
+    return c->curr != NULL;
+}
+
+/* Syntax Checkers */
+
+typedef void (*syntax_checker)(struct list_checker *c);
+
+void begin_syntax_checker(struct list_checker *c) {
+    /* (begin expr expr...) */
+    /* We require at least one expr in the begin body */
+    expect_any(c);
+}
+
+void check_define_with_formals(struct sexp *formals) {
+    /* (ident formals)
+     * (ident . formal)
+     */
+    struct list_checker c = { "define formals", formals->list };
+    expect_form(&c, sexp_symbol);
+    if (curr_form(&c) == sexp_symbol && !strcmp(curr_elem(&c)->lexeme, ".")) {
+        consume(&c);
+        expect_form(&c, sexp_symbol);
+        expect_list_end(&c);
+    }
+    else {
+        while (has_more_elems(&c)) {
+            expect_form(&c, sexp_symbol);
+        }
+    }
+}
+
+void define_syntax_checker(struct list_checker *c) {
+    /* (define ident expr expr...)
+     * (define (ident formals) expr expr...)
+     * (define (ident . formal) expr expr...)
+     */
+    switch (curr_form(c)) {
+    case sexp_symbol:
+        consume(c);
+        /* At least one expr */
+        expect_any(c);
+        break;
+    case sexp_list:
+        check_define_with_formals(curr_elem(c));
+        consume(c);
+        /* At least one expr */
+        expect_any(c);
+        break;
+    default:
+        report_check_error("Invalid define form, expected either symbol or list, but got `%s`\n", syn_form_descr(curr_form(c)));
+    }
+}
+
+void if_syntax_checker(struct list_checker *c) {
+    /* (if expr expr expr) */
+    expect_any(c);
+    expect_any(c);
+    expect_any(c);
+    expect_list_end(c);
+}
+
+void check_lambda_formals(struct sexp *l) {
+    /*
+     * (ident ...)
+     * (ident ident ... . ident)
+     */
+    struct list_checker c = { "lambda formals", l->list };
+    if (has_more_elems(&c)) {
+        if (curr_form(&c) != sexp_symbol) report_check_error("Expected symbol in lambda formal.\n");
+        /* First element can't be a dot */
+        if (!strcmp(curr_elem(&c)->lexeme, ".")) report_check_error("Expected at least one param before `.` in lambda formal.\n");
+        consume(&c);
+        while (has_more_elems(&c)) {
+            /* All elements of the list must be symbols */
+            if (curr_form(&c) != sexp_symbol) report_check_error("Invalid formal. All formal parameters must be symbols.\n");
+            /* If the current element is a dot,
+             * then it must be followed by exactly one variable
+             */
+            if (strcmp(curr_elem(&c)->lexeme, ".")) {
+                consume(&c);
+                expect_form(&c, sexp_symbol);
+                expect_list_end(&c);
+                break;
+            }
+            consume(&c);
+        }
+    }
+}
+
+void lambda_syntax_checker(struct list_checker *c) {
+    /* (lambda ident exp exp...)
+     * (lambda (formals) exp exp...)
+     */
+    switch (curr_form(c)) {
+    case sexp_symbol:
+        consume(c);
+        break;
+    case sexp_list:
+        check_lambda_formals(curr_elem(c));
+        consume(c);
+        break;
+    }
+    /* At least one more element */
+    expect_any(c);
+}
+
+void check_let_defs(struct sexp *s) {
+    /* (name exp) */
+    struct list_checker c = { "let definition list", s->list };
+    struct list_checker def;
+    while (has_more_elems(&c)) {
+        if (curr_form(&c) != sexp_list) report_check_error("Let definition must be a list.\n");
+        def = (struct list_checker) { "let definition", curr_elem(&c)->list };
+        expect_form(&def, sexp_symbol);
+        expect_any(&def);
+        expect_list_end(&def);
+        consume(&c);
+    }
+}
+
+void let_syntax_checker(struct list_checker *c) {
+    /* (let (def ..) exp exp..) */
+    if (curr_form(c) != sexp_list) report_check_error("Let definitions must be in a list.\n");
+    check_let_defs(curr_elem(c));
+    /* At least one more element */
+    expect_any(c);
+}
+
+void quote_syntax_checker(struct list_checker *c) {
+    /* (quote exp) */
+    expect_any(c);
+    expect_list_end(c);
+}
+
+struct syntax {
+    const char *name;
+    syntax_checker checker;
+};
+
+/* This must be alpha sorted with respect to syntax name to allow binary search */
+struct syntax syntaxes[] = {
+    { "begin", begin_syntax_checker },
+    { "define", define_syntax_checker },
+    { "if", if_syntax_checker },
+    { "lambda", lambda_syntax_checker },
+    { "let", let_syntax_checker },
+    /* let and letrec share the same syntax */
+    { "letrec", let_syntax_checker },
+    { "quote", quote_syntax_checker },
+};
+
+int form_cmp(const void *lhs, const void *rhs) {
+    const char *key = lhs;
+    const struct syntax *syn = rhs;
+    return strcmp(key, syn->name);
+}
+
+void check_sexp(struct sexp *s) {
+    if (s->form == sexp_list && s->list != NULL) {
+        struct sexp *head = s->list->elem;
+        if (head->form == sexp_symbol) {
+            const char *form = head->lexeme;
+            struct syntax *res = bsearch(form,
+                                         syntaxes,
+                                         sizeof(syntaxes)/sizeof(struct syntax),
+                                         sizeof(struct syntax),
+                                         form_cmp);
+
+            if (res) {
+                struct list_checker c = { form, s->list->next };
+                res->checker(&c);
+            }
+        }
+    }
+}
+
+void check_program(struct sexp_list *l) {
+    while (l) {
+        check_sexp(l->elem);
+        l = l->next;
+    }
+}
Author	SHA1	Message	Date
Francesco Magliocca	4dffe1d7b4	Implement parser	2022-05-25 13:41:41 +02:00
Francesco Magliocca	a57e36e6ea	Update readme	2022-05-25 13:41:29 +02:00