Implement parser

Update readme
2022-05-25 13:41:41 +02:00 · 2022-05-25 13:41:29 +02:00
14 changed files with 1094 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 # Build artifacts
 build/
 leao
--- a/23
+++ b/23
@ -0,0 +1,23 @@
 CFLAGS = -std=c11 -g -Wpedantic -Iinclude/
 # Directory containing object files and build artifacts
 obj_dir = build
 objects = main.o sexp.o parser.o syntax_checker.o
 object_paths = $(addprefix $(obj_dir)/, $(objects))
 # Link the executable with the same flags used for compilation
 leao: $(object_paths)
 	$(CC) -o $@ $(CFLAGS) $(object_paths)
 # Compile each object from src/; the order-only prerequisite makes sure
 # the build directory exists before any object is written into it
 $(obj_dir)/%.o: src/%.c | $(obj_dir)
 	$(CC) $(CFLAGS) -o $@ -c $<
 $(obj_dir):
 	mkdir -p $(obj_dir)
 # Remove the executable and every build artifact
 clean:
 	rm -f leao
 	rm -rf $(obj_dir)
 .PHONY: clean
--- a/README.md
+++ b/README.md
@ -3,6 +3,8 @@
 minimal scheme dialect implementation written in ISO C11 meant to be used
 to write interpreters and compilers (that can therefore be easily bootstrapped).
 This implementation is only a hobby project and should not be taken seriously at all.
 ## How to build
 This project uses **GNU Make**. To build it, run in your shell:
@ -13,7 +15,7 @@ This project uses **GNU Make**, in order to build it, run in your shell:
 Here is a list of features that are planned:
-* Input source code can be encoded in UTF-8, but identifiers are very limited (cfr. `src/parser.c`)
+* Only ASCII is supported (for the full grammar check `src/parser.c`)
 * There are only integers (probably int64_t) and integer arithmetic
 * **box** primitive datatype, being the only one that allows mutation, all the other values are immutable
 * First class functions
--- a/examples/example.scm
+++ b/examples/example.scm
@ -0,0 +1 @@
 (define name 123)
--- a/examples/hard_example.scm
+++ b/examples/hard_example.scm
@ -0,0 +1,108 @@
 (import (scheme base))
 ; Regular expression AST representation
 ; Regex representing the empty language
 (define-record-type <empty-regex> (empty-regex) empty-regex?)
 ; A string literal to be matched literally, the empty string represents the null regex
 (define-record-type <lit-regex> (lit-regex c) lit-regex?
  (c lit-value))
 (define-record-type <cons-regex> (cons-regex lhs rhs) cons-regex?
  (lhs cons-lhs)
  (rhs cons-rhs))
 (define-record-type <alt-regex> (alt-regex lhs rhs) alt-regex?
  (lhs alt-lhs)
  (rhs alt-rhs))
 (define-record-type <star-regex> (star-regex body) star-regex?
  (body star-body))
 ; Combinators for defining regular expressions
 (define (regex-box-string x)
  (if (string? x) (lit-regex x) x))
 (define rgx-null (lit-regex ""))
 (define (rgx-cons . rs)
  (if (null? rs)
      rgx-null
      (cons-regex (regex-box-string (car rs)) (apply rgx-cons (cdr rs)))))
 (define (rgx-alt . rs)
  (if (null? rs)
      (empty-regex)
      (alt-regex (regex-box-string (car rs)) (apply rgx-alt (cdr rs)))))
 (define (rgx-star r) (star-regex (regex-box-string r)))
 (define (string-null? s) (= 0 (string-length s)))
 ; Check if a regex matches the null string
 (define (nullable? r)
  (cond ((empty-regex? r) #f)
        ((lit-regex? r) (string-null? (lit-value r)))
        ((cons-regex? r) (and (nullable? (cons-lhs r)) (nullable? (cons-rhs r))))
        ((alt-regex? r) (or (nullable? (alt-lhs r)) (nullable? (alt-rhs r))))
        ((star-regex? r) #t)))
 ; Check if a regex can never match any string
 (define (unsatisfiable? r)
  (cond ((empty-regex? r) #t)
        ((lit-regex? r) #f)
        ; A concatenation fails as soon as one of its halves fails
        ((cons-regex? r) (or (unsatisfiable? (cons-lhs r)) (unsatisfiable? (cons-rhs r))))
        ; An alternation fails only when both branches fail; its fields
        ; must be read with the alt- accessors, not the cons- ones
        ((alt-regex? r) (and (unsatisfiable? (alt-lhs r)) (unsatisfiable? (alt-rhs r))))
        ((star-regex? r) #f)))
 ; Now given a regex AST we can define its derivative with respect to a char
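 ; Definition (d(r) is the null regex when r is nullable, the empty regex {} otherwise):
 ; D_c({}) = {}
 ; D_c(Lit(s)) = Lit(s') if s = c s', {} otherwise
 ; D_c(r + b) = D_c(r) + D_c(b)
 ; D_c(r s) = D_c(r) s + d(r) D_c(s)
 ; D_c(r*) = D_c(r) r*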
 ; TODO: Lazily create the resulting regex
 ; Derivative of the regex r with respect to the character c:
 ; the result matches s exactly when r matches the string c followed by s
 (define (regex-derive c r)
  (cond ((empty-regex? r) (empty-regex))
        ; A literal loses its first character when that character is c.
        ; Characters are compared with char=? because eq? on characters
        ; is not guaranteed to be true for equal characters.
        ((lit-regex? r) (cond ((string-null? (lit-value r)) (empty-regex))
                              ((char=? c (string-ref (lit-value r) 0)) (lit-regex (string-copy (lit-value r) 1)))
                              (else (empty-regex))))
        ((alt-regex? r) (alt-regex (regex-derive c (alt-lhs r)) (regex-derive c (alt-rhs r))))
        ((star-regex? r) (cons-regex (regex-derive c (star-body r)) r))
        ; When the left side is nullable, c may also be consumed by the right side
        ((cons-regex? r) (if (nullable? (cons-lhs r))
                             (alt-regex (cons-regex (regex-derive c (cons-lhs r))
                                                    (cons-rhs r))
                                        (regex-derive c (cons-rhs r)))
                             (cons-regex (regex-derive c (cons-lhs r))
                                         (cons-rhs r))))))
 ; Let us leverage the regex derivative algorithm to define a regex matcher
 (define (match-regex str regex)
  (define (match-regex-list str regex)
    (if (null? str)
        (nullable? regex)
        (match-regex-list (cdr str) (regex-derive (car str) regex))))
  (match-regex-list (string->list str) regex))
 ; As a tiny variation, let us define a grep-like function
 ; The first step is finding where there are matches for the regex in the string
 ; Given a string and a regex we want to find all the matches
 ; A match is a range in the string
 (define-record-type <match> (make-match start end) match? (start match-start) (end match-end))
 ; regex is the regex to match
 ; str the string upon which to match
 ; start is where the candidate match begins, pos is the next index to read,
 ; end is where to stop reading
 ; matches is the list of matches to which we must cons our current result
 ; The search stops at the first position where the derived regex is nullable;
 ; when the input is exhausted without a match, matches is returned unchanged
 (define (find-match regex str start pos end matches)
  (cond ((nullable? regex)
         (cons (make-match start pos) matches))
        ((< pos end)
         (find-match (regex-derive (string-ref str pos) regex)
                     str start (+ pos 1) end matches))
        (else matches)))
 ; Let us represent (a OR b)*cde
 (define my-regex (rgx-cons (rgx-star (rgx-alt "a" "b")) "cde"))
 (match-regex "abababababababcde" my-regex) ; ==> #t
--- a/include/parser.h
+++ b/include/parser.h
@ -0,0 +1,10 @@
 #include <stdio.h>
 #include <stdbool.h>
 struct sexp_list;
 /* Parse every s-expression found on the `in` stream, up to end of file.
 * If the parse is successful, `*out` receives the head of the list of
 * parsed expressions (NULL when the input contains none).
 * On a read error, a syntax error or memory exhaustion the program is aborted.
 */
 void parse_program(FILE *in, struct sexp_list **out);
--- a/include/sexp.h
+++ b/include/sexp.h
@ -0,0 +1,47 @@
 #include <stdbool.h>
 /* Datastructures representing s-exprs */
 struct sexp_list;
 enum sexp_form {
    sexp_bool,
    sexp_int,
    sexp_string,
    sexp_symbol,
    sexp_list,
 };
 /* A single s-expression node. `form` tells which union member is meaningful:
 * bool_lit for sexp_bool, lexeme for sexp_int / sexp_string / sexp_symbol,
 * list for sexp_list.
 */
 struct sexp {
    enum sexp_form form;
    union {
        bool bool_lit;
        /* Integer literals are not directly converted to numbers */
        char *lexeme;
        struct sexp_list *list;
    };
 };
 struct sexp_list {
    struct sexp *elem;
    struct sexp_list *next;
 };
 /* Helper to create a list by appending at its end in O(1) time */
 struct sexp_list_builder {
    struct sexp_list *head;
    struct sexp_list *last;
 };
 /* Append `elem` at the end of the list being built.
 * Returns false if allocation fails
 */
 bool sexp_list_append(struct sexp_list_builder *b, struct sexp *elem);
 /* If allocation fails, these functions return NULL.
 * The `lexeme` and `list` arguments are stored as-is (not copied), and
 * free_sexp later releases them, so a lexeme must come from malloc.
 */
 struct sexp *sexp_make_bool(bool v);
 struct sexp *sexp_make_int(char *lexeme);
 struct sexp *sexp_make_string(char *lexeme);
 struct sexp *sexp_make_symbol(char *lexeme);
 struct sexp *sexp_make_list(struct sexp_list *list);
 /* Release every node of the list together with the elements it holds */
 void free_sexp_list(struct sexp_list *head);
 /* Release an s-expression and everything it owns; NULL is accepted */
 void free_sexp(struct sexp *s);
--- a/include/string_table.h
+++ b/include/string_table.h
@ -0,0 +1,22 @@
 #include <stdbool.h>
 /* Hash table with linear probing implementation.
 * The keys are always strings, the values can be anything.
 * The table does not own any data.
 */
 /* Opaque table type; its layout is private to the implementation file */
 struct string_table;
 /* NOTE(review): no definition of new_table / free_table is visible in
 * src/string_table.c — confirm where they are implemented.
 */
 struct string_table *new_table(void);
 void free_table(struct string_table *t);
 /* Inserts a new entry in the table. If the key is already present, the new value
 * substitutes the old one
 */
 void string_table_insert(struct string_table *t, const char *key, void *value);
 /* Remove, if it's present, the entry for the given key */
 void string_table_remove(struct string_table *t, const char *key);
 /* Lookup a key, returns its value if found, otherwise NULL */
 void *string_table_lookup(struct string_table *t, const char *key);
--- a/include/syntax_checker.h
+++ b/include/syntax_checker.h
@ -0,0 +1,12 @@
 struct sexp;
 struct sexp_list;
 /* The syntax checker is in charge
 * of checking that all syntactic forms are used correctly.
 * TODO: In the future this will be part of the macro expander.
 * 
 * If there is any syntax error, an error message is printed to stderr and the program is aborted.
 */
 void check_syntax(struct sexp *s);
 void check_program(struct sexp_list *prog);
--- a/src/main.c
+++ b/src/main.c
@ -0,0 +1,23 @@
 #include <stdio.h>
 #include "parser.h"
 #include "sexp.h"
 #include "syntax_checker.h"
 /* Entry point: parse the file named on the command line, run the syntax
 * checker on the result and print a success message.
 * Returns 0 on success, 1 on a usage error or when the file cannot be
 * opened. Parse and syntax errors terminate the process in the callees.
 */
 int main(int argc, char *argv[]) {
    if (argc != 2) {
        /* Diagnostics belong on stderr so they don't mix with regular output */
        fprintf(stderr, "Usage %s file.scm\n", argv[0]);
        return 1;
    }
    FILE *in = fopen(argv[1], "r");
    if (!in) {
        fprintf(stderr, "Could not open file %s.\n", argv[1]);
        return 1;
    }
    struct sexp_list *prog = NULL;
    parse_program(in, &prog);
    /* The stream is not needed any more once parsing is over */
    fclose(in);
    check_program(prog);
    free_sexp_list(prog);
    printf("AC Milan campione d'italia 2021/2022.\n");
    return 0;
 }
--- a/src/parser.c
+++ b/src/parser.c
@ -0,0 +1,398 @@
 #include "parser.h"
 #include "sexp.h"
 #include <stdio.h>
 #include <ctype.h>
 #include <stdlib.h>
 #include <stdnoreturn.h>
 #include <stdarg.h>
 #include <stdint.h>
 #include <string.h>
 /*
 * S-expressions grammar
 * 
 * comment := ; followed by any characters up to the next newline
 * space   := [\n\t ]
 * comments and spaces and newlines and tabs are ignored
 *
 * sexpr := list | atom | quoted_expr
 * 
 * quoted_expr := ' sexpr
 * 
 * list := LPAREN sexpr * RPAREN
 * 
 * LPAREN := ( | [ | {
 * RPAREN := ) | ] | }
 * 
 * atom := int_lit | bool_lit | string_lit | ident
 * 
 * int_lit := [0-9]+
 * bool_lit := #t | #f
 * 
 * string_lit := " (string_elem | escaped_elem)* "
 * string_elem := ASCII printable except \ and "
 * escaped_elem := \ [\\nt"]
 * 
 * ident := ident_start ident_cont *
 * special_start := [!$%&*+-/.:<=>?@^_~]
 * ident_start := [a-zA-Z] | special_start
 * ident_cont := ident_start | [0-9]
 * 
 */
 /* Return true when `c` may begin an identifier: an alphabetic character
 * or one of the special_start punctuation characters of the grammar.
 */
 static inline bool is_ident_start(const char c) {
    switch (c) {
    case '!': case '$': case '%':
    case '&': case '*': case '+':
    case '-': case '/': case '.':
    case ':': case '<': case '=':
    case '>': case '?': case '@':
    case '^': case '_': case '~':
        return true;
    default:
        /* <ctype.h> functions need a value representable as unsigned char;
         * a plain char may be negative, so it is converted first
         */
        return isalpha((unsigned char)c) != 0;
    }
 }
 /* Return true when `c` may appear after the first character of an
 * identifier: anything accepted by is_ident_start, or a decimal digit.
 */
 static inline bool is_ident_cont(const char c) {
    /* Convert to unsigned char: isdigit is undefined for negative values */
    return is_ident_start(c) || isdigit((unsigned char)c);
 }
 /* In case of memory shortage or IO failure, the program is brutally aborted. */
 /* In case of syntax errors, the program is brutally aborted.
 * TODO: Try to make the parser more forgiving (error recovery)
 */
 /* Write a printf-style message to stderr, then terminate the process
 * with EXIT_FAILURE. This function never returns.
 */
 static noreturn void abort_program(const char *fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    exit(EXIT_FAILURE);
 }
 /* Terminate the program after printing the generic "Parse error." message */
 static noreturn void parse_error_abort(void) {
    abort_program("Parse error.\n");
 }
 /* Terminate the program after printing the "Out of memory." message */
 static noreturn void out_of_memory_abort(void) {
    abort_program("Out of memory.\n");
 }
 /* State of the recursive descent parser */
 struct parser {
    /* Input stream */
    FILE *in;
    /* Current lookahead symbol. It holds the raw result of fgetc, so it is
     * an int: a char cannot tell EOF apart from a valid byte value
     */
    int curr_char;
    /* Internal buffer, accumulates the characters of the current lexeme */
    char *buf;
    size_t buf_len;
    size_t buf_cap;
    /* Location info */
    size_t line;
    size_t col;
 };
 /* Helper function to report error messages with the attached location */
 static void report_error(struct parser *p, const char *fmt, ...) {
    va_list args;
    fprintf(stderr, "Error at %d:%d ", p->line, p->col);
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);
 }
 /* Advance the lookahead by one character.
 * The location is moved past the character being left behind (a newline
 * starts a new line at column 1), then the next character is read.
 * A stream error aborts the program.
 */
 static inline void consume(struct parser *p) {
    const bool leaving_newline = (p->curr_char == '\n');
    if (!leaving_newline) {
        p->col += 1;
    }
    else {
        p->line += 1;
        p->col = 1;
    }
    p->curr_char = fgetc(p->in);
    if (ferror(p->in)) {
        abort_program("Error while reading from input stream.\n");
    }
 }
 /* Append `c` to the internal lexeme buffer.
 * The buffer is allocated on first use (256 bytes) and doubled whenever
 * it is full. The program is aborted when memory runs out or when the
 * capacity cannot be doubled without overflowing size_t.
 */
 static inline void store_char(struct parser *p, const char c) {
    if (p->buf_cap == 0) {
        /* First use: create the buffer with its initial capacity */
        p->buf_cap = 256;
        p->buf = malloc(p->buf_cap);
        if (!p->buf) {
            out_of_memory_abort();
        }
    }
    else if (p->buf_len > p->buf_cap - 1) {
        /* The buffer is full. Doubling must not overflow, check it first */
        if (p->buf_cap >= SIZE_MAX / 2) {
            report_error(p, "Lexeme is too long.\n");
            abort_program("Buffer overflow, can't allocate more space, lexeme is too large.\n");
        }
        p->buf_cap *= 2;
        p->buf = realloc(p->buf, p->buf_cap);
        if (!p->buf) {
            out_of_memory_abort();
        }
    }
    /* There is room for at least one more character now */
    p->buf[p->buf_len++] = c;
 }
 /* Create a null terminated copy of the internal buffer,
 * and reset the internal buffer, so that it can be used for the next lexeme.
 * The returned string is heap allocated; the program is aborted when the
 * allocation fails.
 */
 static inline char *lexeme_recognized(struct parser *p) {
    char *res = malloc(p->buf_len + 1);
    if (!res) {
        out_of_memory_abort();
    }
    /* An empty lexeme (e.g. "") may be recognized before the buffer has
     * ever been allocated; memcpy must not be handed a NULL source
     */
    if (p->buf_len > 0) {
        memcpy(res, p->buf, p->buf_len);
    }
    res[p->buf_len] = '\0';
    /* Reset internal buffer */
    p->buf_len = 0;
    return res;
 }
 /* Reset internal buffer: the stored characters are discarded,
 * the allocated capacity is kept for the next lexeme
 */
 static inline void reset_buffer(struct parser *p) {
    p->buf_len = 0;
 }
 /* Skip spaces, tabs, newlines and `;` comments, leaving the lookahead on
 * the first significant character (or on EOF).
 */
 static void skip_ws_comments(struct parser *p) {
    bool keep_ignoring = true;
    while (keep_ignoring) {
        switch (p->curr_char) {
        case ' ': case '\n': case '\t':
            consume(p);
            break;
        case ';':
            /* Ignore everything until a newline. The input may end inside
             * the comment, so EOF must stop the loop as well
             */
            while (p->curr_char != '\n' && p->curr_char != EOF) {
                consume(p);
            }
            break;
        default:
            keep_ignoring = false;
        }
    }
 }
 /* Parse an integer literal ([0-9]+). The lookahead must be on its first
 * digit. `*out` receives a sexp_int node holding the digits as text.
 */
 static void parse_int_lit(struct parser *p, struct sexp **out) {
    /* Every digit is stored exactly once, right before it is consumed */
    while (isdigit((unsigned char)p->curr_char)) {
        store_char(p, p->curr_char);
        consume(p);
    }
    *out = sexp_make_int(lexeme_recognized(p));
    if (!*out) out_of_memory_abort();
 }
 /* Parse a boolean literal (#t or #f). The lookahead must be on the `#`.
 * `*out` receives a sexp_bool node; any other character after `#` is a
 * parse error.
 */
 static inline void parse_bool_lit(struct parser *p, struct sexp **out) {
    bool value;
    /* Skip the `#` */
    consume(p);
    switch (p->curr_char) {
    case 't':
        value = true;
        break;
    case 'f':
        value = false;
        break;
    default:
        report_error(p, "Expected bool literal, invalid lexeme.\n");
        parse_error_abort();
    }
    /* The `t` / `f` belongs to the literal: consume it, otherwise it
     * would be read again as the start of an identifier
     */
    consume(p);
    *out = sexp_make_bool(value);
    if (!*out) out_of_memory_abort();
 }
 /* Parse a string literal. The lookahead must be on the opening quote.
 * `*out` receives a sexp_string node whose lexeme holds the contents of the
 * literal with the escape sequences \\ \" \n \t already decoded.
 * Invalid characters, unknown escapes and a missing closing quote are
 * parse errors.
 */
 static void parse_string_lit(struct parser *p, struct sexp **out) {
    /* Skip the opening quote */
    consume(p);
    while (p->curr_char != EOF && p->curr_char != '"') {
        if (p->curr_char == '\\') {
            consume(p);
            /* Read escape sequence, and store the meaning of the escape
             * sequence in the internal buffer
             */
            switch (p->curr_char) {
            case '\\':
                store_char(p, '\\');
                break;
            case '"':
                store_char(p, '"');
                break;
            case 'n':
                store_char(p, '\n');
                break;
            case 't':
                store_char(p, '\t');
                break;
            default:
                report_error(p, "Invalid escape sequence.\n");
                parse_error_abort();
            }
            consume(p);
        }
        /* Convert to unsigned char: isprint is undefined for negative values */
        else if (isprint((unsigned char)p->curr_char)) {
            store_char(p, p->curr_char);
            consume(p);
        }
        else {
            report_error(p, "Invalid character in string literal.\n");
            parse_error_abort();
        }
    }
    if (p->curr_char != '"') {
        report_error(p, "Unexpected end of file. expected closing quote in string literal.\n");
        parse_error_abort();
    }
    /* Skip the closing quote */
    consume(p);
    *out = sexp_make_string(lexeme_recognized(p));
    if (!*out) out_of_memory_abort();
 }
 /* Parse an identifier (ident_start ident_cont*) beginning at the lookahead.
 * `*out` receives a sexp_symbol node; a lookahead that cannot start an
 * identifier is a parse error.
 */
 static void parse_ident(struct parser *p, struct sexp **out) {
    if (!is_ident_start(p->curr_char)) {
        report_error(p, "Invalid character found.\n");
        parse_error_abort();
    }
    /* The first character is already validated, the following ones are
     * accepted as long as they can continue an identifier
     */
    do {
        store_char(p, p->curr_char);
        consume(p);
    } while (is_ident_cont(p->curr_char));
    struct sexp *symbol = sexp_make_symbol(lexeme_recognized(p));
    *out = symbol;
    if (symbol == NULL) out_of_memory_abort();
 }
 static void parse_sexp(struct parser *p, struct sexp **out);
 static void parse_quoted_sexp(struct parser *p, struct sexp **out) {
    consume(p);
    struct sexp *quoted;
    parse_sexp(p, &quoted);
    struct sexp_list_builder quote = { NULL };
    /* Quoted s-expressions are a shortcut for (quote sexp), so we
     * just expand the meaning
     */
    const char quote_kw[] = "quote";
    char *quote_lexeme = malloc(sizeof(quote_kw));
    if (!quote_lexeme) out_of_memory_abort();
    memcpy(quote_lexeme, quote_kw, sizeof(quote_kw));
    struct sexp *quote_symbol = sexp_make_symbol(quote_lexeme);
    if (!quote_symbol) out_of_memory_abort();
    if (!sexp_list_append(&quote, quote_symbol)) out_of_memory_abort();
    if (!sexp_list_append(&quote, quoted)) out_of_memory_abort();
    *out = sexp_make_list(quote.head);
    if (!*out) out_of_memory_abort();
 }
 static void parse_list(struct parser *p, struct sexp **out, char delim) {
    consume(p);
    skip_ws_comments(p);
    struct sexp_list_builder builder = { NULL };
    struct sexp *curr;
    while (p->curr_char != EOF && p->curr_char != delim) {
        parse_sexp(p, &curr);
        if (!sexp_list_append(&builder, curr)) out_of_memory_abort();
        skip_ws_comments(p);
    }
    if (p->curr_char != delim) {
        report_error(p, "Unbalanced parens. Expected closing `%c` in list expression.\n", delim);
        parse_error_abort();
    }
    consume(p);
    *out = sexp_make_list(builder.head);
    if (!*out) out_of_memory_abort();
 }
 /* Parse one s-expression starting at the lookahead and store it in `*out`.
 * The lookahead selects the production: quote, string, list (with any of
 * the three bracket kinds), bool literal, int literal, or identifier.
 */
 static void parse_sexp(struct parser *p, struct sexp **out) {
    switch (p->curr_char) {
    case EOF:
        report_error(p, "Expected s-expression, found end of file.\n");
        parse_error_abort();
        break;
    case '\'':
        parse_quoted_sexp(p, out);
        break;
    case '"':
        parse_string_lit(p, out);
        break;
    /* Each opening bracket must be closed by the bracket of the same kind */
    case '(':
        parse_list(p, out, ')');
        break;
    case '[':
        parse_list(p, out, ']');
        break;
    case '{':
        parse_list(p, out, '}');
        break;
    case '#':
        parse_bool_lit(p, out);
        break;
    case '0': case '1': case '2':
    case '3': case '4': case '5':
    case '6': case '7': case '8':
    case '9':
        parse_int_lit(p, out);
        break;
    default:
        parse_ident(p, out);
        break;
    }
 }
 /* Parse every s-expression of the `in` stream until end of file.
 * `*out` receives the head of the resulting list (NULL for an empty input).
 * Read errors, syntax errors and memory exhaustion abort the program.
 */
 void parse_program(FILE *in, struct sexp_list **out) {
    /* fgetc returns an int so that EOF differs from every byte value:
     * keep the first lookahead in an int instead of narrowing it to char
     */
    int first_char = fgetc(in);
    if (ferror(in)) {
        abort_program("Error while reading from input stream.\n");
    }
    struct parser p = {
        .in = in,
        .curr_char = first_char,
        .buf = NULL,
        .buf_len = 0,
        .buf_cap = 0,
        .line = 1,
        .col = 1,
    };
    skip_ws_comments(&p);
    struct sexp_list_builder builder = { NULL };
    struct sexp *s;
    while (p.curr_char != EOF) {
        parse_sexp(&p, &s);
        if (!sexp_list_append(&builder, s))
            out_of_memory_abort();
        skip_ws_comments(&p);
    }
    *out = builder.head;
    /* Free internal buffer, before quitting */
    free(p.buf);
 }
--- a/src/sexp.c
+++ b/src/sexp.c
@ -0,0 +1,96 @@
 #include "sexp.h"
 #include <stdlib.h>
 /* Append `elem` at the tail of the list tracked by the builder `b`.
 * Returns false, leaving the builder untouched, when allocation fails.
 */
 bool sexp_list_append(struct sexp_list_builder *b, struct sexp *elem) {
    struct sexp_list *cell = malloc(sizeof *cell);
    if (cell == NULL) {
        return false;
    }
    cell->elem = elem;
    cell->next = NULL;
    /* An empty builder gets its head, otherwise the old tail is linked */
    if (b->last != NULL) {
        b->last->next = cell;
    }
    else {
        b->head = cell;
    }
    b->last = cell;
    return true;
 }
 /* Allocate a sexp_bool node holding `v`; returns NULL if allocation fails */
 struct sexp *sexp_make_bool(bool v) {
    struct sexp *node = malloc(sizeof *node);
    if (node != NULL) {
        node->form = sexp_bool;
        node->bool_lit = v;
    }
    return node;
 }
 /* Allocate a sexp_int node storing the `lexeme` pointer (not a copy);
 * returns NULL if allocation fails
 */
 struct sexp *sexp_make_int(char *lexeme) {
    struct sexp *node = malloc(sizeof *node);
    if (node != NULL) {
        node->form = sexp_int;
        node->lexeme = lexeme;
    }
    return node;
 }
 /* Allocate a sexp_string node storing the `lexeme` pointer (not a copy);
 * returns NULL if allocation fails
 */
 struct sexp *sexp_make_string(char *lexeme) {
    struct sexp *node = malloc(sizeof *node);
    if (node != NULL) {
        node->form = sexp_string;
        node->lexeme = lexeme;
    }
    return node;
 }
 /* Allocate a sexp_symbol node storing the `lexeme` pointer (not a copy);
 * returns NULL if allocation fails
 */
 struct sexp *sexp_make_symbol(char *lexeme) {
    struct sexp *node = malloc(sizeof *node);
    if (node != NULL) {
        node->form = sexp_symbol;
        node->lexeme = lexeme;
    }
    return node;
 }
 /* Allocate a sexp_list node wrapping `list` (which may be NULL);
 * returns NULL if allocation fails
 */
 struct sexp *sexp_make_list(struct sexp_list *list) {
    struct sexp *node = malloc(sizeof *node);
    if (node != NULL) {
        node->form = sexp_list;
        node->list = list;
    }
    return node;
 }
 /* Release every cell of the list together with the element it holds */
 void free_sexp_list(struct sexp_list *head) {
    struct sexp_list *following;
    for (struct sexp_list *cell = head; cell != NULL; cell = following) {
        /* Read the link before the cell is released */
        following = cell->next;
        free_sexp(cell->elem);
        free(cell);
    }
 }
 /* Release an s-expression and the data it holds: the lexeme of int,
 * string and symbol nodes, the whole list of list nodes. NULL is accepted.
 */
 void free_sexp(struct sexp *s) {
    if (s == NULL) {
        return;
    }
    switch (s->form) {
    case sexp_int:
    case sexp_string:
    case sexp_symbol:
        /* These three forms all keep their text in `lexeme` */
        free(s->lexeme);
        break;
    case sexp_list:
        free_sexp_list(s->list);
        break;
    case sexp_bool:
        /* Nothing besides the node itself */
        break;
    }
    free(s);
 }
--- a/src/string_table.c
+++ b/src/string_table.c
@ -0,0 +1,97 @@
 #include "string_table.h"
 #include <stdlib.h>
 #include <string.h>
 /* For now we use the simple djb2 hashing algorithm,
 * but TODO: implement SipHash-2-4
 *
 * Returns the hash of the null terminated string `str`.
 */
 static inline size_t hash_string(const char *str) {
    size_t hash = 5381;
    for (; *str != 0; ++str) {
        /* Read the byte as unsigned char: a negative plain char would be
         * sign extended and make the hash depend on the platform
         */
        hash = hash * 33 + (unsigned char)*str;
    }
    return hash;
 }
 /* One slot of the table: a key/value pair plus an occupancy flag */
 struct bucket {
    /* This flag distinguishes between empty
     * and used buckets
     */
    bool used;
    /* Key pointer as given to string_table_insert; it is stored, not copied */
    const char *key;
    void *value;
 };
 /* The table itself: an array of `capacity` buckets */
 struct string_table {
    struct bucket *entries;
    /* Number of buckets in `entries` */
    size_t capacity;
    /* Elements count in the table */
    size_t size;
 };
 /* Grow the bucket array and move every used bucket into the new array.
 * The capacity is doubled (an empty table gets a small initial capacity).
 * The process is terminated with abort() when the new array cannot be
 * allocated, since the function has no way to report the failure.
 */
 static void rehash(struct string_table *t) {
    /* Doubling must not wrap around */
    if (t->capacity > (size_t)-1 / 2) {
        abort();
    }
    /* We double the size, but what's a better strategy? */
    size_t new_capacity = t->capacity != 0 ? t->capacity * 2 : 8;
    struct bucket *new_entries = calloc(new_capacity, sizeof *new_entries);
    if (!new_entries) {
        abort();
    }
    for (size_t i = 0; i < t->capacity; ++i) {
        if (!t->entries[i].used) {
            continue;
        }
        /* Two keys may land on the same slot of the new array as well:
         * probe linearly (wrapping at the end) for a free slot instead of
         * overwriting the bucket already stored there
         */
        size_t new_pos = hash_string(t->entries[i].key) % new_capacity;
        while (new_entries[new_pos].used) {
            new_pos = (new_pos + 1) % new_capacity;
        }
        new_entries[new_pos] = t->entries[i];
    }
    free(t->entries);
    t->entries = new_entries;
    t->capacity = new_capacity;
 }
 /* Insert the pair (key, value) in the table. When the key is already
 * present its bucket is updated in place and the element count does not
 * change. The key pointer is stored, not copied.
 * NOTE(review): assumes t->capacity > 0 — confirm against new_table.
 */
 void string_table_insert(struct string_table *t, const char *key, void *value) {
    /* First check if we need to rehash, we do this
     * when the table is full at 75%, this should prevent
     * some collisions
     */
    if (t->size * 4 > t->capacity * 3) {
        rehash(t);
    }
    size_t pos = hash_string(key) % t->capacity;
    while (t->entries[pos].used) {
        if (!strcmp(key, t->entries[pos].key)) {
            /* If the key is already been inserted,
             * just update its bucket: this is not a new element,
             * so the size must stay the same
             */
            t->entries[pos].key = key;
            t->entries[pos].value = value;
            return;
        }
        /* Wrap around at the end of the array instead of running past it */
        pos = (pos + 1) % t->capacity;
    }
    /* The load factor check above leaves at least one free bucket,
     * so the probe always stops on an empty position
     */
    t->entries[pos] = (struct bucket) { .key = key, .value = value, .used = true };
    ++t->size;
 }
 void string_table_remove(struct string_table *t, const char *key) {
    size_t pos = hash_string(key) % t->capacity;
    while (t->entries[pos].used) {
        if (!strcmp(key, t->entries[pos].key)) {
            /* Remove this bucket */
            t->entries[pos].used = false;
            break;
        }
        ++pos;
    }
 }
 /* Lookup a key, returns its value if found, otherwise NULL */
 void *string_table_lookup(struct string_table *t, const char *key) {
    /* An empty table holds nothing (and would make the modulo below invalid) */
    if (t->capacity == 0) {
        return NULL;
    }
    size_t pos = hash_string(key) % t->capacity;
    /* Probe linearly, wrapping at the end of the array. The search stops
     * on the first empty bucket, or after every bucket has been visited.
     */
    for (size_t probes = 0; probes < t->capacity && t->entries[pos].used; ++probes) {
        if (!strcmp(key, t->entries[pos].key)) {
            return t->entries[pos].value;
        }
        pos = (pos + 1) % t->capacity;
    }
    return NULL;
 }
--- a/src/syntax_checker.c
+++ b/src/syntax_checker.c
@ -0,0 +1,251 @@
 #include "syntax_checker.h"
 #include "sexp.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdbool.h>
 #include <stdarg.h>
 #include <stdnoreturn.h>
 /* Small DSL to perform pattern matching on a list */
 struct list_checker {
    /* Name of the syntactic form being checked, used in error messages */
    const char *syntax_form;
    /* Cursor inside the list; NULL once the end of the list is reached */
    struct sexp_list *curr;
 };
 /* TODO: Make error messages more informative */
 /* Write a printf-style message to stderr, then terminate the process
 * with EXIT_FAILURE. This function never returns.
 */
 static noreturn void report_check_error(const char *fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    exit(EXIT_FAILURE);
 }
 /* Human readable description of an s-expression form, for error messages */
 static const char *syn_form_descr(enum sexp_form f) {
    switch (f) {
    case sexp_bool:
        return "bool literal";
    case sexp_int:
        return "int literal";
    case sexp_string:
        return "string literal";
    case sexp_symbol:
        return "symbol";
    case sexp_list:
        return "list";
    }
    /* Only reached with a value outside the enumeration; without this
     * return the function would fall off its end
     */
    return "unknown form";
 }
 /* Form of the element under the cursor.
 * NOTE(review): c->curr is dereferenced without a NULL check — callers
 * must make sure the list has not ended.
 */
 static inline enum sexp_form curr_form(struct list_checker *c) {
    return c->curr->elem->form;
 }
 /* Element under the cursor.
 * NOTE(review): c->curr is dereferenced without a NULL check — callers
 * must make sure the list has not ended.
 */
 static inline struct sexp *curr_elem(struct list_checker *c) {
    return c->curr->elem;
 }
 /* Move the cursor to the next element of the list.
 * NOTE(review): c->curr is dereferenced without a NULL check — callers
 * must make sure the list has not ended.
 */
 static inline void consume(struct list_checker *c) {
    c->curr = c->curr->next;
 }
 /* Require the element under the cursor to have the given form, then move
 * past it. A list that has already ended, or an element of a different
 * form, is reported as a syntax error (the program is terminated).
 */
 static inline void expect_form(struct list_checker *c, enum sexp_form form) {
    /* The cursor is NULL at the end of the list: report it rather than
     * dereferencing it
     */
    if (c->curr == NULL) {
        report_check_error("Syntax form %s: Too few arguments, unexpected end of list.\n", c->syntax_form);
    }
    if (c->curr->elem->form != form) {
        report_check_error("Syntax form %s: Expected `%s` but got `%s` inside list.\n", c->syntax_form, syn_form_descr(form), syn_form_descr(c->curr->elem->form));
    }
    consume(c);
 }
 /* Require one more element of any form, then move past it.
 * Reaching the end of the list instead is reported as a syntax error.
 */
 static inline void expect_any(struct list_checker *c) {
    if (!has_more_elems(c)) {
        report_check_error("Syntax form %s: Too few arguments, unexpected end of list.\n", c->syntax_form);
    }
    c->curr = c->curr->next;
 }
 /* Require the list to be finished; a leftover element is a syntax error */
 static inline void expect_list_end(struct list_checker *c) {
    if (c->curr == NULL) {
        return;
    }
    report_check_error("Syntax form %s: Too many arguments, expected end of list.\n", c->syntax_form);
 }
 /* True while the cursor has not reached the end of the list */
 static inline bool has_more_elems(struct list_checker *c) {
    return c->curr != NULL;
 }
 /* Syntax Checkers */
 typedef void (*syntax_checker)(struct list_checker *c);
 /* Check the arguments of a `begin` form; `c` starts right after the keyword */
 void begin_syntax_checker(struct list_checker *c) {
    /* (begin expr expr...) */
    /* We require at least one expr in the begin body */
    expect_any(c);
 }
 /* Check the (name formals...) part of a function definition.
 * `formals` must be a list s-expression. Any violation is reported as a
 * syntax error, which terminates the program.
 */
 void check_define_with_formals(struct sexp *formals) {
    /* (ident formals)
     * (ident . formal)
     */
    struct list_checker c = { "define formals", formals->list };
    /* The name of the function is mandatory */
    if (!has_more_elems(&c)) {
        report_check_error("Syntax form %s: Too few arguments, unexpected end of list.\n", c.syntax_form);
    }
    expect_form(&c, sexp_symbol);
    /* The list may end right after the name, as in (define (f) ...):
     * only look at the next element when there is one
     */
    if (has_more_elems(&c) && curr_form(&c) == sexp_symbol && !strcmp(curr_elem(&c)->lexeme, ".")) {
        consume(&c);
        /* The dot must be followed by exactly one symbol */
        if (!has_more_elems(&c)) {
            report_check_error("Syntax form %s: Too few arguments, unexpected end of list.\n", c.syntax_form);
        }
        expect_form(&c, sexp_symbol);
        expect_list_end(&c);
    }
    else {
        while (has_more_elems(&c)) {
            expect_form(&c, sexp_symbol);
        }
    }
 }
 void define_syntax_checker(struct list_checker *c) {
    /* (define ident expr expr...)
     * (define (ident formals) expr expr...)
     * (define (ident . formal) expr expr...)
     */
    switch (curr_form(c)) {
    case sexp_symbol:
        consume(c);
        /* At least one expr */
        expect_any(c);
        break;
    case sexp_list:
        check_define_with_formals(curr_elem(c));
        consume(c);
        /* At least one expr */
        expect_any(c);
        break;
    default:
        report_check_error("Invalid define form, expected either symbol or list, but got `%s`\n", syn_form_descr(curr_form(c)));
    }
 }
 /* Check the arguments of an `if` form: exactly three expressions
 * (condition, then branch, else branch)
 */
 void if_syntax_checker(struct list_checker *c) {
    /* (if expr expr expr) */
    expect_any(c);
    expect_any(c);
    expect_any(c);
    expect_list_end(c);
 }
 /* Check the formals list of a lambda. `l` must be a list s-expression.
 * Every element must be a symbol; a `.` may appear only after at least
 * one parameter and must be followed by exactly one symbol.
 * Any violation is reported as a syntax error, which terminates the program.
 */
 void check_lambda_formals(struct sexp *l) {
    /*
     * (ident ...)
     * (ident ident ... . ident)
     */
    struct list_checker c = { "lambda formals", l->list };
    /* An empty formals list is fine */
    if (!has_more_elems(&c)) {
        return;
    }
    if (curr_form(&c) != sexp_symbol) report_check_error("Expected symbol in lambda formal.\n");
    /* First element can't be a dot */
    if (!strcmp(curr_elem(&c)->lexeme, ".")) report_check_error("Expected at least one param before `.` in lambda formal.\n");
    consume(&c);
    while (has_more_elems(&c)) {
        /* All elements of the list must be symbols */
        if (curr_form(&c) != sexp_symbol) report_check_error("Invalid formal. All formal parameters must be symbols.\n");
        /* If the current element is a dot (strcmp returns 0 on equality),
         * then it must be followed by exactly one variable
         */
        if (!strcmp(curr_elem(&c)->lexeme, ".")) {
            consume(&c);
            if (!has_more_elems(&c)) {
                report_check_error("Syntax form %s: Too few arguments, unexpected end of list.\n", c.syntax_form);
            }
            expect_form(&c, sexp_symbol);
            expect_list_end(&c);
            break;
        }
        consume(&c);
    }
 }
 void lambda_syntax_checker(struct list_checker *c) {
    /* (lambda ident exp exp...)
     * (lambda (formals) exp exp...)
     */
    switch (curr_form(c)) {
    case sexp_symbol:
        consume(c);
        break;
    case sexp_list:
        check_lambda_formals(curr_elem(c));
        consume(c);
        break;
    }
    /* At least one more element */
    expect_any(c);
 }
 /* Check the bindings list of a let / letrec. `s` must be a list
 * s-expression; every element must be a two element list (name exp).
 * Any violation is reported as a syntax error, which terminates the program.
 */
 void check_let_defs(struct sexp *s) {
    /* (name exp) */
    struct list_checker c = { "let definition list", s->list };
    struct list_checker def;
    while (has_more_elems(&c)) {
        if (curr_form(&c) != sexp_list) report_check_error("Let definition must be a list.\n");
        def = (struct list_checker) { "let definition", curr_elem(&c)->list };
        /* An empty binding () has no name to inspect */
        if (!has_more_elems(&def)) {
            report_check_error("Syntax form %s: Too few arguments, unexpected end of list.\n", def.syntax_form);
        }
        expect_form(&def, sexp_symbol);
        expect_any(&def);
        expect_list_end(&def);
        consume(&c);
    }
 }
 void let_syntax_checker(struct list_checker *c) {
    /* (let (def ..) exp exp..) */
    if (curr_form(c) != sexp_list) report_check_error("Let definitions must be in a list.\n");
    check_let_defs(curr_elem(c));
    /* At least one more element */
    expect_any(c);
 }
 /* Check the arguments of a `quote` form: exactly one expression */
 void quote_syntax_checker(struct list_checker *c) {
    /* (quote exp) */
    expect_any(c);
    expect_list_end(c);
 }
 /* Associates the keyword of a syntactic form with its checker */
 struct syntax {
    /* Keyword found at the head of the form */
    const char *name;
    /* Function validating the elements that follow the keyword */
    syntax_checker checker;
 };
 /* This must be alpha sorted with respect to syntax name to allow binary search.
 * The order must agree with strcmp, which is what form_cmp uses.
 */
 struct syntax syntaxes[] = {
    { "begin", begin_syntax_checker },
    { "define", define_syntax_checker },
    { "if", if_syntax_checker },
    { "lambda", lambda_syntax_checker },
    { "let", let_syntax_checker },
    /* let and letrec share the same syntax */
    { "letrec", let_syntax_checker },
    { "quote", quote_syntax_checker },
 };
 /* bsearch comparator: `lhs` is the searched keyword (a C string),
 * `rhs` points to an entry of the syntaxes table
 */
 int form_cmp(const void *lhs, const void *rhs) {
    const char *key = lhs;
    const struct syntax *syn = rhs;
    return strcmp(key, syn->name);
 }
 /* Check one s-expression: when it is a non empty list whose head is the
 * keyword of a known syntactic form, the matching checker is run on the
 * elements following the keyword. Anything else is accepted as is.
 * Nested expressions are not visited.
 */
 void check_sexp(struct sexp *s) {
    if (s->form != sexp_list || s->list == NULL) {
        return;
    }
    struct sexp *head = s->list->elem;
    if (head->form != sexp_symbol) {
        return;
    }
    const char *form = head->lexeme;
    struct syntax *res = bsearch(form,
                                 syntaxes,
                                 sizeof(syntaxes)/sizeof(syntaxes[0]),
                                 sizeof(struct syntax),
                                 form_cmp);
    if (res) {
        struct list_checker c = { form, s->list->next };
        res->checker(&c);
    }
 }
 /* Entry point declared in syntax_checker.h, which had no definition:
 * it checks a single s-expression
 */
 void check_syntax(struct sexp *s) {
    check_sexp(s);
 }
 /* Run the syntax check on every top level expression of the program */
 void check_program(struct sexp_list *l) {
    for (struct sexp_list *cell = l; cell != NULL; cell = cell->next) {
        check_sexp(cell->elem);
    }
 }
Author	SHA1	Message	Date
Francesco Magliocca	4dffe1d7b4	Implement parser	2022-05-25 13:41:41 +02:00
Francesco Magliocca	a57e36e6ea	Update readme	2022-05-25 13:41:29 +02:00