#include #include #include "lex.h" #include "arena.h" #include "strio.h" void lex_start(Lexer *l, const char *path) { l->ofs = 0; l->filename = str_dup(str_from_cstr(path), &l->arena); FILE *f = fopen(path, "r/o"); if (!f) { fprintf(stderr, "Couldn't open file %s\n", path); exit(1); } if (read_all(f, &l->buf, &l->arena)) { fprintf(stderr, "Couldn't read file %s\n", path); fclose(f); exit(1); } lex_next(l); } void lex_free(Lexer *l) { arena_free(&l->arena); } Str lex_mask_str(Lexer *l, TokMask t) { Str s = S(""); for (Token i = 0; i < TOK_MAX; i++) { if (t & TMASK(i)) { if (s.n > 0) str_cat(&s, S(" or "), &l->arena); str_cat(&s, str_from_cstr(lex_tok_str[i]), &l->arena); } } return s; } void lex_expect(Lexer *l, TokMask t) { lex_next(l); lex_expected(l, t); } void lex_expect_not(Lexer *l, TokMask t) { lex_next(l); lex_expected_not(l, t); } void lex_expected(Lexer *l, TokMask t) { if (!(TMASK(l->tok) & t)) { lex_error(l, LE_ERROR, str_fmt(&l->arena, "expected %S but got %s", lex_mask_str(l, t), lex_tok_str[l->tok])); } } void lex_expected_not(Lexer *l, TokMask t) { if (TMASK(l->tok) & t) { lex_error(l, LE_ERROR, str_fmt(&l->arena, "unexpected %s", lex_tok_str[l->tok])); } } /* should only be called in the event of errors, so probably not * much of an issue that this has to scan the whole file */ void lex_pos(Lexer *l, int ofs, int *line, int *col) { int ln = 0, c = 0; for (int i = 0; i < ofs; i++) { if (l->buf.s[i] == '\n') { ln++; c = 0; } else { c++; if (l->buf.s[i] == '\t') { c += (unsigned)-c & 7; } } } *line = ln; *col = c; } void lex_error(Lexer *l, LexErr e, Str msg) { lex_error_at(l, l->pos, e, msg); } void lex_err_color(LexErr e) { fprintf(stderr, "%s", e == LE_NONE ? "\x1b[0m" : (e == LE_ERROR ? "\x1b[1;31m" : "\x1b[1;33m")); } void lex_error_at(Lexer *l, LexSpan pos, LexErr e, Str msg) { int line, col; lex_pos(l, pos.ofs, &line, &col); fprintf(stderr, "\x1b[1m%.*s:%d:%d:\x1b[0m ", (int)l->filename.n, l->filename.s, line + 1, col + 1); lex_err_color(e); if (e != LE_NONE) fprintf(stderr, "%s", e == LE_ERROR ? "error" : "warn"); lex_err_color(LE_NONE); fprintf(stderr, ": %.*s\n", (int)msg.n, msg.s); { int ofs = pos.ofs; int line_start = ofs; while (line_start > 0 && l->buf.s[line_start - 1] != '\n') line_start--; int line_end = line_start; while (line_end < l->buf.n && l->buf.s[line_end] != '\n') line_end++; lex_err_color(e); fprintf(stderr, "%.*s\n", line_end - line_start, &l->buf.s[line_start]); for (int i = 0; i < col; i++) fputc(' ', stderr); fputc('^', stderr); for (int i = ofs + 1; i < ofs + pos.n && i < line_end; i++) fputc('~', stderr); lex_err_color(LE_NONE); fputc('\n', stderr); } if (e == LE_ERROR) { exit(1); } } static inline int is_digit(int c) { return c >= '0' && c <= '9'; } static inline int is_ident_first_char(int c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; } static inline int is_ident_next_char(int c) { return is_ident_first_char(c) || is_digit(c); } Token ident_to_keyword(Str ident) { /* evil & stupid hack to avoid keeping a separate table of keywords */ for (Token t = 0; t < TOK_MAX; t++) { if (str_eql(str_from_cstr(lex_tok_str[t]), ident)) { return t; } } return TOK_IDENT; } #define T(t) (l->tok = t) void lex_next(Lexer *l) { recurse: int i = l->ofs; while (i < l->buf.n && is_space(l->buf.s[i])) { i++; } if (str_starts(str_skip(l->buf, i), S("/*"))) { Str f = str_find(str_skip(l->buf, i), S("*/")); l->ofs = (f.s - l->buf.s) + 2; goto recurse; } if (str_starts(str_skip(l->buf, i), S("//"))) { Str f = str_findc(str_skip(l->buf, i), '\n'); l->ofs = f.s - l->buf.s; goto recurse; } int start_ofs = i; l->ident = (Str) { &l->buf.s[start_ofs], 0 }; l->pos = (LexSpan) { start_ofs, 0 }; if (i >= l->buf.n) { l->tok = TOK_EOF; return; } char c = l->buf.s[i++]; l->tok = TOK_MAX; if (is_ident_first_char(c)) { T(TOK_IDENT); while (i < l->buf.n && is_ident_next_char(l->buf.s[i])) i++; } else if (is_digit(c)) { T(TOK_LIT_NUM); while (i < l->buf.n && (is_digit(l->buf.s[i]) || l->buf.s[i] == '.' || l->buf.s[i] == 'e')) i++; } else { switch (c) { #define X(a,b) case b: T(a); break; LEX_TOK_CHAR_LIST #undef X case '\'': T(TOK_LIT_CHAR); if (i < l->buf.n && l->buf.s[i] == '\\') i += 2; else i++; if (i >= l->buf.n) lex_error(l, LE_ERROR, S("unterminated character literal")); if (l->buf.s[i] != '\'') lex_error(l, LE_ERROR, S("overlong character literal")); i++; break; case '"': T(TOK_LIT_STR); for (;;) { if (i >= l->buf.n) { lex_error(l, LE_ERROR, S("unterminated string literal")); } if (l->buf.s[i] == '\\') { i += 2; continue; } if (l->buf.s[i++] == '"') break; } break; case '<': switch (l->buf.s[i]) { case '=': T(TOK_LTE); i++; break; case '>': T(TOK_NEQ); i++; break; case '<': T(TOK_SHL); i++; break; default: T(TOK_LES); break; } break; case '>': switch (l->buf.s[i]) { case '=': T(TOK_GTE); i++; break; case '>': T(TOK_SHR); i++; break; default: T(TOK_GTR); break; } break; } } if (l->tok == TOK_MAX) { lex_error(l, LE_ERROR, S("parse error")); } l->ident.n = i - start_ofs; l->pos.n = i - start_ofs; l->ofs = i; if (l->tok == TOK_IDENT) { l->tok = ident_to_keyword(l->ident); } }