diff options
| author | WormHeamer | 2025-07-31 22:37:38 -0400 |
|---|---|---|
| committer | WormHeamer | 2025-07-31 22:37:38 -0400 |
| commit | 842e22e9eb0f3dff7dabdaa41bcc2133e8f015f5 (patch) | |
| tree | 78f42e68da656698526ff6099e78d82adab1d582 /lex.c | |
initial commit
Diffstat (limited to 'lex.c')
| -rw-r--r-- | lex.c | 184 |
1 files changed, 184 insertions, 0 deletions
@@ -0,0 +1,184 @@ +#include <stdlib.h> +#include <stdio.h> + +#include "lex.h" +#include "arena.h" +#include "strio.h" + +void lex_start(Lexer *l, const char *path) { + l->filename = str_dup(str_from_cstr(path), &l->arena); + FILE *f = fopen(path, "r/o"); + if (!f) { + fprintf(stderr, "Couldn't open file %s\n", path); + exit(1); + } + if (read_all(f, &l->buf, &l->arena)) { + fprintf(stderr, "Couldn't read file %s\n", path); + fclose(f); + exit(1); + } + lex_next(l); +} + +void lex_free(Lexer *l) { + arena_free(&l->arena); +} + +Str lex_mask_str(Lexer *l, TokMask t) { + Str s = S(""); + for (Token i = 0; i < TOK_MAX; i++) { + if (t & TMASK(i)) { + if (s.n > 0) str_cat(&s, S(" or "), &l->arena); + str_cat(&s, str_from_cstr(lex_tok_str[i]), &l->arena); + } + } + return s; +} + +void lex_expect(Lexer *l, TokMask t) { + lex_next(l); + lex_expected(l, t); +} + +void lex_expect_not(Lexer *l, TokMask t) { + lex_next(l); + lex_expected_not(l, t); +} + +void lex_expected(Lexer *l, TokMask t) { + if (!(TMASK(l->tok) & t)) { + lex_error(l, LE_ERROR, str_fmt(&l->arena, "Expected %S but got %s", lex_mask_str(l, t), lex_tok_str[l->tok])); + } +} + +void lex_expected_not(Lexer *l, TokMask t) { + if (TMASK(l->tok) & t) { + lex_error(l, LE_ERROR, str_fmt(&l->arena, "Unexpected %s", lex_tok_str[l->tok])); + } +} + +void lex_pos(Lexer *l, int *line, int *col) { + int ln = 0, c = 0; + for (int i = 0; i < l->ofs; i++) { + if (l->buf.s[i] == '\n') { + ln++; + c = 0; + } else { + c++; + if (l->buf.s[i] == '\t') { + c += (unsigned)-c & 7; + } + } + } + *line = ln; + *col = c; +} + +void lex_error(Lexer *l, LexErr e, Str msg) { + int line, col; + l->ofs -= l->ident.n; + lex_pos(l, &line, &col); + + fprintf(stderr, "%s", e == LE_ERROR ? "\x1b[1;31m" : "\x1b[1;33m"); + + fprintf(stderr, "%.*s:%d:%d: %.*s\n\n", + (int)l->filename.n, l->filename.s, line + 1, col + 1, (int)msg.n, msg.s); + + { + int ofs = l->ofs; + int line_start = ofs; + while (line_start > 0 && l->buf.s[line_start - 1] != '\n') line_start--; + int line_end = line_start; + while (line_end < l->buf.n && l->buf.s[line_end] != '\n') line_end++; + fprintf(stderr, "%.*s\n", line_end - line_start, &l->buf.s[line_start]); + for (int i = 0; i < col; i++) putchar(' '); + for (int i = ofs; i < ofs + l->ident.n && i < line_end; i++) putchar('^'); + putchar('\n'); + } + + fprintf(stderr, "\x1b[0m\n"); + + if (e == LE_ERROR) { + exit(1); + } +} + +static inline int is_digit(int c) { + return c >= '0' && c <= '9'; +} + +static inline int is_ident_first_char(int c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; +} + +static inline int is_ident_next_char(int c) { + return is_ident_first_char(c) || is_digit(c); +} + +Token ident_to_keyword(Str ident) { + /* evil & stupid hack to avoid keeping a separate table of keywords */ + for (Token t = 0; t < TOK_MAX; t++) { + if (str_eql(str_from_cstr(lex_tok_str[t]), ident)) { + return t; + } + } + return TOK_IDENT; +} + +#define T(t) (l->tok = t) +void lex_next(Lexer *l) { + int i = l->ofs; + while (i < l->buf.n && is_space(l->buf.s[i])) { + i++; + } + int start_ofs = i; + l->ident = (Str) { &l->buf.s[start_ofs], 0 }; + if (i >= l->buf.n) { + l->tok = TOK_EOF; + return; + } + char c = l->buf.s[i++]; + l->tok = TOK_MAX; + if (is_ident_first_char(c)) { + T(TOK_IDENT); + while (i < l->buf.n && is_ident_next_char(l->buf.s[i])) i++; + } else if (is_digit(c)) { + T(TOK_LIT_NUM); + while (i < l->buf.n && (is_digit(l->buf.s[i]) || l->buf.s[i] == '.' || l->buf.s[i] == 'e')) i++; + } else { + switch (c) { +#define X(a,b) case b: T(a); break; + LEX_TOK_CHAR_LIST +#undef X + case '\'': + T(TOK_LIT_CHAR); + if (i < l->buf.n && l->buf.s[i] == '\\') i += 2; + else i++; + if (i >= l->buf.n) lex_error(l, LE_ERROR, S("Unterminated character literal")); + if (l->buf.s[i] != '\'') lex_error(l, LE_ERROR, S("Overlong character literal")); + i++; + break; + case '"': + T(TOK_LIT_STR); + for (;;) { + if (i >= l->buf.n) { + lex_error(l, LE_ERROR, S("Unterminated string literal")); + } + if (l->buf.s[i] == '\\') { + i += 2; + continue; + } + if (l->buf.s[i++] == '"') break; + } + break; + } + } + if (l->tok == TOK_MAX) { + lex_error(l, LE_ERROR, S("Invalid token")); + } + l->ident.n = i - start_ofs; + l->ofs = i; + if (l->tok == TOK_IDENT) { + l->tok = ident_to_keyword(l->ident); + } +} |
