summaryrefslogtreecommitdiff
path: root/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'lex.c')
-rw-r--r--lex.c184
1 files changed, 184 insertions, 0 deletions
diff --git a/lex.c b/lex.c
new file mode 100644
index 0000000..7a1d345
--- /dev/null
+++ b/lex.c
@@ -0,0 +1,184 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "lex.h"
+#include "arena.h"
+#include "strio.h"
+
+void lex_start(Lexer *l, const char *path) {
+ l->filename = str_dup(str_from_cstr(path), &l->arena);
+ FILE *f = fopen(path, "r/o");
+ if (!f) {
+ fprintf(stderr, "Couldn't open file %s\n", path);
+ exit(1);
+ }
+ if (read_all(f, &l->buf, &l->arena)) {
+ fprintf(stderr, "Couldn't read file %s\n", path);
+ fclose(f);
+ exit(1);
+ }
+ lex_next(l);
+}
+
+void lex_free(Lexer *l) {
+ arena_free(&l->arena);
+}
+
+Str lex_mask_str(Lexer *l, TokMask t) {
+ Str s = S("");
+ for (Token i = 0; i < TOK_MAX; i++) {
+ if (t & TMASK(i)) {
+ if (s.n > 0) str_cat(&s, S(" or "), &l->arena);
+ str_cat(&s, str_from_cstr(lex_tok_str[i]), &l->arena);
+ }
+ }
+ return s;
+}
+
+void lex_expect(Lexer *l, TokMask t) {
+ lex_next(l);
+ lex_expected(l, t);
+}
+
+void lex_expect_not(Lexer *l, TokMask t) {
+ lex_next(l);
+ lex_expected_not(l, t);
+}
+
+void lex_expected(Lexer *l, TokMask t) {
+ if (!(TMASK(l->tok) & t)) {
+ lex_error(l, LE_ERROR, str_fmt(&l->arena, "Expected %S but got %s", lex_mask_str(l, t), lex_tok_str[l->tok]));
+ }
+}
+
+void lex_expected_not(Lexer *l, TokMask t) {
+ if (TMASK(l->tok) & t) {
+ lex_error(l, LE_ERROR, str_fmt(&l->arena, "Unexpected %s", lex_tok_str[l->tok]));
+ }
+}
+
+void lex_pos(Lexer *l, int *line, int *col) {
+ int ln = 0, c = 0;
+ for (int i = 0; i < l->ofs; i++) {
+ if (l->buf.s[i] == '\n') {
+ ln++;
+ c = 0;
+ } else {
+ c++;
+ if (l->buf.s[i] == '\t') {
+ c += (unsigned)-c & 7;
+ }
+ }
+ }
+ *line = ln;
+ *col = c;
+}
+
+void lex_error(Lexer *l, LexErr e, Str msg) {
+ int line, col;
+ l->ofs -= l->ident.n;
+ lex_pos(l, &line, &col);
+
+ fprintf(stderr, "%s", e == LE_ERROR ? "\x1b[1;31m" : "\x1b[1;33m");
+
+ fprintf(stderr, "%.*s:%d:%d: %.*s\n\n",
+ (int)l->filename.n, l->filename.s, line + 1, col + 1, (int)msg.n, msg.s);
+
+ {
+ int ofs = l->ofs;
+ int line_start = ofs;
+ while (line_start > 0 && l->buf.s[line_start - 1] != '\n') line_start--;
+ int line_end = line_start;
+ while (line_end < l->buf.n && l->buf.s[line_end] != '\n') line_end++;
+ fprintf(stderr, "%.*s\n", line_end - line_start, &l->buf.s[line_start]);
+ for (int i = 0; i < col; i++) putchar(' ');
+ for (int i = ofs; i < ofs + l->ident.n && i < line_end; i++) putchar('^');
+ putchar('\n');
+ }
+
+ fprintf(stderr, "\x1b[0m\n");
+
+ if (e == LE_ERROR) {
+ exit(1);
+ }
+}
+
/* Return 1 if `c` is a decimal digit '0'..'9', otherwise 0. */
static inline int is_digit(int c) {
	/* Unsigned subtraction wraps, so anything below '0' becomes a large
	 * value and fails the single comparison. */
	return (unsigned)c - (unsigned)'0' <= 9u;
}
+
/*
 * Return 1 if `c` may start an identifier: an unaccented Latin letter
 * ('a'..'z', 'A'..'Z') or an underscore. Otherwise return 0.
 */
static inline int is_ident_first_char(int c) {
	if (c == '_') {
		return 1;
	}
	if ('a' <= c && c <= 'z') {
		return 1;
	}
	return 'A' <= c && c <= 'Z';
}
+
/*
 * Return non-zero if `c` may appear after the first character of an
 * identifier: anything accepted by is_ident_first_char(), or a decimal
 * digit.
 */
static inline int is_ident_next_char(int c) {
	if (is_ident_first_char(c)) {
		return 1;
	}
	return is_digit(c) ? 1 : 0;
}
+
+Token ident_to_keyword(Str ident) {
+ /* evil & stupid hack to avoid keeping a separate table of keywords */
+ for (Token t = 0; t < TOK_MAX; t++) {
+ if (str_eql(str_from_cstr(lex_tok_str[t]), ident)) {
+ return t;
+ }
+ }
+ return TOK_IDENT;
+}
+
+#define T(t) (l->tok = t)
+void lex_next(Lexer *l) {
+ int i = l->ofs;
+ while (i < l->buf.n && is_space(l->buf.s[i])) {
+ i++;
+ }
+ int start_ofs = i;
+ l->ident = (Str) { &l->buf.s[start_ofs], 0 };
+ if (i >= l->buf.n) {
+ l->tok = TOK_EOF;
+ return;
+ }
+ char c = l->buf.s[i++];
+ l->tok = TOK_MAX;
+ if (is_ident_first_char(c)) {
+ T(TOK_IDENT);
+ while (i < l->buf.n && is_ident_next_char(l->buf.s[i])) i++;
+ } else if (is_digit(c)) {
+ T(TOK_LIT_NUM);
+ while (i < l->buf.n && (is_digit(l->buf.s[i]) || l->buf.s[i] == '.' || l->buf.s[i] == 'e')) i++;
+ } else {
+ switch (c) {
+#define X(a,b) case b: T(a); break;
+ LEX_TOK_CHAR_LIST
+#undef X
+ case '\'':
+ T(TOK_LIT_CHAR);
+ if (i < l->buf.n && l->buf.s[i] == '\\') i += 2;
+ else i++;
+ if (i >= l->buf.n) lex_error(l, LE_ERROR, S("Unterminated character literal"));
+ if (l->buf.s[i] != '\'') lex_error(l, LE_ERROR, S("Overlong character literal"));
+ i++;
+ break;
+ case '"':
+ T(TOK_LIT_STR);
+ for (;;) {
+ if (i >= l->buf.n) {
+ lex_error(l, LE_ERROR, S("Unterminated string literal"));
+ }
+ if (l->buf.s[i] == '\\') {
+ i += 2;
+ continue;
+ }
+ if (l->buf.s[i++] == '"') break;
+ }
+ break;
+ }
+ }
+ if (l->tok == TOK_MAX) {
+ lex_error(l, LE_ERROR, S("Invalid token"));
+ }
+ l->ident.n = i - start_ofs;
+ l->ofs = i;
+ if (l->tok == TOK_IDENT) {
+ l->tok = ident_to_keyword(l->ident);
+ }
+}