From 842e22e9eb0f3dff7dabdaa41bcc2133e8f015f5 Mon Sep 17 00:00:00 2001 From: WormHeamer Date: Thu, 31 Jul 2025 22:37:38 -0400 Subject: initial commit --- .gitignore | 2 + Makefile | 38 ++++++++++ arena.h | 149 +++++++++++++++++++++++++++++++++++++ doc/demo.txt | 94 ++++++++++++++++++++++++ doc/test.txt | 119 ++++++++++++++++++++++++++++++ impl.c | 6 ++ lex.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++ lex.h | 81 +++++++++++++++++++++ main.c | 161 ++++++++++++++++++++++++++++++++++++++++ str.h | 152 ++++++++++++++++++++++++++++++++++++++ strio.h | 234 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ test.lang | 3 + utf8.h | 64 ++++++++++++++++ 13 files changed, 1287 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 arena.h create mode 100644 doc/demo.txt create mode 100644 doc/test.txt create mode 100644 impl.c create mode 100644 lex.c create mode 100644 lex.h create mode 100644 main.c create mode 100644 str.h create mode 100644 strio.h create mode 100644 test.lang create mode 100644 utf8.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d7756c2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +a.out +*.o diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8693d09 --- /dev/null +++ b/Makefile @@ -0,0 +1,38 @@ +EXE = a.out +RUNARGS = test.lang + +CFLAGS = -std=c23 -Wall -Wextra -Wpedantic ${CFLAGS_${DEBUG}} +LDFLAGS = ${LDFLAGS_${DEBUG}} +LDLIBS = + +OBJ != find -type f -name '*.c' | sed 's/\.c$$/.o/' + +DEBUG = 0 +GDB != which gf2 2> /dev/null || which gdb + +CFLAGS_1 = -g3 -fsanitize=undefined +LDFLAGS_1 = -g3 -fsanitize=undefined +LDFLAGS_0 = -Os -s + +PREFIX ?= ${HOME}/.local +BINDIR = ${PREFIX}/bin + +.PHONY: run all clean install uninstall + +all: ${EXE} +run: ${EXE} + ./${EXE} ${RUNARGS} +debug: ${EXE} + ${GDB} -ex start --args ./${EXE} ${RUNARGS} + +clean: + rm -fv ${EXE} ${OBJ} + +${EXE}: ${OBJ} + ${CC} ${LDFLAGS} ${OBJ} -o ${EXE} ${LDLIBS} + +install: ${EXE} + cp ${EXE} ${BINDIR}/${EXE} + +uninstall: + rm ${BINDIR}/${EXE} diff --git a/arena.h b/arena.h new file mode 100644 index 0000000..045cde0 --- /dev/null +++ b/arena.h @@ -0,0 +1,149 @@ +#ifndef ARENA_H +#define ARENA_H + +#include +#include +#include + +typedef struct ArenaPg { + struct ArenaPg *prev, *next; + char *beg, *end; + char data[]; +} ArenaPg; + +typedef struct { + ArenaPg *pg; + char *beg; +} ArenaMark; + +typedef struct { + ArenaPg *cur, *tail; +} Arena; + +#define new(a, t)\ + (t*)arena_zeroed(arena_alloc(a, sizeof(t), _Alignof(t)), sizeof(t)) + +#define new_arr(a, t, n)\ + arena_alloc(a, sizeof(t) * n, _Alignof(t)) + +#define resize(a, p, old, new)\ + arena_realloc(a, p, (old) * sizeof(*(p)), (new) * sizeof(*(p)),\ + _Alignof(__typeof__(*(p)))) + +void arena_free(Arena *a); + +void arena_save(Arena *a, ArenaMark *m); +void arena_load(Arena *a, ArenaMark *m); + +void arena_reset(Arena *a); +void arena_reserve(Arena *a, ptrdiff_t n); + +void *arena_alloc(Arena *a, ptrdiff_t n, ptrdiff_t align); +void *arena_realloc(Arena *a, void *ptr, ptrdiff_t old, ptrdiff_t new, ptrdiff_t align); +void *arena_zeroed(void *p, size_t n); + +#define ARENA_BACKEND_MALLOC 0 +#define ARENA_BACKEND_MMAP 1 + +#ifndef ARENA_BACKEND +#if defined(__linux__) +# define ARENA_BACKEND ARENA_BACKEND_MMAP +#else +# define ARENA_BACKEND ARENA_BACKEND_MALLOC +#endif +#endif + +#ifdef ARENA_IMPL + +#include +#include +static void arena_pg_alloc_fail(void) { + fprintf(stderr, "failed to allocate arena page\n"); + abort(); +} + +#if ARENA_BACKEND == ARENA_BACKEND_MMAP +#include +#include +#define ARENA_PG_SIZE sysconf(_SC_PAGESIZE) +static inline void *arena_pg_alloc(ptrdiff_t n) { + void *p = mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + return p == MAP_FAILED ? NULL : p; +} +static inline void arena_pg_free(void *ptr, ptrdiff_t n) { munmap(ptr, n); } +#elif ARENA_BACKEND == ARENA_BACKEND_MALLOC +#define ARENA_PG_SIZE 8192 +static inline void *arena_pg_alloc(ptrdiff_t n) { return malloc(n); } +static inline void arena_pg_free(void *ptr, ptrdiff_t n) { free(ptr); (void)n; } +#endif + +void arena_free(Arena *a) { + while (a->tail) { + a->cur = a->tail->prev; + arena_pg_free(a->tail, (uintptr_t)(a->tail->end - (char*)a->tail)); + a->tail = a->cur; + } +} + +void arena_reserve(Arena *a, ptrdiff_t n) { + while (a->cur && a->cur->beg + n >= a->cur->end) a->cur = a->cur->next; + if (a->cur) return; + ptrdiff_t cap = n + sizeof(ArenaPg); + cap += (uintptr_t)-cap & (ARENA_PG_SIZE - 1); + ArenaPg *p = arena_pg_alloc(cap); + if (!p) arena_pg_alloc_fail(); + p->next = NULL; + p->prev = a->tail; + p->beg = p->data; + p->end = (char*)p + cap; + if (a->tail) a->tail->next = p; + a->cur = (a->tail = p); +} + +void *arena_alloc(Arena *a, ptrdiff_t n, ptrdiff_t align) { + arena_reserve(a, n + (align - 1)); + char *ptr = a->cur->beg + (-(uintptr_t)a->cur->beg & (align - 1)); + a->cur->beg = ptr + n; + return ptr; +} + +void *arena_realloc(Arena *a, void *ptr, ptrdiff_t old, ptrdiff_t new, ptrdiff_t align) { + if (a->cur && ptr == a->cur->beg - old && (char*)ptr + new < a->cur->end) { + a->cur->beg += new - old; + return ptr; + } else { + void *p = arena_alloc(a, new, align); + if (ptr) memcpy(p, ptr, old); + return p; + } +} + +void *arena_zeroed(void *p, size_t n) { + memset(p, 0, n); + return p; +} + +void arena_reset(Arena *a) { + if (!a->cur) return; + while (a->cur->prev) { + a->cur->beg = a->cur->data; + a->cur = a->cur->prev; + } + a->cur->beg = a->cur->data; +} + +void arena_save(Arena *a, ArenaMark *m) { + m->pg = a->cur; + if (a->cur) m->beg = a->cur->beg; +} + +void arena_load(Arena *a, ArenaMark *m) { + while (a->cur && a->cur != m->pg) { + a->cur->beg = a->cur->data; + a->cur = a->cur->prev; + } + if (a->cur) a->cur->beg = m->beg; +} + +#endif +#endif diff --git a/doc/demo.txt b/doc/demo.txt new file mode 100644 index 0000000..58c0839 --- /dev/null +++ b/doc/demo.txt @@ -0,0 +1,94 @@ +type vec2 = [2]f32 + +type Sprite = enum u8 { + BLANK, + PLAYER +} + +type Player = struct { + pos, vel vec2 +} + +extern { + blit proc(spr Sprite, x, y i32), + exp func(v f32) f32 +} + +proc frame(dt f32) { + /* unary .. can splat an array as parameters */ + blit(.PLAYER, ..plr.pos) + plr.pos += plr.vel * dt + plr.vel *= exp(-dt * 5) +} + +/* statements + + type Ident = T Alias Ident to type T + extern Ident T Define an external variable (or function!) of the given type + extern { a A, b B... } Block of extern declarations, formatted like a struct + +*/ + +/* expressions + + N Integer + N.N | NeN | N.NeN Float + [-+~] A Unary operator on term A + A [+-/*%&|^] B Binary operator on term A and expression B + A := B Type inference assignment + A, B, C := D + T { ... } Literal of type T; mostly only makes sense for structs, + but this can be used to do something like u8 { 2 } too. + +/* types + + IDENT type of the given identifier + ^T pointer to type T + enum { FOO, BAR... } enum + enum T { FOO, BAR... } enum with backing type T, which must be an integer type + struct { a A, b B... } struct + A..B range (kinda an anonymous enum) + A and B must be ordinal types (enum value, integer, or character) + [N]T array of T of length N + [E]T array of T indexed by enum or range E + []T slice + (Ta, Tb...) tuple of given types; can be destructured to multiple assignment + set[E] a set of enum or range E + proc(a A, b B...) function pointer + func(a A, b B...) T function pointer with return value T + +*/ + +Expr = LValue | Literal +LValue = Ident | LValue '.' Ident +Literal = Number +Number = Integer | Float +Integer = /[0-9]+/ +Float = Integer 'e' Integer | Integer '.' Integer | Integer '.' Integer 'e' Integer + + IDENT Variable + LVALUE . IDENT Member access + [0-9]+ Integer + [0-9]+ . [0-9]+ Float + [0-9]+ . [0-9]+ e [0-9]+ Float + F(E1, E2...) function call with the given expression arguments + +*/ + +/* import first tries to scan local directory for the package, then standard + * system directory; in said directory it will look for an IR cache file, + * or compile a new one if it's not present (or if its last-modified time + * is older than the compiler executable), which is imported with a name + * equivalent to the last part of the path. + * + * codegen is only performed on procedures that get called from main and their + * own dependencies, recursively. this allows to keep output binaries small. + * maybe supply some sort of export qualifier to do something similar for + * procedures that aren't main. + */ + +import "io" + +proc main { + io.print("Hello, world!") +} diff --git a/doc/test.txt b/doc/test.txt new file mode 100644 index 0000000..da8f501 --- /dev/null +++ b/doc/test.txt @@ -0,0 +1,119 @@ +type string = []char + +func split(src: string, by: char, a: ^Arena): [dyn]string do + let r: [dyn]string = {} + let s: string = "" + for c in src do + if c == by then + r.push(s, a) + s = "" + else + s.push(c, a) + end + end + return r +end + +proc main do + let a = Arena() + let lines = split(stdin.readAll(@a), '\n', @a) + for i in 0..len(lines) do + print(lines[i]) + end +end + +--- + +fn next_line(src, line: ^[]u8) -> bool { + if len(src) < 1 { + return false + } + let idx = find(src, '\n') + if idx <> -1 { + line^ := src^[0..idx] + } else { + line^ := src^ + } + src^ = src^[len(line)..] + return true +} + +fn main { + let a = Arena() + let buf = readall(stdin, &a) + let line: []u8 + while next_line(&buf, &line) { + print(line) + } +} + +--- + +/* builtin slice type */ + +fn print(str []u8) { + for i in str { + putchar(str[i]) + } + putchar('\n') +} + +/* iterators are just functions that return an initial state object and a + * step function that acts on it to produce a value */ + +type LineIterFn = fn(it ^[]u8) ([]u8, bool) + +fn lines(src []u8) ([]u8, LineIterFn) { + fn step(it ^[]u8) []u8, bool { + if len(it^) < 1 { return {}, false } + idx := find(it^, '\n') + line := if idx == -1 { it^ } else { it^[0..idx] } + it^ = it^[len(line)..] + return line, true + } + return src, step +} + +/* if multiple values used in if/while condition, only the last one is checked */ + +fn main { + a := Arena {} + buf := readall(stdin, &a) + for line in lines(buf) { + print(line) + } +} + +/* generics? */ + +fn reverse[T any](buf []T) ([]T, fn(it ^[]T) (T, bool)) { + fn step(it ^[]T) (T, bool) { + if len(^it) < 1 { return {}, false } + n := len(it^) - 1 + v := it^[n] + it^ := it^[0..n] + return v, true + } + return buf, step +} + +fn foo { + s := "Hello, world!" + // print(reverse(s)) <- wouldn't actually work, because iterators can't be cast back to slices + a := Arena {} + print(collect(reverse(s), &a)) +} + +struct MapIter[In any, Out any] { + ctx: rawptr, + iter: fn(it rawptr) In, + map: fn(x ^In) Out +} + +fn map[In any, Out any](p rawptr, f func(it rawptr) In, f func(x ^In) Out) (MapIter[In, Out], fn(it ^MapIter[In, Out]) (Out, bool)) { + fn step(it ^MapIter[In, Out]) (Out, bool) { + v, ok := it.iter(it.ctx) + if !ok { return {}, false } + return it.map(&v), true + } +} diff --git a/impl.c b/impl.c new file mode 100644 index 0000000..96d8836 --- /dev/null +++ b/impl.c @@ -0,0 +1,6 @@ +#define STR_IMPL +#define STRIO_IMPL +#define ARENA_IMPL +#include "str.h" +#include "strio.h" +#include "arena.h" diff --git a/lex.c b/lex.c new file mode 100644 index 0000000..7a1d345 --- /dev/null +++ b/lex.c @@ -0,0 +1,184 @@ +#include +#include + +#include "lex.h" +#include "arena.h" +#include "strio.h" + +void lex_start(Lexer *l, const char *path) { + l->filename = str_dup(str_from_cstr(path), &l->arena); + FILE *f = fopen(path, "r/o"); + if (!f) { + fprintf(stderr, "Couldn't open file %s\n", path); + exit(1); + } + if (read_all(f, &l->buf, &l->arena)) { + fprintf(stderr, "Couldn't read file %s\n", path); + fclose(f); + exit(1); + } + lex_next(l); +} + +void lex_free(Lexer *l) { + arena_free(&l->arena); +} + +Str lex_mask_str(Lexer *l, TokMask t) { + Str s = S(""); + for (Token i = 0; i < TOK_MAX; i++) { + if (t & TMASK(i)) { + if (s.n > 0) str_cat(&s, S(" or "), &l->arena); + str_cat(&s, str_from_cstr(lex_tok_str[i]), &l->arena); + } + } + return s; +} + +void lex_expect(Lexer *l, TokMask t) { + lex_next(l); + lex_expected(l, t); +} + +void lex_expect_not(Lexer *l, TokMask t) { + lex_next(l); + lex_expected_not(l, t); +} + +void lex_expected(Lexer *l, TokMask t) { + if (!(TMASK(l->tok) & t)) { + lex_error(l, LE_ERROR, str_fmt(&l->arena, "Expected %S but got %s", lex_mask_str(l, t), lex_tok_str[l->tok])); + } +} + +void lex_expected_not(Lexer *l, TokMask t) { + if (TMASK(l->tok) & t) { + lex_error(l, LE_ERROR, str_fmt(&l->arena, "Unexpected %s", lex_tok_str[l->tok])); + } +} + +void lex_pos(Lexer *l, int *line, int *col) { + int ln = 0, c = 0; + for (int i = 0; i < l->ofs; i++) { + if (l->buf.s[i] == '\n') { + ln++; + c = 0; + } else { + c++; + if (l->buf.s[i] == '\t') { + c += (unsigned)-c & 7; + } + } + } + *line = ln; + *col = c; +} + +void lex_error(Lexer *l, LexErr e, Str msg) { + int line, col; + l->ofs -= l->ident.n; + lex_pos(l, &line, &col); + + fprintf(stderr, "%s", e == LE_ERROR ? "\x1b[1;31m" : "\x1b[1;33m"); + + fprintf(stderr, "%.*s:%d:%d: %.*s\n\n", + (int)l->filename.n, l->filename.s, line + 1, col + 1, (int)msg.n, msg.s); + + { + int ofs = l->ofs; + int line_start = ofs; + while (line_start > 0 && l->buf.s[line_start - 1] != '\n') line_start--; + int line_end = line_start; + while (line_end < l->buf.n && l->buf.s[line_end] != '\n') line_end++; + fprintf(stderr, "%.*s\n", line_end - line_start, &l->buf.s[line_start]); + for (int i = 0; i < col; i++) putchar(' '); + for (int i = ofs; i < ofs + l->ident.n && i < line_end; i++) putchar('^'); + putchar('\n'); + } + + fprintf(stderr, "\x1b[0m\n"); + + if (e == LE_ERROR) { + exit(1); + } +} + +static inline int is_digit(int c) { + return c >= '0' && c <= '9'; +} + +static inline int is_ident_first_char(int c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; +} + +static inline int is_ident_next_char(int c) { + return is_ident_first_char(c) || is_digit(c); +} + +Token ident_to_keyword(Str ident) { + /* evil & stupid hack to avoid keeping a separate table of keywords */ + for (Token t = 0; t < TOK_MAX; t++) { + if (str_eql(str_from_cstr(lex_tok_str[t]), ident)) { + return t; + } + } + return TOK_IDENT; +} + +#define T(t) (l->tok = t) +void lex_next(Lexer *l) { + int i = l->ofs; + while (i < l->buf.n && is_space(l->buf.s[i])) { + i++; + } + int start_ofs = i; + l->ident = (Str) { &l->buf.s[start_ofs], 0 }; + if (i >= l->buf.n) { + l->tok = TOK_EOF; + return; + } + char c = l->buf.s[i++]; + l->tok = TOK_MAX; + if (is_ident_first_char(c)) { + T(TOK_IDENT); + while (i < l->buf.n && is_ident_next_char(l->buf.s[i])) i++; + } else if (is_digit(c)) { + T(TOK_LIT_NUM); + while (i < l->buf.n && (is_digit(l->buf.s[i]) || l->buf.s[i] == '.' || l->buf.s[i] == 'e')) i++; + } else { + switch (c) { +#define X(a,b) case b: T(a); break; + LEX_TOK_CHAR_LIST +#undef X + case '\'': + T(TOK_LIT_CHAR); + if (i < l->buf.n && l->buf.s[i] == '\\') i += 2; + else i++; + if (i >= l->buf.n) lex_error(l, LE_ERROR, S("Unterminated character literal")); + if (l->buf.s[i] != '\'') lex_error(l, LE_ERROR, S("Overlong character literal")); + i++; + break; + case '"': + T(TOK_LIT_STR); + for (;;) { + if (i >= l->buf.n) { + lex_error(l, LE_ERROR, S("Unterminated string literal")); + } + if (l->buf.s[i] == '\\') { + i += 2; + continue; + } + if (l->buf.s[i++] == '"') break; + } + break; + } + } + if (l->tok == TOK_MAX) { + lex_error(l, LE_ERROR, S("Invalid token")); + } + l->ident.n = i - start_ofs; + l->ofs = i; + if (l->tok == TOK_IDENT) { + l->tok = ident_to_keyword(l->ident); + } +} diff --git a/lex.h b/lex.h new file mode 100644 index 0000000..460636a --- /dev/null +++ b/lex.h @@ -0,0 +1,81 @@ +#ifndef LEX_H +#define LEX_H + +#include "str.h" + +#define LEX_TOK_LIST\ + X(TOK_EOF, "end-of-file")\ + X(TOK_IDENT, "identifier")\ + X(TOK_PROC, "proc")\ + X(TOK_LET, "let")\ + X(TOK_VAR, "var")\ + X(TOK_CONST, "const")\ + X(TOK_RETURN, "return")\ + X(TOK_LBRACE, "{")\ + X(TOK_RBRACE, "}")\ + X(TOK_LPAREN, "(")\ + X(TOK_RPAREN, ")")\ + X(TOK_PLUS, "+")\ + X(TOK_MINUS, "-")\ + X(TOK_ASTERISK, "*")\ + X(TOK_SLASH, "/")\ + X(TOK_EQUALS, "=")\ + X(TOK_COLON, ":")\ + X(TOK_COMMA, ",")\ + X(TOK_LIT_STR, "string literal")\ + X(TOK_LIT_CHAR, "character literal")\ + X(TOK_LIT_NUM, "numeric literal") + +#define LEX_TOK_CHAR_LIST\ + X(TOK_LBRACE, '{')\ + X(TOK_RBRACE, '}')\ + X(TOK_LPAREN, '(')\ + X(TOK_RPAREN, ')')\ + X(TOK_PLUS, '+')\ + X(TOK_MINUS, '-')\ + X(TOK_ASTERISK, '*')\ + X(TOK_SLASH, '/')\ + X(TOK_EQUALS, '=')\ + X(TOK_COLON, ':')\ + X(TOK_COMMA, ',') + +typedef enum { +#define X(n, s) n, + LEX_TOK_LIST +#undef X + TOK_MAX, +} Token; + +static const char *lex_tok_str[TOK_MAX] = { +#define X(n, s) s, + LEX_TOK_LIST +#undef X +}; + +#define TMASK(t) (1L << t) +typedef unsigned long TokMask; + +typedef struct { + Token tok; + Str ident; + Str filename, buf; + Arena arena; + int ofs; +} Lexer; + +typedef enum { + LE_WARN, + LE_ERROR +} LexErr; + +void lex_start(Lexer *l, const char *path); +void lex_free(Lexer *l); +void lex_expected(Lexer *l, TokMask t); +void lex_expected_not(Lexer *l, TokMask t); +void lex_expect(Lexer *l, TokMask t); /* next -> expected */ +void lex_expect_not(Lexer *l, TokMask t); /* next -> expected_not */ +void lex_error(Lexer *l, LexErr e, Str msg); +void lex_pos(Lexer *l, int *line, int *col); +void lex_next(Lexer *l); + +#endif diff --git a/main.c b/main.c new file mode 100644 index 0000000..59a41a2 --- /dev/null +++ b/main.c @@ -0,0 +1,161 @@ +#include +#include +#include +#include +#include + +#include "lex.h" + +/* node graph */ + +typedef enum { + N_START, + N_STOP, + N_LIT_INT, + N_OP_ADD, + N_OP_SUB, + N_OP_MUL, + N_OP_DIV, + N_OP_NEG, +} NodeType; + +typedef struct Node { + NodeType type; + struct Node **inputs, **outputs; +} Node; + +typedef struct { + Node *start, *stop; +} ProcGraph; + +/* parsing */ + +void parse_expr(Lexer *l); + +void parse_stmt_let(Lexer *l) { + lex_expect(l, TMASK(TOK_IDENT)); + Str name = l->ident; + lex_expect(l, TMASK(TOK_EQUALS) | TMASK(TOK_COLON)); + lex_next(l); + parse_expr(l); + printf("name_bind %.*s\n", (int)name.n, name.s); +} + +void parse_stmt(Lexer *l) { + /* TODO */ + (void)l; + switch (l->tok) { + case TOK_LET: + parse_stmt_let(l); + break; + case TOK_RETURN: + lex_next(l); + parse_expr(l); + puts("return"); + break; + default: + lex_expected(l, TMASK(TOK_RBRACE)); + } +} + +void parse_proc(Lexer *l) { + lex_expect(l, TMASK(TOK_IDENT)); + lex_expect(l, TMASK(TOK_LBRACE)); + lex_next(l); + while (l->tok != TOK_RBRACE) { + lex_expected_not(l, TMASK(TOK_EOF)); + parse_stmt(l); + } + lex_expected(l, TMASK(TOK_RBRACE)); + lex_next(l); +} + +void parse_term(Lexer *l) { + int negate_after = l->tok == TOK_MINUS; + if (TMASK(l->tok) & (TMASK(TOK_MINUS) | TMASK(TOK_PLUS))) { + lex_next(l); + } + if (l->tok == TOK_LPAREN) { + lex_next(l); + parse_expr(l); + lex_expected(l, TMASK(TOK_RPAREN)); + lex_next(l); + } else { + lex_expected(l, TMASK(TOK_LIT_NUM) | TMASK(TOK_LIT_STR) | TMASK(TOK_LIT_CHAR) | TMASK(TOK_IDENT)); + switch (l->tok) { + case TOK_IDENT: + printf("var %.*s\n", (int)l->ident.n, l->ident.s); + break; + case TOK_LIT_STR: + printf("lit_str %.*s\n", (int)l->ident.n, l->ident.s); + break; + case TOK_LIT_CHAR: + printf("lit_char %.*s\n", (int)l->ident.n, l->ident.s); + break; + case TOK_LIT_NUM: + printf("lit_num %.*s\n", (int)l->ident.n, l->ident.s); + break; + default: + break; + } + lex_next(l); + } + if (negate_after) puts("negate"); +} + +void parse_expr(Lexer *l) { + parse_term(l); + if (l->tok == TOK_LPAREN) { + lex_next(l); + puts("args_start"); + for (;;) { + parse_expr(l); + lex_expected(l, TMASK(TOK_COMMA) | TMASK(TOK_RPAREN)); + if (l->tok == TOK_RPAREN) break; + lex_next(l); + } + lex_next(l); + puts("args_end"); + puts("func_call"); + } + if (TMASK(l->tok) & (TMASK(TOK_PLUS) | TMASK(TOK_MINUS) | TMASK(TOK_ASTERISK) | TMASK(TOK_SLASH))) { + Token t = l->tok; + lex_next(l); + parse_expr(l); + switch (t) { + case TOK_PLUS: puts("add"); break; + case TOK_MINUS: puts("subtract"); break; + case TOK_ASTERISK: puts("multiply"); break; + case TOK_SLASH: puts("divide"); break; + default: break; + } + } +} + +void parse_decl(Lexer *l) { + switch (l->tok) { + case TOK_PROC: + parse_proc(l); + break; + default: + lex_error(l, LE_ERROR, S("Invalid declaration statement")); + } +} + +void parse_unit(Lexer *l) { + while (l->tok != TOK_EOF) { + parse_decl(l); + } +} + +int main(int argc, const char **argv) { + if (argc != 2) { + fprintf(stderr, "Usage: %s FILE\n", argv[0]); + return 1; + } + Lexer l = { 0 }; + lex_start(&l, argv[1]); + parse_unit(&l); + lex_free(&l); + return 0; +} diff --git a/str.h b/str.h new file mode 100644 index 0000000..b10bc64 --- /dev/null +++ b/str.h @@ -0,0 +1,152 @@ +#ifndef STR_H +#define STR_H + +#include +#include + +#include "arena.h" + +typedef struct { + char *s; + ptrdiff_t n; +} Str; + +typedef struct { + Str head, tail; +} Cut; + +#define S(s) (Str){s,sizeof(s)-1} + +char *str_to_cstr(Str s, Arena *a); +Str str_from_cstr(const char *s); +int str_eql(Str a, Str b); +int str_starts(Str a, Str b); +int str_ends(Str a, Str b); +void str_catc(Str *a, char b, Arena *m); +Str str_skip(Str a, ptrdiff_t n); +int is_space(char c); +Str str_trim_left(Str a); +Str str_trim_right(Str a); +Str str_trim(Str a); +Cut str_cut(Str s, char c); +Str str_findc(Str s, char c); +Str str_find(Str haystack, Str needle); +int str_contains(Str a, Str b); +Str str_dup(Str a, Arena *m); +void str_cat(Str *a, Str b, Arena *m); +Str str_replace_end(Str s, Str a, Str b, Arena *m); + +#ifdef STR_IMPL + +/* conversions */ + +char *str_to_cstr(Str s, Arena *a) { + char *r = new_arr(a, char, s.n + 1); + memcpy(r, s.s, s.n); + r[s.n] = 0; + return r; +} + +Str str_from_cstr(const char *s) { + return (Str) { (char*)s, strlen(s) }; +} + +/* pure functions */ + +int str_eql(Str a, Str b) { + return a.n == b.n && !memcmp(a.s, b.s, b.n); +} + +int str_starts(Str a, Str b) { + return a.n >= b.n && !memcmp(a.s, b.s, b.n); +} + +int str_ends(Str a, Str b) { + return a.n >= b.n && !memcmp(&a.s[a.n - b.n], b.s, b.n); +} + +void str_catc(Str *a, char b, Arena *m) { + a->s = resize(m, a->s, a->n, a->n + 1); + a->s[a->n++] = b; +} + +Str str_skip(Str a, ptrdiff_t n) { + return (Str) { a.s + n, a.n - n }; +} + +int is_space(char c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; +} + +Str str_trim_left(Str a) { + while (a.n > 0 && is_space(a.s[0])) a.s++, a.n--; + return a; +} + +Str str_trim_right(Str a) { + while (a.n > 0 && is_space(a.s[a.n - 1])) a.n--; + return a; +} + +Str str_trim(Str a) { + return str_trim_left(str_trim_right(a)); +} + +/* splitting, searching */ + +Cut str_cut(Str s, char c) { + char *p = memchr(s.s, c, s.n); + if (!p) { + return (Cut) { s, { &s.s[s.n], 0 } }; + } else { + return (Cut) { + { s.s, p - s.s }, + { p + 1, &s.s[s.n] - (p + 1) } + }; + } +} + +Str str_findc(Str s, char c) { + char *p = memchr(s.s, c, s.n); + return p ? (Str) { p, s.n - (p - s.s) } : (Str) { &s.s[s.n], 0 }; +} + +Str str_find(Str haystack, Str needle) { + if (needle.n < 1) return haystack; + while (haystack.n > 0) { + haystack = str_findc(haystack, needle.s[0]); + if (str_starts(haystack, needle)) break; + if (haystack.n > 0) haystack = str_skip(haystack, 1); + } + return haystack; +} + +int str_contains(Str a, Str b) { + return str_find(a, b).n > 0; +} + +/* allocating */ + +Str str_dup(Str a, Arena *m) { + char *s = new_arr(m, char, a.n); + memcpy(s, a.s, a.n); + a.s = s; + return a; +} + +void str_cat(Str *a, Str b, Arena *m) { + a->s = resize(m, a->s, a->n, a->n + b.n); + memcpy(&a->s[a->n], b.s, b.n); + a->n += b.n; +} + +Str str_replace_end(Str s, Str a, Str b, Arena *m) { + if (!str_ends(s, a)) return s; + char *p = new_arr(m, char, s.n + b.n - a.n); + memcpy(p, s.s, s.n - a.n); + memcpy(p + s.n - a.n, b.s, b.n); + return (Str) { p, s.n + b.n - a.n }; +} + +#endif +#endif diff --git a/strio.h b/strio.h new file mode 100644 index 0000000..0d6c6f5 --- /dev/null +++ b/strio.h @@ -0,0 +1,234 @@ +#ifndef STRIO_H +#define STRIO_H + +#include + +#include "str.h" +#include "arena.h" + +int read_all(FILE *f, Str *buf, Arena *a); +int next_line(Str *src, Str *line); +void str_putf(Str s, FILE *f); +void str_put(Str s); + +int str_to_u64(Str s, uint64_t *out); +int str_to_i64(Str s, int64_t *out); + +void str_cat_i64(Str *out, int64_t c, char pad_char, int min_width, Arena *a); +void str_cat_u64(Str *out, uint64_t c, char pad_char, int min_width, Arena *a); +void str_cat_fmtv(Str *out, Arena *arena, const char *fmt, va_list ap); +void str_cat_fmt(Str *out, Arena *arena, const char *fmt, ...); + +Str str_fmtv(Arena *arena, const char *fmt, va_list ap); +Str str_fmt(Arena *arena, const char *fmt, ...); +const char *cstr_fmt(Arena *arena, const char *fmt, ...); + +#ifdef STRIO_IMPL + +#include +#include + +static inline long read_all_file_size(FILE *f) { + fseek(f, 0, SEEK_END); + long t = ftell(f); + fseek(f, 0, SEEK_SET); + return t > 0 ? t : -1; +} + +int read_all(FILE *f, Str *buf, Arena *a) { + if (!f) return -1; + long sz = read_all_file_size(f); + if (sz < 1) { + ptrdiff_t cap = 4096; + buf->s = new_arr(a, char, cap); + buf->n = 0; + while (!feof(f)) { + size_t n = fread(&buf->s[buf->n], 1, cap - buf->n, f); + if (n < 1) break; + buf->n += n; + if (buf->n >= cap) { + size_t c = cap; + while (buf->n >= cap) cap <<= 1; + buf->s = resize(a, buf->s, c, cap); + } + } + } else { + buf->n = sz; + buf->s = new_arr(a, char, sz); + size_t sz = fread(buf->s, 1, buf->n, f); + if (sz < (size_t)buf->n) return -1; + } + return ferror(f) ? -1 : 0; +} + +int next_line(Str *src, Str *line) { + if (src->n < 1) return 0; + line->s = src->s; + char *newln = memchr(src->s, '\n', src->n); + line->n = newln ? newln - src->s : src->n; + src->s += line->n + 1; + src->n -= line->n + 1; + if (line->n > 0 && line->s[line->n-1] == '\r') line->n--; + return 1; +} + +void str_putf(Str s, FILE *f) { + fwrite(s.s, 1, s.n, f); +} + +void str_put(Str s) { + str_putf(s, stdout); +} + +/* formatted conversion */ + +int str_to_u64(Str s, uint64_t *out) { + if (s.n < 1) return -1; + uint64_t acc = 0; + for (int i = 0; i < s.n; i++) { + char c = s.s[i]; + if (!(c >= '0' && c <= '9')) return -1; + acc = (acc * 10) + (c - '0'); + } + *out = acc; + return 0; +} + +int str_to_i64(Str s, int64_t *out) { + int64_t sign = 1; + if (s.n > 0 && s.s[0] == '-') { + s = str_skip(s, 1); + sign = -1; + } + uint64_t u64; + if (str_to_u64(s, &u64)) return -1; + *out = u64 * sign; + return 0; +} + +static void str_cat_u64_(char buf[32], int *n, uint64_t c) { + int i = 0; + buf[31] = '\0'; + do { + buf[32 - ++i] = (c % 10) + '0'; + c /= 10; + } while (c); + *n = i; +} + +void str_cat_u64(Str *out, uint64_t c, char pad_char, int min_width, Arena *a) { + int n; + /* more than enough for the largest 64-bit number + * log_10(1 << 64) ~= 19.3 digits max */ + char buf[32]; + str_cat_u64_(buf, &n, c); + while (n < min_width && ++n < 32) buf[32-n] = pad_char; + str_cat(out, (Str) { &buf[sizeof(buf) - n], n }, a); +} + +void str_cat_i64(Str *out, int64_t c, char pad_char, int min_width, Arena *a) { + /* more than enough for the largest 64-bit number + * log_10(1 << 64) ~= 19.3 digits max */ + int n, neg = 0; + char buf[32]; + if (c < 0) neg = 1, c = -c; + str_cat_u64_(buf, &n, c); + if (neg) buf[sizeof(buf) - ++n] = '-'; + while (n < min_width && ++n < 32) buf[32-n] = pad_char; + str_cat(out, (Str) { &buf[sizeof(buf) - n], n }, a); +} + +/* IMPORTANT: this is not and will not be printf() compatible + * + * %s - c string + * %S - Str + * %i - int32 + * %I - int64 + * %u - uint32 + * %U - uin64 + * + **/ +void str_cat_fmtv(Str *out, Arena *arena, const char *fmt, va_list ap) { + size_t n = strlen(fmt); + for (size_t i = 0; i < n; i++) { + const char *mch = memchr(&fmt[i], '%', n - i); + if (!mch) { + str_cat(out, (Str) { (char*)&fmt[i], n - i }, arena); + break; + } + size_t skip = mch - &fmt[i]; + if (mch != &fmt[i]) { + str_cat(out, (Str) { (char*)&fmt[i], skip }, arena); + i += skip; + } + if (i + 1 < n) { + int zero_pad = 0, min_width = 0; + i++; + if (fmt[i] == '0') { + zero_pad = 1; + i++; + } + while (i < n && fmt[i] >= '0' && fmt[i] <= '9') { + min_width = min_width * 10 + (fmt[i] - '0'); + i++; + } + if (i >= n) break; + switch (fmt[i]) { + case 's': + str_cat(out, str_from_cstr(va_arg(ap, const char *)), arena); + break; + case 'S': + str_cat(out, va_arg(ap, Str), arena); + break; + case 'i': + str_cat_i64(out, va_arg(ap, int32_t), zero_pad?'0':' ', min_width, arena); + break; + case 'I': + str_cat_i64(out, va_arg(ap, int64_t), zero_pad?'0':' ', min_width, arena); + break; + case 'u': + str_cat_u64(out, va_arg(ap, uint32_t), zero_pad?'0':' ', min_width, arena); + break; + case 'U': + str_cat_u64(out, va_arg(ap, uint64_t), zero_pad?'0':' ', min_width, arena); + break; + default: + str_catc(out, fmt[i], arena); + break; + } + } + } +} + +void str_cat_fmt(Str *out, Arena *arena, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + str_cat_fmtv(out, arena, fmt, ap); + va_end(ap); +} + +Str str_fmtv(Arena *arena, const char *fmt, va_list ap) { + Str s = { 0 }; + str_cat_fmtv(&s, arena, fmt, ap); + return s; +} + +Str str_fmt(Arena *arena, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + Str r = str_fmtv(arena, fmt, ap); + va_end(ap); + return r; +} + +const char *cstr_fmt(Arena *arena, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + Str r = str_fmtv(arena, fmt, ap); + str_catc(&r, '\0', arena); + va_end(ap); + return r.s; +} + +#endif +#endif diff --git a/test.lang b/test.lang new file mode 100644 index 0000000..7559516 --- /dev/null +++ b/test.lang @@ -0,0 +1,3 @@ +proc main { + return 1 +} diff --git a/utf8.h b/utf8.h new file mode 100644 index 0000000..10a4f2b --- /dev/null +++ b/utf8.h @@ -0,0 +1,64 @@ +#ifndef UTF8_H +#define UTF8_H + +#include "str.h" + +#define UTF8_INVALID (unsigned)-1 + +int utf8_len(unsigned cp); +unsigned utf8_next(Str *s); +void utf8_to_buf(unsigned cp, char *buf, int n); + +#ifdef UTF8_IMPL + +#include +#include + +unsigned utf8_next(Str *s) { + if (s->n < 1) return 0; + static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 }; + static const uint8_t cpmask[4] = { 0x7f, 0x3f, 0x1f, 0xf }; + int len = niblen[(uint8_t)*s->s >> 4]; + if (!len) { s->n--; return *s->s++; } + if (s->n < len || (s->s[0] & (0x80 >> len))) return UTF8_INVALID; + unsigned cp = (unsigned)*s->s & cpmask[len]; + for (int i = 1; i < len; i++) { + if ((s->s[i] & 0xc0) != 0x80) return UTF8_INVALID; + cp = (cp << 6) | (s->s[i] & 0x3f); + } + s->s += len, s->n -= len; + return cp; +} + +unsigned utf8_next_unchecked(Str *s) { + if (s->n < 1) return 0; + static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 }; + static const uint8_t cpmask[4] = { 0x7f, 0x3f, 0x1f, 0xf }; + int len = niblen[(uint8_t)*s->s >> 4]; + if (!len) { s->n--; return *s->s++; } + unsigned cp = (unsigned)*s->s & cpmask[len]; + for (int i = 1; i < len; i++) cp = (cp << 6) | (s->s[i] & 0x3f); + s->s += len, s->n -= len; + return cp; +} + +int utf8_len(unsigned cp) { + static const uint8_t tbl[33] = { + 6,6,6,6,6,6, 5,5,5,5,5, 4,4,4,4,4, + 3,3,3,3,3, 2,2,2,2, 1,1,1,1,1,1,1,1, + }; + return tbl[stdc_leading_zeros(cp)]; +} + +void utf8_to_buf(unsigned cp, char *buf, int n) { + if (n == 1) { + *buf = cp; + return; + } + static const uint8_t tbl[5] = { 0b11000000, 0b11100000, 0b11110000, 0b11111000, 0b11111100 }; + for (int i = n; --i;) buf[i] = 0x80 | (cp & 0x3f), cp >>= 6; + buf[0] = tbl[n - 2] | cp; +} + +#endif +#endif -- cgit v1.2.3