summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWormHeamer2025-07-31 22:37:38 -0400
committerWormHeamer2025-07-31 22:37:38 -0400
commit842e22e9eb0f3dff7dabdaa41bcc2133e8f015f5 (patch)
tree78f42e68da656698526ff6099e78d82adab1d582
initial commit
-rw-r--r--.gitignore2
-rw-r--r--Makefile38
-rw-r--r--arena.h149
-rw-r--r--doc/demo.txt94
-rw-r--r--doc/test.txt119
-rw-r--r--impl.c6
-rw-r--r--lex.c184
-rw-r--r--lex.h81
-rw-r--r--main.c161
-rw-r--r--str.h152
-rw-r--r--strio.h234
-rw-r--r--test.lang3
-rw-r--r--utf8.h64
13 files changed, 1287 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d7756c2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+a.out
+*.o
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8693d09
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,38 @@
+EXE = a.out
+RUNARGS = test.lang
+
+CFLAGS = -std=c23 -Wall -Wextra -Wpedantic ${CFLAGS_${DEBUG}}
+LDFLAGS = ${LDFLAGS_${DEBUG}}
+LDLIBS =
+
+OBJ != find -type f -name '*.c' | sed 's/\.c$$/.o/'
+
+DEBUG = 0
+GDB != which gf2 2> /dev/null || which gdb
+
+CFLAGS_1 = -g3 -fsanitize=undefined
+LDFLAGS_1 = -g3 -fsanitize=undefined
+LDFLAGS_0 = -Os -s
+
+PREFIX ?= ${HOME}/.local
+BINDIR = ${PREFIX}/bin
+
+.PHONY: run all clean install uninstall
+
+all: ${EXE}
+run: ${EXE}
+ ./${EXE} ${RUNARGS}
+debug: ${EXE}
+ ${GDB} -ex start --args ./${EXE} ${RUNARGS}
+
+clean:
+ rm -fv ${EXE} ${OBJ}
+
+${EXE}: ${OBJ}
+ ${CC} ${LDFLAGS} ${OBJ} -o ${EXE} ${LDLIBS}
+
+install: ${EXE}
+ cp ${EXE} ${BINDIR}/${EXE}
+
+uninstall:
+ rm ${BINDIR}/${EXE}
diff --git a/arena.h b/arena.h
new file mode 100644
index 0000000..045cde0
--- /dev/null
+++ b/arena.h
@@ -0,0 +1,149 @@
+#ifndef ARENA_H
+#define ARENA_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+typedef struct ArenaPg {
+ struct ArenaPg *prev, *next;
+ char *beg, *end;
+ char data[];
+} ArenaPg;
+
+typedef struct {
+ ArenaPg *pg;
+ char *beg;
+} ArenaMark;
+
+typedef struct {
+ ArenaPg *cur, *tail;
+} Arena;
+
+#define new(a, t)\
+ (t*)arena_zeroed(arena_alloc(a, sizeof(t), _Alignof(t)), sizeof(t))
+
+#define new_arr(a, t, n)\
+ arena_alloc(a, sizeof(t) * n, _Alignof(t))
+
+#define resize(a, p, old, new)\
+ arena_realloc(a, p, (old) * sizeof(*(p)), (new) * sizeof(*(p)),\
+ _Alignof(__typeof__(*(p))))
+
+void arena_free(Arena *a);
+
+void arena_save(Arena *a, ArenaMark *m);
+void arena_load(Arena *a, ArenaMark *m);
+
+void arena_reset(Arena *a);
+void arena_reserve(Arena *a, ptrdiff_t n);
+
+void *arena_alloc(Arena *a, ptrdiff_t n, ptrdiff_t align);
+void *arena_realloc(Arena *a, void *ptr, ptrdiff_t old, ptrdiff_t new, ptrdiff_t align);
+void *arena_zeroed(void *p, size_t n);
+
+#define ARENA_BACKEND_MALLOC 0
+#define ARENA_BACKEND_MMAP 1
+
+#ifndef ARENA_BACKEND
+#if defined(__linux__)
+# define ARENA_BACKEND ARENA_BACKEND_MMAP
+#else
+# define ARENA_BACKEND ARENA_BACKEND_MALLOC
+#endif
+#endif
+
+#ifdef ARENA_IMPL
+
+#include <stdio.h>
+#include <stdlib.h>
+static void arena_pg_alloc_fail(void) {
+ fprintf(stderr, "failed to allocate arena page\n");
+ abort();
+}
+
+#if ARENA_BACKEND == ARENA_BACKEND_MMAP
+#include <sys/mman.h>
+#include <unistd.h>
+#define ARENA_PG_SIZE sysconf(_SC_PAGESIZE)
+static inline void *arena_pg_alloc(ptrdiff_t n) {
+ void *p = mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ return p == MAP_FAILED ? NULL : p;
+}
+static inline void arena_pg_free(void *ptr, ptrdiff_t n) { munmap(ptr, n); }
+#elif ARENA_BACKEND == ARENA_BACKEND_MALLOC
+#define ARENA_PG_SIZE 8192
+static inline void *arena_pg_alloc(ptrdiff_t n) { return malloc(n); }
+static inline void arena_pg_free(void *ptr, ptrdiff_t n) { free(ptr); (void)n; }
+#endif
+
+void arena_free(Arena *a) {
+ while (a->tail) {
+ a->cur = a->tail->prev;
+ arena_pg_free(a->tail, (uintptr_t)(a->tail->end - (char*)a->tail));
+ a->tail = a->cur;
+ }
+}
+
+void arena_reserve(Arena *a, ptrdiff_t n) {
+ while (a->cur && a->cur->beg + n >= a->cur->end) a->cur = a->cur->next;
+ if (a->cur) return;
+ ptrdiff_t cap = n + sizeof(ArenaPg);
+ cap += (uintptr_t)-cap & (ARENA_PG_SIZE - 1);
+ ArenaPg *p = arena_pg_alloc(cap);
+ if (!p) arena_pg_alloc_fail();
+ p->next = NULL;
+ p->prev = a->tail;
+ p->beg = p->data;
+ p->end = (char*)p + cap;
+ if (a->tail) a->tail->next = p;
+ a->cur = (a->tail = p);
+}
+
+void *arena_alloc(Arena *a, ptrdiff_t n, ptrdiff_t align) {
+ arena_reserve(a, n + (align - 1));
+ char *ptr = a->cur->beg + (-(uintptr_t)a->cur->beg & (align - 1));
+ a->cur->beg = ptr + n;
+ return ptr;
+}
+
+void *arena_realloc(Arena *a, void *ptr, ptrdiff_t old, ptrdiff_t new, ptrdiff_t align) {
+ if (a->cur && ptr == a->cur->beg - old && (char*)ptr + new < a->cur->end) {
+ a->cur->beg += new - old;
+ return ptr;
+ } else {
+ void *p = arena_alloc(a, new, align);
+ if (ptr) memcpy(p, ptr, old);
+ return p;
+ }
+}
+
+void *arena_zeroed(void *p, size_t n) {
+ memset(p, 0, n);
+ return p;
+}
+
+void arena_reset(Arena *a) {
+ if (!a->cur) return;
+ while (a->cur->prev) {
+ a->cur->beg = a->cur->data;
+ a->cur = a->cur->prev;
+ }
+ a->cur->beg = a->cur->data;
+}
+
+void arena_save(Arena *a, ArenaMark *m) {
+ m->pg = a->cur;
+ if (a->cur) m->beg = a->cur->beg;
+}
+
+void arena_load(Arena *a, ArenaMark *m) {
+ while (a->cur && a->cur != m->pg) {
+ a->cur->beg = a->cur->data;
+ a->cur = a->cur->prev;
+ }
+ if (a->cur) a->cur->beg = m->beg;
+}
+
+#endif
+#endif
diff --git a/doc/demo.txt b/doc/demo.txt
new file mode 100644
index 0000000..58c0839
--- /dev/null
+++ b/doc/demo.txt
@@ -0,0 +1,94 @@
+type vec2 = [2]f32
+
+type Sprite = enum u8 {
+ BLANK,
+ PLAYER
+}
+
+type Player = struct {
+ pos, vel vec2
+}
+
+extern {
+ blit proc(spr Sprite, x, y i32),
+ exp func(v f32) f32
+}
+
+proc frame(dt f32) {
+ /* unary .. can splat an array as parameters */
+ blit(.PLAYER, ..plr.pos)
+ plr.pos += plr.vel * dt
+ plr.vel *= exp(-dt * 5)
+}
+
+/* statements
+
+ type Ident = T Alias Ident to type T
+ extern Ident T Define an external variable (or function!) of the given type
+ extern { a A, b B... } Block of extern declarations, formatted like a struct
+
+*/
+
+/* expressions
+
+ N Integer
+ N.N | NeN | N.NeN Float
+ [-+~] A Unary operator on term A
+ A [+-/*%&|^] B Binary operator on term A and expression B
+ A := B Type inference assignment
+ A, B, C := D
+ T { ... } Literal of type T; mostly only makes sense for structs,
+ but this can be used to do something like u8 { 2 } too.
+
+/* types
+
+ IDENT type of the given identifier
+ ^T pointer to type T
+ enum { FOO, BAR... } enum
+ enum T { FOO, BAR... } enum with backing type T, which must be an integer type
+ struct { a A, b B... } struct
+ A..B range (kinda an anonymous enum)
+ A and B must be ordinal types (enum value, integer, or character)
+ [N]T array of T of length N
+ [E]T array of T indexed by enum or range E
+ []T slice
+ (Ta, Tb...) tuple of given types; can be destructured to multiple assignment
+ set[E] a set of enum or range E
+ proc(a A, b B...) function pointer
+ func(a A, b B...) T function pointer with return value T
+
+*/
+
+Expr = LValue | Literal
+LValue = Ident | LValue '.' Ident
+Literal = Number
+Number = Integer | Float
+Integer = /[0-9]+/
+Float = Integer 'e' Integer | Integer '.' Integer | Integer '.' Integer 'e' Integer
+
+ IDENT Variable
+ LVALUE . IDENT Member access
+ [0-9]+ Integer
+ [0-9]+ . [0-9]+ Float
+ [0-9]+ . [0-9]+ e [0-9]+ Float
+ F(E1, E2...) function call with the given expression arguments
+
+*/
+
+/* import first tries to scan local directory for the package, then standard
+ * system directory; in said directory it will look for an IR cache file,
+ * or compile a new one if it's not present (or if its last-modified time
+ * is older than the compiler executable), which is imported with a name
+ * equivalent to the last part of the path.
+ *
+ * codegen is only performed on procedures that get called from main and their
+ * own dependencies, recursively. this allows to keep output binaries small.
+ * maybe supply some sort of export qualifier to do something similar for
+ * procedures that aren't main.
+ */
+
+import "io"
+
+proc main {
+ io.print("Hello, world!")
+}
diff --git a/doc/test.txt b/doc/test.txt
new file mode 100644
index 0000000..da8f501
--- /dev/null
+++ b/doc/test.txt
@@ -0,0 +1,119 @@
+type string = []char
+
+func split(src: string, by: char, a: ^Arena): [dyn]string do
+ let r: [dyn]string = {}
+ let s: string = ""
+ for c in src do
+ if c == by then
+ r.push(s, a)
+ s = ""
+ else
+ s.push(c, a)
+ end
+ end
+ return r
+end
+
+proc main do
+ let a = Arena()
+ let lines = split(stdin.readAll(@a), '\n', @a)
+ for i in 0..len(lines) do
+ print(lines[i])
+ end
+end
+
+---
+
+fn next_line(src, line: ^[]u8) -> bool {
+ if len(src) < 1 {
+ return false
+ }
+ let idx = find(src, '\n')
+ if idx <> -1 {
+ line^ := src^[0..idx]
+ } else {
+ line^ := src^
+ }
+ src^ = src^[len(line)..]
+ return true
+}
+
+fn main {
+ let a = Arena()
+ let buf = readall(stdin, &a)
+ let line: []u8
+ while next_line(&buf, &line) {
+ print(line)
+ }
+}
+
+---
+
+/* builtin slice type */
+
+fn print(str []u8) {
+ for i in str {
+ putchar(str[i])
+ }
+ putchar('\n')
+}
+
+/* iterators are just functions that return an initial state object and a
+ * step function that acts on it to produce a value */
+
+type LineIterFn = fn(it ^[]u8) ([]u8, bool)
+
+fn lines(src []u8) ([]u8, LineIterFn) {
+ fn step(it ^[]u8) []u8, bool {
+ if len(it^) < 1 { return {}, false }
+ idx := find(it^, '\n')
+ line := if idx == -1 { it^ } else { it^[0..idx] }
+ it^ = it^[len(line)..]
+ return line, true
+ }
+ return src, step
+}
+
+/* if multiple values used in if/while condition, only the last one is checked */
+
+fn main {
+ a := Arena {}
+ buf := readall(stdin, &a)
+ for line in lines(buf) {
+ print(line)
+ }
+}
+
+/* generics? */
+
+fn reverse[T any](buf []T) ([]T, fn(it ^[]T) (T, bool)) {
+ fn step(it ^[]T) (T, bool) {
+ if len(^it) < 1 { return {}, false }
+ n := len(it^) - 1
+ v := it^[n]
+ it^ := it^[0..n]
+ return v, true
+ }
+ return buf, step
+}
+
+fn foo {
+ s := "Hello, world!"
+ // print(reverse(s)) <- wouldn't actually work, because iterators can't be cast back to slices
+ a := Arena {}
+ print(collect(reverse(s), &a))
+}
+
+struct MapIter[In any, Out any] {
+ ctx: rawptr,
+ iter: fn(it rawptr) In,
+ map: fn(x ^In) Out
+}
+
+fn map[In any, Out any](p rawptr, f func(it rawptr) In, f func(x ^In) Out) (MapIter[In, Out], fn(it ^MapIter[In, Out]) (Out, bool)) {
+ fn step(it ^MapIter[In, Out]) (Out, bool) {
+ v, ok := it.iter(it.ctx)
+ if !ok { return {}, false }
+ return it.map(&v), true
+ }
+}
diff --git a/impl.c b/impl.c
new file mode 100644
index 0000000..96d8836
--- /dev/null
+++ b/impl.c
@@ -0,0 +1,6 @@
+#define STR_IMPL
+#define STRIO_IMPL
+#define ARENA_IMPL
+#include "str.h"
+#include "strio.h"
+#include "arena.h"
diff --git a/lex.c b/lex.c
new file mode 100644
index 0000000..7a1d345
--- /dev/null
+++ b/lex.c
@@ -0,0 +1,184 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "lex.h"
+#include "arena.h"
+#include "strio.h"
+
+void lex_start(Lexer *l, const char *path) {
+ l->filename = str_dup(str_from_cstr(path), &l->arena);
+ FILE *f = fopen(path, "r/o");
+ if (!f) {
+ fprintf(stderr, "Couldn't open file %s\n", path);
+ exit(1);
+ }
+ if (read_all(f, &l->buf, &l->arena)) {
+ fprintf(stderr, "Couldn't read file %s\n", path);
+ fclose(f);
+ exit(1);
+ }
+ lex_next(l);
+}
+
+void lex_free(Lexer *l) {
+ arena_free(&l->arena);
+}
+
+Str lex_mask_str(Lexer *l, TokMask t) {
+ Str s = S("");
+ for (Token i = 0; i < TOK_MAX; i++) {
+ if (t & TMASK(i)) {
+ if (s.n > 0) str_cat(&s, S(" or "), &l->arena);
+ str_cat(&s, str_from_cstr(lex_tok_str[i]), &l->arena);
+ }
+ }
+ return s;
+}
+
+void lex_expect(Lexer *l, TokMask t) {
+ lex_next(l);
+ lex_expected(l, t);
+}
+
+void lex_expect_not(Lexer *l, TokMask t) {
+ lex_next(l);
+ lex_expected_not(l, t);
+}
+
+void lex_expected(Lexer *l, TokMask t) {
+ if (!(TMASK(l->tok) & t)) {
+ lex_error(l, LE_ERROR, str_fmt(&l->arena, "Expected %S but got %s", lex_mask_str(l, t), lex_tok_str[l->tok]));
+ }
+}
+
+void lex_expected_not(Lexer *l, TokMask t) {
+ if (TMASK(l->tok) & t) {
+ lex_error(l, LE_ERROR, str_fmt(&l->arena, "Unexpected %s", lex_tok_str[l->tok]));
+ }
+}
+
+void lex_pos(Lexer *l, int *line, int *col) {
+ int ln = 0, c = 0;
+ for (int i = 0; i < l->ofs; i++) {
+ if (l->buf.s[i] == '\n') {
+ ln++;
+ c = 0;
+ } else {
+ c++;
+ if (l->buf.s[i] == '\t') {
+ c += (unsigned)-c & 7;
+ }
+ }
+ }
+ *line = ln;
+ *col = c;
+}
+
+void lex_error(Lexer *l, LexErr e, Str msg) {
+ int line, col;
+ l->ofs -= l->ident.n;
+ lex_pos(l, &line, &col);
+
+ fprintf(stderr, "%s", e == LE_ERROR ? "\x1b[1;31m" : "\x1b[1;33m");
+
+ fprintf(stderr, "%.*s:%d:%d: %.*s\n\n",
+ (int)l->filename.n, l->filename.s, line + 1, col + 1, (int)msg.n, msg.s);
+
+ {
+ int ofs = l->ofs;
+ int line_start = ofs;
+ while (line_start > 0 && l->buf.s[line_start - 1] != '\n') line_start--;
+ int line_end = line_start;
+ while (line_end < l->buf.n && l->buf.s[line_end] != '\n') line_end++;
+ fprintf(stderr, "%.*s\n", line_end - line_start, &l->buf.s[line_start]);
+ for (int i = 0; i < col; i++) putchar(' ');
+ for (int i = ofs; i < ofs + l->ident.n && i < line_end; i++) putchar('^');
+ putchar('\n');
+ }
+
+ fprintf(stderr, "\x1b[0m\n");
+
+ if (e == LE_ERROR) {
+ exit(1);
+ }
+}
+
+static inline int is_digit(int c) {
+ return c >= '0' && c <= '9';
+}
+
+static inline int is_ident_first_char(int c) {
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
+}
+
+static inline int is_ident_next_char(int c) {
+ return is_ident_first_char(c) || is_digit(c);
+}
+
+Token ident_to_keyword(Str ident) {
+ /* evil & stupid hack to avoid keeping a separate table of keywords */
+ for (Token t = 0; t < TOK_MAX; t++) {
+ if (str_eql(str_from_cstr(lex_tok_str[t]), ident)) {
+ return t;
+ }
+ }
+ return TOK_IDENT;
+}
+
+#define T(t) (l->tok = t)
+void lex_next(Lexer *l) {
+ int i = l->ofs;
+ while (i < l->buf.n && is_space(l->buf.s[i])) {
+ i++;
+ }
+ int start_ofs = i;
+ l->ident = (Str) { &l->buf.s[start_ofs], 0 };
+ if (i >= l->buf.n) {
+ l->tok = TOK_EOF;
+ return;
+ }
+ char c = l->buf.s[i++];
+ l->tok = TOK_MAX;
+ if (is_ident_first_char(c)) {
+ T(TOK_IDENT);
+ while (i < l->buf.n && is_ident_next_char(l->buf.s[i])) i++;
+ } else if (is_digit(c)) {
+ T(TOK_LIT_NUM);
+ while (i < l->buf.n && (is_digit(l->buf.s[i]) || l->buf.s[i] == '.' || l->buf.s[i] == 'e')) i++;
+ } else {
+ switch (c) {
+#define X(a,b) case b: T(a); break;
+ LEX_TOK_CHAR_LIST
+#undef X
+ case '\'':
+ T(TOK_LIT_CHAR);
+ if (i < l->buf.n && l->buf.s[i] == '\\') i += 2;
+ else i++;
+ if (i >= l->buf.n) lex_error(l, LE_ERROR, S("Unterminated character literal"));
+ if (l->buf.s[i] != '\'') lex_error(l, LE_ERROR, S("Overlong character literal"));
+ i++;
+ break;
+ case '"':
+ T(TOK_LIT_STR);
+ for (;;) {
+ if (i >= l->buf.n) {
+ lex_error(l, LE_ERROR, S("Unterminated string literal"));
+ }
+ if (l->buf.s[i] == '\\') {
+ i += 2;
+ continue;
+ }
+ if (l->buf.s[i++] == '"') break;
+ }
+ break;
+ }
+ }
+ if (l->tok == TOK_MAX) {
+ lex_error(l, LE_ERROR, S("Invalid token"));
+ }
+ l->ident.n = i - start_ofs;
+ l->ofs = i;
+ if (l->tok == TOK_IDENT) {
+ l->tok = ident_to_keyword(l->ident);
+ }
+}
diff --git a/lex.h b/lex.h
new file mode 100644
index 0000000..460636a
--- /dev/null
+++ b/lex.h
@@ -0,0 +1,81 @@
+#ifndef LEX_H
+#define LEX_H
+
+#include "str.h"
+
+#define LEX_TOK_LIST\
+ X(TOK_EOF, "end-of-file")\
+ X(TOK_IDENT, "identifier")\
+ X(TOK_PROC, "proc")\
+ X(TOK_LET, "let")\
+ X(TOK_VAR, "var")\
+ X(TOK_CONST, "const")\
+ X(TOK_RETURN, "return")\
+ X(TOK_LBRACE, "{")\
+ X(TOK_RBRACE, "}")\
+ X(TOK_LPAREN, "(")\
+ X(TOK_RPAREN, ")")\
+ X(TOK_PLUS, "+")\
+ X(TOK_MINUS, "-")\
+ X(TOK_ASTERISK, "*")\
+ X(TOK_SLASH, "/")\
+ X(TOK_EQUALS, "=")\
+ X(TOK_COLON, ":")\
+ X(TOK_COMMA, ",")\
+ X(TOK_LIT_STR, "string literal")\
+ X(TOK_LIT_CHAR, "character literal")\
+ X(TOK_LIT_NUM, "numeric literal")
+
+#define LEX_TOK_CHAR_LIST\
+ X(TOK_LBRACE, '{')\
+ X(TOK_RBRACE, '}')\
+ X(TOK_LPAREN, '(')\
+ X(TOK_RPAREN, ')')\
+ X(TOK_PLUS, '+')\
+ X(TOK_MINUS, '-')\
+ X(TOK_ASTERISK, '*')\
+ X(TOK_SLASH, '/')\
+ X(TOK_EQUALS, '=')\
+ X(TOK_COLON, ':')\
+ X(TOK_COMMA, ',')
+
+typedef enum {
+#define X(n, s) n,
+ LEX_TOK_LIST
+#undef X
+ TOK_MAX,
+} Token;
+
+static const char *lex_tok_str[TOK_MAX] = {
+#define X(n, s) s,
+ LEX_TOK_LIST
+#undef X
+};
+
+#define TMASK(t) (1L << t)
+typedef unsigned long TokMask;
+
+typedef struct {
+ Token tok;
+ Str ident;
+ Str filename, buf;
+ Arena arena;
+ int ofs;
+} Lexer;
+
+typedef enum {
+ LE_WARN,
+ LE_ERROR
+} LexErr;
+
+void lex_start(Lexer *l, const char *path);
+void lex_free(Lexer *l);
+void lex_expected(Lexer *l, TokMask t);
+void lex_expected_not(Lexer *l, TokMask t);
+void lex_expect(Lexer *l, TokMask t); /* next -> expected */
+void lex_expect_not(Lexer *l, TokMask t); /* next -> expected_not */
+void lex_error(Lexer *l, LexErr e, Str msg);
+void lex_pos(Lexer *l, int *line, int *col);
+void lex_next(Lexer *l);
+
+#endif
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..59a41a2
--- /dev/null
+++ b/main.c
@@ -0,0 +1,161 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lex.h"
+
+/* node graph */
+
+typedef enum {
+ N_START,
+ N_STOP,
+ N_LIT_INT,
+ N_OP_ADD,
+ N_OP_SUB,
+ N_OP_MUL,
+ N_OP_DIV,
+ N_OP_NEG,
+} NodeType;
+
+typedef struct Node {
+ NodeType type;
+ struct Node **inputs, **outputs;
+} Node;
+
+typedef struct {
+ Node *start, *stop;
+} ProcGraph;
+
+/* parsing */
+
+void parse_expr(Lexer *l);
+
+void parse_stmt_let(Lexer *l) {
+ lex_expect(l, TMASK(TOK_IDENT));
+ Str name = l->ident;
+ lex_expect(l, TMASK(TOK_EQUALS) | TMASK(TOK_COLON));
+ lex_next(l);
+ parse_expr(l);
+ printf("name_bind %.*s\n", (int)name.n, name.s);
+}
+
+void parse_stmt(Lexer *l) {
+ /* TODO */
+ (void)l;
+ switch (l->tok) {
+ case TOK_LET:
+ parse_stmt_let(l);
+ break;
+ case TOK_RETURN:
+ lex_next(l);
+ parse_expr(l);
+ puts("return");
+ break;
+ default:
+ lex_expected(l, TMASK(TOK_RBRACE));
+ }
+}
+
+void parse_proc(Lexer *l) {
+ lex_expect(l, TMASK(TOK_IDENT));
+ lex_expect(l, TMASK(TOK_LBRACE));
+ lex_next(l);
+ while (l->tok != TOK_RBRACE) {
+ lex_expected_not(l, TMASK(TOK_EOF));
+ parse_stmt(l);
+ }
+ lex_expected(l, TMASK(TOK_RBRACE));
+ lex_next(l);
+}
+
+void parse_term(Lexer *l) {
+ int negate_after = l->tok == TOK_MINUS;
+ if (TMASK(l->tok) & (TMASK(TOK_MINUS) | TMASK(TOK_PLUS))) {
+ lex_next(l);
+ }
+ if (l->tok == TOK_LPAREN) {
+ lex_next(l);
+ parse_expr(l);
+ lex_expected(l, TMASK(TOK_RPAREN));
+ lex_next(l);
+ } else {
+ lex_expected(l, TMASK(TOK_LIT_NUM) | TMASK(TOK_LIT_STR) | TMASK(TOK_LIT_CHAR) | TMASK(TOK_IDENT));
+ switch (l->tok) {
+ case TOK_IDENT:
+ printf("var %.*s\n", (int)l->ident.n, l->ident.s);
+ break;
+ case TOK_LIT_STR:
+ printf("lit_str %.*s\n", (int)l->ident.n, l->ident.s);
+ break;
+ case TOK_LIT_CHAR:
+ printf("lit_char %.*s\n", (int)l->ident.n, l->ident.s);
+ break;
+ case TOK_LIT_NUM:
+ printf("lit_num %.*s\n", (int)l->ident.n, l->ident.s);
+ break;
+ default:
+ break;
+ }
+ lex_next(l);
+ }
+ if (negate_after) puts("negate");
+}
+
+void parse_expr(Lexer *l) {
+ parse_term(l);
+ if (l->tok == TOK_LPAREN) {
+ lex_next(l);
+ puts("args_start");
+ for (;;) {
+ parse_expr(l);
+ lex_expected(l, TMASK(TOK_COMMA) | TMASK(TOK_RPAREN));
+ if (l->tok == TOK_RPAREN) break;
+ lex_next(l);
+ }
+ lex_next(l);
+ puts("args_end");
+ puts("func_call");
+ }
+ if (TMASK(l->tok) & (TMASK(TOK_PLUS) | TMASK(TOK_MINUS) | TMASK(TOK_ASTERISK) | TMASK(TOK_SLASH))) {
+ Token t = l->tok;
+ lex_next(l);
+ parse_expr(l);
+ switch (t) {
+ case TOK_PLUS: puts("add"); break;
+ case TOK_MINUS: puts("subtract"); break;
+ case TOK_ASTERISK: puts("multiply"); break;
+ case TOK_SLASH: puts("divide"); break;
+ default: break;
+ }
+ }
+}
+
+void parse_decl(Lexer *l) {
+ switch (l->tok) {
+ case TOK_PROC:
+ parse_proc(l);
+ break;
+ default:
+ lex_error(l, LE_ERROR, S("Invalid declaration statement"));
+ }
+}
+
+void parse_unit(Lexer *l) {
+ while (l->tok != TOK_EOF) {
+ parse_decl(l);
+ }
+}
+
+int main(int argc, const char **argv) {
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s FILE\n", argv[0]);
+ return 1;
+ }
+ Lexer l = { 0 };
+ lex_start(&l, argv[1]);
+ parse_unit(&l);
+ lex_free(&l);
+ return 0;
+}
diff --git a/str.h b/str.h
new file mode 100644
index 0000000..b10bc64
--- /dev/null
+++ b/str.h
@@ -0,0 +1,152 @@
+#ifndef STR_H
+#define STR_H
+
+#include <string.h>
+#include <stddef.h>
+
+#include "arena.h"
+
+typedef struct {
+ char *s;
+ ptrdiff_t n;
+} Str;
+
+typedef struct {
+ Str head, tail;
+} Cut;
+
+#define S(s) (Str){s,sizeof(s)-1}
+
+char *str_to_cstr(Str s, Arena *a);
+Str str_from_cstr(const char *s);
+int str_eql(Str a, Str b);
+int str_starts(Str a, Str b);
+int str_ends(Str a, Str b);
+void str_catc(Str *a, char b, Arena *m);
+Str str_skip(Str a, ptrdiff_t n);
+int is_space(char c);
+Str str_trim_left(Str a);
+Str str_trim_right(Str a);
+Str str_trim(Str a);
+Cut str_cut(Str s, char c);
+Str str_findc(Str s, char c);
+Str str_find(Str haystack, Str needle);
+int str_contains(Str a, Str b);
+Str str_dup(Str a, Arena *m);
+void str_cat(Str *a, Str b, Arena *m);
+Str str_replace_end(Str s, Str a, Str b, Arena *m);
+
+#ifdef STR_IMPL
+
+/* conversions */
+
+char *str_to_cstr(Str s, Arena *a) {
+ char *r = new_arr(a, char, s.n + 1);
+ memcpy(r, s.s, s.n);
+ r[s.n] = 0;
+ return r;
+}
+
+Str str_from_cstr(const char *s) {
+ return (Str) { (char*)s, strlen(s) };
+}
+
+/* pure functions */
+
+int str_eql(Str a, Str b) {
+ return a.n == b.n && !memcmp(a.s, b.s, b.n);
+}
+
+int str_starts(Str a, Str b) {
+ return a.n >= b.n && !memcmp(a.s, b.s, b.n);
+}
+
+int str_ends(Str a, Str b) {
+ return a.n >= b.n && !memcmp(&a.s[a.n - b.n], b.s, b.n);
+}
+
+void str_catc(Str *a, char b, Arena *m) {
+ a->s = resize(m, a->s, a->n, a->n + 1);
+ a->s[a->n++] = b;
+}
+
+Str str_skip(Str a, ptrdiff_t n) {
+ return (Str) { a.s + n, a.n - n };
+}
+
+int is_space(char c) {
+ return c == ' ' || c == '\t' || c == '\n' || c == '\r';
+}
+
+Str str_trim_left(Str a) {
+ while (a.n > 0 && is_space(a.s[0])) a.s++, a.n--;
+ return a;
+}
+
+Str str_trim_right(Str a) {
+ while (a.n > 0 && is_space(a.s[a.n - 1])) a.n--;
+ return a;
+}
+
+Str str_trim(Str a) {
+ return str_trim_left(str_trim_right(a));
+}
+
+/* splitting, searching */
+
+Cut str_cut(Str s, char c) {
+ char *p = memchr(s.s, c, s.n);
+ if (!p) {
+ return (Cut) { s, { &s.s[s.n], 0 } };
+ } else {
+ return (Cut) {
+ { s.s, p - s.s },
+ { p + 1, &s.s[s.n] - (p + 1) }
+ };
+ }
+}
+
+Str str_findc(Str s, char c) {
+ char *p = memchr(s.s, c, s.n);
+ return p ? (Str) { p, s.n - (p - s.s) } : (Str) { &s.s[s.n], 0 };
+}
+
+Str str_find(Str haystack, Str needle) {
+ if (needle.n < 1) return haystack;
+ while (haystack.n > 0) {
+ haystack = str_findc(haystack, needle.s[0]);
+ if (str_starts(haystack, needle)) break;
+ if (haystack.n > 0) haystack = str_skip(haystack, 1);
+ }
+ return haystack;
+}
+
+int str_contains(Str a, Str b) {
+ return str_find(a, b).n > 0;
+}
+
+/* allocating */
+
+Str str_dup(Str a, Arena *m) {
+ char *s = new_arr(m, char, a.n);
+ memcpy(s, a.s, a.n);
+ a.s = s;
+ return a;
+}
+
+void str_cat(Str *a, Str b, Arena *m) {
+ a->s = resize(m, a->s, a->n, a->n + b.n);
+ memcpy(&a->s[a->n], b.s, b.n);
+ a->n += b.n;
+}
+
+Str str_replace_end(Str s, Str a, Str b, Arena *m) {
+ if (!str_ends(s, a)) return s;
+ char *p = new_arr(m, char, s.n + b.n - a.n);
+ memcpy(p, s.s, s.n - a.n);
+ memcpy(p + s.n - a.n, b.s, b.n);
+ return (Str) { p, s.n + b.n - a.n };
+}
+
+#endif
+#endif
diff --git a/strio.h b/strio.h
new file mode 100644
index 0000000..0d6c6f5
--- /dev/null
+++ b/strio.h
@@ -0,0 +1,234 @@
+#ifndef STRIO_H
+#define STRIO_H
+
+#include <stdarg.h>
+
+#include "str.h"
+#include "arena.h"
+
+int read_all(FILE *f, Str *buf, Arena *a);
+int next_line(Str *src, Str *line);
+void str_putf(Str s, FILE *f);
+void str_put(Str s);
+
+int str_to_u64(Str s, uint64_t *out);
+int str_to_i64(Str s, int64_t *out);
+
+void str_cat_i64(Str *out, int64_t c, char pad_char, int min_width, Arena *a);
+void str_cat_u64(Str *out, uint64_t c, char pad_char, int min_width, Arena *a);
+void str_cat_fmtv(Str *out, Arena *arena, const char *fmt, va_list ap);
+void str_cat_fmt(Str *out, Arena *arena, const char *fmt, ...);
+
+Str str_fmtv(Arena *arena, const char *fmt, va_list ap);
+Str str_fmt(Arena *arena, const char *fmt, ...);
+const char *cstr_fmt(Arena *arena, const char *fmt, ...);
+
+#ifdef STRIO_IMPL
+
+#include <stdio.h>
+#include <stdint.h>
+
+static inline long read_all_file_size(FILE *f) {
+ fseek(f, 0, SEEK_END);
+ long t = ftell(f);
+ fseek(f, 0, SEEK_SET);
+ return t > 0 ? t : -1;
+}
+
+int read_all(FILE *f, Str *buf, Arena *a) {
+ if (!f) return -1;
+ long sz = read_all_file_size(f);
+ if (sz < 1) {
+ ptrdiff_t cap = 4096;
+ buf->s = new_arr(a, char, cap);
+ buf->n = 0;
+ while (!feof(f)) {
+ size_t n = fread(&buf->s[buf->n], 1, cap - buf->n, f);
+ if (n < 1) break;
+ buf->n += n;
+ if (buf->n >= cap) {
+ size_t c = cap;
+ while (buf->n >= cap) cap <<= 1;
+ buf->s = resize(a, buf->s, c, cap);
+ }
+ }
+ } else {
+ buf->n = sz;
+ buf->s = new_arr(a, char, sz);
+ size_t sz = fread(buf->s, 1, buf->n, f);
+ if (sz < (size_t)buf->n) return -1;
+ }
+ return ferror(f) ? -1 : 0;
+}
+
+int next_line(Str *src, Str *line) {
+ if (src->n < 1) return 0;
+ line->s = src->s;
+ char *newln = memchr(src->s, '\n', src->n);
+ line->n = newln ? newln - src->s : src->n;
+ src->s += line->n + 1;
+ src->n -= line->n + 1;
+ if (line->n > 0 && line->s[line->n-1] == '\r') line->n--;
+ return 1;
+}
+
+void str_putf(Str s, FILE *f) {
+ fwrite(s.s, 1, s.n, f);
+}
+
+void str_put(Str s) {
+ str_putf(s, stdout);
+}
+
+/* formatted conversion */
+
+int str_to_u64(Str s, uint64_t *out) {
+ if (s.n < 1) return -1;
+ uint64_t acc = 0;
+ for (int i = 0; i < s.n; i++) {
+ char c = s.s[i];
+ if (!(c >= '0' && c <= '9')) return -1;
+ acc = (acc * 10) + (c - '0');
+ }
+ *out = acc;
+ return 0;
+}
+
+int str_to_i64(Str s, int64_t *out) {
+ int64_t sign = 1;
+ if (s.n > 0 && s.s[0] == '-') {
+ s = str_skip(s, 1);
+ sign = -1;
+ }
+ uint64_t u64;
+ if (str_to_u64(s, &u64)) return -1;
+ *out = u64 * sign;
+ return 0;
+}
+
+static void str_cat_u64_(char buf[32], int *n, uint64_t c) {
+ int i = 0;
+ buf[31] = '\0';
+ do {
+ buf[32 - ++i] = (c % 10) + '0';
+ c /= 10;
+ } while (c);
+ *n = i;
+}
+
+void str_cat_u64(Str *out, uint64_t c, char pad_char, int min_width, Arena *a) {
+ int n;
+ /* more than enough for the largest 64-bit number
+ * log_10(1 << 64) ~= 19.3 digits max */
+ char buf[32];
+ str_cat_u64_(buf, &n, c);
+ while (n < min_width && ++n < 32) buf[32-n] = pad_char;
+ str_cat(out, (Str) { &buf[sizeof(buf) - n], n }, a);
+}
+
+void str_cat_i64(Str *out, int64_t c, char pad_char, int min_width, Arena *a) {
+ /* more than enough for the largest 64-bit number
+ * log_10(1 << 64) ~= 19.3 digits max */
+ int n, neg = 0;
+ char buf[32];
+ if (c < 0) neg = 1, c = -c;
+ str_cat_u64_(buf, &n, c);
+ if (neg) buf[sizeof(buf) - ++n] = '-';
+ while (n < min_width && ++n < 32) buf[32-n] = pad_char;
+ str_cat(out, (Str) { &buf[sizeof(buf) - n], n }, a);
+}
+
+/* IMPORTANT: this is not and will not be printf() compatible
+ *
+ * %s - c string
+ * %S - Str
+ * %i - int32
+ * %I - int64
+ * %u - uint32
+ * %U - uin64
+ *
+ **/
+void str_cat_fmtv(Str *out, Arena *arena, const char *fmt, va_list ap) {
+ size_t n = strlen(fmt);
+ for (size_t i = 0; i < n; i++) {
+ const char *mch = memchr(&fmt[i], '%', n - i);
+ if (!mch) {
+ str_cat(out, (Str) { (char*)&fmt[i], n - i }, arena);
+ break;
+ }
+ size_t skip = mch - &fmt[i];
+ if (mch != &fmt[i]) {
+ str_cat(out, (Str) { (char*)&fmt[i], skip }, arena);
+ i += skip;
+ }
+ if (i + 1 < n) {
+ int zero_pad = 0, min_width = 0;
+ i++;
+ if (fmt[i] == '0') {
+ zero_pad = 1;
+ i++;
+ }
+ while (i < n && fmt[i] >= '0' && fmt[i] <= '9') {
+ min_width = min_width * 10 + (fmt[i] - '0');
+ i++;
+ }
+ if (i >= n) break;
+ switch (fmt[i]) {
+ case 's':
+ str_cat(out, str_from_cstr(va_arg(ap, const char *)), arena);
+ break;
+ case 'S':
+ str_cat(out, va_arg(ap, Str), arena);
+ break;
+ case 'i':
+ str_cat_i64(out, va_arg(ap, int32_t), zero_pad?'0':' ', min_width, arena);
+ break;
+ case 'I':
+ str_cat_i64(out, va_arg(ap, int64_t), zero_pad?'0':' ', min_width, arena);
+ break;
+ case 'u':
+ str_cat_u64(out, va_arg(ap, uint32_t), zero_pad?'0':' ', min_width, arena);
+ break;
+ case 'U':
+ str_cat_u64(out, va_arg(ap, uint64_t), zero_pad?'0':' ', min_width, arena);
+ break;
+ default:
+ str_catc(out, fmt[i], arena);
+ break;
+ }
+ }
+ }
+}
+
+void str_cat_fmt(Str *out, Arena *arena, const char *fmt, ...) {
+ va_list ap;
+ va_start(ap, fmt);
+ str_cat_fmtv(out, arena, fmt, ap);
+ va_end(ap);
+}
+
+Str str_fmtv(Arena *arena, const char *fmt, va_list ap) {
+ Str s = { 0 };
+ str_cat_fmtv(&s, arena, fmt, ap);
+ return s;
+}
+
+Str str_fmt(Arena *arena, const char *fmt, ...) {
+ va_list ap;
+ va_start(ap, fmt);
+ Str r = str_fmtv(arena, fmt, ap);
+ va_end(ap);
+ return r;
+}
+
+const char *cstr_fmt(Arena *arena, const char *fmt, ...) {
+ va_list ap;
+ va_start(ap, fmt);
+ Str r = str_fmtv(arena, fmt, ap);
+ str_catc(&r, '\0', arena);
+ va_end(ap);
+ return r.s;
+}
+
+#endif
+#endif
diff --git a/test.lang b/test.lang
new file mode 100644
index 0000000..7559516
--- /dev/null
+++ b/test.lang
@@ -0,0 +1,3 @@
+proc main {
+ return 1
+}
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..10a4f2b
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,64 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+#include "str.h"
+
+#define UTF8_INVALID (unsigned)-1
+
+int utf8_len(unsigned cp);
+unsigned utf8_next(Str *s);
+void utf8_to_buf(unsigned cp, char *buf, int n);
+
+#ifdef UTF8_IMPL
+
+#include <stdbit.h>
+#include <stdint.h>
+
+unsigned utf8_next(Str *s) {
+ if (s->n < 1) return 0;
+ static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
+ static const uint8_t cpmask[4] = { 0x7f, 0x3f, 0x1f, 0xf };
+ int len = niblen[(uint8_t)*s->s >> 4];
+ if (!len) { s->n--; return *s->s++; }
+ if (s->n < len || (s->s[0] & (0x80 >> len))) return UTF8_INVALID;
+ unsigned cp = (unsigned)*s->s & cpmask[len];
+ for (int i = 1; i < len; i++) {
+ if ((s->s[i] & 0xc0) != 0x80) return UTF8_INVALID;
+ cp = (cp << 6) | (s->s[i] & 0x3f);
+ }
+ s->s += len, s->n -= len;
+ return cp;
+}
+
+unsigned utf8_next_unchecked(Str *s) {
+ if (s->n < 1) return 0;
+ static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
+ static const uint8_t cpmask[4] = { 0x7f, 0x3f, 0x1f, 0xf };
+ int len = niblen[(uint8_t)*s->s >> 4];
+ if (!len) { s->n--; return *s->s++; }
+ unsigned cp = (unsigned)*s->s & cpmask[len];
+ for (int i = 1; i < len; i++) cp = (cp << 6) | (s->s[i] & 0x3f);
+ s->s += len, s->n -= len;
+ return cp;
+}
+
+int utf8_len(unsigned cp) {
+ static const uint8_t tbl[33] = {
+ 6,6,6,6,6,6, 5,5,5,5,5, 4,4,4,4,4,
+ 3,3,3,3,3, 2,2,2,2, 1,1,1,1,1,1,1,1,
+ };
+ return tbl[stdc_leading_zeros(cp)];
+}
+
+void utf8_to_buf(unsigned cp, char *buf, int n) {
+ if (n == 1) {
+ *buf = cp;
+ return;
+ }
+ static const uint8_t tbl[5] = { 0b11000000, 0b11100000, 0b11110000, 0b11111000, 0b11111100 };
+ for (int i = n; --i;) buf[i] = 0x80 | (cp & 0x3f), cp >>= 6;
+ buf[0] = tbl[n - 2] | cp;
+}
+
+#endif
+#endif