summary refs log tree commit diff
path: root/utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'utf8.h')
-rw-r--r-- utf8.h 64
1 files changed, 64 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..10a4f2b
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,64 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+#include "str.h"
+
+/*
+ * Sentinel returned by utf8_next() for input it rejects. The expansion is
+ * fully parenthesized so it behaves as a single operand in any expression
+ * (e.g. `sizeof UTF8_INVALID`).
+ */
+#define UTF8_INVALID ((unsigned)-1)
+
+/* Number of bytes (1..6) used to encode the code point cp. */
+int utf8_len(unsigned cp);
+
+/*
+ * Decode one code point from the front of *s and advance s past it.
+ * Returns 0 when s->n < 1 and UTF8_INVALID when the input is rejected; in
+ * both of those cases s is left unchanged.
+ */
+unsigned utf8_next(Str *s);
+
+/*
+ * Like utf8_next() but performs no validation and no length check beyond
+ * s->n < 1; only meant for input already known to be well formed.
+ */
+unsigned utf8_next_unchecked(Str *s);
+
+/*
+ * Encode cp into buf[0..n-1] as an n-byte sequence; no terminator is written.
+ * NOTE(review): n is presumably the value of utf8_len(cp) — confirm at call sites.
+ */
+void utf8_to_buf(unsigned cp, char *buf, int n);
+
+#ifdef UTF8_IMPL
+
+#include <stdbit.h>
+#include <stdint.h>
+
+/*
+ * Decode one code point from the front of *s and advance s past it.
+ *
+ * Returns 0 without consuming anything when s->n < 1. Returns UTF8_INVALID,
+ * also without consuming anything, when the first byte is a continuation byte
+ * (0x80..0xBF) or 0xF8..0xFF, when fewer bytes remain than the lead byte
+ * announces, or when a trailing byte is not of the form 10xxxxxx.
+ * Overlong encodings, surrogates and values above U+10FFFF are not rejected.
+ */
+unsigned utf8_next(Str *s) {
+	if (s->n < 1) return 0;
+	/* Keyed by the first byte's high nibble: 0 = ASCII, 1 = continuation byte,
+	 * 2..4 = total length of a multi-byte sequence. */
+	static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
+	/* Payload mask of the lead byte, indexed by sequence length; needs five
+	 * entries because len can be 4. */
+	static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };
+	int len = niblen[(uint8_t)*s->s >> 4];
+	if (!len) { s->n--; return *s->s++; }
+	/* A continuation byte can never start a sequence. */
+	if (len == 1) return UTF8_INVALID;
+	/* The bit right after the run of leading ones must be clear; this is what
+	 * rejects 0xF8..0xFF. */
+	if (s->n < len || (s->s[0] & (0x80 >> len))) return UTF8_INVALID;
+	unsigned cp = (unsigned)*s->s & cpmask[len];
+	for (int i = 1; i < len; i++) {
+		if ((s->s[i] & 0xc0) != 0x80) return UTF8_INVALID;
+		cp = (cp << 6) | (s->s[i] & 0x3f);
+	}
+	s->s += len, s->n -= len;
+	return cp;
+}
+
+/*
+ * Decode one code point from the front of *s and advance s past it, without
+ * validating anything.
+ *
+ * Returns 0 without consuming anything when s->n < 1. Otherwise the length is
+ * taken from the first byte alone: the remaining length is not compared with
+ * it and trailing bytes are not checked, so the caller must pass well-formed
+ * input. A leading continuation byte is consumed as a single byte and its low
+ * six bits are returned.
+ */
+unsigned utf8_next_unchecked(Str *s) {
+	if (s->n < 1) return 0;
+	/* Keyed by the first byte's high nibble: 0 = ASCII, 1 = continuation byte,
+	 * 2..4 = total length of a multi-byte sequence. */
+	static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
+	/* Payload mask of the lead byte, indexed by sequence length; needs five
+	 * entries because len can be 4. */
+	static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };
+	int len = niblen[(uint8_t)*s->s >> 4];
+	if (!len) { s->n--; return *s->s++; }
+	unsigned cp = (unsigned)*s->s & cpmask[len];
+	/* Each trailing byte contributes its low six bits. */
+	for (int i = 1; i < len; i++) cp = (cp << 6) | (s->s[i] & 0x3f);
+	s->s += len, s->n -= len;
+	return cp;
+}
+
+/*
+ * Number of bytes (1..6) in the encoded form of cp, looked up from how many
+ * leading zero bits cp has. The 33-entry table covers leading-zero counts
+ * 0..32, i.e. it is laid out for a 32-bit unsigned.
+ */
+int utf8_len(unsigned cp) {
+	static const uint8_t bytes_for_lz[33] = {
+		6, 6, 6, 6, 6, 6,        /* 27..32 significant bits */
+		5, 5, 5, 5, 5,           /* 22..26 */
+		4, 4, 4, 4, 4,           /* 17..21 */
+		3, 3, 3, 3, 3,           /* 12..16 */
+		2, 2, 2, 2,              /*  8..11 */
+		1, 1, 1, 1, 1, 1, 1, 1,  /*  0..7  */
+	};
+	unsigned lz = stdc_leading_zeros(cp);
+	return bytes_for_lz[lz];
+}
+
+/*
+ * Encode cp into buf[0..n-1] as an n-byte sequence; no terminator is written.
+ *
+ * n == 1 stores cp as a single byte. n in 2..6 stores a lead byte followed by
+ * n-1 continuation bytes carrying six bits each. Any other n is ignored and
+ * buf is left untouched. cp is not checked against n: bits that do not fit
+ * are OR-ed into the lead byte as they are.
+ * NOTE(review): callers presumably pass n = utf8_len(cp) — confirm.
+ */
+void utf8_to_buf(unsigned cp, char *buf, int n) {
+	/* Out-of-range lengths would index tbl out of bounds; n <= 0 would also
+	 * send the loop below backwards through memory. */
+	if (n < 1 || n > 6) return;
+	if (n == 1) {
+		*buf = cp;
+		return;
+	}
+	/* Lead-byte length markers for n = 2..6. */
+	static const uint8_t tbl[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
+	/* Fill the continuation bytes from the last one back to buf[1]. */
+	for (int i = n; --i;) buf[i] = 0x80 | (cp & 0x3f), cp >>= 6;
+	/* What remains of cp is the lead byte's payload. */
+	buf[0] = tbl[n - 2] | cp;
+}
+
+#endif
+#endif