summary | refs | log | tree | commit | diff
path: root/utf8.h
diff options
context:
space:
mode:
author: WormHeamer, 2025-07-31 22:37:38 -0400
committer: WormHeamer, 2025-07-31 22:37:38 -0400
commit: 842e22e9eb0f3dff7dabdaa41bcc2133e8f015f5 (patch)
tree: 78f42e68da656698526ff6099e78d82adab1d582 /utf8.h
initial commit
Diffstat (limited to 'utf8.h')
-rw-r--r-- utf8.h 64
1 files changed, 64 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..10a4f2b
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,64 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+#include "str.h"
+
+// Sentinel returned by utf8_next() for malformed input.
+// Fully parenthesized so the cast cannot bind to neighbouring tokens
+// wherever the macro is expanded.
+#define UTF8_INVALID ((unsigned)-1)
+
+// Number of bytes used to encode cp (1..6).
+int utf8_len(unsigned cp);
+// Decode one code point from the front of *s, validating the byte sequence.
+unsigned utf8_next(Str *s);
+// Decode one code point from the front of *s without validating it.
+unsigned utf8_next_unchecked(Str *s);
+// Write the n-byte encoding of cp into buf.
+void utf8_to_buf(unsigned cp, char *buf, int n);
+
+#ifdef UTF8_IMPL
+
+#include <stdbit.h>
+#include <stdint.h>
+
+// Decode the UTF-8 sequence at the front of *s and advance *s past it.
+//
+// Returns the decoded code point, or 0 when s->n < 1 (nothing is consumed),
+// or UTF8_INVALID when the input is malformed, in which case *s is left
+// untouched. Malformed means: a continuation byte in lead position, a lead
+// byte of 0xF8 or above, a sequence cut short by the end of the input, or a
+// trailing byte that is not of the form 10xxxxxx.
+// Overlong encodings, surrogates and values above U+10FFFF are NOT rejected.
+unsigned utf8_next(Str *s) {
+	if (s->n < 1) return 0;
+	// Indexed by the high nibble of the lead byte:
+	// 0 = ASCII, 1 = continuation byte, 2..4 = length of a multi-byte sequence.
+	static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
+	// Payload bits of the lead byte, indexed by sequence length.
+	// Five entries are required: index 4 serves 4-byte leads (11110xxx).
+	static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };
+	int len = niblen[(uint8_t)*s->s >> 4];
+	if (!len) { s->n--; return (uint8_t)*s->s++; }
+	// A continuation byte (10xxxxxx) cannot start a sequence.
+	if (len == 1) return UTF8_INVALID;
+	// The bit following the run of leading ones must be clear; in practice
+	// this only fires for 4-byte leads, rejecting 0xF8..0xFF.
+	if (s->n < len || (s->s[0] & (0x80 >> len))) return UTF8_INVALID;
+	unsigned cp = (unsigned)*s->s & cpmask[len];
+	for (int i = 1; i < len; i++) {
+		if ((s->s[i] & 0xc0) != 0x80) return UTF8_INVALID;
+		cp = (cp << 6) | (s->s[i] & 0x3f);
+	}
+	s->s += len, s->n -= len;
+	return cp;
+}
+
+// Decode the UTF-8 sequence at the front of *s and advance *s past it,
+// performing no validation.
+//
+// Returns 0 when s->n < 1 (nothing is consumed). Otherwise the length is
+// taken from the lead byte alone: trailing bytes are not checked for the
+// 10xxxxxx form and s->n is not checked against the sequence length, so the
+// caller must only pass input already known to be well-formed UTF-8.
+// A continuation byte in lead position is consumed as a single byte and its
+// low six bits are returned.
+unsigned utf8_next_unchecked(Str *s) {
+	if (s->n < 1) return 0;
+	// Indexed by the high nibble of the lead byte:
+	// 0 = ASCII, 1 = continuation byte, 2..4 = length of a multi-byte sequence.
+	static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
+	// Payload bits of the lead byte, indexed by sequence length.
+	// Five entries are required: index 4 serves 4-byte leads (11110xxx).
+	static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };
+	int len = niblen[(uint8_t)*s->s >> 4];
+	if (!len) { s->n--; return (uint8_t)*s->s++; }
+	unsigned cp = (unsigned)*s->s & cpmask[len];
+	for (int i = 1; i < len; i++) cp = (cp << 6) | (s->s[i] & 0x3f);
+	s->s += len, s->n -= len;
+	return cp;
+}
+
+// Number of bytes in the UTF-8 style encoding of cp, using the original
+// 1..6 byte scheme: each extra byte beyond the first two-byte form adds
+// five payload bits (7, 11, 16, 21, 26 bits, then everything larger).
+int utf8_len(unsigned cp) {
+	if (cp < 0x80u) return 1;
+	if (cp < 0x800u) return 2;
+	if (cp < 0x10000u) return 3;
+	if (cp < 0x200000u) return 4;
+	if (cp < 0x4000000u) return 5;
+	return 6;
+}
+
+// Write the n-byte encoding of cp into buf[0..n-1]. No terminator is added.
+//
+// n is the encoded length the caller wants (1..6); buf must have room for
+// n bytes. For n == 1 the low byte of cp is stored as-is. For n >= 2 the
+// lead byte carries the length prefix and each following byte carries six
+// payload bits under a 10xxxxxx marker. Any other n is rejected and buf is
+// left untouched.
+void utf8_to_buf(unsigned cp, char *buf, int n) {
+	// n <= 0 would send the fill loop through negative indices and
+	// n > 6 would index past the prefix table, so reject both up front.
+	if (n < 1 || n > 6) return;
+	if (n == 1) {
+		*buf = (char)cp;
+		return;
+	}
+	// Lead-byte prefixes for 2..6 byte sequences, indexed by n - 2.
+	static const uint8_t tbl[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
+	// Fill the continuation bytes from last to first, six bits at a time.
+	for (int i = n - 1; i > 0; i--) {
+		buf[i] = (char)(0x80 | (cp & 0x3f));
+		cp >>= 6;
+	}
+	// Whatever is left of cp is the lead byte's payload.
+	buf[0] = (char)(tbl[n - 2] | cp);
+}
+
+#endif
+#endif