diff options
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 64 |
1 files changed, 64 insertions, 0 deletions
#ifndef UTF8_H
#define UTF8_H

#include "str.h"

/*
 * Sentinel returned by utf8_next() for a malformed or truncated sequence.
 * Fully parenthesized so the cast cannot be split by surrounding operators
 * (e.g. `sizeof UTF8_INVALID`).
 */
#define UTF8_INVALID ((unsigned)-1)

/*
 * Number of bytes utf8_to_buf() needs to encode `cp`, in the range 1..6
 * (the original 31-bit UTF-8 scheme). Values with bit 31 set also yield 6,
 * although six bytes can only carry 31 payload bits.
 */
int utf8_len(unsigned cp);

/*
 * Decode one code point from the front of `s` and advance `s` past it.
 *
 * Returns 0 without touching `s` when no bytes remain, and UTF8_INVALID
 * without touching `s` when the sequence is truncated, starts with a
 * continuation byte or an 0xF8..0xFF byte, or contains a non-continuation
 * byte where one is required. Overlong encodings, surrogates and values
 * above U+10FFFF are NOT rejected.
 */
unsigned utf8_next(Str *s);

/*
 * Same as utf8_next() but performs no validation at all: the caller must
 * guarantee that `s` starts with a complete, well-formed sequence.
 * A continuation byte in lead position is consumed as a single byte and
 * its low six bits are returned.
 */
unsigned utf8_next_unchecked(Str *s);

/*
 * Encode `cp` into exactly `n` bytes at `buf` (no terminator is written).
 * `n` should be utf8_len(cp): a smaller `n` leaves high bits of `cp` to be
 * OR-ed into the lead byte. Calls with `n` outside 1..6 write nothing.
 */
void utf8_to_buf(unsigned cp, char *buf, int n);

#ifdef UTF8_IMPL

#include <stdbit.h>
#include <stdint.h>

/*
 * Sequence class indexed by the high nibble of the first byte:
 *   0 -> ASCII (one byte), 1 -> continuation byte (10xxxxxx),
 *   2/3/4 -> lead byte of a 2/3/4-byte sequence.
 */
static const uint8_t utf8_niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };

/*
 * Payload mask for the first byte, indexed by the class above.
 * Five entries are required: index 4 (0x07) serves 4-byte lead bytes.
 */
static const uint8_t utf8_cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };

unsigned utf8_next(Str *s) {
    if (s->n < 1) {
        return 0;
    }

    int len = utf8_niblen[(uint8_t)*s->s >> 4];
    if (!len) {
        /* Plain ASCII: consume a single byte. */
        s->n--;
        return (uint8_t)*s->s++;
    }

    /*
     * Reject: a continuation byte where a lead byte is expected (len == 1),
     * a sequence longer than the remaining input, and lead bytes whose
     * terminating zero bit is set (0xF8..0xFF for len == 4).
     */
    if (len == 1 || s->n < len || (s->s[0] & (0x80 >> len))) {
        return UTF8_INVALID;
    }

    unsigned cp = (uint8_t)*s->s & utf8_cpmask[len];
    for (int i = 1; i < len; i++) {
        /* Every trailing byte must look like 10xxxxxx. */
        if ((s->s[i] & 0xc0) != 0x80) {
            return UTF8_INVALID;
        }
        cp = (cp << 6) | (s->s[i] & 0x3f);
    }

    s->s += len;
    s->n -= len;
    return cp;
}

unsigned utf8_next_unchecked(Str *s) {
    if (s->n < 1) {
        return 0;
    }

    int len = utf8_niblen[(uint8_t)*s->s >> 4];
    if (!len) {
        /* Plain ASCII: consume a single byte. */
        s->n--;
        return (uint8_t)*s->s++;
    }

    unsigned cp = (uint8_t)*s->s & utf8_cpmask[len];
    for (int i = 1; i < len; i++) {
        cp = (cp << 6) | (s->s[i] & 0x3f);
    }

    s->s += len;
    s->n -= len;
    return cp;
}

int utf8_len(unsigned cp) {
    /*
     * Indexed by the number of leading zero bits in the 32-bit value
     * (0..32). Payload capacity per length: 7, 11, 16, 21, 26, 31 bits.
     */
    static const uint8_t tbl[33] = {
        6,6,6,6,6,6,        /* 32..27 significant bits */
        5,5,5,5,5,          /* 26..22 */
        4,4,4,4,4,          /* 21..17 */
        3,3,3,3,3,          /* 16..12 */
        2,2,2,2,            /* 11..8  */
        1,1,1,1,1,1,1,1,    /*  7..0  */
    };
    return tbl[stdc_leading_zeros(cp)];
}

void utf8_to_buf(unsigned cp, char *buf, int n) {
    /* Lead-byte markers for 2..6 byte sequences, indexed by n - 2. */
    static const uint8_t lead[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };

    /* Out-of-range lengths would index outside `lead` or `buf`. */
    if (n < 1 || n > 6) {
        return;
    }
    if (n == 1) {
        *buf = (char)cp;
        return;
    }

    /* Fill continuation bytes from the last one backwards, 6 bits each. */
    for (int i = n - 1; i > 0; i--) {
        buf[i] = (char)(0x80 | (cp & 0x3f));
        cp >>= 6;
    }
    buf[0] = (char)(lead[n - 2] | cp);
}

#endif /* UTF8_IMPL */
#endif /* UTF8_H */