/*
 * Minimal UTF-8 decode/encode helpers (single-header style).
 *
 * Include this header anywhere for the declarations; define UTF8_IMPL in
 * exactly one translation unit before including it to emit the definitions.
 *
 * The decoders read from a `Str` (declared in "str.h"); this code relies on
 * it having a byte pointer field `s` and a remaining-length field `n`.
 */
#ifndef UTF8_H
#define UTF8_H

#include "str.h"

/* Sentinel returned by utf8_next() for a malformed or truncated sequence.
 * Fully parenthesized so it is safe inside any expression. */
#define UTF8_INVALID ((unsigned)-1)

/* Number of bytes needed to encode `cp` (1..6, legacy 31-bit scheme). */
int utf8_len(unsigned cp);

/* Decode and consume one code point from `s`, validating the sequence. */
unsigned utf8_next(Str *s);

/* Decode and consume one code point from `s` without any validation. */
unsigned utf8_next_unchecked(Str *s);

/* Encode `cp` into exactly `n` bytes at `buf` (n as given by utf8_len). */
void utf8_to_buf(unsigned cp, char *buf, int n);

#ifdef UTF8_IMPL

#include <stdbit.h>
#include <stdint.h>

/*
 * Decode the next code point from `s` and advance past it.
 *
 * Returns:
 *   - 0 when `s` is empty (indistinguishable from a decoded NUL byte);
 *   - UTF8_INVALID when the lead byte is a stray continuation byte, the lead
 *     byte is not a valid 2..4 byte prefix, the sequence is truncated, or a
 *     trailing byte is not of the form 10xxxxxx. In that case `s` is left
 *     untouched, so the caller decides how to resynchronise;
 *   - the decoded code point otherwise.
 *
 * Overlong encodings and surrogate code points are not rejected.
 */
unsigned utf8_next(Str *s)
{
    if (s->n < 1)
        return 0;

    /* Sequence class by the high nibble of the lead byte:
     * 0 = ASCII, 1 = continuation byte, 2..4 = multi-byte lead. */
    static const uint8_t niblen[16] = {
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4
    };
    /* Payload mask of the lead byte, indexed by sequence length.
     * Needs five entries because `len` can be 4. */
    static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };

    int len = niblen[(uint8_t)*s->s >> 4];
    if (len == 0) {
        unsigned ascii = (uint8_t)*s->s;
        s->s++;
        s->n--;
        return ascii;
    }

    /* A continuation byte can never start a sequence. */
    if (len == 1)
        return UTF8_INVALID;

    /* The bit right after the run of leading ones must be clear
     * (this rejects 0xF8..0xFF), and the whole sequence must be present. */
    if (s->n < len || (s->s[0] & (0x80 >> len)))
        return UTF8_INVALID;

    unsigned cp = (unsigned)*s->s & cpmask[len];
    for (int i = 1; i < len; i++) {
        if ((s->s[i] & 0xc0) != 0x80)
            return UTF8_INVALID;
        cp = (cp << 6) | (s->s[i] & 0x3f);
    }

    s->s += len;
    s->n -= len;
    return cp;
}

/*
 * Decode the next code point from `s` and advance past it, trusting the
 * input completely: no check that the sequence fits in `s->n`, that the lead
 * byte is valid, or that trailing bytes are continuation bytes. Only use on
 * data already known to be well-formed.
 *
 * Returns 0 when `s` is empty.
 */
unsigned utf8_next_unchecked(Str *s)
{
    if (s->n < 1)
        return 0;

    static const uint8_t niblen[16] = {
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4
    };
    /* Five entries: `len` ranges over 0..4. */
    static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };

    int len = niblen[(uint8_t)*s->s >> 4];
    if (len == 0) {
        unsigned ascii = (uint8_t)*s->s;
        s->s++;
        s->n--;
        return ascii;
    }

    unsigned cp = (unsigned)*s->s & cpmask[len];
    for (int i = 1; i < len; i++)
        cp = (cp << 6) | (s->s[i] & 0x3f);

    s->s += len;
    s->n -= len;
    return cp;
}

/*
 * Return how many bytes are needed to encode `cp`, using the legacy
 * (up to 6-byte) UTF-8 layout: 1 for cp < 0x80, 2 below 0x800, 3 below
 * 0x10000, 4 below 0x200000, 5 below 0x4000000, and 6 for everything else.
 *
 * The lookup is keyed by the count of leading zero bits; the 33-entry table
 * assumes `unsigned` is 32 bits wide.
 */
int utf8_len(unsigned cp)
{
    static const uint8_t tbl[33] = {
        6, 6, 6, 6, 6, 6,       /* 27..32 significant bits */
        5, 5, 5, 5, 5,          /* 22..26 */
        4, 4, 4, 4, 4,          /* 17..21 */
        3, 3, 3, 3, 3,          /* 12..16 */
        2, 2, 2, 2,             /*  8..11 */
        1, 1, 1, 1, 1, 1, 1, 1, /*  0..7  */
    };
    return tbl[stdc_leading_zeros(cp)];
}

/*
 * Encode `cp` as an `n`-byte UTF-8 sequence into `buf`.
 *
 * `n` should be utf8_len(cp); exactly `n` bytes are written and no NUL
 * terminator is appended. Values of `n` outside 1..6 write nothing, since
 * they would otherwise index outside both `buf` and the lead-byte table.
 */
void utf8_to_buf(unsigned cp, char *buf, int n)
{
    if (n < 1 || n > 6)
        return;

    if (n == 1) {
        buf[0] = (char)cp;
        return;
    }

    /* Lead-byte prefixes for sequences of 2..6 bytes. */
    static const uint8_t lead[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };

    /* Fill continuation bytes back to front, 6 payload bits each. */
    for (int i = n - 1; i > 0; i--) {
        buf[i] = (char)(0x80 | (cp & 0x3f));
        cp >>= 6;
    }
    buf[0] = (char)(lead[n - 2] | cp);
}

#endif /* UTF8_IMPL */
#endif /* UTF8_H */