summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--utf8.h122
1 files changed, 80 insertions, 42 deletions
diff --git a/utf8.h b/utf8.h
index 10a4f2b..20504b8 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1,63 +1,101 @@
+/* utf-8 encoding and decoding library */
+
#ifndef UTF8_H
#define UTF8_H
-#include "str.h"
+#include "wrmr.h"
-#define UTF8_INVALID (unsigned)-1
+/* value returned by utf8_decode_ckd when the input is rejected */
+#define UTF8_INVALID 0xFFFD /* replacement character */
-int utf8_len(unsigned cp);
-unsigned utf8_next(Str *s);
-void utf8_to_buf(unsigned cp, char *buf, int n);
+/* count the codepoints in the first ch_count bytes of src; lengths come from lead bytes only, nothing is validated */
+u32 utf8_decode_len(const char *src, u32 ch_count);
+/* number of UTF-8 bytes needed to encode the cp_count codepoints in src */
+u32 utf8_encode_len(const u32 *src, u32 cp_count);
+/* decode cp_count codepoints from src into dst without validating the input */
+void utf8_decode(u32 *dst, const char *src, u32 cp_count);
+/* encode cp_count codepoints from src into dst (utf8_encode_len bytes are written, no terminator is added) */
+void utf8_encode(char *dst, const u32 *src, u32 cp_count);
+/* checked decode of one codepoint from *src (*srcn bytes available); advances *src and *srcn on success, returns UTF8_INVALID on rejected input */
+u32 utf8_decode_ckd(const char **src, u32 *srcn);
+/* NOTE(review): defining UTF8_IMPL unconditionally makes the #ifdef below always true, so every file that includes this header also gets the function definitions - confirm this is intended */
+#define UTF8_IMPL
#ifdef UTF8_IMPL
#include <stdbit.h>
-#include <stdint.h>
-unsigned utf8_next(Str *s) {
- if (s->n < 1) return 0;
- static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
- static const uint8_t cpmask[4] = { 0x7f, 0x3f, 0x1f, 0xf };
- int len = niblen[(uint8_t)*s->s >> 4];
- if (!len) { s->n--; return *s->s++; }
- if (s->n < len || (s->s[0] & (0x80 >> len))) return UTF8_INVALID;
- unsigned cp = (unsigned)*s->s & cpmask[len];
- for (int i = 1; i < len; i++) {
- if ((s->s[i] & 0xc0) != 0x80) return UTF8_INVALID;
- cp = (cp << 6) | (s->s[i] & 0x3f);
+/*
+ * Encoded-length lookup packed into a single 64-bit constant.
+ * Entry k occupies bits 2k..2k+1 and stores (byte length - 1) for a codepoint
+ * whose bit width is k:
+ *   width 0..7 -> 1 byte, 8..11 -> 2 bytes, 12..16 -> 3 bytes, 17..21 -> 4 bytes
+ * which covers codepoints 0..10FFFF. UTF8_CP_SHIFT turns the bit width
+ * (32 minus the leading zero count; 0 for cp == 0) into the entry's bit offset.
+ * Values wider than 21 bits fall outside the table, and a value with bit 31
+ * set would shift the 64-bit constant by 64, so callers must range-check first.
+ */
+#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)
+#define UTF8_CP_SHIFT(cp) ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1)
+#define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))
+
+/* Total number of UTF-8 bytes needed to encode the cp_count codepoints in src. */
+u32 utf8_encode_len(const u32 *src, u32 cp_count) {
+	u32 total = 0;
+	/* walk from the last codepoint down to the first, summing per-codepoint lengths */
+	for (u32 i = cp_count; i > 0; i--) {
+		total += UTF8_CP_LEN(src[i - 1]);
+	}
+	return total;
+}
+
+/*
+ * Count the codepoints in the first ch_count bytes of src.
+ * The length of each sequence is taken from its first byte alone; the bytes
+ * that follow are skipped without being inspected, so nothing is validated.
+ */
+u32 utf8_decode_len(const char *src, u32 ch_count) {
+ u32 i = 0, len = 0;
+ while (i < ch_count) {
+ /* the leading one bits give the length of a multi-byte sequence; the second term adds 1 for an ASCII byte (top bit clear, no leading ones) */
+ i += stdc_leading_ones((u8)src[i]) + ((~src[i] & 0x80) >> 7);
+ len++;
}
- s->s += len, s->n -= len;
- return cp;
+ return len;
}
-unsigned utf8_next_unchecked(Str *s) {
- if (s->n < 1) return 0;
- static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
- static const uint8_t cpmask[4] = { 0x7f, 0x3f, 0x1f, 0xf };
- int len = niblen[(uint8_t)*s->s >> 4];
- if (!len) { s->n--; return *s->s++; }
- unsigned cp = (unsigned)*s->s & cpmask[len];
- for (int i = 1; i < len; i++) cp = (cp << 6) | (s->s[i] & 0x3f);
- s->s += len, s->n -= len;
- return cp;
+/*
+ * Encode cp_count codepoints from src into dst as UTF-8.
+ * Each codepoint must satisfy 0 < c < 0x110000; no terminator is written.
+ */
+void utf8_encode(char *dst, const u32 *src, u32 cp_count) {
+	for (; cp_count; cp_count--, src++) {
+		u32 c = *src;
+		ASSUME(c > 0 && c < 0x110000);
+		u32 len = UTF8_CP_LEN(c);
+		ASSUME(len > 0 && len < 5);
+		if (len <= 1) {
+			/* single byte: stored as is */
+			*dst++ = c;
+			continue;
+		}
+		/* fill the continuation bytes from the last one back to index 1 */
+		u32 k = len - 1;
+		while (k) {
+			dst[k--] = 0x80 | (c & 0x3f);
+			c >>= 6;
+		}
+		/* lead byte: length prefix plus the remaining high bits */
+		*dst = (0xf0 << (4 - len)) | c;
+		dst += len;
+	}
}
-int utf8_len(unsigned cp) {
- static const uint8_t tbl[33] = {
- 6,6,6,6,6,6, 5,5,5,5,5, 4,4,4,4,4,
- 3,3,3,3,3, 2,2,2,2, 1,1,1,1,1,1,1,1,
- };
- return tbl[stdc_leading_zeros(cp)];
+/*
+ * Decode cp_count codepoints from src into dst.
+ * The input is not validated: the lead byte alone decides how many
+ * following bytes are folded into the codepoint.
+ */
+void utf8_decode(u32 *dst, const char *src, u32 cp_count) {
+	for (u32 i = 0; i < cp_count; i++) {
+		u8 c = *src++;
+		u32 bits = stdc_leading_ones(c);
+		ASSUME(bits < 5);
+		/* strip the length prefix from the lead byte */
+		u32 cp = c & (0xff >> bits);
+		/* a lead byte with `bits` leading ones is followed by bits - 1 more bytes */
+		for (u32 k = 1; k < bits; k++) {
+			c = *src++;
+			cp = (cp << 6) | (c & 0x3F);
+		}
+		dst[i] = cp;
+	}
}
-void utf8_to_buf(unsigned cp, char *buf, int n) {
- if (n == 1) {
- *buf = cp;
- return;
+/*
+ * Checked decode of a single codepoint.
+ *
+ * src:  in/out pointer to the current read position
+ * srcn: in/out count of bytes available at *src
+ *
+ * Returns 0 when *srcn is 0. On success returns the codepoint and advances
+ * *src / *srcn past the sequence. Returns UTF8_INVALID and leaves *src and
+ * *srcn untouched when the sequence is rejected: a stray continuation byte,
+ * a lead byte announcing more than 4 bytes, a truncated sequence, a bad
+ * continuation byte, an overlong form, a UTF-16 surrogate, or a value above
+ * 0x10FFFF.
+ */
+u32 utf8_decode_ckd(const char **src, u32 *srcn) {
+	u32 n = *srcn;
+	if (!n) return 0;
+	const u8 *s = (const u8 *)*src;
+	u32 cp = *s++;
+	n -= 1;
+	u32 bits = stdc_leading_ones((u8)cp);
+	if (bits) {
+		/* bits == 1 is a continuation byte in lead position; bits >= 5 is never valid */
+		if (bits < 2 || bits >= 5) return UTF8_INVALID;
+		/* the lead byte is already consumed, so only bits - 1 more bytes are needed */
+		u32 cp_len = bits - 1;
+		if (n < cp_len) return UTF8_INVALID;
+		ASSUME(cp_len > 0 && cp_len <= 4);
+		cp &= 0xff >> bits;
+		for (u32 i = 0; i < cp_len; i++) {
+			u8 c = s[i];
+			if ((c & 0xC0) != 0x80) return UTF8_INVALID;
+			cp = (cp << 6) | (c & 0x3F);
+		}
+		if (cp > 0x10FFFF) return UTF8_INVALID;
+		/* surrogates are not valid scalar values in UTF-8 */
+		if (cp >= 0xD800 && cp <= 0xDFFF) return UTF8_INVALID;
+		/* UTF8_CP_LEN is the total length of the shortest form; a mismatch means overlong */
+		if (UTF8_CP_LEN(cp) != bits) return UTF8_INVALID;
+		s += cp_len;
+		n -= cp_len;
}
- static const uint8_t tbl[5] = { 0b11000000, 0b11100000, 0b11110000, 0b11111000, 0b11111100 };
- for (int i = n; --i;) buf[i] = 0x80 | (cp & 0x3f), cp >>= 6;
- buf[0] = tbl[n - 2] | cp;
+	*src = (const char *)s;
+	*srcn = n;
+	return cp;
}
#endif