summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h89
1 files changed, 89 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..01c3336
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,89 @@
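+/* UTF-8 encoding and decoding library.
+ * The declarations are always visible; the function definitions are compiled
+ * only when UTF8_IMPL is defined before this header is included. */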
+
+#ifndef UTF8_H
+#define UTF8_H
+
+#include "wrmr.h"
+
+#define UTF8_INVALID 0xFFFD /* replacement character */
+
+u32 utf8_decode_len(const char *src, u32 ch_count); /* number of code points found in the first ch_count bytes of src */
+u32 utf8_encode_len(const u32 *src, u32 cp_count); /* number of UTF-8 bytes needed to encode cp_count code points */
+void utf8_decode(u32 *dst, const char *src, u32 cp_count); /* decode cp_count code points from src into dst; src has no length argument and is not bounds-checked */
+void utf8_encode(char *dst, const u32 *src, u32 cp_count); /* encode cp_count code points from src into dst; no terminator is appended */
+u32 utf8_decode_at(const char *s, u32 i, u32 n); /* decode the code point starting at byte i of s; returns 0 if i >= n or the sequence fails the bounds check against n */
+
+#ifdef UTF8_IMPL
+
+#include <stdbit.h>
+
+/*
+ * UTF-8 byte length of a code point, looked up by its bit width.
+ *
+ * UTF8_CP_LEN_BITS packs one 2-bit field per bit width w (at bits 2w..2w+1),
+ * each holding "encoded length - 1": widths 0-7 -> 1 byte, 8-11 -> 2 bytes,
+ * 12-16 -> 3 bytes, 17-21 -> 4 bytes.  The fields for wider values are zero,
+ * so anything above 0x1FFFFF reports a length of 1.
+ */
+#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)
+/* The shift is twice the bit width.  A width of 32 would give a shift of 64,
+ * which is undefined on a 64-bit operand, so the shift is wrapped with & 63;
+ * that selects field 0, which is zero like every other out-of-range field. */
+#define UTF8_CP_SHIFT(cp) (((32 - stdc_leading_zeros((uint32_t)(cp))) << 1) & 63)
+/* Evaluates cp exactly once, so arguments with side effects are safe. */
+#define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))
+
+/* Total number of UTF-8 bytes needed to encode the cp_count code points in src. */
+u32 utf8_encode_len(const u32 *src, u32 cp_count) {
+    u32 total = 0;
+    /* walk the array from the back, one code point per step */
+    for (u32 k = cp_count; k > 0; k--) {
+        total += UTF8_CP_LEN(src[k - 1]);
+    }
+    return total;
+}
+
+/*
+ * Count the code points in the first ch_count bytes of src by hopping from
+ * one lead byte to the next.  The bytes themselves are not validated.
+ */
+u32 utf8_decode_len(const char *src, u32 ch_count) {
+    u32 count = 0;
+    for (u32 pos = 0; pos < ch_count; count++) {
+        u32 ones = stdc_leading_ones((u8)src[pos]);
+        /* a byte with its top bit clear stands alone; otherwise the number
+         * of leading one bits is the distance to the next lead byte */
+        pos += ones ? ones : 1;
+    }
+    return count;
+}
+
+/*
+ * Encode cp_count code points from src as UTF-8 into dst.
+ * dst must have room for utf8_encode_len(src, cp_count) bytes; nothing is
+ * appended after the last encoded byte.
+ */
+void utf8_encode(char *dst, const u32 *src, u32 cp_count) {
+    for (; cp_count; cp_count--, src++) {
+        u32 cp = *src;
+        ASSUME(cp > 0 && cp < 0x110000);
+        u32 nbytes = UTF8_CP_LEN(cp);
+        ASSUME(nbytes > 0 && nbytes < 5);
+        if (nbytes <= 1) {
+            /* single byte: stored as-is */
+            *dst++ = cp;
+            continue;
+        }
+        /* fill the continuation bytes back to front, six bits at a time */
+        for (u32 k = nbytes - 1; k > 0; k--) {
+            dst[k] = 0x80 | (cp & 0x3f);
+            cp >>= 6;
+        }
+        /* lead byte: length marker in the high bits plus the bits that remain */
+        dst[0] = (0xf0 << (4 - nbytes)) | cp;
+        dst += nbytes;
+    }
+}
+
+/*
+ * Decode cp_count code points from the UTF-8 bytes at src into dst.
+ * src carries no length, so the caller must guarantee it holds cp_count
+ * complete sequences; continuation bytes are not validated.
+ */
+void utf8_decode(u32 *dst, const char *src, u32 cp_count) {
+    for (u32 k = 0; k < cp_count; k++) {
+        u8 lead = *src++;
+        u32 ones = stdc_leading_ones(lead);
+        ASSUME(ones < 5);
+        /* strip the length marker from the lead byte */
+        u32 cp = lead & (0xff >> ones);
+        /* the marker counts the lead byte too, so ones - 1 bytes follow */
+        for (u32 left = ones; left > 1; left--) {
+            u8 cont = *src++;
+            cp = (cp << 6) | (cont & 0x3F);
+        }
+        dst[k] = cp;
+    }
+}
+
+/*
+ * Decode the single code point whose lead byte is s[i], where n is the
+ * number of readable bytes in s.
+ *
+ * Returns the byte itself when its top bit is clear, and 0 when i is out of
+ * range or the sequence would run past the end of the buffer.  A sequence
+ * whose last byte is s[n - 1] is accepted.  Continuation bytes are not
+ * validated: only their low six bits are used.
+ */
+u32 utf8_decode_at(const char *s, u32 i, u32 n) {
+    if (i >= n) return 0;
+    u32 cp = (u8)s[i++];
+    u32 b = stdc_leading_ones((u8)cp);
+    if (!b) return cp;
+    /* b counts the lead byte too, so b - 1 continuation bytes follow.
+     * Compare against the bytes that remain instead of forming i + b - 1,
+     * which could wrap when i is close to the u32 maximum. */
+    u32 tail = b - 1;
+    if (tail > n - i) return 0;
+    cp &= 0xff >> b;
+    while (tail--) {
+        u8 c = s[i++];
+        cp = (cp << 6) | (c & 0x3f);
+    }
+    return cp;
+}
+
+#endif
+#endif