1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
/* utf-8 encoding and decoding library */
#ifndef UTF8_H
#define UTF8_H
#include "wrmr.h"
/* Value returned by utf8_decode_ckd() when it rejects its input
 * (U+FFFD, the Unicode replacement character). */
#define UTF8_INVALID 0xFFFD /* replacement character */
/* Count the code points in the first ch_count bytes of src by stepping
 * from lead byte to lead byte. Performs no validation of the input. */
u32 utf8_decode_len(const char *src, u32 ch_count);
/* Return the number of bytes needed to encode the cp_count code points
 * in src as UTF-8. */
u32 utf8_encode_len(const u32 *src, u32 cp_count);
/* Decode exactly cp_count code points from src into dst. Unchecked: the
 * number of bytes read per code point is taken from each lead byte, and
 * neither the source length nor the continuation bytes are verified. */
void utf8_decode(u32 *dst, const char *src, u32 cp_count);
/* Encode cp_count code points from src into dst as UTF-8. No terminator
 * is written. Each code point is assumed (via ASSUME) to satisfy
 * 0 < c < 0x110000; dst must be large enough (see utf8_encode_len). */
void utf8_encode(char *dst, const u32 *src, u32 cp_count);
/* Checked decode of a single code point from the front of *src, where
 * *srcn is the number of bytes available. On success *src and *srcn are
 * advanced past the consumed bytes and the code point is returned. When
 * the input is rejected, UTF8_INVALID is returned and *src / *srcn are
 * left unchanged. Returns 0 without touching anything when *srcn is 0. */
u32 utf8_decode_ckd(const char **src, u32 *srcn);
/* NOTE(review): UTF8_IMPL is defined unconditionally immediately before it
 * is tested, so the implementation section below is compiled by every
 * translation unit that includes this header. The functions are defined
 * without static/inline -- presumably a leftover from development; confirm
 * whether the including file is meant to define UTF8_IMPL instead. */
#define UTF8_IMPL
#ifdef UTF8_IMPL
#include <stdbit.h>
/*
 * Packed lookup table of encoded lengths, indexed by the bit width of the
 * code point (number of significant bits, 0..21). Each entry is 2 bits wide
 * and stores (length - 1):
 *   widths  0..7  -> 0 (1 byte)
 *   widths  8..11 -> 1 (2 bytes)
 *   widths 12..16 -> 2 (3 bytes)
 *   widths 17..21 -> 3 (4 bytes)
 * Entries for widths 22..31 are zero, so such values report a length of 1.
 */
#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)
/* Bit offset of the table entry for cp: twice its bit width.
 * For a value with bit 31 set this is 64, which is not a valid shift count
 * for the 64-bit table, so callers must not pass such values. */
#define UTF8_CP_SHIFT(cp) ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1)
/* Number of UTF-8 bytes (1..4) used to encode code point cp. */
#define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))
/*
 * Sum the UTF-8 encoded sizes of the cp_count code points in src.
 * Returns the total number of bytes utf8_encode() would write for them.
 */
u32 utf8_encode_len(const u32 *src, u32 cp_count) {
    u32 total = 0;
    for (u32 k = 0; k < cp_count; k++) {
        total += UTF8_CP_LEN(src[k]);
    }
    return total;
}
/*
 * Count the code points stored in the first ch_count bytes of src.
 * The walk trusts each lead byte: it skips as many bytes as the byte has
 * leading one bits, or a single byte when the top bit is clear. Nothing is
 * validated.
 */
u32 utf8_decode_len(const char *src, u32 ch_count) {
    u32 count = 0;
    for (u32 pos = 0; pos < ch_count; count++) {
        u8 lead = (u8)src[pos];
        /* top bit clear: one byte; otherwise the run of leading ones
         * is the number of bytes to skip */
        u32 step = (lead & 0x80) ? stdc_leading_ones(lead) : 1;
        pos += step;
    }
    return count;
}
/*
 * Write the UTF-8 encoding of cp_count code points from src to dst.
 * Every code point is assumed to lie in (0, 0x110000). The caller provides
 * a dst buffer of sufficient size; no terminator is appended.
 */
void utf8_encode(char *dst, const u32 *src, u32 cp_count) {
    for (u32 k = 0; k < cp_count; k++) {
        u32 cp = src[k];
        ASSUME(cp > 0 && cp < 0x110000);
        u32 nbytes = UTF8_CP_LEN(cp);
        ASSUME(nbytes > 0 && nbytes < 5);
        if (nbytes == 1) {
            /* single-byte code point is stored as-is */
            *dst++ = cp;
            continue;
        }
        /* emit continuation bytes last-to-first, 6 payload bits each */
        for (u32 pos = nbytes - 1; pos > 0; pos--) {
            dst[pos] = 0x80 | (cp & 0x3f);
            cp >>= 6;
        }
        /* lead byte: length marker in the high bits, leftover payload below */
        dst[0] = (0xf0 << (4 - nbytes)) | cp;
        dst += nbytes;
    }
}
/*
 * Decode cp_count code points from the UTF-8 bytes at src into dst.
 * Unchecked: the lead byte alone decides how many following bytes are
 * consumed, and continuation bytes are not verified.
 */
void utf8_decode(u32 *dst, const char *src, u32 cp_count) {
    const char *in = src;
    for (u32 k = 0; k < cp_count; k++) {
        u8 lead = *in++;
        u32 ones = stdc_leading_ones(lead);
        ASSUME(ones < 5);
        /* strip the length marker from the lead byte */
        u32 value = lead & (0xff >> ones);
        /* a lead byte with `ones` leading one bits is followed by ones - 1 bytes */
        for (u32 tail = 1; tail < ones; tail++) {
            u8 cont = *in++;
            value = (value << 6) | (cont & 0x3F);
        }
        dst[k] = value;
    }
}
/*
 * Checked decode of one code point from the front of *src.
 *
 * src   in/out: pointer to the next byte to decode; advanced past the
 *       consumed sequence on success.
 * srcn  in/out: number of bytes available at *src; reduced by the number of
 *       bytes consumed on success.
 *
 * Returns the decoded code point. Returns 0 without touching *src / *srcn
 * when *srcn is 0. Returns UTF8_INVALID, leaving *src and *srcn unchanged,
 * when the input is rejected:
 *   - the first byte is a continuation byte or has five or more leading ones,
 *   - fewer continuation bytes remain than the lead byte requires,
 *   - a following byte is not of the form 10xxxxxx,
 *   - the value exceeds 0x10FFFF,
 *   - the sequence is longer than the shortest encoding of the value.
 * Surrogate code points (0xD800..0xDFFF) are not rejected here.
 */
u32 utf8_decode_ckd(const char **src, u32 *srcn) {
    u32 n = *srcn;
    if (!n) return 0;
    const u8 *s = (const u8 *)*src;
    u32 cp = *s++;
    n -= 1; /* n now counts the bytes available after the lead byte */
    u32 bits = stdc_leading_ones((u8)cp);
    if (bits) {
        /* bits == 1 is a stray continuation byte; bits >= 5 is never a
         * valid lead byte */
        if (bits < 2 || bits > 4) return UTF8_INVALID;
        /* a lead byte with `bits` leading ones is followed by bits - 1
         * continuation bytes; n already excludes the lead byte */
        u32 cp_len = bits - 1;
        if (n < cp_len) return UTF8_INVALID;
        ASSUME(cp_len > 0 && cp_len < 4);
        cp &= 0xff >> bits;
        for (u32 i = 0; i < cp_len; i++) {
            u8 c = s[i];
            if ((c & 0xC0) != 0x80) return UTF8_INVALID;
            cp = (cp << 6) | (c & 0x3F);
        }
        if (cp > 0x10FFFF) return UTF8_INVALID;
        /* reject overlong forms: the total sequence length (lead byte
         * included) must equal the shortest encoding of cp */
        if (UTF8_CP_LEN(cp) != bits) return UTF8_INVALID;
        s += cp_len;
        n -= cp_len;
    }
    *src = (const char *)s;
    *srcn = n;
    return cp;
}
#endif
#endif
|