1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
#ifndef UTF8_H
#define UTF8_H
#include "str.h"
/* Sentinel returned by utf8_next() for input it refuses to decode.
 * Fully parenthesized so it expands safely inside any expression. */
#define UTF8_INVALID ((unsigned)-1)

/* Number of bytes (1..6) the encoder needs for code point cp. */
int utf8_len(unsigned cp);

/* Decode one sequence from the front of *s and advance past it.
 * Returns 0 on an empty string and UTF8_INVALID on malformed input. */
unsigned utf8_next(Str *s);

/* Same as utf8_next() but performs no validation or bounds checking;
 * only call it on input already known to be well-formed. */
unsigned utf8_next_unchecked(Str *s);

/* Write cp into buf as an n-byte sequence; n is normally utf8_len(cp).
 * No terminator is written. */
void utf8_to_buf(unsigned cp, char *buf, int n);
#ifdef UTF8_IMPL
#include <stdbit.h>
#include <stdint.h>
/*
 * Decode one UTF-8 sequence from the front of *s and advance past it.
 *
 * s->s is read as the current byte pointer and s->n as the number of bytes
 * remaining; both are updated only when a sequence is successfully decoded.
 *
 * Returns the decoded code point, or:
 *   - 0 when the string is empty (s->n < 1);
 *   - UTF8_INVALID when the input is malformed: a continuation byte
 *     (0x80..0xBF) in lead position, a lead byte of 0xF8 or above, a
 *     sequence cut short by the end of the string, or a trailing byte
 *     that is not of the form 10xxxxxx.
 *
 * Overlong encodings, surrogates and values above U+10FFFF are NOT rejected.
 */
unsigned utf8_next(Str *s) {
    if (s->n < 1) return 0;

    /* Sequence length keyed by the high nibble of the lead byte.
     * 0 marks plain ASCII; 1 marks a continuation byte. */
    static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
    /* Payload mask of the lead byte, indexed by sequence length (0..4).
     * The table needs five entries: len == 4 selects the 0x07 mask. */
    static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };

    int len = niblen[(uint8_t)*s->s >> 4];
    if (!len) { s->n--; return *s->s++; }

    /* A continuation byte cannot start a sequence. */
    if (len == 1) return UTF8_INVALID;

    /* Truncated input, or the bit right after the length prefix is set
     * (only possible for 0xF8..0xFF, which are never valid lead bytes). */
    if (s->n < len || (s->s[0] & (0x80 >> len))) return UTF8_INVALID;

    unsigned cp = (unsigned)*s->s & cpmask[len];
    for (int i = 1; i < len; i++) {
        if ((s->s[i] & 0xc0) != 0x80) return UTF8_INVALID;
        cp = (cp << 6) | (s->s[i] & 0x3f);
    }

    s->s += len, s->n -= len;
    return cp;
}
/*
 * Decode one UTF-8 sequence from the front of *s without validating it,
 * then advance s->s and shrink s->n by the number of bytes consumed.
 *
 * Returns 0 and leaves *s untouched when the string is empty (s->n < 1).
 * No other checks are made: trailing bytes are not inspected and the
 * sequence length is not compared against s->n, so the caller must supply
 * well-formed input that contains the whole sequence. A continuation byte
 * in lead position is consumed as a single byte yielding its low six bits.
 */
unsigned utf8_next_unchecked(Str *s) {
    if (s->n < 1) return 0;

    /* Sequence length keyed by the high nibble of the lead byte; 0 = ASCII. */
    static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
    /* Payload mask of the lead byte, indexed by sequence length (0..4).
     * The table needs five entries: len == 4 selects the 0x07 mask. */
    static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };

    int len = niblen[(uint8_t)*s->s >> 4];
    if (!len) { s->n--; return *s->s++; }

    unsigned cp = (unsigned)*s->s & cpmask[len];
    for (int i = 1; i < len; i++) {
        cp = (cp << 6) | (s->s[i] & 0x3f);
    }

    s->s += len, s->n -= len;
    return cp;
}
/*
 * Number of bytes needed to encode cp, judged by how many significant bits
 * it has: up to 7 bits -> 1, 11 -> 2, 16 -> 3, 21 -> 4, 26 -> 5, more -> 6.
 *
 * The thresholds are expressed as leading-zero counts and assume that
 * unsigned is 32 bits wide.
 */
int utf8_len(unsigned cp) {
    unsigned lz = stdc_leading_zeros(cp);

    if (lz >= 25) return 1;
    if (lz >= 21) return 2;
    if (lz >= 16) return 3;
    if (lz >= 11) return 4;
    if (lz >= 6) return 5;
    return 6;
}
/*
 * Encode cp into buf as an n-byte UTF-8 style sequence.
 *
 * cp   code point to encode
 * buf  destination; must have room for n bytes (no terminator is written)
 * n    sequence length to emit, 1..6
 *
 * The function does not check that cp fits in n bytes. Values of n outside
 * 1..6 are rejected and leave buf untouched: n <= 0 would otherwise index
 * before buf, and n > 6 would read past the lead-byte table.
 */
void utf8_to_buf(unsigned cp, char *buf, int n) {
    if (n < 1 || n > 6) return;

    if (n == 1) {
        *buf = cp;
        return;
    }

    /* Lead-byte length prefixes for 2..6 byte sequences, indexed by n - 2. */
    static const uint8_t lead[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };

    /* Fill trailing bytes from last to first, six payload bits each. */
    for (int i = n - 1; i > 0; i--) {
        buf[i] = 0x80 | (cp & 0x3f);
        cp >>= 6;
    }

    /* Whatever bits remain go into the lead byte under its prefix. */
    buf[0] = lead[n - 2] | cp;
}
#endif
#endif
|