summary | refs | log | tree | commit | diff
path: root/utf8.h
diff options
context:
space:
mode:
author: WormHeamer, 2025-12-28 04:19:59 -0500
committer: WormHeamer, 2025-12-28 04:19:59 -0500
commit: 615601fb355709d611d18f878f77c993c312f6aa (patch)
tree: 5fb32e00c201944f9c77308da1ab9bee443bdf7c /utf8.h
parent: 9f4310c24ca39284ad768a82d368e749b18fd76c (diff)
lots of features & bugfixes both
Diffstat (limited to 'utf8.h')
-rw-r--r-- utf8.h 12
1 file changed, 6 insertions, 6 deletions
diff --git a/utf8.h b/utf8.h
index 01c3336..164279b 100644
--- a/utf8.h
+++ b/utf8.h
@@ -6,6 +6,9 @@
#include "wrmr.h"
#define UTF8_INVALID 0xFFFD /* replacement character */
+#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)
+#define UTF8_CP_SHIFT(cp) ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1)
+#define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))
u32 utf8_decode_len(const char *src, u32 ch_count);
u32 utf8_encode_len(const u32 *src, u32 cp_count);
@@ -18,9 +21,6 @@ u32 utf8_decode_at(const char *s, u32 i, u32 n);
#include <stdbit.h>
/* packed array of 2-bit lengths for codepoints 0..10FFFF */
-#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)
-#define UTF8_CP_SHIFT(cp) ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1)
-#define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))
u32 utf8_encode_len(const u32 *src, u32 cp_count) {
u32 len = 0;
@@ -72,14 +72,14 @@ void utf8_decode(u32 *dst, const char *src, u32 cp_count) {
u32 utf8_decode_at(const char *s, u32 i, u32 n) {
if (i >= n) return 0;
- u32 cp = (u8)s[i++];
+ u32 cp = (u8)s[i];
u32 b = stdc_leading_ones((u8)cp);
if (!b) return cp;
u32 end = i + b - 1;
if (end >= n) return 0;
cp &= 0xff >> b;
- while (i < end) {
- u8 c = s[i++];
+ while (++i <= end) {
+ u8 c = s[i];
cp = (cp << 6) | (c & 0x3f);
}
return cp;