diff options
| author | Tyge Løvset <[email protected]> | 2022-05-31 15:04:52 +0200 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-05-31 15:04:52 +0200 |
| commit | eb9821bec4a292458499042392924595b3338085 (patch) | |
| tree | 90b6286619deefc2eaf72fce4fc67d5959c84557 /src/utf8code.c | |
| parent | 0a92ec2235b5f42e93012be14938bb11e3f3650a (diff) | |
| download | STC-modified-eb9821bec4a292458499042392924595b3338085.tar.gz STC-modified-eb9821bec4a292458499042392924595b3338085.zip | |
1) REMOVED files/modules that are not relevant, to make the library more focused:
- threads.h/threads.c (external lib)
- coptions.h - will be kept as a gist.
- more will follow (examples, some benchmarks, etc.).
2) Replaced the UTF-8 decoder with Björn Höhrmann's DFA decoder.
Diffstat (limited to 'src/utf8code.c')
| -rw-r--r-- | src/utf8code.c | 88 |
1 files changed, 35 insertions, 53 deletions
diff --git a/src/utf8code.c b/src/utf8code.c index 543070c3..0f74ffb6 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -3,28 +3,21 @@ #include <stc/cstr.h> #include "utf8tabs.inc" -// https://news.ycombinator.com/item?id=15423674 -// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc - -void utf8_decode(utf8_decode_t *d, const uint8_t b) -{ - switch (d->state) { - case UTF8_OK: - if (b < 0x80) d->codep = b, d->size = 1; - else if (b < 0xC2) d->state = UTF8_ERROR, d->size = 0; - else if (b < 0xE0) d->state = 1, d->codep = b & 0x1F, d->size = 2; - else if (b < 0xF0) d->state = 2, d->codep = b & 0x0F, d->size = 3; - else if (b < 0xF5) d->state = 3, d->codep = b & 0x07, d->size = 4; - else d->state = UTF8_ERROR, d->size = 0; - break; - case 1: case 2: case 3: - if ((b & 0xC0) == 0x80) { - d->state -= 1; - d->codep = (d->codep << 6) | (b & 0x3F); - } else - d->state = UTF8_ERROR, d->size = 0; - } -} +const uint8_t utf8_dtab[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; unsigned utf8_encode(char *out, uint32_t c) { @@ -52,27 +45,18 @@ unsigned utf8_encode(char *out, uint32_t c) return 0; } -void utf8_peek(utf8_decode_t* d, 
const char *s) { - utf8_decode(d, (uint8_t)*s++); - switch (d->size) { - case 4: utf8_decode(d, (uint8_t)*s++); - case 3: utf8_decode(d, (uint8_t)*s++); - case 2: utf8_decode(d, (uint8_t)*s++); - } -} - bool utf8_valid(const char* s) { - utf8_decode_t d = {UTF8_OK}; + utf8_decode_t d = {.state=0}; while (*s) utf8_decode(&d, (uint8_t)*s++); - return d.state == UTF8_OK; + return d.state == 0; } bool utf8_valid_n(const char* s, size_t n) { - utf8_decode_t d = {UTF8_OK}; + utf8_decode_t d = {.state=0}; while ((n-- != 0) & (*s != 0)) utf8_decode(&d, (uint8_t)*s++); - return d.state == UTF8_OK; + return d.state == 0; } uint32_t utf8_tolower(uint32_t c) { @@ -101,10 +85,10 @@ uint32_t utf8_toupper(uint32_t c) { } /* int utf8_icmp(const char* s1, const char* s2) { - utf8_decode_t d1 = {UTF8_OK}, d2 = {UTF8_OK}; - for (;; s1 += d1.size, s2 += d2.size) { - utf8_peek(&d1, s1); - utf8_peek(&d2, s2); + utf8_decode_t d1 = {.state=0}, d2 = {.state=0}; + for (;;) { + do { utf8_decode(&d1, (uint8_t)s1[j1++]); } while (d1.state); + do { utf8_decode(&d2, (uint8_t)s2[j2++]); } while (d2.state); int c = utf8_tolower(d1.codep) - utf8_tolower(d2.codep); if (c || !*s2) return c; @@ -113,14 +97,14 @@ int utf8_icmp(const char* s1, const char* s2) { */ int utf8_icmp_n(size_t u8max, const char* s1, const size_t n1, const char* s2, const size_t n2) { - utf8_decode_t d1 = {UTF8_OK}, d2 = {UTF8_OK}; + utf8_decode_t d1 = {.state=0}, d2 = {.state=0}; size_t j1 = 0, j2 = 0; - for (; u8max-- && ((j1 < n1) & (j2 < n2)); j1 += d1.size, j2 += d2.size) { - utf8_peek(&d1, s1 + j1); - utf8_peek(&d2, s2 + j2); + while (u8max-- && ((j1 < n1) & (j2 < n2))) { + do { utf8_decode(&d1, (uint8_t)s1[j1++]); } while (d1.state); + do { utf8_decode(&d2, (uint8_t)s2[j2++]); } while (d2.state); int c = utf8_tolower(d1.codep) - utf8_tolower(d2.codep); if (c || !s2[j2]) - return c; + return c; } return (j2 < n2) - (j1 < n1); } @@ -178,15 +162,13 @@ static cstr cstr_casefold(const cstr* self, struct fnfold fold) { cstr 
out = cstr_null; char *buf = cstr_reserve(&out, sv.size*3/2); uint32_t cp; size_t sz = 0; - utf8_decode_t d = {UTF8_OK}; - - for (; *sv.str; sv.str += d.size) { - utf8_peek(&d, sv.str); - switch (d.size) { - case 1: - buf[sz++] = (char)fold.conv_asc(*sv.str); - break; - default: + utf8_decode_t d = {.state=0}; + + while (*sv.str) { + do { utf8_decode(&d, (uint8_t)*sv.str++); } while (d.state); + if (d.codep < 128) + buf[sz++] = (char)fold.conv_asc(d.codep); + else { cp = fold.conv_u8(d.codep); sz += utf8_encode(buf + sz, cp); } |
