diff options
| author | Tyge Lovset <[email protected]> | 2022-05-30 10:17:07 +0200 |
|---|---|---|
| committer | Tyge Lovset <[email protected]> | 2022-05-30 10:17:07 +0200 |
| commit | b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3 (patch) | |
| tree | 8c97999b1ede5e0cf45c94b2035e94b0734dff1c /src | |
| parent | 831dc0843aeedcb45138a6ed576ea03f2dcd58f8 (diff) | |
| download | STC-modified-b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3.tar.gz STC-modified-b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3.zip | |
Large refactoring on strings / utf8 and some file structure.
Diffstat (limited to 'src')
| -rw-r--r-- | src/cregex.c | 3 | ||||
| -rw-r--r-- | src/utf8tabs.c (renamed from src/casefold.c) | 154 | ||||
| -rw-r--r-- | src/utf8tabs.h | 10 | ||||
| -rw-r--r-- | src/utf8tabs.py (renamed from src/casefold.py) | 101 | ||||
| -rw-r--r-- | src/utf8utils.c | 190 |
5 files changed, 209 insertions, 249 deletions
diff --git a/src/cregex.c b/src/cregex.c index 0f585f5d..34c78090 100644 --- a/src/cregex.c +++ b/src/cregex.c @@ -32,7 +32,6 @@ THE SOFTWARE. #include <string.h>
#include <ctype.h>
#include <stc/cregex.h>
-#include "cregex_utf8.c"
typedef uint32_t Rune; /* Utf8 code point */
typedef int32_t Token;
@@ -594,7 +593,7 @@ nextc(Parser *par, Rune *rp) return 2;
case 'p': case 'P': { /* https://www.regular-expressions.info/unicode.html */
static struct { const char* c; int n, r; } cls[] = {
- {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC},
+ {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC},
{"{Alnum}", 7, U8_Xan},
{"{Digit}", 7, U8_Nd}, {"{Nd}", 4, U8_Nd},
{"{Lower}", 7, U8_Ll}, {"{Ll}", 4, U8_Ll},
diff --git a/src/casefold.c b/src/utf8tabs.c index 1b0a9463..8168f78f 100644 --- a/src/casefold.c +++ b/src/utf8tabs.c @@ -1,8 +1,6 @@ -#include <ctype.h> -#define i_header -#include <stc/cstr.h> +#include "utf8tabs.h" -static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = { +struct CaseFold casefold[] = { {65, 90, 122}, {181, 181, 956}, {192, 214, 246}, {216, 222, 254}, {256, 302, 303}, {306, 310, 311}, {313, 327, 328}, {330, 374, 375}, {376, 376, 255}, {377, 381, 382}, {383, 383, 115}, {385, 385, 595}, {386, 388, 389}, {390, 390, 596}, @@ -47,7 +45,7 @@ static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = { {42948, 42948, 42900}, {42949, 42949, 642}, {42950, 42950, 7566}, {42951, 42953, 42954}, {42960, 42962, 42963}, {42968, 42970, 42971}, {43888, 43913, 5049}, {65313, 65338, 65370}, }; // 188 -static uint8_t cfold_low[] = { +uint8_t cfold_low[] = { 0, 138, 10, 111, 2, 139, 3, 8, 4, 5, 6, 7, 9, 59, 12, 14, 16, 20, 49, 25, 56, 52, 29, 31, 33, 35, 37, 39, 50, 40, 41, 42, 43, 44, 45, 17, 46, 47, 48, 51, 53, 55, 155, 58, 62, 152, 150, 153, 11, 13, 15, 18, 19, 171, 21, 172, 22, 167, 170, 24, @@ -59,149 +57,3 @@ static uint8_t cfold_low[] = { 144, 145, 54, 57, 149, 154, 156, 157, 158, 96, 97, 159, 106, 160, 161, 162, 163, 165, 166, 168, 180, 169, 179, 183, 184, 185, 178, 187, }; - -uint32_t utf8_tolower(uint32_t c) { - for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) { - if (c <= casefold[i].c1) { - if (c < casefold[i].c0) return c; - int d = casefold[i].m1 - casefold[i].c1; - if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1)); - return c + d; - } - } - return c; -} - -uint32_t utf8_toupper(uint32_t c) { - for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) { - struct CaseFold cfold = casefold[cfold_low[i]]; - if (c <= cfold.m1) { - int d = cfold.m1 - cfold.c1; - if (c < (uint32_t)(cfold.c0 + d)) return c; - if (d == 1) return c - ((cfold.m1 & 1) == (c & 1)); - return c - d; - } - } - return c; -} - -bool utf8_isupper(uint32_t c) { - return utf8_tolower(c) != c; -} - -bool utf8_islower(uint32_t c) { - return utf8_toupper(c) != c; -} - -bool utf8_isspace(uint32_t c) { - static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0, - 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000}; - for (size_t i=0; i<sizeof t/sizeof *t; ++i) - if (c == t[i]) return true; - return (c >= 0x2000) & (c <= 0x200A); -} - -bool utf8_isdigit(uint32_t c) { - return ((c >= '0') & (c <= '9')) || - ((c >= 0xFF10) & (c <= 0xFF19)); -} - -bool utf8_isxdigit(uint32_t c) { - static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, - 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46}; - for (size_t i=1; i<sizeof t/sizeof *t; i += 2) - if (c <= t[i]) return c >= t[i - 1]; - return false; -} - -bool utf8_isalnum(uint32_t c) { - if (c < 128) return isalnum(c) != 0; - if ((c >= 0xFF10) & (c <= 0xFF19)) return true; - return utf8_islower(c) || utf8_isupper(c); -} - -bool utf8_isalpha(uint32_t c) { - if (c < 128) return isalpha(c) != 0; - return utf8_islower(c) || utf8_isupper(c); -} - -static struct fnfold { - int (*conv_asc)(int); - uint32_t (*conv_u8)(uint32_t); -} -fn_tolower = {tolower, utf8_tolower}, -fn_toupper = {toupper, utf8_toupper}; - - -static cstr cstr_casefold(const cstr* self, struct fnfold fold) { - csview sv = cstr_sv(self); - cstr out = cstr_null; - char *buf = cstr_reserve(&out, sv.size*3/2); - uint32_t cp; size_t sz = 0; - utf8_decode_t d = {UTF8_OK}; - - for (; *sv.str; sv.str += d.size) { - utf8_peek(sv.str, &d); - switch (d.size) { - case 1: - buf[sz++] = (char)fold.conv_asc(*sv.str); - break; - default: - cp = fold.conv_u8(d.codep); - sz += utf8_encode(buf + sz, cp); - } - } - _cstr_set_size(&out, sz); - cstr_shrink_to_fit(&out); - return out; -} - -cstr cstr_tolower(const cstr* self) { - return cstr_casefold(self, fn_tolower); -} - -cstr cstr_toupper(const cstr* self) { - return cstr_casefold(self, fn_toupper); -} - -void cstr_lowercase(cstr* self) { - cstr_take(self, cstr_casefold(self, fn_tolower)); -} - -void cstr_uppercase(cstr* self) { - cstr_take(self, cstr_casefold(self, fn_toupper)); -} - -#ifdef TEST -int main() -{ - for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) - { - char x[3][5]={0}; - unsigned s0, s1, s2; - uint32_t a = casefold[i].c0; - uint32_t b = utf8_tolower(a); - uint32_t c = utf8_toupper(b); - - s0 = utf8_encode(x[0], a); - s1 = utf8_encode(x[1], b); - s2 = utf8_encode(x[2], c); - printf("%s %s %s - %u %u %u (%u %u %u)\n", x[0], x[1], x[2], a, b, c, s0, s1, s2); - } - c_auto (cstr, t1) - { - t1 = cstr_new("Die preußischen Köstlichkeiten."); - - cstr_buf b = cstr_buffer(&t1); - printf("%s, %llu %llu\n", b.data, b.size, b.cap); - cstr_lowercase(&t1); - b = cstr_buffer(&t1); - printf("%s, %llu %llu\n", b.data, b.size, b.cap); - - cstr_uppercase(&t1); - b = cstr_buffer(&t1); - printf("%s, %llu %llu\n", b.data, b.size, b.cap); - } -} -#endif - diff --git a/src/utf8tabs.h b/src/utf8tabs.h new file mode 100644 index 00000000..95251f75 --- /dev/null +++ b/src/utf8tabs.h @@ -0,0 +1,10 @@ +#ifndef utf8tabs_included +#define utf8tabs_included + +#include <stdint.h> +struct CaseFold { uint16_t c0, c1, m1; } ; + +extern struct CaseFold casefold[188]; +extern uint8_t cfold_low[188]; + +#endif diff --git a/src/casefold.py b/src/utf8tabs.py index 951f3bf6..563180e3 100644 --- a/src/casefold.py +++ b/src/utf8tabs.py @@ -8,7 +8,7 @@ def read_unidata(catfilter='Lu', casefilter='lowcase', big=False): 'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment', 'upcase', 'lowcase', 'titlecase'], usecols=['code', 'name', 'category', 'bidircat', 'upcase', 'lowcase', 'titlecase']) - if big: + if big: ud = ud[ud['code'] >= (1<<16)] else: ud = ud[ud['code'] < (1<<16)] @@ -84,7 +84,7 @@ def make_casefold(letters): def print_casefold(cfold): print(''' -static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {''') +struct CaseFold casefold[] = {''') n = 1 s = 5 count = 0 @@ -99,7 +99,7 @@ static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {''') break #print(' {%d, %d, %d}, // %s %s, %s\n ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='') if True: # compact - if n == s: + if n == s: n = 0 if a > 1000: s = 4 @@ -129,7 +129,7 @@ def print_casefold_low(table): cfold_low = [i for i in range(len(table))] cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0])) - print('static uint8_t cfold_low[] = {\n ', end='') + print('uint8_t cfold_low[] = {\n ', end='') for i in range(len(cfold_low)): print(" %d," % (cfold_low[i]), end='\n ' if (i+1) % 20 == 0 else '') print('\n};') @@ -138,98 +138,7 @@ def print_casefold_low(table): ########### main: if __name__ == "__main__": - print('''#include <stdint.h> -#include <stdio.h> -#include <ctype.h> -#include <stc/utf8.h> -#include <stdbool.h>''') - + print('#include "utf8tabs.h"') cfold = make_casetable() table = print_casefold(cfold) print_casefold_low(table) - - print(r''' -uint32_t utf8_tolower(uint32_t c) { - for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) { - if (c <= casefold[i].c1) { - if (c < casefold[i].c0) return c; - int d = casefold[i].m1 - casefold[i].c1; - if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1)); - return c + d; - } - } - return c; -} - -uint32_t utf8_toupper(uint32_t c) { - for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) { - struct CaseFold cfold = casefold[cfold_low[i]]; - if (c <= cfold.m1) { - int d = cfold.m1 - cfold.c1; - if (c < (uint32_t)(cfold.c0 + d)) return c; - if (d == 1) return c - ((cfold.m1 & 1) == (c & 1)); - return c - d; - } - } - return c; -} - -bool utf8_isupper(uint32_t c) { - return utf8_tolower(c) != c; -} - -bool utf8_islower(uint32_t c) { - return utf8_toupper(c) != c; -} - -bool utf8_isspace(uint32_t c) { - static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0, - 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000}; - for (size_t i=0; i<sizeof t/sizeof *t; ++i) - if (c == t[i]) return true; - return (c >= 0x2000) & (c <= 0x200A); -} - -bool utf8_isdigit(uint32_t c) { - return ((c >= '0') & (c <= '9')) || - ((c >= 0xFF10) & (c <= 0xFF19)); -} - -bool utf8_isxdigit(uint32_t c) { - static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, - 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46}; - for (size_t i=1; i<sizeof t/sizeof *t; i += 2) - if (c <= t[i]) return c >= t[i - 1]; - return false; -} - -bool utf8_isalnum(uint32_t c) { - if (c < 128) return isalnum(c) != 0; - if ((c >= 0xFF10) & (c <= 0xFF19)) return true; - return utf8_islower(c) || utf8_isupper(c); -} - -bool utf8_isalpha(uint32_t c) { - if (c < 128) return isalpha(c) != 0; - return utf8_islower(c) || utf8_isupper(c); -} - - -#ifdef TEST -int main() -{ - for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) - { - char x[3][5]={0}; - uint32_t a = casefold[i].c0; - uint32_t b = utf8_tolower(a); - uint32_t c = utf8_toupper(b); - - utf8_encode(x[0], a); - utf8_encode(x[1], b); - utf8_encode(x[2], c); - printf("%s %s %s - %u %u %u\n", x[0], x[1], x[2], a, b, c); - } -} -#endif -''') diff --git a/src/utf8utils.c b/src/utf8utils.c new file mode 100644 index 00000000..3b01ae39 --- /dev/null +++ b/src/utf8utils.c @@ -0,0 +1,190 @@ +#include <ctype.h> +#define i_header +#include <stc/cstr.h> + +#include "utf8tabs.h" +#include "utf8tabs.c" + +// https://news.ycombinator.com/item?id=15423674 +// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc + +void utf8_decode(utf8_decode_t *d, const uint8_t b) +{ + switch (d->state) { + case UTF8_OK: + if (b < 0x80) d->codep = b, d->size = 1; + else if (b < 0xC2) d->state = UTF8_ERROR, d->size = 0; + else if (b < 0xE0) d->state = 1, d->codep = b & 0x1F, d->size = 2; + else if (b < 0xF0) d->state = 2, d->codep = b & 0x0F, d->size = 3; + else if (b < 0xF5) d->state = 3, d->codep = b & 0x07, d->size = 4; + else d->state = UTF8_ERROR, d->size = 0; + break; + case 1: case 2: case 3: + if ((b & 0xC0) == 0x80) { + d->state -= 1; + d->codep = (d->codep << 6) | (b & 0x3F); + } else + d->state = UTF8_ERROR, d->size = 0; + } +} + +unsigned utf8_encode(char *out, uint32_t c) +{ + if (c < 0x80U) { + out[0] = (char) c; + return 1; + } else if (c < 0x0800U) { + out[0] = (char) ((c>>6 & 0x1F) | 0xC0); + out[1] = (char) ((c & 0x3F) | 0x80); + return 2; + } else if (c < 0x010000U) { + if ((c < 0xD800U) | (c >= 0xE000U)) { + out[0] = (char) ((c>>12 & 0x0F) | 0xE0); + out[1] = (char) ((c>>6 & 0x3F) | 0x80); + out[2] = (char) ((c & 0x3F) | 0x80); + return 3; + } + } else if (c < 0x110000U) { + out[0] = (char) ((c>>18 & 0x07) | 0xF0); + out[1] = (char) ((c>>12 & 0x3F) | 0x80); + out[2] = (char) ((c>>6 & 0x3F) | 0x80); + out[3] = (char) ((c & 0x3F) | 0x80); + return 4; + } + return 0; +} + +void utf8_peek(utf8_decode_t* d, const char *s) { + utf8_decode(d, (uint8_t)*s++); + switch (d->size) { + case 4: utf8_decode(d, (uint8_t)*s++); + case 3: utf8_decode(d, (uint8_t)*s++); + case 2: utf8_decode(d, (uint8_t)*s++); + } +} + +bool utf8_valid(const char* s) { + utf8_decode_t d = {UTF8_OK}; + while (*s) + utf8_decode(&d, (uint8_t)*s++); + return d.state == UTF8_OK; +} + +bool utf8_valid_n(const char* s, size_t n) { + utf8_decode_t d = {UTF8_OK}; + while ((n-- != 0) & (*s != 0)) + utf8_decode(&d, (uint8_t)*s++); + return d.state == UTF8_OK; +} + +uint32_t utf8_tolower(uint32_t c) { + for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) { + if (c <= casefold[i].c1) { + if (c < casefold[i].c0) return c; + int d = casefold[i].m1 - casefold[i].c1; + if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1)); + return c + d; + } + } + return c; +} + +uint32_t utf8_toupper(uint32_t c) { + for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) { + struct CaseFold cfold = casefold[cfold_low[i]]; + if (c <= cfold.m1) { + int d = cfold.m1 - cfold.c1; + if (c < (uint32_t)(cfold.c0 + d)) return c; + if (d == 1) return c - ((cfold.m1 & 1) == (c & 1)); + return c - d; + } + } + return c; +} + +bool utf8_isupper(uint32_t c) { + return utf8_tolower(c) != c; +} + +bool utf8_islower(uint32_t c) { + return utf8_toupper(c) != c; +} + +bool utf8_isspace(uint32_t c) { + static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0, + 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000}; + for (size_t i=0; i<sizeof t/sizeof *t; ++i) + if (c == t[i]) return true; + return (c >= 0x2000) & (c <= 0x200A); +} + +bool utf8_isdigit(uint32_t c) { + return ((c >= '0') & (c <= '9')) || + ((c >= 0xFF10) & (c <= 0xFF19)); +} + +bool utf8_isxdigit(uint32_t c) { + static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, + 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46}; + for (size_t i=1; i<sizeof t/sizeof *t; i += 2) + if (c <= t[i]) return c >= t[i - 1]; + return false; +} + +bool utf8_isalnum(uint32_t c) { + if (c < 128) return isalnum(c) != 0; + if ((c >= 0xFF10) & (c <= 0xFF19)) return true; + return utf8_islower(c) || utf8_isupper(c); +} + +bool utf8_isalpha(uint32_t c) { + if (c < 128) return isalpha(c) != 0; + return utf8_islower(c) || utf8_isupper(c); +} + +static struct fnfold { + int (*conv_asc)(int); + uint32_t (*conv_u8)(uint32_t); +} +fn_tolower = {tolower, utf8_tolower}, +fn_toupper = {toupper, utf8_toupper}; + + +static cstr cstr_casefold(const cstr* self, struct fnfold fold) { + csview sv = cstr_sv(self); + cstr out = cstr_null; + char *buf = cstr_reserve(&out, sv.size*3/2); + uint32_t cp; size_t sz = 0; + utf8_decode_t d = {UTF8_OK}; + + for (; *sv.str; sv.str += d.size) { + utf8_peek(&d, sv.str); + switch (d.size) { + case 1: + buf[sz++] = (char)fold.conv_asc(*sv.str); + break; + default: + cp = fold.conv_u8(d.codep); + sz += utf8_encode(buf + sz, cp); + } + } + _cstr_set_size(&out, sz); + cstr_shrink_to_fit(&out); + return out; +} + +cstr cstr_tolower(const cstr* self) { + return cstr_casefold(self, fn_tolower); +} + +cstr cstr_toupper(const cstr* self) { + return cstr_casefold(self, fn_toupper); +} + +void cstr_lowercase(cstr* self) { + cstr_take(self, cstr_casefold(self, fn_tolower)); +} + +void cstr_uppercase(cstr* self) { + cstr_take(self, cstr_casefold(self, fn_toupper)); +} |
