diff options
| author | Tyge Løvset <[email protected]> | 2022-06-09 15:41:18 +0200 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-06-09 15:41:18 +0200 |
| commit | 3ee36759b8567a72a8349c312fc7dbe975de9e02 (patch) | |
| tree | 131444f3c7a679fb32415aa957b94181980fbf17 /src | |
| parent | 7bbc74bb5c0778504f098974ba79de122cee7211 (diff) | |
| download | STC-modified-3ee36759b8567a72a8349c312fc7dbe975de9e02.tar.gz STC-modified-3ee36759b8567a72a8349c312fc7dbe975de9e02.zip | |
Removed the addition of circled letters and Roman numerals from toupper/tolower
Diffstat (limited to 'src')
| -rw-r--r-- | src/utf8code.c | 30 | ||||
| -rw-r--r-- | src/utf8tabs.inc | 20 | ||||
| -rw-r--r-- | src/utf8tabs.py | 11 |
3 files changed, 29 insertions, 32 deletions
diff --git a/src/utf8code.c b/src/utf8code.c index 3f99c017..9613ba95 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -62,10 +62,10 @@ bool utf8_valid_n(const char* s, size_t n) { uint32_t utf8_casefold(uint32_t c) { for (size_t i=0; i < casefold_len; ++i) { const struct CaseMapping entry = casemappings[i]; - if (c <= entry.c1) { - if (c < entry.c0) return c; - int d = entry.m1 - entry.c1; - if (d == 1) return c + ((entry.c1 & 1) == (c & 1)); + if (c <= entry.c2) { + if (c < entry.c1) return c; + int d = entry.m2 - entry.c2; + if (d == 1) return c + ((entry.c2 & 1) == (c & 1)); return c + d; } } @@ -75,10 +75,10 @@ uint32_t utf8_casefold(uint32_t c) { uint32_t utf8_tolower(uint32_t c) { for (size_t i=0; i < sizeof upcase_ind/sizeof *upcase_ind; ++i) { const struct CaseMapping entry = casemappings[upcase_ind[i]]; - if (c <= entry.c1) { - if (c < entry.c0) return c; - int d = entry.m1 - entry.c1; - if (d == 1) return c + ((entry.c1 & 1) == (c & 1)); + if (c <= entry.c2) { + if (c < entry.c1) return c; + int d = entry.m2 - entry.c2; + if (d == 1) return c + ((entry.c2 & 1) == (c & 1)); return c + d; } } @@ -88,10 +88,10 @@ uint32_t utf8_tolower(uint32_t c) { uint32_t utf8_toupper(uint32_t c) { for (size_t i=0; i < sizeof lowcase_ind/sizeof *lowcase_ind; ++i) { const struct CaseMapping entry = casemappings[lowcase_ind[i]]; - if (c <= entry.m1) { - int d = entry.m1 - entry.c1; - if (c < (uint32_t)(entry.c0 + d)) return c; - if (d == 1) return c - ((entry.m1 & 1) == (c & 1)); + if (c <= entry.m2) { + int d = entry.m2 - entry.c2; + if (c < (uint32_t)(entry.c1 + d)) return c; + if (d == 1) return c - ((entry.m2 & 1) == (c & 1)); return c - d; } } @@ -164,8 +164,8 @@ bool utf8_isalpha(uint32_t c) { } static struct fncase { - int (*conv_asc)(int); - uint32_t (*conv_u8)(uint32_t); + int (*conv_asc)(int); + uint32_t (*conv_utf)(uint32_t); } fn_tofold = {tolower, utf8_casefold}, fn_tolower = {tolower, utf8_tolower}, @@ -184,7 +184,7 @@ static cstr cstr_tocase(const cstr* 
self, struct fncase fn) { if (d.codep < 128) buf[sz++] = (char)fn.conv_asc(d.codep); else { - cp = fn.conv_u8(d.codep); + cp = fn.conv_utf(d.codep); sz += utf8_encode(buf + sz, cp); } } diff --git a/src/utf8tabs.inc b/src/utf8tabs.inc index 28f30978..3ece077e 100644 --- a/src/utf8tabs.inc +++ b/src/utf8tabs.inc @@ -1,6 +1,6 @@ #include <stdint.h> -struct CaseMapping { uint16_t c0, c1, m1; }; +struct CaseMapping { uint16_t c1, c2, m2; }; static struct CaseMapping casemappings[] = { {0x0041, 0x005A, 0x007A}, // A a (26) LATIN CAPITAL LETTER A @@ -225,19 +225,19 @@ static struct CaseMapping casemappings[] = { enum { casefold_len = 192 }; -static uint8_t upcase_ind[164] = { +static uint8_t upcase_ind[162] = { 0, 2, 3, 4, 192, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 45, 193, 47, 48, 194, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 80, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 195, 196, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 125, 126, 129, 131, 132, 133, 134, 135, 136, 137, 139, 140, 141, 142, 143, 144, 145, 146, - 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, - 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, - 187, 188, 189, 191, + 120, 121, 125, 126, 129, 131, 132, 133, 134, 135, 136, 137, 139, 140, 141, 142, 144, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, + 189, 191, }; -static uint8_t lowcase_ind[186] = { +static uint8_t lowcase_ind[184] = { 0, 197, 113, 2, 3, 8, 4, 198, 5, 6, 7, 9, 199, 60, 12, 14, 16, 20, 50, 25, 57, 53, 29, 31, 33, 35, 37, 39, 40, 51, 41, 43, 45, 193, 17, 47, 48, 194, 52, 
54, 56, 158, 59, 63, 154, 152, 155, 11, 13, 15, 18, 19, 174, 21, 175, 22, 170, 173, 24, 23, @@ -245,7 +245,7 @@ static uint8_t lowcase_ind[186] = { 66, 88, 68, 69, 72, 200, 73, 70, 71, 201, 202, 203, 204, 75, 80, 205, 206, 86, 67, 207, 85, 87, 90, 89, 91, 92, 94, 93, 95, 96, 109, 110, 196, 208, 209, 210, 211, 212, 213, 214, 215, 167, 149, 185, 111, 216, 114, 115, 116, 117, 118, 119, 120, 121, 126, 129, 132, 136, 134, 137, - 122, 123, 124, 125, 127, 217, 130, 131, 133, 135, 138, 142, 143, 144, 145, 146, 147, 55, 58, 151, - 156, 157, 159, 160, 161, 97, 98, 99, 162, 163, 164, 165, 166, 168, 169, 171, 183, 172, 182, 186, - 187, 188, 189, 181, 195, 191, + 122, 123, 124, 125, 127, 217, 130, 131, 133, 135, 138, 142, 144, 146, 147, 55, 58, 151, 156, 157, + 159, 160, 161, 97, 98, 99, 162, 163, 164, 165, 166, 168, 169, 171, 183, 172, 182, 186, 187, 188, + 189, 181, 195, 191, }; diff --git a/src/utf8tabs.py b/src/utf8tabs.py index dfaec055..4a5781f1 100644 --- a/src/utf8tabs.py +++ b/src/utf8tabs.py @@ -112,7 +112,7 @@ def compile_table(casetype='lowcase', category=None): def main(): print('#include <stdint.h>\n') - print('struct CaseMapping { uint16_t c0, c1, m1; };\n') + print('struct CaseMapping { uint16_t c1, c2, m2; };\n') casemappings = compile_table('lowcase') # CaseFolding.txt upcase = compile_table('lowcase', 'Lu') # UnicodeData.txt uppercase @@ -145,15 +145,12 @@ def main(): print_table('casemappings', casemappings, style=1) print('enum { casefold_len = %d };' % casefolding_len) - # add "missing" mappings: - for c in ('Ⅰ', 'Ⓐ'): - upcase_ind.append(next(i for i,x in enumerate(casemappings) if x[0]==ord(c))) - for c in ('ẞ', 'Ⅰ', 'Ⓐ'): - lowcase_ind.append(next(i for i,x in enumerate(casemappings) if x[0]==ord(c))) - + # upcase => low upcase_ind.sort(key=lambda i: casemappings[i][0]) print_index_table('upcase_ind', upcase_ind) + # lowcase => up. 
add "missing" SHARP S caused by https://www.unicode.org/policies/stability_policy.html#Case_Pair + lowcase_ind.append(next(i for i,x in enumerate(casemappings) if x[0]==ord('ẞ'))) lowcase_ind.sort(key=lambda i: casemappings[i][2] - (casemappings[i][1] - casemappings[i][0])) print_index_table('lowcase_ind', lowcase_ind) |
