From b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3 Mon Sep 17 00:00:00 2001 From: Tyge Lovset Date: Mon, 30 May 2022 10:17:07 +0200 Subject: Large refactoring on strings / utf8 and some file structure. --- src/casefold.c | 207 ------------------------------------------------- src/casefold.py | 235 -------------------------------------------------------- src/cregex.c | 3 +- src/utf8tabs.c | 59 ++++++++++++++ src/utf8tabs.h | 10 +++ src/utf8tabs.py | 144 ++++++++++++++++++++++++++++++++++ src/utf8utils.c | 190 +++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 404 insertions(+), 444 deletions(-) delete mode 100644 src/casefold.c delete mode 100644 src/casefold.py create mode 100644 src/utf8tabs.c create mode 100644 src/utf8tabs.h create mode 100644 src/utf8tabs.py create mode 100644 src/utf8utils.c (limited to 'src') diff --git a/src/casefold.c b/src/casefold.c deleted file mode 100644 index 1b0a9463..00000000 --- a/src/casefold.c +++ /dev/null @@ -1,207 +0,0 @@ -#include -#define i_header -#include - -static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = { - {65, 90, 122}, {181, 181, 956}, {192, 214, 246}, {216, 222, 254}, - {256, 302, 303}, {306, 310, 311}, {313, 327, 328}, {330, 374, 375}, {376, 376, 255}, - {377, 381, 382}, {383, 383, 115}, {385, 385, 595}, {386, 388, 389}, {390, 390, 596}, - {391, 391, 392}, {393, 394, 599}, {395, 395, 396}, {398, 398, 477}, {399, 399, 601}, - {400, 400, 603}, {401, 401, 402}, {403, 403, 608}, {404, 404, 611}, {406, 406, 617}, - {407, 407, 616}, {408, 408, 409}, {412, 412, 623}, {413, 413, 626}, {415, 415, 629}, - {416, 420, 421}, {422, 422, 640}, {423, 423, 424}, {425, 425, 643}, {428, 428, 429}, - {430, 430, 648}, {431, 431, 432}, {433, 434, 651}, {435, 437, 438}, {439, 439, 658}, - {440, 442, 443}, {452, 452, 454}, {453, 453, 454}, {455, 455, 457}, {456, 456, 457}, - {458, 458, 460}, {459, 475, 476}, {478, 494, 495}, {497, 497, 499}, {498, 500, 501}, - {502, 502, 405}, {503, 503, 447}, {504, 542, 543}, {544, 544, 414}, {546, 562, 563}, - {570, 570, 11365}, {571, 571, 572}, {573, 573, 410}, {574, 574, 11366}, {577, 577, 578}, - {579, 579, 384}, {580, 580, 649}, {581, 581, 652}, {582, 590, 591}, {837, 837, 953}, - {880, 882, 883}, {886, 886, 887}, {895, 895, 1011}, {902, 902, 940}, {904, 906, 943}, - {908, 908, 972}, {910, 911, 974}, {913, 929, 961}, {931, 939, 971}, {962, 962, 963}, - {975, 975, 983}, {976, 976, 946}, {977, 977, 952}, {981, 981, 966}, {982, 982, 960}, - {984, 1006, 1007}, {1008, 1008, 954}, {1009, 1009, 961}, {1012, 1012, 952}, {1013, 1013, 949}, - {1015, 1015, 1016}, {1017, 1017, 1010}, {1018, 1018, 1019}, {1021, 1023, 893}, - {1024, 1039, 1119}, {1040, 1071, 1103}, {1120, 1152, 1153}, {1162, 1214, 1215}, - {1216, 1216, 1231}, {1217, 1229, 1230}, {1232, 1326, 1327}, {1329, 1366, 1414}, - {4256, 4293, 11557}, {4295, 4296, 11560}, {5112, 5117, 5109}, {7296, 7296, 1074}, - {7297, 7297, 1076}, {7298, 7298, 1086}, {7299, 7300, 1090}, {7301, 7301, 1090}, - {7302, 7302, 1098}, {7303, 7303, 1123}, {7304, 7304, 42571}, {7312, 7354, 4346}, - {7357, 7359, 4351}, {7680, 7828, 7829}, {7835, 7835, 7777}, {7838, 7838, 223}, - {7840, 7934, 7935}, {7944, 7951, 7943}, {7960, 7965, 7957}, {7976, 7983, 7975}, - {7992, 7999, 7991}, {8008, 8013, 8005}, {8025, 8028, 8020}, {8040, 8047, 8039}, - {8072, 8079, 8071}, {8088, 8095, 8087}, {8104, 8111, 8103}, {8120, 8121, 8113}, - {8122, 8123, 8049}, {8124, 8124, 8115}, {8126, 8126, 953}, {8136, 8139, 8053}, - {8140, 8140, 8131}, {8152, 8153, 8145}, {8154, 8155, 8055}, {8168, 8169, 8161}, - {8170, 8171, 8059}, {8172, 8172, 8165}, {8184, 8185, 8057}, {8186, 8187, 8061}, - {8188, 8188, 8179}, {8486, 8486, 969}, {8490, 8490, 107}, {8491, 8491, 229}, - {8498, 8498, 8526}, {8544, 8559, 8575}, {8579, 8579, 8580}, {9398, 9423, 9449}, - {11264, 11311, 11359}, {11360, 11360, 11361}, {11362, 11362, 619}, {11363, 11363, 7549}, - {11364, 11364, 637}, {11367, 11371, 11372}, {11373, 11373, 593}, {11374, 11374, 625}, - {11375, 11375, 592}, {11376, 11376, 594}, {11378, 11380, 11381}, {11390, 11391, 576}, - {11392, 11490, 11491}, {11499, 11501, 11502}, {11506, 11508, 11509}, {42562, 42604, 42605}, - {42624, 42650, 42651}, {42786, 42798, 42799}, {42802, 42862, 42863}, {42873, 42875, 42876}, - {42877, 42877, 7545}, {42878, 42886, 42887}, {42891, 42891, 42892}, {42893, 42893, 613}, - {42896, 42898, 42899}, {42902, 42920, 42921}, {42922, 42922, 614}, {42923, 42923, 604}, - {42924, 42924, 609}, {42925, 42925, 620}, {42926, 42926, 618}, {42928, 42928, 670}, - {42929, 42929, 647}, {42930, 42930, 669}, {42931, 42931, 43859}, {42932, 42946, 42947}, - {42948, 42948, 42900}, {42949, 42949, 642}, {42950, 42950, 7566}, {42951, 42953, 42954}, - {42960, 42962, 42963}, {42968, 42970, 42971}, {43888, 43913, 5049}, {65313, 65338, 65370}, -}; // 188 -static uint8_t cfold_low[] = { - 0, 138, 10, 111, 2, 139, 3, 8, 4, 5, 6, 7, 9, 59, 12, 14, 16, 20, 49, 25, - 56, 52, 29, 31, 33, 35, 37, 39, 50, 40, 41, 42, 43, 44, 45, 17, 46, 47, 48, 51, - 53, 55, 155, 58, 62, 152, 150, 153, 11, 13, 15, 18, 19, 171, 21, 172, 22, 167, 170, 24, - 23, 174, 146, 173, 26, 151, 27, 28, 148, 30, 181, 32, 176, 34, 60, 36, 61, 38, 177, 175, - 64, 65, 87, 67, 68, 71, 75, 83, 76, 82, 63, 126, 80, 1, 78, 81, 72, 73, 77, 137, - 69, 70, 74, 79, 85, 66, 84, 86, 89, 99, 100, 101, 102, 103, 104, 88, 90, 105, 91, 93, - 92, 94, 95, 107, 108, 186, 98, 164, 147, 182, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, - 124, 127, 130, 134, 132, 135, 120, 121, 122, 123, 125, 128, 129, 131, 133, 136, 140, 141, 142, 143, - 144, 145, 54, 57, 149, 154, 156, 157, 158, 96, 97, 159, 106, 160, 161, 162, 163, 165, 166, 168, - 180, 169, 179, 183, 184, 185, 178, 187, -}; - -uint32_t utf8_tolower(uint32_t c) { - for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) { - if (c <= casefold[i].c1) { - if (c < casefold[i].c0) return c; - int d = casefold[i].m1 - casefold[i].c1; - if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1)); - return c + d; - } - } - return c; -} - -uint32_t utf8_toupper(uint32_t c) { - for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) { - struct CaseFold cfold = casefold[cfold_low[i]]; - if (c <= cfold.m1) { - int d = cfold.m1 - cfold.c1; - if (c < (uint32_t)(cfold.c0 + d)) return c; - if (d == 1) return c - ((cfold.m1 & 1) == (c & 1)); - return c - d; - } - } - return c; -} - -bool utf8_isupper(uint32_t c) { - return utf8_tolower(c) != c; -} - -bool utf8_islower(uint32_t c) { - return utf8_toupper(c) != c; -} - -bool utf8_isspace(uint32_t c) { - static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0, - 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000}; - for (size_t i=0; i= 0x2000) & (c <= 0x200A); -} - -bool utf8_isdigit(uint32_t c) { - return ((c >= '0') & (c <= '9')) || - ((c >= 0xFF10) & (c <= 0xFF19)); -} - -bool utf8_isxdigit(uint32_t c) { - static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, - 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46}; - for (size_t i=1; i= t[i - 1]; - return false; -} - -bool utf8_isalnum(uint32_t c) { - if (c < 128) return isalnum(c) != 0; - if ((c >= 0xFF10) & (c <= 0xFF19)) return true; - return utf8_islower(c) || utf8_isupper(c); -} - -bool utf8_isalpha(uint32_t c) { - if (c < 128) return isalpha(c) != 0; - return utf8_islower(c) || utf8_isupper(c); -} - -static struct fnfold { - int (*conv_asc)(int); - uint32_t (*conv_u8)(uint32_t); -} -fn_tolower = {tolower, utf8_tolower}, -fn_toupper = {toupper, utf8_toupper}; - - -static cstr cstr_casefold(const cstr* self, struct fnfold fold) { - csview sv = cstr_sv(self); - cstr out = cstr_null; - char *buf = cstr_reserve(&out, sv.size*3/2); - uint32_t cp; size_t sz = 0; - utf8_decode_t d = {UTF8_OK}; - - for (; *sv.str; sv.str += d.size) { - utf8_peek(sv.str, &d); - switch (d.size) { - case 1: - buf[sz++] = (char)fold.conv_asc(*sv.str); - break; - default: - cp = fold.conv_u8(d.codep); - sz += utf8_encode(buf + sz, cp); - } - } - _cstr_set_size(&out, sz); - cstr_shrink_to_fit(&out); - return out; -} - -cstr cstr_tolower(const cstr* self) { - return cstr_casefold(self, fn_tolower); -} - -cstr cstr_toupper(const cstr* self) { - return cstr_casefold(self, fn_toupper); -} - -void cstr_lowercase(cstr* self) { - cstr_take(self, cstr_casefold(self, fn_tolower)); -} - -void cstr_uppercase(cstr* self) { - cstr_take(self, cstr_casefold(self, fn_toupper)); -} - -#ifdef TEST -int main() -{ - for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) - { - char x[3][5]={0}; - unsigned s0, s1, s2; - uint32_t a = casefold[i].c0; - uint32_t b = utf8_tolower(a); - uint32_t c = utf8_toupper(b); - - s0 = utf8_encode(x[0], a); - s1 = utf8_encode(x[1], b); - s2 = utf8_encode(x[2], c); - printf("%s %s %s - %u %u %u (%u %u %u)\n", x[0], x[1], x[2], a, b, c, s0, s1, s2); - } - c_auto (cstr, t1) - { - t1 = cstr_new("Die preußischen Köstlichkeiten."); - - cstr_buf b = cstr_buffer(&t1); - printf("%s, %llu %llu\n", b.data, b.size, b.cap); - cstr_lowercase(&t1); - b = cstr_buffer(&t1); - printf("%s, %llu %llu\n", b.data, b.size, b.cap); - - cstr_uppercase(&t1); - b = cstr_buffer(&t1); - printf("%s, %llu %llu\n", b.data, b.size, b.cap); - } -} -#endif - diff --git a/src/casefold.py b/src/casefold.py deleted file mode 100644 index 951f3bf6..00000000 --- a/src/casefold.py +++ /dev/null @@ -1,235 +0,0 @@ -#!python -import pandas as pd -import numpy as np - -def read_unidata(catfilter='Lu', casefilter='lowcase', big=False): - ud = pd.read_csv("ucd/UnicodeData.txt", sep=';', converters={0: lambda x: int(x, base=16)}, - names=['code', 'name', 'category', 'canclass', 'bidircat', 'chrdecomp', - 'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment', - 'upcase', 'lowcase', 'titlecase'], - usecols=['code', 'name', 'category', 'bidircat', 'upcase', 'lowcase', 'titlecase']) - if big: - ud = ud[ud['code'] >= (1<<16)] - else: - ud = ud[ud['code'] < (1<<16)] - ud = ud[ud['category'] == catfilter] - ud = ud.replace(np.nan, '0') - for k in ['upcase', 'lowcase', 'titlecase']: - ud[k] = ud[k].apply(int, base=16) - if casefilter: # 'lowcase', 'upcase', 'titlecase' - ud = ud[ud[casefilter] != 0] - return ud - - -def read_casefold(big=False): - cf = pd.read_csv("ucd/CaseFolding.txt", engine='python', sep='; #? ?', comment='#', - converters={0: lambda x: int(x, base=16)}, - names=['code', 'status', 'lowcase', 'name']) - if big: - cf = cf[cf['code'] >= (1<<16)] - else: - cf = cf[cf['code'] < (1<<16)] - cf = cf[cf.status.isin(['S', 'C'])] - cf['lowcase'] = cf['lowcase'].apply(int, base=16) - #print(cf['name'].values) - #print(cf) - return cf - - -def make_caselist(df): - letters=[] - for idx, row in df.iterrows(): - #print(idx+1, ':', row['code'], row['lowcase'] - row['code'], ',', chr(row['code']), chr(row['lowcase']), ',', row['name']) - letters.append([idx+1, row['code'], row['lowcase'], row['name']]) - return letters - - -def make_casefold(letters): - prevoffset = 0 - diffoffset = 0 - prev = [-1, 0, 0] - diff = [-1, 0, 0] - - out = [] - n = 1 - for x in letters: - offset = x[2] - x[1] - diffoffset = prevoffset - offset - if (diff[1] and x[1] - prev[1] != diff[1]) or (diff[2] and x[2] - prev[2] != diff[2]) or prevoffset != offset: - out.append([x[1], x[2], n, x[3]]) # , ';', chr(x[1]), chr(x[2]), ';', x[1], offset, "CHANGE") - #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset, "CHANGE", n+1) - diff[1] = 0 - diff[2] = 0 - n = 1 - else: - n += 1 - if diff[1] == 0: - diff[1] = x[1] - prev[1] - diff[2] = x[2] - prev[2] - #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset) - diff[1] = x[1] - prev[1] - diff[2] = x[2] - prev[2] - prev[2] = x[2] - prev[1] = x[1] - prevoffset = offset - - out.append(out[-1]) - out[-1][2] = 26 - cfold = [] - for i in range(0, len(out)-1): - d = out[i][1] - out[i][0] - cfold.append([out[i][0], out[i+1][2], d, out[i][3]]) - return cfold - - -def print_casefold(cfold): - print(''' -static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {''') - n = 1 - s = 5 - count = 0 - table = [] - print(' ', end='') - for x in cfold: - d = 2 if abs(x[2]) == 1 else 1 - a = x[0] - b = x[0] + (x[1] - 1)*d - c = b + x[2] - if b >= 1<<16 or c >= 1<<16: # only to make sure... - break - #print(' {%d, %d, %d}, // %s %s, %s\n ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='') - if True: # compact - if n == s: - n = 0 - if a > 1000: - s = 4 - print('\n ', end='') - print(' {%d, %d, %d},' % (a, b, c), end='') - table.append((a, b, c)) - else: - print(' {%d, %d, %d}, // ' % (a, b, c), end='') - for y in range(x[0], x[0] + x[1], d): - print('%s %s, ' % (chr(y), chr(y + x[2])), end='') - print('') - count += 1 - n += 1 - print('\n}; // %d' % (count)) - return table - - -def make_casetable(): - df = read_casefold() - #df = read_unidata() - letters = make_caselist(df) - cfold = make_casefold(letters) - return cfold - - -def print_casefold_low(table): - cfold_low = [i for i in range(len(table))] - cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0])) - - print('static uint8_t cfold_low[] = {\n ', end='') - for i in range(len(cfold_low)): - print(" %d," % (cfold_low[i]), end='\n ' if (i+1) % 20 == 0 else '') - print('\n};') - - -########### main: - -if __name__ == "__main__": - print('''#include -#include -#include -#include -#include ''') - - cfold = make_casetable() - table = print_casefold(cfold) - print_casefold_low(table) - - print(r''' -uint32_t utf8_tolower(uint32_t c) { - for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) { - if (c <= casefold[i].c1) { - if (c < casefold[i].c0) return c; - int d = casefold[i].m1 - casefold[i].c1; - if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1)); - return c + d; - } - } - return c; -} - -uint32_t utf8_toupper(uint32_t c) { - for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) { - struct CaseFold cfold = casefold[cfold_low[i]]; - if (c <= cfold.m1) { - int d = cfold.m1 - cfold.c1; - if (c < (uint32_t)(cfold.c0 + d)) return c; - if (d == 1) return c - ((cfold.m1 & 1) == (c & 1)); - return c - d; - } - } - return c; -} - -bool utf8_isupper(uint32_t c) { - return utf8_tolower(c) != c; -} - -bool utf8_islower(uint32_t c) { - return utf8_toupper(c) != c; -} - -bool utf8_isspace(uint32_t c) { - static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0, - 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000}; - for (size_t i=0; i= 0x2000) & (c <= 0x200A); -} - -bool utf8_isdigit(uint32_t c) { - return ((c >= '0') & (c <= '9')) || - ((c >= 0xFF10) & (c <= 0xFF19)); -} - -bool utf8_isxdigit(uint32_t c) { - static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, - 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46}; - for (size_t i=1; i= t[i - 1]; - return false; -} - -bool utf8_isalnum(uint32_t c) { - if (c < 128) return isalnum(c) != 0; - if ((c >= 0xFF10) & (c <= 0xFF19)) return true; - return utf8_islower(c) || utf8_isupper(c); -} - -bool utf8_isalpha(uint32_t c) { - if (c < 128) return isalpha(c) != 0; - return utf8_islower(c) || utf8_isupper(c); -} - - -#ifdef TEST -int main() -{ - for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) - { - char x[3][5]={0}; - uint32_t a = casefold[i].c0; - uint32_t b = utf8_tolower(a); - uint32_t c = utf8_toupper(b); - - utf8_encode(x[0], a); - utf8_encode(x[1], b); - utf8_encode(x[2], c); - printf("%s %s %s - %u %u %u\n", x[0], x[1], x[2], a, b, c); - } -} -#endif -''') diff --git a/src/cregex.c b/src/cregex.c index 0f585f5d..34c78090 100644 --- a/src/cregex.c +++ b/src/cregex.c @@ -32,7 +32,6 @@ THE SOFTWARE. #include #include #include -#include "cregex_utf8.c" typedef uint32_t Rune; /* Utf8 code point */ typedef int32_t Token; @@ -594,7 +593,7 @@ nextc(Parser *par, Rune *rp) return 2; case 'p': case 'P': { /* https://www.regular-expressions.info/unicode.html */ static struct { const char* c; int n, r; } cls[] = { - {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC}, + {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC}, {"{Alnum}", 7, U8_Xan}, {"{Digit}", 7, U8_Nd}, {"{Nd}", 4, U8_Nd}, {"{Lower}", 7, U8_Ll}, {"{Ll}", 4, U8_Ll}, diff --git a/src/utf8tabs.c b/src/utf8tabs.c new file mode 100644 index 00000000..8168f78f --- /dev/null +++ b/src/utf8tabs.c @@ -0,0 +1,59 @@ +#include "utf8tabs.h" + +struct CaseFold casefold[] = { + {65, 90, 122}, {181, 181, 956}, {192, 214, 246}, {216, 222, 254}, + {256, 302, 303}, {306, 310, 311}, {313, 327, 328}, {330, 374, 375}, {376, 376, 255}, + {377, 381, 382}, {383, 383, 115}, {385, 385, 595}, {386, 388, 389}, {390, 390, 596}, + {391, 391, 392}, {393, 394, 599}, {395, 395, 396}, {398, 398, 477}, {399, 399, 601}, + {400, 400, 603}, {401, 401, 402}, {403, 403, 608}, {404, 404, 611}, {406, 406, 617}, + {407, 407, 616}, {408, 408, 409}, {412, 412, 623}, {413, 413, 626}, {415, 415, 629}, + {416, 420, 421}, {422, 422, 640}, {423, 423, 424}, {425, 425, 643}, {428, 428, 429}, + {430, 430, 648}, {431, 431, 432}, {433, 434, 651}, {435, 437, 438}, {439, 439, 658}, + {440, 442, 443}, {452, 452, 454}, {453, 453, 454}, {455, 455, 457}, {456, 456, 457}, + {458, 458, 460}, {459, 475, 476}, {478, 494, 495}, {497, 497, 499}, {498, 500, 501}, + {502, 502, 405}, {503, 503, 447}, {504, 542, 543}, {544, 544, 414}, {546, 562, 563}, + {570, 570, 11365}, {571, 571, 572}, {573, 573, 410}, {574, 574, 11366}, {577, 577, 578}, + {579, 579, 384}, {580, 580, 649}, {581, 581, 652}, {582, 590, 591}, {837, 837, 953}, + {880, 882, 883}, {886, 886, 887}, {895, 895, 1011}, {902, 902, 940}, {904, 906, 943}, + {908, 908, 972}, {910, 911, 974}, {913, 929, 961}, {931, 939, 971}, {962, 962, 963}, + {975, 975, 983}, {976, 976, 946}, {977, 977, 952}, {981, 981, 966}, {982, 982, 960}, + {984, 1006, 1007}, {1008, 1008, 954}, {1009, 1009, 961}, {1012, 1012, 952}, {1013, 1013, 949}, + {1015, 1015, 1016}, {1017, 1017, 1010}, {1018, 1018, 1019}, {1021, 1023, 893}, + {1024, 1039, 1119}, {1040, 1071, 1103}, {1120, 1152, 1153}, {1162, 1214, 1215}, + {1216, 1216, 1231}, {1217, 1229, 1230}, {1232, 1326, 1327}, {1329, 1366, 1414}, + {4256, 4293, 11557}, {4295, 4296, 11560}, {5112, 5117, 5109}, {7296, 7296, 1074}, + {7297, 7297, 1076}, {7298, 7298, 1086}, {7299, 7300, 1090}, {7301, 7301, 1090}, + {7302, 7302, 1098}, {7303, 7303, 1123}, {7304, 7304, 42571}, {7312, 7354, 4346}, + {7357, 7359, 4351}, {7680, 7828, 7829}, {7835, 7835, 7777}, {7838, 7838, 223}, + {7840, 7934, 7935}, {7944, 7951, 7943}, {7960, 7965, 7957}, {7976, 7983, 7975}, + {7992, 7999, 7991}, {8008, 8013, 8005}, {8025, 8028, 8020}, {8040, 8047, 8039}, + {8072, 8079, 8071}, {8088, 8095, 8087}, {8104, 8111, 8103}, {8120, 8121, 8113}, + {8122, 8123, 8049}, {8124, 8124, 8115}, {8126, 8126, 953}, {8136, 8139, 8053}, + {8140, 8140, 8131}, {8152, 8153, 8145}, {8154, 8155, 8055}, {8168, 8169, 8161}, + {8170, 8171, 8059}, {8172, 8172, 8165}, {8184, 8185, 8057}, {8186, 8187, 8061}, + {8188, 8188, 8179}, {8486, 8486, 969}, {8490, 8490, 107}, {8491, 8491, 229}, + {8498, 8498, 8526}, {8544, 8559, 8575}, {8579, 8579, 8580}, {9398, 9423, 9449}, + {11264, 11311, 11359}, {11360, 11360, 11361}, {11362, 11362, 619}, {11363, 11363, 7549}, + {11364, 11364, 637}, {11367, 11371, 11372}, {11373, 11373, 593}, {11374, 11374, 625}, + {11375, 11375, 592}, {11376, 11376, 594}, {11378, 11380, 11381}, {11390, 11391, 576}, + {11392, 11490, 11491}, {11499, 11501, 11502}, {11506, 11508, 11509}, {42562, 42604, 42605}, + {42624, 42650, 42651}, {42786, 42798, 42799}, {42802, 42862, 42863}, {42873, 42875, 42876}, + {42877, 42877, 7545}, {42878, 42886, 42887}, {42891, 42891, 42892}, {42893, 42893, 613}, + {42896, 42898, 42899}, {42902, 42920, 42921}, {42922, 42922, 614}, {42923, 42923, 604}, + {42924, 42924, 609}, {42925, 42925, 620}, {42926, 42926, 618}, {42928, 42928, 670}, + {42929, 42929, 647}, {42930, 42930, 669}, {42931, 42931, 43859}, {42932, 42946, 42947}, + {42948, 42948, 42900}, {42949, 42949, 642}, {42950, 42950, 7566}, {42951, 42953, 42954}, + {42960, 42962, 42963}, {42968, 42970, 42971}, {43888, 43913, 5049}, {65313, 65338, 65370}, +}; // 188 +uint8_t cfold_low[] = { + 0, 138, 10, 111, 2, 139, 3, 8, 4, 5, 6, 7, 9, 59, 12, 14, 16, 20, 49, 25, + 56, 52, 29, 31, 33, 35, 37, 39, 50, 40, 41, 42, 43, 44, 45, 17, 46, 47, 48, 51, + 53, 55, 155, 58, 62, 152, 150, 153, 11, 13, 15, 18, 19, 171, 21, 172, 22, 167, 170, 24, + 23, 174, 146, 173, 26, 151, 27, 28, 148, 30, 181, 32, 176, 34, 60, 36, 61, 38, 177, 175, + 64, 65, 87, 67, 68, 71, 75, 83, 76, 82, 63, 126, 80, 1, 78, 81, 72, 73, 77, 137, + 69, 70, 74, 79, 85, 66, 84, 86, 89, 99, 100, 101, 102, 103, 104, 88, 90, 105, 91, 93, + 92, 94, 95, 107, 108, 186, 98, 164, 147, 182, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, + 124, 127, 130, 134, 132, 135, 120, 121, 122, 123, 125, 128, 129, 131, 133, 136, 140, 141, 142, 143, + 144, 145, 54, 57, 149, 154, 156, 157, 158, 96, 97, 159, 106, 160, 161, 162, 163, 165, 166, 168, + 180, 169, 179, 183, 184, 185, 178, 187, +}; diff --git a/src/utf8tabs.h b/src/utf8tabs.h new file mode 100644 index 00000000..95251f75 --- /dev/null +++ b/src/utf8tabs.h @@ -0,0 +1,10 @@ +#ifndef utf8tabs_included +#define utf8tabs_included + +#include +struct CaseFold { uint16_t c0, c1, m1; } ; + +extern struct CaseFold casefold[188]; +extern uint8_t cfold_low[188]; + +#endif diff --git a/src/utf8tabs.py b/src/utf8tabs.py new file mode 100644 index 00000000..563180e3 --- /dev/null +++ b/src/utf8tabs.py @@ -0,0 +1,144 @@ +#!python +import pandas as pd +import numpy as np + +def read_unidata(catfilter='Lu', casefilter='lowcase', big=False): + ud = pd.read_csv("ucd/UnicodeData.txt", sep=';', converters={0: lambda x: int(x, base=16)}, + names=['code', 'name', 'category', 'canclass', 'bidircat', 'chrdecomp', + 'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment', + 'upcase', 'lowcase', 'titlecase'], + usecols=['code', 'name', 'category', 'bidircat', 'upcase', 'lowcase', 'titlecase']) + if big: + ud = ud[ud['code'] >= (1<<16)] + else: + ud = ud[ud['code'] < (1<<16)] + ud = ud[ud['category'] == catfilter] + ud = ud.replace(np.nan, '0') + for k in ['upcase', 'lowcase', 'titlecase']: + ud[k] = ud[k].apply(int, base=16) + if casefilter: # 'lowcase', 'upcase', 'titlecase' + ud = ud[ud[casefilter] != 0] + return ud + + +def read_casefold(big=False): + cf = pd.read_csv("ucd/CaseFolding.txt", engine='python', sep='; #? ?', comment='#', + converters={0: lambda x: int(x, base=16)}, + names=['code', 'status', 'lowcase', 'name']) + if big: + cf = cf[cf['code'] >= (1<<16)] + else: + cf = cf[cf['code'] < (1<<16)] + cf = cf[cf.status.isin(['S', 'C'])] + cf['lowcase'] = cf['lowcase'].apply(int, base=16) + #print(cf['name'].values) + #print(cf) + return cf + + +def make_caselist(df): + letters=[] + for idx, row in df.iterrows(): + #print(idx+1, ':', row['code'], row['lowcase'] - row['code'], ',', chr(row['code']), chr(row['lowcase']), ',', row['name']) + letters.append([idx+1, row['code'], row['lowcase'], row['name']]) + return letters + + +def make_casefold(letters): + prevoffset = 0 + diffoffset = 0 + prev = [-1, 0, 0] + diff = [-1, 0, 0] + + out = [] + n = 1 + for x in letters: + offset = x[2] - x[1] + diffoffset = prevoffset - offset + if (diff[1] and x[1] - prev[1] != diff[1]) or (diff[2] and x[2] - prev[2] != diff[2]) or prevoffset != offset: + out.append([x[1], x[2], n, x[3]]) # , ';', chr(x[1]), chr(x[2]), ';', x[1], offset, "CHANGE") + #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset, "CHANGE", n+1) + diff[1] = 0 + diff[2] = 0 + n = 1 + else: + n += 1 + if diff[1] == 0: + diff[1] = x[1] - prev[1] + diff[2] = x[2] - prev[2] + #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset) + diff[1] = x[1] - prev[1] + diff[2] = x[2] - prev[2] + prev[2] = x[2] + prev[1] = x[1] + prevoffset = offset + + out.append(out[-1]) + out[-1][2] = 26 + cfold = [] + for i in range(0, len(out)-1): + d = out[i][1] - out[i][0] + cfold.append([out[i][0], out[i+1][2], d, out[i][3]]) + return cfold + + +def print_casefold(cfold): + print(''' +struct CaseFold casefold[] = {''') + n = 1 + s = 5 + count = 0 + table = [] + print(' ', end='') + for x in cfold: + d = 2 if abs(x[2]) == 1 else 1 + a = x[0] + b = x[0] + (x[1] - 1)*d + c = b + x[2] + if b >= 1<<16 or c >= 1<<16: # only to make sure... + break + #print(' {%d, %d, %d}, // %s %s, %s\n ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='') + if True: # compact + if n == s: + n = 0 + if a > 1000: + s = 4 + print('\n ', end='') + print(' {%d, %d, %d},' % (a, b, c), end='') + table.append((a, b, c)) + else: + print(' {%d, %d, %d}, // ' % (a, b, c), end='') + for y in range(x[0], x[0] + x[1], d): + print('%s %s, ' % (chr(y), chr(y + x[2])), end='') + print('') + count += 1 + n += 1 + print('\n}; // %d' % (count)) + return table + + +def make_casetable(): + df = read_casefold() + #df = read_unidata() + letters = make_caselist(df) + cfold = make_casefold(letters) + return cfold + + +def print_casefold_low(table): + cfold_low = [i for i in range(len(table))] + cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0])) + + print('uint8_t cfold_low[] = {\n ', end='') + for i in range(len(cfold_low)): + print(" %d," % (cfold_low[i]), end='\n ' if (i+1) % 20 == 0 else '') + print('\n};') + + +########### main: + +if __name__ == "__main__": + print('#include "utf8tabs.h"') + cfold = make_casetable() + table = print_casefold(cfold) + print_casefold_low(table) diff --git a/src/utf8utils.c b/src/utf8utils.c new file mode 100644 index 00000000..3b01ae39 --- /dev/null +++ b/src/utf8utils.c @@ -0,0 +1,190 @@ +#include +#define i_header +#include + +#include "utf8tabs.h" +#include "utf8tabs.c" + +// https://news.ycombinator.com/item?id=15423674 +// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc + +void utf8_decode(utf8_decode_t *d, const uint8_t b) +{ + switch (d->state) { + case UTF8_OK: + if (b < 0x80) d->codep = b, d->size = 1; + else if (b < 0xC2) d->state = UTF8_ERROR, d->size = 0; + else if (b < 0xE0) d->state = 1, d->codep = b & 0x1F, d->size = 2; + else if (b < 0xF0) d->state = 2, d->codep = b & 0x0F, d->size = 3; + else if (b < 0xF5) d->state = 3, d->codep = b & 0x07, d->size = 4; + else d->state = UTF8_ERROR, d->size = 0; + break; + case 1: case 2: case 3: + if ((b & 0xC0) == 0x80) { + d->state -= 1; + d->codep = (d->codep << 6) | (b & 0x3F); + } else + d->state = UTF8_ERROR, d->size = 0; + } +} + +unsigned utf8_encode(char *out, uint32_t c) +{ + if (c < 0x80U) { + out[0] = (char) c; + return 1; + } else if (c < 0x0800U) { + out[0] = (char) ((c>>6 & 0x1F) | 0xC0); + out[1] = (char) ((c & 0x3F) | 0x80); + return 2; + } else if (c < 0x010000U) { + if ((c < 0xD800U) | (c >= 0xE000U)) { + out[0] = (char) ((c>>12 & 0x0F) | 0xE0); + out[1] = (char) ((c>>6 & 0x3F) | 0x80); + out[2] = (char) ((c & 0x3F) | 0x80); + return 3; + } + } else if (c < 0x110000U) { + out[0] = (char) ((c>>18 & 0x07) | 0xF0); + out[1] = (char) ((c>>12 & 0x3F) | 0x80); + out[2] = (char) ((c>>6 & 0x3F) | 0x80); + out[3] = (char) ((c & 0x3F) | 0x80); + return 4; + } + return 0; +} + +void utf8_peek(utf8_decode_t* d, const char *s) { + utf8_decode(d, (uint8_t)*s++); + switch (d->size) { + case 4: utf8_decode(d, (uint8_t)*s++); + case 3: utf8_decode(d, (uint8_t)*s++); + case 2: utf8_decode(d, (uint8_t)*s++); + } +} + +bool utf8_valid(const char* s) { + utf8_decode_t d = {UTF8_OK}; + while (*s) + utf8_decode(&d, (uint8_t)*s++); + return d.state == UTF8_OK; +} + +bool utf8_valid_n(const char* s, size_t n) { + utf8_decode_t d = {UTF8_OK}; + while ((n-- != 0) & (*s != 0)) + utf8_decode(&d, (uint8_t)*s++); + return d.state == UTF8_OK; +} + +uint32_t utf8_tolower(uint32_t c) { + for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) { + if (c <= casefold[i].c1) { + if (c < casefold[i].c0) return c; + int d = casefold[i].m1 - casefold[i].c1; + if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1)); + return c + d; + } + } + return c; +} + +uint32_t utf8_toupper(uint32_t c) { + for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) { + struct CaseFold cfold = casefold[cfold_low[i]]; + if (c <= cfold.m1) { + int d = cfold.m1 - cfold.c1; + if (c < (uint32_t)(cfold.c0 + d)) return c; + if (d == 1) return c - ((cfold.m1 & 1) == (c & 1)); + return c - d; + } + } + return c; +} + +bool utf8_isupper(uint32_t c) { + return utf8_tolower(c) != c; +} + +bool utf8_islower(uint32_t c) { + return utf8_toupper(c) != c; +} + +bool utf8_isspace(uint32_t c) { + static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0, + 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000}; + for (size_t i=0; i= 0x2000) & (c <= 0x200A); +} + +bool utf8_isdigit(uint32_t c) { + return ((c >= '0') & (c <= '9')) || + ((c >= 0xFF10) & (c <= 0xFF19)); +} + +bool utf8_isxdigit(uint32_t c) { + static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, + 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46}; + for (size_t i=1; i= t[i - 1]; + return false; +} + +bool utf8_isalnum(uint32_t c) { + if (c < 128) return isalnum(c) != 0; + if ((c >= 0xFF10) & (c <= 0xFF19)) return true; + return utf8_islower(c) || utf8_isupper(c); +} + +bool utf8_isalpha(uint32_t c) { + if (c < 128) return isalpha(c) != 0; + return utf8_islower(c) || utf8_isupper(c); +} + +static struct fnfold { + int (*conv_asc)(int); + uint32_t (*conv_u8)(uint32_t); +} +fn_tolower = {tolower, utf8_tolower}, +fn_toupper = {toupper, utf8_toupper}; + + +static cstr cstr_casefold(const cstr* self, struct fnfold fold) { + csview sv = cstr_sv(self); + cstr out = cstr_null; + char *buf = cstr_reserve(&out, sv.size*3/2); + uint32_t cp; size_t sz = 0; + utf8_decode_t d = {UTF8_OK}; + + for (; *sv.str; sv.str += d.size) { + utf8_peek(&d, sv.str); + switch (d.size) { + case 1: + buf[sz++] = (char)fold.conv_asc(*sv.str); + break; + default: + cp = fold.conv_u8(d.codep); + sz += utf8_encode(buf + sz, cp); + } + } + _cstr_set_size(&out, sz); + cstr_shrink_to_fit(&out); + return out; +} + +cstr cstr_tolower(const cstr* self) { + return cstr_casefold(self, fn_tolower); +} + +cstr cstr_toupper(const cstr* self) { + return cstr_casefold(self, fn_toupper); +} + +void cstr_lowercase(cstr* self) { + cstr_take(self, cstr_casefold(self, fn_tolower)); +} + +void cstr_uppercase(cstr* self) { + cstr_take(self, cstr_casefold(self, fn_toupper)); +} -- cgit v1.2.3