diff options
| author | Tyge Løvset <[email protected]> | 2022-02-23 07:32:42 +0100 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-02-23 07:32:42 +0100 |
| commit | 2e99df7992876c7270192342d3def4a4f7c82319 (patch) | |
| tree | bb3796ff581440e2eb62c368c8c6d24d79e98f89 /src/casefold.py | |
| parent | 575642c5f17e5a226cbfae7c4511583c964d5335 (diff) | |
| download | STC-modified-2e99df7992876c7270192342d3def4a4f7c82319.tar.gz STC-modified-2e99df7992876c7270192342d3def4a4f7c82319.zip | |
Added official unicode tables and python table generator for casefolding.
Diffstat (limited to 'src/casefold.py')
| -rw-r--r-- | src/casefold.py | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/src/casefold.py b/src/casefold.py new file mode 100644 index 00000000..17b0b99e --- /dev/null +++ b/src/casefold.py @@ -0,0 +1,253 @@ +#!python +import pandas as pd +import numpy as np + +def read_unidata(catfilter, casefilter=None, big=False): + ud = pd.read_csv("ucd/UnicodeData.txt", sep=';', converters={0: lambda x: int(x, base=16)}, + names=['code', 'name', 'category', 'canclass', 'bidircat', 'chrdecomp', + 'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment', + 'upcase', 'lowcase', 'titlecase'], + usecols=['code', 'name', 'category', 'bidircat', 'upcase', 'lowcase', 'titlecase']) + if big: + ud = ud[ud['code'] >= (1<<16)] + else: + ud = ud[ud['code'] < (1<<16)] + ud = ud[ud['category'] == catfilter] + ud = ud.replace(np.nan, '0') + for k in ['upcase', 'lowcase', 'titlecase']: + ud[k] = ud[k].apply(int, base=16) + if casefilter: # 'lowcase', 'upcase', 'titlecase' + ud = ud[ud[casefilter] != 0] + return ud + + +def read_casefold(big=False): + cf = pd.read_csv("ucd/CaseFolding.txt", engine='python', sep='; #? ?', comment='#', + converters={0: lambda x: int(x, base=16)}, + names=['code', 'status', 'lowcase', 'name']) + if big: + cf = cf[cf['code'] >= (1<<16)] + else: + cf = cf[cf['code'] < (1<<16)] + cf = cf[cf.status.isin(['S', 'C'])] + cf['lowcase'] = cf['lowcase'].apply(int, base=16) + #print(cf['name'].values) + #print(cf) + return cf + + +def make_caselist(df): + letters=[] + for idx, row in df.iterrows(): + #print(idx+1, ':', row['code'], row['lowcase'] - row['code'], ',', chr(row['code']), chr(row['lowcase']), ',', row['name']) + letters.append([idx+1, row['code'], row['lowcase'], row['name']]) + return letters + + +def make_casefold(letters): + prevoffset = 0 + diffoffset = 0 + prev = [-1, 0, 0] + diff = [-1, 0, 0] + + out = [] + n = 1 + for x in letters: + offset = x[2] - x[1] + diffoffset = prevoffset - offset + if (diff[1] and x[1] - prev[1] != diff[1]) or (diff[2] and x[2] - prev[2] != diff[2]) or prevoffset != offset: + out.append([x[1], x[2], n, x[3]]) # , ';', chr(x[1]), chr(x[2]), ';', x[1], offset, "CHANGE") + #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset, "CHANGE", n+1) + diff[1] = 0 + diff[2] = 0 + n = 1 + else: + n += 1 + if diff[1] == 0: + diff[1] = x[1] - prev[1] + diff[2] = x[2] - prev[2] + #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset) + diff[1] = x[1] - prev[1] + diff[2] = x[2] - prev[2] + prev[2] = x[2] + prev[1] = x[1] + prevoffset = offset + + out.append(out[-1]) + out[-1][2] = 26 + cfold = [] + for i in range(0, len(out)-1): + d = out[i][1] - out[i][0] + cfold.append([out[i][0], out[i+1][2], d, out[i][3]]) + return cfold + + +def print_casefold(cfold): + print(''' +static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {''') + n = 1 + s = 5 + count = 0 + table = [] + print(' ', end='') + for x in cfold: + d = 2 if abs(x[2]) == 1 else 1 + a = x[0] + b = x[0] + (x[1] - 1)*d + c = b + x[2] + if b >= 1<<16 or c >= 1<<16: # only to make sure... + break + #print(' {%d, %d, %d}, // %s %s, %s\n ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='') + if True: # compact + if n == s: + n = 0 + if a > 1000: + s = 4 + print('\n ', end='') + print(' {%d, %d, %d},' % (a, b, c), end='') + table.append((a, b, c)) + else: + print(' {%d, %d, %d}, // ' % (a, b, c), end='') + for y in range(x[0], x[0] + x[1], d): + print('%s %s, ' % (chr(y), chr(y + x[2])), end='') + print('') + count += 1 + n += 1 + print('\n}; // %d' % (count)) + return table + + +def make_casetable(): + df = read_casefold() + letters = make_caselist(df) + cfold = make_casefold(letters) + return cfold + + +def print_casefold_low(table): + cfold_low = [i for i in range(len(table))] + cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0])) + + print('static uint8_t cfold_low[] = {\n ', end='') + for i in range(len(cfold_low)): + print(" %d," % (cfold_low[i]), end='\n ' if (i+1) % 20 == 0 else '') + print('\n};') + + +########### main: + +if __name__ == "__main__": + print('''#include <stdint.h> +#include <stdio.h> +#include <ctype.h> +#include <stdbool.h>''') + + cfold = make_casetable() + table = print_casefold(cfold) + print_casefold_low(table) + + print(r''' +uint32_t utf8_tolower(uint32_t c) { + for (int i=0; i < sizeof casefold/sizeof *casefold; ++i) { + if (c <= casefold[i].c1) { + if (c < casefold[i].c0) return c; + int d = casefold[i].m1 - casefold[i].c1; + if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1)); + return c + d; + } + } + return c; +} + +uint32_t utf8_toupper(uint32_t c) { + for (int i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) { + struct CaseFold cfold = casefold[cfold_low[i]]; + if (c <= cfold.m1) { + int d = cfold.m1 - cfold.c1; + if (c < cfold.c0 + d) return c; + if (d == 1) return c - ((cfold.m1 & 1) == (c & 1)); + return c - d; + } + } + return c; +} + +bool utf8_isupper(uint32_t c) { + return utf8_tolower(c) != c; +} + +bool utf8_islower(uint32_t c) { + return utf8_toupper(c) != c; +} + +bool utf8_isspace(uint32_t c) { + static uint16_t t[] = {0x09, 0x0D, 0x20, 0x85, 0xA0, 0x1680, + 0x2028, 0x2029, 0x202F, 0x205F, 0x3000}; + for (int i=0; i<sizeof t/sizeof *t; ++i) + if (c == t[i]) return true; + return (c >= 0x2000) & (c <= 0x200A); +} + +bool utf8_isdigit(uint32_t c) { + return ((c >= '0') & (c <= '9')) || + ((c >= 0xFF10) & (c <= 0xFF19)) || + ((c >= 0x1D7CE) & (c <= 0x1D7FF)); +} + +bool utf8_isxdigit(uint32_t c) { + static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, + 0xFF10, 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46}; + for (int i=1; i<sizeof t/sizeof *t; i += 2) + if (c <= t[i]) return c >= t[i - 1]; + return false; +} + +bool utf8_isalnum(uint32_t c) { + if (c < 128) return isalnum(c) != 0; + if ((c >= 0xFF10) & (c <= 0xFF19) || + ((c >= 0x1D7CE) & (c <= 0x1D7FF))) return true; + return utf8_islower(c) || utf8_isupper(c); +} + + +#ifdef TEST +size_t utf8_encode(char *out, uint32_t c) +{ + char* p = out; + if (c < 0x80U) { + *p++ = (char) c; + } else if (c < 0x0800U) { + *p++ = (char) ((c>>6 & 0x1F) | 0xC0); + *p++ = (char) ((c & 0x3F) | 0x80); + } else if (c < 0x010000U) { + if (c < 0xD800U || c >= 0xE000U) { + *p++ = (char) ((c>>12 & 0x0F) | 0xE0); + *p++ = (char) ((c>>6 & 0x3F) | 0x80); + *p++ = (char) ((c & 0x3F) | 0x80); + } + } else if (c < 0x110000U) { + *p++ = (char) ((c>>18 & 0x07) | 0xF0); + *p++ = (char) ((c>>12 & 0x3F) | 0x80); + *p++ = (char) ((c>>6 & 0x3F) | 0x80); + *p++ = (char) ((c & 0x3F) | 0x80); + } + return p - out; +} + +int main() +{ + for (int i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) + { + char x[3][5]={0}; + uint32_t a = casefold[i].c0; + uint32_t b = utf8_tolower(a); + uint32_t c = utf8_toupper(b); + + utf8_encode(x[0], a); + utf8_encode(x[1], b); + utf8_encode(x[2], c); + printf("%s %s %s - %u %u %u\n", x[0], x[1], x[2], a, b, c); + } +} +#endif +''') |
