diff options
| author | Tyge Løvset <[email protected]> | 2023-01-05 15:12:52 +0100 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2023-01-05 15:12:52 +0100 |
| commit | 45bfcdc35788988f5c8b2b53a62f7fb8c5e1cf50 (patch) | |
| tree | a834aecc95006cfca2d6aa2c67c5db94d098fb76 /src | |
| parent | 1a98d0b660775f9a434197430905024519a0efbf (diff) | |
| download | STC-modified-45bfcdc35788988f5c8b2b53a62f7fb8c5e1cf50.tar.gz STC-modified-45bfcdc35788988f5c8b2b53a62f7fb8c5e1cf50.zip | |
Added clist_X_get_node(valptr) to complete the node API.
Diffstat (limited to 'src')
| -rw-r--r-- | src/utf8tabs.py | 26 |
1 file changed, 14 insertions, 12 deletions
diff --git a/src/utf8tabs.py b/src/utf8tabs.py index 4a5781f1..7ed5e7ae 100644 --- a/src/utf8tabs.py +++ b/src/utf8tabs.py @@ -2,16 +2,16 @@ import pandas as pd import numpy as np -_UNICODE_DIR = "https://www.unicode.org/Public/14.0.0/ucd" +_UNICODE_DIR = "https://www.unicode.org/Public/15.0.0/ucd" -def read_unidata(casetype='lowcase', category='Lu', big=False): +def read_unidata(casetype='lowcase', category='Lu', range32=False): df = pd.read_csv(_UNICODE_DIR+'/UnicodeData.txt', sep=';', converters={0: lambda x: int(x, base=16)}, names=['code', 'name', 'category', 'canclass', 'bidircat', 'chrdecomp', 'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment', 'upcase', 'lowcase', 'titlecase'], usecols=['code', 'name', 'category', 'bidircat', 'upcase', 'lowcase', 'titlecase']) - if big: + if range32: df = df[df['code'] >= (1<<16)] else: df = df[df['code'] < (1<<16)] @@ -27,11 +27,11 @@ def read_unidata(casetype='lowcase', category='Lu', big=False): return df -def read_casefold(big=False): +def read_casefold(range32=False): df = pd.read_csv(_UNICODE_DIR+'/CaseFolding.txt', engine='python', sep='; #? 
?', comment='#', converters={0: lambda x: int(x, base=16)}, names=['code', 'status', 'lowcase', 'name']) # comment => 'name' - if big: + if range32: df = df[df['code'] >= (1<<16)] else: df = df[df['code'] < (1<<16)] @@ -100,11 +100,11 @@ def print_index_table(name, indtab): print('\n};') -def compile_table(casetype='lowcase', category=None): +def compile_table(casetype='lowcase', category=None, range32=False): if category: - df = read_unidata(casetype, category) + df = read_unidata(casetype, category, range32) else: - df = read_casefold() + df = read_casefold(range32) caselist = make_caselist(df, casetype) table = make_table(caselist) return table @@ -113,10 +113,11 @@ def compile_table(casetype='lowcase', category=None): def main(): print('#include <stdint.h>\n') print('struct CaseMapping { uint16_t c1, c2, m2; };\n') + range32 = False - casemappings = compile_table('lowcase') # CaseFolding.txt - upcase = compile_table('lowcase', 'Lu') # UnicodeData.txt uppercase - lowcase = compile_table('upcase', 'Ll') # UnicodeData.txt lowercase + casemappings = compile_table('lowcase', None, range32) # CaseFolding.txt + upcase = compile_table('lowcase', 'Lu', range32) # UnicodeData.txt uppercase + lowcase = compile_table('upcase', 'Ll', range32) # UnicodeData.txt lowercase casefolding_len = len(casemappings) @@ -150,7 +151,8 @@ def main(): print_index_table('upcase_ind', upcase_ind) # lowcase => up. add "missing" SHARP S caused by https://www.unicode.org/policies/stability_policy.html#Case_Pair - lowcase_ind.append(next(i for i,x in enumerate(casemappings) if x[0]==ord('ẞ'))) + if not range32: + lowcase_ind.append(next(i for i,x in enumerate(casemappings) if x[0]==ord('ẞ'))) lowcase_ind.sort(key=lambda i: casemappings[i][2] - (casemappings[i][1] - casemappings[i][0])) print_index_table('lowcase_ind', lowcase_ind) |
