summaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorTyge Løvset <[email protected]>2022-06-03 17:29:24 +0200
committerTyge Løvset <[email protected]>2022-06-03 17:29:24 +0200
commitbfcd2e3a6912ac392694c238fce8fc613d2f1525 (patch)
tree3917609c6cec567db1345da7da79339c753e6a22 /src
parentdd5ff55b2ec373370c894ad7e3aec884ce640a43 (diff)
downloadSTC-modified-bfcd2e3a6912ac392694c238fce8fc613d2f1525.tar.gz
STC-modified-bfcd2e3a6912ac392694c238fce8fc613d2f1525.zip
refactored utf8tabs.py, still has a bug though
Diffstat (limited to 'src')
-rw-r--r--src/utf8tabs.py135
1 files changed, 71 insertions, 64 deletions
diff --git a/src/utf8tabs.py b/src/utf8tabs.py
index 1b6ad8d7..0bb8cf64 100644
--- a/src/utf8tabs.py
+++ b/src/utf8tabs.py
@@ -4,7 +4,7 @@ import numpy as np
_UNICODE_DIR = "https://www.unicode.org/Public/14.0.0/ucd"
-def read_unidata(category='Lu', casetype='lowcase', big=False):
+def read_unidata(casetype='lowcase', category='Lu', big=False):
df = pd.read_csv(_UNICODE_DIR+'/UnicodeData.txt', sep=';', converters={0: lambda x: int(x, base=16)},
names=['code', 'name', 'category', 'canclass', 'bidircat', 'chrdecomp',
'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment',
@@ -40,28 +40,24 @@ def read_casefold(big=False):
return df
-def make_caselist(df):
- letters=[]
+def make_caselist(df, casetype):
+ caselist=[]
for idx, row in df.iterrows():
- #print(idx+1, ':', row['code'], row['lowcase'] - row['code'], ',', chr(row['code']), chr(row['lowcase']), ',', row['name'])
- letters.append([idx+1, row['code'], row['lowcase'], row['name']])
- return letters
+ caselist.append([idx+1, row['code'], row[casetype], row['name']])
+ return caselist
-def make_casefold(letters):
+def make_table(caselist):
prevoffset = 0
- diffoffset = 0
prev = [-1, 0, 0]
diff = [-1, 0, 0]
-
- out = []
n = 1
- for x in letters:
+
+ ranges = []
+ for x in caselist:
offset = x[2] - x[1]
- diffoffset = prevoffset - offset
if (diff[1] and x[1] - prev[1] != diff[1]) or (diff[2] and x[2] - prev[2] != diff[2]) or prevoffset != offset:
- out.append([x[1], x[2], n, x[3]]) # , ';', chr(x[1]), chr(x[2]), ';', x[1], offset, "CHANGE")
- #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset, "CHANGE", n+1)
+ ranges.append([x[1], x[2], n, x[3]])
diff[1] = 0
diff[2] = 0
n = 1
@@ -70,78 +66,89 @@ def make_casefold(letters):
if diff[1] == 0:
diff[1] = x[1] - prev[1]
diff[2] = x[2] - prev[2]
- #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset)
diff[1] = x[1] - prev[1]
diff[2] = x[2] - prev[2]
prev[2] = x[2]
prev[1] = x[1]
prevoffset = offset
- out.append(out[-1])
- out[-1][2] = 26
- cfold = []
- for i in range(0, len(out)-1):
- d = out[i][1] - out[i][0]
- cfold.append([out[i][0], out[i+1][2], d, out[i][3]])
- return cfold
+ ranges.append(ranges[-1])
+ ranges[-1][2] = 26
+ #for r in ranges:
+ # print("%04X, %04X, %d" % (r[0], r[1], r[2]))
+ #exit()
+
+ # next two for loops could be combined to eliminate rangelist
+ rangelist = []
+ for i in range(0, len(ranges)-1):
+ d = ranges[i][1] - ranges[i][0]
+ rangelist.append([ranges[i][0], ranges[i+1][2], d, ranges[i][3]])
-def print_casefold(cfold):
- print('#include <stdint.h>\n')
- print('struct CaseFold { uint16_t c0, c1, m1; };\n')
- print('static struct CaseFold casefold[] = {\n ', end='')
- n = 1
- s = 5
- count = 0
table = []
- for x in cfold:
- d = 2 if abs(x[2]) == 1 else 1
+ for x in rangelist:
+ if x[2] == -1:
+ print("ohoh:", x[0])
+ delta = 2 if x[2] == 1 else 1
a = x[0]
- b = x[0] + (x[1] - 1)*d
+ b = x[0] + (x[1] - 1)*delta
c = b + x[2]
if b >= 1<<16 or c >= 1<<16: # only to make sure...
break
- #print(' {%d, %d, %d}, // %s %s, %s\n ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='')
- if True: # compact
- if n == s:
- n = 0
- if a > 1000:
- s = 4
- print('\n ', end='')
- print(' {%d, %d, %d},' % (a, b, c), end='')
- table.append((a, b, c))
- else:
- print(' {%d, %d, %d}, // ' % (a, b, c), end='')
- for y in range(x[0], x[0] + x[1], d):
- print('%s %s, ' % (chr(y), chr(y + x[2])), end='')
- print('')
- count += 1
- n += 1
- print('\n}; // %d\n' % (count))
+ table.append((a, b, c, x[3]))
return table
-def make_casetable():
- df = read_casefold()
- #df = read_unidata()
- letters = make_caselist(df)
- cfold = make_casefold(letters)
- return cfold
+def print_table(name, table, style=1):
+ print('static struct CaseFold %s[] = {' % (name))
+ for a,b,c,t in table:
+ if style == 1: # first char with name
+ print(' {%3d, %3d, %3d}, \t// %s %s, %s' % (a, b, c, chr(a), chr(a + c - b), t))
+ elif style == 2: # all chars
+ #print(' {%3d, %3d, %3d}, \t// ' % (a, b, c), end='')
+ print(' {0x%04X, 0x%04X, 0x%04X},\t// ' % (a, b, c), end='')
+ n = 0
+ for k in range(a, b+1, 2 if c - b == 1 else 1):
+ n += 1
+ if n % 17 == 0:
+ print('\n \t// ', end='')
+ print('%s %s, ' % (chr(k), chr(k + c - b)), end='')
+ print('')
+ print('}; // %d\n' % (len(table)))
-def print_casefold_inverse(table):
- cfold_low = [i for i in range(len(table))]
- cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0]))
+def print_table_inv(name, table):
+ lowcase = [i for i in range(len(table))]
+ # sort by mapped value table[:][2] (= lowcase) of the first element in each range entry
+ lowcase.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0]))
- print('static uint8_t cfold_low[] = {\n ', end='')
- for i in range(len(cfold_low)):
- print(" %d," % (cfold_low[i]), end='\n ' if (i+1) % 20 == 0 else '')
+ print('static uint8_t %s[%d] = {\n ' % (name, len(table)), end='')
+ for i in range(len(lowcase)):
+ print(" %d," % (lowcase[i]), end='\n ' if (i+1) % 20 == 0 else '')
print('\n};')
+def compile_table(name, casetype='lowcase', category=None):
+ if category:
+ df = read_unidata(casetype, category)
+ else:
+ df = read_casefold()
+ caselist = make_caselist(df, casetype)
+ table = make_table(caselist)
+ print_table(name, table, 2)
+ return table
+
+
+def main():
+ print('#include <stdint.h>\n')
+ print('struct CaseFold { uint16_t c0, c1, m1; };\n')
+
+ table = compile_table('casefold') # casefold
+ #table = compile_table('tolower_tab', 'lowcase', 'Lu') # unicode
+ print_table_inv('cfold_low', table)
+
+
########### main:
if __name__ == "__main__":
- cfold = make_casetable()
- table = print_casefold(cfold)
- print_casefold_inverse(table)
+ main()