summary | refs | log | tree | commit | diff | homepage
path: root/src/utf8tabs.py
diff options
context:
space:
mode:
author: Tyge Løvset <[email protected]> 2022-06-07 16:39:30 +0200
committer: Tyge Løvset <[email protected]> 2022-06-07 16:39:30 +0200
commit: 48b3336d8f10d8097d7626732ede5896ec353407 (patch)
tree: e83f87925eb84fbbaefeb3fe4cee39edacf232e0 /src/utf8tabs.py
parent: b65f70fdc80b19d869adabb5cb270807c96d152b (diff)
download: STC-modified-48b3336d8f10d8097d7626732ede5896ec353407.tar.gz
download: STC-modified-48b3336d8f10d8097d7626732ede5896ec353407.zip
Some improvements in utf8tabs.py
Diffstat (limited to 'src/utf8tabs.py')
-rw-r--r-- src/utf8tabs.py 25
1 files changed, 12 insertions, 13 deletions
diff --git a/src/utf8tabs.py b/src/utf8tabs.py
index 55be3f99..241f44af 100644
--- a/src/utf8tabs.py
+++ b/src/utf8tabs.py
@@ -60,7 +60,7 @@ def make_table(caselist):
offset = b - a
if abs(diff_a) > 2 or a - prev_a != diff_a or b - prev_b != diff_b or prev_offs != offset:
- if j > 0 and start_a not in [0xAB70, 0x13F8]:
+ if j > 0 and start_a not in [0xAB70, 0x13F8]: # BUG in CaseFolding.txt V14
table.append([start_a, prev_a, prev_b, start_name])
if j < n_1:
diff_a = caselist[j+1][0] - a
@@ -103,7 +103,7 @@ def print_index_table(name, indtab):
def make_inverse_ind(table):
inv = []
for i in range(len(table)):
- if table[i][2] not in [ord('i'), ord('s')]: # remove 'i'. 's' upcase mappings.
+ if table[i][2] not in [ord('i'), ord('s')]: # ignore 'i'. 's' upcase mappings.
inv.append(i)
# sort by mapped value table[:][2] (= inv) of the first element in each range entry
inv.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0]))
@@ -124,25 +124,24 @@ def main():
print('#include <stdint.h>\n')
print('struct CaseMapping { uint16_t c0, c1, m1; };\n')
- casemappings = compile_table('lowcase') # casefold
- lowcase = compile_table('lowcase', 'Lu') # unicode lowercase
+ casemappings = compile_table('lowcase') # Casefolding.txt
+ upcase = compile_table('lowcase', 'Lu') # UnicodeData.txt uppercase
- lowcase_ind = []
- for v in lowcase:
- try:
- lowcase_ind.append(casemappings.index(v))
- except:
- lowcase_ind.append(len(casemappings))
+ # merge in additional Lu => Ll mappings from UnicodeData.txt
+ for v in upcase:
+ if v not in casemappings:
casemappings.append(v)
- casefold_len = len(casemappings)
+ # sort casemappings by uppercase values:
casemappings.sort(key=lambda x: x[0])
-
print_table('casemappings', casemappings, style=1)
+
+ # index list sorted by mapped lowercase values:
upcase_ind = make_inverse_ind(casemappings)
- print('enum { casefold_len = %d };' % casefold_len)
+ print('enum { casefold_len = %d };' % len(casemappings))
print_index_table('upcase_ind', upcase_ind)
+
########### main:
if __name__ == "__main__":