diff options
| author | Tyge Lovset <[email protected]> | 2023-01-11 23:20:18 +0100 |
|---|---|---|
| committer | Tyge Lovset <[email protected]> | 2023-01-11 23:20:18 +0100 |
| commit | 87690debb5fb523acc3d341c34d20b85d3d63f26 (patch) | |
| tree | 16daa4f3aad726ee5a848ac4a598526d8aff0419 /src | |
| parent | e03b6f09a2ef716cf50b012f699a44bca528a9b6 (diff) | |
| download | STC-modified-87690debb5fb523acc3d341c34d20b85d3d63f26.tar.gz STC-modified-87690debb5fb523acc3d341c34d20b85d3d63f26.zip | |
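cregex/utf8: Added script (language) character classes for Arabic, Cyrillic, Devanagari (spelled `Devanaga` in the identifiers and in the `{Devanaga}` regex class), Greek, Han and Latin.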
Diffstat (limited to 'src')
| -rw-r--r-- | src/cregex.c | 41 | ||||
| -rw-r--r-- | src/utf8code.c | 170 |
2 files changed, 175 insertions, 36 deletions
diff --git a/src/cregex.c b/src/cregex.c index 5483b243..57ba2f1e 100644 --- a/src/cregex.c +++ b/src/cregex.c @@ -151,6 +151,12 @@ enum { UTF_zl = UTF_GRP+2*U8G_Zl, UTF_ZL, /* utf8 separator line */ UTF_zp = UTF_GRP+2*U8G_Zp, UTF_ZP, /* utf8 separator paragraph */ UTF_zs = UTF_GRP+2*U8G_Zs, UTF_ZS, /* utf8 separator space */ + UTF_arabic = UTF_GRP+2*U8G_Arabic, UTF_ARABIC, + UTF_cyrillic = UTF_GRP+2*U8G_Cyrillic, UTF_CYRILLIC, + UTF_devanaga = UTF_GRP+2*U8G_Devanaga, UTF_DEVANAGA, + UTF_greek = UTF_GRP+2*U8G_Greek, UTF_GREEK, + UTF_han = UTF_GRP+2*U8G_Han, UTF_HAN, + UTF_latin = UTF_GRP+2*U8G_Latin, UTF_LATIN, TOK_ANY = 0x8200000, /* Any character except newline, . */ TOK_ANYNL , /* Any character including newline, . */ TOK_NOP , /* No operation, internal use only */ @@ -637,7 +643,7 @@ _lexasciiclass(_Parser *par, _Rune *rp) /* assume *rp == '[' and *par->exprp == static void _lexutfclass(_Parser *par, _Rune *rp) { - static struct { const char* c; int n, r; } cls[] = { + static struct { const char* c; uint32_t n, r; } cls[] = { {"{Alpha}", 7, UTF_al}, {"{L&}", 4, UTF_lc}, {"{Digit}", 7, UTF_nd}, {"{Nd}", 4, UTF_nd}, {"{Lower}", 7, UTF_ll}, {"{Ll}", 4, UTF_ll}, @@ -651,6 +657,9 @@ _lexutfclass(_Parser *par, _Rune *rp) {"{Pf}", 4, UTF_pf}, {"{Pi}", 4, UTF_pi}, {"{Zl}", 4, UTF_zl}, {"{Zp}", 4, UTF_zp}, {"{Zs}", 4, UTF_zs}, {"{Sc}", 4, UTF_sc}, + {"{Arabic}", 8, UTF_arabic}, {"{Cyrillic}", 10, UTF_cyrillic}, + {"{Devanaga}", 10, UTF_devanaga}, {"{Greek}", 7, UTF_greek}, + {"{Han}", 5, UTF_han}, {"{Latin}", 7, UTF_latin}, }; int inv = (*rp == 'P'); for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) { @@ -924,18 +933,24 @@ _runematch(_Rune s, _Rune r) case UTF_LC: inv = 1; case UTF_lc: return inv ^ utf8_iscased(r); case UTF_AL: inv = 1; case UTF_al: return inv ^ utf8_isalpha(r); case UTF_WR: inv = 1; case UTF_wr: return inv ^ utf8_isword(r); - case UTF_CC: case UTF_cc: - case UTF_LT: case UTF_lt: - case UTF_ND: case UTF_nd: - case UTF_NL: case UTF_nl: - case 
UTF_PC: case UTF_pc: - case UTF_PD: case UTF_pd: - case UTF_PF: case UTF_pf: - case UTF_PI: case UTF_pi: - case UTF_SC: case UTF_sc: - case UTF_ZL: case UTF_zl: - case UTF_ZP: case UTF_zp: - case UTF_ZS: case UTF_zs: + case UTF_cc: case UTF_CC: + case UTF_lt: case UTF_LT: + case UTF_nd: case UTF_ND: + case UTF_nl: case UTF_NL: + case UTF_pc: case UTF_PC: + case UTF_pd: case UTF_PD: + case UTF_pf: case UTF_PF: + case UTF_pi: case UTF_PI: + case UTF_sc: case UTF_SC: + case UTF_zl: case UTF_ZL: + case UTF_zp: case UTF_ZP: + case UTF_zs: case UTF_ZS: + case UTF_arabic: case UTF_ARABIC: + case UTF_cyrillic: case UTF_CYRILLIC: + case UTF_devanaga: case UTF_DEVANAGA: + case UTF_greek: case UTF_GREEK: + case UTF_han: case UTF_HAN: + case UTF_latin: case UTF_LATIN: n = s - UTF_GRP; inv = n & 1; return inv ^ utf8_isgroup(n / 2, r); diff --git a/src/utf8code.c b/src/utf8code.c index c4866b78..5cc5d467 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -134,31 +134,19 @@ bool utf8_isgroup(int group, uint32_t c) { return false; } -bool utf8_iscased(uint32_t c) { - if (c < 128) return isalpha(c) != 0; - return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c); -} - -bool utf8_isalnum(uint32_t c) { - if (c < 128) return isalnum(c) != 0; - return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c) || - utf8_isgroup(U8G_Nd, c) || utf8_isgroup(U8G_Nl, c); -} - -bool utf8_isblank(uint32_t c) { - if (c < 128) return (c == ' ') | (c == '\t'); - return utf8_isgroup(U8G_Zs, c); -} - -bool utf8_isspace(uint32_t c) { - if (c < 128) return isspace(c) != 0; - return ((c == 8232) | (c == 8233)) || utf8_isgroup(U8G_Zs, c); +bool utf8_isalpha(uint32_t c) { + static int16_t groups[] = {U8G_Latin, U8G_Nl, U8G_Greek, U8G_Cyrillic, + U8G_Han, U8G_Arabic, U8G_Devanaga}; + for (unsigned j=0; j < c_ARRAYLEN(groups); ++j) + if (utf8_isgroup(groups[j], c)) + return true; + return false; } -bool utf8_isword(uint32_t c) { - if (c < 128) return (isalnum(c) != 0) | (c == '_'); - 
return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c) || - utf8_isgroup(U8G_Nd, c) || utf8_isgroup(U8G_Pc, c) || utf8_isgroup(U8G_Nl, c); +bool utf8_iscased(uint32_t c) { + if (c < 128) return isalpha(c) != 0; + return utf8_islower(c) || utf8_isupper(c) || + utf8_isgroup(U8G_Lt, c); } /* The tables below are extracted from the RE2 library */ @@ -327,6 +315,136 @@ static const URange16 Zs_range16[] = { // Space separator { 12288, 12288 }, }; +static const URange16 Arabic_range16[] = { + { 1536, 1540 }, + { 1542, 1547 }, + { 1549, 1562 }, + { 1564, 1566 }, + { 1568, 1599 }, + { 1601, 1610 }, + { 1622, 1647 }, + { 1649, 1756 }, + { 1758, 1791 }, + { 1872, 1919 }, + { 2160, 2190 }, + { 2192, 2193 }, + { 2200, 2273 }, + { 2275, 2303 }, + { 64336, 64450 }, + { 64467, 64829 }, + { 64832, 64911 }, + { 64914, 64967 }, + { 64975, 64975 }, + { 65008, 65023 }, + { 65136, 65140 }, + { 65142, 65276 }, +}; + +static const URange16 Cyrillic_range16[] = { + { 1024, 1156 }, + { 1159, 1327 }, + { 7296, 7304 }, + { 7467, 7467 }, + { 7544, 7544 }, + { 11744, 11775 }, + { 42560, 42655 }, + { 65070, 65071 }, +}; + +static const URange16 Devanagari_range16[] = { + { 2304, 2384 }, + { 2389, 2403 }, + { 2406, 2431 }, + { 43232, 43263 }, +}; + +static const URange16 Greek_range16[] = { + { 880, 883 }, + { 885, 887 }, + { 890, 893 }, + { 895, 895 }, + { 900, 900 }, + { 902, 902 }, + { 904, 906 }, + { 908, 908 }, + { 910, 929 }, + { 931, 993 }, + { 1008, 1023 }, + { 7462, 7466 }, + { 7517, 7521 }, + { 7526, 7530 }, + { 7615, 7615 }, + { 7936, 7957 }, + { 7960, 7965 }, + { 7968, 8005 }, + { 8008, 8013 }, + { 8016, 8023 }, + { 8025, 8025 }, + { 8027, 8027 }, + { 8029, 8029 }, + { 8031, 8061 }, + { 8064, 8116 }, + { 8118, 8132 }, + { 8134, 8147 }, + { 8150, 8155 }, + { 8157, 8175 }, + { 8178, 8180 }, + { 8182, 8190 }, + { 8486, 8486 }, + { 43877, 43877 }, +}; + +static const URange16 Han_range16[] = { + { 11904, 11929 }, + { 11931, 12019 }, + { 12032, 12245 }, + { 12293, 12293 }, + 
{ 12295, 12295 }, + { 12321, 12329 }, + { 12344, 12347 }, + { 13312, 19903 }, + { 19968, 40959 }, + { 63744, 64109 }, + { 64112, 64217 }, +}; + +static const URange16 Latin_range16[] = { + { 65, 90 }, + { 97, 122 }, + { 170, 170 }, + { 186, 186 }, + { 192, 214 }, + { 216, 246 }, + { 248, 696 }, + { 736, 740 }, + { 7424, 7461 }, + { 7468, 7516 }, + { 7522, 7525 }, + { 7531, 7543 }, + { 7545, 7614 }, + { 7680, 7935 }, + { 8305, 8305 }, + { 8319, 8319 }, + { 8336, 8348 }, + { 8490, 8491 }, + { 8498, 8498 }, + { 8526, 8526 }, + { 8544, 8584 }, + { 11360, 11391 }, + { 42786, 42887 }, + { 42891, 42954 }, + { 42960, 42961 }, + { 42963, 42963 }, + { 42965, 42969 }, + { 42994, 43007 }, + { 43824, 43866 }, + { 43868, 43876 }, + { 43878, 43881 }, + { 64256, 64262 }, + { 65313, 65338 }, + { 65345, 65370 }, +}; + #define UNI_ENTRY(Code) \ { Code##_range16, sizeof(Code##_range16)/sizeof(URange16) } @@ -343,6 +461,12 @@ static const UGroup unicode_groups[U8G_SIZE] = { [U8G_Zl] = UNI_ENTRY(Zl), [U8G_Zp] = UNI_ENTRY(Zp), [U8G_Zs] = UNI_ENTRY(Zs), + [U8G_Arabic] = UNI_ENTRY(Arabic), + [U8G_Cyrillic] = UNI_ENTRY(Cyrillic), + [U8G_Devanaga] = UNI_ENTRY(Devanagari), + [U8G_Greek] = UNI_ENTRY(Greek), + [U8G_Han] = UNI_ENTRY(Han), + [U8G_Latin] = UNI_ENTRY(Latin), }; #endif |
