diff options
| author | Tyge Lovset <[email protected]> | 2023-01-11 23:20:18 +0100 |
|---|---|---|
| committer | Tyge Lovset <[email protected]> | 2023-01-11 23:20:18 +0100 |
| commit | 87690debb5fb523acc3d341c34d20b85d3d63f26 (patch) | |
| tree | 16daa4f3aad726ee5a848ac4a598526d8aff0419 /src/cregex.c | |
| parent | e03b6f09a2ef716cf50b012f699a44bca528a9b6 (diff) | |
| download | STC-modified-87690debb5fb523acc3d341c34d20b85d3d63f26.tar.gz STC-modified-87690debb5fb523acc3d341c34d20b85d3d63f26.zip | |
cregex/utf8: Added some language char classes.
Diffstat (limited to 'src/cregex.c')
| -rw-r--r-- | src/cregex.c | 41 |
1 file changed, 28 insertions, 13 deletions
diff --git a/src/cregex.c b/src/cregex.c index 5483b243..57ba2f1e 100644 --- a/src/cregex.c +++ b/src/cregex.c @@ -151,6 +151,12 @@ enum { UTF_zl = UTF_GRP+2*U8G_Zl, UTF_ZL, /* utf8 separator line */ UTF_zp = UTF_GRP+2*U8G_Zp, UTF_ZP, /* utf8 separator paragraph */ UTF_zs = UTF_GRP+2*U8G_Zs, UTF_ZS, /* utf8 separator space */ + UTF_arabic = UTF_GRP+2*U8G_Arabic, UTF_ARABIC, + UTF_cyrillic = UTF_GRP+2*U8G_Cyrillic, UTF_CYRILLIC, + UTF_devanaga = UTF_GRP+2*U8G_Devanaga, UTF_DEVANAGA, + UTF_greek = UTF_GRP+2*U8G_Greek, UTF_GREEK, + UTF_han = UTF_GRP+2*U8G_Han, UTF_HAN, + UTF_latin = UTF_GRP+2*U8G_Latin, UTF_LATIN, TOK_ANY = 0x8200000, /* Any character except newline, . */ TOK_ANYNL , /* Any character including newline, . */ TOK_NOP , /* No operation, internal use only */ @@ -637,7 +643,7 @@ _lexasciiclass(_Parser *par, _Rune *rp) /* assume *rp == '[' and *par->exprp == static void _lexutfclass(_Parser *par, _Rune *rp) { - static struct { const char* c; int n, r; } cls[] = { + static struct { const char* c; uint32_t n, r; } cls[] = { {"{Alpha}", 7, UTF_al}, {"{L&}", 4, UTF_lc}, {"{Digit}", 7, UTF_nd}, {"{Nd}", 4, UTF_nd}, {"{Lower}", 7, UTF_ll}, {"{Ll}", 4, UTF_ll}, @@ -651,6 +657,9 @@ _lexutfclass(_Parser *par, _Rune *rp) {"{Pf}", 4, UTF_pf}, {"{Pi}", 4, UTF_pi}, {"{Zl}", 4, UTF_zl}, {"{Zp}", 4, UTF_zp}, {"{Zs}", 4, UTF_zs}, {"{Sc}", 4, UTF_sc}, + {"{Arabic}", 8, UTF_arabic}, {"{Cyrillic}", 10, UTF_cyrillic}, + {"{Devanaga}", 10, UTF_devanaga}, {"{Greek}", 7, UTF_greek}, + {"{Han}", 5, UTF_han}, {"{Latin}", 7, UTF_latin}, }; int inv = (*rp == 'P'); for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) { @@ -924,18 +933,24 @@ _runematch(_Rune s, _Rune r) case UTF_LC: inv = 1; case UTF_lc: return inv ^ utf8_iscased(r); case UTF_AL: inv = 1; case UTF_al: return inv ^ utf8_isalpha(r); case UTF_WR: inv = 1; case UTF_wr: return inv ^ utf8_isword(r); - case UTF_CC: case UTF_cc: - case UTF_LT: case UTF_lt: - case UTF_ND: case UTF_nd: - case UTF_NL: case UTF_nl: - case 
UTF_PC: case UTF_pc: - case UTF_PD: case UTF_pd: - case UTF_PF: case UTF_pf: - case UTF_PI: case UTF_pi: - case UTF_SC: case UTF_sc: - case UTF_ZL: case UTF_zl: - case UTF_ZP: case UTF_zp: - case UTF_ZS: case UTF_zs: + case UTF_cc: case UTF_CC: + case UTF_lt: case UTF_LT: + case UTF_nd: case UTF_ND: + case UTF_nl: case UTF_NL: + case UTF_pc: case UTF_PC: + case UTF_pd: case UTF_PD: + case UTF_pf: case UTF_PF: + case UTF_pi: case UTF_PI: + case UTF_sc: case UTF_SC: + case UTF_zl: case UTF_ZL: + case UTF_zp: case UTF_ZP: + case UTF_zs: case UTF_ZS: + case UTF_arabic: case UTF_ARABIC: + case UTF_cyrillic: case UTF_CYRILLIC: + case UTF_devanaga: case UTF_DEVANAGA: + case UTF_greek: case UTF_GREEK: + case UTF_han: case UTF_HAN: + case UTF_latin: case UTF_LATIN: n = s - UTF_GRP; inv = n & 1; return inv ^ utf8_isgroup(n / 2, r); |
