Finish regex updates.

author: Tyge Løvset <[email protected]> 2023-01-02 13:40:23 +0100
committer: Tyge Løvset <[email protected]> 2023-01-02 13:40:23 +0100
commit: 364b8833cb5d91bbe2c7640869912cde4de12846 (patch)
tree: 9d366de0b847e9910fdaa764eaf01bd76512a362
parent: 5c454e721656618c36674e0df676091106592c2d (diff)
download: STC-modified-364b8833cb5d91bbe2c7640869912cde4de12846.tar.gz
STC-modified-364b8833cb5d91bbe2c7640869912cde4de12846.zip
4 files changed, 58 insertions, 35 deletions
diff --git a/docs/cregex_api.md b/docs/cregex_api.md
index e528c83b..91868235 100644
--- a/docs/cregex_api.md
+++ b/docs/cregex_api.md
@@ -182,15 +182,19 @@ For reference, **cregex** uses the following files:
 | \n \t \r | Match UTF8 newline, tab, carriage return | |
 | \d \s \w | Match UTF8 digit, whitespace, alphanumeric character | |
 | \D \S \W | Do not match the groups described above | |
-| \p{Alnum} | Match UTF8 alpha numeric | * |
-| \p{XDigit} | Match UTF8 hex number | * |
-| \p{Nd} or \p{Digit} | Match UTF8 decimal number | * |
-| \p{Nl} | Match UTF8 numeric letter | * |
+| \p{Alpha} | Match UTF8 alpha (L& Nl) | * |
+| \p{Alnum} | Match UTF8 alphanumeric (Lu Ll Lt Nd Nl) | * |
+| \p{Blank} | Match UTF8 blank (Zs \t) | * |
+| \p{Space} | Match UTF8 whitespace: (Zs \t\r\n\v\f) | * |
+| \p{Word} | Match UTF8 word character: (Alnum Pc) | * |
+| \p{XDigit} | Match hex number | * |
+| \p{Cc} or \p{Cntrl} | Match UTF8 control char | * |
 | \p{Ll} or \p{Lower} | Match UTF8 lowercase letter | * |
 | \p{Lu} or \p{Upper} | Match UTF8 uppercase letter | * |
 | \p{Lt} | Match UTF8 titlecase letter | * |
-| \p{L&} or \p{Alpha} | Match UTF8 cased letter | * |
-| \p{Cc} | Match UTF8 control char | * |
+| \p{L&} | Match UTF8 cased letter (Ll Lu Lt) | * |
+| \p{Nd} or \p{Digit} | Match UTF8 decimal number | * |
+| \p{Nl} | Match UTF8 numeric letter | * |
 | \p{Pc} | Match UTF8 connector punctuation | * |
 | \p{Pd} | Match UTF8 dash punctuation | * |
 | \p{Pi} | Match UTF8 initial punctuation | * |
@@ -198,7 +202,7 @@ For reference, **cregex** uses the following files:
 | \p{Sc} | Match UTF8 currency symbol | * |
 | \p{Zl} | Match UTF8 line separator | * |
 | \p{Zp} | Match UTF8 paragraph separator | * |
-| \p{Sz} or \p{Space} | Match UTF8 whitespace separator | * |
+| \p{Zs} | Match UTF8 space separator | * |
 | \P{***Class***} | Do not match the classes described above | * |
 | [:alnum:] [:alpha:] [:ascii:] | Match ASCII character class. NB: only to be used inside [] brackets | * |
 | [:blank:] [:cntrl:] [:digit:] | " | * |
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index da6643ea..3246e654 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -10,12 +10,13 @@ enum {
     U8G_Pc, U8G_Pd, U8G_Pf, U8G_Pi,
     U8G_Sc, U8G_Zl, U8G_Zp, U8G_Zs,
 };
-extern bool     utf8_isspace(uint32_t c);
-extern bool     utf8_isdigit(uint32_t c);
-extern bool     utf8_isxdigit(uint32_t c);
-extern bool     utf8_isalpha(uint32_t c);
-extern bool     utf8_isalnum(uint32_t c);
 extern bool     utf8_isgroup(int group, uint32_t c); 
+extern bool     utf8_isblank(uint32_t c);
+extern bool     utf8_iscased(uint32_t c);
+extern bool     utf8_isalnum(uint32_t c);
+extern bool     utf8_isdigit(uint32_t c);
+extern bool     utf8_isspace(uint32_t c);
+extern bool     utf8_isword(uint32_t c);
 extern uint32_t utf8_casefold(uint32_t c);
 extern uint32_t utf8_tolower(uint32_t c);
 extern uint32_t utf8_toupper(uint32_t c);
@@ -24,6 +25,11 @@ extern int      utf8_icmp_sv(csview s1, csview s2);
 extern unsigned utf8_encode(char *out, uint32_t c);
 extern uint32_t utf8_peek_off(const char *s, int offset);
 
+STC_INLINE bool utf8_isalpha(uint32_t c) {
+    return utf8_iscased(c) || utf8_isgroup(U8G_Nl, c);
+}
+
+
 /* following functions uses src/utf8code.c */
 
 STC_INLINE bool utf8_isupper(uint32_t c) { return utf8_tolower(c) != c; }
diff --git a/src/cregex.c b/src/cregex.c
index 006c2d05..dcfefe24 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -130,12 +130,14 @@ enum {
     ASC_lo      , ASC_LO,       /* lower */
     ASC_up      , ASC_UP,       /* upper */
     ASC_xd      , ASC_XD,       /* hex */
+    UTF_al      , UTF_AL,       /* utf8 alpha */
     UTF_an      , UTF_AN,       /* utf8 alphanumeric */
-    UTF_wr      , UTF_WR,       /* utf8 word */
-    UTF_xd      , UTF_XD,       /* utf8 hex digit */
+    UTF_bl      , UTF_BL,       /* utf8 blank */
     UTF_lc      , UTF_LC,       /* utf8 letter cased */
     UTF_ll      , UTF_LL,       /* utf8 letter lowercase */
     UTF_lu      , UTF_LU,       /* utf8 letter uppercase */
+    UTF_sp      , UTF_SP,       /* utf8 space */
+    UTF_wr      , UTF_WR,       /* utf8 word */
     UTF_GRP = 0x8150000,
     UTF_cc = UTF_GRP+2*U8G_Cc, UTF_CC, /* utf8 control char */
     UTF_lt = UTF_GRP+2*U8G_Lt, UTF_LT, /* utf8 letter titlecase */
@@ -636,18 +638,19 @@ static void
 _lexutfclass(_Parser *par, _Rune *rp)
 {
     static struct { const char* c; int n, r; } cls[] = {
-        {"{Alpha}", 7, UTF_lc}, {"{L&}", 4, UTF_lc},
+        {"{Alpha}", 7, UTF_al}, {"{L&}", 4, UTF_lc},
         {"{Digit}", 7, UTF_nd}, {"{Nd}", 4, UTF_nd},
         {"{Lower}", 7, UTF_ll}, {"{Ll}", 4, UTF_ll},
         {"{Upper}", 7, UTF_lu}, {"{Lu}", 4, UTF_lu},
-        {"{Space}", 7, UTF_zs}, {"{Zs}", 4, UTF_zs},
-        {"{Alnum}", 7, UTF_an},
-        {"{XDigit}", 8, UTF_xd},
-        {"{Cc}", 4, UTF_cc}, {"{Sc}", 4, UTF_sc},
+        {"{Cntrl}", 7, UTF_cc}, {"{Cc}", 4, UTF_cc}, 
+        {"{Alnum}", 7, UTF_an}, {"{Blank}", 7, UTF_bl}, 
+        {"{Space}", 7, UTF_sp}, {"{Word}", 6, UTF_wr},
+        {"{XDigit}", 8, ASC_xd},
         {"{Lt}", 4, UTF_lt}, {"{Nl}", 4, UTF_nl},
         {"{Pc}", 4, UTF_pc}, {"{Pd}", 4, UTF_pd},
         {"{Pf}", 4, UTF_pf}, {"{Pi}", 4, UTF_pi},
         {"{Zl}", 4, UTF_zl}, {"{Zp}", 4, UTF_zp},
+        {"{Zs}", 4, UTF_zs}, {"{Sc}", 4, UTF_sc},
     };
     int inv = (*rp == 'P');
     for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) {
@@ -671,8 +674,8 @@ _lexutfclass(_Parser *par, _Rune *rp)
     case 'a': rune = '\a'; break; \
     case 'd': rune = UTF_nd; break; \
     case 'D': rune = UTF_ND; break; \
-    case 's': rune = UTF_zs; break; \
-    case 'S': rune = UTF_ZS; break; \
+    case 's': rune = UTF_sp; break; \
+    case 'S': rune = UTF_SP; break; \
     case 'w': rune = UTF_wr; break; \
     case 'W': rune = UTF_WR; break
 
@@ -914,11 +917,13 @@ _runematch(_Rune s, _Rune r)
     case ASC_UP: inv = 1; case ASC_up: return inv ^ (isupper(r) != 0);
     case ASC_XD: inv = 1; case ASC_xd: return inv ^ (isxdigit(r) != 0);
     case UTF_AN: inv = 1; case UTF_an: return inv ^ utf8_isalnum(r);
-    case UTF_WR: inv = 1; case UTF_wr: return inv ^ (utf8_isalnum(r) | (r == '_'));
-    case UTF_XD: inv = 1; case UTF_xd: return inv ^ utf8_isxdigit(r);
+    case UTF_BL: inv = 1; case UTF_bl: return inv ^ utf8_isblank(r);
+    case UTF_SP: inv = 1; case UTF_sp: return inv ^ utf8_isspace(r);
     case UTF_LL: inv = 1; case UTF_ll: return inv ^ utf8_islower(r);
     case UTF_LU: inv = 1; case UTF_lu: return inv ^ utf8_isupper(r);
-    case UTF_LC: inv = 1; case UTF_lc: return inv ^ utf8_isalpha(r); 
+    case UTF_LC: inv = 1; case UTF_lc: return inv ^ utf8_iscased(r); 
+    case UTF_AL: inv = 1; case UTF_al: return inv ^ utf8_isalpha(r);
+    case UTF_WR: inv = 1; case UTF_wr: return inv ^ utf8_isword(r);
     case UTF_CC: case UTF_cc:
     case UTF_LT: case UTF_lt:
     case UTF_ND: case UTF_nd:
diff --git a/src/utf8code.c b/src/utf8code.c
index 8f2ce107..71b086c2 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -136,23 +136,31 @@ bool utf8_isgroup(int group, uint32_t c) {
     return false;
 }
 
-bool utf8_isxdigit(uint32_t c) {
-    static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10,
-                           0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46};
-    for (size_t i=1; i<sizeof t/sizeof *t; i += 2)
-        if (c <= t[i]) return c >= t[i - 1];
-    return false;
+bool utf8_iscased(uint32_t c) {
+    if (c < 128) return isalpha(c) != 0;
+    return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c);
 }
 
 bool utf8_isalnum(uint32_t c) {
     if (c < 128) return isalnum(c) != 0;
-    if ((c >= 0xFF10) & (c <= 0xFF19)) return true;
-    return utf8_islower(c) || utf8_isupper(c);
+    return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c) ||
+           utf8_isgroup(U8G_Nd, c) || utf8_isgroup(U8G_Nl, c);
 }
 
-bool utf8_isalpha(uint32_t c) {
-    if (c < 128) return isalpha(c) != 0;
-    return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c);
+bool utf8_isblank(uint32_t c) {
+    if (c < 128) return isblank(c) != 0;
+    return utf8_isgroup(U8G_Zs, c);
+}
+
+bool utf8_isspace(uint32_t c) {
+    if (c < 128) return isspace(c) != 0;
+    return ((c == 8232) | (c == 8233)) || utf8_isgroup(U8G_Zs, c);
+}
+
+bool utf8_isword(uint32_t c) {
+    if (c < 128) return (isalnum(c) != 0) | (c == '_');
+    return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c) ||
+           utf8_isgroup(U8G_Nd, c) || utf8_isgroup(U8G_Pc, c) || utf8_isgroup(U8G_Nl, c);
 }
 
 static const URange16 Cc_range16[] = { // Control
author	Tyge Løvset <[email protected]>	2023-01-02 13:40:23 +0100
committer	Tyge Løvset <[email protected]>	2023-01-02 13:40:23 +0100
commit	364b8833cb5d91bbe2c7640869912cde4de12846 (patch)
tree	9d366de0b847e9910fdaa764eaf01bd76512a362
parent	5c454e721656618c36674e0df676091106592c2d (diff)
download	STC-modified-364b8833cb5d91bbe2c7640869912cde4de12846.tar.gz STC-modified-364b8833cb5d91bbe2c7640869912cde4de12846.zip