5 files changed, 208 insertions, 47 deletions
diff --git a/docs/cregex_api.md b/docs/cregex_api.md
index 20cb5d6d..b2e2a95a 100644
--- a/docs/cregex_api.md
+++ b/docs/cregex_api.md
@@ -203,6 +203,12 @@ For reference, **cregex** uses the following files:
 | \p{Space} | Match UTF8 whitespace: (Zs \t\r\n\v\f] | * |
 | \p{Word} | Match UTF8 word character: (Alnum Pc) | * |
 | \p{XDigit} | Match hex number | * |
+| \p{Arabic} | Match Arabic script characters | * |
+| \p{Cyrillic} | Match Cyrillic script characters | * |
+| \p{Devanaga} | Match Devanagari script characters | * |
+| \p{Greek} | Match Greek script characters | * |
+| \p{Han} | Match Han script characters | * |
+| \p{Latin} | Match Latin script characters | * |
 | \P{***Class***} | Do not match the classes described above | * |
 | [:alnum:] [:alpha:] [:ascii:] | Match ASCII character class. NB: only to be used inside [] brackets | * |
 | [:blank:] [:cntrl:] [:digit:] | " | * |
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index 4e38a5c8..f0b28a80 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -1,6 +1,7 @@
 #ifndef UTF8_H_INCLUDED
 #define UTF8_H_INCLUDED
 
+#include <ctype.h>
 #include "forward.h"
 #include "ccommon.h"
 
@@ -9,32 +10,46 @@ enum {
     U8G_Cc, U8G_Lt, U8G_Nd, U8G_Nl,
     U8G_Pc, U8G_Pd, U8G_Pf, U8G_Pi,
     U8G_Sc, U8G_Zl, U8G_Zp, U8G_Zs,
+    U8G_Arabic, U8G_Cyrillic,
+    U8G_Devanaga, U8G_Greek,
+    U8G_Han, U8G_Latin,
     U8G_SIZE
 };
+
 extern bool     utf8_isgroup(int group, uint32_t c); 
-extern bool     utf8_isblank(uint32_t c);
-extern bool     utf8_iscased(uint32_t c);
-extern bool     utf8_isalnum(uint32_t c);
-extern bool     utf8_isdigit(uint32_t c);
-extern bool     utf8_isspace(uint32_t c);
-extern bool     utf8_isword(uint32_t c);
+extern bool     utf8_isalpha(uint32_t c);
 extern uint32_t utf8_casefold(uint32_t c);
 extern uint32_t utf8_tolower(uint32_t c);
 extern uint32_t utf8_toupper(uint32_t c);
+extern bool     utf8_iscased(uint32_t c);
 extern bool     utf8_valid_n(const char* s, size_t nbytes);
 extern int      utf8_icmp_sv(csview s1, csview s2);
 extern unsigned utf8_encode(char *out, uint32_t c);
 extern uint32_t utf8_peek_off(const char *s, int offset);
 
-STC_INLINE bool utf8_isalpha(uint32_t c) {
-    return utf8_iscased(c) || utf8_isgroup(U8G_Nl, c);
+STC_INLINE bool utf8_isupper(uint32_t c) /* true when utf8_tolower() would change c */
+    { return utf8_tolower(c) != c; }
+
+STC_INLINE bool utf8_islower(uint32_t c) /* true when utf8_toupper() would change c */
+    { return utf8_toupper(c) != c; }
+
+STC_INLINE bool utf8_isalnum(uint32_t c) { /* alphabetic, or in the Nd group */
+    if (c >= 128) return utf8_isalpha(c) || utf8_isgroup(U8G_Nd, c);
+    return isalnum((int)c) != 0; /* ASCII range is delegated to <ctype.h> */
 }
 
+STC_INLINE bool utf8_isword(uint32_t c) /* alphanumeric, or in the Pc group */
+    { return utf8_isalnum(c) || utf8_isgroup(U8G_Pc, c); }
 
-/* following functions uses src/utf8code.c */
+STC_INLINE bool utf8_isblank(uint32_t c) { /* space, tab, or in the Zs group */
+    if (c >= 128) return utf8_isgroup(U8G_Zs, c);
+    return c == ' ' || c == '\t'; /* the two ASCII blanks */
+}
 
-STC_INLINE bool utf8_isupper(uint32_t c) { return utf8_tolower(c) != c; }
-STC_INLINE bool utf8_islower(uint32_t c) { return utf8_toupper(c) != c; }
+STC_INLINE bool utf8_isspace(uint32_t c) { /* ASCII isspace(), U+2028, U+2029, or Zs */
+    if (c < 128) return isspace((int)c) != 0;
+    return c == 0x2028 || c == 0x2029 || utf8_isgroup(U8G_Zs, c);
+}
 
 /* decode next utf8 codepoint. https://bjoern.hoehrmann.de/utf-8/decoder/dfa */
 typedef struct { uint32_t state, codep; } utf8_decode_t;
diff --git a/misc/examples/regex2.c b/misc/examples/regex2.c
index abae5695..66ab9f72 100644
--- a/misc/examples/regex2.c
+++ b/misc/examples/regex2.c
@@ -12,6 +12,7 @@ int main()
         },
         {"!((abc|123)+)!", "!123abcabc!"},
         {"(\\p{L&}+ )+(\\p{Nd}+)", "Großpackung süßigkeiten 199"},
+        {"\\p{Han}+", "This is Han: 王明：那是杂志吗？"},
     };
 
     c_AUTO (cregex, re)
diff --git a/src/cregex.c b/src/cregex.c
index 5483b243..57ba2f1e 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -151,6 +151,12 @@ enum {
     UTF_zl = UTF_GRP+2*U8G_Zl, UTF_ZL, /* utf8 separator line */
     UTF_zp = UTF_GRP+2*U8G_Zp, UTF_ZP, /* utf8 separator paragraph */
     UTF_zs = UTF_GRP+2*U8G_Zs, UTF_ZS, /* utf8 separator space */
+    UTF_arabic = UTF_GRP+2*U8G_Arabic, UTF_ARABIC,
+    UTF_cyrillic = UTF_GRP+2*U8G_Cyrillic, UTF_CYRILLIC,
+    UTF_devanaga = UTF_GRP+2*U8G_Devanaga, UTF_DEVANAGA,
+    UTF_greek = UTF_GRP+2*U8G_Greek, UTF_GREEK,
+    UTF_han = UTF_GRP+2*U8G_Han, UTF_HAN,
+    UTF_latin = UTF_GRP+2*U8G_Latin, UTF_LATIN,
     TOK_ANY     = 0x8200000,    /* Any character except newline, . */
     TOK_ANYNL   ,               /* Any character including newline, . */
     TOK_NOP     ,               /* No operation, internal use only */
@@ -637,7 +643,7 @@ _lexasciiclass(_Parser *par, _Rune *rp) /* assume *rp == '[' and *par->exprp ==
 static void
 _lexutfclass(_Parser *par, _Rune *rp)
 {
-    static struct { const char* c; int n, r; } cls[] = {
+    static struct { const char* c; uint32_t n, r; } cls[] = {
         {"{Alpha}", 7, UTF_al}, {"{L&}", 4, UTF_lc},
         {"{Digit}", 7, UTF_nd}, {"{Nd}", 4, UTF_nd},
         {"{Lower}", 7, UTF_ll}, {"{Ll}", 4, UTF_ll},
@@ -651,6 +657,9 @@ _lexutfclass(_Parser *par, _Rune *rp)
         {"{Pf}", 4, UTF_pf}, {"{Pi}", 4, UTF_pi},
         {"{Zl}", 4, UTF_zl}, {"{Zp}", 4, UTF_zp},
         {"{Zs}", 4, UTF_zs}, {"{Sc}", 4, UTF_sc},
+        {"{Arabic}", 8, UTF_arabic}, {"{Cyrillic}", 10, UTF_cyrillic},
+        {"{Devanaga}", 10, UTF_devanaga}, {"{Greek}", 7, UTF_greek},
+        {"{Han}", 5, UTF_han}, {"{Latin}", 7, UTF_latin},
     };
     int inv = (*rp == 'P');
     for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) {
@@ -924,18 +933,24 @@ _runematch(_Rune s, _Rune r)
     case UTF_LC: inv = 1; case UTF_lc: return inv ^ utf8_iscased(r); 
     case UTF_AL: inv = 1; case UTF_al: return inv ^ utf8_isalpha(r);
     case UTF_WR: inv = 1; case UTF_wr: return inv ^ utf8_isword(r);
-    case UTF_CC: case UTF_cc:
-    case UTF_LT: case UTF_lt:
-    case UTF_ND: case UTF_nd:
-    case UTF_NL: case UTF_nl:
-    case UTF_PC: case UTF_pc:
-    case UTF_PD: case UTF_pd:
-    case UTF_PF: case UTF_pf:
-    case UTF_PI: case UTF_pi:
-    case UTF_SC: case UTF_sc:
-    case UTF_ZL: case UTF_zl:
-    case UTF_ZP: case UTF_zp:
-    case UTF_ZS: case UTF_zs:
+    case UTF_cc: case UTF_CC:
+    case UTF_lt: case UTF_LT:
+    case UTF_nd: case UTF_ND:
+    case UTF_nl: case UTF_NL:
+    case UTF_pc: case UTF_PC:
+    case UTF_pd: case UTF_PD:
+    case UTF_pf: case UTF_PF:
+    case UTF_pi: case UTF_PI:
+    case UTF_sc: case UTF_SC:
+    case UTF_zl: case UTF_ZL:
+    case UTF_zp: case UTF_ZP:
+    case UTF_zs: case UTF_ZS:
+    case UTF_arabic: case UTF_ARABIC:
+    case UTF_cyrillic: case UTF_CYRILLIC:
+    case UTF_devanaga: case UTF_DEVANAGA:
+    case UTF_greek: case UTF_GREEK:
+    case UTF_han: case UTF_HAN:
+    case UTF_latin: case UTF_LATIN:
         n = s - UTF_GRP;
         inv = n & 1;
         return inv ^ utf8_isgroup(n / 2, r);
diff --git a/src/utf8code.c b/src/utf8code.c
index c4866b78..5cc5d467 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -134,31 +134,19 @@ bool utf8_isgroup(int group, uint32_t c) {
     return false;
 }
 
-bool utf8_iscased(uint32_t c) {
-    if (c < 128) return isalpha(c) != 0;
-    return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c);
-}
-
-bool utf8_isalnum(uint32_t c) {
-    if (c < 128) return isalnum(c) != 0;
-    return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c) ||
-           utf8_isgroup(U8G_Nd, c) || utf8_isgroup(U8G_Nl, c);
-}
-
-bool utf8_isblank(uint32_t c) {
-    if (c < 128) return (c == ' ') | (c == '\t');
-    return utf8_isgroup(U8G_Zs, c);
-}
-
-bool utf8_isspace(uint32_t c) {
-    if (c < 128) return isspace(c) != 0;
-    return ((c == 8232) | (c == 8233)) || utf8_isgroup(U8G_Zs, c);
+/* True if c is in any of the groups listed below (script tables plus Nl). */
+bool utf8_isalpha(uint32_t c) {
+    static const int16_t groups[] = {U8G_Latin, U8G_Nl, U8G_Greek, U8G_Cyrillic,
+                                     U8G_Han, U8G_Arabic, U8G_Devanaga};
+    for (unsigned j = 0; j < c_ARRAYLEN(groups); ++j)
+        if (utf8_isgroup(groups[j], c)) return true; /* first hit wins */
+    return false;
 }
 
-bool utf8_isword(uint32_t c) {
-    if (c < 128) return (isalnum(c) != 0) | (c == '_');
-    return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c) ||
-           utf8_isgroup(U8G_Nd, c) || utf8_isgroup(U8G_Pc, c) || utf8_isgroup(U8G_Nl, c);
+bool utf8_iscased(uint32_t c) { /* ASCII letter, has a case mapping, or in Lt */
+    if (c >= 128)
+        return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c);
+    return isalpha((int)c) != 0;
 }
 
 /* The tables below are extracted from the RE2 library */
@@ -327,6 +315,136 @@ static const URange16 Zs_range16[] = { // Space separator
     { 12288, 12288 },
 };
 
+static const URange16 Arabic_range16[] = { // Arabic script
+    { 1536, 1540 },
+    { 1542, 1547 },
+    { 1549, 1562 },
+    { 1564, 1566 },
+    { 1568, 1599 },
+    { 1601, 1610 },
+    { 1622, 1647 },
+    { 1649, 1756 },
+    { 1758, 1791 },
+    { 1872, 1919 },
+    { 2160, 2190 },
+    { 2192, 2193 },
+    { 2200, 2273 },
+    { 2275, 2303 },
+    { 64336, 64450 },
+    { 64467, 64829 },
+    { 64832, 64911 },
+    { 64914, 64967 },
+    { 64975, 64975 },
+    { 65008, 65023 },
+    { 65136, 65140 },
+    { 65142, 65276 },
+};
+
+static const URange16 Cyrillic_range16[] = { // Cyrillic script
+    { 1024, 1156 },
+    { 1159, 1327 },
+    { 7296, 7304 },
+    { 7467, 7467 },
+    { 7544, 7544 },
+    { 11744, 11775 },
+    { 42560, 42655 },
+    { 65070, 65071 },
+};
+
+static const URange16 Devanagari_range16[] = { // Devanagari script (U8G_Devanaga)
+    { 2304, 2384 },
+    { 2389, 2403 },
+    { 2406, 2431 },
+    { 43232, 43263 },
+};
+
+static const URange16 Greek_range16[] = { // Greek script
+    { 880, 883 },
+    { 885, 887 },
+    { 890, 893 },
+    { 895, 895 },
+    { 900, 900 },
+    { 902, 902 },
+    { 904, 906 },
+    { 908, 908 },
+    { 910, 929 },
+    { 931, 993 },
+    { 1008, 1023 },
+    { 7462, 7466 },
+    { 7517, 7521 },
+    { 7526, 7530 },
+    { 7615, 7615 },
+    { 7936, 7957 },
+    { 7960, 7965 },
+    { 7968, 8005 },
+    { 8008, 8013 },
+    { 8016, 8023 },
+    { 8025, 8025 },
+    { 8027, 8027 },
+    { 8029, 8029 },
+    { 8031, 8061 },
+    { 8064, 8116 },
+    { 8118, 8132 },
+    { 8134, 8147 },
+    { 8150, 8155 },
+    { 8157, 8175 },
+    { 8178, 8180 },
+    { 8182, 8190 },
+    { 8486, 8486 },
+    { 43877, 43877 },
+};
+
+static const URange16 Han_range16[] = { // Han script (16-bit code points only)
+    { 11904, 11929 },
+    { 11931, 12019 },
+    { 12032, 12245 },
+    { 12293, 12293 },
+    { 12295, 12295 },
+    { 12321, 12329 },
+    { 12344, 12347 },
+    { 13312, 19903 },
+    { 19968, 40959 },
+    { 63744, 64109 },
+    { 64112, 64217 },
+};
+
+static const URange16 Latin_range16[] = { // Latin script
+    { 65, 90 },
+    { 97, 122 },
+    { 170, 170 },
+    { 186, 186 },
+    { 192, 214 },
+    { 216, 246 },
+    { 248, 696 },
+    { 736, 740 },
+    { 7424, 7461 },
+    { 7468, 7516 },
+    { 7522, 7525 },
+    { 7531, 7543 },
+    { 7545, 7614 },
+    { 7680, 7935 },
+    { 8305, 8305 },
+    { 8319, 8319 },
+    { 8336, 8348 },
+    { 8490, 8491 },
+    { 8498, 8498 },
+    { 8526, 8526 },
+    { 8544, 8584 },
+    { 11360, 11391 },
+    { 42786, 42887 },
+    { 42891, 42954 },
+    { 42960, 42961 },
+    { 42963, 42963 },
+    { 42965, 42969 },
+    { 42994, 43007 },
+    { 43824, 43866 },
+    { 43868, 43876 },
+    { 43878, 43881 },
+    { 64256, 64262 },
+    { 65313, 65338 },
+    { 65345, 65370 },
+};
+
 #define UNI_ENTRY(Code) \
     { Code##_range16, sizeof(Code##_range16)/sizeof(URange16) }
 
@@ -343,6 +461,12 @@ static const UGroup unicode_groups[U8G_SIZE] = {
     [U8G_Zl] = UNI_ENTRY(Zl),
     [U8G_Zp] = UNI_ENTRY(Zp),
     [U8G_Zs] = UNI_ENTRY(Zs),
+    [U8G_Arabic] = UNI_ENTRY(Arabic),
+    [U8G_Cyrillic] = UNI_ENTRY(Cyrillic),
+    [U8G_Devanaga] = UNI_ENTRY(Devanagari),
+    [U8G_Greek] = UNI_ENTRY(Greek),
+    [U8G_Han] = UNI_ENTRY(Han),
+    [U8G_Latin] = UNI_ENTRY(Latin),
 };
 
 #endif