summary | refs | log | tree | commit | diff | homepage
path: root/src/utf8code.c
diff options
context:
space:
mode:
author: Tyge Lovset <[email protected]> 2023-01-02 09:00:47 +0100
committer: Tyge Lovset <[email protected]> 2023-01-02 09:00:47 +0100
commit: 8c6ba8a3444e4b8640e7fe04f565cef57c850432 (patch)
tree: 10119ea3bbef5ac2818874849231bad19ad7aa5a /src/utf8code.c
parent: 91e79fc60713c1f09e940a7ee83ff2f8aa4f9d69 (diff)
download: STC-modified-8c6ba8a3444e4b8640e7fe04f565cef57c850432.tar.gz
STC-modified-8c6ba8a3444e4b8640e7fe04f565cef57c850432.zip
Added selected unicode character classes.
Diffstat (limited to 'src/utf8code.c')
-rw-r--r-- src/utf8code.c 220
1 files changed, 209 insertions, 11 deletions
diff --git a/src/utf8code.c b/src/utf8code.c
index 5dfb7d30..6fe8515e 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -112,17 +112,29 @@ int utf8_icmp_sv(const csview s1, const csview s2) {
return (int)(s1.size - s2.size);
}
-bool utf8_isspace(uint32_t c) {
- static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0,
- 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
- for (size_t i=0; i<sizeof t/sizeof *t; ++i)
- if (c == t[i]) return true;
- return (c >= 0x2000) & (c <= 0x200A);
-}
+typedef struct { // one inclusive range lo..hi of 16-bit values
+ uint16_t lo; // first value of the range
+ uint16_t hi; // last value of the range (utf8_isgroup treats it as inclusive)
+} URange16;
+
+typedef struct { // a named set of ranges
+ const char *name; // group name; UNI_ENTRY stores the stringized category code here
+ const URange16 *r16; // the group's range table
+ int nr16; // number of entries in r16
+} UGroup;
+
+static const UGroup unicode_groups[]; // forward declaration; the table is defined below, after the range tables. NOTE(review): a tentative definition with internal linkage and an incomplete array type is not strictly conforming C (C11 6.9.2) - confirm the target compilers accept it
+static const int num_unicode_groups; // forward declaration; defined below from the size of unicode_groups
+static const int Lt_group; // forward declaration; defined below as the index of the Lt entry in unicode_groups
-bool utf8_isdigit(uint32_t c) {
- return ((c >= '0') & (c <= '9')) ||
- ((c >= 0xFF10) & (c <= 0xFF19));
+bool utf8_isgroup(int group, uint32_t c) { // true if c falls inside any range of unicode_groups[group]
+ for (int j=0; j<unicode_groups[group].nr16; ++j) { // group is used as an index with no bounds check; callers must pass a valid index
+ if (c < unicode_groups[group].r16[j].lo) // c is below this range's start: give up early, which is only correct when the ranges are in ascending order
+ return false;
+ if (c <= unicode_groups[group].r16[j].hi) // lo <= c <= hi: inclusive hit
+ return true;
+ }
+ return false; // c is above every range that was scanned
}
bool utf8_isxdigit(uint32_t c) {
@@ -141,6 +153,192 @@ bool utf8_isalnum(uint32_t c) {
bool utf8_isalpha(uint32_t c) { // alphabetic test: isalpha() below 128, otherwise the utf8_* predicates
if (c < 128) return isalpha(c) != 0; // values below 128 are delegated to the C library
- return utf8_islower(c) || utf8_isupper(c);
+ return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(Lt_group, c); // otherwise: lower case, upper case, or a member of the Lt (title case) group
}
+
+static const URange16 Cc_range16[] = { // Cc (Control): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 0, 31 }, // 0x00..0x1F
+ { 127, 159 }, // 0x7F..0x9F
+};
+
+static const URange16 Lt_range16[] = { // Lt (Title case): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 453, 453 }, // 0x1C5
+ { 456, 456 }, // 0x1C8
+ { 459, 459 }, // 0x1CB
+ { 498, 498 }, // 0x1F2
+ { 8072, 8079 }, // 0x1F88..0x1F8F
+ { 8088, 8095 }, // 0x1F98..0x1F9F
+ { 8104, 8111 }, // 0x1FA8..0x1FAF
+ { 8124, 8124 }, // 0x1FBC
+ { 8140, 8140 }, // 0x1FCC
+ { 8188, 8188 }, // 0x1FFC
+};
+
+static const URange16 Nd_range16[] = { // Nd (Decimal number): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 48, 57 }, // 0x30..0x39 (ASCII '0'..'9')
+ { 1632, 1641 },
+ { 1776, 1785 },
+ { 1984, 1993 },
+ { 2406, 2415 },
+ { 2534, 2543 },
+ { 2662, 2671 },
+ { 2790, 2799 },
+ { 2918, 2927 },
+ { 3046, 3055 },
+ { 3174, 3183 },
+ { 3302, 3311 },
+ { 3430, 3439 },
+ { 3558, 3567 },
+ { 3664, 3673 },
+ { 3792, 3801 },
+ { 3872, 3881 },
+ { 4160, 4169 },
+ { 4240, 4249 },
+ { 6112, 6121 },
+ { 6160, 6169 },
+ { 6470, 6479 },
+ { 6608, 6617 },
+ { 6784, 6793 },
+ { 6800, 6809 },
+ { 6992, 7001 },
+ { 7088, 7097 },
+ { 7232, 7241 },
+ { 7248, 7257 },
+ { 42528, 42537 },
+ { 43216, 43225 },
+ { 43264, 43273 },
+ { 43472, 43481 },
+ { 43504, 43513 },
+ { 43600, 43609 },
+ { 44016, 44025 },
+ { 65296, 65305 }, // 0xFF10..0xFF19
+};
+
+static const URange16 Nl_range16[] = { // Nl (Number letter): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 5870, 5872 },
+ { 8544, 8578 },
+ { 8581, 8584 },
+ { 12295, 12295 },
+ { 12321, 12329 },
+ { 12344, 12346 },
+ { 42726, 42735 },
+};
+
+static const URange16 Pc_range16[] = { // Pc (Connector punctuation): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 95, 95 }, // 0x5F (ASCII '_')
+ { 8255, 8256 },
+ { 8276, 8276 },
+ { 65075, 65076 },
+ { 65101, 65103 },
+ { 65343, 65343 },
+};
+
+static const URange16 Pd_range16[] = { // Pd (Dash punctuation): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 45, 45 }, // 0x2D (ASCII '-')
+ { 1418, 1418 },
+ { 1470, 1470 },
+ { 5120, 5120 },
+ { 6150, 6150 },
+ { 8208, 8213 },
+ { 11799, 11799 },
+ { 11802, 11802 },
+ { 11834, 11835 },
+ { 11840, 11840 },
+ { 11869, 11869 },
+ { 12316, 12316 },
+ { 12336, 12336 },
+ { 12448, 12448 },
+ { 65073, 65074 },
+ { 65112, 65112 },
+ { 65123, 65123 },
+ { 65293, 65293 },
+};
+
+static const URange16 Pf_range16[] = { // Pf (Final punctuation): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 187, 187 }, // 0xBB
+ { 8217, 8217 },
+ { 8221, 8221 },
+ { 8250, 8250 },
+ { 11779, 11779 },
+ { 11781, 11781 },
+ { 11786, 11786 },
+ { 11789, 11789 },
+ { 11805, 11805 },
+ { 11809, 11809 },
+};
+
+static const URange16 Pi_range16[] = { // Pi (Initial punctuation): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 171, 171 }, // 0xAB
+ { 8216, 8216 },
+ { 8219, 8220 },
+ { 8223, 8223 },
+ { 8249, 8249 },
+ { 11778, 11778 },
+ { 11780, 11780 },
+ { 11785, 11785 },
+ { 11788, 11788 },
+ { 11804, 11804 },
+ { 11808, 11808 },
+};
+
+static const URange16 Sc_range16[] = { // Sc (Currency symbol): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 36, 36 }, // 0x24 (ASCII '$')
+ { 162, 165 },
+ { 1423, 1423 },
+ { 1547, 1547 },
+ { 2046, 2047 },
+ { 2546, 2547 },
+ { 2555, 2555 },
+ { 2801, 2801 },
+ { 3065, 3065 },
+ { 3647, 3647 },
+ { 6107, 6107 },
+ { 8352, 8384 },
+ { 43064, 43064 },
+ { 65020, 65020 },
+ { 65129, 65129 },
+ { 65284, 65284 },
+ { 65504, 65505 },
+ { 65509, 65510 },
+};
+
+static const URange16 Zl_range16[] = { // Zl (Line separator): a single inclusive {lo, hi} pair
+ { 8232, 8232 }, // 0x2028
+};
+
+static const URange16 Zp_range16[] = { // Zp (Paragraph separator): a single inclusive {lo, hi} pair
+ { 8233, 8233 }, // 0x2029
+};
+
+static const URange16 Zs_range16[] = { // Zs (Space separator): inclusive {lo, hi} pairs, kept in ascending order for utf8_isgroup's early exit
+ { 32, 32 }, // 0x20
+ { 160, 160 }, // 0xA0
+ { 5760, 5760 }, // 0x1680
+ { 8192, 8202 }, // 0x2000..0x200A
+ { 8239, 8239 }, // 0x202F
+ { 8287, 8287 }, // 0x205F
+ { 12288, 12288 }, // 0x3000
+};
+
+#define UNI_ENTRY(Code) /* UGroup initializer: stringized name, range table, number of ranges */ \
+ { #Code, Code##_range16, (int)(sizeof(Code##_range16)/sizeof(Code##_range16[0])) } /* element count: dividing the byte size by 2 would count every URange16 (two uint16_t) twice and let utf8_isgroup read past the table */
+
+static const UGroup unicode_groups[] = { // group table; utf8_isgroup indexes it with its `group` argument
+ UNI_ENTRY(Cc), // index 0
+ UNI_ENTRY(Lt), // index 1: must stay in step with Lt_group
+ UNI_ENTRY(Nd),
+ UNI_ENTRY(Nl),
+ UNI_ENTRY(Pc),
+ UNI_ENTRY(Pd),
+ UNI_ENTRY(Pf),
+ UNI_ENTRY(Pi),
+ UNI_ENTRY(Sc),
+ UNI_ENTRY(Zl),
+ UNI_ENTRY(Zp),
+ UNI_ENTRY(Zs),
+};
+
+static const int Lt_group = 1; // position of UNI_ENTRY(Lt) in unicode_groups; update if that table is reordered
+static const int num_unicode_groups = sizeof unicode_groups / sizeof unicode_groups[0]; // number of entries in unicode_groups
+
#endif