Large refactoring on strings / utf8 and some file structure.

author: Tyge Lovset <[email protected]> 2022-05-30 10:17:07 +0200
committer: Tyge Lovset <[email protected]> 2022-05-30 10:17:07 +0200
commit: b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3 (patch)
tree: 8c97999b1ede5e0cf45c94b2035e94b0734dff1c
parent: 831dc0843aeedcb45138a6ed576ea03f2dcd58f8 (diff)
download: STC-modified-b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3.tar.gz
STC-modified-b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3.zip
16 files changed, 376 insertions, 438 deletions
diff --git a/docs/csview_api.md b/docs/csview_api.md
index 4c92a1e5..55ac017e 100644
--- a/docs/csview_api.md
+++ b/docs/csview_api.md
@@ -68,7 +68,7 @@ size_t          utf8_size_n(const char *s, size_t n);               // number of
 const char*     utf8_at(const char *s, size_t index);               // from UTF8 index to char* position
 size_t          utf8_pos(const char* s, size_t index);              // from UTF8 index to byte index position
 unsigned        utf8_codep_size(const char* s);                     // 0-4 (0 if s[0] is illegal utf8)
-void            utf8_peek(const char *s, utf8_decode_t* d);         // next codepoint as uint32_t
+void            utf8_peek(utf8_decode_t* d, const char *s);         // decode next codepoint at s into *d
 uint32_t        utf8_decode(utf8_decode_t *d, uint8_t byte,         // d holds state, size and unicode point
                             const uint32_t byte);                   // decode next utf8 codepoint.
 unsigned        utf8_encode(char *out, uint32_t cp);                // encode unicode cp into out
diff --git a/examples/regex1.c b/examples/regex1.c
index 48dfe515..7481fbb1 100644
--- a/examples/regex1.c
+++ b/examples/regex1.c
@@ -20,7 +20,7 @@ int main(int argc, char* argv[])
             // Exit when the user inputs q
             if (cstr_equals(input, "q"))
                 break;
-            
+
             if (cregex_find(&float_expr, cstr_str(&input), 0, NULL, 0) > 0)
                 printf("Input is a float\n");
             else
@@ -30,4 +30,4 @@ int main(int argc, char* argv[])
 }
 
 #include "../src/cregex.c"
-#include "../src/casefold.c"
+#include "../src/utf8utils.c"
diff --git a/examples/regex2.c b/examples/regex2.c
index 60fd707a..7089956f 100644
--- a/examples/regex2.c
+++ b/examples/regex2.c
@@ -33,4 +33,4 @@ int main()
 }
 
 #include "../src/cregex.c"
-#include "../src/casefold.c"
+#include "../src/utf8utils.c"
diff --git a/examples/regex_match.c b/examples/regex_match.c
index 5d2ff215..0aa740d4 100644
--- a/examples/regex_match.c
+++ b/examples/regex_match.c
@@ -35,4 +35,4 @@ int main()
 }
 
 #include "../src/cregex.c"
-#include "../src/casefold.c"
+#include "../src/utf8utils.c"
diff --git a/include/stc/alt/cstr.h b/include/stc/alt/cstr.h
index a43c7cc4..0012d364 100644
--- a/include/stc/alt/cstr.h
+++ b/include/stc/alt/cstr.h
@@ -35,7 +35,7 @@
 #define cstr_npos (SIZE_MAX >> 1)
 typedef struct { size_t size, cap; char chr[1]; } cstr_priv;
 #define _cstr_p(self) c_unchecked_container_of((self)->str, cstr_priv, chr)
-#ifdef _i_static 
+#ifdef i_static 
     static cstr_priv _cstr_nullrep = {0, 0, {0}};
     static const cstr cstr_null = {_cstr_nullrep.chr};
 #else
@@ -189,7 +189,7 @@ STC_INLINE uint64_t cstr_hash(const cstr *self) {
 /* -------------------------- IMPLEMENTATION ------------------------- */
 #if defined(i_implement)
 
-#ifndef _i_static
+#ifndef i_static
 static cstr_priv _cstr_nullrep = {0, 0, {0}};
 const cstr cstr_null = {_cstr_nullrep.chr};
 #endif
diff --git a/include/stc/cbits.h b/include/stc/cbits.h
index 4c104bac..e35e4225 100644
--- a/include/stc/cbits.h
+++ b/include/stc/cbits.h
@@ -321,3 +321,6 @@ STC_DEF bool _cbits_disjoint(const uint64_t* set, const uint64_t* other, const s
 #undef i_len
 #undef i_type
 #undef i_opt
+#undef i_header
+#undef i_implement
+#undef i_static
diff --git a/include/stc/ccommon.h b/include/stc/ccommon.h
index 46d53bc1..f27190af 100644
--- a/include/stc/ccommon.h
+++ b/include/stc/ccommon.h
@@ -106,9 +106,6 @@
 #define c_no_atomic             (1<<1)
 #define c_no_clone              (1<<2)
 #define c_no_cmp                (1<<3)
-#define c_static                (1<<4)
-#define c_header                (1<<5)
-#define c_implement             (1<<6)
 
 /* Generic algorithms */
 
@@ -254,19 +251,16 @@ STC_INLINE char* c_strnstrn(const char *s, const char *needle,
 
 #undef STC_API
 #undef STC_DEF
-#undef _i_static
-#undef i_implement
 
-#if !c_option(c_static) && (c_option(c_header|c_implement) || defined(STC_HEADER) || \
-                            defined(STC_IMPLEMENT) || defined(STC_IMPLEMENTATION))
+#if !defined i_static && (defined(i_header) || defined(i_implement) || \
+                          defined(STC_HEADER) || defined(STC_IMPLEMENT))
 #  define STC_API extern
 #  define STC_DEF
 #else
-#  define _i_static
+#  define i_static
 #  define STC_API static inline
 #  define STC_DEF static inline
 #endif
-#if defined(_i_static) || c_option(c_implement) || defined(STC_IMPLEMENT) \
-                                                || defined(STC_IMPLEMENTATION)
+#if defined(i_static) || defined(STC_IMPLEMENT)
 #  define i_implement
 #endif
diff --git a/include/stc/cstr.h b/include/stc/cstr.h
index 1d57437e..561bb1ca 100644
--- a/include/stc/cstr.h
+++ b/include/stc/cstr.h
@@ -169,7 +169,22 @@ STC_INLINE size_t cstr_length(cstr s)
 STC_INLINE size_t cstr_capacity(cstr s)
     { return cstr_is_long(&s) ? cstr_l_cap(&s) : cstr_s_cap; }
 
-// utf8:
+// utf8 methods defined in/depending on src/utf8utils.c:
+cstr cstr_tolower(const cstr* self);
+cstr cstr_toupper(const cstr* self);
+void cstr_lowercase(cstr* self);
+void cstr_uppercase(cstr* self);
+
+STC_INLINE bool cstr_valid_u8(const cstr* self) 
+    { return utf8_valid(cstr_str(self)); }
+
+STC_INLINE utf8_decode_t cstr_peek(const cstr* self, size_t bytepos) {
+    utf8_decode_t d = {UTF8_OK};
+    utf8_peek(&d, cstr_str(self) + bytepos);
+    return d;
+}
+
+// other utf8 
 
 STC_INLINE size_t cstr_size_u8(cstr s) 
     { return utf8_size(cstr_str(&s)); }
@@ -183,6 +198,7 @@ STC_INLINE csview cstr_at(const cstr* self, size_t bytepos) {
     sv.size = utf8_codep_size(sv.str);
     return sv;
 }
+
 STC_INLINE csview cstr_at_u8(const cstr* self, size_t u8idx) {
     csview sv = cstr_sv(self);
     sv.str = utf8_at(sv.str, u8idx);
@@ -193,14 +209,7 @@ STC_INLINE csview cstr_at_u8(const cstr* self, size_t u8idx) {
 STC_INLINE size_t cstr_pos_u8(const cstr* self, size_t u8idx) 
     { return utf8_pos(cstr_str(self), u8idx); }
 
-STC_INLINE bool cstr_valid_u8(const cstr* self) 
-    { return utf8_valid(cstr_str(self)); }
-
-STC_INLINE utf8_decode_t cstr_peek(const cstr* self, size_t bytepos) {
-    utf8_decode_t d = {UTF8_OK};
-    utf8_peek(cstr_str(self) + bytepos, &d);
-    return d;
-}
+// utf8 iterator
 
 STC_INLINE cstr_iter cstr_begin(const cstr* self) { 
     const char* str = cstr_str(self);
@@ -525,4 +534,8 @@ STC_DEF int cstr_printf(cstr* self, const char* fmt, ...) {
 #endif
 #endif // CSTR_H_INCLUDED
 #undef i_opt
+#undef i_header
+#undef i_static
+#undef i_implement
+//#undef i_implement
 #endif // !STC_CSTR_V1
diff --git a/include/stc/csview.h b/include/stc/csview.h
index 270a79f8..ba0a7568 100644
--- a/include/stc/csview.h
+++ b/include/stc/csview.h
@@ -27,49 +27,60 @@
 #include "forward.h"
 #include "utf8.h"
 
-#define                 csview_null  c_sv("")
-#define                 csview_new(literal) c_sv(literal)
-#define                 csview_npos  (SIZE_MAX >> 1)
-
-STC_API csview          csview_substr(csview sv, intptr_t pos, size_t n);
-STC_API csview          csview_slice(csview sv, intptr_t p1, intptr_t p2);
-STC_API csview          csview_token(csview sv, csview sep, size_t* start);
-
-STC_INLINE csview       csview_init() { return csview_null; }
-STC_INLINE csview       csview_from(const char* str)
-                            { return c_make(csview){str, strlen(str)}; }
-STC_INLINE csview       csview_from_n(const char* str, size_t n)
-                            { return c_make(csview){str, n}; }
-STC_INLINE size_t       csview_size(csview sv) { return sv.size; }
-STC_INLINE size_t       csview_length(csview sv) { return sv.size; }
-STC_INLINE bool         csview_empty(csview sv) { return sv.size == 0; }
-STC_INLINE char         csview_front(csview sv) { return sv.str[0]; }
-STC_INLINE char         csview_back(csview sv) { return sv.str[sv.size - 1]; }
-
-STC_INLINE void         csview_clear(csview* self) { *self = csview_null; }
-
-STC_INLINE bool         csview_equals(csview sv, csview sv2)
-                            { return sv.size == sv2.size && !memcmp(sv.str, sv2.str, sv.size); }
-STC_INLINE size_t       csview_find(csview sv, csview needle)
-                            { char* res = c_strnstrn(sv.str, needle.str, sv.size, needle.size);
-                              return res ? res - sv.str : csview_npos; }
-STC_INLINE bool         csview_contains(csview sv, csview needle)
-                            { return c_strnstrn(sv.str, needle.str, sv.size, needle.size) != NULL; }
-STC_INLINE bool         csview_starts_with(csview sv, csview sub)
-                            { if (sub.size > sv.size) return false;
-                              return !memcmp(sv.str, sub.str, sub.size); }
-STC_INLINE bool         csview_ends_with(csview sv, csview sub)
-                            { if (sub.size > sv.size) return false;
-                              return !memcmp(sv.str + sv.size - sub.size, sub.str, sub.size); }
-STC_INLINE csview_iter  csview_begin(const csview* self)
-                            { return c_make(csview_iter){.chr = {self->str, utf8_codep_size(self->str)}}; }
-STC_INLINE csview_iter  csview_end(const csview* self)
-                            { return c_make(csview_iter){self->str + self->size}; }
-STC_INLINE void         csview_next(csview_iter* it)
-                            { it->ref += it->chr.size; it->chr.size = utf8_codep_size(it->ref); }
+#define             csview_null  c_sv("")
+#define             csview_new(literal) c_sv(literal)
+#define             csview_npos  (SIZE_MAX >> 1)
+
+STC_API csview      csview_substr(csview sv, intptr_t pos, size_t n);
+STC_API csview      csview_slice(csview sv, intptr_t p1, intptr_t p2);
+STC_API csview      csview_token(csview sv, csview sep, size_t* start);
+
+STC_INLINE csview   csview_init() { return csview_null; }
+STC_INLINE csview   csview_from(const char* str)
+                        { return c_make(csview){str, strlen(str)}; }
+STC_INLINE csview   csview_from_n(const char* str, size_t n)
+                        { return c_make(csview){str, n}; }
+STC_INLINE void     csview_clear(csview* self) { *self = csview_null; }
+
+STC_INLINE size_t   csview_size(csview sv) { return sv.size; }
+STC_INLINE size_t   csview_length(csview sv) { return sv.size; }
+STC_INLINE bool     csview_empty(csview sv) { return sv.size == 0; }
+STC_INLINE char     csview_front(csview sv) { return sv.str[0]; }
+STC_INLINE char     csview_back(csview sv) { return sv.str[sv.size - 1]; }
+
+STC_INLINE bool csview_equals(csview sv, csview sv2)
+    { return sv.size == sv2.size && !memcmp(sv.str, sv2.str, sv.size); }
+
+STC_INLINE size_t csview_find(csview sv, csview needle) {
+    char* res = c_strnstrn(sv.str, needle.str, sv.size, needle.size);
+    return res ? res - sv.str : csview_npos;
+}
+
+STC_INLINE bool csview_contains(csview sv, csview needle)
+    { return c_strnstrn(sv.str, needle.str, sv.size, needle.size) != NULL; }
+
+STC_INLINE bool csview_starts_with(csview sv, csview sub) {
+    if (sub.size > sv.size) return false;
+    return !memcmp(sv.str, sub.str, sub.size);
+}
+
+STC_INLINE bool csview_ends_with(csview sv, csview sub) {
+    if (sub.size > sv.size) return false;
+    return !memcmp(sv.str + sv.size - sub.size, sub.str, sub.size);
+}
+
+/* iterator */
+STC_INLINE csview_iter csview_begin(const csview* self)
+    { return c_make(csview_iter){.chr = {self->str, utf8_codep_size(self->str)}}; }
+
+STC_INLINE csview_iter csview_end(const csview* self)
+    { return c_make(csview_iter){self->str + self->size}; }
+
+STC_INLINE void csview_next(csview_iter* it)
+    { it->ref += it->chr.size; it->chr.size = utf8_codep_size(it->ref); }
 
 /* utf8 */
-STC_INLINE bool csview_valid_u8(csview sv)
+STC_INLINE bool csview_valid_u8(csview sv) // depends on src/utf8utils.c
     { return utf8_valid_n(sv.str, sv.size); }
 
 STC_INLINE size_t csview_size_u8(csview sv)
@@ -84,36 +95,50 @@ STC_INLINE csview csview_substr_u8(csview sv, size_t u8pos, size_t u8len) {
 /* csview interaction with cstr: */
 #ifdef CSTR_H_INCLUDED
 
-STC_INLINE csview       csview_from_s(const cstr* self)
-                            { return c_make(csview){cstr_str(self), cstr_size(*self)}; }
-
-STC_INLINE cstr         cstr_from_sv(csview sv)
-                            { return cstr_from_n(sv.str, sv.size); }
-STC_INLINE csview       cstr_substr(const cstr* self, intptr_t pos, size_t n)
-                            { return csview_substr(csview_from_s(self), pos, n); }
-STC_INLINE csview       cstr_slice(const cstr* self, intptr_t p1, intptr_t p2)
-                            { return csview_slice(csview_from_s(self), p1, p2); }
-STC_INLINE csview       cstr_assign_sv(cstr* self, csview sv)
-                            { return c_make(csview){cstr_assign_n(self, sv.str, sv.size), sv.size}; }
-STC_INLINE void         cstr_append_sv(cstr* self, csview sv)
-                            { cstr_append_n(self, sv.str, sv.size); }
-STC_INLINE void         cstr_insert_sv(cstr* self, size_t pos, csview sv)
-                            { cstr_replace_n(self, pos, 0, sv.str, sv.size); }
-STC_INLINE void         cstr_replace_sv(cstr* self, csview sub, csview with)
-                            { cstr_replace_n(self, sub.str - cstr_str(self), sub.size, with.str, with.size); }
-STC_INLINE bool         cstr_equals_sv(cstr s, csview sv)
-                            { return sv.size == cstr_size(s) && !memcmp(cstr_str(&s), sv.str, sv.size); }
-STC_INLINE size_t       cstr_find_sv(cstr s, csview needle)
-                            { char* res = c_strnstrn(cstr_str(&s), needle.str, cstr_size(s), needle.size);
-                              return res ? res - cstr_str(&s) : cstr_npos; }
-STC_INLINE bool         cstr_contains_sv(cstr s, csview needle)
-                            { return c_strnstrn(cstr_str(&s), needle.str, cstr_size(s), needle.size) != NULL; }
-STC_INLINE bool         cstr_starts_with_sv(cstr s, csview sub)
-                            { if (sub.size > cstr_size(s)) return false;
-                              return !memcmp(cstr_str(&s), sub.str, sub.size); }
-STC_INLINE bool         cstr_ends_with_sv(cstr s, csview sub)
-                            { if (sub.size > cstr_size(s)) return false;
-                              return !memcmp(cstr_str(&s) + cstr_size(s) - sub.size, sub.str, sub.size); }
+STC_INLINE csview csview_from_s(const cstr* self)
+    { return c_make(csview){cstr_str(self), cstr_size(*self)}; }
+
+STC_INLINE cstr cstr_from_sv(csview sv)
+    { return cstr_from_n(sv.str, sv.size); }
+
+STC_INLINE csview cstr_substr(const cstr* self, intptr_t pos, size_t n)
+    { return csview_substr(csview_from_s(self), pos, n); }
+
+STC_INLINE csview cstr_slice(const cstr* self, intptr_t p1, intptr_t p2)
+    { return csview_slice(csview_from_s(self), p1, p2); }
+
+STC_INLINE csview cstr_assign_sv(cstr* self, csview sv)
+    { return c_make(csview){cstr_assign_n(self, sv.str, sv.size), sv.size}; }
+
+STC_INLINE void cstr_append_sv(cstr* self, csview sv)
+    { cstr_append_n(self, sv.str, sv.size); }
+
+STC_INLINE void cstr_insert_sv(cstr* self, size_t pos, csview sv)
+    { cstr_replace_n(self, pos, 0, sv.str, sv.size); }
+
+STC_INLINE void cstr_replace_sv(cstr* self, csview sub, csview with)
+    { cstr_replace_n(self, sub.str - cstr_str(self), sub.size, with.str, with.size); }
+
+STC_INLINE bool cstr_equals_sv(cstr s, csview sv)
+    { return sv.size == cstr_size(s) && !memcmp(cstr_str(&s), sv.str, sv.size); }
+
+STC_INLINE size_t cstr_find_sv(cstr s, csview needle) { 
+    char* res = c_strnstrn(cstr_str(&s), needle.str, cstr_size(s), needle.size);
+    return res ? res - cstr_str(&s) : cstr_npos;
+}
+
+STC_INLINE bool cstr_contains_sv(cstr s, csview needle)
+    { return c_strnstrn(cstr_str(&s), needle.str, cstr_size(s), needle.size) != NULL; }
+
+STC_INLINE bool cstr_starts_with_sv(cstr s, csview sub) {
+    if (sub.size > cstr_size(s)) return false;
+    return !memcmp(cstr_str(&s), sub.str, sub.size);
+}
+
+STC_INLINE bool cstr_ends_with_sv(cstr s, csview sub) {
+    if (sub.size > cstr_size(s)) return false;
+    return !memcmp(cstr_str(&s) + cstr_size(s) - sub.size, sub.str, sub.size);
+}
 #endif
 /* ---- Container helper functions ---- */
 
@@ -165,3 +190,6 @@ csview_token(csview sv, csview sep, size_t* start) {
 #endif
 #endif
 #undef i_opt
+#undef i_header
+#undef i_implement
+#undef i_static
diff --git a/include/stc/template.h b/include/stc/template.h
index 4d2f0f58..db50a6ca 100644
--- a/include/stc/template.h
+++ b/include/stc/template.h
@@ -49,13 +49,6 @@
   #define i_size uint32_t
 #endif
 
-#if defined i_key_str || defined i_val_str || defined i_key_ssv || defined i_val_ssv
-  #include "cstr.h"
-  #if defined i_key_ssv || defined i_val_ssv
-  #include "csview.h"
-  #endif
-#endif
-
 #if !(defined i_key || defined i_key_str || defined i_key_ssv || \
       defined i_key_bind || defined i_key_arcbox)
   #define _i_key_from_val
@@ -288,6 +281,10 @@
 #undef i_keyto
 #undef i_keydrop
 
+#undef i_header
+#undef i_implement
+#undef i_static
+
 #undef _i_prefix
 #undef _i_has_from
 #undef _i_key_from_val
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index f11af046..dffd9f15 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -25,12 +25,37 @@ int main()
 #include "ccommon.h"
 #include <ctype.h>
 
+// utf8 methods defined in src/utf8utils.c:
+bool        utf8_islower(uint32_t c);
+bool        utf8_isupper(uint32_t c);
+bool        utf8_isspace(uint32_t c);
+bool        utf8_isdigit(uint32_t c);
+bool        utf8_isxdigit(uint32_t c);
+bool        utf8_isalpha(uint32_t c);
+bool        utf8_isalnum(uint32_t c);
+uint32_t    utf8_tolower(uint32_t c);
+uint32_t    utf8_toupper(uint32_t c);
+bool        utf8_valid(const char* s);
+bool        utf8_valid_n(const char* s, size_t n);
+
+/* encode/decode next utf8 codepoint. */
 enum { UTF8_OK = 0, UTF8_ERROR = 4 };
 typedef struct { uint32_t state, codep, size; } utf8_decode_t;
 
-/* encode/decode next utf8 codepoint. */
-STC_API unsigned utf8_encode(char *out, uint32_t c);
-STC_API void     utf8_decode(utf8_decode_t *d, const uint8_t b);
+void        utf8_peek(utf8_decode_t* d, const char *s);
+unsigned    utf8_encode(char *out, uint32_t c);
+void        utf8_decode(utf8_decode_t *d, const uint8_t b);
+
+/* number of bytes in the utf8 codepoint starting at s (0 if the lead byte is invalid) */
+STC_INLINE unsigned utf8_codep_size(const char *s) {
+    unsigned b = (uint8_t)*s;
+    if (b < 0x80) return 1;
+    if (b < 0xC2) return 0;
+    if (b < 0xE0) return 2;
+    if (b < 0xF0) return 3;
+    if (b < 0xF5) return 4;
+    return 0;
+}
 
 /* number of codepoints in the utf8 string s */
 STC_INLINE size_t utf8_size(const char *s) {
@@ -56,86 +81,4 @@ STC_INLINE const char* utf8_at(const char *s, size_t index) {
 STC_INLINE size_t utf8_pos(const char*   s, size_t index)
     { return utf8_at(s, index) - s; }
 
-STC_INLINE void utf8_peek(const char *s, utf8_decode_t* d) {
-    utf8_decode(d, (uint8_t)*s++);
-    switch (d->size) {
-        case 4: utf8_decode(d, (uint8_t)*s++);
-        case 3: utf8_decode(d, (uint8_t)*s++);
-        case 2: utf8_decode(d, (uint8_t)*s++);
-    }
-}
-
-STC_INLINE unsigned utf8_codep_size(const char *s) {
-    utf8_decode_t d = {UTF8_OK};
-    utf8_decode(&d, (uint8_t)*s);
-    return d.size;
-}
-
-STC_INLINE bool utf8_valid(const char* s) {
-    utf8_decode_t d = {UTF8_OK};
-    while (*s)
-        utf8_decode(&d, (uint8_t)*s++);
-    return d.state == UTF8_OK;
-}
-
-STC_INLINE bool utf8_valid_n(const char* s, size_t n) {
-    utf8_decode_t d = {UTF8_OK};
-    while ((n-- != 0) & (*s != 0))
-        utf8_decode(&d, (uint8_t)*s++);
-    return d.state == UTF8_OK;
-}
-
-// --------------------------- IMPLEMENTATION ---------------------------------
-#ifdef i_implement
-// https://news.ycombinator.com/item?id=15423674
-// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc
-
-STC_DEF void utf8_decode(utf8_decode_t *d, const uint8_t b)
-{
-    switch (d->state) {
-    case UTF8_OK:
-        if      (b < 0x80) d->codep = b, d->size = 1;
-        else if (b < 0xC2) d->state = UTF8_ERROR, d->size = 0;
-        else if (b < 0xE0) d->state = 1, d->codep = b & 0x1F, d->size = 2;
-        else if (b < 0xF0) d->state = 2, d->codep = b & 0x0F, d->size = 3;
-        else if (b < 0xF5) d->state = 3, d->codep = b & 0x07, d->size = 4;
-        else d->state = UTF8_ERROR, d->size = 0;
-        break;
-    case 1: case 2: case 3:
-        if ((b & 0xC0) == 0x80) {
-            d->state -= 1;
-            d->codep = (d->codep << 6) | (b & 0x3F);
-        } else
-            d->state = UTF8_ERROR, d->size = 0;
-    }
-}
-
-STC_DEF unsigned utf8_encode(char *out, uint32_t c)
-{
-    if (c < 0x80U) {
-        out[0] = (char) c;
-        return 1;
-    } else if (c < 0x0800U) {
-        out[0] = (char) ((c>>6  & 0x1F) | 0xC0);
-        out[1] = (char) ((c     & 0x3F) | 0x80);
-        return 2;
-    } else if (c < 0x010000U) {
-        if ((c < 0xD800U) | (c >= 0xE000U)) {
-            out[0] = (char) ((c>>12 & 0x0F) | 0xE0);
-            out[1] = (char) ((c>>6  & 0x3F) | 0x80);
-            out[2] = (char) ((c     & 0x3F) | 0x80);
-            return 3;
-        }
-    } else if (c < 0x110000U) {
-        out[0] = (char) ((c>>18 & 0x07) | 0xF0);
-        out[1] = (char) ((c>>12 & 0x3F) | 0x80);
-        out[2] = (char) ((c>>6  & 0x3F) | 0x80);
-        out[3] = (char) ((c     & 0x3F) | 0x80);
-        return 4;
-    }
-    return 0;
-}
-
-#endif
 #endif
-#undef i_opt
diff --git a/src/cregex.c b/src/cregex.c
index 0f585f5d..34c78090 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -32,7 +32,6 @@ THE SOFTWARE.
 #include <string.h>
 #include <ctype.h>
 #include <stc/cregex.h>
-#include "cregex_utf8.c"
 
 typedef uint32_t Rune; /* Utf8 code point */
 typedef int32_t Token;
@@ -594,7 +593,7 @@ nextc(Parser *par, Rune *rp)
                 return 2;
             case 'p': case 'P': { /* https://www.regular-expressions.info/unicode.html */
                 static struct { const char* c; int n, r; } cls[] = {
-                    {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC}, 
+                    {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC},
                     {"{Alnum}", 7, U8_Xan},
                     {"{Digit}", 7, U8_Nd}, {"{Nd}", 4, U8_Nd},
                     {"{Lower}", 7, U8_Ll}, {"{Ll}", 4, U8_Ll},
diff --git a/src/casefold.c b/src/utf8tabs.c
index 1b0a9463..8168f78f 100644
--- a/src/casefold.c
+++ b/src/utf8tabs.c
@@ -1,8 +1,6 @@
-#include <ctype.h>
-#define i_header
-#include <stc/cstr.h>
+#include "utf8tabs.h"
 
-static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {
+struct CaseFold casefold[] = {
     {65, 90, 122}, {181, 181, 956}, {192, 214, 246}, {216, 222, 254},
     {256, 302, 303}, {306, 310, 311}, {313, 327, 328}, {330, 374, 375}, {376, 376, 255},
     {377, 381, 382}, {383, 383, 115}, {385, 385, 595}, {386, 388, 389}, {390, 390, 596},
@@ -47,7 +45,7 @@ static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {
     {42948, 42948, 42900}, {42949, 42949, 642}, {42950, 42950, 7566}, {42951, 42953, 42954},
     {42960, 42962, 42963}, {42968, 42970, 42971}, {43888, 43913, 5049}, {65313, 65338, 65370},
 }; // 188
-static uint8_t cfold_low[] = {
+uint8_t cfold_low[] = {
     0, 138, 10, 111, 2, 139, 3, 8, 4, 5, 6, 7, 9, 59, 12, 14, 16, 20, 49, 25,
     56, 52, 29, 31, 33, 35, 37, 39, 50, 40, 41, 42, 43, 44, 45, 17, 46, 47, 48, 51,
     53, 55, 155, 58, 62, 152, 150, 153, 11, 13, 15, 18, 19, 171, 21, 172, 22, 167, 170, 24,
@@ -59,149 +57,3 @@ static uint8_t cfold_low[] = {
     144, 145, 54, 57, 149, 154, 156, 157, 158, 96, 97, 159, 106, 160, 161, 162, 163, 165, 166, 168,
     180, 169, 179, 183, 184, 185, 178, 187,
 };
-
-uint32_t utf8_tolower(uint32_t c) {
-    for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) {
-        if (c <= casefold[i].c1) {
-            if (c < casefold[i].c0) return c;
-            int d = casefold[i].m1 - casefold[i].c1;
-            if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1));
-            return c + d;
-        }
-    }
-    return c;
-}
-
-uint32_t utf8_toupper(uint32_t c) {
-    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) {
-        struct CaseFold cfold = casefold[cfold_low[i]];
-        if (c <= cfold.m1) {
-            int d = cfold.m1 - cfold.c1;
-            if (c < (uint32_t)(cfold.c0 + d)) return c;
-            if (d == 1) return c - ((cfold.m1 & 1) == (c & 1));
-            return c - d;
-        }
-    }
-    return c;
-}
-
-bool utf8_isupper(uint32_t c) {
-    return utf8_tolower(c) != c;
-}
-
-bool utf8_islower(uint32_t c) {
-    return utf8_toupper(c) != c;
-}
-
-bool utf8_isspace(uint32_t c) {
-    static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0,
-                           0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
-    for (size_t i=0; i<sizeof t/sizeof *t; ++i)
-        if (c == t[i]) return true;
-    return (c >= 0x2000) & (c <= 0x200A);
-}
-
-bool utf8_isdigit(uint32_t c) {
-    return ((c >= '0') & (c <= '9')) || 
-           ((c >= 0xFF10) & (c <= 0xFF19));
-}
-
-bool utf8_isxdigit(uint32_t c) {
-    static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, 
-                           0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46};
-    for (size_t i=1; i<sizeof t/sizeof *t; i += 2)
-        if (c <= t[i]) return c >= t[i - 1];
-    return false;
-}
-
-bool utf8_isalnum(uint32_t c) {
-    if (c < 128) return isalnum(c) != 0;
-    if ((c >= 0xFF10) & (c <= 0xFF19)) return true;
-    return utf8_islower(c) || utf8_isupper(c);
-}
-
-bool utf8_isalpha(uint32_t c) {
-    if (c < 128) return isalpha(c) != 0;
-    return utf8_islower(c) || utf8_isupper(c);
-}
-
-static struct fnfold {
-    int (*conv_asc)(int);
-    uint32_t (*conv_u8)(uint32_t);
-}
-fn_tolower = {tolower, utf8_tolower},
-fn_toupper = {toupper, utf8_toupper};
-
-
-static cstr cstr_casefold(const cstr* self, struct fnfold fold) {
-    csview sv = cstr_sv(self);
-    cstr out = cstr_null;
-    char *buf = cstr_reserve(&out, sv.size*3/2);
-    uint32_t cp; size_t sz = 0;
-    utf8_decode_t d = {UTF8_OK};
-
-    for (; *sv.str; sv.str += d.size) {
-        utf8_peek(sv.str, &d);
-        switch (d.size) {
-        case 1:
-            buf[sz++] = (char)fold.conv_asc(*sv.str);
-            break;
-        default: 
-            cp = fold.conv_u8(d.codep);
-            sz += utf8_encode(buf + sz, cp);
-        }
-    }
-    _cstr_set_size(&out, sz);
-    cstr_shrink_to_fit(&out);
-    return out;
-}
-
-cstr cstr_tolower(const cstr* self) {
-    return cstr_casefold(self, fn_tolower);
-}
-
-cstr cstr_toupper(const cstr* self) {
-    return cstr_casefold(self, fn_toupper);
-}
-
-void cstr_lowercase(cstr* self) {
-    cstr_take(self, cstr_casefold(self, fn_tolower));
-}
-
-void cstr_uppercase(cstr* self) {
-    cstr_take(self, cstr_casefold(self, fn_toupper));
-}
-
-#ifdef TEST
-int main()
-{
-    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i)
-    {
-        char x[3][5]={0};
-        unsigned s0, s1, s2;
-        uint32_t a = casefold[i].c0;
-        uint32_t b = utf8_tolower(a);
-        uint32_t c = utf8_toupper(b);
-
-        s0 = utf8_encode(x[0], a);
-        s1 = utf8_encode(x[1], b);
-        s2 = utf8_encode(x[2], c);
-        printf("%s %s %s - %u %u %u (%u %u %u)\n", x[0], x[1], x[2], a, b, c, s0, s1, s2);
-    }
-    c_auto (cstr, t1)
-    {
-        t1 = cstr_new("Die preußischen Köstlichkeiten.");
-
-        cstr_buf b = cstr_buffer(&t1);
-        printf("%s, %llu %llu\n", b.data, b.size, b.cap);
-        cstr_lowercase(&t1);
-        b = cstr_buffer(&t1);
-        printf("%s, %llu %llu\n", b.data, b.size, b.cap);
-
-        cstr_uppercase(&t1);
-        b = cstr_buffer(&t1);
-        printf("%s, %llu %llu\n", b.data, b.size, b.cap);
-    }
-}
-#endif
-
diff --git a/src/utf8tabs.h b/src/utf8tabs.h
new file mode 100644
index 00000000..95251f75
--- /dev/null
+++ b/src/utf8tabs.h
@@ -0,0 +1,10 @@
+#ifndef utf8tabs_included
+#define utf8tabs_included
+
+#include <stdint.h>
+struct CaseFold { uint16_t c0, c1, m1; } ;
+
+extern struct CaseFold casefold[188];
+extern uint8_t cfold_low[188];
+
+#endif
diff --git a/src/casefold.py b/src/utf8tabs.py
index 951f3bf6..563180e3 100644
--- a/src/casefold.py
+++ b/src/utf8tabs.py
@@ -8,7 +8,7 @@ def read_unidata(catfilter='Lu', casefilter='lowcase', big=False):
                              'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment',
                              'upcase', 'lowcase', 'titlecase'],
                       usecols=['code', 'name', 'category', 'bidircat', 'upcase', 'lowcase', 'titlecase'])
-    if big: 
+    if big:
         ud = ud[ud['code'] >= (1<<16)]
     else:
         ud = ud[ud['code'] < (1<<16)]
@@ -84,7 +84,7 @@ def make_casefold(letters):
 
 def print_casefold(cfold):
     print('''
-static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {''')
+struct CaseFold casefold[] = {''')
     n = 1
     s = 5
     count = 0
@@ -99,7 +99,7 @@ static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {''')
             break
         #print(' {%d, %d, %d}, // %s %s, %s\n   ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='')
         if True: # compact
-            if n == s: 
+            if n == s:
                 n = 0
                 if a > 1000:
                     s = 4
@@ -129,7 +129,7 @@ def print_casefold_low(table):
     cfold_low = [i for i in range(len(table))]
     cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0]))
 
-    print('static uint8_t cfold_low[] = {\n   ', end='')
+    print('uint8_t cfold_low[] = {\n   ', end='')
     for i in range(len(cfold_low)):
         print(" %d," % (cfold_low[i]), end='\n   ' if (i+1) % 20 == 0 else '')
     print('\n};')
@@ -138,98 +138,7 @@ def print_casefold_low(table):
 ########### main:
 
 if __name__ == "__main__":
-    print('''#include <stdint.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <stc/utf8.h>
-#include <stdbool.h>''')
-
+    print('#include "utf8tabs.h"')
     cfold = make_casetable()
     table = print_casefold(cfold)
     print_casefold_low(table)
-
-    print(r'''
-uint32_t utf8_tolower(uint32_t c) {
-    for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) {
-        if (c <= casefold[i].c1) {
-            if (c < casefold[i].c0) return c;
-            int d = casefold[i].m1 - casefold[i].c1;
-            if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1));
-            return c + d;
-        }
-    }
-    return c;
-}
-
-uint32_t utf8_toupper(uint32_t c) {
-    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) {
-        struct CaseFold cfold = casefold[cfold_low[i]];
-        if (c <= cfold.m1) {
-            int d = cfold.m1 - cfold.c1;
-            if (c < (uint32_t)(cfold.c0 + d)) return c;
-            if (d == 1) return c - ((cfold.m1 & 1) == (c & 1));
-            return c - d;
-        }
-    }
-    return c;
-}
-
-bool utf8_isupper(uint32_t c) {
-    return utf8_tolower(c) != c;
-}
-
-bool utf8_islower(uint32_t c) {
-    return utf8_toupper(c) != c;
-}
-
-bool utf8_isspace(uint32_t c) {
-    static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0,
-                           0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
-    for (size_t i=0; i<sizeof t/sizeof *t; ++i)
-        if (c == t[i]) return true;
-    return (c >= 0x2000) & (c <= 0x200A);
-}
-
-bool utf8_isdigit(uint32_t c) {
-    return ((c >= '0') & (c <= '9')) || 
-           ((c >= 0xFF10) & (c <= 0xFF19));
-}
-
-bool utf8_isxdigit(uint32_t c) {
-    static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, 
-                           0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46};
-    for (size_t i=1; i<sizeof t/sizeof *t; i += 2)
-        if (c <= t[i]) return c >= t[i - 1];
-    return false;
-}
-
-bool utf8_isalnum(uint32_t c) {
-    if (c < 128) return isalnum(c) != 0;
-    if ((c >= 0xFF10) & (c <= 0xFF19)) return true;
-    return utf8_islower(c) || utf8_isupper(c);
-}
-
-bool utf8_isalpha(uint32_t c) {
-    if (c < 128) return isalpha(c) != 0;
-    return utf8_islower(c) || utf8_isupper(c);
-}
-
-
-#ifdef TEST
-int main()
-{
-    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i)
-    {
-        char x[3][5]={0};
-        uint32_t a = casefold[i].c0;
-        uint32_t b = utf8_tolower(a);
-        uint32_t c = utf8_toupper(b);
-
-        utf8_encode(x[0], a);
-        utf8_encode(x[1], b);
-        utf8_encode(x[2], c);
-        printf("%s %s %s - %u %u %u\n", x[0], x[1], x[2], a, b, c);
-    }
-}
-#endif
-''')
diff --git a/src/utf8utils.c b/src/utf8utils.c
new file mode 100644
index 00000000..3b01ae39
--- /dev/null
+++ b/src/utf8utils.c
@@ -0,0 +1,190 @@
+#include <ctype.h>
+#define i_header
+#include <stc/cstr.h>
+
+#include "utf8tabs.h"
+#include "utf8tabs.c"
+
+// https://news.ycombinator.com/item?id=15423674
+// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc
+
+// Feed one byte `b` to the incremental UTF-8 decoder `d`.
+//
+// In state UTF8_OK `b` is taken as a lead byte: d->size becomes the sequence
+// length (1-4), d->codep its payload bits and d->state the number of
+// continuation bytes still expected. In states 1-3 `b` must be a continuation
+// byte (10xxxxxx); its six payload bits are appended to d->codep.
+// On malformed input d->state becomes UTF8_ERROR and d->size becomes 0.
+//
+// A completed sequence is additionally rejected when it is not well-formed
+// UTF-8 (RFC 3629): overlong 3- and 4-byte forms, UTF-16 surrogates
+// (U+D800..U+DFFF) and values above U+10FFFF.
+void utf8_decode(utf8_decode_t *d, const uint8_t b)
+{
+    switch (d->state) {
+    case UTF8_OK:
+        if      (b < 0x80) d->codep = b, d->size = 1;
+        else if (b < 0xC2) d->state = UTF8_ERROR, d->size = 0; // stray continuation, or overlong 2-byte lead (C0/C1)
+        else if (b < 0xE0) d->state = 1, d->codep = b & 0x1F, d->size = 2;
+        else if (b < 0xF0) d->state = 2, d->codep = b & 0x0F, d->size = 3;
+        else if (b < 0xF5) d->state = 3, d->codep = b & 0x07, d->size = 4;
+        else d->state = UTF8_ERROR, d->size = 0;               // F5..FF never occur in UTF-8
+        break;
+    case 1: case 2: case 3:
+        if ((b & 0xC0) != 0x80) {
+            d->state = UTF8_ERROR, d->size = 0;
+            break;
+        }
+        d->state -= 1;
+        d->codep = (d->codep << 6) | (b & 0x3F);
+        if (d->state == UTF8_OK) {
+            // Last continuation byte consumed: the bit pattern was fine, now
+            // check that the value is one this sequence length may encode.
+            const uint32_t cp = d->codep;
+            const int bad3 = (d->size == 3) && (cp < 0x0800U || (cp >= 0xD800U && cp < 0xE000U));
+            const int bad4 = (d->size == 4) && (cp < 0x10000U || cp > 0x10FFFFU);
+            if (bad3 || bad4)
+                d->state = UTF8_ERROR, d->size = 0;
+        }
+        break;
+    default:
+        // Any other state (presumably only UTF8_ERROR) is left untouched, so
+        // an error stays latched until the caller resets d->state.
+        break;
+    }
+}
+
+// Encode code point `c` as UTF-8 into `out`; at most 4 bytes are stored and
+// no terminator is written.
+// Returns the number of bytes stored (1-4), or 0 without touching `out` when
+// `c` is a UTF-16 surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
+unsigned utf8_encode(char *out, uint32_t c)
+{
+    // Reject everything that has no UTF-8 encoding up front.
+    if (c >= 0x110000U || (c >= 0xD800U && c < 0xE000U))
+        return 0;
+
+    if (c < 0x80U) {                                   // 0xxxxxxx
+        out[0] = (char) c;
+        return 1;
+    }
+    if (c < 0x0800U) {                                 // 110xxxxx 10xxxxxx
+        out[0] = (char) (0xC0 | ((c >> 6) & 0x1F));
+        out[1] = (char) (0x80 | (c & 0x3F));
+        return 2;
+    }
+    if (c < 0x010000U) {                               // 1110xxxx 10xxxxxx 10xxxxxx
+        out[0] = (char) (0xE0 | ((c >> 12) & 0x0F));
+        out[1] = (char) (0x80 | ((c >> 6) & 0x3F));
+        out[2] = (char) (0x80 | (c & 0x3F));
+        return 3;
+    }
+    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    out[0] = (char) (0xF0 | ((c >> 18) & 0x07));
+    out[1] = (char) (0x80 | ((c >> 12) & 0x3F));
+    out[2] = (char) (0x80 | ((c >> 6) & 0x3F));
+    out[3] = (char) (0x80 | (c & 0x3F));
+    return 4;
+}
+
+// Decode the single code point that starts at `s` into `d`.
+// On success d->codep holds the code point and d->size its length in bytes;
+// on malformed input utf8_decode() leaves d->state == UTF8_ERROR and
+// d->size == 0.
+//
+// The decoder is restarted from UTF8_OK so that a state left in `d` by an
+// earlier call cannot leak into this one. Continuation bytes are fetched only
+// while the decoder still expects them: the first offending byte (including
+// the NUL terminator of a truncated sequence) ends the scan, so no byte
+// behind it is read.
+void utf8_peek(utf8_decode_t* d, const char *s) {
+    d->state = UTF8_OK;
+    do {
+        utf8_decode(d, (uint8_t)*s++);
+    } while (d->state != UTF8_OK && d->state != UTF8_ERROR);
+}
+
+// Returns true when running the NUL-terminated string `s` through
+// utf8_decode() ends in state UTF8_OK, i.e. no byte was rejected and the
+// last sequence is complete.
+bool utf8_valid(const char* s) {
+    utf8_decode_t decoder = {UTF8_OK};
+    for (const char *p = s; *p != 0; ++p) {
+        utf8_decode(&decoder, (uint8_t)*p);
+    }
+    return decoder.state == UTF8_OK;
+}
+
+// Like utf8_valid(), but inspects at most `n` bytes of `s` and also stops at
+// the first NUL byte. Returns true when the decoder ends in state UTF8_OK.
+//
+// The length test must short-circuit (&&): with a plain `&` the byte s[n]
+// would be dereferenced even after the count is exhausted, which reads one
+// byte past a buffer that is exactly `n` bytes long and not NUL-terminated.
+bool utf8_valid_n(const char* s, size_t n) {
+    utf8_decode_t d = {UTF8_OK};
+    while (n-- != 0 && *s != 0)
+        utf8_decode(&d, (uint8_t)*s++);
+    return d.state == UTF8_OK;
+}
+
+// Map code point `c` through the `casefold` table; returns `c` unchanged
+// when no entry covers it.
+// The scan stops at the first entry whose c1 is >= c, so the table is
+// presumably sorted by c1 - confirm against utf8tabs.c.
+uint32_t utf8_tolower(uint32_t c) {
+    const size_t count = sizeof casefold / sizeof casefold[0];
+    for (size_t k = 0; k < count; ++k) {
+        const struct CaseFold *entry = &casefold[k];
+        if (c > entry->c1)
+            continue;                   // range ends below c: try the next one
+        if (c < entry->c0)
+            return c;                   // c falls in the gap before this range
+        const int delta = entry->m1 - entry->c1;
+        // delta == 1: only code points with the same parity as c1 are moved
+        // (by one); presumably these are interleaved upper/lower pairs.
+        if (delta == 1)
+            return c + ((entry->c1 & 1) == (c & 1));
+        return c + delta;
+    }
+    return c;
+}
+
+// Inverse of utf8_tolower(): walks the `casefold` entries in the order given
+// by the index table `cfold_low` and undoes the offset of the entry whose
+// mapped range contains `c`. Returns `c` unchanged when no entry covers it.
+// The scan stops at the first entry whose m1 is >= c, so `cfold_low`
+// presumably orders the entries by m1 - confirm against utf8tabs.c.
+uint32_t utf8_toupper(uint32_t c) {
+    const size_t count = sizeof cfold_low / sizeof cfold_low[0];
+    for (size_t k = 0; k < count; ++k) {
+        const struct CaseFold entry = casefold[cfold_low[k]];
+        if (c > entry.m1)
+            continue;                   // mapped range ends below c
+        const int delta = entry.m1 - entry.c1;
+        if (c < (uint32_t)(entry.c0 + delta))
+            return c;                   // below the start of the mapped range
+        // delta == 1: only code points with the same parity as m1 are moved.
+        if (delta == 1)
+            return c - ((entry.m1 & 1) == (c & 1));
+        return c - delta;
+    }
+    return c;
+}
+
+// True when utf8_tolower() maps `c` to a different code point.
+bool utf8_isupper(uint32_t c) {
+    const uint32_t lowered = utf8_tolower(c);
+    return lowered != c;
+}
+
+// True when utf8_toupper() maps `c` to a different code point.
+bool utf8_islower(uint32_t c) {
+    const uint32_t raised = utf8_toupper(c);
+    return raised != c;
+}
+
+// True for the white-space code points this module recognises: the ASCII
+// blanks and controls 0x09-0x0D and 0x20, U+0085, U+00A0, U+1680, the run
+// U+2000..U+200A, U+2028, U+2029, U+202F, U+205F and U+3000.
+bool utf8_isspace(uint32_t c) {
+    switch (c) {
+    case 0x20: case 0x09: case 0x0A: case 0x0B: case 0x0C: case 0x0D:
+    case 0x85: case 0xA0:
+    case 0x1680: case 0x2028: case 0x2029: case 0x202F: case 0x205F:
+    case 0x3000:
+        return true;
+    default:
+        return c >= 0x2000 && c <= 0x200A;
+    }
+}
+
+// True for the ASCII digits '0'..'9' and for U+FF10..U+FF19.
+bool utf8_isdigit(uint32_t c) {
+    // Unsigned subtraction wraps for values below the range start, so one
+    // comparison per range covers both bounds.
+    const uint32_t ascii_offset = c - (uint32_t)'0';
+    const uint32_t wide_offset = c - 0xFF10u;
+    return ascii_offset < 10u || wide_offset < 10u;
+}
+
+// True for the hexadecimal digits 0-9, A-F, a-f and for the same characters
+// in the U+FF10.. block (U+FF10..FF19, U+FF21..FF26, U+FF41..FF46).
+bool utf8_isxdigit(uint32_t c) {
+    // Each accepted U+FFxx range sits exactly 0xFEE0 above its ASCII twin,
+    // so fold that span down and test the ASCII ranges once.
+    if (c >= 0xFF10 && c <= 0xFF46)
+        c -= 0xFEE0;
+    if (c >= 0x30 && c <= 0x39) return true;   // 0-9
+    if (c >= 0x41 && c <= 0x46) return true;   // A-F
+    return c >= 0x61 && c <= 0x66;             // a-f
+}
+
+// True when `c` is alphanumeric: isalnum() decides for c < 128; above that,
+// U+FF10..U+FF19 count as digits and anything that utf8_toupper() or
+// utf8_tolower() changes counts as a letter.
+bool utf8_isalnum(uint32_t c) {
+    if (c < 128)
+        return isalnum((int)c) != 0;
+    const bool wide_digit = (c >= 0xFF10) && (c <= 0xFF19);
+    return wide_digit || utf8_islower(c) || utf8_isupper(c);
+}
+
+// True when `c` is a letter: isalpha() decides for c < 128; above that a
+// code point counts as a letter when utf8_toupper() or utf8_tolower()
+// changes it.
+bool utf8_isalpha(uint32_t c) {
+    return (c < 128) ? (isalpha((int)c) != 0)
+                     : (utf8_islower(c) || utf8_isupper(c));
+}
+
+// Pair of case-mapping callbacks handed to cstr_casefold(): conv_asc is
+// applied to single-byte characters, conv_u8 to decoded code points.
+struct fnfold {
+    int (*conv_asc)(int);
+    uint32_t (*conv_u8)(uint32_t);
+};
+
+// Lower-casing and upper-casing callback sets.
+static struct fnfold fn_tolower = { .conv_asc = tolower, .conv_u8 = utf8_tolower };
+static struct fnfold fn_toupper = { .conv_asc = toupper, .conv_u8 = utf8_toupper };
+
+
+// Build and return a new cstr holding `self` with every character passed
+// through `fold`: single-byte characters go through fold.conv_asc, multi-byte
+// ones are decoded, mapped with fold.conv_u8 and re-encoded.
+//
+// Bytes that do not start a decodable sequence (utf8_peek() reports size 0)
+// are copied through unchanged, one byte at a time. Without that case the
+// cursor would never advance on malformed input, so the loop would not
+// terminate.
+static cstr cstr_casefold(const cstr* self, struct fnfold fold) {
+    csview sv = cstr_sv(self);
+    cstr out = cstr_null;
+    // 3/2 of the input size: presumably the worst case is a 2-byte sequence
+    // whose mapping needs 3 bytes - TODO confirm against the casefold table.
+    char *buf = cstr_reserve(&out, sv.size*3/2);
+    size_t sz = 0;
+
+    while (*sv.str) {
+        // Fresh decoder per character so an error on one byte cannot stay
+        // latched and poison the characters that follow.
+        utf8_decode_t d = {UTF8_OK};
+        utf8_peek(&d, sv.str);
+        switch (d.size) {
+        case 0:     // malformed byte: keep it verbatim and resynchronise
+            buf[sz++] = *sv.str++;
+            break;
+        case 1:     // ASCII; the unsigned char cast keeps the <ctype.h> argument valid
+            buf[sz++] = (char)fold.conv_asc((unsigned char)*sv.str++);
+            break;
+        default:    // multi-byte: utf8_encode() returns the bytes written
+            sz += utf8_encode(buf + sz, fold.conv_u8(d.codep));
+            sv.str += d.size;
+        }
+    }
+    _cstr_set_size(&out, sz);
+    cstr_shrink_to_fit(&out);
+    return out;
+}
+
+// Returns a new cstr: `self` passed through cstr_casefold() with the
+// lower-casing callbacks. `self` is not modified.
+cstr cstr_tolower(const cstr* self) {
+    cstr lowered = cstr_casefold(self, fn_tolower);
+    return lowered;
+}
+
+// Returns a new cstr: `self` passed through cstr_casefold() with the
+// upper-casing callbacks. `self` is not modified.
+cstr cstr_toupper(const cstr* self) {
+    cstr raised = cstr_casefold(self, fn_toupper);
+    return raised;
+}
+
+// Lower-cases `self` in place: builds the converted string with
+// cstr_casefold() and hands it to cstr_take().
+void cstr_lowercase(cstr* self) {
+    cstr lowered = cstr_casefold(self, fn_tolower);
+    cstr_take(self, lowered);
+}
+
+// Upper-cases `self` in place: builds the converted string with
+// cstr_casefold() and hands it to cstr_take().
+void cstr_uppercase(cstr* self) {
+    cstr raised = cstr_casefold(self, fn_toupper);
+    cstr_take(self, raised);
+}
author	Tyge Lovset <[email protected]>	2022-05-30 10:17:07 +0200
committer	Tyge Lovset <[email protected]>	2022-05-30 10:17:07 +0200
commit	b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3 (patch)
tree	8c97999b1ede5e0cf45c94b2035e94b0734dff1c
parent	831dc0843aeedcb45138a6ed576ea03f2dcd58f8 (diff)
download	STC-modified-b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3.tar.gz STC-modified-b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3.zip