diff options
| author | Tyge Løvset <[email protected]> | 2022-01-08 23:13:32 +0100 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-01-08 23:13:32 +0100 |
| commit | 63562f5135243ac2a2553b4e7360c59e86686d6f (patch) | |
| tree | 439a3853a6bfded32a3031b577457bfba470f1a7 /include/stc/cregex.h | |
| parent | 0a9910eee6582e6ee414071a0d5e7062448989cf (diff) | |
| download | STC-modified-63562f5135243ac2a2553b4e7360c59e86686d6f.tar.gz STC-modified-63562f5135243ac2a2553b4e7360c59e86686d6f.zip | |
Moved utf8 from cregex.h to a separate file. Split csview.h into another file, strings.h.
Diffstat (limited to 'include/stc/cregex.h')
| -rw-r--r-- | include/stc/cregex.h | 161 |
1 files changed, 41 insertions, 120 deletions
diff --git a/include/stc/cregex.h b/include/stc/cregex.h index eafefc4c..6259576e 100644 --- a/include/stc/cregex.h +++ b/include/stc/cregex.h @@ -22,11 +22,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "ccommon.h" - #ifndef CREGEX_INCLUDED #define CREGEX_INCLUDED +#include "ccommon.h" +#include "utf8.h" #include <stdlib.h> #include <setjmp.h> #include <stdarg.h> @@ -56,9 +56,6 @@ typedef enum { cregex_UNCLOSED_SUBEXPRESSION, } cregex_error_t; -/* check if a given string is valid utf8 */ -STC_API bool cregex_valid_utf8(const char *s); - /* create an empty expression */ STC_INLINE cregex cregex_init(void) { cregex re = {NULL}; return re; } @@ -92,82 +89,6 @@ STC_API void cregex_drop(cregex *re); /* -------------------------- IMPLEMENTATION ------------------------- */ #if defined(_i_implement) -enum { - _rx_UTF8_ACCEPT = 0, - _rx_UTF8_REJECT = 1 -}; - -static const uint8_t _rx_utf8d[] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df - 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef - 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff - 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 - 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 - 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 - 
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 -}; - -static inline uint32_t _rx_utf8_decode(uint32_t *state, uint32_t *codep, - const uint32_t byte) -{ - const uint32_t type = _rx_utf8d[byte]; - const uint32_t x = (uint32_t) -(*state != _rx_UTF8_ACCEPT); - - *codep = (x & ((byte & 0x3fu) | (*codep << 6))) - | (~x & ((0xff >> type) & (byte))); - - *state = _rx_utf8d[256 + (*state << 4) + type]; - return *state; -} - -static bool _rx_utf8_count_codepoints(size_t *count, const uint8_t *s) -{ - uint32_t state = _rx_UTF8_ACCEPT, codepoint; - - for (*count = 0; *s; ++s) - *count += !_rx_utf8_decode(&state, &codepoint, *s); - return state == _rx_UTF8_ACCEPT; -} - -STC_DEF bool cregex_valid_utf8(const char *s) -{ - size_t count; - bool valid = _rx_utf8_count_codepoints(&count, (const uint8_t *)s); - return valid; -} - -static inline uint32_t _rx_utf8_peek(const char *s) -{ - uint32_t state = _rx_UTF8_ACCEPT, codepoint; - _rx_utf8_decode(&state, &codepoint, (uint8_t)s[0]); - return codepoint; -} - -static inline uint32_t _rx_utf8_char_width(uint8_t c) -{ - uint32_t ret = ((c & 0xF0) == 0xE0); - ret += (ret << 1); // 3 - ret |= (c < 0x80); // 1 - ret |= ((c & 0xE0) == 0xC0) << 1; // 2 - ret |= ((c & 0xF8) == 0xF0) << 2; // 4 - return ret; -} - -static inline const char *_rx_utf8_next(const char *s) -{ - const char* t = s + _rx_utf8_char_width((uint8_t)s[0]); - - uintptr_t p = (uintptr_t)t; - p &= (uintptr_t) -(*s != 0); - return (const char *)p; -} /* function pointer type used to evaluate if a regex node * matched a given string */ @@ -243,8 +164,8 @@ static bool _rx_char_is_match(cregex_node *node, const char *orig, const char *c return false; } - *next = _rx_utf8_next(cur); - return node->chr.chr == _rx_utf8_peek(cur); + *next = utf8_next(cur); + return node->chr.chr == utf8_peek(cur); } static bool _rx_start_is_match(cregex_node *node, const char *orig, const char *cur, @@ -272,7 +193,7 @@ static bool _rx_any_is_match(cregex_node *node, const 
char *orig, const char *cu const char **next) { if (*cur) { - *next = _rx_utf8_next(cur); + *next = utf8_next(cur); return true; } @@ -305,8 +226,8 @@ static bool _rx_class_is_match(cregex_node *node, const char *orig, const char * if (*cur == 0) return false; - const uint32_t chr = _rx_utf8_peek(cur); - *next = _rx_utf8_next(cur); + const uint32_t chr = utf8_peek(cur); + *next = utf8_next(cur); bool found = false; for (_rx_RangeNode *range = cls->ranges; range != NULL; @@ -362,7 +283,7 @@ struct { } _rx_CompileException; /* set global error value to the default value */ -static inline void clear_compile_exception(void) +static inline void _rx_clear_compile_exception(void) { _rx_CompileException.err = cregex_OK; _rx_CompileException.s = NULL; @@ -381,8 +302,8 @@ static size_t _rx_calc_compiled_escaped_len(const char *s, const char **leftover if (*s == 0) _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, s); - const uint32_t chr = _rx_utf8_peek(s); - *leftover = _rx_utf8_next(s); + const uint32_t chr = utf8_peek(s); + *leftover = utf8_next(s); switch (chr) { case 's': @@ -409,7 +330,7 @@ static size_t _rx_calc_compiled_escaped_len(const char *s, const char **leftover } static const size_t _rx_calc_compiled_class_len(const char *s, - const char **leftover) + const char **leftover) { if (*s == '^') s++; @@ -417,19 +338,19 @@ static const size_t _rx_calc_compiled_class_len(const char *s, size_t ret = 1; while (*s && *s != ']') { - uint32_t chr = _rx_utf8_peek(s); - s = _rx_utf8_next(s); + uint32_t chr = utf8_peek(s); + s = utf8_next(s); if (chr == '\\') { - s = _rx_utf8_next(s); + s = utf8_next(s); } if (*s == '-' && s[1] != ']') { s++; - chr = _rx_utf8_peek(s); - s = _rx_utf8_next(s); + chr = utf8_peek(s); + s = utf8_next(s); if (chr == '\\') - s = _rx_utf8_next(s); + s = utf8_next(s); } ret++; @@ -452,9 +373,9 @@ static const size_t _rx_calc_compiled_len(const char *s) if (*s == 0) { return 1; } else { - const uint32_t chr = _rx_utf8_peek(s); + const uint32_t chr = 
utf8_peek(s); size_t ret = 0; - s = _rx_utf8_next(s); + s = utf8_next(s); switch (chr) { case '{': { @@ -515,12 +436,12 @@ static size_t _rx_parse_digit(const char *s, const char **leftover) size_t ret = 0; while (*s) { - uint32_t chr = _rx_utf8_peek(s); + uint32_t chr = utf8_peek(s); if (_rx_is_digit(chr)) { ret *= 10; ret += chr - '0'; - s = _rx_utf8_next(s); + s = utf8_next(s); } else { break; } @@ -538,7 +459,7 @@ static void _rx_parse_complex_quant(const char *re, const char **leftover, if (*re == 0) _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re); - uint32_t tmp = _rx_utf8_peek(re); + uint32_t tmp = utf8_peek(re); size_t min = 0, max = SIZE_MAX; if (_rx_is_digit(tmp)) { @@ -547,11 +468,11 @@ static void _rx_parse_complex_quant(const char *re, const char **leftover, _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re); } - tmp = _rx_utf8_peek(re); + tmp = utf8_peek(re); if (tmp == ',') { - re = _rx_utf8_next(re); - if (_rx_is_digit(_rx_utf8_peek(re))) + re = utf8_next(re); + if (_rx_is_digit(utf8_peek(re))) max = _rx_parse_digit(re, &re); else max = SIZE_MAX; @@ -559,7 +480,7 @@ static void _rx_parse_complex_quant(const char *re, const char **leftover, max = min; } - tmp = _rx_utf8_peek(re); + tmp = utf8_peek(re); if (tmp == '}') { *leftover = re + 1; *min_p = min; @@ -614,8 +535,8 @@ static cregex_node *_rx_compile_next_escaped(const char *re, const char **leftov if (*re == 0) _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, re); - const uint32_t chr = _rx_utf8_peek(re); - *leftover = _rx_utf8_next(re); + const uint32_t chr = utf8_peek(re); + *leftover = utf8_next(re); cregex_node *ret = cur + 1; switch (chr) { @@ -694,21 +615,21 @@ static cregex_node *_rx_compile_next_complex_class(const char *re, while (*re && *re != ']') { uint32_t first = 0, last = 0; - first = _rx_utf8_peek(re); - re = _rx_utf8_next(re); + first = utf8_peek(re); + re = utf8_next(re); if (first == '\\') { if (*re == 0) _rx_throw_compile_exception( 
cregex_INVALID_COMPLEX_CLASS, re); - first = _rx_utf8_peek(re); - re = _rx_utf8_next(re); + first = utf8_peek(re); + re = utf8_next(re); } if (*re == '-' && re[1] != ']' && re[1]) { re++; - last = _rx_utf8_peek(re); - re = _rx_utf8_next(re); + last = utf8_peek(re); + re = utf8_next(re); if (last == '\\') { if (*re == 0) @@ -716,8 +637,8 @@ static cregex_node *_rx_compile_next_complex_class(const char *re, cregex_INVALID_COMPLEX_CLASS, re); - last = _rx_utf8_peek(re); - re = _rx_utf8_next(re); + last = utf8_peek(re); + re = utf8_next(re); } } else { last = first; @@ -813,8 +734,8 @@ static cregex_node *_rx_compile_next(const char *re, const char **leftover, if (*re == 0) return NULL; - const uint32_t chr = _rx_utf8_peek(re); - re = _rx_utf8_next(re); + const uint32_t chr = utf8_peek(re); + re = utf8_next(re); cregex_node *next = cur + 1; switch (chr) { @@ -908,13 +829,13 @@ STC_DEF cregex cregex_new(const char *re) { cregex ret = {NULL}; - clear_compile_exception(); + _rx_clear_compile_exception(); if (re == NULL) { _rx_CompileException.err = cregex_INVALID_PARAMS; return ret; } - if (!cregex_valid_utf8(re)) { + if (!utf8_is_valid(re)) { _rx_CompileException.err = cregex_INVALID_UTF8; _rx_CompileException.s = NULL; return ret; @@ -952,7 +873,7 @@ STC_DEF bool cregex_find(cregex re, const char *s, cregex_match *m) m->start = SIZE_MAX; m->end = SIZE_MAX; - for (const char *tmp_s = s; *tmp_s; tmp_s = _rx_utf8_next(tmp_s)) { + for (const char *tmp_s = s; *tmp_s; tmp_s = utf8_next(tmp_s)) { const char *next = NULL; if (_rx_is_match(re.nodes, s, tmp_s, &next)) { m->start = tmp_s - s; |
