diff options
| author | Tyge Løvset <[email protected]> | 2022-01-30 15:41:16 +0100 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-01-30 15:41:16 +0100 |
| commit | ba3f9c10466bf724cada8713cd2934bdc56bb26e (patch) | |
| tree | 1d23c17858ac8651a20611d1037b92deaaeb4da8 | |
| parent | df626f83dbf7789d622a1c5f4ff9fec3f61bc61b (diff) | |
| download | STC-modified-ba3f9c10466bf724cada8713cd2934bdc56bb26e.tar.gz STC-modified-ba3f9c10466bf724cada8713cd2934bdc56bb26e.zip | |
Replaced the implementation of utf8_decode() and its related functions.
| -rw-r--r-- | docs/csview_api.md | 2 | ||||
| -rw-r--r-- | examples/regex1.c | 2 | ||||
| -rw-r--r-- | include/stc/csview.h | 4 | ||||
| -rw-r--r-- | include/stc/utf8.h | 119 |
4 files changed, 59 insertions, 68 deletions
diff --git a/docs/csview_api.md b/docs/csview_api.md index defc29fd..75e50373 100644 --- a/docs/csview_api.md +++ b/docs/csview_api.md @@ -69,7 +69,7 @@ size_t utf8_pos(const char* s, size_t index); // from UTF8 ind const char* utf8_next(const char *s); // next codepoint as char*; NULL if *s == 0 uint32_t utf8_peek(const char *s); // next codepoint as uint32_t -size_t utf8_codepoint_size(const char* s); // 1-4 (0 if s[0] is illegal first cp char) +size_t utf8_codep_size(const char* s); // 1-4 (0 if s[0] is illegal first cp char) uint32_t utf8_decode(uint32_t *state, uint32_t *codep, const uint32_t byte); // decode next utf8 codepoint. ``` diff --git a/examples/regex1.c b/examples/regex1.c index e24e5400..02d0f5f4 100644 --- a/examples/regex1.c +++ b/examples/regex1.c @@ -10,7 +10,7 @@ int main(int argc, char* argv[]) c_auto (cstr, input) c_auto (cregex, float_expr) { - float_expr = cregex_new("[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)"); + float_expr = cregex_new("[+-]?[0-9]+(\\.[0-9]*)?|\\.[0-9]+"); // Until "q" is given, ask for another number while (true) { diff --git a/include/stc/csview.h b/include/stc/csview.h index 7370336a..2979b2da 100644 --- a/include/stc/csview.h +++ b/include/stc/csview.h @@ -66,11 +66,11 @@ STC_INLINE bool csview_ends_with(csview sv, csview sub) { if (sub.size > sv.size) return false;
return !memcmp(sv.str + sv.size - sub.size, sub.str, sub.size); }
STC_INLINE csview_iter csview_begin(const csview* self)
- { return c_make(csview_iter){.cp = {self->str, utf8_codepoint_size(self->str)}}; }
+ { return c_make(csview_iter){.cp = {self->str, utf8_codep_size(self->str)}}; }
STC_INLINE csview_iter csview_end(const csview* self)
{ return c_make(csview_iter){self->str + self->size}; }
STC_INLINE void csview_next(csview_iter* it)
- { it->ref += it->cp.size; it->cp.size = utf8_codepoint_size(it->ref); }
+ { it->ref += it->cp.size; it->cp.size = utf8_codep_size(it->ref); }
/* utf8 */
STC_INLINE bool utf8_valid_sv(csview sv)
diff --git a/include/stc/utf8.h b/include/stc/utf8.h index e84a9b7e..c72534ee 100644 --- a/include/stc/utf8.h +++ b/include/stc/utf8.h @@ -24,19 +24,17 @@ int main() #include "ccommon.h"
#include <ctype.h>
-enum utf8_state {
- utf8_ACCEPT = 0,
- utf8_REJECT = 12
-};
-
/* number of codepoints in the utf8 string s, or SIZE_MAX if invalid utf8: */
+enum { UTF8_OK = 0, UTF8_ERROR = 4 };
+typedef struct { uint32_t state, codep, len; } utf8_decode_t;
+
+/* decode next utf8 codepoint. */
+STC_API uint32_t utf8_decode(utf8_decode_t *c, const uint8_t b);
+STC_API const uint8_t* utf8_nextc(utf8_decode_t *c, const uint8_t* s);
STC_API size_t utf8_size(const char *s);
STC_API size_t utf8_size_n(const char *s, size_t n);
STC_API const char* utf8_at(const char *s, size_t index);
-/* decode next utf8 codepoint. */
-STC_API uint32_t utf8_decode(uint32_t *state, uint32_t *codep, const uint32_t byte);
-
STC_INLINE size_t utf8_pos(const char* s, size_t index)
{ return utf8_at(s, index) - s; }
@@ -44,89 +42,82 @@ STC_INLINE bool utf8_valid(const char* s) { return utf8_size(s) != SIZE_MAX; }
STC_INLINE uint32_t utf8_peek(const char *s) {
- uint32_t state = 0, codepoint;
- utf8_decode(&state, &codepoint, (uint8_t)*s);
- return codepoint;
+ utf8_decode_t ctx = {UTF8_OK, 0};
+ utf8_nextc(&ctx, (const uint8_t*)s);
+ return ctx.codep;
}
-STC_INLINE size_t utf8_codepoint_size(const char* s) {
- uint8_t u = *(const uint8_t *)s;
- size_t ret = (u & 0xF0) == 0xE0;
- ret += ret << 1; // 3
- ret |= u < 0x80; // 1
- ret |= ((0xC1 < u) & (u < 0xE0)) << 1; // 2
- ret |= ((0xEF < u) & (u < 0xF5)) << 2; // 4
- return ret;
+STC_INLINE size_t utf8_codep_size(const char *s) {
+ utf8_decode_t ctx = {UTF8_OK, 0};
+ utf8_nextc(&ctx, (const uint8_t*)s);
+ return ctx.len;
}
-STC_INLINE const char *utf8_next(const char *s) {
- const char* t = s + utf8_codepoint_size(s);
-
- uintptr_t p = (uintptr_t)t;
- p &= (uintptr_t) -(*s != 0);
- return (const char *)p;
+STC_INLINE const char* utf8_next(const char *s) {
+ if (!*s) return NULL;
+ utf8_decode_t ctx = {UTF8_OK, 0};
+ return (const char*) utf8_nextc(&ctx, (const uint8_t*)s);
}
// --------------------------- IMPLEMENTATION ---------------------------------
-// Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
-// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
#ifdef _i_implement
+// https://news.ycombinator.com/item?id=15423674
+// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc
-static const uint8_t utf8_table[] = {
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
- 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
- 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
- 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
- 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
- 12,36,12,12,12,12,12,12,12,12,12,12,
-};
-
-STC_DEF uint32_t utf8_decode(uint32_t *state, uint32_t *codep,
- const uint32_t byte)
+STC_DEF uint32_t utf8_decode(utf8_decode_t *c, const uint8_t b)
{
- const uint32_t type = utf8_table[byte];
- const uint32_t x = (uint32_t) -(*state != 0);
-
- *codep = (x & ((byte & 0x3fu) | (*codep << 6)))
- | (~x & ((0xff >> type) & byte));
-
- return *state = utf8_table[256 + *state + type];
+ switch (c->state) {
+ case UTF8_OK:
+ if (b < 0x80) c->codep = b, c->len = 1;
+ else if (b < 0xc2) c->state = UTF8_ERROR;
+ else if (b < 0xe0) c->state = 1, c->codep = b & 0x1f, c->len = 2;
+ else if (b < 0xf0) c->state = 2, c->codep = b & 0xf, c->len = 3;
+ else if (b < 0xf5) c->state = 3, c->codep = b & 0x7, c->len = 4;
+ else c->state = UTF8_ERROR;
+ break;
+ case 1: case 2: case 3:
+ if ((b & 0xc0) == 0x80) {
+ c->state -= 1;
+ c->codep = (c->codep << 6) | (b & 0x3f);
+ } else
+ c->state = UTF8_ERROR;
+ }
+ return c->state;
}
+STC_DEF const uint8_t* utf8_nextc(utf8_decode_t *c, const uint8_t* s) {
+ utf8_decode(c, *s++);
+ switch (c->len) {
+ case 4: utf8_decode(c, *s++);
+ case 3: utf8_decode(c, *s++);
+ case 2: utf8_decode(c, *s++);
+ }
+ return s;
+}
STC_DEF size_t utf8_size(const char *s)
{
- uint32_t state = 0, codepoint;
+ utf8_decode_t ctx = {UTF8_OK, 0};
size_t size = 0;
-
while (*s)
- size += !utf8_decode(&state, &codepoint, (uint8_t)*s++);
- return size | (size_t) -(state != 0);
+ size += !utf8_decode(&ctx, (uint8_t)*s++);
+ return !ctx.state ? size : SIZE_MAX;
}
STC_DEF size_t utf8_size_n(const char *s, size_t n)
{
- uint32_t state = 0, codepoint;
+ utf8_decode_t ctx = {UTF8_OK, 0};
size_t size = 0;
-
while ((n-- != 0) & (*s != 0))
- size += !utf8_decode(&state, &codepoint, (uint8_t)*s++);
- return size | (size_t) -(state != 0);
+ size += !utf8_decode(&ctx, (uint8_t)*s++);
+ return !ctx.state ? size : SIZE_MAX;
}
STC_DEF const char* utf8_at(const char *s, size_t index)
{
- uint32_t state = 0, codepoint;
-
- for (size_t k = 0; (k < index) & (*s != 0); ++s)
- k += !utf8_decode(&state, &codepoint, (uint8_t)*s);
+ utf8_decode_t ctx = {UTF8_OK, 0};
+ for (size_t i = 0; (i < index) & (*s != 0); ++s)
+ i += !utf8_decode(&ctx, (uint8_t)*s);
return s;
}
|
