diff options
| author | Tyge Løvset <[email protected]> | 2022-06-09 16:59:52 +0200 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-06-09 16:59:52 +0200 |
| commit | d463acdbee5bb3a9509cb8414602f495408583b4 (patch) | |
| tree | d037f43f3f3a5a2466054419cb6d6101508602a0 | |
| parent | 3ee36759b8567a72a8349c312fc7dbe975de9e02 (diff) | |
| download | STC-modified-d463acdbee5bb3a9509cb8414602f495408583b4.tar.gz STC-modified-d463acdbee5bb3a9509cb8414602f495408583b4.zip | |
Added utf8_peek()
| -rw-r--r-- | include/stc/utf8.h | 3 | ||||
| -rw-r--r-- | src/utf8code.c | 10 |
2 files changed, 12 insertions, 1 deletion
diff --git a/include/stc/utf8.h b/include/stc/utf8.h index 53d6cf40..b7edd2cb 100644 --- a/include/stc/utf8.h +++ b/include/stc/utf8.h @@ -41,6 +41,7 @@ bool utf8_valid_n(const char* s, size_t n); int utf8_icmp_n(size_t u8max, const char* s1, size_t n1, const char* s2, size_t n2); unsigned utf8_encode(char *out, uint32_t c); +uint32_t utf8_peek(const char *s, int pos); /* decode next utf8 codepoint. https://bjoern.hoehrmann.de/utf-8/decoder/dfa */ typedef struct { uint32_t state, codep; } utf8_decode_t; @@ -90,7 +91,7 @@ STC_INLINE const char* utf8_at(const char *s, size_t index) { return s; } -STC_INLINE size_t utf8_pos(const char* s, size_t index) +STC_INLINE size_t utf8_pos(const char* s, size_t index) { return utf8_at(s, index) - s; } #endif diff --git a/src/utf8code.c b/src/utf8code.c index 9613ba95..1c1e4336 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -45,6 +45,16 @@ unsigned utf8_encode(char *out, uint32_t c) return 0; } +uint32_t utf8_peek(const char* s, int pos) { + int inc = 1; + if (pos < 0) pos = -pos, inc = -1; + while (pos) + pos -= (*(s += inc) & 0xC0) != 0x80; + utf8_decode_t d = {.state=0}; + do { utf8_decode(&d, (uint8_t)*s++); } while (d.state); + return d.codep; +} + bool utf8_valid(const char* s) { utf8_decode_t d = {.state=0}; while (*s) |
