summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Tyge Løvset <[email protected]> 2022-06-09 16:59:52 +0200
committer Tyge Løvset <[email protected]> 2022-06-09 16:59:52 +0200
commit d463acdbee5bb3a9509cb8414602f495408583b4 (patch)
tree d037f43f3f3a5a2466054419cb6d6101508602a0
parent 3ee36759b8567a72a8349c312fc7dbe975de9e02 (diff)
download STC-modified-d463acdbee5bb3a9509cb8414602f495408583b4.tar.gz
STC-modified-d463acdbee5bb3a9509cb8414602f495408583b4.zip
Added utf8_peek()
-rw-r--r-- include/stc/utf8.h 3
-rw-r--r-- src/utf8code.c 10
2 files changed, 12 insertions, 1 deletions
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index 53d6cf40..b7edd2cb 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -41,6 +41,7 @@ bool utf8_valid_n(const char* s, size_t n);
int utf8_icmp_n(size_t u8max, const char* s1, size_t n1,
const char* s2, size_t n2);
unsigned utf8_encode(char *out, uint32_t c);
+uint32_t utf8_peek(const char *s, int pos);
/* decode next utf8 codepoint. https://bjoern.hoehrmann.de/utf-8/decoder/dfa */
typedef struct { uint32_t state, codep; } utf8_decode_t;
@@ -90,7 +91,7 @@ STC_INLINE const char* utf8_at(const char *s, size_t index) {
return s;
}
-STC_INLINE size_t utf8_pos(const char* s, size_t index)
+STC_INLINE size_t utf8_pos(const char* s, size_t index)
{ return utf8_at(s, index) - s; }
#endif
diff --git a/src/utf8code.c b/src/utf8code.c
index 9613ba95..1c1e4336 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -45,6 +45,16 @@ unsigned utf8_encode(char *out, uint32_t c)
return 0;
}
+uint32_t utf8_peek(const char* s, int pos) { /* Decode the code point located `pos` code points away from s; a negative pos walks backward. */
+ int inc = 1; /* byte step: +1 forward, -1 backward */
+ if (pos < 0) pos = -pos, inc = -1; /* NOTE(review): -pos overflows when pos == INT_MIN */
+ while (pos) /* NOTE(review): no bound in either direction; the scan does not stop at a NUL byte */
+ pos -= (*(s += inc) & 0xC0) != 0x80; /* step one byte first, then count it only if it is not a 10xxxxxx continuation byte; the byte at the initial s is never tested */
+ utf8_decode_t d = {.state=0};
+ do { utf8_decode(&d, (uint8_t)*s++); } while (d.state); /* feed bytes until state is 0 again; NOTE(review): presumably never ends if utf8_decode stays in a nonzero reject state on malformed input -- confirm */
+ return d.codep; /* code point field of the decoder struct */
+}
+
bool utf8_valid(const char* s) {
utf8_decode_t d = {.state=0};
while (*s)