summary | refs | log | tree | commit | diff | homepage
path: root/include
diff options
context:
space:
mode:
author: Tyge Løvset <[email protected]> 2022-01-12 14:28:22 +0100
committer: Tyge Løvset <[email protected]> 2022-01-12 14:28:22 +0100
commit: 705421d189f71a95650bf44d9152a3d527b544fb (patch)
tree: ef6ed58e8b1e717a1b61d0a2042c3e18e653fca8 /include
parent: d616e7db3a2325646e8647cdc433b12e9438c251 (diff)
download: STC-modified-705421d189f71a95650bf44d9152a3d527b544fb.tar.gz
download: STC-modified-705421d189f71a95650bf44d9152a3d527b544fb.zip
Fixed utf8 API and functions.
Diffstat (limited to 'include')
-rw-r--r-- include/stc/utf8.h | 59
1 file changed, 52 insertions, 7 deletions
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index f397c0d0..3041b26e 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -10,11 +10,16 @@ enum utf8_state {
};
/* number of codepoints in the utf8 string s, or SIZE_MAX if invalid utf8: */
-STC_API size_t utf8_codepoint_count(const char *s);
-STC_API size_t utf8_codepoint_count_n(const char *s, size_t n);
+STC_API size_t utf8_size(const char *s);
+STC_API size_t utf8_size_n(const char *s, size_t n);
+STC_API const char* utf8_at(const char *s, size_t index); /* pointer just past the first `index` codepoints of s, or to its NUL terminator if s is shorter */
+
/* decode next utf8 codepoint. */
STC_API uint32_t utf8_decode(uint32_t *state, uint32_t *codep, const uint32_t byte);
+STC_INLINE bool utf8_valid(const char* str) /* true unless utf8_size(str) reports SIZE_MAX, i.e. invalid utf8 */
+ { return utf8_size(str) != SIZE_MAX; }
+
STC_INLINE uint32_t utf8_peek(const char *s)
{
uint32_t state = 0, codepoint;
@@ -41,10 +46,38 @@ STC_INLINE const char *utf8_next(const char *s)
return (const char *)p;
}
+#ifdef CSVIEW_H_INCLUDED
+STC_INLINE size_t csview_size_utf8(csview sv) /* number of codepoints inside the view, or SIZE_MAX if it is not valid utf8 */
+ { return utf8_size_n(sv.str, sv.size); } /* bounded by the view length: sv.str may continue past the view (assumes the length field is named `size` - confirm) */
+
+STC_INLINE bool csview_valid_utf8(csview sv) /* true if the bytes inside the view decode as valid utf8 */
+ { return utf8_size_n(sv.str, sv.size) != SIZE_MAX; } /* bounded by the view length: sv.str may continue past the view (assumes the length field is named `size` - confirm) */
+
+STC_INLINE csview csview_from_utf8cp(const char* str) /* view that starts at str; second field is utf8_codepoint_width() of the first byte (presumably the byte length of one codepoint - confirm) */
+ { return c_make(csview){str, utf8_codepoint_width((uint8_t)str[0])}; }
+
+STC_INLINE csview csview_at_utf8(const char* str, size_t idx) /* csview_from_utf8cp() applied at utf8_at(str, idx); if str has fewer than idx codepoints that position is the NUL terminator */
+ { return csview_from_utf8cp(utf8_at(str, idx)); }
+#endif
+
+#ifdef CSTR_H_INCLUDED
+STC_INLINE size_t cstr_size_utf8(cstr s) /* utf8_size() of the string returned by cstr_str(); s is received by value */
+ { return utf8_size(cstr_str(&s)); }
+
+STC_INLINE bool cstr_valid_utf8(cstr s) /* utf8_valid() of the string returned by cstr_str(); s is received by value */
+ { return utf8_valid(cstr_str(&s)); }
+
+#endif
+
// --------------------------- IMPLEMENTATION ---------------------------------
-#ifdef _i_implement
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
-STC_DEF const uint8_t utf8_table[] = {
+#ifdef _i_implement
+#ifdef _i_static
+static
+#endif
+const uint8_t utf8_table[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -73,24 +106,36 @@ STC_DEF uint32_t utf8_decode(uint32_t *state, uint32_t *codep,
}
-STC_DEF size_t utf8_codepoint_count(const char *s)
+STC_DEF size_t utf8_size(const char *s) /* number of codepoints in the NUL-terminated string s, or SIZE_MAX if invalid utf8 */
{
uint32_t state = 0, codepoint;
size_t size = 0;
+ /* size grows by one for each byte on which utf8_decode() returns 0; a nonzero final state turns the result into all-ones */
while (*s)
size += !utf8_decode(&state, &codepoint, (uint8_t)*s++);
return size | (size_t) -(state != 0);
}
-STC_DEF size_t utf8_codepoint_count_n(const char *s, size_t n)
+STC_DEF size_t utf8_size_n(const char *s, size_t n) /* like utf8_size(), but examines at most the first n bytes of s; SIZE_MAX if they end in a nonzero decoder state */
{
uint32_t state = 0, codepoint;
size_t size = 0;
- while (n-- && *s)
+ /* short-circuit && is required: once the n-byte budget is spent, s[n] must not be read (it may lie outside the caller's buffer) */
+ while ((n-- != 0) && (*s != 0))
size += !utf8_decode(&state, &codepoint, (uint8_t)*s++);
return size | (size_t) -(state != 0);
}
+STC_DEF const char* utf8_at(const char *s, size_t index) /* advance s past its first `index` codepoints; stops early at the NUL terminator */
+{
+ uint32_t state = 0, codepoint;
+ /* '&' evaluates both tests on every pass; reading *s is always in bounds because s never moves past the NUL terminator */
+ for (size_t size = 0; (size < index) & (*s != 0); ++s)
+ size += !utf8_decode(&state, &codepoint, (uint8_t)*s);
+ return s;
+}
+
+
#endif
#endif
#undef i_opt