UTF-8 fixes and improvements. Some API changes.

author: Tyge Løvset <[email protected]> 2022-06-10 11:29:17 +0200
committer: Tyge Løvset <[email protected]> 2022-06-10 11:29:17 +0200
commit: 8883fc8108428878d3d6291ba8981cf6df72499c (patch)
tree: 9fbdc79019501714dc984c1fbd5eb2c7ea979bb4 /include
parent: f1d09dfcc7570e69eb6e9688b736f7b031b22b2d (diff)
download: STC-modified-8883fc8108428878d3d6291ba8981cf6df72499c.tar.gz STC-modified-8883fc8108428878d3d6291ba8981cf6df72499c.zip
3 files changed, 22 insertions, 18 deletions
diff --git a/include/stc/cstr.h b/include/stc/cstr.h
index 2dc2ccae..41db1cd3 100644
--- a/include/stc/cstr.h
+++ b/include/stc/cstr.h
@@ -189,21 +189,24 @@ STC_INLINE size_t cstr_u8size(cstr s)
 STC_INLINE size_t cstr_u8size_n(cstr s, size_t nbytes) 
     { return utf8_size_n(cstr_str(&s), nbytes); }
 
-STC_INLINE csview cstr_view_at(const cstr* self, size_t u8idx) {
+STC_INLINE size_t cstr_bytepos(const cstr* self, size_t u8idx)
+    { return utf8_pos(cstr_str(self), u8idx); }
+
+STC_INLINE const char* cstr_at(const cstr* self, size_t u8idx) 
+    { return utf8_at(cstr_str(self), u8idx); }
+
+STC_INLINE csview cstr_chr(const cstr* self, size_t u8idx) {
     csview sv = cstr_sv(self);
     sv.str = utf8_at(sv.str, u8idx);
-    sv.size = utf8_codep_size(sv.str);
+    sv.size = utf8_chr_size(sv.str);
     return sv;
 }
 
-STC_INLINE const char* cstr_at(const cstr* self, size_t u8idx) 
-    { return utf8_at(cstr_str(self), u8idx); }
-
 // utf8 iterator
 
 STC_INLINE cstr_iter cstr_begin(const cstr* self) { 
     const char* str = cstr_str(self);
-    return c_make(cstr_iter){.chr = {str, utf8_codep_size(str)}};
+    return c_make(cstr_iter){.chr = {str, utf8_chr_size(str)}};
 }
 STC_INLINE cstr_iter cstr_end(const cstr* self) {
     csview sv = cstr_sv(self);
@@ -211,7 +214,7 @@ STC_INLINE cstr_iter cstr_end(const cstr* self) {
 }
 STC_INLINE void cstr_next(cstr_iter* it) {
     it->ref += it->chr.size;
-    it->chr.size = utf8_codep_size(it->ref);
+    it->chr.size = utf8_chr_size(it->ref);
 }
 
 
diff --git a/include/stc/csview.h b/include/stc/csview.h
index 6cfd6e82..6d12901b 100644
--- a/include/stc/csview.h
+++ b/include/stc/csview.h
@@ -81,25 +81,25 @@ STC_INLINE csview csview_slice(csview sv, size_t p1, size_t p2) {
 
 /* iterator */
 STC_INLINE csview_iter csview_begin(const csview* self)
-    { return c_make(csview_iter){.chr = {self->str, utf8_codep_size(self->str)}}; }
+    { return c_make(csview_iter){.chr = {self->str, utf8_chr_size(self->str)}}; }
 
 STC_INLINE csview_iter csview_end(const csview* self)
     { return c_make(csview_iter){self->str + self->size}; }
 
 STC_INLINE void csview_next(csview_iter* it)
-    { it->ref += it->chr.size; it->chr.size = utf8_codep_size(it->ref); }
+    { it->ref += it->chr.size; it->chr.size = utf8_chr_size(it->ref); }
 
 /* utf8 */
-STC_INLINE size_t csview_size_u8(csview sv)
+STC_INLINE size_t csview_u8size(csview sv)
     { return utf8_size_n(sv.str, sv.size); }
 
-STC_INLINE csview csview_substr_u8(csview sv, size_t u8pos, size_t u8len) {
+STC_INLINE csview csview_u8substr(csview sv, size_t u8pos, size_t u8len) {
     sv.str = utf8_at(sv.str, u8pos);
     sv.size = utf8_pos(sv.str, u8len);
     return sv;
 }
 
-STC_INLINE bool csview_valid_u8(csview sv) // depends on src/utf8code.c
+STC_INLINE bool csview_valid_utf8(csview sv) // depends on src/utf8code.c
     { return utf8_valid_n(sv.str, sv.size); }
 
 
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index b7edd2cb..31ea3aa9 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -37,7 +37,7 @@ uint32_t    utf8_casefold(uint32_t c);
 uint32_t    utf8_tolower(uint32_t c);
 uint32_t    utf8_toupper(uint32_t c);
 bool        utf8_valid(const char* s);
-bool        utf8_valid_n(const char* s, size_t n);
+bool        utf8_valid_n(const char* s, size_t nbytes);
 int         utf8_icmp_n(size_t u8max, const char* s1, size_t n1,
                                       const char* s2, size_t n2);
 unsigned    utf8_encode(char *out, uint32_t c);
@@ -60,7 +60,7 @@ STC_INLINE int utf8_icmp(const char* s1, const char* s2) {
 }
 
 /* number of characters in the utf8 codepoint from s */
-STC_INLINE unsigned utf8_codep_size(const char *s) {
+STC_INLINE unsigned utf8_chr_size(const char *s) {
     unsigned b = (uint8_t)*s;
     if (b < 0x80) return 1;
     if (b < 0xC2) return 0;
@@ -74,14 +74,15 @@ STC_INLINE unsigned utf8_codep_size(const char *s) {
 STC_INLINE size_t utf8_size(const char *s) {
     size_t size = 0;
     while (*s)
-        size += (*s++ & 0xC0) != 0x80;
+        size += (*++s & 0xC0) != 0x80;
     return size;
 }
 
-STC_INLINE size_t utf8_size_n(const char *s, size_t n) {
+STC_INLINE size_t utf8_size_n(const char *s, size_t nbytes) {
     size_t size = 0;
-    while ((n-- != 0) & (*s != 0))
-        size += (*s++ & 0xC0) != 0x80;
+    while ((nbytes-- != 0) & (*s != 0)) {
+        size += (*++s & 0xC0) != 0x80;
+    }
     return size;
 }
author	Tyge Løvset <[email protected]>	2022-06-10 11:29:17 +0200
committer	Tyge Løvset <[email protected]>	2022-06-10 11:29:17 +0200
commit	8883fc8108428878d3d6291ba8981cf6df72499c (patch)
tree	9fbdc79019501714dc984c1fbd5eb2c7ea979bb4 /include
parent	f1d09dfcc7570e69eb6e9688b736f7b031b22b2d (diff)
download	STC-modified-8883fc8108428878d3d6291ba8981cf6df72499c.tar.gz STC-modified-8883fc8108428878d3d6291ba8981cf6df72499c.zip