summaryrefslogtreecommitdiffhomepage
path: root/include/stc/utf8.h
blob: 50fbefc75413ecfc0fad614bdd9189000359bdf9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#ifndef UTF8_H_INCLUDED
#define UTF8_H_INCLUDED
/*
// Example:
#include <stc/cstr.h>
#include <stc/csview.h>

int main()
{
    c_auto (cstr, s1) {
        s1 = cstr_new("hell😀 w😀rld");
        printf("%s\n", cstr_str(&s1));
        cstr_replace_sv(&s1, utf8_substr(cstr_str(&s1), 7, 1), c_sv("🐨"));
        printf("%s\n", cstr_str(&s1));

        csview sv = csview_from_s(s1);
        c_foreach (i, csview, sv)
            printf("%" c_PRIsv ",", c_ARGsv(i.codep));
    }
}
// Output:
// hell😀 w😀rld
// hell😀 w🐨rld
// h,e,l,l,😀, ,w,🐨,r,l,d,
*/
#include "ccommon.h"
#include <ctype.h>

/* Decoder state values and the decoder object used by the utf8_* functions.
 * The `state` field is UTF8_OK (0) when the decoder sits on a code point
 * boundary, UTF8_ERROR (4) once an invalid byte has been seen, and 1..3
 * while that many continuation bytes are still expected. */
enum { UTF8_OK = 0, UTF8_ERROR = 4 };
/* state: see above; codep: code point bits accumulated so far;
 * size: byte length announced by the lead byte (reset to 0 on error). */
typedef struct { uint32_t state, codep, size; } utf8_decode_t;

/* Public API (definitions are compiled when i_implement is defined). */

/* Write the UTF-8 encoding of code point c to out (no NUL is appended).
 * Returns the number of bytes written (1..4), or 0 with nothing written
 * when c is a surrogate (0xD800..0xDFFF) or c >= 0x110000. */
STC_API size_t          utf8_encode(char *out, uint32_t c);
/* Feed one byte b into decoder d and return the updated d->state:
 * UTF8_OK when a code point was just completed, UTF8_ERROR after an
 * invalid byte, otherwise the count of continuation bytes still needed. */
STC_API uint32_t        utf8_decode(utf8_decode_t *d, const uint8_t b);
/* Decode the code point starting at u into d. For a well-formed sequence
 * the result points at the byte following that code point. */
STC_API const uint8_t*  utf8_next(utf8_decode_t *d, const uint8_t* u);
/* Number of code points in the NUL-terminated string s, or SIZE_MAX if
 * the decoder does not end in the UTF8_OK state. */
STC_API size_t          utf8_size(const char *s);
/* Like utf8_size, but decodes at most n bytes (stops early at a NUL). */
STC_API size_t          utf8_size_n(const char *s, size_t n);
/* Pointer to the byte reached after decoding `index` code points of s,
 * or to the terminating NUL if the string ends first. */
STC_API const char*     utf8_at(const char *s, size_t index);

/* Byte offset, relative to s, of the position utf8_at(s, index) returns. */
STC_INLINE size_t utf8_pos(const char* s, size_t index) {
    const char* at = utf8_at(s, index);
    return at - s;
}

/* True when utf8_size() accepts s, i.e. does not report SIZE_MAX. */
STC_INLINE bool utf8_valid(const char* s) {
    const size_t count = utf8_size(s);
    return count != SIZE_MAX;
}

/* Decode the sequence that starts at s with a fresh decoder and return the
 * resulting codep field. No validity check is made here, so the value is
 * only meaningful when s begins a well-formed sequence. */
STC_INLINE uint32_t utf8_peek(const char *s) {
    utf8_decode_t dec = {UTF8_OK, 0};
    const uint8_t* bytes = (const uint8_t*)s;
    (void)utf8_next(&dec, bytes);
    return dec.codep;
}

/* Decode the sequence that starts at s with a fresh decoder and return the
 * decoder's size field: the encoded byte length of that code point, or 0
 * if the decoder flagged an error. */
STC_INLINE size_t utf8_codep_size(const char *s) {
    utf8_decode_t dec = {UTF8_OK, 0};
    const uint8_t* bytes = (const uint8_t*)s;
    (void)utf8_next(&dec, bytes);
    return dec.size;
}

// --------------------------- IMPLEMENTATION ---------------------------------
#ifdef i_implement
// https://news.ycombinator.com/item?id=15423674
// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc

/* Feed one byte into the incremental decoder.
 *
 * d  decoder object; start with state == UTF8_OK.
 * b  next input byte.
 *
 * Returns the updated d->state: UTF8_OK when b completed a code point
 * (d->codep holds it, d->size its encoded length), 1..3 while that many
 * continuation bytes are still needed, or UTF8_ERROR for an invalid byte
 * (d->size is reset to 0). UTF8_ERROR is sticky: further bytes are ignored
 * until the caller resets d->state.
 *
 * Besides checking the lead/continuation bit patterns, ill-formed ranges
 * are rejected so that the decoder accepts exactly what utf8_encode() can
 * produce: overlong 3- and 4-byte forms, UTF-16 surrogates
 * (U+D800..U+DFFF) and values above U+10FFFF.
 */
STC_DEF uint32_t utf8_decode(utf8_decode_t *d, const uint8_t b)
{
    switch (d->state) {
    case UTF8_OK:
        if      (b < 0x80) d->codep = b, d->size = 1;
        /* 0x80..0xBF: stray continuation; 0xC0/0xC1: overlong 2-byte lead */
        else if (b < 0xC2) d->state = UTF8_ERROR, d->size = 0;
        else if (b < 0xE0) d->state = 1, d->codep = b & 0x1F, d->size = 2;
        else if (b < 0xF0) d->state = 2, d->codep = b & 0x0F, d->size = 3;
        else if (b < 0xF5) d->state = 3, d->codep = b & 0x07, d->size = 4;
        else d->state = UTF8_ERROR, d->size = 0;
        break;
    case 1: case 2: case 3: {
        const uint32_t cp = (d->codep << 6) | (b & 0x3F);
        int ok = (b & 0xC0) == 0x80;

        /* state + 1 == size only for the byte right after the lead byte.
         * At that point cp holds the top bits of the final code point,
         * which is enough to detect every ill-formed range. */
        if (ok && d->state + 1 == d->size) {
            if (d->size == 3)       /* cp == final >> 6 */
                ok = cp >= 0x20 && (cp < 0x360 || cp > 0x37F);
            else if (d->size == 4)  /* cp == final >> 12 */
                ok = cp >= 0x10 && cp <= 0x10F;
        }
        if (ok) {
            d->state -= 1;
            d->codep = cp;
        } else {
            d->state = UTF8_ERROR, d->size = 0;
        }
        break;
    }
    default: /* UTF8_ERROR: stay in the error state */
        break;
    }
    return d->state;
}

/* Encode code point c as UTF-8 into out.
 *
 * out  destination with room for up to 4 bytes; no NUL is appended.
 * c    code point to encode.
 *
 * Returns the number of bytes written (1..4). Surrogates (0xD800..0xDFFF)
 * and values >= 0x110000 are not encodable: nothing is written and 0 is
 * returned.
 */
STC_DEF size_t utf8_encode(char *out, uint32_t c)
{
    /* Marker bits of the lead byte, indexed by sequence length. */
    const unsigned char lead_mark[5] = {0x00, 0x00, 0xC0, 0xE0, 0xF0};
    size_t len;

    if (c < 0x80U)
        len = 1;
    else if (c < 0x0800U)
        len = 2;
    else if (c < 0x010000U) {
        if (c >= 0xD800U && c < 0xE000U)
            return 0;
        len = 3;
    } else if (c < 0x110000U)
        len = 4;
    else
        return 0;

    /* Fill continuation bytes back to front, six payload bits each. */
    for (size_t i = len - 1; i > 0; --i) {
        out[i] = (char) ((c & 0x3F) | 0x80);
        c >>= 6;
    }
    /* The bits left in c now fit beside the lead marker. */
    out[0] = (char) (c | lead_mark[len]);
    return len;
}

/* Decode one code point starting at u.
 *
 * d  decoder object, normally in state UTF8_OK on entry; on return
 *    d->codep / d->size describe the decoded code point, or
 *    d->state == UTF8_ERROR if the input was invalid.
 * u  first byte of the sequence.
 *
 * Returns a pointer just past the decoded code point. The first byte is
 * always consumed. Continuation bytes are consumed only while the decoder
 * asks for more, and a byte the decoder rejects is left unconsumed, so a
 * truncated sequence in a NUL-terminated string never makes this function
 * read or step beyond the terminator.
 */
STC_DEF const uint8_t* utf8_next(utf8_decode_t *d, const uint8_t* u) {
    utf8_decode(d, *u++);
    /* States other than OK/ERROR mean "continuation bytes pending". */
    while (d->state != UTF8_OK && d->state != UTF8_ERROR) {
        if (utf8_decode(d, *u) == UTF8_ERROR)
            break;  /* keep u on the offending byte (possibly the NUL) */
        ++u;
    }
    return u;
}

/* Count the code points in the NUL-terminated string s.
 * Every byte is fed to a fresh decoder; a code point is counted each time
 * the decoder returns to UTF8_OK. Returns SIZE_MAX when the decoder does
 * not finish in UTF8_OK (invalid byte seen, or sequence cut short). */
STC_DEF size_t utf8_size(const char *s)
{
    utf8_decode_t dec = {UTF8_OK, 0};
    size_t count = 0;
    for (const char *p = s; *p; ++p) {
        if (utf8_decode(&dec, (uint8_t)*p) == UTF8_OK)
            ++count;
    }
    if (dec.state != UTF8_OK)
        return SIZE_MAX;
    return count;
}

/* Count the code points in the first n bytes of s.
 *
 * s  input bytes; decoding also stops early at a NUL byte.
 * n  maximum number of bytes to examine.
 *
 * Returns the number of complete code points, or SIZE_MAX when the decoder
 * does not finish in UTF8_OK (invalid byte seen, or the last sequence is
 * cut off by the limit).
 */
STC_DEF size_t utf8_size_n(const char *s, size_t n)
{
    utf8_decode_t d = {UTF8_OK, 0};
    size_t size = 0;
    /* Short-circuit &&: *s must not be read once the n-byte budget is
     * used up, since s need not be NUL-terminated within n bytes. */
    for (; n != 0 && *s != 0; --n)
        size += !utf8_decode(&d, (uint8_t)*s++);
    return !d.state ? size : SIZE_MAX;
}

/* Advance through the NUL-terminated string s until `index` code points
 * have been decoded and return the position reached. If the string ends
 * first, the result points at the terminating NUL. Once the decoder hits
 * an error no further code points are counted, so the scan then runs to
 * the terminator as well. */
STC_DEF const char* utf8_at(const char *s, size_t index)
{
    utf8_decode_t dec = {UTF8_OK, 0};
    size_t seen = 0;
    while (seen < index && *s != 0) {
        if (utf8_decode(&dec, (uint8_t)*s) == UTF8_OK)
            ++seen;
        ++s;
    }
    return s;
}

#endif
#endif
#undef i_opt