diff options
| -rw-r--r-- | examples/regex_match.c | 8 | ||||
| -rw-r--r-- | include/stc/cstr.h | 2 | ||||
| -rw-r--r-- | include/stc/utf8.h | 7 | ||||
| -rw-r--r-- | src/cregex.c | 18 | ||||
| -rw-r--r-- | src/utf8code.c | 9 |
5 files changed, 24 insertions, 20 deletions
diff --git a/examples/regex_match.c b/examples/regex_match.c index e60fd519..05161b90 100644 --- a/examples/regex_match.c +++ b/examples/regex_match.c @@ -24,6 +24,14 @@ int main() printf("%" c_PRIsv " ; ", c_ARGsv(m[0])); } puts(""); + + res = cregex_compile(&re, "(.+)\\b(.+)", 0); + printf("groups: %d\n", res); + if ((res = cregex_find(&re, "hello@wørld", 10, m, 0)) > 0) { + c_forrange (i, res) + printf("match: [%" c_PRIsv "]\n", c_ARGsv(m[i])); + } else + printf("err: %d\n", res); } } diff --git a/include/stc/cstr.h b/include/stc/cstr.h index 8116fce2..9c2c9c0c 100644 --- a/include/stc/cstr.h +++ b/include/stc/cstr.h @@ -171,7 +171,7 @@ STC_INLINE size_t cstr_capacity(cstr s) extern cstr cstr_tofold(const cstr* self); extern cstr cstr_tolower(const cstr* self); extern cstr cstr_toupper(const cstr* self); -extern void cstr_foldcase(cstr* self); +extern void cstr_casefold(cstr* self); extern void cstr_lowercase(cstr* self); extern void cstr_uppercase(cstr* self); diff --git a/include/stc/utf8.h b/include/stc/utf8.h index fb06de62..41d2f315 100644 --- a/include/stc/utf8.h +++ b/include/stc/utf8.h @@ -36,7 +36,6 @@ bool utf8_isalnum(uint32_t c); uint32_t utf8_casefold(uint32_t c); uint32_t utf8_tolower(uint32_t c); uint32_t utf8_toupper(uint32_t c); -bool utf8_valid(const char* s); bool utf8_valid_n(const char* s, size_t nbytes); int utf8_icmp_n(size_t u8max, const char* s1, size_t n1, const char* s2, size_t n2); @@ -59,7 +58,11 @@ STC_INLINE int utf8_icmp(const char* s1, const char* s2) { return utf8_icmp_n(~(size_t)0, s1, ~(size_t)0, s2, ~(size_t)0); } -/* number of characters in the utf8 codepoint from s */ +STC_INLINE bool utf8_valid(const char* s) { + return utf8_valid_n(s, ~(size_t)0); +} + +/* number of bytes in the utf8 codepoint from s */ STC_INLINE unsigned utf8_chr_size(const char *s) { unsigned b = (uint8_t)*s; if (b < 0x80) return 1; diff --git a/src/cregex.c b/src/cregex.c index 575f995c..69fc6cbb 100644 --- a/src/cregex.c +++ b/src/cregex.c @@ -210,10 +210,10 @@ static const char* utfruneicase(const char *s, Rune c) { Rune r; - c = utf8_tolower(c); + c = utf8_casefold(c); for (;;) { int n = chartorune(&r, s); - if (utf8_tolower(r) == c) return s; + if (utf8_casefold(r) == c) return s; if ((r == 0) | (n == 0)) return NULL; s += n; } @@ -793,17 +793,17 @@ bldcclass(Parser *par) } static Reprog* -regcomp1(Parser *par, const char *s, int cflags) +regcomp1(Reprog *progp, Parser *par, const char *s, int cflags) { Token token; Reprog *volatile pp; /* get memory for the program. estimated max usage */ const int instcap = 5 + 6*strlen(s); - pp = (Reprog *)malloc(sizeof(Reprog) + instcap*sizeof(Reinst)); + pp = (Reprog *)realloc(progp, sizeof(Reprog) + instcap*sizeof(Reinst)); if (pp == NULL) { + pp = progp; rcerror(par, creg_outofmemory); - return NULL; } pp->flags.caseless = (cflags & creg_caseless) != 0; pp->flags.dotall = (cflags & creg_dotall) != 0; @@ -918,7 +918,7 @@ runematch(Rune s, Rune r, bool icase) case UTF_XD: inv = 1; case UTF_xd: return inv ^ utf8_isxdigit(r); } - return icase ? utf8_tolower(s) == utf8_tolower(r) : s == r; + return icase ? utf8_casefold(s) == utf8_casefold(r) : s == r; } /* @@ -1033,8 +1033,8 @@ regexec1(const Reprog *progp, /* program to run */ case NWBOUND: ok = true; case WBOUND: /* fallthrough */ - if (ok ^ (s == bol || s == j->eol || ((utf8_isalnum(s[-1]) || s[-1] == '_') - ^ (utf8_isalnum(s[ 0]) || s[ 0] == '_')))) + if (ok ^ (s == bol || s == j->eol || ((utf8_isalnum(utf8_peek(s, -1)) || s[-1] == '_') + ^ (utf8_isalnum(utf8_peek(s, 0)) || s[0] == '_')))) continue; break; case NCCLASS: @@ -1205,7 +1205,7 @@ void cregex_replace( int cregex_compile(cregex *rx, const char* pattern, int cflags) { Parser par; - rx->prog = regcomp1(&par, pattern, cflags); + rx->prog = regcomp1(rx->prog, &par, pattern, cflags); if (rx->prog) return 1 + rx->prog->nsubids; return par.errors; diff --git a/src/utf8code.c b/src/utf8code.c index f64ede70..dff10409 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -56,13 +56,6 @@ uint32_t utf8_peek(const char* s, int pos) { return d.codep; } -bool utf8_valid(const char* s) { - utf8_decode_t d = {.state=0}; - while (*s) - utf8_decode(&d, (uint8_t)*s++); - return d.state == 0; -} - bool utf8_valid_n(const char* s, size_t nbytes) { utf8_decode_t d = {.state=0}; while ((nbytes-- != 0) & (*s != 0)) @@ -216,7 +209,7 @@ cstr cstr_toupper(const cstr* self) { return cstr_tocase(self, fn_toupper); } -void cstr_foldcase(cstr* self) { +void cstr_casefold(cstr* self) { cstr_take(self, cstr_tocase(self, fn_tofold)); } |
