diff options
Diffstat (limited to 'src/string.c')
| -rw-r--r-- | src/string.c | 325 |
1 file changed, 187 insertions, 138 deletions
diff --git a/src/string.c b/src/string.c index b7eef5888..44e3c9069 100644 --- a/src/string.c +++ b/src/string.c @@ -10,6 +10,7 @@ #ifndef MRB_WITHOUT_FLOAT #include <float.h> +#include <math.h> #endif #include <limits.h> #include <stddef.h> @@ -24,7 +25,7 @@ typedef struct mrb_shared_string { int refcnt; - mrb_int capa; + mrb_ssize capa; char *ptr; } mrb_shared_string; @@ -40,8 +41,8 @@ str_init_normal_capa(mrb_state *mrb, struct RString *s, if (p) memcpy(dst, p, len); dst[len] = '\0'; s->as.heap.ptr = dst; - s->as.heap.len = (mrb_int)len; - s->as.heap.aux.capa = (mrb_int)capa; + s->as.heap.len = (mrb_ssize)len; + s->as.heap.aux.capa = (mrb_ssize)capa; RSTR_UNSET_TYPE_FLAG(s); return s; } @@ -66,7 +67,7 @@ static struct RString* str_init_nofree(struct RString *s, const char *p, size_t len) { s->as.heap.ptr = (char *)p; - s->as.heap.len = (mrb_int)len; + s->as.heap.len = (mrb_ssize)len; s->as.heap.aux.capa = 0; /* nofree */ RSTR_SET_TYPE_FLAG(s, NOFREE); return s; @@ -118,7 +119,7 @@ str_new_static(mrb_state *mrb, const char *p, size_t len) if (RSTR_EMBEDDABLE_P(len)) { return str_init_embed(mrb_obj_alloc_string(mrb), p, len); } - if (len >= MRB_INT_MAX) { + if (len >= MRB_SSIZE_MAX) { mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); } return str_init_nofree(mrb_obj_alloc_string(mrb), p, len); @@ -130,7 +131,7 @@ str_new(mrb_state *mrb, const char *p, size_t len) if (RSTR_EMBEDDABLE_P(len)) { return str_init_embed(mrb_obj_alloc_string(mrb), p, len); } - if (len >= MRB_INT_MAX) { + if (len >= MRB_SSIZE_MAX) { mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); } if (p && mrb_ro_data_p(p)) { @@ -162,7 +163,7 @@ mrb_str_new_capa(mrb_state *mrb, size_t capa) if (RSTR_EMBEDDABLE_P(capa)) { s = str_init_embed(mrb_obj_alloc_string(mrb), NULL, 0); } - else if (capa >= MRB_INT_MAX) { + else if (capa >= MRB_SSIZE_MAX) { mrb_raise(mrb, E_ARGUMENT_ERROR, "string capacity size too big"); /* not reached */ s = NULL; @@ -190,8 +191,8 @@ mrb_str_buf_new(mrb_state 
*mrb, size_t capa) static void resize_capa(mrb_state *mrb, struct RString *s, size_t capacity) { -#if SIZE_MAX > MRB_INT_MAX - mrb_assert(capacity < MRB_INT_MAX); +#if SIZE_MAX > MRB_SSIZE_MAX + mrb_assert(capacity < MRB_SSIZE_MAX); #endif if (RSTR_EMBED_P(s)) { if (!RSTR_EMBEDDABLE_P(capacity)) { @@ -200,7 +201,7 @@ resize_capa(mrb_state *mrb, struct RString *s, size_t capacity) } else { s->as.heap.ptr = (char*)mrb_realloc(mrb, RSTR_PTR(s), capacity+1); - s->as.heap.aux.capa = (mrb_int)capacity; + s->as.heap.aux.capa = (mrb_ssize)capacity; } } @@ -246,6 +247,28 @@ str_decref(mrb_state *mrb, mrb_shared_string *shared) } static void +str_modify_keep_ascii(mrb_state *mrb, struct RString *s) +{ + if (RSTR_SHARED_P(s)) { + mrb_shared_string *shared = s->as.heap.aux.shared; + + if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) { + s->as.heap.aux.capa = shared->capa; + s->as.heap.ptr[s->as.heap.len] = '\0'; + RSTR_UNSET_SHARED_FLAG(s); + mrb_free(mrb, shared); + } + else { + str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len); + str_decref(mrb, shared); + } + } + else if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) { + str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len); + } +} + +static void check_null_byte(mrb_state *mrb, mrb_value str) { mrb_to_str(mrb, str); @@ -278,8 +301,8 @@ static const char utf8len_codepage[256] = 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1, }; -static mrb_int -utf8len(const char* p, const char* e) +mrb_int +mrb_utf8len(const char* p, const char* e) { mrb_int len; mrb_int i; @@ -295,14 +318,14 @@ utf8len(const char* p, const char* e) } mrb_int -mrb_utf8_len(const char *str, mrb_int byte_len) +mrb_utf8_strlen(const char *str, mrb_int byte_len) { mrb_int total = 0; const char *p = str; const char *e = p + byte_len; while (p < e) { - p += utf8len(p, e); + p += mrb_utf8len(p, e); total++; } return total; @@ -318,7 +341,7 @@ utf8_strlen(mrb_value str) return byte_len; } else { - mrb_int utf8_len 
= mrb_utf8_len(RSTR_PTR(s), byte_len); + mrb_int utf8_len = mrb_utf8_strlen(RSTR_PTR(s), byte_len); if (byte_len == utf8_len) RSTR_SET_ASCII_FLAG(s); return utf8_len; } @@ -339,7 +362,7 @@ chars2bytes(mrb_value s, mrb_int off, mrb_int idx) const char *e = RSTRING_END(s); for (b=i=0; p<e && i<idx; i++) { - n = utf8len(p, e); + n = mrb_utf8len(p, e); b += n; p += n; } @@ -356,7 +379,7 @@ bytes2chars(char *p, mrb_int len, mrb_int bi) mrb_int i; for (i = 0; p < pivot; i ++) { - p += utf8len(p, e); + p += mrb_utf8len(p, e); } if (p != pivot) return -1; return i; @@ -377,7 +400,7 @@ char_adjust(const char *beg, const char *end, const char *ptr) while (p > beg) { p --; if ((*p & 0xc0) != 0x80) { - int clen = utf8len(p, end); + int clen = mrb_utf8len(p, end); if (clen > ptr - p) return p; break; } @@ -440,10 +463,10 @@ str_index_str_by_char_search(mrb_state *mrb, const char *p, const char *pend, co } pivot = p + qstable[(unsigned char)p[slen - 1]]; - if (pivot > pend || pivot < p /* overflowed */) { return -1; } + if (pivot >= pend || pivot < p /* overflowed */) { return -1; } do { - p += utf8len(p, pend); + p += mrb_utf8len(p, pend); off ++; } while (p < pivot); } @@ -462,7 +485,7 @@ str_index_str_by_char(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) for (; pos > 0; pos --) { if (pend - p < 1) { return -1; } - p += utf8len(p, pend); + p += mrb_utf8len(p, pend); } if (slen < 1) { return off; } @@ -481,25 +504,45 @@ str_index_str_by_char(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) #define str_index_str_by_char(mrb, str, sub, pos) str_index_str(mrb, str, sub, pos) #endif +#ifndef MRB_QS_SHORT_STRING_LENGTH +#define MRB_QS_SHORT_STRING_LENGTH 2048 +#endif + static inline mrb_int mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n) { - const unsigned char *x = xs, *xe = xs + m; - const unsigned char *y = ys; - int i; - ptrdiff_t qstable[256]; + if (n + m < MRB_QS_SHORT_STRING_LENGTH) { + const unsigned char *y = 
ys; + const unsigned char *ye = ys+n-m+1; - /* Preprocessing */ - for (i = 0; i < 256; ++i) - qstable[i] = m + 1; - for (; x < xe; ++x) - qstable[*x] = xe - x; - /* Searching */ - for (; y + m <= ys + n; y += *(qstable + y[m])) { - if (*xs == *y && memcmp(xs, y, m) == 0) - return (mrb_int)(y - ys); + for (;;) { + y = (const unsigned char*)memchr(y, xs[0], (size_t)(ye-y)); + if (y == NULL) return -1; + if (memcmp(xs, y, m) == 0) { + return (mrb_int)(y - ys); + } + y++; + } + return -1; + } + else { + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys; + int i; + ptrdiff_t qstable[256]; + + /* Preprocessing */ + for (i = 0; i < 256; ++i) + qstable[i] = m + 1; + for (; x < xe; ++x) + qstable[*x] = xe - x; + /* Searching */ + for (; y + m <= ys + n; y += *(qstable + y[m])) { + if (*xs == *y && memcmp(xs, y, m) == 0) + return (mrb_int)(y - ys); + } + return -1; } - return -1; } static mrb_int @@ -531,7 +574,7 @@ str_share(mrb_state *mrb, struct RString *orig, struct RString *s) size_t len = (size_t)orig->as.heap.len; mrb_assert(!RSTR_EMBED_P(orig)); - if (RSTR_NOFREE_P(orig) || RSTR_POOL_P(orig)) { + if (RSTR_NOFREE_P(orig)) { str_init_nofree(s, orig->as.heap.ptr, len); } else if (RSTR_SHARED_P(orig)) { @@ -540,13 +583,13 @@ str_share(mrb_state *mrb, struct RString *orig, struct RString *s) else if (RSTR_FSHARED_P(orig)) { str_init_fshared(orig, s, orig->as.heap.aux.fshared); } - else if (mrb_frozen_p(orig)) { + else if (mrb_frozen_p(orig) && !RSTR_POOL_P(orig)) { str_init_fshared(orig, s, orig); } else { if (orig->as.heap.aux.capa > orig->as.heap.len) { orig->as.heap.ptr = (char *)mrb_realloc(mrb, orig->as.heap.ptr, len+1); - orig->as.heap.aux.capa = len; + orig->as.heap.aux.capa = (mrb_ssize)len; } str_init_shared(mrb, orig, s, NULL); str_init_shared(mrb, orig, orig, s->as.heap.aux.shared); @@ -554,12 +597,9 @@ str_share(mrb_state *mrb, struct RString *orig, struct RString *s) } mrb_value -mrb_str_pool(mrb_state *mrb, mrb_value str) 
+mrb_str_pool(mrb_state *mrb, const char *p, mrb_int len, mrb_bool nofree) { struct RString *s = (struct RString *)mrb_malloc(mrb, sizeof(struct RString)); - struct RString *orig = mrb_str_ptr(str); - const char *p = RSTR_PTR(orig); - size_t len = (size_t)RSTR_LEN(orig); s->tt = MRB_TT_STRING; s->c = mrb->string_class; @@ -568,7 +608,7 @@ mrb_str_pool(mrb_state *mrb, mrb_value str) if (RSTR_EMBEDDABLE_P(len)) { str_init_embed(s, p, len); } - else if (RSTR_NOFREE_P(orig)) { + else if (nofree) { str_init_nofree(s, p, len); } else { @@ -591,8 +631,8 @@ mrb_str_byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) } else { str_share(mrb, orig, s); - s->as.heap.ptr += beg; - s->as.heap.len = len; + s->as.heap.ptr += (mrb_ssize)beg; + s->as.heap.len = (mrb_ssize)len; } RSTR_COPY_ASCII_FLAG(s, orig); return mrb_obj_value(s); @@ -814,22 +854,7 @@ MRB_API void mrb_str_modify_keep_ascii(mrb_state *mrb, struct RString *s) { mrb_check_frozen(mrb, s); - if (RSTR_SHARED_P(s)) { - mrb_shared_string *shared = s->as.heap.aux.shared; - - if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) { - s->as.heap.aux.capa = shared->capa; - s->as.heap.ptr[s->as.heap.len] = '\0'; - mrb_free(mrb, shared); - } - else { - str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len); - str_decref(mrb, shared); - } - } - else if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) { - str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len); - } + str_modify_keep_ascii(mrb, s); } MRB_API void @@ -954,7 +979,7 @@ mrb_str_times(mrb_state *mrb, mrb_value self) if (times < 0) { mrb_raise(mrb, E_ARGUMENT_ERROR, "negative argument"); } - if (times && MRB_INT_MAX / times < RSTRING_LEN(self)) { + if (times && MRB_SSIZE_MAX / times < RSTRING_LEN(self)) { mrb_raise(mrb, E_ARGUMENT_ERROR, "argument too big"); } @@ -1088,7 +1113,6 @@ mrb_str_equal_m(mrb_state *mrb, mrb_value str1) return mrb_bool_value(mrb_str_equal(mrb, str1, str2)); } /* ---------------------------------- */ 
-mrb_value mrb_mod_to_s(mrb_state *mrb, mrb_value klass); MRB_API mrb_value mrb_str_to_str(mrb_state *mrb, mrb_value str) @@ -1295,7 +1319,7 @@ str_replace_partial(mrb_state *mrb, mrb_value src, mrb_int pos, mrb_int end, mrb replen = (mrb_nil_p(rep) ? 0 : RSTRING_LEN(rep)); newlen = replen + len - (end - pos); - if (newlen >= MRB_INT_MAX || newlen < replen /* overflowed */) { + if (newlen >= MRB_SSIZE_MAX || newlen < replen /* overflowed */) { mrb_raise(mrb, E_RUNTIME_ERROR, "string size too big"); } @@ -1338,7 +1362,7 @@ str_escape(mrb_state *mrb, mrb_value str, mrb_bool inspect) unsigned char c, cc; #ifdef MRB_UTF8_STRING if (inspect) { - mrb_int clen = utf8len(p, pend); + mrb_int clen = mrb_utf8len(p, pend); if (clen > 1) { mrb_int i; @@ -1641,7 +1665,7 @@ mrb_str_chop_bang(mrb_state *mrb, mrb_value str) const char* t = RSTR_PTR(s), *p = t; const char* e = p + RSTR_LEN(s); while (p<e) { - mrb_int clen = utf8len(p, e); + mrb_int clen = mrb_utf8len(p, e); if (p + clen>=e) break; p += clen; } @@ -1942,15 +1966,10 @@ mrb_str_intern(mrb_state *mrb, mrb_value self) MRB_API mrb_value mrb_obj_as_string(mrb_state *mrb, mrb_value obj) { - mrb_value str; - if (mrb_string_p(obj)) { return obj; } - str = mrb_funcall(mrb, obj, "to_s", 0); - if (!mrb_string_p(str)) - return mrb_any_to_s(mrb, obj); - return str; + return mrb_str_to_str(mrb, obj); } MRB_API mrb_value @@ -2018,7 +2037,7 @@ mrb_str_reverse_bang(mrb_state *mrb, mrb_value str) p = RSTR_PTR(s); e = p + RSTR_LEN(s); while (p<e) { - mrb_int clen = utf8len(p, e); + mrb_int clen = mrb_utf8len(p, e); str_reverse(p, p + clen - 1); p += clen; } @@ -2241,7 +2260,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) return result; } -static mrb_value +mrb_value mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, int badcheck) { const char *p = str; @@ -2352,7 +2371,7 @@ mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, if (*(p - 1) == '0') p--; } - if (p == pend) { + if 
(p == pend || *p == '_') { if (badcheck) goto bad; return mrb_fixnum_value(0); } @@ -2391,9 +2410,10 @@ mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, } val = (mrb_int)n; if (badcheck) { - if (p == str) goto bad; /* no number */ + if (p == str) goto bad; /* no number */ + if (*(p - 1) == '_') goto bad; /* trailing '_' */ while (p<pend && ISSPACE(*p)) p++; - if (p<pend) goto bad; /* trailing garbage */ + if (p<pend) goto bad; /* trailing garbage */ } return mrb_fixnum_value(sign ? val : -val); @@ -2427,15 +2447,12 @@ mrb_string_value_cstr(mrb_state *mrb, mrb_value *ptr) if (p[len] == '\0') { return p; } - if (mrb_frozen_p(ps) || RSTR_CAPA(ps) == len) { - ps = str_new(mrb, NULL, len+1); - memcpy(RSTR_PTR(ps), p, len); - RSTR_SET_LEN(ps, len); - *ptr = mrb_obj_value(ps); - } - else { - mrb_str_modify(mrb, ps); - } + + /* + * Even after str_modify_keep_ascii(), NULL termination is not ensured if + * RSTR_SET_LEN() is used explicitly (e.g. String#delete_suffix!). 
+ */ + str_modify_keep_ascii(mrb, ps); RSTR_PTR(ps)[len] = '\0'; return RSTR_PTR(ps); } @@ -2492,73 +2509,105 @@ mrb_str_to_i(mrb_state *mrb, mrb_value self) } #ifndef MRB_WITHOUT_FLOAT -MRB_API double -mrb_cstr_to_dbl(mrb_state *mrb, const char * p, mrb_bool badcheck) +double +mrb_str_len_to_dbl(mrb_state *mrb, const char *s, size_t len, mrb_bool badcheck) { + char buf[DBL_DIG * 4 + 20]; + const char *p = s, *p2; + const char *pend = p + len; char *end; - char buf[DBL_DIG * 4 + 10]; + char *n; + char prev = 0; double d; - - enum {max_width = 20}; + mrb_bool dot = FALSE; if (!p) return 0.0; - while (ISSPACE(*p)) p++; - - if (!badcheck && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { - return 0.0; + while (p<pend && ISSPACE(*p)) p++; + p2 = p; + + if (pend - p > 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { + mrb_value x; + + if (!badcheck) return 0.0; + x = mrb_str_len_to_inum(mrb, p, pend-p, 0, badcheck); + if (mrb_fixnum_p(x)) + d = (double)mrb_fixnum(x); + else /* if (mrb_float_p(x)) */ + d = mrb_float(x); + return d; + } + while (p < pend) { + if (!*p) { + if (badcheck) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string for Float contains null byte"); + /* not reached */ + } + pend = p; + p = p2; + goto nocopy; + } + if (!badcheck && *p == ' ') { + pend = p; + p = p2; + goto nocopy; + } + if (*p == '_') break; + p++; } + p = p2; + n = buf; + while (p < pend) { + char c = *p++; + if (c == '.') dot = TRUE; + if (c == '_') { + /* remove an underscore between digits */ + if (n == buf || !ISDIGIT(prev) || p == pend) { + if (badcheck) goto bad; + break; + } + } + else if (badcheck && prev == '_' && !ISDIGIT(c)) goto bad; + else { + const char *bend = buf+sizeof(buf)-1; + if (n==bend) { /* buffer overflow */ + if (dot) break; /* cut off remaining fractions */ + return INFINITY; + } + *n++ = c; + } + prev = c; + } + *n = '\0'; + p = buf; + pend = n; +nocopy: d = mrb_float_read(p, &end); if (p == end) { if (badcheck) { bad: - mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid 
string for float(%s)", p); + mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%!s)", s); /* not reached */ } return d; } - if (*end) { - char *n = buf; - char *e = buf + sizeof(buf) - 1; - char prev = 0; - - while (p < end && n < e) prev = *n++ = *p++; - while (*p) { - if (*p == '_') { - /* remove underscores between digits */ - if (badcheck) { - if (n == buf || !ISDIGIT(prev)) goto bad; - ++p; - if (!ISDIGIT(*p)) goto bad; - } - else { - while (*++p == '_'); - continue; - } - } - prev = *p++; - if (n < e) *n++ = prev; - } - *n = '\0'; - p = buf; - - if (!badcheck && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { - return 0.0; - } - - d = mrb_float_read(p, &end); - if (badcheck) { - if (!end || p == end) goto bad; - while (*end && ISSPACE(*end)) end++; - if (*end) goto bad; - } + if (badcheck) { + if (!end || p == end) goto bad; + while (end<pend && ISSPACE(*end)) end++; + if (end<pend) goto bad; } return d; } MRB_API double +mrb_cstr_to_dbl(mrb_state *mrb, const char *s, mrb_bool badcheck) +{ + return mrb_str_len_to_dbl(mrb, s, strlen(s), badcheck); +} + +MRB_API double mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck) { - return mrb_cstr_to_dbl(mrb, RSTRING_CSTR(mrb, str), badcheck); + return mrb_str_len_to_dbl(mrb, RSTRING_PTR(str), RSTRING_LEN(str), badcheck); } /* 15.2.10.5.39 */ @@ -2678,21 +2727,21 @@ mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len) capa = RSTR_CAPA(s); total = RSTR_LEN(s)+len; - if (total >= MRB_INT_MAX) { + if (total >= MRB_SSIZE_MAX) { size_error: mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); } if (capa <= total) { if (capa == 0) capa = 1; while (capa <= total) { - if (capa <= MRB_INT_MAX / 2) { + if (capa <= MRB_SSIZE_MAX / 2) { capa *= 2; } else { capa = total+1; } } - if (capa <= total || capa > MRB_INT_MAX) { + if (capa <= total || capa > MRB_SSIZE_MAX) { goto size_error; } resize_capa(mrb, s, capa); @@ -2701,7 +2750,7 @@ mrb_str_cat(mrb_state *mrb, mrb_value str, const char 
*ptr, size_t len) ptr = RSTR_PTR(s) + off; } memcpy(RSTR_PTR(s) + RSTR_LEN(s), ptr, len); - mrb_assert_int_fit(size_t, total, mrb_int, MRB_INT_MAX); + mrb_assert_int_fit(size_t, total, mrb_ssize, MRB_SSIZE_MAX); RSTR_SET_LEN(s, total); RSTR_PTR(s)[total] = '\0'; /* sentinel */ return str; @@ -2710,7 +2759,7 @@ mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len) MRB_API mrb_value mrb_str_cat_cstr(mrb_state *mrb, mrb_value str, const char *ptr) { - return mrb_str_cat(mrb, str, ptr, strlen(ptr)); + return mrb_str_cat(mrb, str, ptr, ptr ? strlen(ptr) : 0); } MRB_API mrb_value @@ -2873,7 +2922,7 @@ mrb_init_string(mrb_state *mrb) { struct RClass *s; - mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << MRB_STR_EMBED_LEN_BITSIZE), + mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << MRB_STR_EMBED_LEN_BIT), "pointer size too big for embedded string"); mrb->string_class = s = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */ |
