diff options
Diffstat (limited to 'src/string.c')
| -rw-r--r-- | src/string.c | 1151 |
1 files changed, 694 insertions, 457 deletions
diff --git a/src/string.c b/src/string.c index 2668a2c85..190e87123 100644 --- a/src/string.c +++ b/src/string.c @@ -20,13 +20,12 @@ #include <mruby/class.h> #include <mruby/range.h> #include <mruby/string.h> -#include <mruby/re.h> +#include <mruby/numeric.h> typedef struct mrb_shared_string { - mrb_bool nofree : 1; int refcnt; + mrb_int capa; char *ptr; - mrb_int len; } mrb_shared_string; const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; @@ -34,55 +33,114 @@ const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; #define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class)) static struct RString* -str_new_static(mrb_state *mrb, const char *p, size_t len) +str_init_normal_capa(mrb_state *mrb, struct RString *s, + const char *p, size_t len, size_t capa) { - struct RString *s; + char *dst = (char *)mrb_malloc(mrb, capa + 1); + if (p) memcpy(dst, p, len); + dst[len] = '\0'; + s->as.heap.ptr = dst; + s->as.heap.len = (mrb_int)len; + s->as.heap.aux.capa = (mrb_int)capa; + RSTR_UNSET_TYPE_FLAG(s); + return s; +} - if (len >= MRB_INT_MAX) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); - } - s = mrb_obj_alloc_string(mrb); +static struct RString* +str_init_normal(mrb_state *mrb, struct RString *s, const char *p, size_t len) +{ + return str_init_normal_capa(mrb, s, p, len, len); +} + +static struct RString* +str_init_embed(struct RString *s, const char *p, size_t len) +{ + if (p) memcpy(RSTR_EMBED_PTR(s), p, len); + RSTR_EMBED_PTR(s)[len] = '\0'; + RSTR_SET_TYPE_FLAG(s, EMBED); + RSTR_SET_EMBED_LEN(s, len); + return s; +} + +static struct RString* +str_init_nofree(struct RString *s, const char *p, size_t len) +{ + s->as.heap.ptr = (char *)p; s->as.heap.len = (mrb_int)len; s->as.heap.aux.capa = 0; /* nofree */ - s->as.heap.ptr = (char *)p; - s->flags = MRB_STR_NOFREE; + RSTR_SET_TYPE_FLAG(s, NOFREE); + return s; +} +static struct RString* +str_init_shared(mrb_state *mrb, const struct RString *orig, struct RString *s, mrb_shared_string *shared) +{ + if (shared) { + shared->refcnt++; + } + else { + shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string)); + shared->refcnt = 1; + shared->ptr = orig->as.heap.ptr; + shared->capa = orig->as.heap.aux.capa; + } + s->as.heap.ptr = orig->as.heap.ptr; + s->as.heap.len = orig->as.heap.len; + s->as.heap.aux.shared = shared; + RSTR_SET_TYPE_FLAG(s, SHARED); return s; } static struct RString* -str_new(mrb_state *mrb, const char *p, size_t len) +str_init_fshared(const struct RString *orig, struct RString *s, struct RString *fshared) { - struct RString *s; + s->as.heap.ptr = orig->as.heap.ptr; + s->as.heap.len = orig->as.heap.len; + s->as.heap.aux.fshared = fshared; + RSTR_SET_TYPE_FLAG(s, FSHARED); + return s; +} - if (p && mrb_ro_data_p(p)) { - return str_new_static(mrb, p, len); - } - s = mrb_obj_alloc_string(mrb); - if (len <= RSTRING_EMBED_LEN_MAX) { - RSTR_SET_EMBED_FLAG(s); - RSTR_SET_EMBED_LEN(s, len); - if (p) { - memcpy(s->as.ary, p, len); - } +static struct RString* +str_init_modifiable(mrb_state *mrb, struct RString *s, const char *p, size_t len) +{ + if (RSTR_EMBEDDABLE_P(len)) { + return str_init_embed(s, p, len); } else { - if (len >= MRB_INT_MAX) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); - } - s->as.heap.ptr = (char *)mrb_malloc(mrb, len+1); - s->as.heap.len = (mrb_int)len; - s->as.heap.aux.capa = (mrb_int)len; - if (p) { - memcpy(s->as.heap.ptr, p, len); - } + return str_init_normal(mrb, s, p, len); } - RSTR_PTR(s)[len] = '\0'; - return s; +} + +static struct RString* +str_new_static(mrb_state *mrb, const char *p, size_t len) +{ + if (RSTR_EMBEDDABLE_P(len)) { + return str_init_embed(mrb_obj_alloc_string(mrb), p, len); + } + if (len >= MRB_INT_MAX) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); + } + return str_init_nofree(mrb_obj_alloc_string(mrb), p, len); +} + +static struct RString* +str_new(mrb_state *mrb, const char *p, size_t len) +{ + if (RSTR_EMBEDDABLE_P(len)) { + return str_init_embed(mrb_obj_alloc_string(mrb), p, len); + } + if (len >= MRB_INT_MAX) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); + } + if (p && mrb_ro_data_p(p)) { + return str_init_nofree(mrb_obj_alloc_string(mrb), p, len); + } + return str_init_normal(mrb, mrb_obj_alloc_string(mrb), p, len); } static inline void -str_with_class(mrb_state *mrb, struct RString *s, mrb_value obj) +str_with_class(struct RString *s, mrb_value obj) { s->c = mrb_str_ptr(obj)->c; } @@ -92,7 +150,7 @@ mrb_str_new_empty(mrb_state *mrb, mrb_value str) { struct RString *s = str_new(mrb, 0, 0); - str_with_class(mrb, s, str); + str_with_class(s, str); return mrb_obj_value(s); } @@ -101,15 +159,15 @@ mrb_str_new_capa(mrb_state *mrb, size_t capa) { struct RString *s; - s = mrb_obj_alloc_string(mrb); - - if (capa >= MRB_INT_MAX) { + if (RSTR_EMBEDDABLE_P(capa)) { + s = str_init_embed(mrb_obj_alloc_string(mrb), NULL, 0); + } + else if (capa >= MRB_INT_MAX) { mrb_raise(mrb, E_ARGUMENT_ERROR, "string capacity size too big"); } - s->as.heap.len = 0; - s->as.heap.aux.capa = (mrb_int)capa; - s->as.heap.ptr = (char *)mrb_malloc(mrb, capa+1); - RSTR_PTR(s)[0] = '\0'; + else { + s = str_init_normal_capa(mrb, mrb_obj_alloc_string(mrb), NULL, 0, capa); + } return mrb_obj_value(s); } @@ -134,14 +192,8 @@ resize_capa(mrb_state *mrb, struct RString *s, size_t capacity) mrb_assert(capacity < MRB_INT_MAX); #endif if (RSTR_EMBED_P(s)) { - if (RSTRING_EMBED_LEN_MAX < capacity) { - char *const tmp = (char *)mrb_malloc(mrb, capacity+1); - const mrb_int len = RSTR_EMBED_LEN(s); - memcpy(tmp, s->as.ary, len); - RSTR_UNSET_EMBED_FLAG(s); - s->as.heap.ptr = tmp; - s->as.heap.len = len; - s->as.heap.aux.capa = (mrb_int)capacity; + if (!RSTR_EMBEDDABLE_P(capacity)) { + str_init_normal_capa(mrb, s, RSTR_EMBED_PTR(s), RSTR_EMBED_LEN(s), capacity); } } else { @@ -156,13 +208,6 @@ mrb_str_new(mrb_state *mrb, const char *p, size_t len) return mrb_obj_value(str_new(mrb, p, len)); } -/* - * call-seq: (Caution! NULL string) - * String.new(str="") => new_str - * - * Returns a new string object containing a copy of <i>str</i>. - */ - MRB_API mrb_value mrb_str_new_cstr(mrb_state *mrb, const char *p) { @@ -193,13 +238,20 @@ str_decref(mrb_state *mrb, mrb_shared_string *shared) { shared->refcnt--; if (shared->refcnt == 0) { - if (!shared->nofree) { - mrb_free(mrb, shared->ptr); - } + mrb_free(mrb, shared->ptr); mrb_free(mrb, shared); } } +static void +check_null_byte(mrb_state *mrb, mrb_value str) +{ + mrb_to_str(mrb, str); + if (memchr(RSTRING_PTR(str), '\0', RSTRING_LEN(str))) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); + } +} + void mrb_gc_free_str(mrb_state *mrb, struct RString *str) { @@ -230,35 +282,47 @@ utf8len(const char* p, const char* e) mrb_int len; mrb_int i; + if ((unsigned char)*p < 0x80) return 1; len = utf8len_codepage[(unsigned char)*p]; - if (p + len > e) return 1; + if (len == 1) return 1; + if (len > e - p) return 1; for (i = 1; i < len; ++i) if ((p[i] & 0xc0) != 0x80) return 1; return len; } -static mrb_int -utf8_strlen(mrb_value str, mrb_int len) +mrb_int +mrb_utf8_len(const char *str, mrb_int byte_len) { mrb_int total = 0; - char* p = RSTRING_PTR(str); - char* e = p; - if (RSTRING(str)->flags & MRB_STR_NO_UTF) { - return RSTRING_LEN(str); - } - e += len < 0 ? RSTRING_LEN(str) : len; - while (p<e) { + const char *p = str; + const char *e = p + byte_len; + + while (p < e) { p += utf8len(p, e); total++; } - if (RSTRING_LEN(str) == total) { - RSTRING(str)->flags |= MRB_STR_NO_UTF; - } return total; } -#define RSTRING_CHAR_LEN(s) utf8_strlen(s, -1) +static mrb_int +utf8_strlen(mrb_value str) +{ + struct RString *s = mrb_str_ptr(str); + mrb_int byte_len = RSTR_LEN(s); + + if (RSTR_ASCII_P(s)) { + return byte_len; + } + else { + mrb_int utf8_len = mrb_utf8_len(RSTR_PTR(s), byte_len); + if (byte_len == utf8_len) RSTR_SET_ASCII_FLAG(s); + return utf8_len; + } +} + +#define RSTRING_CHAR_LEN(s) utf8_strlen(s) /* map character index to byte offset index */ static mrb_int @@ -278,25 +342,136 @@ chars2bytes(mrb_value s, mrb_int off, mrb_int idx) /* map byte offset to character index */ static mrb_int -bytes2chars(char *p, mrb_int bi) +bytes2chars(char *p, mrb_int len, mrb_int bi) { - mrb_int i, b, n; + const char *e = p + (size_t)len; + const char *pivot = p + bi; + mrb_int i; - for (b=i=0; b<bi; i++) { - n = utf8len_codepage[(unsigned char)*p]; - b += n; - p += n; + for (i = 0; p < pivot; i ++) { + p += utf8len(p, e); } - if (b != bi) return -1; + if (p != pivot) return -1; return i; } +static const char * +char_adjust(const char *beg, const char *end, const char *ptr) +{ + if ((ptr > beg || ptr < end) && (*ptr & 0xc0) == 0x80) { + const int utf8_adjust_max = 3; + const char *p; + + if (ptr - beg > utf8_adjust_max) { + beg = ptr - utf8_adjust_max; + } + + p = ptr; + while (p > beg) { + p --; + if ((*p & 0xc0) != 0x80) { + int clen = utf8len(p, end); + if (clen > ptr - p) return p; + break; + } + } + } + + return ptr; +} + +static const char * +char_backtrack(const char *ptr, const char *end) +{ + if (ptr < end) { + const int utf8_bytelen_max = 4; + const char *p; + + if (end - ptr > utf8_bytelen_max) { + ptr = end - utf8_bytelen_max; + } + + p = end; + while (p > ptr) { + p --; + if ((*p & 0xc0) != 0x80) { + int clen = utf8len_codepage[(unsigned char)*p]; + if (clen == end - p) { return p; } + break; + } + } + } + + return end - 1; +} + +static mrb_int +str_index_str_by_char_search(mrb_state *mrb, const char *p, const char *pend, const char *s, const mrb_int slen, mrb_int off) +{ + /* Based on Quick Search algorithm (Boyer-Moore-Horspool algorithm) */ + + ptrdiff_t qstable[1 << CHAR_BIT]; + + /* Preprocessing */ + { + mrb_int i; + + for (i = 0; i < 1 << CHAR_BIT; i ++) { + qstable[i] = slen; + } + for (i = 0; i < slen; i ++) { + qstable[(unsigned char)s[i]] = slen - (i + 1); + } + } + + /* Searching */ + while (p < pend && pend - p >= slen) { + const char *pivot; + + if (memcmp(p, s, slen) == 0) { + return off; + } + + pivot = p + qstable[(unsigned char)p[slen - 1]]; + if (pivot > pend || pivot < p /* overflowed */) { return -1; } + + do { + p += utf8len(p, pend); + off ++; + } while (p < pivot); + } + + return -1; +} + +static mrb_int +str_index_str_by_char(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) +{ + const char *p = RSTRING_PTR(str); + const char *pend = p + RSTRING_LEN(str); + const char *s = RSTRING_PTR(sub); + const mrb_int slen = RSTRING_LEN(sub); + mrb_int off = pos; + + for (; pos > 0; pos --) { + if (pend - p < 1) { return -1; } + p += utf8len(p, pend); + } + + if (slen < 1) { return off; } + + return str_index_str_by_char_search(mrb, p, pend, s, slen, off); +} + #define BYTES_ALIGN_CHECK(pos) if (pos < 0) return mrb_nil_value(); #else #define RSTRING_CHAR_LEN(s) RSTRING_LEN(s) #define chars2bytes(p, off, ci) (ci) -#define bytes2chars(p, bi) (bi) +#define bytes2chars(p, end, bi) (bi) +#define char_adjust(beg, end, ptr) (ptr) +#define char_backtrack(ptr, end) ((end) - 1) #define BYTES_ALIGN_CHECK(pos) +#define str_index_str_by_char(mrb, str, sub, pos) str_index_str(mrb, str, sub, pos) #endif static inline mrb_int @@ -346,111 +521,114 @@ mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n) static void str_make_shared(mrb_state *mrb, struct RString *orig, struct RString *s) { - mrb_shared_string *shared; - mrb_int len = RSTR_LEN(orig); + size_t len = (size_t)orig->as.heap.len; mrb_assert(!RSTR_EMBED_P(orig)); - if (RSTR_SHARED_P(orig)) { - shared = orig->as.heap.aux.shared; - shared->refcnt++; - s->as.heap.ptr = orig->as.heap.ptr; - s->as.heap.len = len; - s->as.heap.aux.shared = shared; - RSTR_SET_SHARED_FLAG(s); - RSTR_UNSET_EMBED_FLAG(s); + if (RSTR_NOFREE_P(orig)) { + str_init_nofree(s, orig->as.heap.ptr, len); + } + else if (RSTR_SHARED_P(orig)) { + str_init_shared(mrb, orig, s, orig->as.heap.aux.shared); } else if (RSTR_FSHARED_P(orig)) { - struct RString *fs; - - fs = orig->as.heap.aux.fshared; - s->as.heap.ptr = orig->as.heap.ptr; - s->as.heap.len = len; - s->as.heap.aux.fshared = fs; - RSTR_SET_FSHARED_FLAG(s); - RSTR_UNSET_EMBED_FLAG(s); + str_init_fshared(orig, s, orig->as.heap.aux.fshared); } else if (MRB_FROZEN_P(orig) && !RSTR_POOL_P(orig)) { - s->as.heap.ptr = orig->as.heap.ptr; - s->as.heap.len = len; - s->as.heap.aux.fshared = orig; - RSTR_SET_FSHARED_FLAG(s); - RSTR_UNSET_EMBED_FLAG(s); + str_init_fshared(orig, s, orig); } else { - shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string)); - shared->refcnt = 2; - shared->nofree = !!RSTR_NOFREE_P(orig); - if (!shared->nofree && orig->as.heap.aux.capa > orig->as.heap.len) { - shared->ptr = (char *)mrb_realloc(mrb, orig->as.heap.ptr, len+1); - orig->as.heap.ptr = shared->ptr; - } - else { - shared->ptr = orig->as.heap.ptr; + if (orig->as.heap.aux.capa > orig->as.heap.len) { + orig->as.heap.ptr = (char *)mrb_realloc(mrb, orig->as.heap.ptr, len+1); + orig->as.heap.aux.capa = len; } - orig->as.heap.aux.shared = shared; - RSTR_SET_SHARED_FLAG(orig); - shared->len = len; - s->as.heap.aux.shared = shared; - s->as.heap.ptr = shared->ptr; - s->as.heap.len = len; - RSTR_SET_SHARED_FLAG(s); - RSTR_UNSET_EMBED_FLAG(s); + str_init_shared(mrb, orig, s, NULL); + str_init_shared(mrb, orig, orig, s->as.heap.aux.shared); } } -static mrb_value -byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) +mrb_value +mrb_str_pool(mrb_state *mrb, mrb_value str) +{ + struct RString *s = (struct RString *)mrb_malloc(mrb, sizeof(struct RString)); + struct RString *orig = mrb_str_ptr(str); + const char *p = RSTR_PTR(orig); + size_t len = (size_t)RSTR_LEN(orig); + + s->tt = MRB_TT_STRING; + s->c = mrb->string_class; + s->flags = 0; + + if (RSTR_EMBEDDABLE_P(len)) { + str_init_embed(s, p, len); + } + else if (RSTR_NOFREE_P(orig)) { + str_init_nofree(s, p, len); + } + else { + str_init_normal(mrb, s, p, len); + } + RSTR_SET_POOL_FLAG(s); + MRB_SET_FROZEN_FLAG(s); + return mrb_obj_value(s); +} + +mrb_value +mrb_str_byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) { struct RString *orig, *s; orig = mrb_str_ptr(str); - if (RSTR_EMBED_P(orig) || RSTR_LEN(orig) == 0 || len <= RSTRING_EMBED_LEN_MAX) { - s = str_new(mrb, RSTR_PTR(orig)+beg, len); + s = mrb_obj_alloc_string(mrb); + if (RSTR_EMBEDDABLE_P(len)) { + str_init_embed(s, RSTR_PTR(orig)+beg, len); } else { - s = mrb_obj_alloc_string(mrb); str_make_shared(mrb, orig, s); s->as.heap.ptr += beg; s->as.heap.len = len; } + RSTR_COPY_ASCII_FLAG(s, orig); return mrb_obj_value(s); } + +static void +str_range_to_bytes(mrb_value str, mrb_int *pos, mrb_int *len) +{ + *pos = chars2bytes(str, 0, *pos); + *len = chars2bytes(str, *pos, *len); +} #ifdef MRB_UTF8_STRING static inline mrb_value str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) { - beg = chars2bytes(str, 0, beg); - len = chars2bytes(str, beg, len); - - return byte_subseq(mrb, str, beg, len); + str_range_to_bytes(str, &beg, &len); + return mrb_str_byte_subseq(mrb, str, beg, len); } #else -#define str_subseq(mrb, str, beg, len) byte_subseq(mrb, str, beg, len) +#define str_subseq(mrb, str, beg, len) mrb_str_byte_subseq(mrb, str, beg, len) #endif -static mrb_value -str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) +mrb_bool +mrb_str_beg_len(mrb_int str_len, mrb_int *begp, mrb_int *lenp) { - mrb_int clen = RSTRING_CHAR_LEN(str); - - if (len < 0) return mrb_nil_value(); - if (clen == 0) { - len = 0; + if (str_len < *begp || *lenp < 0) return FALSE; + if (*begp < 0) { + *begp += str_len; + if (*begp < 0) return FALSE; } - else if (beg < 0) { - beg = clen + beg; - } - if (beg > clen) return mrb_nil_value(); - if (beg < 0) { - beg += clen; - if (beg < 0) return mrb_nil_value(); - } - if (len > clen - beg) - len = clen - beg; - if (len <= 0) { - len = 0; + if (*lenp > str_len - *begp) + *lenp = str_len - *begp; + if (*lenp <= 0) { + *lenp = 0; } - return str_subseq(mrb, str, beg, len); + return TRUE; +} + +static mrb_value +str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) +{ + return mrb_str_beg_len(RSTRING_CHAR_LEN(str), &beg, &len) ? + str_subseq(mrb, str, beg, len) : mrb_nil_value(); } MRB_API mrb_int @@ -490,41 +668,25 @@ str_index_str(mrb_state *mrb, mrb_value str, mrb_value str2, mrb_int offset) return mrb_str_index(mrb, str, ptr, len, offset); } -static void -check_frozen(mrb_state *mrb, struct RString *s) -{ - if (MRB_FROZEN_P(s)) { - mrb_raise(mrb, E_FROZEN_ERROR, "can't modify frozen string"); - } -} - static mrb_value str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2) { - mrb_int len; + size_t len; - check_frozen(mrb, s1); + mrb_check_frozen(mrb, s1); if (s1 == s2) return mrb_obj_value(s1); - s1->flags &= ~MRB_STR_NO_UTF; - s1->flags |= s2->flags&MRB_STR_NO_UTF; - len = RSTR_LEN(s2); + RSTR_COPY_ASCII_FLAG(s1, s2); if (RSTR_SHARED_P(s1)) { str_decref(mrb, s1->as.heap.aux.shared); - RSTR_UNSET_SHARED_FLAG(s1); } else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1) && !RSTR_FSHARED_P(s1) && s1->as.heap.ptr) { mrb_free(mrb, s1->as.heap.ptr); } - RSTR_UNSET_FSHARED_FLAG(s1); - RSTR_UNSET_NOFREE_FLAG(s1); - if (len <= RSTRING_EMBED_LEN_MAX) { - RSTR_UNSET_SHARED_FLAG(s1); - RSTR_UNSET_FSHARED_FLAG(s1); - RSTR_SET_EMBED_FLAG(s1); - memcpy(s1->as.ary, RSTR_PTR(s2), len); - RSTR_SET_EMBED_LEN(s1, len); + len = (size_t)RSTR_LEN(s2); + if (RSTR_EMBEDDABLE_P(len)) { + str_init_embed(s1, RSTR_PTR(s2), len); } else { str_make_shared(mrb, s2, s1); @@ -536,7 +698,7 @@ str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2) static mrb_int str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) { - char *s, *sbeg, *t; + const char *s, *sbeg, *t; struct RString *ps = mrb_str_ptr(str); mrb_int len = RSTRING_LEN(sub); @@ -549,11 +711,12 @@ str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) s = RSTR_PTR(ps) + pos; t = RSTRING_PTR(sub); if (len) { + s = char_adjust(sbeg, sbeg + RSTR_LEN(ps), s); while (sbeg <= s) { if (memcmp(s, t, len) == 0) { return (mrb_int)(s - RSTR_PTR(ps)); } - s--; + s = char_backtrack(sbeg, s); } return -1; } @@ -641,67 +804,34 @@ mrb_locale_from_utf8(const char *utf8, int len) #endif MRB_API void -mrb_str_modify(mrb_state *mrb, struct RString *s) +mrb_str_modify_keep_ascii(mrb_state *mrb, struct RString *s) { - check_frozen(mrb, s); - s->flags &= ~MRB_STR_NO_UTF; + mrb_check_frozen(mrb, s); if (RSTR_SHARED_P(s)) { mrb_shared_string *shared = s->as.heap.aux.shared; - if (shared->nofree == 0 && shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) { - s->as.heap.ptr = shared->ptr; - s->as.heap.aux.capa = shared->len; - RSTR_PTR(s)[s->as.heap.len] = '\0'; + if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) { + s->as.heap.aux.capa = shared->capa; + s->as.heap.ptr[s->as.heap.len] = '\0'; mrb_free(mrb, shared); } else { - char *ptr, *p; - mrb_int len; - - p = RSTR_PTR(s); - len = s->as.heap.len; - if (len < RSTRING_EMBED_LEN_MAX) { - RSTR_SET_EMBED_FLAG(s); - RSTR_SET_EMBED_LEN(s, len); - ptr = RSTR_PTR(s); - } - else { - ptr = (char *)mrb_malloc(mrb, (size_t)len + 1); - s->as.heap.ptr = ptr; - s->as.heap.aux.capa = len; - } - if (p) { - memcpy(ptr, p, len); - } - ptr[len] = '\0'; + str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len); str_decref(mrb, shared); } - RSTR_UNSET_SHARED_FLAG(s); - return; } - if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) { - char *p = s->as.heap.ptr; - mrb_int len = s->as.heap.len; - - RSTR_UNSET_FSHARED_FLAG(s); - RSTR_UNSET_NOFREE_FLAG(s); - RSTR_UNSET_FSHARED_FLAG(s); - if (len < RSTRING_EMBED_LEN_MAX) { - RSTR_SET_EMBED_FLAG(s); - RSTR_SET_EMBED_LEN(s, len); - } - else { - s->as.heap.ptr = (char *)mrb_malloc(mrb, (size_t)len+1); - s->as.heap.aux.capa = len; - } - if (p) { - memcpy(RSTR_PTR(s), p, len); - } - RSTR_PTR(s)[len] = '\0'; - return; + else if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) { + str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len); } } +MRB_API void +mrb_str_modify(mrb_state *mrb, struct RString *s) +{ + mrb_str_modify_keep_ascii(mrb, s); + RSTR_UNSET_ASCII_FLAG(s); +} + MRB_API mrb_value mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len) { @@ -728,23 +858,11 @@ mrb_str_to_cstr(mrb_state *mrb, mrb_value str0) { struct RString *s; - if (!mrb_string_p(str0)) { - mrb_raise(mrb, E_TYPE_ERROR, "expected String"); - } - + check_null_byte(mrb, str0); s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0)); - if ((strlen(RSTR_PTR(s)) ^ RSTR_LEN(s)) != 0) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); - } return RSTR_PTR(s); } -/* - * call-seq: (Caution! String("abcd") change) - * String("abcdefg") = String("abcd") + String("efg") - * - * Returns a new string object containing a copy of <i>str</i>. - */ MRB_API void mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other) { @@ -752,12 +870,6 @@ mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other) mrb_str_cat_str(mrb, self, other); } -/* - * call-seq: (Caution! String("abcd") remain) - * String("abcdefg") = String("abcd") + String("efg") - * - * Returns a new string object containing a copy of <i>str</i>. - */ MRB_API mrb_value mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b) { @@ -775,10 +887,13 @@ mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b) /* 15.2.10.5.2 */ /* - * call-seq: (Caution! String("abcd") remain) for stack_argument - * String("abcdefg") = String("abcd") + String("efg") + * call-seq: + * str + other_str -> new_str * - * Returns a new string object containing a copy of <i>str</i>. + * Concatenation---Returns a new <code>String</code> containing + * <i>other_str</i> concatenated to <i>str</i>. + * + * "Hello from " + self.to_s #=> "Hello from main" */ static mrb_value mrb_str_plus_m(mrb_state *mrb, mrb_value self) @@ -838,7 +953,7 @@ mrb_str_times(mrb_state *mrb, mrb_value self) len = RSTRING_LEN(self)*times; str2 = str_new(mrb, 0, len); - str_with_class(mrb, str2, self); + str_with_class(str2, self); p = RSTR_PTR(str2); if (len > 0) { n = RSTRING_LEN(self); @@ -850,6 +965,7 @@ mrb_str_times(mrb_state *mrb, mrb_value self) memcpy(p + n, p, len-n); } p[RSTR_LEN(str2)] = '\0'; + RSTR_COPY_ASCII_FLAG(str2, mrb_str_ptr(self)); return mrb_obj_value(str2); } @@ -979,15 +1095,27 @@ mrb_str_equal_m(mrb_state *mrb, mrb_value str1) return mrb_bool_value(mrb_str_equal(mrb, str1, str2)); } /* ---------------------------------- */ +mrb_value mrb_mod_to_s(mrb_state *mrb, mrb_value klass); + MRB_API mrb_value mrb_str_to_str(mrb_state *mrb, mrb_value str) { - if (!mrb_string_p(str)) { + switch (mrb_type(str)) { + case MRB_TT_STRING: + return str; + case MRB_TT_SYMBOL: + return mrb_sym2str(mrb, mrb_symbol(str)); + case MRB_TT_FIXNUM: + return mrb_fixnum_to_str(mrb, str, 10); + case MRB_TT_CLASS: + case MRB_TT_MODULE: + return mrb_mod_to_s(mrb, str); + default: return mrb_convert_type(mrb, str, MRB_TT_STRING, "String", "to_s"); } - return str; } +/* obslete: use RSTRING_PTR() */ MRB_API const char* mrb_string_value_ptr(mrb_state *mrb, mrb_value str) { @@ -995,6 +1123,7 @@ mrb_string_value_ptr(mrb_state *mrb, mrb_value str) return RSTRING_PTR(str); } +/* obslete: use RSTRING_LEN() */ MRB_API mrb_int mrb_string_value_len(mrb_state *mrb, mrb_value ptr) { @@ -1002,76 +1131,101 @@ mrb_string_value_len(mrb_state *mrb, mrb_value ptr) return RSTRING_LEN(ptr); } -void -mrb_noregexp(mrb_state *mrb, mrb_value self) -{ - mrb_raise(mrb, E_NOTIMP_ERROR, "Regexp class not implemented"); -} - -void -mrb_regexp_check(mrb_state *mrb, mrb_value obj) -{ - if (mrb_regexp_p(mrb, obj)) { - mrb_noregexp(mrb, obj); - } -} - MRB_API mrb_value mrb_str_dup(mrb_state *mrb, mrb_value str) { struct RString *s = mrb_str_ptr(str); struct RString *dup = str_new(mrb, 0, 0); - str_with_class(mrb, dup, str); + str_with_class(dup, str); return str_replace(mrb, dup, s); } -static mrb_value -mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx) -{ - mrb_int idx; +enum str_convert_range { + /* `beg` and `len` are byte unit in `0 ... str.bytesize` */ + STR_BYTE_RANGE_CORRECTED = 1, - mrb_regexp_check(mrb, indx); - switch (mrb_type(indx)) { - case MRB_TT_FIXNUM: - idx = mrb_fixnum(indx); + /* `beg` and `len` are char unit in any range */ + STR_CHAR_RANGE = 2, -num_index: - str = str_substr(mrb, str, idx, 1); - if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value(); - return str; + /* `beg` and `len` are char unit in `0 ... str.size` */ + STR_CHAR_RANGE_CORRECTED = 3, - case MRB_TT_STRING: - if (str_index_str(mrb, str, indx, 0) != -1) - return mrb_str_dup(mrb, indx); - return mrb_nil_value(); + /* `beg` is out of range */ + STR_OUT_OF_RANGE = -1 +}; - case MRB_TT_RANGE: - goto range_arg; +static enum str_convert_range +str_convert_range(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen, mrb_int *beg, mrb_int *len) +{ + if (!mrb_undef_p(alen)) { + *beg = mrb_int(mrb, indx); + *len = mrb_int(mrb, alen); + return STR_CHAR_RANGE; + } + else { + switch (mrb_type(indx)) { + case MRB_TT_FIXNUM: + *beg = mrb_fixnum(indx); + *len = 1; + return STR_CHAR_RANGE; - default: - indx = mrb_Integer(mrb, indx); - if (mrb_nil_p(indx)) { - range_arg: - { - mrb_int beg, len; - - len = RSTRING_CHAR_LEN(str); - switch (mrb_range_beg_len(mrb, indx, &beg, &len, len, TRUE)) { - case 1: - return str_subseq(mrb, str, beg, len); - case 2: - return mrb_nil_value(); + case MRB_TT_STRING: + *beg = str_index_str(mrb, str, indx, 0); + if (*beg < 0) { break; } + *len = RSTRING_LEN(indx); + return STR_BYTE_RANGE_CORRECTED; + + case MRB_TT_RANGE: + goto range_arg; + + default: + indx = mrb_to_int(mrb, indx); + if (mrb_fixnum_p(indx)) { + *beg = mrb_fixnum(indx); + *len = 1; + return STR_CHAR_RANGE; + } +range_arg: + *len = RSTRING_CHAR_LEN(str); + switch (mrb_range_beg_len(mrb, indx, beg, len, *len, TRUE)) { + case MRB_RANGE_OK: + return STR_CHAR_RANGE_CORRECTED; + case MRB_RANGE_OUT: + return STR_OUT_OF_RANGE; default: break; - } } + mrb_raise(mrb, E_TYPE_ERROR, "can't convert to Fixnum"); + } + } + return STR_OUT_OF_RANGE; +} + +static mrb_value +mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen) +{ + mrb_int beg, len; + + switch (str_convert_range(mrb, str, indx, alen, &beg, &len)) { + case STR_CHAR_RANGE_CORRECTED: + return str_subseq(mrb, str, beg, len); + case STR_CHAR_RANGE: + str = str_substr(mrb, str, beg, len); + if (mrb_undef_p(alen) && !mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value(); + return str; + case STR_BYTE_RANGE_CORRECTED: + if (mrb_string_p(indx)) { + return mrb_str_dup(mrb, indx); + } + else { + return mrb_str_byte_subseq(mrb, str, beg, len); } - idx = mrb_fixnum(indx); - goto num_index; + case STR_OUT_OF_RANGE: + default: + return mrb_nil_value(); } - return mrb_nil_value(); /* not reached */ } /* 15.2.10.5.6 */ @@ -1118,20 +1272,118 @@ static mrb_value mrb_str_aref_m(mrb_state *mrb, mrb_value str) { mrb_value a1, a2; - mrb_int argc; - argc = mrb_get_args(mrb, "o|o", &a1, &a2); - if (argc == 2) { - mrb_int n1, n2; + if (mrb_get_args(mrb, "o|o", &a1, &a2) == 1) { + a2 = mrb_undef_value(); + } + + return mrb_str_aref(mrb, str, a1, a2); +} + +static mrb_noreturn void +str_out_of_index(mrb_state *mrb, mrb_value index) +{ + mrb_raisef(mrb, E_INDEX_ERROR, "index %v out of string", index); +} - mrb_regexp_check(mrb, a1); - mrb_get_args(mrb, "ii", &n1, &n2); - return str_substr(mrb, str, n1, n2); +static mrb_value +str_replace_partial(mrb_state *mrb, mrb_value src, mrb_int pos, mrb_int end, mrb_value rep) +{ + const mrb_int shrink_threshold = 256; + struct RString *str = mrb_str_ptr(src); + mrb_int len = RSTR_LEN(str); + mrb_int replen, newlen; + char *strp; + + if (end > len) { end = len; } + + if (pos < 0 || pos > len) { + str_out_of_index(mrb, mrb_fixnum_value(pos)); } - if (argc != 1) { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(argc)); + + replen = (mrb_nil_p(rep) ? 0 : RSTRING_LEN(rep)); + newlen = replen + len - (end - pos); + + if (newlen >= MRB_INT_MAX || newlen < replen /* overflowed */) { + mrb_raise(mrb, E_RUNTIME_ERROR, "string size too big"); + } + + mrb_str_modify(mrb, str); + + if (len < newlen) { + resize_capa(mrb, str, newlen); + } + + strp = RSTR_PTR(str); + + memmove(strp + newlen - (len - end), strp + end, len - end); + if (!mrb_nil_p(rep)) { + memmove(strp + pos, RSTRING_PTR(rep), replen); + } + RSTR_SET_LEN(str, newlen); + strp[newlen] = '\0'; + + if (len - newlen >= shrink_threshold) { + resize_capa(mrb, str, newlen); + } + + return src; +} + +static void +mrb_str_aset(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen, mrb_value replace) +{ + mrb_int beg, len, charlen; + + mrb_to_str(mrb, replace); + + switch (str_convert_range(mrb, str, indx, alen, &beg, &len)) { + case STR_OUT_OF_RANGE: + default: + mrb_raise(mrb, E_INDEX_ERROR, "string not matched"); + case STR_CHAR_RANGE: + if (len < 0) { + mrb_raisef(mrb, E_INDEX_ERROR, "negative length %v", alen); + } + charlen = RSTRING_CHAR_LEN(str); + if (beg < 0) { beg += charlen; } + if (beg < 0 || beg > charlen) { str_out_of_index(mrb, indx); } + /* fall through */ + case STR_CHAR_RANGE_CORRECTED: + str_range_to_bytes(str, &beg, &len); + /* fall through */ + case STR_BYTE_RANGE_CORRECTED: + str_replace_partial(mrb, str, beg, beg + len, replace); + } +} + +/* + * call-seq: + * str[fixnum] = replace + * str[fixnum, fixnum] = replace + * str[range] = replace + * str[regexp] = replace + * str[regexp, fixnum] = replace + * str[other_str] = replace + * + * Modify +self+ by replacing the content of +self+. + * The portion of the string affected is determined using the same criteria as +String#[]+. + */ +static mrb_value +mrb_str_aset_m(mrb_state *mrb, mrb_value str) +{ + mrb_value indx, alen, replace; + + switch (mrb_get_args(mrb, "oo|S!", &indx, &alen, &replace)) { + case 2: + replace = alen; + alen = mrb_undef_value(); + break; + case 3: + break; } - return mrb_str_aref(mrb, str, a1); + mrb_str_aset(mrb, str, indx, alen, replace); + return str; } /* 15.2.10.5.8 */ @@ -1154,7 +1406,7 @@ mrb_str_capitalize_bang(mrb_state *mrb, mrb_value str) mrb_bool modify = FALSE; struct RString *s = mrb_str_ptr(str); - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); if (RSTR_LEN(s) == 0 || !RSTR_PTR(s)) return mrb_nil_value(); p = RSTR_PTR(s); pend = RSTR_PTR(s) + RSTR_LEN(s); if (ISLOWER(*p)) { @@ -1213,7 +1465,7 @@ mrb_str_chomp_bang(mrb_state *mrb, mrb_value str) struct RString *s = mrb_str_ptr(str); argc = mrb_get_args(mrb, "|S", &rs); - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); len = RSTR_LEN(s); if (argc == 0) { if (len == 0) return mrb_nil_value(); @@ -1312,7 +1564,7 @@ mrb_str_chop_bang(mrb_state *mrb, mrb_value str) { struct RString *s = mrb_str_ptr(str); - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); if (RSTR_LEN(s) > 0) { mrb_int len; #ifdef MRB_UTF8_STRING @@ -1381,7 +1633,7 @@ mrb_str_downcase_bang(mrb_state *mrb, mrb_value str) mrb_bool modify = FALSE; struct RString *s = mrb_str_ptr(str); - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); p = RSTR_PTR(s); pend = RSTR_PTR(s) + RSTR_LEN(s); while (p < pend) { @@ -1538,32 +1790,26 @@ mrb_str_include(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_index_m(mrb_state *mrb, mrb_value str) { - mrb_value *argv; - mrb_int argc; mrb_value sub; - mrb_int pos, clen; + mrb_int pos; - mrb_get_args(mrb, "*!", &argv, &argc); - if (argc == 2) { - mrb_get_args(mrb, "oi", &sub, &pos); - } - else { - pos = 0; - if (argc > 0) - sub = argv[0]; - else + switch (mrb_get_args(mrb, "|oi", &sub, &pos)) { + case 0: sub = mrb_nil_value(); + /* fall through */ + case 1: + pos = 0; + break; + case 2: + if (pos < 0) { + mrb_int clen = RSTRING_CHAR_LEN(str); + pos += clen; + if (pos < 0) { + return mrb_nil_value(); + } + } + break; } - mrb_regexp_check(mrb, sub); - clen = RSTRING_CHAR_LEN(str); - if (pos < 0) { - pos += clen; - if (pos < 0) { - return mrb_nil_value(); - } - } - if (pos > clen) return mrb_nil_value(); - pos = chars2bytes(str, 0, pos); switch (mrb_type(sub)) { default: { @@ -1571,24 +1817,21 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) tmp = mrb_check_string_type(mrb, sub); if (mrb_nil_p(tmp)) { - mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub); + mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %v given", sub); } sub = tmp; } /* fall through */ case MRB_TT_STRING: - pos = str_index_str(mrb, str, sub, pos); + pos = str_index_str_by_char(mrb, str, sub, pos); break; } if (pos == -1) return mrb_nil_value(); - pos = bytes2chars(RSTRING_PTR(str), pos); BYTES_ALIGN_CHECK(pos); return mrb_fixnum_value(pos); } -#define STR_REPLACE_SHARED_MIN 10 - /* 15.2.10.5.24 */ /* 15.2.10.5.28 */ /* @@ -1698,6 +1941,18 @@ mrb_ptr_to_str(mrb_state *mrb, void *p) return mrb_obj_value(p_str); } +static inline void +str_reverse(char *p, char *e) +{ + char c; + + while (p < e) { + c = *p; + *p++ = *e; + *e-- = c; + } +} + /* 15.2.10.5.30 */ /* * call-seq: @@ -1708,53 +1963,38 @@ mrb_ptr_to_str(mrb_state *mrb, void *p) static mrb_value mrb_str_reverse_bang(mrb_state *mrb, mrb_value str) { + struct RString *s = mrb_str_ptr(str); + char *p, *e; + #ifdef MRB_UTF8_STRING mrb_int utf8_len = RSTRING_CHAR_LEN(str); - mrb_int len = RSTRING_LEN(str); - - if (utf8_len == len) goto bytes; - if (utf8_len > 1) { - char *buf; - char *p, *e, *r; - - mrb_str_modify(mrb, mrb_str_ptr(str)); - len = RSTRING_LEN(str); - buf = (char*)mrb_malloc(mrb, (size_t)len); - p = buf; - e = buf + len; - - memcpy(buf, RSTRING_PTR(str), len); - r = RSTRING_PTR(str) + len; + mrb_int len = RSTR_LEN(s); + if (utf8_len < 2) return str; + if (utf8_len < len) { + mrb_str_modify(mrb, s); + p = RSTR_PTR(s); + e = p + RSTR_LEN(s); while (p<e) { mrb_int clen = utf8len(p, e); - r -= clen; - memcpy(r, p, clen); + str_reverse(p, p + clen - 1); p += clen; } - mrb_free(mrb, buf); + goto bytes; } - return str; - - bytes: #endif - { - struct RString *s = mrb_str_ptr(str); - char *p, *e; - char c; + if (RSTR_LEN(s) > 1) { mrb_str_modify(mrb, s); - if (RSTR_LEN(s) > 1) { - p = RSTR_PTR(s); - e = p + RSTR_LEN(s) - 1; - while (p < e) { - c = *p; - *p++ = *e; - *e-- = c; - } - } - return str; + goto bytes; } + return str; + + bytes: + p = RSTR_PTR(s); + e = p + RSTR_LEN(s) - 1; + str_reverse(p, e); + return str; } /* ---------------------------------- */ @@ -1797,32 +2037,27 @@ mrb_str_reverse(mrb_state *mrb, mrb_value str) static mrb_value mrb_str_rindex(mrb_state *mrb, mrb_value str) { - mrb_value *argv; - mrb_int argc; mrb_value sub; mrb_int pos, len = RSTRING_CHAR_LEN(str); - mrb_get_args(mrb, "*!", &argv, &argc); - if (argc == 2) { - mrb_get_args(mrb, "oi", &sub, &pos); - if (pos < 0) { - pos += len; + switch (mrb_get_args(mrb, "|oi", &sub, &pos)) { + case 0: + sub = mrb_nil_value(); + /* fall through */ + case 1: + pos = len; + break; + case 2: if (pos < 0) { - mrb_regexp_check(mrb, sub); - return mrb_nil_value(); + pos += len; + if (pos < 0) { + return mrb_nil_value(); + } } - } - if (pos > len) pos = len; - } - else { - pos = len; - if (argc > 0) - sub = argv[0]; - else - sub = mrb_nil_value(); + if (pos > len) pos = len; + break; } pos = chars2bytes(str, 0, pos); - mrb_regexp_check(mrb, sub); switch (mrb_type(sub)) { default: { @@ -1830,7 +2065,7 @@ mrb_str_rindex(mrb_state *mrb, mrb_value str) tmp = mrb_check_string_type(mrb, sub); if (mrb_nil_p(tmp)) { - mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub); + mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %v given", sub); } sub = tmp; } @@ -1838,7 +2073,7 @@ mrb_str_rindex(mrb_state *mrb, mrb_value str) case MRB_TT_STRING: pos = str_rindex(mrb, str, sub, pos); if (pos >= 0) { - pos = bytes2chars(RSTRING_PTR(str), pos); + pos = bytes2chars(RSTRING_PTR(str), RSTRING_LEN(str), pos); BYTES_ALIGN_CHECK(pos); return mrb_fixnum_value(pos); } @@ -1916,16 +2151,11 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) if (argc == 0 || mrb_nil_p(spat)) { split_type = awk; } - else { - if (mrb_string_p(spat)) { - split_type = string; - if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') { - split_type = awk; - } - } - else { - mrb_noregexp(mrb, str); - } + else if (!mrb_string_p(spat)) { + mrb_raise(mrb, E_TYPE_ERROR, "expected String"); + } + else if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') { + split_type = awk; } result = mrb_ary_new(mrb); @@ -1951,7 +2181,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) } } else if (ISSPACE(c)) { - mrb_ary_push(mrb, result, byte_subseq(mrb, str, beg, end-beg)); + mrb_ary_push(mrb, result, mrb_str_byte_subseq(mrb, str, beg, end-beg)); mrb_gc_arena_restore(mrb, ai); skip = TRUE; beg = idx; @@ -1962,7 +2192,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) } } } - else if (split_type == string) { + else { /* split_type == string */ mrb_int str_len = RSTRING_LEN(str); mrb_int pat_len = RSTRING_LEN(spat); mrb_int idx = 0; @@ -1976,22 +2206,19 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) else { end = chars2bytes(str, idx, 1); } - mrb_ary_push(mrb, result, byte_subseq(mrb, str, idx, end)); + mrb_ary_push(mrb, result, mrb_str_byte_subseq(mrb, str, idx, end)); mrb_gc_arena_restore(mrb, ai); idx += end + pat_len; if (lim_p && lim <= ++i) break; } beg = idx; } - else { - mrb_noregexp(mrb, str); - } if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) { if (RSTRING_LEN(str) == beg) { tmp = mrb_str_new_empty(mrb, str); } else { - tmp = byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg); + tmp = mrb_str_byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg); } mrb_ary_push(mrb, result, tmp); } @@ -2005,7 +2232,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) return result; } -MRB_API mrb_value +static mrb_value mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, int badcheck) { const char *p = str; @@ -2089,7 +2316,7 @@ mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, break; default: if (base < 2 || 36 < base) { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %S", mrb_fixnum_value(base)); + mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %i", base); } break; } /* end of switch (base) { */ @@ -2149,8 +2376,7 @@ mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, else #endif { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "string (%S) too big for integer", - mrb_str_new(mrb, str, pend-str)); + mrb_raisef(mrb, E_RANGE_ERROR, "string (%l) too big for integer", str, pend-str); } } } @@ -2166,35 +2392,49 @@ mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); /* not reached */ bad: - mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%S)", - mrb_inspect(mrb, mrb_str_new(mrb, str, pend-str))); + mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%!l)", str, pend-str); /* not reached */ return mrb_fixnum_value(0); } MRB_API mrb_value -mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) +mrb_cstr_to_inum(mrb_state *mrb, const char *str, mrb_int base, mrb_bool badcheck) { return mrb_str_len_to_inum(mrb, str, strlen(str), base, badcheck); } +/* obslete: use RSTRING_CSTR() or mrb_string_cstr() */ MRB_API const char* mrb_string_value_cstr(mrb_state *mrb, mrb_value *ptr) { - mrb_value str = mrb_to_str(mrb, *ptr); - struct RString *ps = mrb_str_ptr(str); - mrb_int len = mrb_str_strlen(mrb, ps); - char *p = RSTR_PTR(ps); + struct RString *ps; + const char *p; + mrb_int len; - if (!p || p[len] != '\0') { - if (MRB_FROZEN_P(ps)) { - *ptr = str = mrb_str_dup(mrb, str); - ps = mrb_str_ptr(str); - } + check_null_byte(mrb, *ptr); + ps = mrb_str_ptr(*ptr); + p = RSTR_PTR(ps); + len = RSTR_LEN(ps); + if (p[len] == '\0') { + return p; + } + if (MRB_FROZEN_P(ps) || RSTR_CAPA(ps) == len) { + ps = str_new(mrb, NULL, len+1); + memcpy(RSTR_PTR(ps), p, len); + RSTR_SET_LEN(ps, len); + *ptr = mrb_obj_value(ps); + } + else { mrb_str_modify(mrb, ps); - return RSTR_PTR(ps); } - return p; + RSTR_PTR(ps)[len] = '\0'; + return RSTR_PTR(ps); +} + +MRB_API const char* +mrb_string_cstr(mrb_state *mrb, mrb_value str) +{ + return mrb_string_value_cstr(mrb, &str); } MRB_API mrb_value @@ -2203,7 +2443,8 @@ mrb_str_to_inum(mrb_state *mrb, mrb_value str, mrb_int base, mrb_bool badcheck) const char *s; mrb_int len; - s = mrb_string_value_ptr(mrb, str); + mrb_to_str(mrb, str); + s = RSTRING_PTR(str); len = RSTRING_LEN(str); return mrb_str_len_to_inum(mrb, s, len, base, badcheck); } @@ -2236,7 +2477,7 @@ mrb_str_to_i(mrb_state *mrb, mrb_value self) mrb_get_args(mrb, "|i", &base); if (base < 0) { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %S", mrb_fixnum_value(base)); + mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %i", base); } return mrb_str_to_inum(mrb, self, base, FALSE); } @@ -2261,7 +2502,7 @@ mrb_cstr_to_dbl(mrb_state *mrb, const char * p, mrb_bool badcheck) if (p == end) { if (badcheck) { bad: - mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%S)", mrb_str_new_cstr(mrb, p)); + mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%s)", p); /* not reached */ } return d; @@ -2308,22 +2549,7 @@ bad: MRB_API double mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck) { - char *s; - mrb_int len; - - mrb_to_str(mrb, str); - s = RSTRING_PTR(str); - len = RSTRING_LEN(str); - if (s) { - if (badcheck && memchr(s, '\0', len)) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string for Float contains null byte"); - } - if (s[len]) { /* no sentinel somehow */ - struct RString *temp_str = str_new(mrb, s, len); - s = RSTR_PTR(temp_str); - } - } - return mrb_cstr_to_dbl(mrb, s, badcheck); + return mrb_cstr_to_dbl(mrb, RSTRING_CSTR(mrb, str), badcheck); } /* 15.2.10.5.39 */ @@ -2378,7 +2604,7 @@ mrb_str_upcase_bang(mrb_state *mrb, mrb_value str) char *p, *pend; mrb_bool modify = FALSE; - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); p = RSTRING_PTR(str); pend = RSTRING_END(str); while (p < pend) { @@ -2459,7 +2685,7 @@ mrb_str_dump(mrb_state *mrb, mrb_value str) } result = str_new(mrb, 0, len); - str_with_class(mrb, result, str); + str_with_class(result, str); p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); q = RSTR_PTR(result); *q++ = '"'; @@ -2621,6 +2847,9 @@ mrb_str_inspect(mrb_state *mrb, mrb_value str) const char *p, *pend; char buf[CHAR_ESC_LEN + 1]; mrb_value result = mrb_str_new_lit(mrb, "\""); +#ifdef MRB_UTF8_STRING + uint32_t ascii_flag = MRB_STR_ASCII; +#endif p = RSTRING_PTR(str); pend = RSTRING_END(str); for (;p < pend; p++) { @@ -2637,6 +2866,7 @@ mrb_str_inspect(mrb_state *mrb, mrb_value str) } mrb_str_cat(mrb, result, buf, clen); p += clen-1; + ascii_flag = 0; continue; } #endif @@ -2678,6 +2908,10 @@ mrb_str_inspect(mrb_state *mrb, mrb_value str) } } mrb_str_cat_lit(mrb, result, "\""); +#ifdef MRB_UTF8_STRING + mrb_str_ptr(str)->flags |= ascii_flag; + mrb_str_ptr(result)->flags |= ascii_flag; +#endif return result; } @@ -2711,7 +2945,8 @@ mrb_init_string(mrb_state *mrb) { struct RClass *s; - mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << 5), "pointer size too big for embedded string"); + mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << MRB_STR_EMBED_LEN_BITSIZE), + "pointer size too big for embedded string"); mrb->string_class = s = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */ MRB_SET_INSTANCE_TT(s, MRB_TT_STRING); @@ -2723,6 +2958,7 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "+", mrb_str_plus_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.4 */ mrb_define_method(mrb, s, "*", mrb_str_times, MRB_ARGS_REQ(1)); /* 15.2.10.5.5 */ mrb_define_method(mrb, s, "[]", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.6 */ + mrb_define_method(mrb, s, "[]=", mrb_str_aset_m, MRB_ARGS_ANY()); mrb_define_method(mrb, s, "capitalize", mrb_str_capitalize, MRB_ARGS_NONE()); /* 15.2.10.5.7 */ mrb_define_method(mrb, s, "capitalize!", mrb_str_capitalize_bang, MRB_ARGS_NONE()); /* 15.2.10.5.8 */ mrb_define_method(mrb, s, "chomp", mrb_str_chomp, MRB_ARGS_ANY()); /* 15.2.10.5.9 */ @@ -2754,6 +2990,7 @@ mrb_init_string(mrb_state *mrb) #endif mrb_define_method(mrb, s, "to_i", mrb_str_to_i, MRB_ARGS_ANY()); /* 15.2.10.5.39 */ mrb_define_method(mrb, s, "to_s", mrb_str_to_s, MRB_ARGS_NONE()); /* 15.2.10.5.40 */ + mrb_define_method(mrb, s, "to_str", mrb_str_to_s, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "to_sym", mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.41 */ mrb_define_method(mrb, s, "upcase", mrb_str_upcase, MRB_ARGS_NONE()); /* 15.2.10.5.42 */ mrb_define_method(mrb, s, "upcase!", mrb_str_upcase_bang, MRB_ARGS_NONE()); /* 15.2.10.5.43 */ @@ -2842,7 +3079,7 @@ mrb_float_read(const char *string, char **endPtr) */ p = string; - while (isspace(*p)) { + while (ISSPACE(*p)) { p += 1; } if (*p == '-') { @@ -2865,7 +3102,7 @@ mrb_float_read(const char *string, char **endPtr) for (mantSize = 0; ; mantSize += 1) { c = *p; - if (!isdigit(c)) { + if (!ISDIGIT(c)) { if ((c != '.') || (decPt >= 0)) { break; } @@ -2950,7 +3187,7 @@ mrb_float_read(const char *string, char **endPtr) } expSign = FALSE; } - while (isdigit(*p)) { + while (ISDIGIT(*p)) { exp = exp * 10 + (*p - '0'); if (exp > 19999) { exp = 19999; |
