diff options
Diffstat (limited to 'src/string.c')
| -rw-r--r-- | src/string.c | 2672 |
1 files changed, 1528 insertions, 1144 deletions
diff --git a/src/string.c b/src/string.c index 23cd76747..03d9c8a9c 100644 --- a/src/string.c +++ b/src/string.c @@ -4,65 +4,209 @@ ** See Copyright Notice in mruby.h */ -#include <ctype.h> +#ifdef _MSC_VER +# define _CRT_NONSTDC_NO_DEPRECATE +#endif + +#ifndef MRB_NO_FLOAT #include <float.h> +#include <math.h> +#endif #include <limits.h> #include <stddef.h> #include <stdlib.h> #include <string.h> -#include "mruby.h" -#include "mruby/array.h" -#include "mruby/class.h" -#include "mruby/range.h" -#include "mruby/string.h" -#include "mruby/numeric.h" -#include "mruby/re.h" - -const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; +#include <mruby.h> +#include <mruby/array.h> +#include <mruby/class.h> +#include <mruby/range.h> +#include <mruby/string.h> +#include <mruby/numeric.h> +#include <mruby/presym.h> typedef struct mrb_shared_string { - mrb_bool nofree : 1; int refcnt; + mrb_ssize capa; char *ptr; - mrb_int len; } mrb_shared_string; -static mrb_value str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2); -static mrb_value mrb_str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len); +const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; -MRB_API mrb_int -mrb_str_strlen(mrb_state *mrb, struct RString *s) +#define mrb_obj_alloc_string(mrb) MRB_OBJ_ALLOC((mrb), MRB_TT_STRING, (mrb)->string_class) + +static struct RString* +str_init_normal_capa(mrb_state *mrb, struct RString *s, + const char *p, size_t len, size_t capa) +{ + char *dst = (char *)mrb_malloc(mrb, capa + 1); + if (p) memcpy(dst, p, len); + dst[len] = '\0'; + s->as.heap.ptr = dst; + s->as.heap.len = (mrb_ssize)len; + s->as.heap.aux.capa = (mrb_ssize)capa; + RSTR_UNSET_TYPE_FLAG(s); + return s; +} + +static struct RString* +str_init_normal(mrb_state *mrb, struct RString *s, const char *p, size_t len) { - mrb_int i, max = RSTR_LEN(s); - char *p = RSTR_PTR(s); + return str_init_normal_capa(mrb, s, p, len, len); +} - if (!p) return 0; - for (i=0; i<max; i++) { - if (p[i] == '\0') { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); - } +static struct RString* +str_init_embed(struct RString *s, const char *p, size_t len) +{ + if (p) memcpy(RSTR_EMBED_PTR(s), p, len); + RSTR_EMBED_PTR(s)[len] = '\0'; + RSTR_SET_TYPE_FLAG(s, EMBED); + RSTR_SET_EMBED_LEN(s, len); + return s; +} + +static struct RString* +str_init_nofree(struct RString *s, const char *p, size_t len) +{ + s->as.heap.ptr = (char *)p; + s->as.heap.len = (mrb_ssize)len; + s->as.heap.aux.capa = 0; /* nofree */ + RSTR_SET_TYPE_FLAG(s, NOFREE); + return s; +} + +static struct RString* +str_init_shared(mrb_state *mrb, const struct RString *orig, struct RString *s, mrb_shared_string *shared) +{ + if (shared) { + shared->refcnt++; } - return max; + else { + shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string)); + shared->refcnt = 1; + shared->ptr = orig->as.heap.ptr; + shared->capa = orig->as.heap.aux.capa; + } + s->as.heap.ptr = orig->as.heap.ptr; + s->as.heap.len = orig->as.heap.len; + s->as.heap.aux.shared = shared; + RSTR_SET_TYPE_FLAG(s, SHARED); + return s; } -static inline void -resize_capa(mrb_state *mrb, struct RString *s, mrb_int capacity) +static struct RString* +str_init_fshared(const struct RString *orig, struct RString *s, struct RString *fshared) +{ + s->as.heap.ptr = orig->as.heap.ptr; + s->as.heap.len = orig->as.heap.len; + s->as.heap.aux.fshared = fshared; + RSTR_SET_TYPE_FLAG(s, FSHARED); + return s; +} + +static struct RString* +str_init_modifiable(mrb_state *mrb, struct RString *s, const char *p, size_t len) +{ + if (RSTR_EMBEDDABLE_P(len)) { + return str_init_embed(s, p, len); + } + else { + return str_init_normal(mrb, s, p, len); + } +} + +static struct RString* +str_new_static(mrb_state *mrb, const char *p, size_t len) +{ + if (RSTR_EMBEDDABLE_P(len)) { + return str_init_embed(mrb_obj_alloc_string(mrb), p, len); + } + if (len >= MRB_SSIZE_MAX) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); + } + return str_init_nofree(mrb_obj_alloc_string(mrb), p, len); +} + +static struct RString* +str_new(mrb_state *mrb, const char *p, size_t len) +{ + if (RSTR_EMBEDDABLE_P(len)) { + return str_init_embed(mrb_obj_alloc_string(mrb), p, len); + } + if (len >= MRB_SSIZE_MAX) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); + } + if (p && mrb_ro_data_p(p)) { + return str_init_nofree(mrb_obj_alloc_string(mrb), p, len); + } + return str_init_normal(mrb, mrb_obj_alloc_string(mrb), p, len); +} + +MRB_API mrb_value +mrb_str_new_capa(mrb_state *mrb, size_t capa) +{ + struct RString *s; + + if (RSTR_EMBEDDABLE_P(capa)) { + s = str_init_embed(mrb_obj_alloc_string(mrb), NULL, 0); + } + else if (capa >= MRB_SSIZE_MAX) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string capacity size too big"); + /* not reached */ + s = NULL; + } + else { + s = str_init_normal_capa(mrb, mrb_obj_alloc_string(mrb), NULL, 0, capa); + } + + return mrb_obj_value(s); +} + +static void +resize_capa(mrb_state *mrb, struct RString *s, size_t capacity) { +#if SIZE_MAX > MRB_SSIZE_MAX + mrb_assert(capacity < MRB_SSIZE_MAX); +#endif if (RSTR_EMBED_P(s)) { - if (RSTRING_EMBED_LEN_MAX < capacity) { - char *const tmp = (char *)mrb_malloc(mrb, capacity+1); - const mrb_int len = RSTR_EMBED_LEN(s); - memcpy(tmp, s->as.ary, len); - RSTR_UNSET_EMBED_FLAG(s); - s->as.heap.ptr = tmp; - s->as.heap.len = len; - s->as.heap.aux.capa = capacity; + if (!RSTR_EMBEDDABLE_P(capacity)) { + str_init_normal_capa(mrb, s, RSTR_EMBED_PTR(s), RSTR_EMBED_LEN(s), capacity); } } else { - s->as.heap.ptr = (char *)mrb_realloc(mrb, RSTR_PTR(s), capacity+1); - s->as.heap.aux.capa = capacity; + s->as.heap.ptr = (char*)mrb_realloc(mrb, RSTR_PTR(s), capacity+1); + s->as.heap.aux.capa = (mrb_ssize)capacity; + } +} + +MRB_API mrb_value +mrb_str_new(mrb_state *mrb, const char *p, size_t len) +{ + return mrb_obj_value(str_new(mrb, p, len)); +} + +MRB_API mrb_value +mrb_str_new_cstr(mrb_state *mrb, const char *p) +{ + struct RString *s; + size_t len; + + if (p) { + len = strlen(p); + } + else { + len = 0; } + + s = str_new(mrb, p, len); + + return mrb_obj_value(s); +} + +MRB_API mrb_value +mrb_str_new_static(mrb_state *mrb, const char *p, size_t len) +{ + struct RString *s = str_new_static(mrb, p, len); + return mrb_obj_value(s); } static void @@ -70,341 +214,643 @@ str_decref(mrb_state *mrb, mrb_shared_string *shared) { shared->refcnt--; if (shared->refcnt == 0) { - if (!shared->nofree) { - mrb_free(mrb, shared->ptr); - } + mrb_free(mrb, shared->ptr); mrb_free(mrb, shared); } } -MRB_API void -mrb_str_modify(mrb_state *mrb, struct RString *s) +static void +str_modify_keep_ascii(mrb_state *mrb, struct RString *s) { if (RSTR_SHARED_P(s)) { mrb_shared_string *shared = s->as.heap.aux.shared; if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) { - s->as.heap.ptr = shared->ptr; - s->as.heap.aux.capa = shared->len; - RSTR_PTR(s)[s->as.heap.len] = '\0'; + s->as.heap.aux.capa = shared->capa; + s->as.heap.ptr[s->as.heap.len] = '\0'; + RSTR_UNSET_SHARED_FLAG(s); mrb_free(mrb, shared); } else { - char *ptr, *p; - mrb_int len; - - p = RSTR_PTR(s); - len = s->as.heap.len; - ptr = (char *)mrb_malloc(mrb, (size_t)len + 1); - if (p) { - memcpy(ptr, p, len); - } - ptr[len] = '\0'; - s->as.heap.ptr = ptr; - s->as.heap.aux.capa = len; + str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len); str_decref(mrb, shared); } - RSTR_UNSET_SHARED_FLAG(s); - return; } - if (RSTR_NOFREE_P(s)) { - char *p = s->as.heap.ptr; + else if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) { + str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len); + } +} - s->as.heap.ptr = (char *)mrb_malloc(mrb, (size_t)s->as.heap.len+1); - if (p) { - memcpy(RSTR_PTR(s), p, s->as.heap.len); - } - RSTR_PTR(s)[s->as.heap.len] = '\0'; - s->as.heap.aux.capa = s->as.heap.len; - RSTR_UNSET_NOFREE_FLAG(s); - return; +static void +check_null_byte(mrb_state *mrb, mrb_value str) +{ + mrb_to_str(mrb, str); + if (memchr(RSTRING_PTR(str), '\0', RSTRING_LEN(str))) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); } } -MRB_API mrb_value -mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len) +void +mrb_gc_free_str(mrb_state *mrb, struct RString *str) +{ + if (RSTR_EMBED_P(str)) + /* no code */; + else if (RSTR_SHARED_P(str)) + str_decref(mrb, str->as.heap.aux.shared); + else if (!RSTR_NOFREE_P(str) && !RSTR_FSHARED_P(str)) + mrb_free(mrb, str->as.heap.ptr); +} + +#ifdef MRB_UTF8_STRING +static const char utf8len_codepage[256] = +{ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1, +}; + +#define utf8_islead(c) ((unsigned char)((c)&0xc0) != 0x80) + +mrb_int +mrb_utf8len(const char* p, const char* e) +{ + mrb_int len; + mrb_int i; + + if ((unsigned char)*p < 0x80) return 1; + len = utf8len_codepage[(unsigned char)*p]; + if (len == 1) return 1; + if (len > e - p) return 1; + for (i = 1; i < len; ++i) + if (utf8_islead(p[i])) + return 1; + return len; +} + +mrb_int +mrb_utf8_strlen(const char *str, mrb_int byte_len) +{ + mrb_int len = 0; + const char *p = str; + const char *e = p + byte_len; + + while (p < e) { + p += mrb_utf8len(p, e); + len++; + } + return len; +} + +static mrb_int +utf8_strlen(mrb_value str) { - mrb_int slen; struct RString *s = mrb_str_ptr(str); + mrb_int byte_len = RSTR_LEN(s); - mrb_str_modify(mrb, s); - slen = RSTR_LEN(s); - if (len != slen) { - if (slen < len || slen - len > 256) { - resize_capa(mrb, s, len); - } - RSTR_SET_LEN(s, len); - RSTR_PTR(s)[len] = '\0'; /* sentinel */ + if (RSTR_ASCII_P(s)) { + return byte_len; + } + else { + mrb_int utf8_len = mrb_utf8_strlen(RSTR_PTR(s), byte_len); + if (byte_len == utf8_len) RSTR_SET_ASCII_FLAG(s); + return utf8_len; } - return str; } -#define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class)) +#define RSTRING_CHAR_LEN(s) utf8_strlen(s) -static struct RString* -str_new_static(mrb_state *mrb, const char *p, size_t len) +/* map character index to byte offset index */ +static mrb_int +chars2bytes(mrb_value s, mrb_int off, mrb_int idx) { - struct RString *s; - - if (len >= MRB_INT_MAX) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); + if (RSTR_ASCII_P(mrb_str_ptr(s))) { + return idx; } - s = mrb_obj_alloc_string(mrb); - s->as.heap.len = len; - s->as.heap.aux.capa = 0; /* nofree */ - s->as.heap.ptr = (char *)p; - s->flags = MRB_STR_NOFREE; + else { + mrb_int i, b, n; + const char *p = RSTRING_PTR(s) + off; + const char *e = RSTRING_END(s); - return s; + for (b=i=0; p<e && i<idx; i++) { + n = mrb_utf8len(p, e); + b += n; + p += n; + } + return b; + } } -static struct RString* -str_new(mrb_state *mrb, const char *p, size_t len) +/* map byte offset to character index */ +static mrb_int +bytes2chars(char *p, mrb_int len, mrb_int bi) { - struct RString *s; + const char *e = p + (size_t)len; + const char *pivot = p + bi; + mrb_int i; - if (mrb_ro_data_p(p)) { - return str_new_static(mrb, p, len); + for (i = 0; p < pivot; i ++) { + p += mrb_utf8len(p, e); } - s = mrb_obj_alloc_string(mrb); - if (len < RSTRING_EMBED_LEN_MAX) { - RSTR_SET_EMBED_FLAG(s); - RSTR_SET_EMBED_LEN(s, len); - if (p) { - memcpy(s->as.ary, p, len); - } - } else { - if (len >= MRB_INT_MAX) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); + if (p != pivot) return -1; + return i; +} + +static const char * +char_adjust(const char *beg, const char *end, const char *ptr) +{ + if ((ptr > beg || ptr < end) && (*ptr & 0xc0) == 0x80) { + const int utf8_adjust_max = 3; + const char *p; + + if (ptr - beg > utf8_adjust_max) { + beg = ptr - utf8_adjust_max; } - s->as.heap.len = len; - s->as.heap.aux.capa = len; - s->as.heap.ptr = (char *)mrb_malloc(mrb, len+1); - if (p) { - memcpy(s->as.heap.ptr, p, len); + + p = ptr; + while (p > beg) { + p --; + if ((*p & 0xc0) != 0x80) { + int clen = mrb_utf8len(p, end); + if (clen > ptr - p) return p; + break; + } } } - RSTR_PTR(s)[len] = '\0'; - return s; -} -static inline void -str_with_class(mrb_state *mrb, struct RString *s, mrb_value obj) -{ - s->c = mrb_str_ptr(obj)->c; + return ptr; } -static mrb_value -mrb_str_new_empty(mrb_state *mrb, mrb_value str) +static const char * +char_backtrack(const char *ptr, const char *end) { - struct RString *s = str_new(mrb, 0, 0); + if (ptr < end) { + const int utf8_bytelen_max = 4; + const char *p; - str_with_class(mrb, s, str); - return mrb_obj_value(s); -} + if (end - ptr > utf8_bytelen_max) { + ptr = end - utf8_bytelen_max; + } -#ifndef MRB_STR_BUF_MIN_SIZE -# define MRB_STR_BUF_MIN_SIZE 128 -#endif + p = end; + while (p > ptr) { + p --; + if ((*p & 0xc0) != 0x80) { + int clen = utf8len_codepage[(unsigned char)*p]; + if (clen == end - p) { return p; } + break; + } + } + } -MRB_API mrb_value -mrb_str_buf_new(mrb_state *mrb, size_t capa) + return end - 1; +} + +static mrb_int +str_index_str_by_char_search(mrb_state *mrb, const char *p, const char *pend, const char *s, const mrb_int slen, mrb_int off) { - struct RString *s; + /* Based on Quick Search algorithm (Boyer-Moore-Horspool algorithm) */ - s = mrb_obj_alloc_string(mrb); + ptrdiff_t qstable[1 << CHAR_BIT]; - if (capa >= MRB_INT_MAX) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string capacity size too big"); + /* Preprocessing */ + { + mrb_int i; + + for (i = 0; i < 1 << CHAR_BIT; i ++) { + qstable[i] = slen; + } + for (i = 0; i < slen; i ++) { + qstable[(unsigned char)s[i]] = slen - (i + 1); + } } - if (capa < MRB_STR_BUF_MIN_SIZE) { - capa = MRB_STR_BUF_MIN_SIZE; + + /* Searching */ + while (p < pend && pend - p >= slen) { + const char *pivot; + + if (memcmp(p, s, slen) == 0) { + return off; + } + + pivot = p + qstable[(unsigned char)p[slen - 1]]; + if (pivot >= pend || pivot < p /* overflowed */) { return -1; } + + do { + p += mrb_utf8len(p, pend); + off ++; + } while (p < pivot); } - s->as.heap.len = 0; - s->as.heap.aux.capa = capa; - s->as.heap.ptr = (char *)mrb_malloc(mrb, capa+1); - RSTR_PTR(s)[0] = '\0'; - return mrb_obj_value(s); + return -1; } -static void -str_buf_cat(mrb_state *mrb, struct RString *s, const char *ptr, size_t len) +static mrb_int +str_index_str_by_char(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) { - size_t capa; - size_t total; - ptrdiff_t off = -1; + const char *p = RSTRING_PTR(str); + const char *pend = p + RSTRING_LEN(str); + const char *s = RSTRING_PTR(sub); + const mrb_int slen = RSTRING_LEN(sub); + mrb_int off = pos; - if (len == 0) return; - mrb_str_modify(mrb, s); - if (ptr >= RSTR_PTR(s) && ptr <= RSTR_PTR(s) + (size_t)RSTR_LEN(s)) { - off = ptr - RSTR_PTR(s); + for (; pos > 0; pos --) { + if (pend - p < 1) { return -1; } + p += mrb_utf8len(p, pend); } - if (RSTR_EMBED_P(s)) - capa = RSTRING_EMBED_LEN_MAX; - else - capa = s->as.heap.aux.capa; + if (slen < 1) { return off; } - if (RSTR_LEN(s) >= MRB_INT_MAX - (mrb_int)len) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); - } - total = RSTR_LEN(s)+len; - if (capa <= total) { - while (total > capa) { - if (capa + 1 >= MRB_INT_MAX / 2) { - capa = (total + 4095) / 4096; - break; + return str_index_str_by_char_search(mrb, p, pend, s, slen, off); +} + +#define BYTES_ALIGN_CHECK(pos) if (pos < 0) return mrb_nil_value(); +#else +#define RSTRING_CHAR_LEN(s) RSTRING_LEN(s) +#define chars2bytes(p, off, ci) (ci) +#define bytes2chars(p, end, bi) (bi) +#define char_adjust(beg, end, ptr) (ptr) +#define char_backtrack(ptr, end) ((end) - 1) +#define BYTES_ALIGN_CHECK(pos) +#define str_index_str_by_char(mrb, str, sub, pos) str_index_str(mrb, str, sub, pos) +#endif + +#ifndef MRB_QS_SHORT_STRING_LENGTH +#define MRB_QS_SHORT_STRING_LENGTH 2048 +#endif + +static inline mrb_int +mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n) +{ + if (n + m < MRB_QS_SHORT_STRING_LENGTH) { + const unsigned char *y = ys; + const unsigned char *ye = ys+n-m+1; + + for (;;) { + y = (const unsigned char*)memchr(y, xs[0], (size_t)(ye-y)); + if (y == NULL) return -1; + if (memcmp(xs, y, m) == 0) { + return (mrb_int)(y - ys); } - capa = (capa + 1) * 2; + y++; } - resize_capa(mrb, s, capa); + return -1; } - if (off != -1) { - ptr = RSTR_PTR(s) + off; + else { + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys; + int i; + ptrdiff_t qstable[256]; + + /* Preprocessing */ + for (i = 0; i < 256; ++i) + qstable[i] = m + 1; + for (; x < xe; ++x) + qstable[*x] = xe - x; + /* Searching */ + for (; y + m <= ys + n; y += *(qstable + y[m])) { + if (*xs == *y && memcmp(xs, y, m) == 0) + return (mrb_int)(y - ys); + } + return -1; } - memcpy(RSTR_PTR(s) + RSTR_LEN(s), ptr, len); - mrb_assert_int_fit(size_t, total, mrb_int, MRB_INT_MAX); - RSTR_SET_LEN(s, total); - RSTR_PTR(s)[total] = '\0'; /* sentinel */ } -MRB_API mrb_value -mrb_str_new(mrb_state *mrb, const char *p, size_t len) +static mrb_int +mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n) { - return mrb_obj_value(str_new(mrb, p, len)); -} + const unsigned char *x = (const unsigned char *)x0, *y = (const unsigned char *)y0; -/* - * call-seq: (Caution! NULL string) - * String.new(str="") => new_str - * - * Returns a new string object containing a copy of <i>str</i>. - */ + if (m > n) return -1; + else if (m == n) { + return memcmp(x0, y0, m) == 0 ? 0 : -1; + } + else if (m < 1) { + return 0; + } + else if (m == 1) { + const unsigned char *ys = (const unsigned char *)memchr(y, *x, n); -MRB_API mrb_value -mrb_str_new_cstr(mrb_state *mrb, const char *p) + if (ys) + return (mrb_int)(ys - y); + else + return -1; + } + return mrb_memsearch_qs((const unsigned char *)x0, m, (const unsigned char *)y0, n); +} + +static void +str_share(mrb_state *mrb, struct RString *orig, struct RString *s) { - struct RString *s; - size_t len; + size_t len = (size_t)orig->as.heap.len; - if (p) { - len = strlen(p); + mrb_assert(!RSTR_EMBED_P(orig)); + if (RSTR_NOFREE_P(orig)) { + str_init_nofree(s, orig->as.heap.ptr, len); + } + else if (RSTR_SHARED_P(orig)) { + str_init_shared(mrb, orig, s, orig->as.heap.aux.shared); + } + else if (RSTR_FSHARED_P(orig)) { + str_init_fshared(orig, s, orig->as.heap.aux.fshared); } else { - len = 0; + if (orig->as.heap.aux.capa > orig->as.heap.len) { + orig->as.heap.ptr = (char *)mrb_realloc(mrb, orig->as.heap.ptr, len+1); + orig->as.heap.aux.capa = (mrb_ssize)len; + } + str_init_shared(mrb, orig, s, NULL); + str_init_shared(mrb, orig, orig, s->as.heap.aux.shared); } +} - s = str_new(mrb, p, len); +mrb_value +mrb_str_byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) +{ + struct RString *orig, *s; + orig = mrb_str_ptr(str); + s = mrb_obj_alloc_string(mrb); + if (RSTR_EMBEDDABLE_P(len)) { + str_init_embed(s, RSTR_PTR(orig)+beg, len); + } + else { + str_share(mrb, orig, s); + s->as.heap.ptr += (mrb_ssize)beg; + s->as.heap.len = (mrb_ssize)len; + } + RSTR_COPY_ASCII_FLAG(s, orig); return mrb_obj_value(s); } -MRB_API mrb_value -mrb_str_new_static(mrb_state *mrb, const char *p, size_t len) +static void +str_range_to_bytes(mrb_value str, mrb_int *pos, mrb_int *len) { - struct RString *s = str_new_static(mrb, p, len); - return mrb_obj_value(s); + *pos = chars2bytes(str, 0, *pos); + *len = chars2bytes(str, *pos, *len); +} +#ifdef MRB_UTF8_STRING +static inline mrb_value +str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) +{ + str_range_to_bytes(str, &beg, &len); + return mrb_str_byte_subseq(mrb, str, beg, len); } +#else +#define str_subseq(mrb, str, beg, len) mrb_str_byte_subseq(mrb, str, beg, len) +#endif -void -mrb_gc_free_str(mrb_state *mrb, struct RString *str) +mrb_bool +mrb_str_beg_len(mrb_int str_len, mrb_int *begp, mrb_int *lenp) { - if (RSTR_EMBED_P(str)) - /* no code */; - else if (RSTR_SHARED_P(str)) - str_decref(mrb, str->as.heap.aux.shared); - else if (!RSTR_NOFREE_P(str)) - mrb_free(mrb, str->as.heap.ptr); + if (str_len < *begp || *lenp < 0) return FALSE; + if (*begp < 0) { + *begp += str_len; + if (*begp < 0) return FALSE; + } + if (*lenp > str_len - *begp) + *lenp = str_len - *begp; + if (*lenp <= 0) { + *lenp = 0; + } + return TRUE; } -MRB_API char* -mrb_str_to_cstr(mrb_state *mrb, mrb_value str0) +static mrb_value +str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) { - struct RString *s; + return mrb_str_beg_len(RSTRING_CHAR_LEN(str), &beg, &len) ? + str_subseq(mrb, str, beg, len) : mrb_nil_value(); +} - if (!mrb_string_p(str0)) { - mrb_raise(mrb, E_TYPE_ERROR, "expected String"); +MRB_API mrb_int +mrb_str_index(mrb_state *mrb, mrb_value str, const char *sptr, mrb_int slen, mrb_int offset) +{ + mrb_int pos; + char *s; + mrb_int len; + + len = RSTRING_LEN(str); + if (offset < 0) { + offset += len; + if (offset < 0) return -1; + } + if (len - offset < slen) return -1; + s = RSTRING_PTR(str); + if (offset) { + s += offset; + } + if (slen == 0) return offset; + /* need proceed one character at a time */ + len = RSTRING_LEN(str) - offset; + pos = mrb_memsearch(sptr, slen, s, len); + if (pos < 0) return pos; + return pos + offset; +} + +static mrb_int +str_index_str(mrb_state *mrb, mrb_value str, mrb_value str2, mrb_int offset) +{ + const char *ptr; + mrb_int len; + + ptr = RSTRING_PTR(str2); + len = RSTRING_LEN(str2); + + return mrb_str_index(mrb, str, ptr, len, offset); +} + +static mrb_value +str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2) +{ + size_t len; + + mrb_check_frozen(mrb, s1); + if (s1 == s2) return mrb_obj_value(s1); + RSTR_COPY_ASCII_FLAG(s1, s2); + if (RSTR_SHARED_P(s1)) { + str_decref(mrb, s1->as.heap.aux.shared); + } + else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1) && !RSTR_FSHARED_P(s1) + && s1->as.heap.ptr) { + mrb_free(mrb, s1->as.heap.ptr); } - s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0)); - if ((strlen(RSTR_PTR(s)) ^ RSTR_LEN(s)) != 0) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); + len = (size_t)RSTR_LEN(s2); + if (RSTR_EMBEDDABLE_P(len)) { + str_init_embed(s1, RSTR_PTR(s2), len); } - return RSTR_PTR(s); + else { + str_share(mrb, s2, s1); + } + + return mrb_obj_value(s1); } -static void -str_make_shared(mrb_state *mrb, struct RString *s) +static mrb_int +str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) { - if (!RSTR_SHARED_P(s)) { - mrb_shared_string *shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string)); + const char *s, *sbeg, *t; + struct RString *ps = mrb_str_ptr(str); + mrb_int len = RSTRING_LEN(sub); - shared->refcnt = 1; - if (RSTR_EMBED_P(s)) { - const mrb_int len = RSTR_EMBED_LEN(s); - char *const tmp = (char *)mrb_malloc(mrb, len+1); - memcpy(tmp, s->as.ary, len); - tmp[len] = '\0'; - RSTR_UNSET_EMBED_FLAG(s); - s->as.heap.ptr = tmp; - s->as.heap.len = len; - shared->nofree = FALSE; - shared->ptr = s->as.heap.ptr; - } - else if (RSTR_NOFREE_P(s)) { - shared->nofree = TRUE; - shared->ptr = s->as.heap.ptr; - RSTR_UNSET_NOFREE_FLAG(s); - } - else { - shared->nofree = FALSE; - if (s->as.heap.aux.capa > s->as.heap.len) { - s->as.heap.ptr = shared->ptr = (char *)mrb_realloc(mrb, s->as.heap.ptr, s->as.heap.len+1); - } - else { - shared->ptr = s->as.heap.ptr; + /* substring longer than string */ + if (RSTR_LEN(ps) < len) return -1; + if (RSTR_LEN(ps) - pos < len) { + pos = RSTR_LEN(ps) - len; + } + sbeg = RSTR_PTR(ps); + s = RSTR_PTR(ps) + pos; + t = RSTRING_PTR(sub); + if (len) { + s = char_adjust(sbeg, sbeg + RSTR_LEN(ps), s); + while (sbeg <= s) { + if (memcmp(s, t, len) == 0) { + return (mrb_int)(s - RSTR_PTR(ps)); } + s = char_backtrack(sbeg, s); } - shared->len = s->as.heap.len; - s->as.heap.aux.shared = shared; - RSTR_SET_SHARED_FLAG(s); + return -1; + } + else { + return pos; } } -/* - * call-seq: (Caution! String("abcd") change) - * String("abcdefg") = String("abcd") + String("efg") - * - * Returns a new string object containing a copy of <i>str</i>. - */ -MRB_API void -mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other) +MRB_API mrb_int +mrb_str_strlen(mrb_state *mrb, struct RString *s) { - struct RString *s1 = mrb_str_ptr(self), *s2; - mrb_int len; + mrb_int i, max = RSTR_LEN(s); + char *p = RSTR_PTR(s); - mrb_str_modify(mrb, s1); - if (!mrb_string_p(other)) { - other = mrb_str_to_str(mrb, other); + if (!p) return 0; + for (i=0; i<max; i++) { + if (p[i] == '\0') { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); + } } - s2 = mrb_str_ptr(other); - len = RSTR_LEN(s1) + RSTR_LEN(s2); + return max; +} + +#ifdef _WIN32 +#include <windows.h> + +char* +mrb_utf8_from_locale(const char *str, int len) +{ + wchar_t* wcsp; + char* mbsp; + int mbssize, wcssize; + + if (len == 0) + return strdup(""); + if (len == -1) + len = (int)strlen(str); + wcssize = MultiByteToWideChar(GetACP(), 0, str, len, NULL, 0); + wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t)); + if (!wcsp) + return NULL; + wcssize = MultiByteToWideChar(GetACP(), 0, str, len, wcsp, wcssize + 1); + wcsp[wcssize] = 0; + + mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL); + mbsp = (char*) malloc((mbssize + 1)); + if (!mbsp) { + free(wcsp); + return NULL; + } + mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL); + mbsp[mbssize] = 0; + free(wcsp); + return mbsp; +} + +char* +mrb_locale_from_utf8(const char *utf8, int len) +{ + wchar_t* wcsp; + char* mbsp; + int mbssize, wcssize; + + if (len == 0) + return strdup(""); + if (len == -1) + len = (int)strlen(utf8); + wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0); + wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t)); + if (!wcsp) + return NULL; + wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, wcsp, wcssize + 1); + wcsp[wcssize] = 0; + mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL); + mbsp = (char*) malloc((mbssize + 1)); + if (!mbsp) { + free(wcsp); + return NULL; + } + mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL); + mbsp[mbssize] = 0; + free(wcsp); + return mbsp; +} +#endif + +MRB_API void +mrb_str_modify_keep_ascii(mrb_state *mrb, struct RString *s) +{ + mrb_check_frozen(mrb, s); + str_modify_keep_ascii(mrb, s); +} - if (RSTRING_CAPA(self) < len) { - resize_capa(mrb, s1, len); +MRB_API void +mrb_str_modify(mrb_state *mrb, struct RString *s) +{ + mrb_str_modify_keep_ascii(mrb, s); + RSTR_UNSET_ASCII_FLAG(s); +} + +MRB_API mrb_value +mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len) +{ + mrb_int slen; + struct RString *s = mrb_str_ptr(str); + + if (len < 0) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "negative (or overflowed) string size"); } - memcpy(RSTR_PTR(s1)+RSTR_LEN(s1), RSTR_PTR(s2), RSTR_LEN(s2)); - RSTR_SET_LEN(s1, len); - RSTR_PTR(s1)[len] = '\0'; + mrb_str_modify(mrb, s); + slen = RSTR_LEN(s); + if (len != slen) { + if (slen < len || slen - len > 256) { + resize_capa(mrb, s, len); + } + RSTR_SET_LEN(s, len); + RSTR_PTR(s)[len] = '\0'; /* sentinel */ + } + return str; +} + +MRB_API char* +mrb_str_to_cstr(mrb_state *mrb, mrb_value str0) +{ + struct RString *s; + + check_null_byte(mrb, str0); + s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0)); + return RSTR_PTR(s); +} + +MRB_API void +mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other) +{ + other = mrb_obj_as_string(mrb, other); + mrb_str_cat_str(mrb, self, other); } -/* - * call-seq: (Caution! String("abcd") remain) - * String("abcdefg") = String("abcd") + String("efg") - * - * Returns a new string object containing a copy of <i>str</i>. - */ MRB_API mrb_value mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b) { @@ -422,10 +868,13 @@ mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b) /* 15.2.10.5.2 */ /* - * call-seq: (Caution! String("abcd") remain) for stack_argument - * String("abcdefg") = String("abcd") + String("efg") + * call-seq: + * str + other_str -> new_str * - * Returns a new string object containing a copy of <i>str</i>. + * Concatenation---Returns a new <code>String</code> containing + * <i>other_str</i> concatenated to <i>str</i>. + * + * "Hello from " + self.to_s #=> "Hello from main" */ static mrb_value mrb_str_plus_m(mrb_state *mrb, mrb_value self) @@ -440,15 +889,22 @@ mrb_str_plus_m(mrb_state *mrb, mrb_value self) /* 15.2.10.5.33 */ /* * call-seq: - * len = strlen(String("abcd")) + * "abcd".size => int * * Returns the length of string. */ static mrb_value mrb_str_size(mrb_state *mrb, mrb_value self) { - struct RString *s = mrb_str_ptr(self); - return mrb_fixnum_value(RSTR_LEN(s)); + mrb_int len = RSTRING_CHAR_LEN(self); + return mrb_fixnum_value(len); +} + +static mrb_value +mrb_str_bytesize(mrb_state *mrb, mrb_value self) +{ + mrb_int len = RSTRING_LEN(self); + return mrb_fixnum_value(len); } /* 15.2.10.5.1 */ @@ -472,13 +928,12 @@ mrb_str_times(mrb_state *mrb, mrb_value self) if (times < 0) { mrb_raise(mrb, E_ARGUMENT_ERROR, "negative argument"); } - if (times && MRB_INT_MAX / times < RSTRING_LEN(self)) { + if (times && MRB_SSIZE_MAX / times < RSTRING_LEN(self)) { mrb_raise(mrb, E_ARGUMENT_ERROR, "argument too big"); } len = RSTRING_LEN(self)*times; str2 = str_new(mrb, 0, len); - str_with_class(mrb, str2, self); p = RSTR_PTR(str2); if (len > 0) { n = RSTRING_LEN(self); @@ -490,6 +945,7 @@ mrb_str_times(mrb_state *mrb, mrb_value self) memcpy(p + n, p, len-n); } p[RSTR_LEN(str2)] = '\0'; + RSTR_COPY_ASCII_FLAG(str2, mrb_str_ptr(self)); return mrb_obj_value(str2); } @@ -553,26 +1009,11 @@ mrb_str_cmp(mrb_state *mrb, mrb_value str1, mrb_value str2) static mrb_value mrb_str_cmp_m(mrb_state *mrb, mrb_value str1) { - mrb_value str2; + mrb_value str2 = mrb_get_arg1(mrb); mrb_int result; - mrb_get_args(mrb, "o", &str2); if (!mrb_string_p(str2)) { - if (!mrb_respond_to(mrb, str2, mrb_intern_lit(mrb, "to_s"))) { - return mrb_nil_value(); - } - else if (!mrb_respond_to(mrb, str2, mrb_intern_lit(mrb, "<=>"))) { - return mrb_nil_value(); - } - else { - mrb_value tmp = mrb_funcall(mrb, str2, "<=>", 1, str1); - - if (mrb_nil_p(tmp)) return mrb_nil_value(); - if (!mrb_fixnum(tmp)) { - return mrb_funcall(mrb, mrb_fixnum_value(0), "-", 1, tmp); - } - result = -mrb_fixnum(tmp); - } + return mrb_nil_value(); } else { result = mrb_str_cmp(mrb, str1, str2); @@ -594,15 +1035,7 @@ str_eql(mrb_state *mrb, const mrb_value str1, const mrb_value str2) MRB_API mrb_bool mrb_str_equal(mrb_state *mrb, mrb_value str1, mrb_value str2) { - if (mrb_immediate_p(str2)) return FALSE; - if (!mrb_string_p(str2)) { - if (mrb_nil_p(str2)) return FALSE; - if (!mrb_respond_to(mrb, str2, mrb_intern_lit(mrb, "to_str"))) { - return FALSE; - } - str2 = mrb_funcall(mrb, str2, "to_str", 0); - return mrb_equal(mrb, str2, str1); - } + if (!mrb_string_p(str2)) return FALSE; return str_eql(mrb, str1, str2); } @@ -620,171 +1053,122 @@ mrb_str_equal(mrb_state *mrb, mrb_value str1, mrb_value str2) static mrb_value mrb_str_equal_m(mrb_state *mrb, mrb_value str1) { - mrb_value str2; - - mrb_get_args(mrb, "o", &str2); + mrb_value str2 = mrb_get_arg1(mrb); return mrb_bool_value(mrb_str_equal(mrb, str1, str2)); } /* ---------------------------------- */ -MRB_API mrb_value -mrb_str_to_str(mrb_state *mrb, mrb_value str) -{ - mrb_value s; - - if (!mrb_string_p(str)) { - s = mrb_check_convert_type(mrb, str, MRB_TT_STRING, "String", "to_str"); - if (mrb_nil_p(s)) { - s = mrb_convert_type(mrb, str, MRB_TT_STRING, "String", "to_s"); - } - return s; - } - return str; -} +/* obslete: use RSTRING_PTR() */ MRB_API const char* -mrb_string_value_ptr(mrb_state *mrb, mrb_value ptr) +mrb_string_value_ptr(mrb_state *mrb, mrb_value str) { - mrb_value str = mrb_str_to_str(mrb, ptr); + str = mrb_obj_as_string(mrb, str); return RSTRING_PTR(str); } -void -mrb_noregexp(mrb_state *mrb, mrb_value self) +/* obslete: use RSTRING_LEN() */ +MRB_API mrb_int +mrb_string_value_len(mrb_state *mrb, mrb_value ptr) { - mrb_raise(mrb, E_NOTIMP_ERROR, "Regexp class not implemented"); + mrb_to_str(mrb, ptr); + return RSTRING_LEN(ptr); } -void -mrb_regexp_check(mrb_state *mrb, mrb_value obj) +MRB_API mrb_value +mrb_str_dup(mrb_state *mrb, mrb_value str) { - if (mrb_regexp_p(mrb, obj)) { - mrb_noregexp(mrb, obj); - } + struct RString *s = mrb_str_ptr(str); + struct RString *dup = str_new(mrb, 0, 0); + + return str_replace(mrb, dup, s); } -static inline mrb_int -mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n) -{ - const unsigned char *x = xs, *xe = xs + m; - const unsigned char *y = ys; - int i, qstable[256]; +enum str_convert_range { + /* `beg` and `len` are byte unit in `0 ... str.bytesize` */ + STR_BYTE_RANGE_CORRECTED = 1, - /* Preprocessing */ - for (i = 0; i < 256; ++i) - qstable[i] = m + 1; - for (; x < xe; ++x) - qstable[*x] = xe - x; - /* Searching */ - for (; y + m <= ys + n; y += *(qstable + y[m])) { - if (*xs == *y && memcmp(xs, y, m) == 0) - return y - ys; - } - return -1; -} + /* `beg` and `len` are char unit in any range */ + STR_CHAR_RANGE = 2, -static mrb_int -mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n) -{ - const unsigned char *x = (const unsigned char *)x0, *y = (const unsigned char *)y0; + /* `beg` and `len` are char unit in `0 ... str.size` */ + STR_CHAR_RANGE_CORRECTED = 3, - if (m > n) return -1; - else if (m == n) { - return memcmp(x0, y0, m) == 0 ? 0 : -1; - } - else if (m < 1) { - return 0; - } - else if (m == 1) { - const unsigned char *ys = y, *ye = ys + n; - for (; y < ye; ++y) { - if (*x == *y) - return y - ys; - } - return -1; - } - return mrb_memsearch_qs((const unsigned char *)x0, m, (const unsigned char *)y0, n); -} + /* `beg` is out of range */ + STR_OUT_OF_RANGE = -1 +}; -static mrb_int -mrb_str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset) +static enum str_convert_range +str_convert_range(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen, mrb_int *beg, mrb_int *len) { - mrb_int pos; - char *s, *sptr; - mrb_int len, slen; - - len = RSTRING_LEN(str); - slen = RSTRING_LEN(sub); - if (offset < 0) { - offset += len; - if (offset < 0) return -1; + if (!mrb_undef_p(alen)) { + *beg = mrb_as_int(mrb, indx); + *len = mrb_as_int(mrb, alen); + return STR_CHAR_RANGE; } - if (len - offset < slen) return -1; - s = RSTRING_PTR(str); - if (offset) { - s += offset; - } - if (slen == 0) return offset; - /* need proceed one character at a time */ - sptr = RSTRING_PTR(sub); - slen = RSTRING_LEN(sub); - len = RSTRING_LEN(str) - offset; - pos = mrb_memsearch(sptr, slen, s, len); - if (pos < 0) return pos; - return pos + offset; -} + else { + switch (mrb_type(indx)) { + case MRB_TT_INTEGER: + *beg = mrb_integer(indx); + *len = 1; + return STR_CHAR_RANGE; -MRB_API mrb_value -mrb_str_dup(mrb_state *mrb, mrb_value str) -{ - struct RString *s = mrb_str_ptr(str); - struct RString *dup = str_new(mrb, 0, 0); + case MRB_TT_STRING: + *beg = str_index_str(mrb, str, indx, 0); + if (*beg < 0) { break; } + *len = RSTRING_LEN(indx); + return STR_BYTE_RANGE_CORRECTED; - str_with_class(mrb, dup, str); - return str_replace(mrb, dup, s); + case MRB_TT_RANGE: + goto range_arg; + + default: + indx = mrb_to_int(mrb, indx); + if (mrb_integer_p(indx)) { + *beg = mrb_integer(indx); + *len = 1; + return STR_CHAR_RANGE; + } +range_arg: + *len = RSTRING_CHAR_LEN(str); + switch (mrb_range_beg_len(mrb, indx, beg, len, *len, TRUE)) { + case MRB_RANGE_OK: + return STR_CHAR_RANGE_CORRECTED; + case MRB_RANGE_OUT: + return STR_OUT_OF_RANGE; + default: + break; + } + + mrb_raise(mrb, E_TYPE_ERROR, "can't convert to Integer"); + } + } + return STR_OUT_OF_RANGE; } static mrb_value -mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx) +mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen) { - mrb_int idx; - - mrb_regexp_check(mrb, indx); - switch (mrb_type(indx)) { - case MRB_TT_FLOAT: - indx = mrb_flo_to_fixnum(mrb, indx); - /* fall through */ - case MRB_TT_FIXNUM: - idx = mrb_fixnum(indx); + mrb_int beg, len; -num_index: - str = mrb_str_substr(mrb, str, idx, 1); - if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value(); + switch (str_convert_range(mrb, str, indx, alen, &beg, &len)) { + case STR_CHAR_RANGE_CORRECTED: + return str_subseq(mrb, str, beg, len); + case STR_CHAR_RANGE: + str = str_substr(mrb, str, beg, len); + if (mrb_undef_p(alen) && !mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value(); return str; - - case MRB_TT_STRING: - if (mrb_str_index(mrb, str, indx, 0) != -1) + case STR_BYTE_RANGE_CORRECTED: + if (mrb_string_p(indx)) { return mrb_str_dup(mrb, indx); - return mrb_nil_value(); - - case MRB_TT_RANGE: - /* check if indx is Range */ - { - mrb_int beg, len; - - len = RSTRING_LEN(str); - if (mrb_range_beg_len(mrb, indx, &beg, &len, len)) { - return mrb_str_subseq(mrb, str, beg, len); - } - else { - return mrb_nil_value(); - } } + else { + return mrb_str_byte_subseq(mrb, str, beg, len); + } + case STR_OUT_OF_RANGE: default: - idx = mrb_fixnum(indx); - goto num_index; + return mrb_nil_value(); } - return mrb_nil_value(); /* not reached */ } /* 15.2.10.5.6 */ @@ -794,16 +1178,14 @@ num_index: * str[fixnum] => fixnum or nil * str[fixnum, fixnum] => new_str or nil * str[range] => new_str or nil - * str[regexp] => new_str or nil - * str[regexp, fixnum] => new_str or nil * str[other_str] => new_str or nil * str.slice(fixnum) => fixnum or nil * str.slice(fixnum, fixnum) => new_str or nil * str.slice(range) => new_str or nil * str.slice(other_str) => new_str or nil * - * Element Reference---If passed a single <code>Fixnum</code>, returns the code - * of the character at that position. If passed two <code>Fixnum</code> + * Element Reference---If passed a single <code>Integer</code>, returns the code + * of the character at that position. If passed two <code>Integer</code> * objects, returns a substring starting at the offset given by the first, and * a length given by the second. If given a range, a substring containing * characters at offsets given by the range is returned. In all three cases, if @@ -831,17 +1213,198 @@ static mrb_value mrb_str_aref_m(mrb_state *mrb, mrb_value str) { mrb_value a1, a2; - int argc; - argc = mrb_get_args(mrb, "o|o", &a1, &a2); - if (argc == 2) { - mrb_regexp_check(mrb, a1); - return mrb_str_substr(mrb, str, mrb_fixnum(a1), mrb_fixnum(a2)); + if (mrb_get_args(mrb, "o|o", &a1, &a2) == 1) { + a2 = mrb_undef_value(); + } + + return mrb_str_aref(mrb, str, a1, a2); +} + +static mrb_noreturn void +str_out_of_index(mrb_state *mrb, mrb_value index) +{ + mrb_raisef(mrb, E_INDEX_ERROR, "index %v out of string", index); +} + +static mrb_value +str_replace_partial(mrb_state *mrb, mrb_value src, mrb_int pos, mrb_int end, mrb_value rep) +{ + const mrb_int shrink_threshold = 256; + struct RString *str = mrb_str_ptr(src); + mrb_int len = RSTR_LEN(str); + mrb_int replen, newlen; + char *strp; + + if (end > len) { end = len; } + + if (pos < 0 || pos > len) { + str_out_of_index(mrb, mrb_fixnum_value(pos)); + } + + replen = (mrb_nil_p(rep) ? 0 : RSTRING_LEN(rep)); + newlen = replen + (len - (end - pos)); + + if (newlen >= MRB_SSIZE_MAX || newlen < replen /* overflowed */) { + mrb_raise(mrb, E_RUNTIME_ERROR, "string size too big"); + } + + mrb_str_modify(mrb, str); + + if (len < newlen) { + resize_capa(mrb, str, newlen); + } + + strp = RSTR_PTR(str); + + memmove(strp + newlen - (len - end), strp + end, len - end); + if (!mrb_nil_p(rep)) { + memmove(strp + pos, RSTRING_PTR(rep), replen); + } + RSTR_SET_LEN(str, newlen); + strp[newlen] = '\0'; + + if (len - newlen >= shrink_threshold) { + resize_capa(mrb, str, newlen); + } + + return src; +} + +#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) + +static mrb_value +str_escape(mrb_state *mrb, mrb_value str, mrb_bool inspect) +{ + const char *p, *pend; + char buf[4]; /* `\x??` or UTF-8 character */ + mrb_value result = mrb_str_new_lit(mrb, "\""); +#ifdef MRB_UTF8_STRING + uint32_t ascii_flag = MRB_STR_ASCII; +#endif + + p = RSTRING_PTR(str); pend = RSTRING_END(str); + for (;p < pend; p++) { + unsigned char c, cc; +#ifdef MRB_UTF8_STRING + if (inspect) { + mrb_int clen = mrb_utf8len(p, pend); + if (clen > 1) { + mrb_int i; + + for (i=0; i<clen; i++) { + buf[i] = p[i]; + } + mrb_str_cat(mrb, result, buf, clen); + p += clen-1; + ascii_flag = 0; + continue; + } + } +#endif + c = *p; + if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p+1, pend))) { + buf[0] = '\\'; buf[1] = c; + mrb_str_cat(mrb, result, buf, 2); + continue; + } + if (ISPRINT(c)) { + buf[0] = c; + mrb_str_cat(mrb, result, buf, 1); + continue; + } + switch (c) { + case '\n': cc = 'n'; break; + case '\r': cc = 'r'; break; + case '\t': cc = 't'; break; + case '\f': cc = 'f'; break; + case '\013': cc = 'v'; break; + case '\010': cc = 'b'; break; + case '\007': cc = 'a'; break; + case 033: cc = 'e'; break; + default: cc = 0; break; + } + if (cc) { + buf[0] = '\\'; + buf[1] = (char)cc; + mrb_str_cat(mrb, result, buf, 2); + continue; + } + else { + buf[0] = '\\'; + buf[1] = 'x'; + buf[3] = mrb_digitmap[c % 16]; c /= 16; + buf[2] = mrb_digitmap[c % 16]; + mrb_str_cat(mrb, result, buf, 4); + continue; + } + } + mrb_str_cat_lit(mrb, result, "\""); +#ifdef MRB_UTF8_STRING + if (inspect) { + mrb_str_ptr(str)->flags |= ascii_flag; + mrb_str_ptr(result)->flags |= ascii_flag; + } + else { + RSTR_SET_ASCII_FLAG(mrb_str_ptr(result)); + } +#endif + + return result; +} + +static void +mrb_str_aset(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen, mrb_value replace) +{ + mrb_int beg, len, charlen; + + mrb_to_str(mrb, replace); + + switch (str_convert_range(mrb, str, indx, alen, &beg, &len)) { + case STR_OUT_OF_RANGE: + default: + mrb_raise(mrb, E_INDEX_ERROR, "string not matched"); + case STR_CHAR_RANGE: + if (len < 0) { + mrb_raisef(mrb, E_INDEX_ERROR, "negative length %v", alen); + } + charlen = RSTRING_CHAR_LEN(str); + if (beg < 0) { beg += charlen; } + if (beg < 0 || beg > charlen) { str_out_of_index(mrb, indx); } + /* fall through */ + case STR_CHAR_RANGE_CORRECTED: + str_range_to_bytes(str, &beg, &len); + /* fall through */ + case STR_BYTE_RANGE_CORRECTED: + str_replace_partial(mrb, str, beg, beg + len, replace); } - if (argc != 1) { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(argc)); +} + +/* + * call-seq: + * str[fixnum] = replace + * str[fixnum, fixnum] = replace + * str[range] = replace + * str[other_str] = replace + * + * Modify +self+ by replacing the content of +self+. + * The portion of the string affected is determined using the same criteria as +String#[]+. + */ +static mrb_value +mrb_str_aset_m(mrb_state *mrb, mrb_value str) +{ + mrb_value indx, alen, replace; + + switch (mrb_get_args(mrb, "oo|S!", &indx, &alen, &replace)) { + case 2: + replace = alen; + alen = mrb_undef_value(); + break; + case 3: + break; } - return mrb_str_aref(mrb, str, a1); + mrb_str_aset(mrb, str, indx, alen, replace); + return str; } /* 15.2.10.5.8 */ @@ -864,7 +1427,7 @@ mrb_str_capitalize_bang(mrb_state *mrb, mrb_value str) mrb_bool modify = FALSE; struct RString *s = mrb_str_ptr(str); - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); if (RSTR_LEN(s) == 0 || !RSTR_PTR(s)) return mrb_nil_value(); p = RSTR_PTR(s); pend = RSTR_PTR(s) + RSTR_LEN(s); if (ISLOWER(*p)) { @@ -906,7 +1469,7 @@ mrb_str_capitalize(mrb_state *mrb, mrb_value self) /* 15.2.10.5.10 */ /* * call-seq: - * str.chomp!(separator=$/) => str or nil + * str.chomp!(separator="\n") => str or nil * * Modifies <i>str</i> in place as described for <code>String#chomp</code>, * returning <i>str</i>, or <code>nil</code> if no modifications were made. @@ -919,11 +1482,13 @@ mrb_str_chomp_bang(mrb_state *mrb, mrb_value str) char *p, *pp; mrb_int rslen; mrb_int len; + mrb_int argc; struct RString *s = mrb_str_ptr(str); - mrb_str_modify(mrb, s); + argc = mrb_get_args(mrb, "|S", &rs); + mrb_str_modify_keep_ascii(mrb, s); len = RSTR_LEN(s); - if (mrb_get_args(mrb, "|S", &rs) == 0) { + if (argc == 0) { if (len == 0) return mrb_nil_value(); smart_chomp: if (RSTR_PTR(s)[len-1] == '\n') { @@ -980,12 +1545,11 @@ mrb_str_chomp_bang(mrb_state *mrb, mrb_value str) /* 15.2.10.5.9 */ /* * call-seq: - * str.chomp(separator=$/) => new_str + * str.chomp(separator="\n") => new_str * * Returns a new <code>String</code> with the given record separator removed - * from the end of <i>str</i> (if present). If <code>$/</code> has not been - * changed from the default Ruby record separator, then <code>chomp</code> also - * removes carriage return characters (that is it will remove <code>\n</code>, + * from the end of <i>str</i> (if present). <code>chomp</code> also removes + * carriage return characters (that is it will remove <code>\n</code>, * <code>\r</code>, and <code>\r\n</code>). * * "hello".chomp #=> "hello" @@ -1020,10 +1584,21 @@ mrb_str_chop_bang(mrb_state *mrb, mrb_value str) { struct RString *s = mrb_str_ptr(str); - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); if (RSTR_LEN(s) > 0) { mrb_int len; +#ifdef MRB_UTF8_STRING + const char* t = RSTR_PTR(s), *p = t; + const char* e = p + RSTR_LEN(s); + while (p<e) { + mrb_int clen = mrb_utf8len(p, e); + if (p + clen>=e) break; + p += clen; + } + len = p - t; +#else len = RSTR_LEN(s) - 1; +#endif if (RSTR_PTR(s)[len] == '\n') { if (len > 0 && RSTR_PTR(s)[len-1] == '\r') { @@ -1078,7 +1653,7 @@ mrb_str_downcase_bang(mrb_state *mrb, mrb_value str) mrb_bool modify = FALSE; struct RString *s = mrb_str_ptr(str); - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); p = RSTR_PTR(s); pend = RSTR_PTR(s) + RSTR_LEN(s); while (p < pend) { @@ -1100,7 +1675,7 @@ mrb_str_downcase_bang(mrb_state *mrb, mrb_value str) * * Returns a copy of <i>str</i> with all uppercase letters replaced with their * lowercase counterparts. The operation is locale insensitive---only - * characters ``A'' to ``Z'' are affected. + * characters 'A' to 'Z' are affected. * * "hEllO".downcase #=> "hello" */ @@ -1142,72 +1717,38 @@ mrb_str_empty_p(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_eql(mrb_state *mrb, mrb_value self) { - mrb_value str2; + mrb_value str2 = mrb_get_arg1(mrb); mrb_bool eql_p; - mrb_get_args(mrb, "o", &str2); - eql_p = (mrb_type(str2) == MRB_TT_STRING) && str_eql(mrb, self, str2); + eql_p = (mrb_string_p(str2)) && str_eql(mrb, self, str2); return mrb_bool_value(eql_p); } -static mrb_value -mrb_str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) -{ - struct RString *orig, *s; - mrb_shared_string *shared; - - orig = mrb_str_ptr(str); - if (RSTR_EMBED_P(orig)) { - s = str_new(mrb, orig->as.ary+beg, len); - } else { - str_make_shared(mrb, orig); - shared = orig->as.heap.aux.shared; - s = mrb_obj_alloc_string(mrb); - s->as.heap.ptr = orig->as.heap.ptr + beg; - s->as.heap.len = len; - s->as.heap.aux.shared = shared; - RSTR_SET_SHARED_FLAG(s); - shared->refcnt++; - } - - return mrb_obj_value(s); -} - MRB_API mrb_value mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) { - if (len < 0) return mrb_nil_value(); - if (!RSTRING_LEN(str)) { - len = 0; - } - if (beg > RSTRING_LEN(str)) return mrb_nil_value(); - if (beg < 0) { - beg += RSTRING_LEN(str); - if (beg < 0) return mrb_nil_value(); - } - if (beg + len > RSTRING_LEN(str)) - len = RSTRING_LEN(str) - beg; - if (len <= 0) { - len = 0; - } - return mrb_str_subseq(mrb, str, beg, len); + return str_substr(mrb, str, beg, len); } -mrb_int +uint32_t mrb_str_hash(mrb_state *mrb, mrb_value str) { /* 1-8-7 */ struct RString *s = mrb_str_ptr(str); mrb_int len = RSTR_LEN(s); char *p = RSTR_PTR(s); - mrb_int key = 0; + uint32_t hash = 0; - while (len--) { - key = key*65599 + *p; - p++; + for(int i = 0; i < len; ++i) { + hash += p[i]; + hash += (hash << 10); + hash ^= (hash >> 6); } - return key + (key>>5); + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + return hash; } /* 15.2.10.5.20 */ @@ -1240,146 +1781,52 @@ mrb_str_hash_m(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_include(mrb_state *mrb, mrb_value self) { - mrb_int i; mrb_value str2; - mrb_bool include_p; - - mrb_get_args(mrb, "o", &str2); - if (mrb_fixnum_p(str2)) { - include_p = (memchr(RSTRING_PTR(self), mrb_fixnum(str2), RSTRING_LEN(self)) != NULL); - } - else { - str2 = mrb_str_to_str(mrb, str2); - i = mrb_str_index(mrb, self, str2, 0); - include_p = (i != -1); - } - - return mrb_bool_value(include_p); + mrb_get_args(mrb, "S", &str2); + if (str_index_str(mrb, self, str2, 0) < 0) + return mrb_bool_value(FALSE); + return mrb_bool_value(TRUE); } /* 15.2.10.5.22 */ /* * call-seq: * str.index(substring [, offset]) => fixnum or nil - * str.index(fixnum [, offset]) => fixnum or nil - * str.index(regexp [, offset]) => fixnum or nil * * Returns the index of the first occurrence of the given - * <i>substring</i>, - * character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. - * Returns - * <code>nil</code> if not found. + * <i>substring</i>. Returns <code>nil</code> if not found. * If the second parameter is present, it * specifies the position in the string to begin the search. * - * "hello".index('e') #=> 1 + * "hello".index('l') #=> 2 * "hello".index('lo') #=> 3 * "hello".index('a') #=> nil - * "hello".index(101) #=> 1(101=0x65='e') - * "hello".index(/[aeiou]/, -3) #=> 4 + * "hello".index('l', -2) #=> 3 */ static mrb_value mrb_str_index_m(mrb_state *mrb, mrb_value str) { - mrb_value *argv; - mrb_int argc; mrb_value sub; mrb_int pos; - mrb_get_args(mrb, "*", &argv, &argc); - if (argc == 2) { - pos = mrb_fixnum(argv[1]); - sub = argv[0]; - } - else { + if (mrb_get_args(mrb, "S|i", &sub, &pos) == 1) { pos = 0; - if (argc > 0) - sub = argv[0]; - else - sub = mrb_nil_value(); } - mrb_regexp_check(mrb, sub); - if (pos < 0) { - pos += RSTRING_LEN(str); + else if (pos < 0) { + mrb_int clen = RSTRING_CHAR_LEN(str); + pos += clen; if (pos < 0) { return mrb_nil_value(); } } - - switch (mrb_type(sub)) { - case MRB_TT_FIXNUM: { - int c = mrb_fixnum(sub); - mrb_int len = RSTRING_LEN(str); - unsigned char *p = (unsigned char*)RSTRING_PTR(str); - - for (;pos<len;pos++) { - if (p[pos] == c) return mrb_fixnum_value(pos); - } - return mrb_nil_value(); - } - - default: { - mrb_value tmp; - - tmp = mrb_check_string_type(mrb, sub); - if (mrb_nil_p(tmp)) { - mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub); - } - sub = tmp; - } - /* fall through */ - case MRB_TT_STRING: - pos = mrb_str_index(mrb, str, sub, pos); - break; - } + pos = str_index_str_by_char(mrb, str, sub, pos); if (pos == -1) return mrb_nil_value(); + BYTES_ALIGN_CHECK(pos); return mrb_fixnum_value(pos); } -#define STR_REPLACE_SHARED_MIN 10 - -static mrb_value -str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2) -{ - long len; - - len = RSTR_LEN(s2); - if (RSTR_SHARED_P(s1)) { - str_decref(mrb, s1->as.heap.aux.shared); - } - else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1)) { - mrb_free(mrb, s1->as.heap.ptr); - } - - RSTR_UNSET_NOFREE_FLAG(s1); - - if (RSTR_SHARED_P(s2)) { -L_SHARE: - RSTR_UNSET_EMBED_FLAG(s1); - s1->as.heap.ptr = s2->as.heap.ptr; - s1->as.heap.len = len; - s1->as.heap.aux.shared = s2->as.heap.aux.shared; - RSTR_SET_SHARED_FLAG(s1); - s1->as.heap.aux.shared->refcnt++; - } - else { - if (len <= RSTRING_EMBED_LEN_MAX) { - RSTR_UNSET_SHARED_FLAG(s1); - RSTR_SET_EMBED_FLAG(s1); - memcpy(s1->as.ary, RSTR_PTR(s2), len); - RSTR_SET_EMBED_LEN(s1, len); - } - else { - str_make_shared(mrb, s2); - goto L_SHARE; - } - } - - return mrb_obj_value(s1); -} - /* 15.2.10.5.24 */ /* 15.2.10.5.28 */ /* @@ -1410,9 +1857,11 @@ mrb_str_init(mrb_state *mrb, mrb_value self) { mrb_value str2; - if (mrb_get_args(mrb, "|S", &str2) == 1) { - str_replace(mrb, mrb_str_ptr(self), mrb_str_ptr(str2)); + if (mrb_get_args(mrb, "|S", &str2) == 0) { + struct RString *s = str_new(mrb, 0, 0); + str2 = mrb_obj_value(s); } + str_replace(mrb, mrb_str_ptr(self), mrb_str_ptr(str2)); return self; } @@ -1424,7 +1873,7 @@ mrb_str_init(mrb_state *mrb, mrb_value self) * str.to_sym => symbol * * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the - * symbol if it did not previously exist. See <code>Symbol#id2name</code>. + * symbol if it did not previously exist. * * "Koala".intern #=> :Koala * s = 'cat'.to_sym #=> :cat @@ -1446,15 +1895,20 @@ mrb_str_intern(mrb_state *mrb, mrb_value self) MRB_API mrb_value mrb_obj_as_string(mrb_state *mrb, mrb_value obj) { - mrb_value str; - - if (mrb_string_p(obj)) { + switch (mrb_type(obj)) { + case MRB_TT_STRING: return obj; + case MRB_TT_SYMBOL: + return mrb_sym_str(mrb, mrb_symbol(obj)); + case MRB_TT_INTEGER: + return mrb_integer_to_str(mrb, obj, 10); + case MRB_TT_SCLASS: + case MRB_TT_CLASS: + case MRB_TT_MODULE: + return mrb_mod_to_s(mrb, obj); + default: + return mrb_type_convert(mrb, obj, MRB_TT_STRING, MRB_SYM(to_s)); } - str = mrb_funcall(mrb, obj, "to_s", 0); - if (!mrb_string_p(str)) - return mrb_any_to_s(mrb, obj); - return str; } MRB_API mrb_value @@ -1487,45 +1941,16 @@ mrb_ptr_to_str(mrb_state *mrb, void *p) return mrb_obj_value(p_str); } -MRB_API mrb_value -mrb_string_type(mrb_state *mrb, mrb_value str) -{ - return mrb_convert_type(mrb, str, MRB_TT_STRING, "String", "to_str"); -} - -MRB_API mrb_value -mrb_check_string_type(mrb_state *mrb, mrb_value str) -{ - return mrb_check_convert_type(mrb, str, MRB_TT_STRING, "String", "to_str"); -} - -/* ---------------------------------- */ -/* 15.2.10.5.29 */ -/* - * call-seq: - * str.reverse => new_str - * - * Returns a new string with the characters from <i>str</i> in reverse order. - * - * "stressed".reverse #=> "desserts" - */ -static mrb_value -mrb_str_reverse(mrb_state *mrb, mrb_value str) +static inline void +str_reverse(char *p, char *e) { - struct RString *s2; - char *s, *e, *p; - - if (RSTRING_LEN(str) <= 1) return mrb_str_dup(mrb, str); - - s2 = str_new(mrb, 0, RSTRING_LEN(str)); - str_with_class(mrb, s2, str); - s = RSTRING_PTR(str); e = RSTRING_END(str) - 1; - p = RSTR_PTR(s2); + char c; - while (e >= s) { - *p++ = *e--; + while (p < e) { + c = *p; + *p++ = *e; + *e-- = c; } - return mrb_obj_value(s2); } /* 15.2.10.5.30 */ @@ -1540,146 +1965,96 @@ mrb_str_reverse_bang(mrb_state *mrb, mrb_value str) { struct RString *s = mrb_str_ptr(str); char *p, *e; - char c; - mrb_str_modify(mrb, s); - if (RSTR_LEN(s) > 1) { +#ifdef MRB_UTF8_STRING + mrb_int utf8_len = RSTRING_CHAR_LEN(str); + mrb_int len = RSTR_LEN(s); + + if (utf8_len < 2) return str; + if (utf8_len < len) { + mrb_str_modify(mrb, s); p = RSTR_PTR(s); - e = p + RSTR_LEN(s) - 1; - while (p < e) { - c = *p; - *p++ = *e; - *e-- = c; + e = p + RSTR_LEN(s); + while (p<e) { + mrb_int clen = mrb_utf8len(p, e); + str_reverse(p, p + clen - 1); + p += clen; } + goto bytes; + } +#endif + + if (RSTR_LEN(s) > 1) { + mrb_str_modify(mrb, s); + goto bytes; } return str; + + bytes: + p = RSTR_PTR(s); + e = p + RSTR_LEN(s) - 1; + str_reverse(p, e); + return str; } +/* ---------------------------------- */ +/* 15.2.10.5.29 */ /* * call-seq: - * str.rindex(substring [, fixnum]) => fixnum or nil - * str.rindex(fixnum [, fixnum]) => fixnum or nil - * str.rindex(regexp [, fixnum]) => fixnum or nil + * str.reverse => new_str * - * Returns the index of the last occurrence of the given <i>substring</i>, - * character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns - * <code>nil</code> if not found. If the second parameter is present, it - * specifies the position in the string to end the search---characters beyond - * this point will not be considered. + * Returns a new string with the characters from <i>str</i> in reverse order. * - * "hello".rindex('e') #=> 1 - * "hello".rindex('l') #=> 3 - * "hello".rindex('a') #=> nil - * "hello".rindex(101) #=> 1 - * "hello".rindex(/[aeiou]/, -2) #=> 1 + * "stressed".reverse #=> "desserts" */ -static mrb_int -mrb_str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) +static mrb_value +mrb_str_reverse(mrb_state *mrb, mrb_value str) { - char *s, *sbeg, *t; - struct RString *ps = mrb_str_ptr(str); - mrb_int len = RSTRING_LEN(sub); - - /* substring longer than string */ - if (RSTR_LEN(ps) < len) return -1; - if (RSTR_LEN(ps) - pos < len) { - pos = RSTR_LEN(ps) - len; - } - sbeg = RSTR_PTR(ps); - s = RSTR_PTR(ps) + pos; - t = RSTRING_PTR(sub); - if (len) { - while (sbeg <= s) { - if (memcmp(s, t, len) == 0) { - return s - RSTR_PTR(ps); - } - s--; - } - return -1; - } - else { - return pos; - } + mrb_value str2 = mrb_str_dup(mrb, str); + mrb_str_reverse_bang(mrb, str2); + return str2; } /* 15.2.10.5.31 */ /* * call-seq: - * str.rindex(substring [, fixnum]) => fixnum or nil - * str.rindex(fixnum [, fixnum]) => fixnum or nil - * str.rindex(regexp [, fixnum]) => fixnum or nil + * str.rindex(substring [, offset]) => fixnum or nil * - * Returns the index of the last occurrence of the given <i>substring</i>, - * character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns - * <code>nil</code> if not found. If the second parameter is present, it - * specifies the position in the string to end the search---characters beyond - * this point will not be considered. + * Returns the index of the last occurrence of the given <i>substring</i>. + * Returns <code>nil</code> if not found. If the second parameter is + * present, it specifies the position in the string to end the + * search---characters beyond this point will not be considered. * * "hello".rindex('e') #=> 1 * "hello".rindex('l') #=> 3 * "hello".rindex('a') #=> nil - * "hello".rindex(101) #=> 1 - * "hello".rindex(/[aeiou]/, -2) #=> 1 + * "hello".rindex('l', 2) #=> 2 */ static mrb_value -mrb_str_rindex_m(mrb_state *mrb, mrb_value str) +mrb_str_rindex(mrb_state *mrb, mrb_value str) { - mrb_value *argv; - mrb_int argc; mrb_value sub; - mrb_value vpos; - mrb_int pos, len = RSTRING_LEN(str); + mrb_int pos, len = RSTRING_CHAR_LEN(str); - mrb_get_args(mrb, "*", &argv, &argc); - if (argc == 2) { - sub = argv[0]; - vpos = argv[1]; - pos = mrb_fixnum(vpos); + if (mrb_get_args(mrb, "S|i", &sub, &pos) == 1) { + pos = len; + } + else { if (pos < 0) { pos += len; if (pos < 0) { - mrb_regexp_check(mrb, sub); return mrb_nil_value(); } } if (pos > len) pos = len; } - else { - pos = len; - if (argc > 0) - sub = argv[0]; - else - sub = mrb_nil_value(); + pos = chars2bytes(str, 0, pos); + pos = str_rindex(mrb, str, sub, pos); + if (pos >= 0) { + pos = bytes2chars(RSTRING_PTR(str), RSTRING_LEN(str), pos); + BYTES_ALIGN_CHECK(pos); + return mrb_fixnum_value(pos); } - mrb_regexp_check(mrb, sub); - - switch (mrb_type(sub)) { - case MRB_TT_FIXNUM: { - int c = mrb_fixnum(sub); - unsigned char *p = (unsigned char*)RSTRING_PTR(str); - - for (pos=len-1;pos>=0;pos--) { - if (p[pos] == c) return mrb_fixnum_value(pos); - } - return mrb_nil_value(); - } - - default: { - mrb_value tmp; - - tmp = mrb_check_string_type(mrb, sub); - if (mrb_nil_p(tmp)) { - mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub); - } - sub = tmp; - } - /* fall through */ - case MRB_TT_STRING: - pos = mrb_str_rindex(mrb, str, sub, pos); - if (pos >= 0) return mrb_fixnum_value(pos); - break; - - } /* end of switch (TYPE(sub)) */ return mrb_nil_value(); } @@ -1687,23 +2062,18 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) /* * call-seq: - * str.split(pattern=$;, [limit]) => anArray + * str.split(separator=nil, [limit]) => anArray * * Divides <i>str</i> into substrings based on a delimiter, returning an array * of these substrings. * - * If <i>pattern</i> is a <code>String</code>, then its contents are used as - * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single + * If <i>separator</i> is a <code>String</code>, then its contents are used as + * the delimiter when splitting <i>str</i>. If <i>separator</i> is a single * space, <i>str</i> is split on whitespace, with leading whitespace and runs * of contiguous whitespace characters ignored. * - * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the - * pattern matches. Whenever the pattern matches a zero-length string, - * <i>str</i> is split into individual characters. - * - * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If - * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is - * split on whitespace as if ` ' were specified. + * If <i>separator</i> is omitted or <code>nil</code> (which is the default), + * <i>str</i> is split on whitespace as if ' ' were specified. * * If the <i>limit</i> parameter is omitted, trailing null fields are * suppressed. If <i>limit</i> is a positive number, at most that number of @@ -1714,11 +2084,6 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) * * " now's the time".split #=> ["now's", "the", "time"] * " now's the time".split(' ') #=> ["now's", "the", "time"] - * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"] - * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"] - * "hello".split(//) #=> ["h", "e", "l", "l", "o"] - * "hello".split(//, 3) #=> ["h", "e", "llo"] - * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"] * * "mellow yellow".split("ello") #=> ["m", "w y", "w"] * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"] @@ -1729,13 +2094,14 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) static mrb_value mrb_str_split_m(mrb_state *mrb, mrb_value str) { - int argc; + mrb_int argc; mrb_value spat = mrb_nil_value(); - enum {awk, string, regexp} split_type = string; - long i = 0, lim_p; + enum {awk, string} split_type = string; + mrb_int i = 0; mrb_int beg; mrb_int end; mrb_int lim = 0; + mrb_bool lim_p; mrb_value result, tmp; argc = mrb_get_args(mrb, "|oi", &spat, &lim); @@ -1752,91 +2118,74 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) if (argc == 0 || mrb_nil_p(spat)) { split_type = awk; } - else { - if (mrb_string_p(spat)) { - split_type = string; - if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') { - split_type = awk; - } - } - else { - mrb_noregexp(mrb, str); - } + else if (!mrb_string_p(spat)) { + mrb_raise(mrb, E_TYPE_ERROR, "expected String"); + } + else if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') { + split_type = awk; } result = mrb_ary_new(mrb); beg = 0; if (split_type == awk) { - char *ptr = RSTRING_PTR(str); - char *eptr = RSTRING_END(str); - char *bptr = ptr; mrb_bool skip = TRUE; + mrb_int idx = 0; + mrb_int str_len = RSTRING_LEN(str); unsigned int c; + int ai = mrb_gc_arena_save(mrb); - end = beg; - while (ptr < eptr) { - int ai = mrb_gc_arena_save(mrb); - c = (unsigned char)*ptr++; + idx = end = beg; + while (idx < str_len) { + c = (unsigned char)RSTRING_PTR(str)[idx++]; if (skip) { if (ISSPACE(c)) { - beg = ptr - bptr; + beg = idx; } else { - end = ptr - bptr; + end = idx; skip = FALSE; if (lim_p && lim <= i) break; } } else if (ISSPACE(c)) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, beg, end-beg)); + mrb_ary_push(mrb, result, mrb_str_byte_subseq(mrb, str, beg, end-beg)); mrb_gc_arena_restore(mrb, ai); skip = TRUE; - beg = ptr - bptr; + beg = idx; if (lim_p) ++i; } else { - end = ptr - bptr; + end = idx; } } } - else if (split_type == string) { - char *ptr = RSTRING_PTR(str); /* s->as.ary */ - char *temp = ptr; - char *eptr = RSTRING_END(str); - mrb_int slen = RSTRING_LEN(spat); + else { /* split_type == string */ + mrb_int str_len = RSTRING_LEN(str); + mrb_int pat_len = RSTRING_LEN(spat); + mrb_int idx = 0; + int ai = mrb_gc_arena_save(mrb); - if (slen == 0) { - int ai = mrb_gc_arena_save(mrb); - while (ptr < eptr) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, ptr-temp, 1)); - mrb_gc_arena_restore(mrb, ai); - ptr++; - if (lim_p && lim <= ++i) break; + while (idx < str_len) { + if (pat_len > 0) { + end = mrb_memsearch(RSTRING_PTR(spat), pat_len, RSTRING_PTR(str)+idx, str_len - idx); + if (end < 0) break; } - } - else { - char *sptr = RSTRING_PTR(spat); - int ai = mrb_gc_arena_save(mrb); - - while (ptr < eptr && - (end = mrb_memsearch(sptr, slen, ptr, eptr - ptr)) >= 0) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, ptr - temp, end)); - mrb_gc_arena_restore(mrb, ai); - ptr += end + slen; - if (lim_p && lim <= ++i) break; + else { + end = chars2bytes(str, idx, 1); } + mrb_ary_push(mrb, result, mrb_str_byte_subseq(mrb, str, idx, end)); + mrb_gc_arena_restore(mrb, ai); + idx += end + pat_len; + if (lim_p && lim <= ++i) break; } - beg = ptr - temp; - } - else { - mrb_noregexp(mrb, str); + beg = idx; } if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) { if (RSTRING_LEN(str) == beg) { - tmp = mrb_str_new_empty(mrb, str); + tmp = mrb_str_new(mrb, 0, 0); } else { - tmp = mrb_str_subseq(mrb, str, beg, RSTRING_LEN(str)-beg); + tmp = mrb_str_byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg); } mrb_ary_push(mrb, result, tmp); } @@ -1850,13 +2199,14 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) return result; } -MRB_API mrb_value -mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) +static mrb_value +mrb_str_len_to_inum(mrb_state *mrb, const char *str, size_t len, mrb_int base, int badcheck) { - const char *p; + const char *p = str; + const char *pend = str + len; char sign = 1; - int c, uscore; - unsigned long n = 0; + int c; + mrb_int n = 0; mrb_int val; #define conv_digit(c) \ @@ -1865,26 +2215,23 @@ mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) ISUPPER(c) ? ((c) - 'A' + 10) : \ -1) - if (!str) { + if (!p) { if (badcheck) goto bad; return mrb_fixnum_value(0); } - while (ISSPACE(*str)) str++; + while (p<pend && ISSPACE(*p)) + p++; - if (str[0] == '+') { - str++; + if (p[0] == '+') { + p++; } - else if (str[0] == '-') { - str++; + else if (p[0] == '-') { + p++; sign = 0; } - if (str[0] == '+' || str[0] == '-') { - if (badcheck) goto bad; - return mrb_fixnum_value(0); - } if (base <= 0) { - if (str[0] == '0') { - switch (str[1]) { + if (p[0] == '0') { + switch (p[1]) { case 'x': case 'X': base = 16; break; @@ -1899,6 +2246,7 @@ mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) break; default: base = 8; + break; } } else if (base < -1) { @@ -1910,102 +2258,140 @@ mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) } switch (base) { case 2: - if (str[0] == '0' && (str[1] == 'b'||str[1] == 'B')) { - str += 2; + if (p[0] == '0' && (p[1] == 'b'||p[1] == 'B')) { + p += 2; } break; case 3: break; case 8: - if (str[0] == '0' && (str[1] == 'o'||str[1] == 'O')) { - str += 2; + if (p[0] == '0' && (p[1] == 'o'||p[1] == 'O')) { + p += 2; } case 4: case 5: case 6: case 7: break; case 10: - if (str[0] == '0' && (str[1] == 'd'||str[1] == 'D')) { - str += 2; + if (p[0] == '0' && (p[1] == 'd'||p[1] == 'D')) { + p += 2; } case 9: case 11: case 12: case 13: case 14: case 15: break; case 16: - if (str[0] == '0' && (str[1] == 'x'||str[1] == 'X')) { - str += 2; + if (p[0] == '0' && (p[1] == 'x'||p[1] == 'X')) { + p += 2; } break; default: if (base < 2 || 36 < base) { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %S", mrb_fixnum_value(base)); + mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %i", base); } break; } /* end of switch (base) { */ - if (*str == '0') { /* squeeze preceeding 0s */ - uscore = 0; - while ((c = *++str) == '0' || c == '_') { + if (p>=pend) { + if (badcheck) goto bad; + return mrb_fixnum_value(0); + } + if (*p == '0') { /* squeeze preceding 0s */ + p++; + while (p<pend) { + c = *p++; if (c == '_') { - if (++uscore >= 2) + if (p<pend && *p == '_') { + if (badcheck) goto bad; break; + } + continue; + } + if (c != '0') { + p--; + break; } - else - uscore = 0; } - if (!(c = *str) || ISSPACE(c)) --str; + if (*(p - 1) == '0') + p--; } - c = *str; - c = conv_digit(c); - if (c < 0 || c >= base) { + if (p == pend || *p == '_') { if (badcheck) goto bad; return mrb_fixnum_value(0); } - - uscore = 0; - for (p=str;*p;p++) { + for ( ;p<pend;p++) { if (*p == '_') { - if (uscore == 0) { - uscore++; + p++; + if (p==pend) { + if (badcheck) goto bad; continue; } - if (badcheck) goto bad; - break; + if (*p == '_') { + if (badcheck) goto bad; + break; + } + } + if (badcheck && *p == '\0') { + goto nullbyte; } - uscore = 0; c = conv_digit(*p); if (c < 0 || c >= base) { - if (badcheck) goto bad; break; } - n *= base; + if (mrb_int_mul_overflow(n, base, &n)) goto overflow; + if (MRB_INT_MAX - c < n) { + if (sign == 0 && MRB_INT_MAX - n == c - 1) { + n = MRB_INT_MIN; + sign = 1; + break; + } + overflow: + mrb_raisef(mrb, E_RANGE_ERROR, "string (%l) too big for integer", str, pend-str); + } n += c; } - if (n > MRB_INT_MAX) { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "string (%S) too big for integer", mrb_str_new_cstr(mrb, str)); - } - val = n; + val = (mrb_int)n; if (badcheck) { - if (p == str) goto bad; /* no number */ - while (*p && ISSPACE(*p)) p++; - if (*p) goto bad; /* trailing garbage */ + if (p == str) goto bad; /* no number */ + if (*(p - 1) == '_') goto bad; /* trailing '_' */ + while (p<pend && ISSPACE(*p)) p++; + if (p<pend) goto bad; /* trailing garbage */ } - return mrb_fixnum_value(sign ? val : -val); -bad: - mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%S)", mrb_str_new_cstr(mrb, str)); + return mrb_int_value(mrb, sign ? val : -val); + nullbyte: + mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); + /* not reached */ + bad: + mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%!l)", str, pend-str); /* not reached */ return mrb_fixnum_value(0); } +/* obslete: use RSTRING_CSTR() or mrb_string_cstr() */ MRB_API const char* mrb_string_value_cstr(mrb_state *mrb, mrb_value *ptr) { - struct RString *ps = mrb_str_ptr(*ptr); - mrb_int len = mrb_str_strlen(mrb, ps); - char *p = RSTR_PTR(ps); + struct RString *ps; + const char *p; + mrb_int len; - if (!p || p[len] != '\0') { - mrb_str_modify(mrb, ps); - return RSTR_PTR(ps); + check_null_byte(mrb, *ptr); + ps = mrb_str_ptr(*ptr); + p = RSTR_PTR(ps); + len = RSTR_LEN(ps); + if (p[len] == '\0') { + return p; } - return p; + + /* + * Even after str_modify_keep_ascii(), NULL termination is not ensured if + * RSTR_SET_LEN() is used explicitly (e.g. String#delete_suffix!). + */ + str_modify_keep_ascii(mrb, ps); + RSTR_PTR(ps)[len] = '\0'; + return RSTR_PTR(ps); +} + +MRB_API const char* +mrb_string_cstr(mrb_state *mrb, mrb_value str) +{ + return mrb_string_value_cstr(mrb, &str); } MRB_API mrb_value @@ -2014,21 +2400,10 @@ mrb_str_to_inum(mrb_state *mrb, mrb_value str, mrb_int base, mrb_bool badcheck) const char *s; mrb_int len; - str = mrb_str_to_str(mrb, str); - if (badcheck) { - s = mrb_string_value_cstr(mrb, &str); - } - else { - s = RSTRING_PTR(str); - } - if (s) { - len = RSTRING_LEN(str); - if (s[len]) { /* no sentinel somehow */ - struct RString *temp_str = str_new(mrb, s, len); - s = RSTR_PTR(temp_str); - } - } - return mrb_cstr_to_inum(mrb, s, base, badcheck); + mrb_to_str(mrb, str); + s = RSTRING_PTR(str); + len = RSTRING_LEN(str); + return mrb_str_len_to_inum(mrb, s, len, base, badcheck); } /* 15.2.10.5.38 */ @@ -2058,71 +2433,98 @@ mrb_str_to_i(mrb_state *mrb, mrb_value self) mrb_int base = 10; mrb_get_args(mrb, "|i", &base); - if (base < 0) { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %S", mrb_fixnum_value(base)); + if (base < 0 || 36 < base) { + mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %i", base); } return mrb_str_to_inum(mrb, self, base, FALSE); } -MRB_API double -mrb_cstr_to_dbl(mrb_state *mrb, const char * p, mrb_bool badcheck) +#ifndef MRB_NO_FLOAT +static double +mrb_str_len_to_dbl(mrb_state *mrb, const char *s, size_t len, mrb_bool badcheck) { + char buf[DBL_DIG * 4 + 20]; + const char *p = s, *p2; + const char *pend = p + len; char *end; + char *n; + char prev = 0; double d; - - enum {max_width = 20}; + mrb_bool dot = FALSE; if (!p) return 0.0; - while (ISSPACE(*p)) p++; - - if (!badcheck && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { - return 0.0; + while (p<pend && ISSPACE(*p)) p++; + p2 = p; + + if (pend - p > 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { + mrb_value x; + + if (!badcheck) return 0.0; + x = mrb_str_len_to_inum(mrb, p, pend-p, 0, badcheck); + if (mrb_integer_p(x)) + d = (double)mrb_integer(x); + else /* if (mrb_float_p(x)) */ + d = mrb_float(x); + return d; } - d = strtod(p, &end); - if (p == end) { - if (badcheck) { -bad: - mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%S)", mrb_str_new_cstr(mrb, p)); - /* not reached */ + while (p < pend) { + if (!*p) { + if (badcheck) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string for Float contains null byte"); + /* not reached */ + } + pend = p; + p = p2; + goto nocopy; } - return d; + if (!badcheck && *p == ' ') { + pend = p; + p = p2; + goto nocopy; + } + if (*p == '_') break; + p++; } - if (*end) { - char buf[DBL_DIG * 4 + 10]; - char *n = buf; - char *e = buf + sizeof(buf) - 1; - char prev = 0; - - while (p < end && n < e) prev = *n++ = *p++; - while (*p) { - if (*p == '_') { - /* remove underscores between digits */ - if (badcheck) { - if (n == buf || !ISDIGIT(prev)) goto bad; - ++p; - if (!ISDIGIT(*p)) goto bad; - } - else { - while (*++p == '_'); - continue; - } + p = p2; + n = buf; + while (p < pend) { + char c = *p++; + if (c == '.') dot = TRUE; + if (c == '_') { + /* remove an underscore between digits */ + if (n == buf || !ISDIGIT(prev) || p == pend) { + if (badcheck) goto bad; + break; } - prev = *p++; - if (n < e) *n++ = prev; } - *n = '\0'; - p = buf; - - if (!badcheck && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { - return 0.0; + else if (badcheck && prev == '_' && !ISDIGIT(c)) goto bad; + else { + const char *bend = buf+sizeof(buf)-1; + if (n==bend) { /* buffer overflow */ + if (dot) break; /* cut off remaining fractions */ + return INFINITY; + } + *n++ = c; } - - d = strtod(p, &end); + prev = c; + } + *n = '\0'; + p = buf; + pend = n; +nocopy: + d = mrb_float_read(p, &end); + if (p == end) { if (badcheck) { - if (!end || p == end) goto bad; - while (*end && ISSPACE(*end)) end++; - if (*end) goto bad; +bad: + mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%!s)", s); + /* not reached */ } + return d; + } + if (badcheck) { + if (!end || p == end) goto bad; + while (end<pend && ISSPACE(*end)) end++; + if (end<pend) goto bad; } return d; } @@ -2130,22 +2532,7 @@ bad: MRB_API double mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck) { - char *s; - mrb_int len; - - str = mrb_str_to_str(mrb, str); - s = RSTRING_PTR(str); - len = RSTRING_LEN(str); - if (s) { - if (badcheck && memchr(s, '\0', len)) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string for Float contains null byte"); - } - if (s[len]) { /* no sentinel somehow */ - struct RString *temp_str = str_new(mrb, s, len); - s = RSTR_PTR(temp_str); - } - } - return mrb_cstr_to_dbl(mrb, s, badcheck); + return mrb_str_len_to_dbl(mrb, RSTRING_PTR(str), RSTRING_LEN(str), badcheck); } /* 15.2.10.5.39 */ @@ -2154,7 +2541,7 @@ mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck) * str.to_f => float * * Returns the result of interpreting leading characters in <i>str</i> as a - * floating point number. Extraneous characters past the end of a valid number + * floating-point number. Extraneous characters past the end of a valid number * are ignored. If there is not a valid number at the start of <i>str</i>, * <code>0.0</code> is returned. This method never raises an exception. * @@ -2167,12 +2554,12 @@ mrb_str_to_f(mrb_state *mrb, mrb_value self) { return mrb_float_value(mrb, mrb_str_to_dbl(mrb, self, FALSE)); } +#endif /* 15.2.10.5.40 */ /* * call-seq: * str.to_s => str - * str.to_str => str * * Returns the receiver. */ @@ -2200,7 +2587,7 @@ mrb_str_upcase_bang(mrb_state *mrb, mrb_value str) char *p, *pend; mrb_bool modify = FALSE; - mrb_str_modify(mrb, s); + mrb_str_modify_keep_ascii(mrb, s); p = RSTRING_PTR(str); pend = RSTRING_END(str); while (p < pend) { @@ -2222,7 +2609,7 @@ mrb_str_upcase_bang(mrb_state *mrb, mrb_value str) * * Returns a copy of <i>str</i> with all lowercase letters replaced with their * uppercase counterparts. The operation is locale insensitive---only - * characters ``a'' to ``z'' are affected. + * characters 'a' to 'z' are affected. * * "hEllO".upcase #=> "HELLO" */ @@ -2236,8 +2623,6 @@ mrb_str_upcase(mrb_state *mrb, mrb_value self) return str; } -#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) - /* * call-seq: * str.dump -> new_str @@ -2248,143 +2633,76 @@ mrb_str_upcase(mrb_state *mrb, mrb_value self) mrb_value mrb_str_dump(mrb_state *mrb, mrb_value str) { - mrb_int len; - const char *p, *pend; - char *q; - struct RString *result; - - len = 2; /* "" */ - p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); - while (p < pend) { - unsigned char c = *p++; - switch (c) { - case '"': case '\\': - case '\n': case '\r': - case '\t': case '\f': - case '\013': case '\010': case '\007': case '\033': - len += 2; - break; + return str_escape(mrb, str, FALSE); +} - case '#': - len += IS_EVSTR(p, pend) ? 2 : 1; - break; +MRB_API mrb_value +mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len) +{ + struct RString *s = mrb_str_ptr(str); + size_t capa; + size_t total; + ptrdiff_t off = -1; - default: - if (ISPRINT(c)) { - len++; - } - else { - len += 4; /* \NNN */ - } - break; - } + if (len == 0) return str; + mrb_str_modify(mrb, s); + if (ptr >= RSTR_PTR(s) && ptr <= RSTR_PTR(s) + (size_t)RSTR_LEN(s)) { + off = ptr - RSTR_PTR(s); } - result = str_new(mrb, 0, len); - str_with_class(mrb, result, str); - p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); - q = RSTR_PTR(result); - *q++ = '"'; - while (p < pend) { - unsigned char c = *p++; - - switch (c) { - case '"': - case '\\': - *q++ = '\\'; - *q++ = c; - break; - - case '\n': - *q++ = '\\'; - *q++ = 'n'; - break; - - case '\r': - *q++ = '\\'; - *q++ = 'r'; - break; - - case '\t': - *q++ = '\\'; - *q++ = 't'; - break; - - case '\f': - *q++ = '\\'; - *q++ = 'f'; - break; - - case '\013': - *q++ = '\\'; - *q++ = 'v'; - break; - - case '\010': - *q++ = '\\'; - *q++ = 'b'; - break; - - case '\007': - *q++ = '\\'; - *q++ = 'a'; - break; - - case '\033': - *q++ = '\\'; - *q++ = 'e'; - break; - - case '#': - if (IS_EVSTR(p, pend)) *q++ = '\\'; - *q++ = '#'; - break; - - default: - if (ISPRINT(c)) { - *q++ = c; - } - else { - *q++ = '\\'; - q[2] = '0' + c % 8; c /= 8; - q[1] = '0' + c % 8; c /= 8; - q[0] = '0' + c % 8; - q += 3; - } + capa = RSTR_CAPA(s); + total = RSTR_LEN(s)+len; + if (total >= MRB_SSIZE_MAX) { + size_error: + mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); + } + if (capa <= total) { + if (capa == 0) capa = 1; + while (capa <= total) { + if (capa <= MRB_SSIZE_MAX / 2) { + capa *= 2; + } + else { + capa = total+1; + } } + if (capa <= total || capa > MRB_SSIZE_MAX) { + goto size_error; + } + resize_capa(mrb, s, capa); } - *q = '"'; - return mrb_obj_value(result); -} - -MRB_API mrb_value -mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len) -{ - str_buf_cat(mrb, mrb_str_ptr(str), ptr, len); + if (off != -1) { + ptr = RSTR_PTR(s) + off; + } + memcpy(RSTR_PTR(s) + RSTR_LEN(s), ptr, len); + mrb_assert_int_fit(size_t, total, mrb_ssize, MRB_SSIZE_MAX); + RSTR_SET_LEN(s, total); + RSTR_PTR(s)[total] = '\0'; /* sentinel */ return str; } MRB_API mrb_value mrb_str_cat_cstr(mrb_state *mrb, mrb_value str, const char *ptr) { - return mrb_str_cat(mrb, str, ptr, strlen(ptr)); + return mrb_str_cat(mrb, str, ptr, ptr ? strlen(ptr) : 0); } MRB_API mrb_value mrb_str_cat_str(mrb_state *mrb, mrb_value str, mrb_value str2) { + if (mrb_str_ptr(str) == mrb_str_ptr(str2)) { + mrb_str_modify(mrb, mrb_str_ptr(str)); + } return mrb_str_cat(mrb, str, RSTRING_PTR(str2), RSTRING_LEN(str2)); } MRB_API mrb_value -mrb_str_append(mrb_state *mrb, mrb_value str, mrb_value str2) +mrb_str_append(mrb_state *mrb, mrb_value str1, mrb_value str2) { - str2 = mrb_str_to_str(mrb, str2); - return mrb_str_cat_str(mrb, str, str2); + mrb_to_str(mrb, str2); + return mrb_str_cat_str(mrb, str1, str2); } -#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ - /* * call-seq: * str.inspect -> string @@ -2399,54 +2717,7 @@ mrb_str_append(mrb_state *mrb, mrb_value str, mrb_value str2) mrb_value mrb_str_inspect(mrb_state *mrb, mrb_value str) { - const char *p, *pend; - char buf[CHAR_ESC_LEN + 1]; - mrb_value result = mrb_str_new_lit(mrb, "\""); - - p = RSTRING_PTR(str); pend = RSTRING_END(str); - for (;p < pend; p++) { - unsigned char c, cc; - - c = *p; - if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p, pend))) { - buf[0] = '\\'; buf[1] = c; - mrb_str_cat(mrb, result, buf, 2); - continue; - } - if (ISPRINT(c)) { - buf[0] = c; - mrb_str_cat(mrb, result, buf, 1); - continue; - } - switch (c) { - case '\n': cc = 'n'; break; - case '\r': cc = 'r'; break; - case '\t': cc = 't'; break; - case '\f': cc = 'f'; break; - case '\013': cc = 'v'; break; - case '\010': cc = 'b'; break; - case '\007': cc = 'a'; break; - case 033: cc = 'e'; break; - default: cc = 0; break; - } - if (cc) { - buf[0] = '\\'; - buf[1] = (char)cc; - mrb_str_cat(mrb, result, buf, 2); - continue; - } - else { - buf[0] = '\\'; - buf[3] = '0' + c % 8; c /= 8; - buf[2] = '0' + c % 8; c /= 8; - buf[1] = '0' + c % 8; - mrb_str_cat(mrb, result, buf, 4); - continue; - } - } - mrb_str_cat_lit(mrb, result, "\""); - - return result; + return str_escape(mrb, str, TRUE); } /* @@ -2472,30 +2743,137 @@ mrb_str_bytes(mrb_state *mrb, mrb_value str) return a; } +/* + * call-seq: + * str.getbyte(index) -> 0 .. 255 + * + * returns the <i>index</i>th byte as an integer. + */ +static mrb_value +mrb_str_getbyte(mrb_state *mrb, mrb_value str) +{ + mrb_int pos; + mrb_get_args(mrb, "i", &pos); + + if (pos < 0) + pos += RSTRING_LEN(str); + if (pos < 0 || RSTRING_LEN(str) <= pos) + return mrb_nil_value(); + + return mrb_fixnum_value((unsigned char)RSTRING_PTR(str)[pos]); +} + +/* + * call-seq: + * str.setbyte(index, integer) -> integer + * + * modifies the <i>index</i>th byte as <i>integer</i>. + */ +static mrb_value +mrb_str_setbyte(mrb_state *mrb, mrb_value str) +{ + mrb_int pos, byte; + mrb_int len; + + mrb_get_args(mrb, "ii", &pos, &byte); + + len = RSTRING_LEN(str); + if (pos < -len || len <= pos) + mrb_raisef(mrb, E_INDEX_ERROR, "index %i out of string", pos); + if (pos < 0) + pos += len; + + mrb_str_modify(mrb, mrb_str_ptr(str)); + byte &= 0xff; + RSTRING_PTR(str)[pos] = (unsigned char)byte; + return mrb_fixnum_value((unsigned char)byte); +} + +/* + * call-seq: + * str.byteslice(integer) -> new_str or nil + * str.byteslice(integer, integer) -> new_str or nil + * str.byteslice(range) -> new_str or nil + * + * Byte Reference---If passed a single Integer, returns a + * substring of one byte at that position. If passed two Integer + * objects, returns a substring starting at the offset given by the first, and + * a length given by the second. If given a Range, a substring containing + * bytes at offsets given by the range is returned. In all three cases, if + * an offset is negative, it is counted from the end of <i>str</i>. Returns + * <code>nil</code> if the initial offset falls outside the string, the length + * is negative, or the beginning of the range is greater than the end. + * The encoding of the resulted string keeps original encoding. + * + * "hello".byteslice(1) #=> "e" + * "hello".byteslice(-1) #=> "o" + * "hello".byteslice(1, 2) #=> "el" + * "\x80\u3042".byteslice(1, 3) #=> "\u3042" + * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042" + */ +static mrb_value +mrb_str_byteslice(mrb_state *mrb, mrb_value str) +{ + mrb_value a1; + mrb_int str_len = RSTRING_LEN(str), beg, len; + mrb_bool empty = TRUE; + + len = mrb_get_argc(mrb); + switch (len) { + case 2: + mrb_get_args(mrb, "ii", &beg, &len); + break; + case 1: + a1 = mrb_get_arg1(mrb); + if (mrb_range_p(a1)) { + if (mrb_range_beg_len(mrb, a1, &beg, &len, str_len, TRUE) != MRB_RANGE_OK) { + return mrb_nil_value(); + } + } + else { + beg = mrb_integer(mrb_to_int(mrb, a1)); + len = 1; + empty = FALSE; + } + break; + default: + mrb_argnum_error(mrb, len, 1, 2); + break; + } + if (mrb_str_beg_len(str_len, &beg, &len) && (empty || len != 0)) { + return mrb_str_byte_subseq(mrb, str, beg, len); + } + else { + return mrb_nil_value(); + } +} + /* ---------------------------*/ void mrb_init_string(mrb_state *mrb) { struct RClass *s; - mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << 5), "pointer size too big for embedded string"); + mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << MRB_STR_EMBED_LEN_BIT), + "pointer size too big for embedded string"); - s = mrb->string_class = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */ + mrb->string_class = s = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */ MRB_SET_INSTANCE_TT(s, MRB_TT_STRING); - mrb_define_method(mrb, s, "bytesize", mrb_str_size, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "bytesize", mrb_str_bytesize, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "<=>", mrb_str_cmp_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.1 */ mrb_define_method(mrb, s, "==", mrb_str_equal_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.2 */ mrb_define_method(mrb, s, "+", mrb_str_plus_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.4 */ mrb_define_method(mrb, s, "*", mrb_str_times, MRB_ARGS_REQ(1)); /* 15.2.10.5.5 */ mrb_define_method(mrb, s, "[]", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.6 */ + mrb_define_method(mrb, s, "[]=", mrb_str_aset_m, MRB_ARGS_ANY()); mrb_define_method(mrb, s, "capitalize", mrb_str_capitalize, MRB_ARGS_NONE()); /* 15.2.10.5.7 */ mrb_define_method(mrb, s, "capitalize!", mrb_str_capitalize_bang, MRB_ARGS_NONE()); /* 15.2.10.5.8 */ mrb_define_method(mrb, s, "chomp", mrb_str_chomp, MRB_ARGS_ANY()); /* 15.2.10.5.9 */ mrb_define_method(mrb, s, "chomp!", mrb_str_chomp_bang, MRB_ARGS_ANY()); /* 15.2.10.5.10 */ - mrb_define_method(mrb, s, "chop", mrb_str_chop, MRB_ARGS_REQ(1)); /* 15.2.10.5.11 */ - mrb_define_method(mrb, s, "chop!", mrb_str_chop_bang, MRB_ARGS_REQ(1)); /* 15.2.10.5.12 */ + mrb_define_method(mrb, s, "chop", mrb_str_chop, MRB_ARGS_NONE()); /* 15.2.10.5.11 */ + mrb_define_method(mrb, s, "chop!", mrb_str_chop_bang, MRB_ARGS_NONE()); /* 15.2.10.5.12 */ mrb_define_method(mrb, s, "downcase", mrb_str_downcase, MRB_ARGS_NONE()); /* 15.2.10.5.13 */ mrb_define_method(mrb, s, "downcase!", mrb_str_downcase_bang, MRB_ARGS_NONE()); /* 15.2.10.5.14 */ mrb_define_method(mrb, s, "empty?", mrb_str_empty_p, MRB_ARGS_NONE()); /* 15.2.10.5.16 */ @@ -2503,7 +2881,7 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "hash", mrb_str_hash_m, MRB_ARGS_NONE()); /* 15.2.10.5.20 */ mrb_define_method(mrb, s, "include?", mrb_str_include, MRB_ARGS_REQ(1)); /* 15.2.10.5.21 */ - mrb_define_method(mrb, s, "index", mrb_str_index_m, MRB_ARGS_ANY()); /* 15.2.10.5.22 */ + mrb_define_method(mrb, s, "index", mrb_str_index_m, MRB_ARGS_ARG(1,1)); /* 15.2.10.5.22 */ mrb_define_method(mrb, s, "initialize", mrb_str_init, MRB_ARGS_REQ(1)); /* 15.2.10.5.23 */ mrb_define_method(mrb, s, "initialize_copy", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.24 */ mrb_define_method(mrb, s, "intern", mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.25 */ @@ -2511,12 +2889,14 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "replace", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.28 */ mrb_define_method(mrb, s, "reverse", mrb_str_reverse, MRB_ARGS_NONE()); /* 15.2.10.5.29 */ mrb_define_method(mrb, s, "reverse!", mrb_str_reverse_bang, MRB_ARGS_NONE()); /* 15.2.10.5.30 */ - mrb_define_method(mrb, s, "rindex", mrb_str_rindex_m, MRB_ARGS_ANY()); /* 15.2.10.5.31 */ + mrb_define_method(mrb, s, "rindex", mrb_str_rindex, MRB_ARGS_ANY()); /* 15.2.10.5.31 */ mrb_define_method(mrb, s, "size", mrb_str_size, MRB_ARGS_NONE()); /* 15.2.10.5.33 */ mrb_define_method(mrb, s, "slice", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.34 */ mrb_define_method(mrb, s, "split", mrb_str_split_m, MRB_ARGS_ANY()); /* 15.2.10.5.35 */ +#ifndef MRB_NO_FLOAT mrb_define_method(mrb, s, "to_f", mrb_str_to_f, MRB_ARGS_NONE()); /* 15.2.10.5.38 */ +#endif mrb_define_method(mrb, s, "to_i", mrb_str_to_i, MRB_ARGS_ANY()); /* 15.2.10.5.39 */ mrb_define_method(mrb, s, "to_s", mrb_str_to_s, MRB_ARGS_NONE()); /* 15.2.10.5.40 */ mrb_define_method(mrb, s, "to_str", mrb_str_to_s, MRB_ARGS_NONE()); @@ -2525,4 +2905,8 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "upcase!", mrb_str_upcase_bang, MRB_ARGS_NONE()); /* 15.2.10.5.43 */ mrb_define_method(mrb, s, "inspect", mrb_str_inspect, MRB_ARGS_NONE()); /* 15.2.10.5.46(x) */ mrb_define_method(mrb, s, "bytes", mrb_str_bytes, MRB_ARGS_NONE()); + + mrb_define_method(mrb, s, "getbyte", mrb_str_getbyte, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "setbyte", mrb_str_setbyte, MRB_ARGS_REQ(2)); + mrb_define_method(mrb, s, "byteslice", mrb_str_byteslice, MRB_ARGS_ARG(1,1)); } |
