diff options
Diffstat (limited to 'src/string.c')
| -rw-r--r-- | src/string.c | 1384 |
1 files changed, 791 insertions, 593 deletions
diff --git a/src/string.c b/src/string.c index 6570c89fb..6664eabd6 100644 --- a/src/string.c +++ b/src/string.c @@ -4,20 +4,21 @@ ** See Copyright Notice in mruby.h */ -#include <ctype.h> +#ifdef _MSC_VER +# define _CRT_NONSTDC_NO_DEPRECATE +#endif + #include <float.h> #include <limits.h> #include <stddef.h> #include <stdlib.h> #include <string.h> -#include "mruby.h" -#include "mruby/array.h" -#include "mruby/class.h" -#include "mruby/range.h" -#include "mruby/string.h" -#include "mruby/re.h" - -const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; +#include <mruby.h> +#include <mruby/array.h> +#include <mruby/class.h> +#include <mruby/range.h> +#include <mruby/string.h> +#include <mruby/re.h> typedef struct mrb_shared_string { mrb_bool nofree : 1; @@ -26,129 +27,39 @@ typedef struct mrb_shared_string { mrb_int len; } mrb_shared_string; -static mrb_value str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2); -static mrb_value mrb_str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len); - -mrb_int -mrb_str_strlen(mrb_state *mrb, struct RString *s) -{ - mrb_int i, max = RSTR_LEN(s); - char *p = RSTR_PTR(s); - - if (!p) return 0; - for (i=0; i<max; i++) { - if (p[i] == '\0') { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); - } - } - return max; -} - -static inline void -resize_capa(mrb_state *mrb, struct RString *s, mrb_int capacity) -{ - if (RSTR_EMBED_P(s)) { - if (RSTRING_EMBED_LEN_MAX < capacity) { - char *const tmp = (char *)mrb_malloc(mrb, capacity+1); - const mrb_int len = RSTR_EMBED_LEN(s); - memcpy(tmp, s->as.ary, len); - RSTR_UNSET_EMBED_FLAG(s); - s->as.heap.ptr = tmp; - s->as.heap.len = len; - s->as.heap.aux.capa = capacity; - } - } - else { - s->as.heap.ptr = (char *)mrb_realloc(mrb, RSTR_PTR(s), capacity+1); - s->as.heap.aux.capa = capacity; - } -} +const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; -static void -str_decref(mrb_state *mrb, mrb_shared_string *shared) -{ - shared->refcnt--; - if (shared->refcnt == 0) { - if (!shared->nofree) { - mrb_free(mrb, shared->ptr); - } - mrb_free(mrb, shared); - } -} +#define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class)) -void -mrb_str_modify(mrb_state *mrb, struct RString *s) +static struct RString* +str_new_static(mrb_state *mrb, const char *p, size_t len) { - if (RSTR_SHARED_P(s)) { - mrb_shared_string *shared = s->as.heap.aux.shared; - - if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) { - s->as.heap.ptr = shared->ptr; - s->as.heap.aux.capa = shared->len; - RSTR_PTR(s)[s->as.heap.len] = '\0'; - mrb_free(mrb, shared); - } - else { - char *ptr, *p; - mrb_int len; - - p = RSTR_PTR(s); - len = s->as.heap.len; - ptr = (char *)mrb_malloc(mrb, (size_t)len + 1); - if (p) { - memcpy(ptr, p, len); - } - ptr[len] = '\0'; - s->as.heap.ptr = ptr; - s->as.heap.aux.capa = len; - str_decref(mrb, shared); - } - RSTR_UNSET_SHARED_FLAG(s); - return; - } - if (RSTR_NOFREE_P(s)) { - char *p = s->as.heap.ptr; + struct RString *s; - s->as.heap.ptr = (char *)mrb_malloc(mrb, (size_t)s->as.heap.len+1); - if (p) { - memcpy(RSTR_PTR(s), p, s->as.heap.len); - } - RSTR_PTR(s)[s->as.heap.len] = '\0'; - s->as.heap.aux.capa = s->as.heap.len; - RSTR_UNSET_NOFREE_FLAG(s); - return; + if (len >= MRB_INT_MAX) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); } -} - -mrb_value -mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len) -{ - mrb_int slen; - struct RString *s = mrb_str_ptr(str); + s = mrb_obj_alloc_string(mrb); + s->as.heap.len = len; + s->as.heap.aux.capa = 0; /* nofree */ + s->as.heap.ptr = (char *)p; + s->flags = MRB_STR_NOFREE; - mrb_str_modify(mrb, s); - slen = RSTR_LEN(s); - if (len != slen) { - if (slen < len || slen - len > 256) { - resize_capa(mrb, s, len); - } - RSTR_SET_LEN(s, len); - RSTR_PTR(s)[len] = '\0'; /* sentinel */ - } - return str; + return s; } -#define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class)) - static struct RString* str_new(mrb_state *mrb, const char *p, size_t len) { struct RString *s; + if (p && mrb_ro_data_p(p)) { + return str_new_static(mrb, p, len); + } s = mrb_obj_alloc_string(mrb); if (len < RSTRING_EMBED_LEN_MAX) { RSTR_SET_EMBED_FLAG(s); - RSTR_SET_EMBED_LEN(s,len); + RSTR_SET_EMBED_LEN(s, len); if (p) { memcpy(s->as.ary, p, len); } @@ -167,7 +78,7 @@ str_new(mrb_state *mrb, const char *p, size_t len) return s; } -static void +static inline void str_with_class(mrb_state *mrb, struct RString *s, mrb_value obj) { s->c = mrb_str_ptr(obj)->c; @@ -186,7 +97,7 @@ mrb_str_new_empty(mrb_state *mrb, mrb_value str) # define MRB_STR_BUF_MIN_SIZE 128 #endif -mrb_value +MRB_API mrb_value mrb_str_buf_new(mrb_state *mrb, size_t capa) { struct RString *s; @@ -207,6 +118,26 @@ mrb_str_buf_new(mrb_state *mrb, size_t capa) return mrb_obj_value(s); } +static inline void +resize_capa(mrb_state *mrb, struct RString *s, mrb_int capacity) +{ + if (RSTR_EMBED_P(s)) { + if (RSTRING_EMBED_LEN_MAX < capacity) { + char *const tmp = (char *)mrb_malloc(mrb, capacity+1); + const mrb_int len = RSTR_EMBED_LEN(s); + memcpy(tmp, s->as.ary, len); + RSTR_UNSET_EMBED_FLAG(s); + s->as.heap.ptr = tmp; + s->as.heap.len = len; + s->as.heap.aux.capa = capacity; + } + } + else { + s->as.heap.ptr = (char *)mrb_realloc(mrb, RSTR_PTR(s), capacity+1); + s->as.heap.aux.capa = capacity; + } +} + static void str_buf_cat(mrb_state *mrb, struct RString *s, const char *ptr, size_t len) { @@ -248,7 +179,7 @@ str_buf_cat(mrb_state *mrb, struct RString *s, const char *ptr, size_t len) RSTR_PTR(s)[total] = '\0'; /* sentinel */ } -mrb_value +MRB_API mrb_value mrb_str_new(mrb_state *mrb, const char *p, size_t len) { return mrb_obj_value(str_new(mrb, p, len)); @@ -261,7 +192,7 @@ mrb_str_new(mrb_state *mrb, const char *p, size_t len) * Returns a new string object containing a copy of <i>str</i>. */ -mrb_value +MRB_API mrb_value mrb_str_new_cstr(mrb_state *mrb, const char *p) { struct RString *s; @@ -279,20 +210,23 @@ mrb_str_new_cstr(mrb_state *mrb, const char *p) return mrb_obj_value(s); } -mrb_value +MRB_API mrb_value mrb_str_new_static(mrb_state *mrb, const char *p, size_t len) { - struct RString *s; + struct RString *s = str_new_static(mrb, p, len); + return mrb_obj_value(s); +} - if (len >= MRB_INT_MAX) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big"); +static void +str_decref(mrb_state *mrb, mrb_shared_string *shared) +{ + shared->refcnt--; + if (shared->refcnt == 0) { + if (!shared->nofree) { + mrb_free(mrb, shared->ptr); + } + mrb_free(mrb, shared); } - s = mrb_obj_alloc_string(mrb); - s->as.heap.len = len; - s->as.heap.aux.capa = 0; /* nofree */ - s->as.heap.ptr = (char *)p; - s->flags = MRB_STR_NOFREE; - return mrb_obj_value(s); } void @@ -306,20 +240,126 @@ mrb_gc_free_str(mrb_state *mrb, struct RString *str) mrb_free(mrb, str->as.heap.ptr); } -char * -mrb_str_to_cstr(mrb_state *mrb, mrb_value str0) +#ifdef MRB_UTF8_STRING +static const char utf8len_codepage[256] = { - struct RString *s; + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1, +}; - if (!mrb_string_p(str0)) { - mrb_raise(mrb, E_TYPE_ERROR, "expected String"); +static mrb_int +utf8len(const char* p, const char* e) +{ + mrb_int len; + mrb_int i; + + len = utf8len_codepage[(unsigned char)*p]; + if (p + len > e) return 1; + for (i = 1; i < len; ++i) + if ((p[i] & 0xc0) != 0x80) + return 1; + return len; +} + +static mrb_int +utf8_strlen(mrb_value str, mrb_int len) +{ + mrb_int total = 0; + char* p = RSTRING_PTR(str); + char* e = p; + e += len < 0 ? RSTRING_LEN(str) : len; + while (p<e) { + p += utf8len(p, e); + total++; } + return total; +} - s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0)); - if ((strlen(RSTR_PTR(s)) ^ RSTR_LEN(s)) != 0) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); +#define RSTRING_CHAR_LEN(s) utf8_strlen(s, -1) + +/* map character index to byte offset index */ +static mrb_int +chars2bytes(mrb_value s, mrb_int off, mrb_int idx) +{ + mrb_int i, b, n; + const char *p = RSTRING_PTR(s) + off; + const char *e = RSTRING_END(s); + + for (b=i=0; p<e && i<idx; i++) { + n = utf8len(p, e); + b += n; + p += n; } - return RSTR_PTR(s); + return b; +} + +/* map byte offset to character index */ +static mrb_int +bytes2chars(char *p, mrb_int bi) +{ + mrb_int i, b, n; + + for (b=i=0; b<bi; i++) { + n = utf8len(p, p+bi); + b += n; + p += n; + } + return i; +} + +#else +#define RSTRING_CHAR_LEN(s) RSTRING_LEN(s) +#define chars2bytes(p, off, ci) (ci) +#define bytes2chars(p, bi) (bi) +#endif + +static inline mrb_int +mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n) +{ + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys; + int i, qstable[256]; + + /* Preprocessing */ + for (i = 0; i < 256; ++i) + qstable[i] = m + 1; + for (; x < xe; ++x) + qstable[*x] = xe - x; + /* Searching */ + for (; y + m <= ys + n; y += *(qstable + y[m])) { + if (*xs == *y && memcmp(xs, y, m) == 0) + return y - ys; + } + return -1; +} + +static mrb_int +mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n) +{ + const unsigned char *x = (const unsigned char *)x0, *y = (const unsigned char *)y0; + + if (m > n) return -1; + else if (m == n) { + return memcmp(x0, y0, m) == 0 ? 0 : -1; + } + else if (m < 1) { + return 0; + } + else if (m == 1) { + const unsigned char *ys = y, *ye = ys + n; + for (; y < ye; ++y) { + if (*x == *y) + return y - ys; + } + return -1; + } + return mrb_memsearch_qs((const unsigned char *)x0, m, (const unsigned char *)y0, n); } static void @@ -360,18 +400,336 @@ str_make_shared(mrb_state *mrb, struct RString *s) } } -/* - * call-seq: - * char* str = String("abcd"), len=strlen("abcd") - * - * Returns a new string object containing a copy of <i>str</i>. - */ -const char* -mrb_str_body(mrb_value str, int *len_p) +static mrb_value +byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) +{ + struct RString *orig, *s; + mrb_shared_string *shared; + + orig = mrb_str_ptr(str); + if (RSTR_EMBED_P(orig)) { + s = str_new(mrb, orig->as.ary+beg, len); + } + else { + str_make_shared(mrb, orig); + shared = orig->as.heap.aux.shared; + s = mrb_obj_alloc_string(mrb); + s->as.heap.ptr = orig->as.heap.ptr + beg; + s->as.heap.len = len; + s->as.heap.aux.shared = shared; + RSTR_SET_SHARED_FLAG(s); + shared->refcnt++; + } + + return mrb_obj_value(s); +} +#ifdef MRB_UTF8_STRING +static inline mrb_value +str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) +{ + beg = chars2bytes(str, 0, beg); + len = chars2bytes(str, beg, len); + + return byte_subseq(mrb, str, beg, len); +} +#else +#define str_subseq(mrb, str, beg, len) byte_subseq(mrb, str, beg, len) +#endif + +static mrb_value +str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) +{ + mrb_int clen = RSTRING_CHAR_LEN(str); + + if (len < 0) return mrb_nil_value(); + if (clen == 0) { + len = 0; + } + else if (beg < 0) { + beg = clen + beg; + } + if (beg > clen) return mrb_nil_value(); + if (beg < 0) { + beg += clen; + if (beg < 0) return mrb_nil_value(); + } + if (beg + len > clen) + len = clen - beg; + if (len <= 0) { + len = 0; + } + return str_subseq(mrb, str, beg, len); +} + +static mrb_int +str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset) +{ + mrb_int pos; + char *s, *sptr; + mrb_int len, slen; + + len = RSTRING_LEN(str); + slen = RSTRING_LEN(sub); + if (offset < 0) { + offset += len; + if (offset < 0) return -1; + } + if (len - offset < slen) return -1; + s = RSTRING_PTR(str); + if (offset) { + s += offset; + } + if (slen == 0) return offset; + /* need proceed one character at a time */ + sptr = RSTRING_PTR(sub); + slen = RSTRING_LEN(sub); + len = RSTRING_LEN(str) - offset; + pos = mrb_memsearch(sptr, slen, s, len); + if (pos < 0) return pos; + return pos + offset; +} + +static void +check_frozen(mrb_state *mrb, struct RString *s) +{ + if (RSTR_FROZEN_P(s)) { + mrb_raise(mrb, E_RUNTIME_ERROR, "can't modify frozen string"); + } +} + +static mrb_value +str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2) +{ + long len; + + check_frozen(mrb, s1); + len = RSTR_LEN(s2); + if (RSTR_SHARED_P(s1)) { + str_decref(mrb, s1->as.heap.aux.shared); + } + else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1)) { + mrb_free(mrb, s1->as.heap.ptr); + } + + RSTR_UNSET_NOFREE_FLAG(s1); + + if (RSTR_SHARED_P(s2)) { +L_SHARE: + RSTR_UNSET_EMBED_FLAG(s1); + s1->as.heap.ptr = s2->as.heap.ptr; + s1->as.heap.len = len; + s1->as.heap.aux.shared = s2->as.heap.aux.shared; + RSTR_SET_SHARED_FLAG(s1); + s1->as.heap.aux.shared->refcnt++; + } + else { + if (len <= RSTRING_EMBED_LEN_MAX) { + RSTR_UNSET_SHARED_FLAG(s1); + RSTR_SET_EMBED_FLAG(s1); + memcpy(s1->as.ary, RSTR_PTR(s2), len); + RSTR_SET_EMBED_LEN(s1, len); + } + else { + str_make_shared(mrb, s2); + goto L_SHARE; + } + } + + return mrb_obj_value(s1); +} + +static mrb_int +str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) +{ + char *s, *sbeg, *t; + struct RString *ps = mrb_str_ptr(str); + mrb_int len = RSTRING_LEN(sub); + + /* substring longer than string */ + if (RSTR_LEN(ps) < len) return -1; + if (RSTR_LEN(ps) - pos < len) { + pos = RSTR_LEN(ps) - len; + } + sbeg = RSTR_PTR(ps); + s = RSTR_PTR(ps) + pos; + t = RSTRING_PTR(sub); + if (len) { + while (sbeg <= s) { + if (memcmp(s, t, len) == 0) { + return s - RSTR_PTR(ps); + } + s--; + } + return -1; + } + else { + return pos; + } +} + +MRB_API mrb_int +mrb_str_strlen(mrb_state *mrb, struct RString *s) +{ + mrb_int i, max = RSTR_LEN(s); + char *p = RSTR_PTR(s); + + if (!p) return 0; + for (i=0; i<max; i++) { + if (p[i] == '\0') { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); + } + } + return max; +} + +#ifdef _WIN32 +#include <windows.h> + +char* +mrb_utf8_from_locale(const char *str, size_t len) +{ + wchar_t* wcsp; + char* mbsp; + size_t mbssize, wcssize; + + if (len == 0) + return strdup(""); + if (len == -1) + len = strlen(str); + wcssize = MultiByteToWideChar(GetACP(), 0, str, len, NULL, 0); + wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t)); + if (!wcsp) + return NULL; + wcssize = MultiByteToWideChar(GetACP(), 0, str, len, wcsp, wcssize + 1); + wcsp[wcssize] = 0; + + mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL); + mbsp = (char*) malloc((mbssize + 1)); + if (!mbsp) { + free(wcsp); + return NULL; + } + mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL); + mbsp[mbssize] = 0; + free(wcsp); + return mbsp; +} + +char* +mrb_locale_from_utf8(const char *utf8, size_t len) +{ + wchar_t* wcsp; + char* mbsp; + size_t mbssize, wcssize; + + if (len == 0) + return strdup(""); + if (len == -1) + len = strlen(utf8); + wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0); + wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t)); + if (!wcsp) + return NULL; + wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, wcsp, wcssize + 1); + wcsp[wcssize] = 0; + mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL); + mbsp = (char*) malloc((mbssize + 1)); + if (!mbsp) { + free(wcsp); + return NULL; + } + mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL); + mbsp[mbssize] = 0; + free(wcsp); + return mbsp; +} +#endif + +MRB_API void +mrb_str_modify(mrb_state *mrb, struct RString *s) +{ + check_frozen(mrb, s); + if (RSTR_SHARED_P(s)) { + mrb_shared_string *shared = s->as.heap.aux.shared; + + if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) { + s->as.heap.ptr = shared->ptr; + s->as.heap.aux.capa = shared->len; + RSTR_PTR(s)[s->as.heap.len] = '\0'; + mrb_free(mrb, shared); + } + else { + char *ptr, *p; + mrb_int len; + + p = RSTR_PTR(s); + len = s->as.heap.len; + ptr = (char *)mrb_malloc(mrb, (size_t)len + 1); + if (p) { + memcpy(ptr, p, len); + } + ptr[len] = '\0'; + s->as.heap.ptr = ptr; + s->as.heap.aux.capa = len; + str_decref(mrb, shared); + } + RSTR_UNSET_SHARED_FLAG(s); + return; + } + if (RSTR_NOFREE_P(s)) { + char *p = s->as.heap.ptr; + + s->as.heap.ptr = (char *)mrb_malloc(mrb, (size_t)s->as.heap.len+1); + if (p) { + memcpy(RSTR_PTR(s), p, s->as.heap.len); + } + RSTR_PTR(s)[s->as.heap.len] = '\0'; + s->as.heap.aux.capa = s->as.heap.len; + RSTR_UNSET_NOFREE_FLAG(s); + return; + } +} + +static mrb_value +mrb_str_freeze(mrb_state *mrb, mrb_value str) +{ + struct RString *s = mrb_str_ptr(str); + + RSTR_SET_FROZEN_FLAG(s); + return str; +} + +MRB_API mrb_value +mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len) { + mrb_int slen; struct RString *s = mrb_str_ptr(str); - *len_p = RSTR_LEN(s); + mrb_str_modify(mrb, s); + slen = RSTR_LEN(s); + if (len != slen) { + if (slen < len || slen - len > 256) { + resize_capa(mrb, s, len); + } + RSTR_SET_LEN(s, len); + RSTR_PTR(s)[len] = '\0'; /* sentinel */ + } + return str; +} + +MRB_API char* +mrb_str_to_cstr(mrb_state *mrb, mrb_value str0) +{ + struct RString *s; + + if (!mrb_string_p(str0)) { + mrb_raise(mrb, E_TYPE_ERROR, "expected String"); + } + + s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0)); + if ((strlen(RSTR_PTR(s)) ^ RSTR_LEN(s)) != 0) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); + } return RSTR_PTR(s); } @@ -381,7 +739,7 @@ mrb_str_body(mrb_value str, int *len_p) * * Returns a new string object containing a copy of <i>str</i>. */ -void +MRB_API void mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other) { struct RString *s1 = mrb_str_ptr(self), *s2; @@ -408,7 +766,7 @@ mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other) * * Returns a new string object containing a copy of <i>str</i>. */ -mrb_value +MRB_API mrb_value mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b) { struct RString *s = mrb_str_ptr(a); @@ -439,32 +797,26 @@ mrb_str_plus_m(mrb_state *mrb, mrb_value self) return mrb_str_plus(mrb, self, str); } +/* 15.2.10.5.26 */ +/* 15.2.10.5.33 */ /* * call-seq: - * len = strlen(String("abcd")) + * "abcd".size => int * - * Returns a new string object containing a copy of <i>str</i>. + * Returns the length of string. */ static mrb_value -mrb_str_bytesize(mrb_state *mrb, mrb_value self) +mrb_str_size(mrb_state *mrb, mrb_value self) { - struct RString *s = mrb_str_ptr(self); - return mrb_fixnum_value(RSTR_LEN(s)); + mrb_int len = RSTRING_CHAR_LEN(self); + return mrb_fixnum_value(len); } -/* 15.2.10.5.26 */ -/* 15.2.10.5.33 */ -/* - * call-seq: - * len = strlen(String("abcd")) - * - * Returns a new string object containing a copy of <i>str</i>. - */ -mrb_value -mrb_str_size(mrb_state *mrb, mrb_value self) +static mrb_value +mrb_str_bytesize(mrb_state *mrb, mrb_value self) { - struct RString *s = mrb_str_ptr(self); - return mrb_fixnum_value(RSTR_LEN(s)); + mrb_int len = RSTRING_LEN(self); + return mrb_fixnum_value(len); } /* 15.2.10.5.1 */ @@ -521,7 +873,7 @@ mrb_str_times(mrb_state *mrb, mrb_value self) * = 0 * < -1 */ -int +MRB_API int mrb_str_cmp(mrb_state *mrb, mrb_value str1, mrb_value str2) { mrb_int len; @@ -607,7 +959,7 @@ str_eql(mrb_state *mrb, const mrb_value str1, const mrb_value str2) return FALSE; } -mrb_bool +MRB_API mrb_bool mrb_str_equal(mrb_state *mrb, mrb_value str1, mrb_value str2) { if (mrb_immediate_p(str2)) return FALSE; @@ -643,7 +995,7 @@ mrb_str_equal_m(mrb_state *mrb, mrb_value str1) return mrb_bool_value(mrb_str_equal(mrb, str1, str2)); } /* ---------------------------------- */ -mrb_value +MRB_API mrb_value mrb_str_to_str(mrb_state *mrb, mrb_value str) { mrb_value s; @@ -658,100 +1010,35 @@ mrb_str_to_str(mrb_state *mrb, mrb_value str) return str; } -char * +MRB_API const char* mrb_string_value_ptr(mrb_state *mrb, mrb_value ptr) { mrb_value str = mrb_str_to_str(mrb, ptr); return RSTRING_PTR(str); } -static mrb_value -noregexp(mrb_state *mrb, mrb_value self) -{ - mrb_raise(mrb, E_NOTIMP_ERROR, "Regexp class not implemented"); - return mrb_nil_value(); -} - -static void -regexp_check(mrb_state *mrb, mrb_value obj) +MRB_API mrb_int +mrb_string_value_len(mrb_state *mrb, mrb_value ptr) { - if (mrb_regexp_p(mrb, obj)) { - noregexp(mrb, obj); - } -} - -static inline mrb_int -mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n) -{ - const unsigned char *x = xs, *xe = xs + m; - const unsigned char *y = ys; - int i, qstable[256]; - - /* Preprocessing */ - for (i = 0; i < 256; ++i) - qstable[i] = m + 1; - for (; x < xe; ++x) - qstable[*x] = xe - x; - /* Searching */ - for (; y + m <= ys + n; y += *(qstable + y[m])) { - if (*xs == *y && memcmp(xs, y, m) == 0) - return y - ys; - } - return -1; + mrb_value str = mrb_str_to_str(mrb, ptr); + return RSTRING_LEN(str); } -static mrb_int -mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n) +void +mrb_noregexp(mrb_state *mrb, mrb_value self) { - const unsigned char *x = (const unsigned char *)x0, *y = (const unsigned char *)y0; - - if (m > n) return -1; - else if (m == n) { - return memcmp(x0, y0, m) == 0 ? 0 : -1; - } - else if (m < 1) { - return 0; - } - else if (m == 1) { - const unsigned char *ys = y, *ye = ys + n; - for (; y < ye; ++y) { - if (*x == *y) - return y - ys; - } - return -1; - } - return mrb_memsearch_qs((const unsigned char *)x0, m, (const unsigned char *)y0, n); + mrb_raise(mrb, E_NOTIMP_ERROR, "Regexp class not implemented"); } -static mrb_int -mrb_str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset) +void +mrb_regexp_check(mrb_state *mrb, mrb_value obj) { - mrb_int pos; - char *s, *sptr; - mrb_int len, slen; - - len = RSTRING_LEN(str); - slen = RSTRING_LEN(sub); - if (offset < 0) { - offset += len; - if (offset < 0) return -1; - } - if (len - offset < slen) return -1; - s = RSTRING_PTR(str); - if (offset) { - s += offset; + if (mrb_regexp_p(mrb, obj)) { + mrb_noregexp(mrb, obj); } - if (slen == 0) return offset; - /* need proceed one character at a time */ - sptr = RSTRING_PTR(sub); - slen = RSTRING_LEN(sub); - len = RSTRING_LEN(str) - offset; - pos = mrb_memsearch(sptr, slen, s, len); - if (pos < 0) return pos; - return pos + offset; } -mrb_value +MRB_API mrb_value mrb_str_dup(mrb_state *mrb, mrb_value str) { struct RString *s = mrb_str_ptr(str); @@ -766,18 +1053,18 @@ mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx) { mrb_int idx; - regexp_check(mrb, indx); + mrb_regexp_check(mrb, indx); switch (mrb_type(indx)) { case MRB_TT_FIXNUM: idx = mrb_fixnum(indx); num_index: - str = mrb_str_substr(mrb, str, idx, 1); + str = str_substr(mrb, str, idx, 1); if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value(); return str; case MRB_TT_STRING: - if (mrb_str_index(mrb, str, indx, 0) != -1) + if (str_index(mrb, str, indx, 0) != -1) return mrb_str_dup(mrb, indx); return mrb_nil_value(); @@ -786,15 +1073,20 @@ num_index: { mrb_int beg, len; - len = RSTRING_LEN(str); + len = RSTRING_CHAR_LEN(str); if (mrb_range_beg_len(mrb, indx, &beg, &len, len)) { - return mrb_str_subseq(mrb, str, beg, len); + return str_subseq(mrb, str, beg, len); } else { return mrb_nil_value(); } } + case MRB_TT_FLOAT: default: + indx = mrb_Integer(mrb, indx); + if (mrb_nil_p(indx)) { + mrb_raise(mrb, E_TYPE_ERROR, "can't convert to Fixnum"); + } idx = mrb_fixnum(indx); goto num_index; } @@ -831,6 +1123,7 @@ num_index: * * a = "hello there" * a[1] #=> 101(1.8.7) "e"(1.9.2) + * a[1.1] #=> "e"(1.9.2) * a[1,3] #=> "ell" * a[1..3] #=> "ell" * a[-3,2] #=> "er" @@ -848,8 +1141,8 @@ mrb_str_aref_m(mrb_state *mrb, mrb_value str) argc = mrb_get_args(mrb, "o|o", &a1, &a2); if (argc == 2) { - regexp_check(mrb, a1); - return mrb_str_substr(mrb, str, mrb_fixnum(a1), mrb_fixnum(a2)); + mrb_regexp_check(mrb, a1); + return str_substr(mrb, str, mrb_fixnum(a1), mrb_fixnum(a2)); } if (argc != 1) { mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(argc)); @@ -919,7 +1212,7 @@ mrb_str_capitalize(mrb_state *mrb, mrb_value self) /* 15.2.10.5.10 */ /* * call-seq: - * str.chomp!(separator=$/) => str or nil + * str.chomp!(separator="\n") => str or nil * * Modifies <i>str</i> in place as described for <code>String#chomp</code>, * returning <i>str</i>, or <code>nil</code> if no modifications were made. @@ -993,7 +1286,7 @@ mrb_str_chomp_bang(mrb_state *mrb, mrb_value str) /* 15.2.10.5.9 */ /* * call-seq: - * str.chomp(separator=$/) => new_str + * str.chomp(separator="\n") => new_str * * Returns a new <code>String</code> with the given record separator removed * from the end of <i>str</i> (if present). If <code>$/</code> has not been @@ -1036,7 +1329,18 @@ mrb_str_chop_bang(mrb_state *mrb, mrb_value str) mrb_str_modify(mrb, s); if (RSTR_LEN(s) > 0) { mrb_int len; +#ifdef MRB_UTF8_STRING + const char* t = RSTR_PTR(s), *p = t; + const char* e = p + RSTR_LEN(s); + while (p<e) { + mrb_int clen = utf8len(p, e); + if (p + clen>=e) break; + p += clen; + } + len = p - t; +#else len = RSTR_LEN(s) - 1; +#endif if (RSTR_PTR(s)[len] == '\n') { if (len > 0 && RSTR_PTR(s)[len-1] == '\r') { @@ -1113,7 +1417,7 @@ mrb_str_downcase_bang(mrb_state *mrb, mrb_value str) * * Returns a copy of <i>str</i> with all uppercase letters replaced with their * lowercase counterparts. The operation is locale insensitive---only - * characters ``A'' to ``Z'' are affected. + * characters 'A' to 'Z' are affected. * * "hEllO".downcase #=> "hello" */ @@ -1164,47 +1468,10 @@ mrb_str_eql(mrb_state *mrb, mrb_value self) return mrb_bool_value(eql_p); } -static mrb_value -mrb_str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) -{ - struct RString *orig, *s; - mrb_shared_string *shared; - - orig = mrb_str_ptr(str); - if (RSTR_EMBED_P(orig)) { - s = str_new(mrb, orig->as.ary+beg, len); - } else { - str_make_shared(mrb, orig); - shared = orig->as.heap.aux.shared; - s = mrb_obj_alloc_string(mrb); - s->as.heap.ptr = orig->as.heap.ptr + beg; - s->as.heap.len = len; - s->as.heap.aux.shared = shared; - RSTR_SET_SHARED_FLAG(s); - shared->refcnt++; - } - - return mrb_obj_value(s); -} - -mrb_value +MRB_API mrb_value mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) { - if (len < 0) return mrb_nil_value(); - if (!RSTRING_LEN(str)) { - len = 0; - } - if (beg > RSTRING_LEN(str)) return mrb_nil_value(); - if (beg < 0) { - beg += RSTRING_LEN(str); - if (beg < 0) return mrb_nil_value(); - } - if (beg + len > RSTRING_LEN(str)) - len = RSTRING_LEN(str) - beg; - if (len <= 0) { - len = 0; - } - return mrb_str_subseq(mrb, str, beg, len); + return str_substr(mrb, str, beg, len); } mrb_int @@ -1263,7 +1530,7 @@ mrb_str_include(mrb_state *mrb, mrb_value self) } else { str2 = mrb_str_to_str(mrb, str2); - i = mrb_str_index(mrb, self, str2, 0); + i = str_index(mrb, self, str2, 0); include_p = (i != -1); } @@ -1293,12 +1560,12 @@ mrb_str_include(mrb_state *mrb, mrb_value self) * "hello".index(/[aeiou]/, -3) #=> 4 */ static mrb_value -mrb_str_index_m(mrb_state *mrb, mrb_value str) +mrb_str_index(mrb_state *mrb, mrb_value str) { mrb_value *argv; mrb_int argc; mrb_value sub; - mrb_int pos; + mrb_int pos, clen; mrb_get_args(mrb, "*", &argv, &argc); if (argc == 2) { @@ -1312,26 +1579,18 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) else sub = mrb_nil_value(); } - regexp_check(mrb, sub); + mrb_regexp_check(mrb, sub); + clen = RSTRING_CHAR_LEN(str); if (pos < 0) { - pos += RSTRING_LEN(str); + pos += clen; if (pos < 0) { return mrb_nil_value(); } } + if (pos >= clen) return mrb_nil_value(); + pos = chars2bytes(str, 0, pos); switch (mrb_type(sub)) { - case MRB_TT_FIXNUM: { - int c = mrb_fixnum(sub); - mrb_int len = RSTRING_LEN(str); - unsigned char *p = (unsigned char*)RSTRING_PTR(str); - - for (;pos<len;pos++) { - if (p[pos] == c) return mrb_fixnum_value(pos); - } - return mrb_nil_value(); - } - default: { mrb_value tmp; @@ -1343,56 +1602,17 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) } /* fall through */ case MRB_TT_STRING: - pos = mrb_str_index(mrb, str, sub, pos); + pos = str_index(mrb, str, sub, pos); break; } if (pos == -1) return mrb_nil_value(); + pos = bytes2chars(RSTRING_PTR(str), pos); return mrb_fixnum_value(pos); } #define STR_REPLACE_SHARED_MIN 10 -static mrb_value -str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2) -{ - long len; - - len = RSTR_LEN(s2); - if (RSTR_SHARED_P(s1)) { - str_decref(mrb, s1->as.heap.aux.shared); - } - else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1)) { - mrb_free(mrb, s1->as.heap.ptr); - } - - RSTR_UNSET_NOFREE_FLAG(s1); - - if (RSTR_SHARED_P(s2)) { -L_SHARE: - RSTR_UNSET_EMBED_FLAG(s1); - s1->as.heap.ptr = s2->as.heap.ptr; - s1->as.heap.len = len; - s1->as.heap.aux.shared = s2->as.heap.aux.shared; - RSTR_SET_SHARED_FLAG(s1); - s1->as.heap.aux.shared->refcnt++; - } - else { - if (len <= RSTRING_EMBED_LEN_MAX) { - RSTR_UNSET_SHARED_FLAG(s1); - RSTR_SET_EMBED_FLAG(s1); - memcpy(s1->as.ary, RSTR_PTR(s2), len); - RSTR_SET_EMBED_LEN(s1, len); - } - else { - str_make_shared(mrb, s2); - goto L_SHARE; - } - } - - return mrb_obj_value(s1); -} - /* 15.2.10.5.24 */ /* 15.2.10.5.28 */ /* @@ -1450,13 +1670,13 @@ mrb_str_init(mrb_state *mrb, mrb_value self) * * 'cat and dog'.to_sym #=> :"cat and dog" */ -mrb_value +MRB_API mrb_value mrb_str_intern(mrb_state *mrb, mrb_value self) { return mrb_symbol_value(mrb_intern_str(mrb, self)); } /* ---------------------------------- */ -mrb_value +MRB_API mrb_value mrb_obj_as_string(mrb_state *mrb, mrb_value obj) { mrb_value str; @@ -1470,7 +1690,7 @@ mrb_obj_as_string(mrb_state *mrb, mrb_value obj) return str; } -mrb_value +MRB_API mrb_value mrb_ptr_to_str(mrb_state *mrb, void *p) { struct RString *p_str; @@ -1500,119 +1720,93 @@ mrb_ptr_to_str(mrb_state *mrb, void *p) return mrb_obj_value(p_str); } -mrb_value +MRB_API mrb_value mrb_string_type(mrb_state *mrb, mrb_value str) { return mrb_convert_type(mrb, str, MRB_TT_STRING, "String", "to_str"); } -mrb_value +MRB_API mrb_value mrb_check_string_type(mrb_state *mrb, mrb_value str) { return mrb_check_convert_type(mrb, str, MRB_TT_STRING, "String", "to_str"); } -/* ---------------------------------- */ -/* 15.2.10.5.29 */ +/* 15.2.10.5.30 */ /* * call-seq: - * str.reverse => new_str - * - * Returns a new string with the characters from <i>str</i> in reverse order. + * str.reverse! => str * - * "stressed".reverse #=> "desserts" + * Reverses <i>str</i> in place. */ static mrb_value -mrb_str_reverse(mrb_state *mrb, mrb_value str) +mrb_str_reverse_bang(mrb_state *mrb, mrb_value str) { - struct RString *s2; - char *s, *e, *p; +#ifdef MRB_UTF8_STRING + mrb_int utf8_len = RSTRING_CHAR_LEN(str); + mrb_int len = RSTRING_LEN(str); + + if (utf8_len == len) goto bytes; + if (utf8_len > 1) { + char *buf; + char *p, *e, *r; - if (RSTRING_LEN(str) <= 1) return mrb_str_dup(mrb, str); + mrb_str_modify(mrb, mrb_str_ptr(str)); + len = RSTRING_LEN(str); + buf = mrb_malloc(mrb, (size_t)len); + p = buf; + e = buf + len; - s2 = str_new(mrb, 0, RSTRING_LEN(str)); - str_with_class(mrb, s2, str); - s = RSTRING_PTR(str); e = RSTRING_END(str) - 1; - p = RSTR_PTR(s2); + memcpy(buf, RSTRING_PTR(str), len); + r = RSTRING_PTR(str) + len; - while (e >= s) { - *p++ = *e--; + while (p<e) { + mrb_int clen = utf8len(p, e); + r -= clen; + memcpy(r, p, clen); + p += clen; + } + mrb_free(mrb, buf); } - return mrb_obj_value(s2); -} + return str; -/* 15.2.10.5.30 */ -/* - * call-seq: - * str.reverse! => str - * - * Reverses <i>str</i> in place. - */ -static mrb_value -mrb_str_reverse_bang(mrb_state *mrb, mrb_value str) -{ - struct RString *s = mrb_str_ptr(str); - char *p, *e; - char c; + bytes: +#endif + { + struct RString *s = mrb_str_ptr(str); + char *p, *e; + char c; - mrb_str_modify(mrb, s); - if (RSTR_LEN(s) > 1) { - p = RSTR_PTR(s); - e = p + RSTR_LEN(s) - 1; - while (p < e) { + mrb_str_modify(mrb, s); + if (RSTR_LEN(s) > 1) { + p = RSTR_PTR(s); + e = p + RSTR_LEN(s) - 1; + while (p < e) { c = *p; *p++ = *e; *e-- = c; + } } + return str; } - return str; } +/* ---------------------------------- */ +/* 15.2.10.5.29 */ /* * call-seq: - * str.rindex(substring [, fixnum]) => fixnum or nil - * str.rindex(fixnum [, fixnum]) => fixnum or nil - * str.rindex(regexp [, fixnum]) => fixnum or nil + * str.reverse => new_str * - * Returns the index of the last occurrence of the given <i>substring</i>, - * character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns - * <code>nil</code> if not found. If the second parameter is present, it - * specifies the position in the string to end the search---characters beyond - * this point will not be considered. + * Returns a new string with the characters from <i>str</i> in reverse order. * - * "hello".rindex('e') #=> 1 - * "hello".rindex('l') #=> 3 - * "hello".rindex('a') #=> nil - * "hello".rindex(101) #=> 1 - * "hello".rindex(/[aeiou]/, -2) #=> 1 + * "stressed".reverse #=> "desserts" */ -static mrb_int -mrb_str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) +static mrb_value +mrb_str_reverse(mrb_state *mrb, mrb_value str) { - char *s, *sbeg, *t; - struct RString *ps = mrb_str_ptr(str); - mrb_int len = RSTRING_LEN(sub); - - /* substring longer than string */ - if (RSTR_LEN(ps) < len) return -1; - if (RSTR_LEN(ps) - pos < len) { - pos = RSTR_LEN(ps) - len; - } - sbeg = RSTR_PTR(ps); - s = RSTR_PTR(ps) + pos; - t = RSTRING_PTR(sub); - if (len) { - while (sbeg <= s) { - if (memcmp(s, t, len) == 0) { - return s - RSTR_PTR(ps); - } - s--; - } - return -1; - } - else { - return pos; - } + mrb_value str2 = mrb_str_dup(mrb, str); + mrb_str_reverse_bang(mrb, str2); + return str2; } /* 15.2.10.5.31 */ @@ -1635,13 +1829,13 @@ mrb_str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) * "hello".rindex(/[aeiou]/, -2) #=> 1 */ static mrb_value -mrb_str_rindex_m(mrb_state *mrb, mrb_value str) +mrb_str_rindex(mrb_state *mrb, mrb_value str) { mrb_value *argv; mrb_int argc; mrb_value sub; mrb_value vpos; - mrb_int pos, len = RSTRING_LEN(str); + mrb_int pos, len = RSTRING_CHAR_LEN(str); mrb_get_args(mrb, "*", &argv, &argc); if (argc == 2) { @@ -1651,7 +1845,7 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) if (pos < 0) { pos += len; if (pos < 0) { - regexp_check(mrb, sub); + mrb_regexp_check(mrb, sub); return mrb_nil_value(); } } @@ -1664,20 +1858,11 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) else sub = mrb_nil_value(); } - regexp_check(mrb, sub); + pos = chars2bytes(str, 0, pos); + len = chars2bytes(str, pos, len); + mrb_regexp_check(mrb, sub); switch (mrb_type(sub)) { - case MRB_TT_FIXNUM: { - int c = mrb_fixnum(sub); - mrb_int len = RSTRING_LEN(str); - unsigned char *p = (unsigned char*)RSTRING_PTR(str); - - for (pos=len-1;pos>=0;pos--) { - if (p[pos] == c) return mrb_fixnum_value(pos); - } - return mrb_nil_value(); - } - default: { mrb_value tmp; @@ -1689,8 +1874,11 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) } /* fall through */ case MRB_TT_STRING: - pos = mrb_str_rindex(mrb, str, sub, pos); - if (pos >= 0) return mrb_fixnum_value(pos); + pos = str_rindex(mrb, str, sub, pos); + if (pos >= 0) { + pos = bytes2chars(RSTRING_PTR(str), pos); + return mrb_fixnum_value(pos); + } break; } /* end of switch (TYPE(sub)) */ @@ -1701,7 +1889,7 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) /* * call-seq: - * str.split(pattern=$;, [limit]) => anArray + * str.split(pattern="\n", [limit]) => anArray * * Divides <i>str</i> into substrings based on a delimiter, returning an array * of these substrings. @@ -1717,7 +1905,7 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) * * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is - * split on whitespace as if ` ' were specified. + * split on whitespace as if ' ' were specified. * * If the <i>limit</i> parameter is omitted, trailing null fields are * suppressed. If <i>limit</i> is a positive number, at most that number of @@ -1729,10 +1917,8 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) * " now's the time".split #=> ["now's", "the", "time"] * " now's the time".split(' ') #=> ["now's", "the", "time"] * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"] - * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"] * "hello".split(//) #=> ["h", "e", "l", "l", "o"] * "hello".split(//, 3) #=> ["h", "e", "llo"] - * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"] * * "mellow yellow".split("ello") #=> ["m", "w y", "w"] * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"] @@ -1774,83 +1960,73 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) } } else { - noregexp(mrb, str); + mrb_noregexp(mrb, str); } } result = mrb_ary_new(mrb); beg = 0; if (split_type == awk) { - char *ptr = RSTRING_PTR(str); - char *eptr = RSTRING_END(str); - char *bptr = ptr; mrb_bool skip = TRUE; + mrb_int idx = 0; + mrb_int str_len = RSTRING_LEN(str); unsigned int c; + int ai = mrb_gc_arena_save(mrb); - end = beg; - while (ptr < eptr) { - int ai = mrb_gc_arena_save(mrb); - c = (unsigned char)*ptr++; + idx = end = beg; + while (idx < str_len) { + c = (unsigned char)RSTRING_PTR(str)[idx++]; if (skip) { if (ISSPACE(c)) { - beg = ptr - bptr; + beg = idx; } else { - end = ptr - bptr; + end = idx; skip = FALSE; if (lim_p && lim <= i) break; } } else if (ISSPACE(c)) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, beg, end-beg)); + mrb_ary_push(mrb, result, byte_subseq(mrb, str, beg, end-beg)); mrb_gc_arena_restore(mrb, ai); skip = TRUE; - beg = ptr - bptr; + beg = idx; if (lim_p) ++i; } else { - end = ptr - bptr; + end = idx; } } } else if (split_type == string) { - char *ptr = RSTRING_PTR(str); // s->as.ary - char *temp = ptr; - char *eptr = RSTRING_END(str); - mrb_int slen = RSTRING_LEN(spat); - - if (slen == 0) { - int ai = mrb_gc_arena_save(mrb); - while (ptr < eptr) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, ptr-temp, 1)); - mrb_gc_arena_restore(mrb, ai); - ptr++; - if (lim_p && lim <= ++i) break; + mrb_int str_len = RSTRING_LEN(str); + mrb_int pat_len = RSTRING_LEN(spat); + mrb_int idx = 0; + int ai = mrb_gc_arena_save(mrb); + + while (idx < str_len) { + if (pat_len > 0) { + end = mrb_memsearch(RSTRING_PTR(spat), pat_len, RSTRING_PTR(str)+idx, str_len - idx); + if (end < 0) break; + } else { + end = chars2bytes(str, idx, 1); } + mrb_ary_push(mrb, result, byte_subseq(mrb, str, idx, end)); + mrb_gc_arena_restore(mrb, ai); + idx += end + pat_len; + if (lim_p && lim <= ++i) break; } - else { - char *sptr = RSTRING_PTR(spat); - int ai = mrb_gc_arena_save(mrb); - - while (ptr < eptr && - (end = mrb_memsearch(sptr, slen, ptr, eptr - ptr)) >= 0) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, ptr - temp, end)); - mrb_gc_arena_restore(mrb, ai); - ptr += end + slen; - if (lim_p && lim <= ++i) break; - } - } - beg = ptr - temp; + beg = idx; } else { - noregexp(mrb, str); + mrb_noregexp(mrb, str); } if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) { if (RSTRING_LEN(str) == beg) { tmp = mrb_str_new_empty(mrb, str); } else { - tmp = mrb_str_subseq(mrb, str, beg, RSTRING_LEN(str)-beg); + tmp = byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg); } mrb_ary_push(mrb, result, tmp); } @@ -1864,44 +2040,39 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) return result; } -mrb_value -mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) +MRB_API mrb_value +mrb_str_len_to_inum(mrb_state *mrb, const char *str, size_t len, int base, int badcheck) { - const char *p; + const char *p = str; + const char *pend = str + len; char sign = 1; - int c, uscore; - unsigned long n = 0; + int c; + uint64_t n = 0; mrb_int val; -#undef ISDIGIT -#define ISDIGIT(c) ('0' <= (c) && (c) <= '9') #define conv_digit(c) \ - (!ISASCII(c) ? -1 : \ - isdigit(c) ? ((c) - '0') : \ - islower(c) ? ((c) - 'a' + 10) : \ - isupper(c) ? ((c) - 'A' + 10) : \ + (ISDIGIT(c) ? ((c) - '0') : \ + ISLOWER(c) ? ((c) - 'a' + 10) : \ + ISUPPER(c) ? ((c) - 'A' + 10) : \ -1) - if (!str) { + if (!p) { if (badcheck) goto bad; return mrb_fixnum_value(0); } - while (ISSPACE(*str)) str++; + while (p<pend && ISSPACE(*p)) + p++; - if (str[0] == '+') { - str++; + if (p[0] == '+') { + p++; } - else if (str[0] == '-') { - str++; + else if (p[0] == '-') { + p++; sign = 0; } - if (str[0] == '+' || str[0] == '-') { - if (badcheck) goto bad; - return mrb_fixnum_value(0); - } if (base <= 0) { - if (str[0] == '0') { - switch (str[1]) { + if (p[0] == '0') { + switch (p[1]) { case 'x': case 'X': base = 16; break; @@ -1916,6 +2087,7 @@ mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) break; default: base = 8; + break; } } else if (base < -1) { @@ -1927,27 +2099,27 @@ mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) } switch (base) { case 2: - if (str[0] == '0' && (str[1] == 'b'||str[1] == 'B')) { - str += 2; + if (p[0] == '0' && (p[1] == 'b'||p[1] == 'B')) { + p += 2; } break; case 3: break; case 8: - if (str[0] == '0' && (str[1] == 'o'||str[1] == 'O')) { - str += 2; + if (p[0] == '0' && (p[1] == 'o'||p[1] == 'O')) { + p += 2; } case 4: case 5: case 6: case 7: break; case 10: - if (str[0] == '0' && (str[1] == 'd'||str[1] == 'D')) { - str += 2; + if (p[0] == '0' && (p[1] == 'd'||p[1] == 'D')) { + p += 2; } case 9: case 11: case 12: case 13: case 14: case 15: break; case 16: - if (str[0] == '0' && (str[1] == 'x'||str[1] == 'X')) { - str += 2; + if (p[0] == '0' && (p[1] == 'x'||p[1] == 'X')) { + p += 2; } break; default: @@ -1956,96 +2128,109 @@ mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) } break; } /* end of switch (base) { */ - if (*str == '0') { /* squeeze preceeding 0s */ - uscore = 0; - while ((c = *++str) == '0' || c == '_') { + if (p>=pend) { + if (badcheck) goto bad; + return mrb_fixnum_value(0); + } + if (*p == '0') { /* squeeze preceding 0s */ + p++; + while (p<pend) { + c = *p++; if (c == '_') { - if (++uscore >= 2) + if (p<pend && *p == '_') { + if (badcheck) goto bad; break; + } + continue; + } + if (c != '0') { + p--; + break; } - else - uscore = 0; } - if (!(c = *str) || ISSPACE(c)) --str; } - c = *str; - c = conv_digit(c); - if (c < 0 || c >= base) { + if (p == pend) { if (badcheck) goto bad; return mrb_fixnum_value(0); } - - uscore = 0; - for (p=str;*p;p++) { + for ( ;p<pend;p++) { if (*p == '_') { - if (uscore == 0) { - uscore++; + p++; + if (p==pend) { + if (badcheck) goto bad; continue; } - if (badcheck) goto bad; - break; + if (*p == '_') { + if (badcheck) goto bad; + break; + } + } + if (badcheck && *p == '\0') { + goto nullbyte; } - uscore = 0; c = conv_digit(*p); if (c < 0 || c >= base) { - if (badcheck) goto bad; break; } n *= base; n += c; + if (n > (uint64_t)MRB_INT_MAX + (sign ? 0 : 1)) { + mrb_raisef(mrb, E_ARGUMENT_ERROR, "string (%S) too big for integer", + mrb_str_new(mrb, str, pend-str)); + } } - if (n > MRB_INT_MAX) { - mrb_raisef(mrb, E_ARGUMENT_ERROR, "string (%S) too big for integer", mrb_str_new_cstr(mrb, str)); - } - val = n; + val = (mrb_int)n; if (badcheck) { if (p == str) goto bad; /* no number */ - while (*p && ISSPACE(*p)) p++; - if (*p) goto bad; /* trailing garbage */ + while (p<pend && ISSPACE(*p)) p++; + if (p<pend) goto bad; /* trailing garbage */ } return mrb_fixnum_value(sign ? val : -val); -bad: - mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%S)", mrb_str_new_cstr(mrb, str)); + nullbyte: + mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); + /* not reached */ + bad: + mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%S)", + mrb_inspect(mrb, mrb_str_new(mrb, str, pend-str))); /* not reached */ return mrb_fixnum_value(0); } -char * +MRB_API mrb_value +mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck) +{ + return mrb_str_len_to_inum(mrb, str, strlen(str), base, badcheck); +} + +MRB_API const char* mrb_string_value_cstr(mrb_state *mrb, mrb_value *ptr) { - struct RString *ps = mrb_str_ptr(*ptr); + mrb_value str = mrb_str_to_str(mrb, *ptr); + struct RString *ps = mrb_str_ptr(str); mrb_int len = mrb_str_strlen(mrb, ps); char *p = RSTR_PTR(ps); if (!p || p[len] != '\0') { + if (RSTR_FROZEN_P(ps)) { + *ptr = str = mrb_str_dup(mrb, str); + ps = mrb_str_ptr(str); + } mrb_str_modify(mrb, ps); return RSTR_PTR(ps); } return p; } -mrb_value +MRB_API mrb_value mrb_str_to_inum(mrb_state *mrb, mrb_value str, mrb_int base, mrb_bool badcheck) { - char *s; + const char *s; mrb_int len; - str = mrb_str_to_str(mrb, str); - if (badcheck) { - s = mrb_string_value_cstr(mrb, &str); - } - else { - s = RSTRING_PTR(str); - } - if (s) { - len = RSTRING_LEN(str); - if (s[len]) { /* no sentinel somehow */ - struct RString *temp_str = str_new(mrb, s, len); - s = RSTR_PTR(temp_str); - } - } - return mrb_cstr_to_inum(mrb, s, base, badcheck); + s = mrb_string_value_ptr(mrb, str); + len = RSTRING_LEN(str); + return mrb_str_len_to_inum(mrb, s, len, base, badcheck); } /* 15.2.10.5.38 */ @@ -2081,16 +2266,14 @@ mrb_str_to_i(mrb_state *mrb, mrb_value self) return mrb_str_to_inum(mrb, self, base, FALSE); } -double +MRB_API double mrb_cstr_to_dbl(mrb_state *mrb, const char * p, mrb_bool badcheck) { char *end; + char buf[DBL_DIG * 4 + 10]; double d; enum {max_width = 20}; -#define OutOfRange() (((w = end - p) > max_width) ? \ - (w = max_width, ellipsis = "...") : \ - (w = (int)(end - p), ellipsis = "")) if (!p) return 0.0; while (ISSPACE(*p)) p++; @@ -2108,7 +2291,6 @@ bad: return d; } if (*end) { - char buf[DBL_DIG * 4 + 10]; char *n = buf; char *e = buf + sizeof(buf) - 1; char prev = 0; @@ -2147,7 +2329,7 @@ bad: return d; } -double +MRB_API double mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck) { char *s; @@ -2242,7 +2424,7 @@ mrb_str_upcase_bang(mrb_state *mrb, mrb_value str) * * Returns a copy of <i>str</i> with all lowercase letters replaced with their * uppercase counterparts. The operation is locale insensitive---only - * characters ``a'' to ``z'' are affected. + * characters 'a' to 'z' are affected. * * "hEllO".upcase #=> "HELLO" */ @@ -2377,30 +2559,30 @@ mrb_str_dump(mrb_state *mrb, mrb_value str) return mrb_obj_value(result); } -mrb_value +MRB_API mrb_value mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len) { str_buf_cat(mrb, mrb_str_ptr(str), ptr, len); return str; } -mrb_value +MRB_API mrb_value mrb_str_cat_cstr(mrb_state *mrb, mrb_value str, const char *ptr) { return mrb_str_cat(mrb, str, ptr, strlen(ptr)); } -mrb_value +MRB_API mrb_value mrb_str_cat_str(mrb_state *mrb, mrb_value str, mrb_value str2) { return mrb_str_cat(mrb, str, RSTRING_PTR(str2), RSTRING_LEN(str2)); } -mrb_value -mrb_str_append(mrb_state *mrb, mrb_value str, mrb_value str2) +MRB_API mrb_value +mrb_str_append(mrb_state *mrb, mrb_value str1, mrb_value str2) { str2 = mrb_str_to_str(mrb, str2); - return mrb_str_cat_str(mrb, str, str2); + return mrb_str_cat_str(mrb, str1, str2); } #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ @@ -2426,7 +2608,21 @@ mrb_str_inspect(mrb_state *mrb, mrb_value str) p = RSTRING_PTR(str); pend = RSTRING_END(str); for (;p < pend; p++) { unsigned char c, cc; +#ifdef MRB_UTF8_STRING + mrb_int clen; + clen = utf8len(p, pend); + if (clen > 1) { + mrb_int i; + + for (i=0; i<clen; i++) { + buf[i] = p[i]; + } + mrb_str_cat(mrb, result, buf, clen); + p += clen-1; + continue; + } +#endif c = *p; if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p, pend))) { buf[0] = '\\'; buf[1] = c; @@ -2500,7 +2696,7 @@ mrb_init_string(mrb_state *mrb) mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << 5), "pointer size too big for embedded string"); - s = mrb->string_class = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */ + mrb->string_class = s = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */ MRB_SET_INSTANCE_TT(s, MRB_TT_STRING); mrb_define_method(mrb, s, "bytesize", mrb_str_bytesize, MRB_ARGS_NONE()); @@ -2511,7 +2707,7 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "*", mrb_str_times, MRB_ARGS_REQ(1)); /* 15.2.10.5.5 */ mrb_define_method(mrb, s, "[]", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.6 */ mrb_define_method(mrb, s, "capitalize", mrb_str_capitalize, MRB_ARGS_NONE()); /* 15.2.10.5.7 */ - mrb_define_method(mrb, s, "capitalize!", mrb_str_capitalize_bang, MRB_ARGS_REQ(1)); /* 15.2.10.5.8 */ + mrb_define_method(mrb, s, "capitalize!", mrb_str_capitalize_bang, MRB_ARGS_NONE()); /* 15.2.10.5.8 */ mrb_define_method(mrb, s, "chomp", mrb_str_chomp, MRB_ARGS_ANY()); /* 15.2.10.5.9 */ mrb_define_method(mrb, s, "chomp!", mrb_str_chomp_bang, MRB_ARGS_ANY()); /* 15.2.10.5.10 */ mrb_define_method(mrb, s, "chop", mrb_str_chop, MRB_ARGS_REQ(1)); /* 15.2.10.5.11 */ @@ -2521,9 +2717,9 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "empty?", mrb_str_empty_p, MRB_ARGS_NONE()); /* 15.2.10.5.16 */ mrb_define_method(mrb, s, "eql?", mrb_str_eql, MRB_ARGS_REQ(1)); /* 15.2.10.5.17 */ - mrb_define_method(mrb, s, "hash", mrb_str_hash_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.20 */ + mrb_define_method(mrb, s, "hash", mrb_str_hash_m, MRB_ARGS_NONE()); /* 15.2.10.5.20 */ mrb_define_method(mrb, s, "include?", mrb_str_include, MRB_ARGS_REQ(1)); /* 15.2.10.5.21 */ - mrb_define_method(mrb, s, "index", mrb_str_index_m, MRB_ARGS_ANY()); /* 15.2.10.5.22 */ + mrb_define_method(mrb, s, "index", mrb_str_index, MRB_ARGS_ANY()); /* 15.2.10.5.22 */ mrb_define_method(mrb, s, "initialize", mrb_str_init, MRB_ARGS_REQ(1)); /* 15.2.10.5.23 */ mrb_define_method(mrb, s, "initialize_copy", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.24 */ mrb_define_method(mrb, s, "intern", mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.25 */ @@ -2531,7 +2727,7 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "replace", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.28 */ mrb_define_method(mrb, s, "reverse", mrb_str_reverse, MRB_ARGS_NONE()); /* 15.2.10.5.29 */ mrb_define_method(mrb, s, "reverse!", mrb_str_reverse_bang, MRB_ARGS_NONE()); /* 15.2.10.5.30 */ - mrb_define_method(mrb, s, "rindex", mrb_str_rindex_m, MRB_ARGS_ANY()); /* 15.2.10.5.31 */ + mrb_define_method(mrb, s, "rindex", mrb_str_rindex, MRB_ARGS_ANY()); /* 15.2.10.5.31 */ mrb_define_method(mrb, s, "size", mrb_str_size, MRB_ARGS_NONE()); /* 15.2.10.5.33 */ mrb_define_method(mrb, s, "slice", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.34 */ mrb_define_method(mrb, s, "split", mrb_str_split_m, MRB_ARGS_ANY()); /* 15.2.10.5.35 */ @@ -2541,8 +2737,10 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "to_s", mrb_str_to_s, MRB_ARGS_NONE()); /* 15.2.10.5.40 */ mrb_define_method(mrb, s, "to_str", mrb_str_to_s, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "to_sym", mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.41 */ - mrb_define_method(mrb, s, "upcase", mrb_str_upcase, MRB_ARGS_REQ(1)); /* 15.2.10.5.42 */ - mrb_define_method(mrb, s, "upcase!", mrb_str_upcase_bang, MRB_ARGS_REQ(1)); /* 15.2.10.5.43 */ + mrb_define_method(mrb, s, "upcase", mrb_str_upcase, MRB_ARGS_NONE()); /* 15.2.10.5.42 */ + mrb_define_method(mrb, s, "upcase!", mrb_str_upcase_bang, MRB_ARGS_NONE()); /* 15.2.10.5.43 */ mrb_define_method(mrb, s, "inspect", mrb_str_inspect, MRB_ARGS_NONE()); /* 15.2.10.5.46(x) */ mrb_define_method(mrb, s, "bytes", mrb_str_bytes, MRB_ARGS_NONE()); + + mrb_define_method(mrb, s, "freeze", mrb_str_freeze, MRB_ARGS_NONE()); } |
