summaryrefslogtreecommitdiffhomepage
path: root/src/string.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/string.c')
-rw-r--r--src/string.c290
1 files changed, 175 insertions, 115 deletions
diff --git a/src/string.c b/src/string.c
index 2b9a5cfde..8d1d53521 100644
--- a/src/string.c
+++ b/src/string.c
@@ -24,7 +24,7 @@
typedef struct mrb_shared_string {
int refcnt;
- mrb_int len;
+ mrb_int capa;
char *ptr;
} mrb_shared_string;
@@ -32,7 +32,7 @@ const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
#define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class))
-static void
+static struct RString*
str_init_normal_capa(mrb_state *mrb, struct RString *s,
const char *p, size_t len, size_t capa)
{
@@ -42,33 +42,37 @@ str_init_normal_capa(mrb_state *mrb, struct RString *s,
s->as.heap.ptr = dst;
s->as.heap.len = (mrb_int)len;
s->as.heap.aux.capa = (mrb_int)capa;
+ RSTR_UNSET_TYPE_FLAG(s);
+ return s;
}
-static void
+static struct RString*
str_init_normal(mrb_state *mrb, struct RString *s, const char *p, size_t len)
{
- str_init_normal_capa(mrb, s, p, len, len);
+ return str_init_normal_capa(mrb, s, p, len, len);
}
-static void
+static struct RString*
str_init_embed(struct RString *s, const char *p, size_t len)
{
- if (p) memcpy(s->as.ary, p, len);
- s->as.ary[len] = '\0';
- RSTR_SET_EMBED_FLAG(s);
+ if (p) memcpy(RSTR_EMBED_PTR(s), p, len);
+ RSTR_EMBED_PTR(s)[len] = '\0';
+ RSTR_SET_TYPE_FLAG(s, EMBED);
RSTR_SET_EMBED_LEN(s, len);
+ return s;
}
-static void
+static struct RString*
str_init_nofree(struct RString *s, const char *p, size_t len)
{
s->as.heap.ptr = (char *)p;
s->as.heap.len = (mrb_int)len;
s->as.heap.aux.capa = 0; /* nofree */
- RSTR_SET_NOFREE_FLAG(s);
+ RSTR_SET_TYPE_FLAG(s, NOFREE);
+ return s;
}
-static void
+static struct RString*
str_init_shared(mrb_state *mrb, const struct RString *orig, struct RString *s, mrb_shared_string *shared)
{
if (shared) {
@@ -76,64 +80,63 @@ str_init_shared(mrb_state *mrb, const struct RString *orig, struct RString *s, m
}
else {
shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string));
- shared->refcnt = 2;
+ shared->refcnt = 1;
shared->ptr = orig->as.heap.ptr;
- shared->len = orig->as.heap.len;
+ shared->capa = orig->as.heap.aux.capa;
}
s->as.heap.ptr = orig->as.heap.ptr;
s->as.heap.len = orig->as.heap.len;
s->as.heap.aux.shared = shared;
- RSTR_SET_SHARED_FLAG(s);
+ RSTR_SET_TYPE_FLAG(s, SHARED);
+ return s;
}
-static void
+static struct RString*
str_init_fshared(const struct RString *orig, struct RString *s, struct RString *fshared)
{
s->as.heap.ptr = orig->as.heap.ptr;
s->as.heap.len = orig->as.heap.len;
s->as.heap.aux.fshared = fshared;
- RSTR_SET_FSHARED_FLAG(s);
+ RSTR_SET_TYPE_FLAG(s, FSHARED);
+ return s;
}
-static void
-str_init(mrb_state *mrb, struct RString *s, const char *p, size_t len)
+static struct RString*
+str_init_modifiable(mrb_state *mrb, struct RString *s, const char *p, size_t len)
{
if (RSTR_EMBEDDABLE_P(len)) {
- str_init_embed(s, p, len);
+ return str_init_embed(s, p, len);
}
else {
- str_init_normal(mrb, s, p, len);
+ return str_init_normal(mrb, s, p, len);
}
}
static struct RString*
str_new_static(mrb_state *mrb, const char *p, size_t len)
{
- struct RString *s;
-
+ if (RSTR_EMBEDDABLE_P(len)) {
+ return str_init_embed(mrb_obj_alloc_string(mrb), p, len);
+ }
if (len >= MRB_INT_MAX) {
mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
}
- s = mrb_obj_alloc_string(mrb);
- str_init_nofree(s, p, len);
-
- return s;
+ return str_init_nofree(mrb_obj_alloc_string(mrb), p, len);
}
static struct RString*
str_new(mrb_state *mrb, const char *p, size_t len)
{
- struct RString *s;
-
+ if (RSTR_EMBEDDABLE_P(len)) {
+ return str_init_embed(mrb_obj_alloc_string(mrb), p, len);
+ }
if (len >= MRB_INT_MAX) {
mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
}
if (p && mrb_ro_data_p(p)) {
- return str_new_static(mrb, p, len);
+ return str_init_nofree(mrb_obj_alloc_string(mrb), p, len);
}
- s = mrb_obj_alloc_string(mrb);
- str_init(mrb, s, p, len);
- return s;
+ return str_init_normal(mrb, mrb_obj_alloc_string(mrb), p, len);
}
static inline void
@@ -156,12 +159,17 @@ mrb_str_new_capa(mrb_state *mrb, size_t capa)
{
struct RString *s;
- s = mrb_obj_alloc_string(mrb);
-
- if (capa >= MRB_INT_MAX) {
+ if (RSTR_EMBEDDABLE_P(capa)) {
+ s = str_init_embed(mrb_obj_alloc_string(mrb), NULL, 0);
+ }
+ else if (capa >= MRB_INT_MAX) {
mrb_raise(mrb, E_ARGUMENT_ERROR, "string capacity size too big");
+ /* not reached */
+ s = NULL;
+ }
+ else {
+ s = str_init_normal_capa(mrb, mrb_obj_alloc_string(mrb), NULL, 0, capa);
}
- str_init_normal_capa(mrb, s, NULL, 0, capa);
return mrb_obj_value(s);
}
@@ -187,8 +195,7 @@ resize_capa(mrb_state *mrb, struct RString *s, size_t capacity)
#endif
if (RSTR_EMBED_P(s)) {
if (!RSTR_EMBEDDABLE_P(capacity)) {
- str_init_normal_capa(mrb, s, s->as.ary, RSTR_EMBED_LEN(s), capacity);
- RSTR_UNSET_EMBED_FLAG(s);
+ str_init_normal_capa(mrb, s, RSTR_EMBED_PTR(s), RSTR_EMBED_LEN(s), capacity);
}
}
else {
@@ -337,19 +344,69 @@ chars2bytes(mrb_value s, mrb_int off, mrb_int idx)
/* map byte offset to character index */
static mrb_int
-bytes2chars(char *p, mrb_int bi)
+bytes2chars(char *p, mrb_int len, mrb_int bi)
{
- mrb_int i, b, n;
+ const char *e = p + (size_t)len;
+ const char *pivot = p + bi;
+ mrb_int i;
- for (b=i=0; b<bi; i++) {
- n = utf8len_codepage[(unsigned char)*p];
- b += n;
- p += n;
+ for (i = 0; p < pivot; i ++) {
+ p += utf8len(p, e);
}
- if (b != bi) return -1;
+ if (p != pivot) return -1;
return i;
}
+static const char *
+char_adjust(const char *beg, const char *end, const char *ptr)
+{
+ if ((ptr > beg || ptr < end) && (*ptr & 0xc0) == 0x80) {
+ const int utf8_adjust_max = 3;
+ const char *p;
+
+ if (ptr - beg > utf8_adjust_max) {
+ beg = ptr - utf8_adjust_max;
+ }
+
+ p = ptr;
+ while (p > beg) {
+ p --;
+ if ((*p & 0xc0) != 0x80) {
+ int clen = utf8len(p, end);
+ if (clen > ptr - p) return p;
+ break;
+ }
+ }
+ }
+
+ return ptr;
+}
+
+static const char *
+char_backtrack(const char *ptr, const char *end)
+{
+ if (ptr < end) {
+ const int utf8_bytelen_max = 4;
+ const char *p;
+
+ if (end - ptr > utf8_bytelen_max) {
+ ptr = end - utf8_bytelen_max;
+ }
+
+ p = end;
+ while (p > ptr) {
+ p --;
+ if ((*p & 0xc0) != 0x80) {
+ int clen = utf8len_codepage[(unsigned char)*p];
+ if (clen == end - p) { return p; }
+ break;
+ }
+ }
+ }
+
+ return end - 1;
+}
+
static mrb_int
str_index_str_by_char_search(mrb_state *mrb, const char *p, const char *pend, const char *s, const mrb_int slen, mrb_int off)
{
@@ -412,7 +469,9 @@ str_index_str_by_char(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
#else
#define RSTRING_CHAR_LEN(s) RSTRING_LEN(s)
#define chars2bytes(p, off, ci) (ci)
-#define bytes2chars(p, bi) (bi)
+#define bytes2chars(p, end, bi) (bi)
+#define char_adjust(beg, end, ptr) (ptr)
+#define char_backtrack(ptr, end) ((end) - 1)
#define BYTES_ALIGN_CHECK(pos)
#define str_index_str_by_char(mrb, str, sub, pos) str_index_str(mrb, str, sub, pos)
#endif
@@ -464,38 +523,58 @@ mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n)
static void
str_make_shared(mrb_state *mrb, struct RString *orig, struct RString *s)
{
- mrb_int len = RSTR_LEN(orig);
+ size_t len = (size_t)orig->as.heap.len;
mrb_assert(!RSTR_EMBED_P(orig));
if (RSTR_NOFREE_P(orig)) {
str_init_nofree(s, orig->as.heap.ptr, len);
- RSTR_UNSET_EMBED_FLAG(s);
}
else if (RSTR_SHARED_P(orig)) {
str_init_shared(mrb, orig, s, orig->as.heap.aux.shared);
- RSTR_UNSET_EMBED_FLAG(s);
}
else if (RSTR_FSHARED_P(orig)) {
str_init_fshared(orig, s, orig->as.heap.aux.fshared);
- RSTR_UNSET_EMBED_FLAG(s);
}
else if (MRB_FROZEN_P(orig) && !RSTR_POOL_P(orig)) {
str_init_fshared(orig, s, orig);
- RSTR_UNSET_EMBED_FLAG(s);
}
else {
if (orig->as.heap.aux.capa > orig->as.heap.len) {
orig->as.heap.ptr = (char *)mrb_realloc(mrb, orig->as.heap.ptr, len+1);
+ orig->as.heap.aux.capa = len;
}
str_init_shared(mrb, orig, s, NULL);
- RSTR_UNSET_EMBED_FLAG(s);
- orig->as.heap.aux.shared = s->as.heap.aux.shared;
- RSTR_SET_SHARED_FLAG(orig);
- RSTR_UNSET_EMBED_FLAG(s);
+ str_init_shared(mrb, orig, orig, s->as.heap.aux.shared);
}
}
mrb_value
+mrb_str_pool(mrb_state *mrb, mrb_value str)
+{
+ struct RString *s = (struct RString *)mrb_malloc(mrb, sizeof(struct RString));
+ struct RString *orig = mrb_str_ptr(str);
+ const char *p = RSTR_PTR(orig);
+ size_t len = (size_t)RSTR_LEN(orig);
+
+ s->tt = MRB_TT_STRING;
+ s->c = mrb->string_class;
+ s->flags = 0;
+
+ if (RSTR_EMBEDDABLE_P(len)) {
+ str_init_embed(s, p, len);
+ }
+ else if (RSTR_NOFREE_P(orig)) {
+ str_init_nofree(s, p, len);
+ }
+ else {
+ str_init_normal(mrb, s, p, len);
+ }
+ RSTR_SET_POOL_FLAG(s);
+ MRB_SET_FROZEN_FLAG(s);
+ return mrb_obj_value(s);
+}
+
+mrb_value
mrb_str_byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
struct RString *orig, *s;
@@ -601,7 +680,6 @@ str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
RSTR_COPY_ASCII_FLAG(s1, s2);
if (RSTR_SHARED_P(s1)) {
str_decref(mrb, s1->as.heap.aux.shared);
- RSTR_UNSET_SHARED_FLAG(s1);
}
else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1) && !RSTR_FSHARED_P(s1)
&& s1->as.heap.ptr) {
@@ -609,8 +687,6 @@ str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
}
len = (size_t)RSTR_LEN(s2);
- RSTR_UNSET_FSHARED_FLAG(s1);
- RSTR_UNSET_NOFREE_FLAG(s1);
if (RSTR_EMBEDDABLE_P(len)) {
str_init_embed(s1, RSTR_PTR(s2), len);
}
@@ -624,7 +700,7 @@ str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
static mrb_int
str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
{
- char *s, *sbeg, *t;
+ const char *s, *sbeg, *t;
struct RString *ps = mrb_str_ptr(str);
mrb_int len = RSTRING_LEN(sub);
@@ -637,11 +713,12 @@ str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
s = RSTR_PTR(ps) + pos;
t = RSTRING_PTR(sub);
if (len) {
+ s = char_adjust(sbeg, sbeg + RSTR_LEN(ps), s);
while (sbeg <= s) {
if (memcmp(s, t, len) == 0) {
return (mrb_int)(s - RSTR_PTR(ps));
}
- s--;
+ s = char_backtrack(sbeg, s);
}
return -1;
}
@@ -736,29 +813,17 @@ mrb_str_modify_keep_ascii(mrb_state *mrb, struct RString *s)
mrb_shared_string *shared = s->as.heap.aux.shared;
if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) {
- s->as.heap.ptr = shared->ptr;
- s->as.heap.aux.capa = shared->len;
- RSTR_PTR(s)[s->as.heap.len] = '\0';
+ s->as.heap.aux.capa = shared->capa;
+ s->as.heap.ptr[s->as.heap.len] = '\0';
mrb_free(mrb, shared);
}
else {
- char *p = RSTR_PTR(s);
- size_t len = (size_t)s->as.heap.len;
-
- str_init(mrb, s, p, len);
+ str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len);
str_decref(mrb, shared);
}
- RSTR_UNSET_SHARED_FLAG(s);
- return;
}
- if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) {
- char *p = s->as.heap.ptr;
- size_t len = (size_t)s->as.heap.len;
-
- RSTR_UNSET_NOFREE_FLAG(s);
- RSTR_UNSET_FSHARED_FLAG(s);
- str_init(mrb, s, p, len);
- return;
+ else if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) {
+ str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len);
}
}
@@ -1727,28 +1792,25 @@ mrb_str_include(mrb_state *mrb, mrb_value self)
static mrb_value
mrb_str_index_m(mrb_state *mrb, mrb_value str)
{
- mrb_value *argv;
- mrb_int argc;
mrb_value sub;
- mrb_int pos, clen;
+ mrb_int pos;
- mrb_get_args(mrb, "*!", &argv, &argc);
- if (argc == 2) {
- mrb_get_args(mrb, "oi", &sub, &pos);
- }
- else {
- pos = 0;
- if (argc > 0)
- sub = argv[0];
- else
+ switch (mrb_get_args(mrb, "|oi", &sub, &pos)) {
+ case 0:
sub = mrb_nil_value();
- }
- if (pos < 0) {
- clen = RSTRING_CHAR_LEN(str);
- pos += clen;
- if (pos < 0) {
- return mrb_nil_value();
- }
+ /* fall through */
+ case 1:
+ pos = 0;
+ break;
+ case 2:
+ if (pos < 0) {
+ mrb_int clen = RSTRING_CHAR_LEN(str);
+ pos += clen;
+ if (pos < 0) {
+ return mrb_nil_value();
+ }
+ }
+ break;
}
switch (mrb_type(sub)) {
@@ -1977,28 +2039,25 @@ mrb_str_reverse(mrb_state *mrb, mrb_value str)
static mrb_value
mrb_str_rindex(mrb_state *mrb, mrb_value str)
{
- mrb_value *argv;
- mrb_int argc;
mrb_value sub;
mrb_int pos, len = RSTRING_CHAR_LEN(str);
- mrb_get_args(mrb, "*!", &argv, &argc);
- if (argc == 2) {
- mrb_get_args(mrb, "oi", &sub, &pos);
- if (pos < 0) {
- pos += len;
+ switch (mrb_get_args(mrb, "|oi", &sub, &pos)) {
+ case 0:
+ sub = mrb_nil_value();
+ /* fall through */
+ case 1:
+ pos = len;
+ break;
+ case 2:
if (pos < 0) {
- return mrb_nil_value();
+ pos += len;
+ if (pos < 0) {
+ return mrb_nil_value();
+ }
}
- }
- if (pos > len) pos = len;
- }
- else {
- pos = len;
- if (argc > 0)
- sub = argv[0];
- else
- sub = mrb_nil_value();
+ if (pos > len) pos = len;
+ break;
}
pos = chars2bytes(str, 0, pos);
@@ -2016,7 +2075,7 @@ mrb_str_rindex(mrb_state *mrb, mrb_value str)
case MRB_TT_STRING:
pos = str_rindex(mrb, str, sub, pos);
if (pos >= 0) {
- pos = bytes2chars(RSTRING_PTR(str), pos);
+ pos = bytes2chars(RSTRING_PTR(str), RSTRING_LEN(str), pos);
BYTES_ALIGN_CHECK(pos);
return mrb_fixnum_value(pos);
}
@@ -2888,7 +2947,8 @@ mrb_init_string(mrb_state *mrb)
{
struct RClass *s;
- mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << 5), "pointer size too big for embedded string");
+ mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << MRB_STR_EMBED_LEN_BITSIZE),
+ "pointer size too big for embedded string");
mrb->string_class = s = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */
MRB_SET_INSTANCE_TT(s, MRB_TT_STRING);