From 055468bfa0d97edce5b1c4ae5d9847349cb4fee5 Mon Sep 17 00:00:00 2001
From: mattn
Date: Tue, 7 Jan 2014 14:25:08 +0900
Subject: Add mruby-string-utf8

---
 mrbgems/mruby-string-utf8/src/string.c | 307 +++++++++++++++++++++++++++++++++
 1 file changed, 307 insertions(+)
 create mode 100644 mrbgems/mruby-string-utf8/src/string.c

(limited to 'mrbgems/mruby-string-utf8/src/string.c')

diff --git a/mrbgems/mruby-string-utf8/src/string.c b/mrbgems/mruby-string-utf8/src/string.c
new file mode 100644
index 000000000..bcf682648
--- /dev/null
+++ b/mrbgems/mruby-string-utf8/src/string.c
@@ -0,0 +1,307 @@
+#include "mruby.h"
+#include "mruby/string.h"
+#include "mruby/range.h"
+#include <ctype.h>
+#include <string.h>
+
+/* TODO: duplicate definition in src/re.h */
+#define REGEXP_CLASS "Regexp"
+
+/*
+ * Sequence length implied by each possible lead byte.  Bytes that cannot
+ * start a multi-byte sequence (0x00-0xBF, 0xFE, 0xFF) map to 1 so that a
+ * scan always advances.
+ */
+static const size_t utf8len_tab[256] =
+{
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
+};
+
+/*
+ * Byte length of the UTF-8 character starting at p.
+ *
+ * Returns 0 when p points at a NUL byte.  Returns 1 when the lead byte is
+ * not followed by the expected continuation bytes (10xxxxxx), so malformed
+ * input is consumed one byte at a time.  A NUL byte is not a continuation
+ * byte, so the check ends at a terminating NUL instead of running past it.
+ */
+static size_t
+utf8len(const unsigned char *p)
+{
+  size_t len;
+  size_t i; /* same type as len, so the loop compares like with like */
+
+  if (*p == 0)
+    return 0;
+  len = utf8len_tab[*p];
+  for (i = 1; i < len; ++i)
+    if ((p[i] & 0xc0) != 0x80)
+      return 1;
+  return len;
+}
+
+/*
+ * Number of UTF-8 characters in str.  The scan stops at the first NUL
+ * byte, so anything after an embedded NUL is not counted.
+ */
+static size_t
+mrb_utf8_strlen(mrb_value str)
+{
+  size_t total = 0;
+  unsigned char* p = (unsigned char*) RSTRING_PTR(str);
+
+  while (*p) {
+    p += utf8len(p);
+    total++;
+  }
+  return total;
+}
+
+#define RSTRING_LEN_UTF8(s) mrb_utf8_strlen(s)
+
+/* Raises E_NOTIMP_ERROR with a fixed message. */
+static mrb_value
+noregexp(mrb_state *mrb, mrb_value self)
+{
+  mrb_raise(mrb, E_NOTIMP_ERROR, "Regexp class not implemented");
+  return mrb_nil_value();
+}
+
+/* Calls noregexp when the class name of obj starts with REGEXP_CLASS. */
+static void
+regexp_check(mrb_state *mrb, mrb_value obj)
+{
+  const char *name = mrb_obj_classname(mrb, obj);
+
+  /* strncmp stops at the terminator, so a class name shorter than
+     REGEXP_CLASS is never read past its end (memcmp would compare a
+     fixed number of bytes regardless). */
+  if (name && !strncmp(name, REGEXP_CLASS, sizeof(REGEXP_CLASS) - 1)) {
+    noregexp(mrb, obj);
+  }
+}
+
+/*
+ * Quick Search: first occurrence of the m-byte needle xs in the n-byte
+ * haystack ys.  Returns the byte offset of the match, or -1.
+ */
+static inline mrb_int
+mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n)
+{
+  const unsigned char *x = xs, *xe = xs + m;
+  const unsigned char *y = ys;
+  int i, qstable[256];
+
+  /* Preprocessing */
+  for (i = 0; i < 256; ++i)
+    qstable[i] = m + 1;
+  for (; x < xe; ++x)
+    qstable[*x] = xe - x;
+  /* Searching */
+  /* NOTE(review): the shift lookup reads y[m], which is ys[n] on the last
+     window, i.e. one byte past the haystack.  Presumably the buffer is
+     NUL-terminated so that byte is readable -- confirm. */
+  for (; y + m <= ys + n; y += *(qstable + y[m])) {
+    if (*xs == *y && memcmp(xs, y, m) == 0)
+      return y - ys;
+  }
+  return -1;
+}
+
+/*
+ * First occurrence of the m-byte needle x0 in the n-byte haystack y0.
+ * Returns the byte offset of the match, or -1.  An empty needle matches
+ * at offset 0.
+ */
+static mrb_int
+mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n)
+{
+  const unsigned char *x = (const unsigned char *)x0, *y = (const unsigned char *)y0;
+
+  if (m > n) return -1;
+  else if (m == n) {
+    return memcmp(x0, y0, m) == 0 ? 0 : -1;
+  }
+  else if (m < 1) {
+    return 0;
+  }
+  else if (m == 1) {
+    const unsigned char *ys = y, *ye = ys + n;
+    for (; y < ye; ++y) {
+      if (*x == *y)
+        return y - ys;
+    }
+    return -1;
+  }
+  return mrb_memsearch_qs((const unsigned char *)x0, m, (const unsigned char *)y0, n);
+}
+
+/*
+ * New string made of `len` characters of str starting at character index
+ * `beg`.  Both counts are in UTF-8 characters and both walks stop at a
+ * NUL byte.
+ */
+static mrb_value
+str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
+{
+  mrb_int i;
+  unsigned char *p = (unsigned char*) RSTRING_PTR(str), *t;
+
+  for (i = 0; i < beg && *p; i++) {
+    p += utf8len(p);
+  }
+  t = p;
+  /* Test the cursor being advanced (t); the start pointer p no longer moves. */
+  for (i = 0; i < len && *t; i++) {
+    t += utf8len(t);
+  }
+  return mrb_str_new(mrb, (const char*)p, (int)(t - p));
+}
+
+/*
+ * Substring of `len` characters starting at character index `beg`.
+ *
+ * A negative `beg` counts back from the end of the string.  Returns nil
+ * when `len` is negative or `beg` lies outside the string; `len` is
+ * clamped to the characters available after `beg`.
+ */
+static mrb_value
+str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
+{
+  mrb_int len8 = (mrb_int)RSTRING_LEN_UTF8(str);
+
+  if (len < 0) return mrb_nil_value();
+  if (beg > len8) return mrb_nil_value();
+  if (beg < 0) {
+    /* Shift a negative index exactly once; adding len8 a second time
+       would make an index before the start of the string wrap around to
+       a valid position. */
+    beg += len8;
+    if (beg < 0) return mrb_nil_value();
+  }
+  /* 0 <= beg <= len8 here; compare by subtraction so that a huge len
+     cannot overflow beg + len. */
+  if (len > len8 - beg)
+    len = len8 - beg;
+  return str_subseq(mrb, str, beg, len);
+}
+
+/*
+ * Byte offset of the first occurrence of sub in str at or after byte
+ * `offset` (negative offsets count from the end), or -1 when not found.
+ * Offsets are in bytes, not UTF-8 characters.
+ */
+static mrb_int
+str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset)
+{
+  mrb_int pos;
+  mrb_int len = RSTRING_LEN(str);
+  mrb_int slen = RSTRING_LEN(sub);
+
+  if (offset < 0) {
+    offset += len;
+    if (offset < 0) return -1;
+  }
+  if (len - offset < slen) return -1;
+  if (slen == 0) return offset;
+  pos = mrb_memsearch(RSTRING_PTR(sub), slen, RSTRING_PTR(str) + offset, len - offset);
+  if (pos < 0) return pos;
+  return pos + offset;
+}
+
+/*
+ * String#[] with a single index:
+ *   Fixnum -> the character at that character index, or nil when out of
+ *             range;
+ *   String -> a copy of indx when str contains it, otherwise nil;
+ *   Range  -> the characters selected by the range, or nil.
+ * Every index is passed through regexp_check first.
+ */
+static mrb_value
+mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx)
+{
+  mrb_int idx;
+
+  regexp_check(mrb, indx);
+  switch (mrb_type(indx)) {
+    case MRB_TT_FIXNUM:
+      idx = mrb_fixnum(indx);
+
+num_index:
+      str = str_substr(mrb, str, idx, 1);
+      /* An index equal to the length yields "", which [] reports as nil. */
+      if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value();
+      return str;
+
+    case MRB_TT_STRING:
+      if (str_index(mrb, str, indx, 0) != -1)
+        return mrb_str_dup(mrb, indx);
+      return mrb_nil_value();
+
+    case MRB_TT_RANGE:
+      /* check if indx is Range */
+      {
+        mrb_int beg, len;
+
+        len = RSTRING_LEN_UTF8(str);
+        if (mrb_range_beg_len(mrb, indx, &beg, &len, len)) {
+          return str_subseq(mrb, str, beg, len);
+        }
+        else {
+          return mrb_nil_value();
+        }
+      }
+    default:
+      /* NOTE(review): mrb_fixnum is applied without checking that indx
+         really is a Fixnum -- confirm callers, or raise a TypeError. */
+      idx = mrb_fixnum(indx);
+      goto num_index;
+  }
+  return mrb_nil_value();    /* not reached */
+}
+
+/*
+ * String#[] / String#slice.  With two arguments (start, length) returns
+ * `length` characters starting at character index `start`; with one
+ * argument defers to mrb_str_aref.
+ */
+static mrb_value
+mrb_str_aref_m(mrb_state *mrb, mrb_value str)
+{
+  mrb_value a1, a2;
+  int argc;
+
+  argc = mrb_get_args(mrb, "o|o", &a1, &a2);
+  if (argc == 2) {
+    regexp_check(mrb, a1);
+    /* NOTE(review): both arguments are read as Fixnums unchecked. */
+    return str_substr(mrb, str, mrb_fixnum(a1), mrb_fixnum(a2));
+  }
+  if (argc != 1) {
+    mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(argc));
+  }
+  return mrb_str_aref(mrb, str, a1);
+}
+
+/* Gem entry point: defines [] and slice on the String class. */
+void
+mrb_mruby_string_utf8_gem_init(mrb_state* mrb)
+{
+  struct RClass * s = mrb->string_class;
+
+  mrb_define_method(mrb, s, "[]", mrb_str_aref_m, MRB_ARGS_ANY());
+  mrb_define_method(mrb, s, "slice", mrb_str_aref_m, MRB_ARGS_ANY());
+}
+
+/* Gem teardown: nothing to release. */
+void
+mrb_mruby_string_utf8_gem_final(mrb_state* mrb)
+{
+}
-- 
cgit v1.2.3