diff options
Diffstat (limited to 'mrbgems/mruby-string-ext/src/string.c')
| -rw-r--r-- | mrbgems/mruby-string-ext/src/string.c | 1137 |
1 files changed, 1086 insertions, 51 deletions
diff --git a/mrbgems/mruby-string-ext/src/string.c b/mrbgems/mruby-string-ext/src/string.c index f04f12c4b..e3dabcc18 100644 --- a/mrbgems/mruby-string-ext/src/string.c +++ b/mrbgems/mruby-string-ext/src/string.c @@ -1,23 +1,94 @@ -#include <ctype.h> #include <string.h> -#include "mruby.h" -#include "mruby/array.h" -#include "mruby/class.h" -#include "mruby/string.h" +#include <mruby.h> +#include <mruby/array.h> +#include <mruby/class.h> +#include <mruby/string.h> +#include <mruby/range.h> + +#define ENC_ASCII_8BIT "ASCII-8BIT" +#define ENC_BINARY "BINARY" +#define ENC_UTF8 "UTF-8" + +#define ENC_COMP_P(enc, enc_lit) \ + str_casecmp_p(RSTRING_PTR(enc), RSTRING_LEN(enc), enc_lit, sizeof(enc_lit"")-1) + +#ifdef MRB_NO_FLOAT +# define mrb_float_p(o) FALSE +#endif + +static mrb_bool +str_casecmp_p(const char *s1, mrb_int len1, const char *s2, mrb_int len2) +{ + const char *e1, *e2; + + if (len1 != len2) return FALSE; + e1 = s1 + len1; + e2 = s2 + len2; + while (s1 < e1 && s2 < e2) { + if (*s1 != *s2 && TOUPPER(*s1) != TOUPPER(*s2)) return FALSE; + ++s1; + ++s2; + } + return TRUE; +} static mrb_value -mrb_str_getbyte(mrb_state *mrb, mrb_value str) +int_chr_binary(mrb_state *mrb, mrb_value num) { - mrb_int pos; - mrb_get_args(mrb, "i", &pos); + mrb_int cp = mrb_as_int(mrb, num); + char c; + mrb_value str; - if (pos < 0) - pos += RSTRING_LEN(str); - if (pos < 0 || RSTRING_LEN(str) <= pos) - return mrb_nil_value(); + if (cp < 0 || 0xff < cp) { + mrb_raisef(mrb, E_RANGE_ERROR, "%v out of char range", num); + } + c = (char)cp; + str = mrb_str_new(mrb, &c, 1); + RSTR_SET_ASCII_FLAG(mrb_str_ptr(str)); + return str; +} - return mrb_fixnum_value((unsigned char)RSTRING_PTR(str)[pos]); +#ifdef MRB_UTF8_STRING +static mrb_value +int_chr_utf8(mrb_state *mrb, mrb_value num) +{ + mrb_int cp = mrb_int(mrb, num); + char utf8[4]; + mrb_int len; + mrb_value str; + uint32_t ascii_flag = 0; + + if (cp < 0 || 0x10FFFF < cp) { + mrb_raisef(mrb, E_RANGE_ERROR, "%v out of char range", num); + } + if (cp < 0x80) { + utf8[0] = (char)cp; + len = 1; + ascii_flag = MRB_STR_ASCII; + } + else if (cp < 0x800) { + utf8[0] = (char)(0xC0 | (cp >> 6)); + utf8[1] = (char)(0x80 | (cp & 0x3F)); + len = 2; + } + else if (cp < 0x10000) { + utf8[0] = (char)(0xE0 | (cp >> 12)); + utf8[1] = (char)(0x80 | ((cp >> 6) & 0x3F)); + utf8[2] = (char)(0x80 | ( cp & 0x3F)); + len = 3; + } + else { + utf8[0] = (char)(0xF0 | (cp >> 18)); + utf8[1] = (char)(0x80 | ((cp >> 12) & 0x3F)); + utf8[2] = (char)(0x80 | ((cp >> 6) & 0x3F)); + utf8[3] = (char)(0x80 | ( cp & 0x3F)); + len = 4; + } + str = mrb_str_new(mrb, utf8, len); + mrb_str_ptr(str)->flags |= ascii_flag; + return str; } +#endif /* * call-seq: @@ -83,18 +154,27 @@ mrb_str_swapcase(mrb_state *mrb, mrb_value self) * * Append---Concatenates the given object to <i>str</i>. If the object is a * <code>Integer</code>, it is considered as a codepoint, and is converted - * to a character before concatenation. + * to a character before concatenation + * (equivalent to <code>str.concat(integer.chr(__ENCODING__))</code>). * * a = "hello " * a << "world" #=> "hello world" * a.concat(33) #=> "hello world!" */ static mrb_value -mrb_str_concat2(mrb_state *mrb, mrb_value self) +mrb_str_concat_m(mrb_state *mrb, mrb_value self) { - mrb_value str; - mrb_get_args(mrb, "S", &str); - mrb_str_concat(mrb, self, str); + mrb_value str = mrb_get_arg1(mrb); + + if (mrb_integer_p(str) || mrb_float_p(str)) +#ifdef MRB_UTF8_STRING + str = int_chr_utf8(mrb, str); +#else + str = int_chr_binary(mrb, str); +#endif + else + mrb_ensure_string_type(mrb, str); + mrb_str_cat_str(mrb, self, str); return self; } @@ -114,14 +194,15 @@ mrb_str_concat2(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_start_with(mrb_state *mrb, mrb_value self) { - mrb_value *argv, sub; + const mrb_value *argv; mrb_int argc, i; mrb_get_args(mrb, "*", &argv, &argc); for (i = 0; i < argc; i++) { size_t len_l, len_r; int ai = mrb_gc_arena_save(mrb); - sub = mrb_string_type(mrb, argv[i]); + mrb_value sub = argv[i]; + mrb_ensure_string_type(mrb, sub); mrb_gc_arena_restore(mrb, ai); len_l = RSTRING_LEN(self); len_r = RSTRING_LEN(sub); @@ -143,14 +224,15 @@ mrb_str_start_with(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_end_with(mrb_state *mrb, mrb_value self) { - mrb_value *argv, sub; + const mrb_value *argv; mrb_int argc, i; mrb_get_args(mrb, "*", &argv, &argc); for (i = 0; i < argc; i++) { size_t len_l, len_r; int ai = mrb_gc_arena_save(mrb); - sub = mrb_string_type(mrb, argv[i]); + mrb_value sub = argv[i]; + mrb_ensure_string_type(mrb, sub); mrb_gc_arena_restore(mrb, ai); len_l = RSTRING_LEN(self); len_r = RSTRING_LEN(sub); @@ -165,16 +247,601 @@ mrb_str_end_with(mrb_state *mrb, mrb_value self) return mrb_false_value(); } +enum tr_pattern_type { + TR_UNINITIALIZED = 0, + TR_IN_ORDER = 1, + TR_RANGE = 2, +}; + +/* + #tr Pattern syntax + + <syntax> ::= (<pattern>)* | '^' (<pattern>)* + <pattern> ::= <in order> | <range> + <in order> ::= (<ch>)+ + <range> ::= <ch> '-' <ch> +*/ +struct tr_pattern { + uint8_t type; // 1:in-order, 2:range + mrb_bool flag_reverse : 1; + mrb_bool flag_on_heap : 1; + uint16_t n; + union { + uint16_t start_pos; + char ch[2]; + } val; + struct tr_pattern *next; +}; + +#define STATIC_TR_PATTERN { 0 } + +static inline void +tr_free_pattern(mrb_state *mrb, struct tr_pattern *pat) +{ + while (pat) { + struct tr_pattern *p = pat->next; + if (pat->flag_on_heap) { + mrb_free(mrb, pat); + } + pat = p; + } +} + +static struct tr_pattern* +tr_parse_pattern(mrb_state *mrb, struct tr_pattern *ret, const mrb_value v_pattern, mrb_bool flag_reverse_enable) +{ + const char *pattern = RSTRING_PTR(v_pattern); + mrb_int pattern_length = RSTRING_LEN(v_pattern); + mrb_bool flag_reverse = FALSE; + struct tr_pattern *pat1; + mrb_int i = 0; + + if(flag_reverse_enable && pattern_length >= 2 && pattern[0] == '^') { + flag_reverse = TRUE; + i++; + } + + while (i < pattern_length) { + /* is range pattern ? */ + mrb_bool const ret_uninit = (ret->type == TR_UNINITIALIZED); + pat1 = ret_uninit + ? ret + : (struct tr_pattern*)mrb_malloc_simple(mrb, sizeof(struct tr_pattern)); + if ((i+2) < pattern_length && pattern[i] != '\\' && pattern[i+1] == '-') { + if (pat1 == NULL && ret) { + nomem: + tr_free_pattern(mrb, ret); + mrb_exc_raise(mrb, mrb_obj_value(mrb->nomem_err)); + return NULL; /* not reached */ + } + pat1->type = TR_RANGE; + pat1->flag_reverse = flag_reverse; + pat1->flag_on_heap = !ret_uninit; + pat1->n = pattern[i+2] - pattern[i] + 1; + pat1->next = NULL; + pat1->val.ch[0] = pattern[i]; + pat1->val.ch[1] = pattern[i+2]; + i += 3; + } + else { + /* in order pattern. */ + mrb_int start_pos = i++; + mrb_int len; + + while (i < pattern_length) { + if ((i+2) < pattern_length && pattern[i] != '\\' && pattern[i+1] == '-') + break; + i++; + } + + len = i - start_pos; + if (len > UINT16_MAX) { + mrb_raise(mrb, E_ARGUMENT_ERROR, "tr pattern too long (max 65535)"); + } + if (pat1 == NULL && ret) { + goto nomem; + } + pat1->type = TR_IN_ORDER; + pat1->flag_reverse = flag_reverse; + pat1->flag_on_heap = !ret_uninit; + pat1->n = (uint16_t)len; + pat1->next = NULL; + pat1->val.start_pos = (uint16_t)start_pos; + } + + if (ret == NULL || ret_uninit) { + ret = pat1; + } + else { + struct tr_pattern *p = ret; + while (p->next != NULL) { + p = p->next; + } + p->next = pat1; + } + } + + return ret; +} + +static inline mrb_int +tr_find_character(const struct tr_pattern *pat, const char *pat_str, int ch) +{ + mrb_int ret = -1; + mrb_int n_sum = 0; + mrb_int flag_reverse = pat ? pat->flag_reverse : 0; + + while (pat != NULL) { + if (pat->type == TR_IN_ORDER) { + int i; + for (i = 0; i < pat->n; i++) { + if (pat_str[pat->val.start_pos + i] == ch) ret = n_sum + i; + } + } + else if (pat->type == TR_RANGE) { + if (pat->val.ch[0] <= ch && ch <= pat->val.ch[1]) + ret = n_sum + ch - pat->val.ch[0]; + } + else { + mrb_assert(pat->type == TR_UNINITIALIZED); + } + n_sum += pat->n; + pat = pat->next; + } + + if (flag_reverse) { + return (ret < 0) ? MRB_INT_MAX : -1; + } + return ret; +} + +static inline mrb_int +tr_get_character(const struct tr_pattern *pat, const char *pat_str, mrb_int n_th) +{ + mrb_int n_sum = 0; + + while (pat != NULL) { + if (n_th < (n_sum + pat->n)) { + mrb_int i = (n_th - n_sum); + + switch (pat->type) { + case TR_IN_ORDER: + return pat_str[pat->val.start_pos + i]; + case TR_RANGE: + return pat->val.ch[0]+i; + case TR_UNINITIALIZED: + return -1; + } + } + if (pat->next == NULL) { + switch (pat->type) { + case TR_IN_ORDER: + return pat_str[pat->val.start_pos + pat->n - 1]; + case TR_RANGE: + return pat->val.ch[1]; + case TR_UNINITIALIZED: + return -1; + } + } + n_sum += pat->n; + pat = pat->next; + } + + return -1; +} + +static inline void +tr_bitmap_set(uint8_t bitmap[32], uint8_t ch) +{ + uint8_t idx1 = ch / 8; + uint8_t idx2 = ch % 8; + bitmap[idx1] |= (1<<idx2); +} + +static inline mrb_bool +tr_bitmap_detect(uint8_t bitmap[32], uint8_t ch) +{ + uint8_t idx1 = ch / 8; + uint8_t idx2 = ch % 8; + if (bitmap[idx1] & (1<<idx2)) + return TRUE; + return FALSE; +} + +/* compile patter to bitmap */ +static void +tr_compile_pattern(const struct tr_pattern *pat, mrb_value pstr, uint8_t bitmap[32]) +{ + const char *pattern = RSTRING_PTR(pstr); + mrb_int flag_reverse = pat ? pat->flag_reverse : 0; + int i; + + for (i=0; i<32; i++) { + bitmap[i] = 0; + } + while (pat != NULL) { + if (pat->type == TR_IN_ORDER) { + for (i = 0; i < pat->n; i++) { + tr_bitmap_set(bitmap, pattern[pat->val.start_pos + i]); + } + } + else if (pat->type == TR_RANGE) { + for (i = pat->val.ch[0]; i < pat->val.ch[1]; i++) { + tr_bitmap_set(bitmap, i); + } + } + else { + mrb_assert(pat->type == TR_UNINITIALIZED); + } + pat = pat->next; + } + + if (flag_reverse) { + for (i=0; i<32; i++) { + bitmap[i] ^= 0xff; + } + } +} + +static mrb_bool +str_tr(mrb_state *mrb, mrb_value str, mrb_value p1, mrb_value p2, mrb_bool squeeze) +{ + struct tr_pattern pat = STATIC_TR_PATTERN; + struct tr_pattern rep_storage = STATIC_TR_PATTERN; + char *s; + mrb_int len; + mrb_int i; + mrb_int j; + mrb_bool flag_changed = FALSE; + mrb_int lastch = -1; + struct tr_pattern *rep; + + mrb_str_modify(mrb, mrb_str_ptr(str)); + tr_parse_pattern(mrb, &pat, p1, TRUE); + rep = tr_parse_pattern(mrb, &rep_storage, p2, FALSE); + s = RSTRING_PTR(str); + len = RSTRING_LEN(str); + + for (i=j=0; i<len; i++,j++) { + mrb_int n = tr_find_character(&pat, RSTRING_PTR(p1), s[i]); + + if (i>j) s[j] = s[i]; + if (n >= 0) { + flag_changed = TRUE; + if (rep == NULL) { + j--; + } + else { + mrb_int c = tr_get_character(rep, RSTRING_PTR(p2), n); + + if (c < 0 || (squeeze && c == lastch)) { + j--; + continue; + } + if (c > 0x80) { + mrb_raisef(mrb, E_ARGUMENT_ERROR, "character (%i) out of range", c); + } + lastch = c; + s[i] = (char)c; + } + } + } + + tr_free_pattern(mrb, &pat); + tr_free_pattern(mrb, rep); + + if (flag_changed) { + RSTR_SET_LEN(RSTRING(str), j); + RSTRING_PTR(str)[j] = 0; + } + return flag_changed; +} + +/* + * call-seq: + * str.tr(from_str, to_str) => new_str + * + * Returns a copy of str with the characters in from_str replaced by the + * corresponding characters in to_str. If to_str is shorter than from_str, + * it is padded with its last character in order to maintain the + * correspondence. + * + * "hello".tr('el', 'ip') #=> "hippo" + * "hello".tr('aeiou', '*') #=> "h*ll*" + * "hello".tr('aeiou', 'AA*') #=> "hAll*" + * + * Both strings may use the c1-c2 notation to denote ranges of characters, + * and from_str may start with a ^, which denotes all characters except + * those listed. + * + * "hello".tr('a-y', 'b-z') #=> "ifmmp" + * "hello".tr('^aeiou', '*') #=> "*e**o" + * + * The backslash character \ can be used to escape ^ or - and is otherwise + * ignored unless it appears at the end of a range or the end of the + * from_str or to_str: + * + * + * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld" + * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld" + * + * "hello\r\nworld".tr("\r", "") #=> "hello\nworld" + * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold" + * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld" + * + * "X['\\b']".tr("X\\", "") #=> "['b']" + * "X['\\b']".tr("X-\\]", "") #=> "'b'" + * + * Note: conversion is effective only in ASCII region. + */ +static mrb_value +mrb_str_tr(mrb_state *mrb, mrb_value str) +{ + mrb_value dup; + mrb_value p1, p2; + + mrb_get_args(mrb, "SS", &p1, &p2); + dup = mrb_str_dup(mrb, str); + str_tr(mrb, dup, p1, p2, FALSE); + return dup; +} + +/* + * call-seq: + * str.tr!(from_str, to_str) -> str or nil + * + * Translates str in place, using the same rules as String#tr. + * Returns str, or nil if no changes were made. + */ +static mrb_value +mrb_str_tr_bang(mrb_state *mrb, mrb_value str) +{ + mrb_value p1, p2; + + mrb_get_args(mrb, "SS", &p1, &p2); + if (str_tr(mrb, str, p1, p2, FALSE)) { + return str; + } + return mrb_nil_value(); +} + +/* + * call-seq: + * str.tr_s(from_str, to_str) -> new_str + * + * Processes a copy of str as described under String#tr, then removes + * duplicate characters in regions that were affected by the translation. + * + * "hello".tr_s('l', 'r') #=> "hero" + * "hello".tr_s('el', '*') #=> "h*o" + * "hello".tr_s('el', 'hx') #=> "hhxo" + */ +static mrb_value +mrb_str_tr_s(mrb_state *mrb, mrb_value str) +{ + mrb_value dup; + mrb_value p1, p2; + + mrb_get_args(mrb, "SS", &p1, &p2); + dup = mrb_str_dup(mrb, str); + str_tr(mrb, dup, p1, p2, TRUE); + return dup; +} + +/* + * call-seq: + * str.tr_s!(from_str, to_str) -> str or nil + * + * Performs String#tr_s processing on str in place, returning + * str, or nil if no changes were made. + */ +static mrb_value +mrb_str_tr_s_bang(mrb_state *mrb, mrb_value str) +{ + mrb_value p1, p2; + + mrb_get_args(mrb, "SS", &p1, &p2); + if (str_tr(mrb, str, p1, p2, TRUE)) { + return str; + } + return mrb_nil_value(); +} + +static mrb_bool +str_squeeze(mrb_state *mrb, mrb_value str, mrb_value v_pat) +{ + struct tr_pattern pat_storage = STATIC_TR_PATTERN; + struct tr_pattern *pat = NULL; + mrb_int i, j; + char *s; + mrb_int len; + mrb_bool flag_changed = FALSE; + mrb_int lastch = -1; + uint8_t bitmap[32]; + + mrb_str_modify(mrb, mrb_str_ptr(str)); + if (!mrb_nil_p(v_pat)) { + pat = tr_parse_pattern(mrb, &pat_storage, v_pat, TRUE); + tr_compile_pattern(pat, v_pat, bitmap); + tr_free_pattern(mrb, pat); + } + s = RSTRING_PTR(str); + len = RSTRING_LEN(str); + + if (pat) { + for (i=j=0; i<len; i++,j++) { + if (i>j) s[j] = s[i]; + if (tr_bitmap_detect(bitmap, s[i]) && s[i] == lastch) { + flag_changed = TRUE; + j--; + } + lastch = s[i]; + } + } + else { + for (i=j=0; i<len; i++,j++) { + if (i>j) s[j] = s[i]; + if (s[i] >= 0 && s[i] == lastch) { + flag_changed = TRUE; + j--; + } + lastch = s[i]; + } + } + + if (flag_changed) { + RSTR_SET_LEN(RSTRING(str), j); + RSTRING_PTR(str)[j] = 0; + } + return flag_changed; +} + +/* + * call-seq: + * str.squeeze([other_str]) -> new_str + * + * Builds a set of characters from the other_str + * parameter(s) using the procedure described for String#count. Returns a + * new string where runs of the same character that occur in this set are + * replaced by a single character. If no arguments are given, all runs of + * identical characters are replaced by a single character. + * + * "yellow moon".squeeze #=> "yelow mon" + * " now is the".squeeze(" ") #=> " now is the" + * "putters shoot balls".squeeze("m-z") #=> "puters shot balls" + */ +static mrb_value +mrb_str_squeeze(mrb_state *mrb, mrb_value str) +{ + mrb_value pat = mrb_nil_value(); + mrb_value dup; + + mrb_get_args(mrb, "|S", &pat); + dup = mrb_str_dup(mrb, str); + str_squeeze(mrb, dup, pat); + return dup; +} + +/* + * call-seq: + * str.squeeze!([other_str]) -> str or nil + * + * Squeezes str in place, returning either str, or nil if no + * changes were made. + */ +static mrb_value +mrb_str_squeeze_bang(mrb_state *mrb, mrb_value str) +{ + mrb_value pat = mrb_nil_value(); + + mrb_get_args(mrb, "|S", &pat); + if (str_squeeze(mrb, str, pat)) { + return str; + } + return mrb_nil_value(); +} + +static mrb_bool +str_delete(mrb_state *mrb, mrb_value str, mrb_value v_pat) +{ + struct tr_pattern pat = STATIC_TR_PATTERN; + mrb_int i, j; + char *s; + mrb_int len; + mrb_bool flag_changed = FALSE; + uint8_t bitmap[32]; + + mrb_str_modify(mrb, mrb_str_ptr(str)); + tr_parse_pattern(mrb, &pat, v_pat, TRUE); + tr_compile_pattern(&pat, v_pat, bitmap); + tr_free_pattern(mrb, &pat); + + s = RSTRING_PTR(str); + len = RSTRING_LEN(str); + + for (i=j=0; i<len; i++,j++) { + if (i>j) s[j] = s[i]; + if (tr_bitmap_detect(bitmap, s[i])) { + flag_changed = TRUE; + j--; + } + } + if (flag_changed) { + RSTR_SET_LEN(RSTRING(str), j); + RSTRING_PTR(str)[j] = 0; + } + return flag_changed; +} + +static mrb_value +mrb_str_delete(mrb_state *mrb, mrb_value str) +{ + mrb_value pat; + mrb_value dup; + + mrb_get_args(mrb, "S", &pat); + dup = mrb_str_dup(mrb, str); + str_delete(mrb, dup, pat); + return dup; +} + +static mrb_value +mrb_str_delete_bang(mrb_state *mrb, mrb_value str) +{ + mrb_value pat; + + mrb_get_args(mrb, "S", &pat); + if (str_delete(mrb, str, pat)) { + return str; + } + return mrb_nil_value(); +} + +/* + * call_seq: + * str.count([other_str]) -> integer + * + * Each other_str parameter defines a set of characters to count. The + * intersection of these sets defines the characters to count in str. Any + * other_str that starts with a caret ^ is negated. The sequence c1-c2 + * means all characters between c1 and c2. The backslash character \ can + * be used to escape ^ or - and is otherwise ignored unless it appears at + * the end of a sequence or the end of a other_str. + */ +static mrb_value +mrb_str_count(mrb_state *mrb, mrb_value str) +{ + mrb_value v_pat = mrb_nil_value(); + mrb_int i; + char *s; + mrb_int len; + mrb_int count = 0; + struct tr_pattern pat = STATIC_TR_PATTERN; + uint8_t bitmap[32]; + + mrb_get_args(mrb, "S", &v_pat); + tr_parse_pattern(mrb, &pat, v_pat, TRUE); + tr_compile_pattern(&pat, v_pat, bitmap); + tr_free_pattern(mrb, &pat); + + s = RSTRING_PTR(str); + len = RSTRING_LEN(str); + for (i = 0; i < len; i++) { + if (tr_bitmap_detect(bitmap, s[i])) count++; + } + return mrb_fixnum_value(count); +} + static mrb_value mrb_str_hex(mrb_state *mrb, mrb_value self) { - return mrb_str_to_inum(mrb, self, 16, FALSE); + return mrb_str_to_integer(mrb, self, 16, FALSE); } static mrb_value mrb_str_oct(mrb_state *mrb, mrb_value self) { - return mrb_str_to_inum(mrb, self, 8, FALSE); + return mrb_str_to_integer(mrb, self, 8, FALSE); } /* @@ -194,41 +861,387 @@ mrb_str_chr(mrb_state *mrb, mrb_value self) /* * call-seq: - * string.lines -> array of string + * int.chr([encoding]) -> string * - * Returns strings per line; + * Returns a string containing the character represented by the +int+'s value + * according to +encoding+. +"ASCII-8BIT"+ (+"BINARY"+) and +"UTF-8"+ (only + * with +MRB_UTF8_STRING+) can be specified as +encoding+ (default is + * +"ASCII-8BIT"+). * - * a = "abc\ndef" - * a.lines #=> ["abc\n", "def"] + * 65.chr #=> "A" + * 230.chr #=> "\xE6" + * 230.chr("ASCII-8BIT") #=> "\xE6" + * 230.chr("UTF-8") #=> "\u00E6" */ static mrb_value -mrb_str_lines(mrb_state *mrb, mrb_value self) +mrb_int_chr(mrb_state *mrb, mrb_value num) +{ + mrb_value enc; + mrb_bool enc_given; + + mrb_get_args(mrb, "|S?", &enc, &enc_given); + if (!enc_given || + ENC_COMP_P(enc, ENC_ASCII_8BIT) || + ENC_COMP_P(enc, ENC_BINARY)) { + return int_chr_binary(mrb, num); + } +#ifdef MRB_UTF8_STRING + else if (ENC_COMP_P(enc, ENC_UTF8)) { + return int_chr_utf8(mrb, num); + } +#endif + else { + mrb_raisef(mrb, E_ARGUMENT_ERROR, "unknown encoding name - %v", enc); + } + /* not reached */ + return mrb_nil_value(); +} + +/* + * call-seq: + * string.succ -> string + * + * Returns next sequence of the string; + * + * a = "abc" + * a.succ #=> "abd" + */ +static mrb_value +mrb_str_succ_bang(mrb_state *mrb, mrb_value self) { mrb_value result; - mrb_value blk; - int ai; - mrb_int len; - mrb_value arg; - char *p = RSTRING_PTR(self), *t; - char *e = p + RSTRING_LEN(self); + unsigned char *p, *e, *b, *t; + const char *prepend; + struct RString *s = mrb_str_ptr(self); + mrb_int l; - mrb_get_args(mrb, "&", &blk); + if (RSTRING_LEN(self) == 0) + return self; - result = mrb_ary_new(mrb); + mrb_str_modify(mrb, s); + l = RSTRING_LEN(self); + b = p = (unsigned char*) RSTRING_PTR(self); + t = e = p + l; + *(e--) = 0; - if (!mrb_nil_p(blk)) { - while (p < e) { - t = p; - while (p < e && *p != '\n') p++; - if (*p == '\n') p++; - len = (mrb_int) (p - t); - arg = mrb_str_new(mrb, t, len); - mrb_yield_argv(mrb, blk, 1, &arg); + // find trailing ascii/number + while (e >= b) { + if (ISALNUM(*e)) + break; + e--; + } + if (e < b) { + e = p + l - 1; + result = mrb_str_new_lit(mrb, ""); + } + else { + // find leading letter of the ascii/number + b = e; + while (b > p) { + if (!ISALNUM(*b) || (ISALNUM(*b) && *b != '9' && *b != 'z' && *b != 'Z')) + break; + b--; } - return self; + if (!ISALNUM(*b)) + b++; + result = mrb_str_new(mrb, (char*) p, b - p); } + + while (e >= b) { + if (!ISALNUM(*e)) { + if (*e == 0xff) { + mrb_str_cat_lit(mrb, result, "\x01"); + (*e) = 0; + } + else + (*e)++; + break; + } + prepend = NULL; + if (*e == '9') { + if (e == b) prepend = "1"; + *e = '0'; + } + else if (*e == 'z') { + if (e == b) prepend = "a"; + *e = 'a'; + } + else if (*e == 'Z') { + if (e == b) prepend = "A"; + *e = 'A'; + } + else { + (*e)++; + break; + } + if (prepend) mrb_str_cat_cstr(mrb, result, prepend); + e--; + } + result = mrb_str_cat(mrb, result, (char*) b, t - b); + l = RSTRING_LEN(result); + mrb_str_resize(mrb, self, l); + memcpy(RSTRING_PTR(self), RSTRING_PTR(result), l); + return self; +} + +static mrb_value +mrb_str_succ(mrb_state *mrb, mrb_value self) +{ + mrb_value str; + + str = mrb_str_dup(mrb, self); + mrb_str_succ_bang(mrb, str); + return str; +} + +#ifdef MRB_UTF8_STRING +static const char utf8len_codepage_zero[256] = +{ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, +}; + +static mrb_int +utf8code(unsigned char* p) +{ + mrb_int len; + + if (p[0] < 0x80) + return p[0]; + + len = utf8len_codepage_zero[p[0]]; + if (len > 1 && (p[1] & 0xc0) == 0x80) { + if (len == 2) + return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f); + if ((p[2] & 0xc0) == 0x80) { + if (len == 3) + return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + + (p[2] & 0x3f); + if ((p[3] & 0xc0) == 0x80) { + if (len == 4) + return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) + + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f); + if ((p[4] & 0xc0) == 0x80) { + if (len == 5) + return ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18) + + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6) + + (p[4] & 0x3f); + if ((p[5] & 0xc0) == 0x80 && len == 6) + return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24) + + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12) + + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f); + } + } + } + } + return p[0]; +} + +static mrb_value +mrb_str_ord(mrb_state* mrb, mrb_value str) +{ + if (RSTRING_LEN(str) == 0) + mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string"); + return mrb_fixnum_value(utf8code((unsigned char*) RSTRING_PTR(str))); +} +#else +static mrb_value +mrb_str_ord(mrb_state* mrb, mrb_value str) +{ + if (RSTRING_LEN(str) == 0) + mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string"); + return mrb_fixnum_value((unsigned char)RSTRING_PTR(str)[0]); +} +#endif + +/* + * call-seq: + * str.delete_prefix!(prefix) -> self or nil + * + * Deletes leading <code>prefix</code> from <i>str</i>, returning + * <code>nil</code> if no change was made. + * + * "hello".delete_prefix!("hel") #=> "lo" + * "hello".delete_prefix!("llo") #=> nil + */ +static mrb_value +mrb_str_del_prefix_bang(mrb_state *mrb, mrb_value self) +{ + mrb_int plen, slen; + const char *ptr; + char *s; + struct RString *str = RSTRING(self); + + mrb_get_args(mrb, "s", &ptr, &plen); + slen = RSTR_LEN(str); + if (plen > slen) return mrb_nil_value(); + s = RSTR_PTR(str); + if (memcmp(s, ptr, plen) != 0) return mrb_nil_value(); + if (!mrb_frozen_p(str) && (RSTR_SHARED_P(str) || RSTR_FSHARED_P(str))) { + str->as.heap.ptr += plen; + } + else { + mrb_str_modify(mrb, str); + s = RSTR_PTR(str); + memmove(s, s+plen, slen-plen); + } + RSTR_SET_LEN(str, slen-plen); + return self; +} + +/* + * call-seq: + * str.delete_prefix(prefix) -> new_str + * + * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted. + * + * "hello".delete_prefix("hel") #=> "lo" + * "hello".delete_prefix("llo") #=> "hello" + */ +static mrb_value +mrb_str_del_prefix(mrb_state *mrb, mrb_value self) +{ + mrb_int plen, slen; + const char *ptr; + + mrb_get_args(mrb, "s", &ptr, &plen); + slen = RSTRING_LEN(self); + if (plen > slen) return mrb_str_dup(mrb, self); + if (memcmp(RSTRING_PTR(self), ptr, plen) != 0) + return mrb_str_dup(mrb, self); + return mrb_str_substr(mrb, self, plen, slen-plen); +} + +/* + * call-seq: + * str.delete_suffix!(suffix) -> self or nil + * + * Deletes trailing <code>suffix</code> from <i>str</i>, returning + * <code>nil</code> if no change was made. + * + * "hello".delete_suffix!("llo") #=> "he" + * "hello".delete_suffix!("hel") #=> nil + */ +static mrb_value +mrb_str_del_suffix_bang(mrb_state *mrb, mrb_value self) +{ + mrb_int plen, slen; + const char *ptr; + char *s; + struct RString *str = RSTRING(self); + + mrb_get_args(mrb, "s", &ptr, &plen); + slen = RSTR_LEN(str); + if (plen > slen) return mrb_nil_value(); + s = RSTR_PTR(str); + if (memcmp(s+slen-plen, ptr, plen) != 0) return mrb_nil_value(); + if (!mrb_frozen_p(str) && (RSTR_SHARED_P(str) || RSTR_FSHARED_P(str))) { + /* no need to modify string */ + } + else { + mrb_str_modify(mrb, str); + } + RSTR_SET_LEN(str, slen-plen); + return self; +} + +/* + * call-seq: + * str.delete_suffix(suffix) -> new_str + * + * Returns a copy of <i>str</i> with leading <code>suffix</code> deleted. + * + * "hello".delete_suffix("hel") #=> "lo" + * "hello".delete_suffix("llo") #=> "hello" + */ +static mrb_value +mrb_str_del_suffix(mrb_state *mrb, mrb_value self) +{ + mrb_int plen, slen; + const char *ptr; + + mrb_get_args(mrb, "s", &ptr, &plen); + slen = RSTRING_LEN(self); + if (plen > slen) return mrb_str_dup(mrb, self); + if (memcmp(RSTRING_PTR(self)+slen-plen, ptr, plen) != 0) + return mrb_str_dup(mrb, self); + return mrb_str_substr(mrb, self, 0, slen-plen); +} + +#define lesser(a,b) (((a)>(b))?(b):(a)) + +/* + * call-seq: + * str.casecmp(other_str) -> -1, 0, +1 or nil + * + * Case-insensitive version of <code>String#<=></code>. + * + * "abcdef".casecmp("abcde") #=> 1 + * "aBcDeF".casecmp("abcdef") #=> 0 + * "abcdef".casecmp("abcdefg") #=> -1 + * "abcdef".casecmp("ABCDEF") #=> 0 + */ +static mrb_value +mrb_str_casecmp(mrb_state *mrb, mrb_value self) +{ + mrb_value str; + + mrb_get_args(mrb, "o", &str); + if (!mrb_string_p(str)) return mrb_nil_value(); + + struct RString *s1 = mrb_str_ptr(self); + struct RString *s2 = mrb_str_ptr(str); + mrb_int len = lesser(RSTR_LEN(s1), RSTR_LEN(s2)); + char *p1 = RSTR_PTR(s1); + char *p2 = RSTR_PTR(s2); + + for (mrb_int i=0; i<len; i++) { + int c1 = p1[i], c2 = p2[i]; + if (ISASCII(c1) && ISUPPER(c1)) c1 = TOLOWER(c1); + if (ISASCII(c2) && ISUPPER(c2)) c2 = TOLOWER(c2); + if (c1 > c2) return mrb_fixnum_value(1); + if (c1 < c2) return mrb_fixnum_value(-1); + } + if (RSTR_LEN(s1) == RSTR_LEN(s2)) return mrb_fixnum_value(0); + if (RSTR_LEN(s1) > RSTR_LEN(s2)) return mrb_fixnum_value(1); + return mrb_fixnum_value(-1); +} + +/* + * call-seq: + * str.casecmp?(other) -> true, false, or nil + * + * Returns true if str and other_str are equal after case folding, + * false if they are not equal, and nil if other is not a string. + */ +static mrb_value +mrb_str_casecmp_p(mrb_state *mrb, mrb_value self) +{ + mrb_value c = mrb_str_casecmp(mrb, self); + if (mrb_nil_p(c)) return c; + return mrb_bool_value(mrb_fixnum(c) == 0); +} + +static mrb_value +mrb_str_lines(mrb_state *mrb, mrb_value self) +{ + mrb_value result; + int ai; + mrb_int len; + char *b = RSTRING_PTR(self); + char *p = b, *t; + char *e = b + RSTRING_LEN(self); + + mrb->c->ci->mid = 0; + result = mrb_ary_new(mrb); + ai = mrb_gc_arena_save(mrb); while (p < e) { - ai = mrb_gc_arena_save(mrb); t = p; while (p < e && *p != '\n') p++; if (*p == '\n') p++; @@ -245,17 +1258,39 @@ mrb_mruby_string_ext_gem_init(mrb_state* mrb) struct RClass * s = mrb->string_class; mrb_define_method(mrb, s, "dump", mrb_str_dump, MRB_ARGS_NONE()); - mrb_define_method(mrb, s, "getbyte", mrb_str_getbyte, MRB_ARGS_REQ(1)); mrb_define_method(mrb, s, "swapcase!", mrb_str_swapcase_bang, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "swapcase", mrb_str_swapcase, MRB_ARGS_NONE()); - mrb_define_method(mrb, s, "concat", mrb_str_concat2, MRB_ARGS_REQ(1)); - mrb_define_method(mrb, s, "<<", mrb_str_concat2, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "concat", mrb_str_concat_m, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "<<", mrb_str_concat_m, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "count", mrb_str_count, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "tr", mrb_str_tr, MRB_ARGS_REQ(2)); + mrb_define_method(mrb, s, "tr!", mrb_str_tr_bang, MRB_ARGS_REQ(2)); + mrb_define_method(mrb, s, "tr_s", mrb_str_tr_s, MRB_ARGS_REQ(2)); + mrb_define_method(mrb, s, "tr_s!", mrb_str_tr_s_bang, MRB_ARGS_REQ(2)); + mrb_define_method(mrb, s, "squeeze", mrb_str_squeeze, MRB_ARGS_OPT(1)); + mrb_define_method(mrb, s, "squeeze!", mrb_str_squeeze_bang, MRB_ARGS_OPT(1)); + mrb_define_method(mrb, s, "delete", mrb_str_delete, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "delete!", mrb_str_delete_bang, MRB_ARGS_REQ(1)); mrb_define_method(mrb, s, "start_with?", mrb_str_start_with, MRB_ARGS_REST()); mrb_define_method(mrb, s, "end_with?", mrb_str_end_with, MRB_ARGS_REST()); mrb_define_method(mrb, s, "hex", mrb_str_hex, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "oct", mrb_str_oct, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "chr", mrb_str_chr, MRB_ARGS_NONE()); - mrb_define_method(mrb, s, "lines", mrb_str_lines, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "succ", mrb_str_succ, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "succ!", mrb_str_succ_bang, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "next", mrb_str_succ, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "next!", mrb_str_succ_bang, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "ord", mrb_str_ord, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "delete_prefix!", mrb_str_del_prefix_bang, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "delete_prefix", mrb_str_del_prefix, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "delete_suffix!", mrb_str_del_suffix_bang, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "delete_suffix", mrb_str_del_suffix, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "casecmp", mrb_str_casecmp, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, s, "casecmp?", mrb_str_casecmp_p, MRB_ARGS_REQ(1)); + + mrb_define_method(mrb, s, "__lines", mrb_str_lines, MRB_ARGS_NONE()); + + mrb_define_method(mrb, mrb_class_get(mrb, "Integer"), "chr", mrb_int_chr, MRB_ARGS_OPT(1)); } void |
