summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Yukihiro "Matz" Matsumoto <[email protected]>, 2018-09-20 15:13:13 +0900
committer: Yukihiro "Matz" Matsumoto <[email protected]>, 2018-09-21 00:03:07 +0900
commit: 58ba38fe1e11828190596b44e0789dd8a0607ff3 (patch)
tree: 8325ab9b013efd4330c3446f19890dc7fc3635bd
parent: f23f2bbdad7a15bec8812b029cb23c2117d7c63c (diff)
download: mruby-58ba38fe1e11828190596b44e0789dd8a0607ff3.tar.gz
mruby-58ba38fe1e11828190596b44e0789dd8a0607ff3.zip
Add `String#tr` and `#tr!` to `mruby-string-ext` gem; fix #4086
This patch is based on `mruby/c` implementation by Hirohito Higashi. We might need to add `#tr_s`, `#squeeze` and `#delete` as well. Adding them should not be too hard using functions we implemented here.
-rw-r--r-- mrbgems/mruby-string-ext/src/string.c 261
-rw-r--r-- mrbgems/mruby-string-ext/test/string.rb 13
2 files changed, 274 insertions, 0 deletions
diff --git a/mrbgems/mruby-string-ext/src/string.c b/mrbgems/mruby-string-ext/src/string.c
index 142d449f4..9d318cb1a 100644
--- a/mrbgems/mruby-string-ext/src/string.c
+++ b/mrbgems/mruby-string-ext/src/string.c
@@ -235,6 +235,265 @@ mrb_str_end_with(mrb_state *mrb, mrb_value self)
return mrb_false_value();
}
+/*
+ #tr Pattern syntax
+
+ <syntax> ::= (<pattern>)* | '^' (<pattern>)*
+ <pattern> ::= <in order> | <range>
+ <in order> ::= (<ch>)+
+ <range> ::= <ch> '-' <ch>
+*/
+struct tr_pattern { /* one node of the linked list produced by tr_parse_pattern() */
+ uint8_t type; // 1:in-order, 2:range
+ mrb_bool flag_reverse; /* TRUE when the whole pattern began with '^' (negated set) */
+ int16_t n; /* in-order: number of bytes stored in ch[]; range: ch[1] - ch[0] + 1 */
+ struct tr_pattern *next; /* next node in the list, or NULL for the last node */
+ char ch[]; /* in-order: the n pattern bytes; range: ch[0] = first, ch[1] = last */
+};
+
+/*
+ * Releases every node of a pattern list with mrb_free().
+ * Passing NULL (an empty list) does nothing.
+ */
+static void
+tr_pattern_free(mrb_state *mrb, struct tr_pattern *pat)
+{
+  struct tr_pattern *node;
+  struct tr_pattern *following;
+
+  for (node = pat; node != NULL; node = following) {
+    /* read the link first: the node is gone once it has been freed */
+    following = node->next;
+    mrb_free(mrb, node);
+  }
+}
+
+/*
+ * Parses a #tr pattern string into a singly linked list of tr_pattern nodes.
+ *
+ * A sequence "c1-c2" (c1 is not a backslash, and a character follows the
+ * '-') becomes a range node (type 2); every other stretch of bytes becomes
+ * an in-order node (type 1).  When flag_reverse_enable is TRUE, the pattern
+ * is at least two bytes long and starts with '^', the '^' is consumed and
+ * every node is marked with flag_reverse.
+ *
+ * Returns the head of the list, or NULL when the pattern yields no nodes
+ * (empty string).  The caller releases the list with tr_pattern_free().
+ * If an allocation fails, the nodes built so far are freed and
+ * mrb->nomem_err is raised.
+ *
+ * NOTE(review): `n` is an int16_t, so an in-order run longer than 32767
+ * bytes, or a descending range such as "z-a", stores a wrapped or
+ * non-positive count -- confirm whether such input needs rejecting.
+ */
+static struct tr_pattern*
+tr_parse_pattern(mrb_state *mrb, const mrb_value v_pattern, mrb_bool flag_reverse_enable)
+{
+  const char *pattern = RSTRING_PTR(v_pattern);
+  int pattern_length = RSTRING_LEN(v_pattern);
+  mrb_bool flag_reverse = FALSE;
+  struct tr_pattern *ret = NULL;
+  struct tr_pattern *tail = NULL;   /* last node, so appending is O(1) */
+  struct tr_pattern *pat1;
+  int i = 0;
+
+  if (flag_reverse_enable && pattern_length >= 2 && pattern[0] == '^') {
+    flag_reverse = TRUE;
+    i++;
+  }
+
+  while (i < pattern_length) {
+    /* is range pattern ? */
+    if ((i+2) < pattern_length && pattern[i] != '\\' && pattern[i+1] == '-') {
+      pat1 = (struct tr_pattern*)mrb_malloc_simple(mrb, sizeof(struct tr_pattern) + 2);
+      /* must be checked even for the first node, when ret is still NULL */
+      if (pat1 == NULL) goto nomem;
+      pat1->type = 2;
+      pat1->flag_reverse = flag_reverse;
+      pat1->n = pattern[i+2] - pattern[i] + 1;
+      pat1->next = NULL;
+      pat1->ch[0] = pattern[i];
+      pat1->ch[1] = pattern[i+2];
+      i += 3;
+    }
+    else {
+      /* in order pattern. */
+      int start_pos = i++;
+      int len;
+
+      /* extend the run until the start of the next range (or the end) */
+      while (i < pattern_length) {
+        if ((i+2) < pattern_length && pattern[i] != '\\' && pattern[i+1] == '-')
+          break;
+        i++;
+      }
+
+      len = i - start_pos;
+      pat1 = (struct tr_pattern*)mrb_malloc_simple(mrb, sizeof(struct tr_pattern) + len);
+      if (pat1 == NULL) goto nomem;
+      pat1->type = 1;
+      pat1->flag_reverse = flag_reverse;
+      pat1->n = len;
+      pat1->next = NULL;
+      memcpy(pat1->ch, &pattern[start_pos], len);
+    }
+
+    /* append the new node to the list */
+    if (tail == NULL) {
+      ret = pat1;
+    }
+    else {
+      tail->next = pat1;
+    }
+    tail = pat1;
+  }
+
+  return ret;
+
+nomem:
+  tr_pattern_free(mrb, ret);   /* no-op when nothing was built yet */
+  mrb_exc_raise(mrb, mrb_obj_value(mrb->nomem_err));
+  return NULL;                  /* not reached */
+}
+
+/*
+ * Looks up `ch` in the pattern list `pat`.
+ *
+ * For a normal pattern the result is the position of `ch` within the
+ * expanded pattern (the last match wins), or -1 when `ch` is absent.
+ * For a negated pattern (flag_reverse set on the first node) the result is
+ * MRB_INT_MAX when `ch` is absent from the list and -1 when it is present.
+ */
+static mrb_int
+tr_find_character(const struct tr_pattern *pat, int ch)
+{
+  const struct tr_pattern *node;
+  mrb_int found = -1;
+  mrb_int base = 0;      /* expanded length of all nodes visited so far */
+  mrb_int negated = 0;
+
+  if (pat != NULL) {
+    negated = pat->flag_reverse;
+  }
+
+  for (node = pat; node != NULL; node = node->next) {
+    if (node->type != 1) {
+      /* range node: ch[0]..ch[1] inclusive */
+      if (ch >= node->ch[0] && ch <= node->ch[1]) {
+        found = base + ch - node->ch[0];
+      }
+    }
+    else {
+      /* in-order node: scan all n bytes, keeping the last hit */
+      int k = 0;
+      while (k < node->n) {
+        if (node->ch[k] == ch) {
+          found = base + k;
+        }
+        k++;
+      }
+    }
+    base += node->n;
+  }
+
+  if (!negated) {
+    return found;
+  }
+  if (found < 0) {
+    return MRB_INT_MAX;
+  }
+  return -1;
+}
+
+/*
+ * Returns the character at position `n_th` of the expanded pattern list.
+ * A position past the end yields the final character of the last node,
+ * which is how a short replacement list is padded.  Returns -1 for an
+ * empty list.
+ */
+static mrb_int
+tr_get_character(const struct tr_pattern *pat, mrb_int n_th)
+{
+  const struct tr_pattern *node;
+  mrb_int base = 0;   /* expanded length of all nodes before `node` */
+
+  for (node = pat; node != NULL; node = node->next) {
+    mrb_int limit = base + node->n;
+
+    if (n_th < limit) {
+      mrb_int offset = n_th - base;
+
+      if (node->type == 1) {
+        return node->ch[offset];
+      }
+      return node->ch[0] + offset;
+    }
+    if (node->next == NULL) {
+      /* ran off the end: repeat the last character */
+      if (node->type == 1) {
+        return node->ch[node->n - 1];
+      }
+      return node->ch[1];
+    }
+    base = limit;
+  }
+
+  return -1;
+}
+
+/*
+ * Translates `str` in place: every byte matched by pattern `p1` is replaced
+ * by the corresponding character of pattern `p2`, or removed when `p2`
+ * parses to an empty list.
+ *
+ * Returns TRUE only if the string content actually changed (a byte was
+ * removed, or replaced by a different byte).  Raises ArgumentError when a
+ * replacement character falls outside the ASCII range 0..0x7f; both parsed
+ * patterns are released before raising.
+ *
+ * NOTE(review): if parsing `p2` raises (out of memory), the list already
+ * built for `p1` is not released -- confirm whether that needs protecting.
+ */
+static mrb_bool
+str_tr(mrb_state *mrb, mrb_value str, mrb_value p1, mrb_value p2)
+{
+  struct tr_pattern *pat;
+  struct tr_pattern *rep;
+  char *s;
+  mrb_int len;
+  mrb_int i;
+  mrb_int j = 0;   /* write position; trails i once bytes have been removed */
+  mrb_bool flag_changed = FALSE;
+
+  mrb_str_modify(mrb, mrb_str_ptr(str));
+  pat = tr_parse_pattern(mrb, p1, TRUE);
+  rep = tr_parse_pattern(mrb, p2, FALSE);
+  s = RSTRING_PTR(str);
+  len = RSTRING_LEN(str);
+
+  for (i = 0; i < len; i++) {
+    mrb_int n = tr_find_character(pat, s[i]);
+    mrb_int c;
+
+    if (n < 0) {
+      /* not matched: keep the byte */
+      s[j++] = s[i];
+      continue;
+    }
+    if (rep == NULL) {
+      /* empty replacement: drop the byte */
+      flag_changed = TRUE;
+      continue;
+    }
+
+    c = tr_get_character(rep, n);
+    if (c < 0 || c > 0x7f) {
+      /* mrb_raisef does not return, so release the patterns first */
+      tr_pattern_free(mrb, pat);
+      tr_pattern_free(mrb, rep);
+      mrb_raisef(mrb, E_ARGUMENT_ERROR, "character (%S) out of range",
+                 mrb_fixnum_value((mrb_int)c));
+    }
+    if (s[i] != c) {
+      flag_changed = TRUE;
+    }
+    s[j++] = (char)c;
+  }
+  len = j;
+
+  tr_pattern_free(mrb, pat);
+  tr_pattern_free(mrb, rep);
+
+  RSTR_SET_LEN(RSTRING(str), len);
+  RSTRING_PTR(str)[len] = 0;
+
+  return flag_changed;
+}
+
+/*
+ * call-seq:
+ * str.tr(from_str, to_str) => new_str
+ *
+ * Returns a copy of str with the characters in from_str replaced by the
+ * corresponding characters in to_str. If to_str is shorter than from_str,
+ * it is padded with its last character in order to maintain the
+ * correspondence.
+ *
+ * "hello".tr('el', 'ip') #=> "hippo"
+ * "hello".tr('aeiou', '*') #=> "h*ll*"
+ * "hello".tr('aeiou', 'AA*') #=> "hAll*"
+ *
+ * Both strings may use the c1-c2 notation to denote ranges of characters,
+ * and from_str may start with a ^, which denotes all characters except
+ * those listed.
+ *
+ * "hello".tr('a-y', 'b-z') #=> "ifmmp"
+ * "hello".tr('^aeiou', '*') #=> "*e**o"
+ *
+ * The backslash character \ can be used to escape ^ or - and is otherwise
+ * ignored unless it appears at the end of a range or the end of the
+ * from_str or to_str:
+ *
+ *
+ * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
+ * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
+ *
+ * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
+ * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
+ * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
+ *
+ * "X['\\b']".tr("X\\", "") #=> "['b']"
+ * "X['\\b']".tr("X-\\]", "") #=> "'b'"
+ *
+ * Note: conversion is effective only in ASCII region.
+ */
+static mrb_value
+mrb_str_tr(mrb_state *mrb, mrb_value str)
+{
+  mrb_value from_str, to_str;
+  mrb_value copy;
+
+  /* both arguments must be Strings */
+  mrb_get_args(mrb, "SS", &from_str, &to_str);
+
+  /* translate a duplicate so the receiver is left untouched */
+  copy = mrb_str_dup(mrb, str);
+  (void)str_tr(mrb, copy, from_str, to_str);
+
+  return copy;
+}
+
+/*
+ * call-seq:
+ * str.tr!(from_str, to_str) -> str or nil
+ *
+ * Translates str in place, using the same rules as String#tr.
+ * Returns str, or nil if no changes were made.
+ */
+static mrb_value
+mrb_str_tr_bang(mrb_state *mrb, mrb_value str)
+{
+  mrb_value from_str, to_str;
+  mrb_bool changed;
+
+  /* both arguments must be Strings */
+  mrb_get_args(mrb, "SS", &from_str, &to_str);
+
+  /* translate the receiver itself; nil signals "nothing changed" */
+  changed = str_tr(mrb, str, from_str, to_str);
+  return changed ? str : mrb_nil_value();
+}
+
static mrb_value
mrb_str_hex(mrb_state *mrb, mrb_value self)
{
@@ -620,6 +879,8 @@ mrb_mruby_string_ext_gem_init(mrb_state* mrb)
mrb_define_method(mrb, s, "swapcase", mrb_str_swapcase, MRB_ARGS_NONE());
mrb_define_method(mrb, s, "concat", mrb_str_concat_m, MRB_ARGS_REQ(1));
mrb_define_method(mrb, s, "<<", mrb_str_concat_m, MRB_ARGS_REQ(1));
+ mrb_define_method(mrb, s, "tr", mrb_str_tr, MRB_ARGS_REQ(2));
+ mrb_define_method(mrb, s, "tr!", mrb_str_tr_bang, MRB_ARGS_REQ(2));
mrb_define_method(mrb, s, "start_with?", mrb_str_start_with, MRB_ARGS_REST());
mrb_define_method(mrb, s, "end_with?", mrb_str_end_with, MRB_ARGS_REST());
mrb_define_method(mrb, s, "hex", mrb_str_hex, MRB_ARGS_NONE());
diff --git a/mrbgems/mruby-string-ext/test/string.rb b/mrbgems/mruby-string-ext/test/string.rb
index b6146fb90..6b8a89c4d 100644
--- a/mrbgems/mruby-string-ext/test/string.rb
+++ b/mrbgems/mruby-string-ext/test/string.rb
@@ -147,6 +147,19 @@ assert('String#casecmp') do
assert_equal 0, "abcdef".casecmp(o)
end
+assert('String#tr') do
+ assert_equal "ABC", "abc".tr('a-z', 'A-Z') # range mapped onto range
+ assert_equal "hippo", "hello".tr('el', 'ip') # in-order lists, one-to-one
+ assert_equal "Ruby", "Lisp".tr("Lisp", "Ruby") # every character replaced
+ assert_equal "*e**o", "hello".tr('^aeiou', '*') # leading ^ negates the set
+end
+
+assert('String#tr!') do
+ s = "abcdefghijklmnopqR"
+ assert_equal "ab12222hijklmnopqR", s.tr!("cdefg", "12") # short to_str is padded with its last character
+ assert_equal "ab12222hijklmnopqR", s # the receiver itself was modified
+end
+
assert('String#start_with?') do
assert_true "hello".start_with?("heaven", "hell")
assert_true !"hello".start_with?("heaven", "paradise")