diff options
| author | KOBAYASHI Shuji <[email protected]> | 2019-07-23 20:16:46 +0900 |
|---|---|---|
| committer | KOBAYASHI Shuji <[email protected]> | 2019-07-23 20:16:46 +0900 |
| commit | e86aa61f203ec1589d37798ceb8b40385c7f85e0 (patch) | |
| tree | 59e08a25785f3a36b1a586e322dc9e023b004720 /mrbgems/mruby-string-ext/src | |
| parent | 5166e316efc9bd211b3d950d3586a45b22a5f9d5 (diff) | |
| download | mruby-e86aa61f203ec1589d37798ceb8b40385c7f85e0.tar.gz mruby-e86aa61f203ec1589d37798ceb8b40385c7f85e0.zip | |
Add encoding argument to `Integral#chr`
Currently, `Integral#chr` in mruby changes behavior by `MRB_UTF8_STRING`
setting.
before this patch:
$ bin/mruby -e 'p 171.chr' #=> "\xab" (`MRB_UTF8_STRING` is disabled)
$ bin/mruby -e 'p 171.chr' #=> "«" (`MRB_UTF8_STRING` is enabled)
This behavior is incompatible with Ruby, and a little inconvenient because
it can't be interpreted as ASCII-8BIT with `MRB_UTF8_STRING`, I think.
So add encoding argument according to Ruby.
after this patch:
$ bin/mruby -e 'p 171.chr' #=> "\xab"
$ bin/mruby -e 'p 171.chr("ASCII-8BIT")' #=> "\xab"
$ bin/mruby -e 'p 171.chr("UTF-8")' #=> "«"
Allow only `String` for encoding because mruby doesn't have `Encoding`
class, and `"ASCII-8BIT"` (`"BINARY"`) and `"UTF-8"` (only with
`MRB_UTF8_STRING`) are valid value (default is `"ASCII-8BIT"`).
Diffstat (limited to 'mrbgems/mruby-string-ext/src')
| -rw-r--r-- | mrbgems/mruby-string-ext/src/string.c | 166 |
1 files changed, 123 insertions, 43 deletions
diff --git a/mrbgems/mruby-string-ext/src/string.c b/mrbgems/mruby-string-ext/src/string.c index ab9919650..036a0a463 100644 --- a/mrbgems/mruby-string-ext/src/string.c +++ b/mrbgems/mruby-string-ext/src/string.c @@ -5,6 +5,91 @@ #include <mruby/string.h> #include <mruby/range.h> +#define ENC_ASCII_8BIT "ASCII-8BIT" +#define ENC_BINARY "BINARY" +#define ENC_UTF8 "UTF-8" + +#define ENC_COMP_P(enc, enc_lit) \ + str_casecmp_p(RSTRING_PTR(enc), RSTRING_LEN(enc), enc_lit, sizeof(enc_lit"")-1) + +#ifdef MRB_WITHOUT_FLOAT +# define mrb_float_p(o) FALSE +#endif + +static mrb_bool +str_casecmp_p(const char *s1, mrb_int len1, const char *s2, mrb_int len2) +{ + const char *e1, *e2; + + if (len1 != len2) return FALSE; + e1 = s1 + len1; + e2 = s2 + len2; + while (s1 < e1 && s2 < e2) { + if (*s1 != *s2 && TOUPPER(*s1) != TOUPPER(*s2)) return FALSE; + ++s1; + ++s2; + } + return TRUE; +} + +static mrb_value +int_chr_binary(mrb_state *mrb, mrb_value num) +{ + mrb_int cp = mrb_int(mrb, num); + char c; + mrb_value str; + + if (cp < 0 || 0xff < cp) { + mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num); + } + c = (char)cp; + str = mrb_str_new(mrb, &c, 1); + RSTR_SET_ASCII_FLAG(mrb_str_ptr(str)); + return str; +} + +#ifdef MRB_UTF8_STRING +static mrb_value +int_chr_utf8(mrb_state *mrb, mrb_value num) +{ + mrb_int cp = mrb_int(mrb, num); + char utf8[4]; + mrb_int len; + mrb_value str; + uint32_t ascii_flag = 0; + + if (cp < 0 || 0x10FFFF < cp) { + mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num); + } + if (cp < 0x80) { + utf8[0] = (char)cp; + len = 1; + ascii_flag = MRB_STR_ASCII; + } + else if (cp < 0x800) { + utf8[0] = (char)(0xC0 | (cp >> 6)); + utf8[1] = (char)(0x80 | (cp & 0x3F)); + len = 2; + } + else if (cp < 0x10000) { + utf8[0] = (char)(0xE0 | (cp >> 12)); + utf8[1] = (char)(0x80 | ((cp >> 6) & 0x3F)); + utf8[2] = (char)(0x80 | ( cp & 0x3F)); + len = 3; + } + else { + utf8[0] = (char)(0xF0 | (cp >> 18)); + utf8[1] = (char)(0x80 | ((cp >> 12) & 0x3F)); + utf8[2] = (char)(0x80 | ((cp >> 6) & 0x3F)); + utf8[3] = (char)(0x80 | ( cp & 0x3F)); + len = 4; + } + str = mrb_str_new(mrb, utf8, len); + mrb_str_ptr(str)->flags |= ascii_flag; + return str; +} +#endif + static mrb_value mrb_str_getbyte(mrb_state *mrb, mrb_value str) { @@ -125,8 +210,6 @@ mrb_str_swapcase(mrb_state *mrb, mrb_value self) return str; } -static mrb_value mrb_int_chr(mrb_state *mrb, mrb_value num); - /* * call-seq: * str << integer -> str @@ -136,7 +219,8 @@ static mrb_value mrb_int_chr(mrb_state *mrb, mrb_value num); * * Append---Concatenates the given object to <i>str</i>. If the object is a * <code>Integer</code>, it is considered as a codepoint, and is converted - * to a character before concatenation. + * to a character before concatenation + * (equivalent to <code>str.concat(integer.chr(__ENCODING__))</code>). * * a = "hello " * a << "world" #=> "hello world" @@ -148,8 +232,12 @@ mrb_str_concat_m(mrb_state *mrb, mrb_value self) mrb_value str; mrb_get_args(mrb, "o", &str); - if (mrb_fixnum_p(str)) - str = mrb_int_chr(mrb, str); + if (mrb_fixnum_p(str) || mrb_float_p(str)) +#ifdef MRB_UTF8_STRING + str = int_chr_utf8(mrb, str); +#else + str = int_chr_binary(mrb, str); +#endif else str = mrb_ensure_string_type(mrb, str); mrb_str_concat(mrb, self, str); @@ -800,7 +888,7 @@ mrb_str_count(mrb_state *mrb, mrb_value str) tr_parse_pattern(mrb, &pat, v_pat, TRUE); tr_compile_pattern(&pat, v_pat, bitmap); tr_free_pattern(mrb, &pat); - + s = RSTRING_PTR(str); len = RSTRING_LEN(str); for (i = 0; i < len; i++) { @@ -836,49 +924,40 @@ mrb_str_chr(mrb_state *mrb, mrb_value self) return mrb_str_substr(mrb, self, 0, 1); } +/* + * call-seq: + * int.chr([encoding]) -> string + * + * Returns a string containing the character represented by the +int+'s value + * according to +encoding+. +"ASCII-8BIT"+ (+"BINARY"+) and +"UTF-8"+ (only + * with +MRB_UTF8_STRING+) can be specified as +encoding+ (default is + * +"ASCII-8BIT"+). + * + * 65.chr #=> "A" + * 230.chr #=> "\xE6" + * 230.chr("ASCII-8BIT") #=> "\xE6" + * 230.chr("UTF-8") #=> "\u00E6" + */ static mrb_value mrb_int_chr(mrb_state *mrb, mrb_value num) { - mrb_int cp = mrb_fixnum(num); -#ifdef MRB_UTF8_STRING - char utf8[4]; - mrb_int len; - - if (cp < 0 || 0x10FFFF < cp) { - mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num); - } - if (cp < 0x80) { - utf8[0] = (char)cp; - len = 1; + mrb_value enc; + mrb_bool enc_given; + + mrb_get_args(mrb, "|S?", &enc, &enc_given); + if (!enc_given || + ENC_COMP_P(enc, ENC_ASCII_8BIT) || + ENC_COMP_P(enc, ENC_BINARY)) { + return int_chr_binary(mrb, num); } - else if (cp < 0x800) { - utf8[0] = (char)(0xC0 | (cp >> 6)); - utf8[1] = (char)(0x80 | (cp & 0x3F)); - len = 2; - } - else if (cp < 0x10000) { - utf8[0] = (char)(0xE0 | (cp >> 12)); - utf8[1] = (char)(0x80 | ((cp >> 6) & 0x3F)); - utf8[2] = (char)(0x80 | ( cp & 0x3F)); - len = 3; +#ifdef MRB_UTF8_STRING + else if (ENC_COMP_P(enc, ENC_UTF8)) { + return int_chr_utf8(mrb, num); } +#endif else { - utf8[0] = (char)(0xF0 | (cp >> 18)); - utf8[1] = (char)(0x80 | ((cp >> 12) & 0x3F)); - utf8[2] = (char)(0x80 | ((cp >> 6) & 0x3F)); - utf8[3] = (char)(0x80 | ( cp & 0x3F)); - len = 4; - } - return mrb_str_new(mrb, utf8, len); -#else - char c; - - if (cp < 0 || 0xff < cp) { - mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num); + mrb_raisef(mrb, E_ARGUMENT_ERROR, "unknown encoding name - %S", enc); } - c = (char)cp; - return mrb_str_new(mrb, &c, 1); -#endif } /* @@ -1219,7 +1298,8 @@ mrb_mruby_string_ext_gem_init(mrb_state* mrb) mrb_define_method(mrb, s, "delete_suffix", mrb_str_del_suffix, MRB_ARGS_REQ(1)); mrb_define_method(mrb, s, "__lines", mrb_str_lines, MRB_ARGS_NONE()); - mrb_define_method(mrb, mrb_module_get(mrb, "Integral"), "chr", mrb_int_chr, MRB_ARGS_NONE()); + + mrb_define_method(mrb, mrb_module_get(mrb, "Integral"), "chr", mrb_int_chr, MRB_ARGS_OPT(1)); } void |
