diff options
| author | Yukihiro "Matz" Matsumoto <[email protected]> | 2016-12-01 18:42:40 +0900 |
|---|---|---|
| committer | GitHub <[email protected]> | 2016-12-01 18:42:40 +0900 |
| commit | f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f (patch) | |
| tree | f7af4f06597947fd8e5d31307bca554414a23234 | |
| parent | 4d807fc619e7feccd2bc079ef76a9e817e36fa59 (diff) | |
| parent | 0f08914ac0d433545a4224ee1c3f8d3eb8d51e68 (diff) | |
| download | mruby-f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f.tar.gz mruby-f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f.zip | |
Merge pull request #3312 from nobu/feature/multi-unicode-escape
Feature/multi unicode escape
| -rw-r--r-- | mrbgems/mruby-compiler/core/parse.y | 102 | ||||
| -rw-r--r-- | test/t/unicode.rb | 32 |
2 files changed, 77 insertions, 57 deletions
```diff
diff --git a/mrbgems/mruby-compiler/core/parse.y b/mrbgems/mruby-compiler/core/parse.y
index ef522d239..f0c45b85b 100644
--- a/mrbgems/mruby-compiler/core/parse.y
+++ b/mrbgems/mruby-compiler/core/parse.y
@@ -3752,6 +3752,44 @@ scan_hex(const int *start, int len, int *retlen)
   return retval;
 }
 
+static int32_t
+read_escape_unicode(parser_state *p, size_t limit)
+{
+  int32_t c;
+  int buf[9];
+  int i;
+
+  /* Look for opening brace */
+  i = 0;
+  buf[0] = nextc(p);
+  if (buf[0] < 0) goto eof;
+  if (ISXDIGIT(buf[0])) {
+    /* \uxxxx form */
+    for (i=1; i<limit; i++) {
+      buf[i] = nextc(p);
+      if (buf[i] < 0) goto eof;
+      if (!ISXDIGIT(buf[i])) {
+        pushback(p, buf[i]);
+        break;
+      }
+    }
+  }
+  else {
+    pushback(p, buf[0]);
+  }
+  c = scan_hex(buf, i, &i);
+  if (i == 0) {
+  eof:
+    yyerror(p, "Invalid escape character syntax");
+    return -1;
+  }
+  if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {
+    yyerror(p, "Invalid Unicode code point");
+    return -1;
+  }
+  return c;
+}
+
 /* Return negative to indicate Unicode code point */
 static int32_t
 read_escape(parser_state *p)
@@ -3824,53 +3862,17 @@ read_escape(parser_state *p)
     return c;
 
   case 'u': /* Unicode */
-    {
-      int buf[9];
-      int i;
-
-      /* Look for opening brace */
-      i = 0;
-      buf[0] = nextc(p);
-      if (buf[0] < 0) goto eof;
-      if (buf[0] == '{') {
+      if (peek(p, '{')) {
         /* \u{xxxxxxxx} form */
-        for (i=0; i<9; i++) {
-          buf[i] = nextc(p);
-          if (buf[i] < 0) goto eof;
-          if (buf[i] == '}') {
-            break;
-          }
-          else if (!ISXDIGIT(buf[i])) {
-            yyerror(p, "Invalid escape character syntax");
-            pushback(p, buf[i]);
-            return 0;
-          }
-        }
-      }
-      else if (ISXDIGIT(buf[0])) {
-        /* \uxxxx form */
-        for (i=1; i<4; i++) {
-          buf[i] = nextc(p);
-          if (buf[i] < 0) goto eof;
-          if (!ISXDIGIT(buf[i])) {
-            pushback(p, buf[i]);
-            break;
-          }
-        }
+        nextc(p);
+        c = read_escape_unicode(p, 8);
+        if (c < 0) return 0;
+        if (nextc(p) != '}') goto eof;
       }
       else {
-        pushback(p, buf[0]);
-      }
-      c = scan_hex(buf, i, &i);
-      if (i == 0) {
-        yyerror(p, "Invalid escape character syntax");
-        return 0;
+        c = read_escape_unicode(p, 4);
+        if (c < 0) return 0;
       }
-      if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {
-        yyerror(p, "Invalid Unicode code point");
-        return 0;
-      }
-    }
     return -c;
 
   case 'b':/* backspace */
@@ -3993,6 +3995,20 @@ parse_string(parser_state *p)
           tokadd(p, '\\');
           tokadd(p, c);
         }
+        else if (c == 'u' && peek(p, '{')) {
+          /* \u{xxxx xxxx xxxx} form */
+          nextc(p);
+          while (1) {
+            do c = nextc(p); while (ISSPACE(c));
+            if (c == '}') break;
+            pushback(p, c);
+            c = read_escape_unicode(p, 8);
+            if (c < 0) break;
+            tokadd(p, -c);
+          }
+          if (hinf)
+            hinf->line_head = FALSE;
+        }
         else {
           pushback(p, c);
           tokadd(p, read_escape(p));
diff --git a/test/t/unicode.rb b/test/t/unicode.rb
index 7edd65ef2..8622ae08a 100644
--- a/test/t/unicode.rb
+++ b/test/t/unicode.rb
@@ -2,34 +2,38 @@
 
 assert('bare \u notation test') do
   # Mininum and maximum one byte characters
-  assert_equal("\u0000", "\x00")
-  assert_equal("\u007F", "\x7F")
+  assert_equal("\x00", "\u0000")
+  assert_equal("\x7F", "\u007F")
 
   # Mininum and maximum two byte characters
-  assert_equal("\u0080", "\xC2\x80")
-  assert_equal("\u07FF", "\xDF\xBF")
+  assert_equal("\xC2\x80", "\u0080")
+  assert_equal("\xDF\xBF", "\u07FF")
 
   # Mininum and maximum three byte characters
-  assert_equal("\u0800", "\xE0\xA0\x80")
-  assert_equal("\uFFFF", "\xEF\xBF\xBF")
+  assert_equal("\xE0\xA0\x80", "\u0800")
+  assert_equal("\xEF\xBF\xBF", "\uFFFF")
 
   # Four byte characters require the \U notation
 end
 
 assert('braced \u notation test') do
   # Mininum and maximum one byte characters
-  assert_equal("\u{0000}", "\x00")
-  assert_equal("\u{007F}", "\x7F")
+  assert_equal("\x00", "\u{0000}")
+  assert_equal("\x7F", "\u{007F}")
 
   # Mininum and maximum two byte characters
-  assert_equal("\u{0080}", "\xC2\x80")
-  assert_equal("\u{07FF}", "\xDF\xBF")
+  assert_equal("\xC2\x80", "\u{0080}")
+  assert_equal("\xDF\xBF", "\u{07FF}")
 
   # Mininum and maximum three byte characters
-  assert_equal("\u{0800}", "\xE0\xA0\x80")
-  assert_equal("\u{FFFF}", "\xEF\xBF\xBF")
+  assert_equal("\xE0\xA0\x80", "\u{0800}")
+  assert_equal("\xEF\xBF\xBF", "\u{FFFF}")
 
   # Mininum and maximum four byte characters
-  assert_equal("\u{10000}", "\xF0\x90\x80\x80")
-  assert_equal("\u{10FFFF}", "\xF4\x8F\xBF\xBF")
+  assert_equal("\xF0\x90\x80\x80", "\u{10000}")
+  assert_equal("\xF4\x8F\xBF\xBF", "\u{10FFFF}")
+end
+
+assert('braced multiple \u notation test') do
+  assert_equal("ABC", "\u{41 42 43}")
 end
```
