summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorYukihiro "Matz" Matsumoto <[email protected]>2016-12-01 18:42:40 +0900
committerGitHub <[email protected]>2016-12-01 18:42:40 +0900
commitf1cf6ef8179313b31a12ed4e5eba509d3b2aed0f (patch)
treef7af4f06597947fd8e5d31307bca554414a23234
parent4d807fc619e7feccd2bc079ef76a9e817e36fa59 (diff)
parent0f08914ac0d433545a4224ee1c3f8d3eb8d51e68 (diff)
downloadmruby-f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f.tar.gz
mruby-f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f.zip
Merge pull request #3312 from nobu/feature/multi-unicode-escape
Feature/multi unicode escape
-rw-r--r--mrbgems/mruby-compiler/core/parse.y102
-rw-r--r--test/t/unicode.rb32
2 files changed, 77 insertions, 57 deletions
diff --git a/mrbgems/mruby-compiler/core/parse.y b/mrbgems/mruby-compiler/core/parse.y
index ef522d239..f0c45b85b 100644
--- a/mrbgems/mruby-compiler/core/parse.y
+++ b/mrbgems/mruby-compiler/core/parse.y
@@ -3752,6 +3752,44 @@ scan_hex(const int *start, int len, int *retlen)
return retval;
}
+static int32_t
+read_escape_unicode(parser_state *p, size_t limit)
+{
+ int32_t c;
+ int buf[9];
+ int i;
+
+ /* Look for opening brace */
+ i = 0;
+ buf[0] = nextc(p);
+ if (buf[0] < 0) goto eof;
+ if (ISXDIGIT(buf[0])) {
+ /* \uxxxx form */
+ for (i=1; i<limit; i++) {
+ buf[i] = nextc(p);
+ if (buf[i] < 0) goto eof;
+ if (!ISXDIGIT(buf[i])) {
+ pushback(p, buf[i]);
+ break;
+ }
+ }
+ }
+ else {
+ pushback(p, buf[0]);
+ }
+ c = scan_hex(buf, i, &i);
+ if (i == 0) {
+ eof:
+ yyerror(p, "Invalid escape character syntax");
+ return -1;
+ }
+ if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {
+ yyerror(p, "Invalid Unicode code point");
+ return -1;
+ }
+ return c;
+}
+
/* Return negative to indicate Unicode code point */
static int32_t
read_escape(parser_state *p)
@@ -3824,53 +3862,17 @@ read_escape(parser_state *p)
return c;
case 'u': /* Unicode */
- {
- int buf[9];
- int i;
-
- /* Look for opening brace */
- i = 0;
- buf[0] = nextc(p);
- if (buf[0] < 0) goto eof;
- if (buf[0] == '{') {
+ if (peek(p, '{')) {
/* \u{xxxxxxxx} form */
- for (i=0; i<9; i++) {
- buf[i] = nextc(p);
- if (buf[i] < 0) goto eof;
- if (buf[i] == '}') {
- break;
- }
- else if (!ISXDIGIT(buf[i])) {
- yyerror(p, "Invalid escape character syntax");
- pushback(p, buf[i]);
- return 0;
- }
- }
- }
- else if (ISXDIGIT(buf[0])) {
- /* \uxxxx form */
- for (i=1; i<4; i++) {
- buf[i] = nextc(p);
- if (buf[i] < 0) goto eof;
- if (!ISXDIGIT(buf[i])) {
- pushback(p, buf[i]);
- break;
- }
- }
+ nextc(p);
+ c = read_escape_unicode(p, 8);
+ if (c < 0) return 0;
+ if (nextc(p) != '}') goto eof;
}
else {
- pushback(p, buf[0]);
- }
- c = scan_hex(buf, i, &i);
- if (i == 0) {
- yyerror(p, "Invalid escape character syntax");
- return 0;
+ c = read_escape_unicode(p, 4);
+ if (c < 0) return 0;
}
- if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {
- yyerror(p, "Invalid Unicode code point");
- return 0;
- }
- }
return -c;
case 'b':/* backspace */
@@ -3993,6 +3995,20 @@ parse_string(parser_state *p)
tokadd(p, '\\');
tokadd(p, c);
}
+ else if (c == 'u' && peek(p, '{')) {
+ /* \u{xxxx xxxx xxxx} form */
+ nextc(p);
+ while (1) {
+ do c = nextc(p); while (ISSPACE(c));
+ if (c == '}') break;
+ pushback(p, c);
+ c = read_escape_unicode(p, 8);
+ if (c < 0) break;
+ tokadd(p, -c);
+ }
+ if (hinf)
+ hinf->line_head = FALSE;
+ }
else {
pushback(p, c);
tokadd(p, read_escape(p));
diff --git a/test/t/unicode.rb b/test/t/unicode.rb
index 7edd65ef2..8622ae08a 100644
--- a/test/t/unicode.rb
+++ b/test/t/unicode.rb
@@ -2,34 +2,38 @@
assert('bare \u notation test') do
# Mininum and maximum one byte characters
- assert_equal("\u0000", "\x00")
- assert_equal("\u007F", "\x7F")
+ assert_equal("\x00", "\u0000")
+ assert_equal("\x7F", "\u007F")
# Mininum and maximum two byte characters
- assert_equal("\u0080", "\xC2\x80")
- assert_equal("\u07FF", "\xDF\xBF")
+ assert_equal("\xC2\x80", "\u0080")
+ assert_equal("\xDF\xBF", "\u07FF")
# Mininum and maximum three byte characters
- assert_equal("\u0800", "\xE0\xA0\x80")
- assert_equal("\uFFFF", "\xEF\xBF\xBF")
+ assert_equal("\xE0\xA0\x80", "\u0800")
+ assert_equal("\xEF\xBF\xBF", "\uFFFF")
# Four byte characters require the \U notation
end
assert('braced \u notation test') do
# Mininum and maximum one byte characters
- assert_equal("\u{0000}", "\x00")
- assert_equal("\u{007F}", "\x7F")
+ assert_equal("\x00", "\u{0000}")
+ assert_equal("\x7F", "\u{007F}")
# Mininum and maximum two byte characters
- assert_equal("\u{0080}", "\xC2\x80")
- assert_equal("\u{07FF}", "\xDF\xBF")
+ assert_equal("\xC2\x80", "\u{0080}")
+ assert_equal("\xDF\xBF", "\u{07FF}")
# Mininum and maximum three byte characters
- assert_equal("\u{0800}", "\xE0\xA0\x80")
- assert_equal("\u{FFFF}", "\xEF\xBF\xBF")
+ assert_equal("\xE0\xA0\x80", "\u{0800}")
+ assert_equal("\xEF\xBF\xBF", "\u{FFFF}")
# Mininum and maximum four byte characters
- assert_equal("\u{10000}", "\xF0\x90\x80\x80")
- assert_equal("\u{10FFFF}", "\xF4\x8F\xBF\xBF")
+ assert_equal("\xF0\x90\x80\x80", "\u{10000}")
+ assert_equal("\xF4\x8F\xBF\xBF", "\u{10FFFF}")
+end
+
+assert('braced multiple \u notation test') do
+ assert_equal("ABC", "\u{41 42 43}")
end