From 992ba476a95136eaad5b9b208d4ca5a1ca31324d Mon Sep 17 00:00:00 2001 From: dearblue Date: Sun, 15 Sep 2019 23:50:24 +0900 Subject: Fix broken UTF-8 characters by `IO#getc` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A multi-byte UTF-8 character is split into individual bytes by `IO#getc` when it spans an `IO::BUF_SIZE` (4096 bytes) boundary. - Prepare file: ```ruby File.open("sample", "wb") { |f| f << "●" * 1370 } ``` - Before the patch: ```ruby File.open("sample") { |f| a = []; while ch = f.getc; a << ch; end; p a } # => ["●", "●", ..., "●", "\xe2", "\x97", "\x8f", "●", "●", "●", "●"] ``` - After the patch: ```ruby File.open("sample") { |f| a = []; while ch = f.getc; a << ch; end; p a } # => ["●", "●", ..., "●", "●", "●", "●", "●", "●"] ``` --- mrbgems/mruby-io/mrblib/io.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mrbgems/mruby-io/mrblib/io.rb b/mrbgems/mruby-io/mrblib/io.rb index 32bac1f0d..6b83644ef 100644 --- a/mrbgems/mruby-io/mrblib/io.rb +++ b/mrbgems/mruby-io/mrblib/io.rb @@ -170,8 +170,14 @@ class IO end def _read_buf - return @buf if @buf && @buf.bytesize > 0 - @buf = sysread(BUF_SIZE) + return @buf if @buf && @buf.bytesize >= 4 # maximum UTF-8 character is 4 bytes + @buf ||= "" + begin + @buf += sysread(BUF_SIZE) + rescue EOFError => e + raise e if @buf.empty? + end + @buf end def ungetc(substr) -- cgit v1.2.3