From 0f516bbe1dd61759faf2609970bd5cd855931221 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Sat, 22 Jun 2019 12:07:21 +0100 Subject: Add paragraph mode to String#each_line in mrblib mruby/mruby#4511 demonstrated an infinite loop in `String#each_line` when given an empty string separator. In MRI, an empty separator places String#each_line in paragraph mode, where the String is separated on successive runs of newlines. In paragraph mode, the String `"abc\n\n\ndef\nxyz"` is split into `["abc\n\n\n", "def\nxyz"]`. This commit makes the String#each_line implementation as close to ruby/spec compliant as possible given the limitations of mruby core. With this patch, the following specs fail for `String#each_line`: - uses `$/` as the separator when none is given (can be fixed by aliasing and redefining the method to use $/ as the default value of separator in mruby-io) - when no block is given returned Enumerator size should return nil (`Enumerator#size` is not supported on mruby) - tries to convert the separator to a string using to_str (`String#to_str` is not implemented on mruby) This patch has similar memory consumption compared to the prior implementation and is takes 4x the time the prior implementation takes to execute: ```console /usr/bin/time -l ./bin/mruby -e '("aaa\n\nbbbbb\n\n\n\n\ncccc" * 100000).each_line("\n") { }'; ``` --- mrblib/string.rb | 67 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/mrblib/string.rb b/mrblib/string.rb index b0fe4033e..adf19d00c 100644 --- a/mrblib/string.rb +++ b/mrblib/string.rb @@ -11,18 +11,63 @@ class String # and pass the respective line. # # ISO 15.2.10.5.15 - def each_line(rs = "\n", &block) - return to_enum(:each_line, rs, &block) unless block - return block.call(self) if rs.nil? - rs.__to_str - offset = 0 - rs_len = rs.length - this = dup - while pos = this.index(rs, offset) - block.call(this[offset, pos + rs_len - offset]) - offset = pos + rs_len + def each_line(separator = "\n", getline_args = nil) + return to_enum(:each_line, separator, getline_args) unless block_given? + + if separator.nil? + yield self + return self + end + raise TypeError unless separator.is_a?(String) + + start = 0 + pointer = 0 + string = dup + + if separator.empty? + matched_newlines = 0 + while pointer < string.length + c = string[pointer] + if c == "\n" + matched_newlines += 1 + elsif matched_newlines > 1 && self.class == String + yield string[start...pointer] + matched_newlines = 0 + start = pointer + elsif matched_newlines > 1 + yield self.class.new(string[start...pointer]) + matched_newlines = 0 + start = pointer + else + matched_newlines = 0 + end + pointer += 1 + end + else + matched_length = 0 + separator_length = separator.length + while pointer < string.length + c = string[pointer] + pointer += 1 + matched_length += 1 if c == separator[matched_length] + next unless matched_length == separator_length + + if self.class == String + yield string[start...pointer] + else + yield self.class.new(string[start...pointer]) + end + matched_length = 0 + start = pointer + end + end + return self if start == string.length + + if self.class == String + yield string[start..-1] + else + yield self.class.new(string[start..-1]) end - block.call(this[offset, this.size - offset]) if this.size > offset self end -- cgit v1.2.3