From 0f516bbe1dd61759faf2609970bd5cd855931221 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Sat, 22 Jun 2019 12:07:21 +0100 Subject: Add paragraph mode to String#each_line in mrblib mruby/mruby#4511 demonstrated an infinite loop in `String#each_line` when given an empty string separator. In MRI, an empty separator places String#each_line in paragraph mode, where the String is separated on successive runs of newlines. In paragraph mode, the String `"abc\n\n\ndef\nxyz"` is split into `["abc\n\n\n", "def\nxyz"]`. This commit makes the String#each_line implementation as close to ruby/spec compliant as possible given the limitations of mruby core. With this patch, the following specs fail for `String#each_line`: - uses `$/` as the separator when none is given (can be fixed by aliasing and redefining the method to use $/ as the default value of separator in mruby-io) - when no block is given returned Enumerator size should return nil (`Enumerator#size` is not supported on mruby) - tries to convert the separator to a string using to_str (`String#to_str` is not implemented on mruby) This patch has similar memory consumption compared to the prior implementation and is takes 4x the time the prior implementation takes to execute: ```console /usr/bin/time -l ./bin/mruby -e '("aaa\n\nbbbbb\n\n\n\n\ncccc" * 100000).each_line("\n") { }'; ``` --- mrblib/string.rb | 67 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/mrblib/string.rb b/mrblib/string.rb index b0fe4033e..adf19d00c 100644 --- a/mrblib/string.rb +++ b/mrblib/string.rb @@ -11,18 +11,63 @@ class String # and pass the respective line. # # ISO 15.2.10.5.15 - def each_line(rs = "\n", &block) - return to_enum(:each_line, rs, &block) unless block - return block.call(self) if rs.nil? - rs.__to_str - offset = 0 - rs_len = rs.length - this = dup - while pos = this.index(rs, offset) - block.call(this[offset, pos + rs_len - offset]) - offset = pos + rs_len + def each_line(separator = "\n", getline_args = nil) + return to_enum(:each_line, separator, getline_args) unless block_given? + + if separator.nil? + yield self + return self + end + raise TypeError unless separator.is_a?(String) + + start = 0 + pointer = 0 + string = dup + + if separator.empty? + matched_newlines = 0 + while pointer < string.length + c = string[pointer] + if c == "\n" + matched_newlines += 1 + elsif matched_newlines > 1 && self.class == String + yield string[start...pointer] + matched_newlines = 0 + start = pointer + elsif matched_newlines > 1 + yield self.class.new(string[start...pointer]) + matched_newlines = 0 + start = pointer + else + matched_newlines = 0 + end + pointer += 1 + end + else + matched_length = 0 + separator_length = separator.length + while pointer < string.length + c = string[pointer] + pointer += 1 + matched_length += 1 if c == separator[matched_length] + next unless matched_length == separator_length + + if self.class == String + yield string[start...pointer] + else + yield self.class.new(string[start...pointer]) + end + matched_length = 0 + start = pointer + end + end + return self if start == string.length + + if self.class == String + yield string[start..-1] + else + yield self.class.new(string[start..-1]) end - block.call(this[offset, this.size - offset]) if this.size > offset self end -- cgit v1.2.3 From a932b8c96a4312753993ce0b98d2e9eedec17de0 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Sat, 22 Jun 2019 12:55:02 +0100 Subject: Speed up base case by 2x Make non-paragraph mode twice as fast. Performance is within a factor of 2 of the original implementation. --- mrblib/string.rb | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mrblib/string.rb b/mrblib/string.rb index adf19d00c..0bac52a20 100644 --- a/mrblib/string.rb +++ b/mrblib/string.rb @@ -44,20 +44,13 @@ class String pointer += 1 end else - matched_length = 0 - separator_length = separator.length - while pointer < string.length - c = string[pointer] - pointer += 1 - matched_length += 1 if c == separator[matched_length] - next unless matched_length == separator_length - + while (pointer = string.index(separator, start)) + pointer += separator.length if self.class == String yield string[start...pointer] else yield self.class.new(string[start...pointer]) end - matched_length = 0 start = pointer end end -- cgit v1.2.3 From 8ea45b862aa9ff358b3642ac14b03304fcd6846e Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Sun, 23 Jun 2019 03:29:47 +0100 Subject: Optimize String#each_line --- mrblib/string.rb | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/mrblib/string.rb b/mrblib/string.rb index 0bac52a20..60129a845 100644 --- a/mrblib/string.rb +++ b/mrblib/string.rb @@ -11,7 +11,7 @@ class String # and pass the respective line. # # ISO 15.2.10.5.15 - def each_line(separator = "\n", getline_args = nil) + def each_line(separator = "\n") return to_enum(:each_line, separator, getline_args) unless block_given? if separator.nil? @@ -20,22 +20,25 @@ class String end raise TypeError unless separator.is_a?(String) - start = 0 pointer = 0 + start = 0 string = dup + self_len = length + sep_len = separator.length + should_yield_subclass_instances = self.class != String if separator.empty? matched_newlines = 0 - while pointer < string.length + while pointer < self_len c = string[pointer] if c == "\n" matched_newlines += 1 - elsif matched_newlines > 1 && self.class == String - yield string[start...pointer] + elsif matched_newlines > 1 && should_yield_subclass_instances + yield self.class.new(string[start, pointer - start]) matched_newlines = 0 start = pointer elsif matched_newlines > 1 - yield self.class.new(string[start...pointer]) + yield string[start, pointer - start] matched_newlines = 0 start = pointer else @@ -45,21 +48,21 @@ class String end else while (pointer = string.index(separator, start)) - pointer += separator.length - if self.class == String - yield string[start...pointer] + pointer += sep_len + if should_yield_subclass_instances + yield self.class.new(string[start, pointer - start]) else - yield self.class.new(string[start...pointer]) + yield string[start, pointer - start] end start = pointer end end - return self if start == string.length + return self if start == self_len - if self.class == String - yield string[start..-1] + if should_yield_subclass_instances + yield self.class.new(string[start, self_len - start]) else - yield self.class.new(string[start..-1]) + yield string[start, self_len - start] end self end -- cgit v1.2.3 From 6b92acf361c8f1979dcda95c41324cccc8d767a9 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Sun, 23 Jun 2019 03:40:39 +0100 Subject: Use explicit block parameter --- mrblib/string.rb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mrblib/string.rb b/mrblib/string.rb index 60129a845..f1c7f3ba4 100644 --- a/mrblib/string.rb +++ b/mrblib/string.rb @@ -11,11 +11,11 @@ class String # and pass the respective line. # # ISO 15.2.10.5.15 - def each_line(separator = "\n") - return to_enum(:each_line, separator, getline_args) unless block_given? + def each_line(separator = "\n", &block) + return to_enum(:each_line, separator) unless block if separator.nil? - yield self + block.call(self) return self end raise TypeError unless separator.is_a?(String) @@ -34,11 +34,11 @@ class String if c == "\n" matched_newlines += 1 elsif matched_newlines > 1 && should_yield_subclass_instances - yield self.class.new(string[start, pointer - start]) + block.call(self.class.new(string[start, pointer - start])) matched_newlines = 0 start = pointer elsif matched_newlines > 1 - yield string[start, pointer - start] + block.call(string[start, pointer - start]) matched_newlines = 0 start = pointer else @@ -50,9 +50,9 @@ class String while (pointer = string.index(separator, start)) pointer += sep_len if should_yield_subclass_instances - yield self.class.new(string[start, pointer - start]) + block.call(self.class.new(string[start, pointer - start])) else - yield string[start, pointer - start] + block.call(string[start, pointer - start]) end start = pointer end @@ -60,9 +60,9 @@ class String return self if start == self_len if should_yield_subclass_instances - yield self.class.new(string[start, self_len - start]) + block.call(self.class.new(string[start, self_len - start])) else - yield string[start, self_len - start] + block.call(string[start, self_len - start]) end self end -- cgit v1.2.3 From 46c972f746f37a26104b1e9f317ce5a02daa9f40 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Tue, 25 Jun 2019 00:01:51 +0200 Subject: Unify loops to minimize bytecode size --- mrblib/string.rb | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/mrblib/string.rb b/mrblib/string.rb index f1c7f3ba4..d72002209 100644 --- a/mrblib/string.rb +++ b/mrblib/string.rb @@ -20,42 +20,26 @@ class String end raise TypeError unless separator.is_a?(String) - pointer = 0 + paragraph_mode = false + if separator.empty? + paragraph_mode = true + separator = "\n\n" + end start = 0 string = dup self_len = length sep_len = separator.length should_yield_subclass_instances = self.class != String - if separator.empty? - matched_newlines = 0 - while pointer < self_len - c = string[pointer] - if c == "\n" - matched_newlines += 1 - elsif matched_newlines > 1 && should_yield_subclass_instances - block.call(self.class.new(string[start, pointer - start])) - matched_newlines = 0 - start = pointer - elsif matched_newlines > 1 - block.call(string[start, pointer - start]) - matched_newlines = 0 - start = pointer - else - matched_newlines = 0 - end - pointer += 1 - end - else - while (pointer = string.index(separator, start)) - pointer += sep_len - if should_yield_subclass_instances - block.call(self.class.new(string[start, pointer - start])) - else - block.call(string[start, pointer - start]) - end - start = pointer + while (pointer = string.index(separator, start)) + pointer += sep_len + pointer += 1 while paragraph_mode && string[pointer] == "\n" + if should_yield_subclass_instances + block.call(self.class.new(string[start, pointer - start])) + else + block.call(string[start, pointer - start]) end + start = pointer end return self if start == self_len -- cgit v1.2.3