From 52a255b41448dbf2ee711b01c6c621d3a4b2b16c Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 25 Jun 2024 14:14:01 +0900 Subject: [PATCH] Optimize BaseParser#unnormalize method to replace "\r\n" with "\n" only when "\r\n" is included ## Why? See: https://github.com/ruby/rexml/pull/158#issuecomment-2187663068 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.674 17.567 32.759 32.316 i/s - 100.000 times in 5.657973s 5.692371s 3.052595s 3.094448s sax 25.261 25.377 48.889 49.911 i/s - 100.000 times in 3.958626s 3.940640s 2.045460s 2.003575s pull 28.968 29.121 61.584 61.774 i/s - 100.000 times in 3.452132s 3.433967s 1.623789s 1.618809s stream 28.395 28.803 55.289 57.970 i/s - 100.000 times in 3.521761s 3.471812s 1.808673s 1.725029s Comparison: dom before(YJIT): 32.8 i/s after(YJIT): 32.3 i/s - 1.01x slower before: 17.7 i/s - 1.85x slower after: 17.6 i/s - 1.86x slower sax after(YJIT): 49.9 i/s before(YJIT): 48.9 i/s - 1.02x slower after: 25.4 i/s - 1.97x slower before: 25.3 i/s - 1.98x slower pull after(YJIT): 61.8 i/s before(YJIT): 61.6 i/s - 1.00x slower after: 29.1 i/s - 2.12x slower before: 29.0 i/s - 2.13x slower stream after(YJIT): 58.0 i/s before(YJIT): 55.3 i/s - 1.05x slower after: 28.8 i/s - 2.01x slower before: 28.4 i/s - 2.04x slower ``` - YJIT=ON : 0.98x - 1.05x faster - YJIT=OFF : 0.98x - 1.02x faster --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 6 +++++- test/test_pullparser.rb | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 275372ee..02759e70 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ 
-511,7 +511,11 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + if string.include?("\r") + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + else + rv = string.dup + end matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 rv.gsub!( Private::CHARACTER_REFERENCES ) { diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index b6a48c93..073d896d 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -82,6 +82,27 @@ def test_character_references end end + def test_text_content_with_line_breaks + source = "<root><a>A</a><b>B\n</b><c>C\r\n</c></root>" + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B\n", events['b']) + assert_equal("C\n", events['c']) + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source)