Skip to content

Commit

Permalink
parse pi: improve invalid case detection
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Aug 1, 2024
1 parent 73661ef commit e2546e6
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 17 deletions.
35 changes: 20 additions & 15 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,10 @@ class BaseParser
}

module Private
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NAME_PATTERN = /\s*#{NAME}/um
NAME_PATTERN = /#{NAME}/um
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
Expand Down Expand Up @@ -242,7 +241,7 @@ def pull_event
if @document_status == nil
start_position = @source.position
if @source.match("<?", true)
return process_instruction(start_position)
return process_instruction
elsif @source.match("<!", true)
if @source.match("--", true)
md = @source.match(/(.*?)-->/um, true)
Expand Down Expand Up @@ -442,7 +441,7 @@ def pull_event
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
elsif @source.match("?", true)
return process_instruction(start_position)
return process_instruction
else
# Get the next tag
md = @source.match(Private::TAG_PATTERN, true)
Expand Down Expand Up @@ -588,14 +587,14 @@ def need_source_encoding_update?(xml_declaration_encoding)
def parse_name(base_error_message)
md = @source.match(Private::NAME_PATTERN, true)
unless md
if @source.match(/\s*\S/um)
if @source.match(/\S/um)
message = "#{base_error_message}: invalid name"
else
message = "#{base_error_message}: name is missing"
end
raise REXML::ParseException.new(message, @source)
end
md[1]
md[0]
end

def parse_id(base_error_message,
Expand Down Expand Up @@ -664,18 +663,24 @@ def parse_id_invalid_details(accept_external_id:,
end
end

def process_instruction(start_position)
match_data = @source.match(Private::INSTRUCTION_END, true)
unless match_data
message = "Invalid processing instruction node"
@source.position = start_position
raise REXML::ParseException.new(message, @source)
def process_instruction
name = parse_name("Malformed XML: Invalid processing instruction node")
if @source.match(/\s+/um, true)
match_data = @source.match(/(.*?)\?>/um, true)
unless match_data
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
end
content = match_data[1]
else
content = nil
unless @source.match("?>", true)
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
end
end
if match_data[1] == "xml"
if name == "xml"
if @document_status
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
end
content = match_data[2]
version = VERSION.match(content)
version = version[1] unless version.nil?
encoding = ENCODING.match(content)
Expand All @@ -690,7 +695,7 @@ def process_instruction(start_position)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
end
[:processing_instruction, match_data[1], match_data[2]]
[:processing_instruction, name, content]
end

def parse_attributes(prefixes, curr_ns)
Expand Down
35 changes: 33 additions & 2 deletions test/parse/test_processing_instruction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,37 @@ def test_no_name
parse("<??>")
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Invalid processing instruction node
Malformed XML: Invalid processing instruction node: invalid name
Line: 1
Position: 4
Last 80 unconsumed characters:
<??>
?>
DETAIL
end

# Checks that a processing instruction that has content but no closing "?>"
# makes the parser raise REXML::ParseException, and that the full exception
# text (message, line, position, unconsumed input) matches exactly.
def test_unclosed_content
exception = assert_raise(REXML::ParseException) do
parse("<?name content")
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed XML: Unclosed processing instruction
Line: 1
Position: 14
Last 80 unconsumed characters:
content
DETAIL
end

# Checks that a processing instruction consisting of a name only, with no
# content and no closing "?>", makes the parser raise REXML::ParseException
# with the "Unclosed processing instruction" message.
def test_unclosed_no_content
exception = assert_raise(REXML::ParseException) do
parse("<?name")
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed XML: Unclosed processing instruction
Line: 1
Position: 6
Last 80 unconsumed characters:
DETAIL
end

Expand Down Expand Up @@ -79,6 +105,11 @@ def test_after_root
assert_equal("abc", events[:processing_instruction])
end

# Checks that a "?" inside the instruction content that is not immediately
# followed by ">" is kept as part of the content: "<?name con?tent?>" must
# yield the content "con?tent".
def test_content_question
document = REXML::Document.new("<a><?name con?tent?></a>")
assert_equal("con?tent", document.root.children.first.content)
end

def test_linear_performance_gt
seq = [10000, 50000, 100000, 150000, 200000]
assert_linear_performance(seq, rehearsal: 10) do |n|
Expand Down

0 comments on commit e2546e6

Please sign in to comment.