Skip to content

Commit 9b311e5

Browse files
koumame
authored and committed
Fix a bug that invalid document declaration may be accepted
HackerOne: HO-1104077. It's caused by ignoring garbage before "\n<!DOCTYPE..." and after "<!DOCTYPE\n". Reported by Juho Nurminen. Thanks!!!
1 parent f9d88e4 commit 9b311e5

File tree

3 files changed

+326
-95
lines changed

3 files changed

+326
-95
lines changed

lib/rexml/parsers/baseparser.rb

+126-74
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ class BaseParser
5050

5151
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
5252
DOCTYPE_END = /\A\s*\]\s*>/um
53-
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
5453
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
5554
COMMENT_START = /\A<!--/u
5655
COMMENT_PATTERN = /<!--(.*?)-->/um
@@ -69,7 +68,6 @@ class BaseParser
6968
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
7069

7170
ENTITY_START = /\A\s*<!ENTITY/
72-
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
7371
ELEMENTDECL_START = /\A\s*<!ELEMENT/um
7472
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
7573
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
@@ -101,8 +99,9 @@ class BaseParser
10199
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
102100

103101
NOTATIONDECL_START = /\A\s*<!NOTATION/um
104-
PUBLIC = /\A\s*<!NOTATION\s+#{NAME}\s+(PUBLIC)\s+#{PUBIDLITERAL}(?:\s+#{SYSTEMLITERAL})?\s*>/um
105-
SYSTEM = /\A\s*<!NOTATION\s+#{NAME}\s+(SYSTEM)\s+#{SYSTEMLITERAL}\s*>/um
102+
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
103+
EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
104+
PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
106105

107106
EREFERENCE = /&(?!#{NAME};)/
108107

@@ -225,24 +224,37 @@ def pull_event
225224
when INSTRUCTION_START
226225
return process_instruction
227226
when DOCTYPE_START
228-
md = @source.match( DOCTYPE_PATTERN, true )
227+
base_error_message = "Malformed DOCTYPE"
228+
@source.match(DOCTYPE_START, true)
229229
@nsstack.unshift(curr_ns=Set.new)
230-
identity = md[1]
231-
close = md[2]
232-
identity =~ IDENTITY
233-
name = $1
234-
raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
235-
pub_sys = $2.nil? ? nil : $2.strip
236-
long_name = $4.nil? ? nil : $4.strip
237-
uri = $6.nil? ? nil : $6.strip
238-
args = [ :start_doctype, name, pub_sys, long_name, uri ]
239-
if close == ">"
230+
name = parse_name(base_error_message)
231+
if @source.match(/\A\s*\[/um, true)
232+
id = [nil, nil, nil]
233+
@document_status = :in_doctype
234+
elsif @source.match(/\A\s*>/um, true)
235+
id = [nil, nil, nil]
240236
@document_status = :after_doctype
241-
@source.read if @source.buffer.size<2
242-
md = @source.match(/^\s*/um, true)
243-
@stack << [ :end_doctype ]
244237
else
245-
@document_status = :in_doctype
238+
id = parse_id(base_error_message,
239+
accept_external_id: true,
240+
accept_public_id: false)
241+
if id[0] == "SYSTEM"
242+
# For backward compatibility
243+
id[1], id[2] = id[2], nil
244+
end
245+
if @source.match(/\A\s*\[/um, true)
246+
@document_status = :in_doctype
247+
elsif @source.match(/\A\s*>/um, true)
248+
@document_status = :after_doctype
249+
else
250+
message = "#{base_error_message}: garbage after external ID"
251+
raise REXML::ParseException.new(message, @source)
252+
end
253+
end
254+
args = [:start_doctype, name, *id]
255+
if @document_status == :after_doctype
256+
@source.match(/\A\s*/um, true)
257+
@stack << [ :end_doctype ]
246258
end
247259
return args
248260
when /^\s+/
@@ -313,27 +325,24 @@ def pull_event
313325
end
314326
return [ :attlistdecl, element, pairs, contents ]
315327
when NOTATIONDECL_START
316-
md = nil
317-
if @source.match( PUBLIC )
318-
md = @source.match( PUBLIC, true )
319-
pubid = system = nil
320-
pubid_literal = md[3]
321-
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
322-
system_literal = md[4]
323-
system = system_literal[1..-2] if system_literal # Remove quote
324-
vals = [md[1], md[2], pubid, system]
325-
elsif @source.match( SYSTEM )
326-
md = @source.match( SYSTEM, true )
327-
system = nil
328-
system_literal = md[3]
329-
system = system_literal[1..-2] if system_literal # Remove quote
330-
vals = [md[1], md[2], nil, system]
331-
else
332-
details = notation_decl_invalid_details
333-
message = "Malformed notation declaration: #{details}"
328+
base_error_message = "Malformed notation declaration"
329+
unless @source.match(/\A\s*<!NOTATION\s+/um, true)
330+
if @source.match(/\A\s*<!NOTATION\s*>/um)
331+
message = "#{base_error_message}: name is missing"
332+
else
333+
message = "#{base_error_message}: invalid declaration name"
334+
end
335+
raise REXML::ParseException.new(message, @source)
336+
end
337+
name = parse_name(base_error_message)
338+
id = parse_id(base_error_message,
339+
accept_external_id: true,
340+
accept_public_id: true)
341+
unless @source.match(/\A\s*>/um, true)
342+
message = "#{base_error_message}: garbage before end >"
334343
raise REXML::ParseException.new(message, @source)
335344
end
336-
return [ :notationdecl, *vals ]
345+
return [:notationdecl, name, *id]
337346
when DOCTYPE_END
338347
@document_status = :after_doctype
339348
@source.match( DOCTYPE_END, true )
@@ -488,6 +497,85 @@ def need_source_encoding_update?(xml_declaration_encoding)
488497
true
489498
end
490499

500+
# Reads a name (optional leading whitespace followed by NAME) from @source
# and returns the first capture group of the match.
#
# base_error_message:: prefix used when building the error message.
#
# Raises REXML::ParseException with "<base_error_message>: invalid name"
# when something non-whitespace follows but is not a NAME, or
# "<base_error_message>: name is missing" otherwise.
#
# NOTE(review): the `true` second argument to @source.match presumably
# consumes the matched text -- confirm against the Source class.
# NOTE(review): md[1] relies on NAME containing a capture group -- confirm.
def parse_name(base_error_message)
501+
md = @source.match(/\A\s*#{NAME}/um, true)
502+
unless md
503+
# Probe (without the `true` flag) whether any non-whitespace character
# follows, to choose between the two error messages.
if @source.match(/\A\s*\S/um)
504+
message = "#{base_error_message}: invalid name"
505+
else
506+
message = "#{base_error_message}: name is missing"
507+
end
508+
raise REXML::ParseException.new(message, @source)
509+
end
510+
md[1]
511+
end
512+
513+
# Reads an ID from @source and returns it as a three-element array
# [type, public_id, system_id], with the surrounding quote characters of
# each literal removed.
#
# The alternatives are tried in this order:
# 1. EXTERNAL_ID_PUBLIC (only if accept_external_id) -> ["PUBLIC", pubid, system]
# 2. PUBLIC_ID          (only if accept_public_id)   -> ["PUBLIC", pubid, nil]
# 3. EXTERNAL_ID_SYSTEM (only if accept_external_id) -> ["SYSTEM", nil, system]
#
# base_error_message:: prefix used when building the error message.
# accept_external_id:: whether the PUBLIC+system and SYSTEM forms are allowed.
# accept_public_id::   whether the PUBLIC form without a system literal is allowed.
#
# Raises REXML::ParseException with
# "<base_error_message>: <details from parse_id_invalid_details>" when none
# of the accepted forms match.
#
# NOTE(review): md[1]/md[2] rely on PUBIDLITERAL and SYSTEMLITERAL each
# contributing one capture group that includes the quotes -- confirm.
def parse_id(base_error_message,
514+
accept_external_id:,
515+
accept_public_id:)
516+
if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
517+
pubid = system = nil
518+
pubid_literal = md[1]
519+
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
520+
system_literal = md[2]
521+
system = system_literal[1..-2] if system_literal # Remove quote
522+
["PUBLIC", pubid, system]
523+
elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
524+
# `system` is assigned here but not used in this branch.
pubid = system = nil
525+
pubid_literal = md[1]
526+
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
527+
["PUBLIC", pubid, nil]
528+
elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
529+
system = nil
530+
system_literal = md[1]
531+
system = system_literal[1..-2] if system_literal # Remove quote
532+
["SYSTEM", nil, system]
533+
else
534+
details = parse_id_invalid_details(accept_external_id: accept_external_id,
535+
accept_public_id: accept_public_id)
536+
message = "#{base_error_message}: #{details}"
537+
raise REXML::ParseException.new(message, @source)
538+
end
539+
end
540+
541+
# Returns a short String describing why the ID at the current position of
# @source could not be parsed. Called by parse_id after all of its accepted
# forms failed to match; the result is appended to the error message.
#
# Every @source.match call here is made without the `true` flag used
# elsewhere in this file.
# NOTE(review): presumably that means the source is only inspected, not
# consumed -- confirm against the Source class.
#
# accept_external_id:: same flag that was given to parse_id.
# accept_public_id::   same flag that was given to parse_id.
#
# The checks are order-dependent: the first one that applies decides the
# returned message.
def parse_id_invalid_details(accept_external_id:,
542+
accept_public_id:)
543+
public = /\A\s*PUBLIC/um
544+
system = /\A\s*SYSTEM/um
545+
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
546+
# PUBLIC followed by whitespace and a non-quote character, or directly by
# "[" or ">": no literal was started at all.
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
547+
return "public ID literal is missing"
548+
end
549+
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
550+
return "invalid public ID literal"
551+
end
552+
# NOTE(review): when called from parse_id with accept_public_id true,
# PUBLIC_ID would already have matched "PUBLIC" plus a valid public ID
# literal, so this branch looks hard to reach from there; confirm whether
# the condition was meant to test accept_external_id instead.
if accept_public_id
553+
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
554+
return "system ID literal is missing"
555+
end
556+
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
557+
return "invalid system literal"
558+
end
559+
"garbage after system literal"
560+
else
561+
"garbage after public ID literal"
562+
end
563+
elsif accept_external_id and @source.match(/#{system}/um)
564+
# Same sequence of checks as above, for the SYSTEM form.
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
565+
return "system literal is missing"
566+
end
567+
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
568+
return "invalid system literal"
569+
end
570+
"garbage after system literal"
571+
else
572+
# Neither keyword branch above applied; report "invalid ID type" unless
# a PUBLIC/SYSTEM keyword followed by whitespace is present.
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
573+
return "invalid ID type"
574+
end
575+
"ID type is missing"
576+
end
577+
end
578+
491579
def process_instruction
492580
match_data = @source.match(INSTRUCTION_PATTERN, true)
493581
unless match_data
@@ -580,42 +668,6 @@ def parse_attributes(prefixes, curr_ns)
580668
end
581669
return attributes, closed
582670
end
583-
584-
def notation_decl_invalid_details
585-
name = /#{NOTATIONDECL_START}\s+#{NAME}/um
586-
public = /#{name}\s+PUBLIC/um
587-
system = /#{name}\s+SYSTEM/um
588-
if @source.match(/#{NOTATIONDECL_START}\s*>/um)
589-
return "name is missing"
590-
elsif not @source.match(/#{name}[\s>]/um)
591-
return "invalid name"
592-
elsif @source.match(/#{name}\s*>/um)
593-
return "ID type is missing"
594-
elsif not @source.match(/#{name}\s+(?:PUBLIC|SYSTEM)[\s>]/um)
595-
return "invalid ID type"
596-
elsif @source.match(/#{public}/um)
597-
if @source.match(/#{public}\s*>/um)
598-
return "public ID literal is missing"
599-
elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
600-
return "invalid public ID literal"
601-
elsif @source.match(/#{public}\s+#{PUBIDLITERAL}[^\s>]/um)
602-
return "garbage after public ID literal"
603-
elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
604-
return "invalid system literal"
605-
elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*>/um)
606-
return "garbage after system literal"
607-
end
608-
elsif @source.match(/#{system}/um)
609-
if @source.match(/#{system}\s*>/um)
610-
return "system literal is missing"
611-
elsif not @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
612-
return "invalid system literal"
613-
elsif not @source.match(/#{system}\s+#{SYSTEMLITERAL}\s*>/um)
614-
return "garbage after system literal"
615-
end
616-
end
617-
"end > is missing"
618-
end
619671
end
620672
end
621673
end

0 commit comments

Comments
 (0)