@@ -50,7 +50,6 @@ class BaseParser
50
50
51
51
DOCTYPE_START = /\A \s *<!DOCTYPE\s /um
52
52
DOCTYPE_END = /\A \s *\] \s *>/um
53
- DOCTYPE_PATTERN = /\s *<!DOCTYPE\s +(.*?)(\[ |>)/um
54
53
ATTRIBUTE_PATTERN = /\s *(#{ QNAME_STR } )\s *=\s *(["'])(.*?)\4 /um
55
54
COMMENT_START = /\A <!--/u
56
55
COMMENT_PATTERN = /<!--(.*?)-->/um
@@ -69,7 +68,6 @@ class BaseParser
69
68
STANDALONE = /\b standalone\s *=\s *["'](.*?)['"]/um
70
69
71
70
ENTITY_START = /\A \s *<!ENTITY/
72
- IDENTITY = /^([!\* \w \- ]+)(\s +#{ NCNAME_STR } )?(\s +["'](.*?)['"])?(\s +['"](.*?)["'])?/u
73
71
ELEMENTDECL_START = /\A \s *<!ELEMENT/um
74
72
ELEMENTDECL_PATTERN = /\A \s *(<!ELEMENT.*?)>/um
75
73
SYSTEMENTITY = /\A \s *(%.*?;)\s *$/um
@@ -101,8 +99,9 @@ class BaseParser
101
99
ENTITYDECL = /\s *(?:#{ GEDECL } )|(?:#{ PEDECL } )/um
102
100
103
101
NOTATIONDECL_START = /\A \s *<!NOTATION/um
104
- PUBLIC = /\A \s *<!NOTATION\s +#{ NAME } \s +(PUBLIC)\s +#{ PUBIDLITERAL } (?:\s +#{ SYSTEMLITERAL } )?\s *>/um
105
- SYSTEM = /\A \s *<!NOTATION\s +#{ NAME } \s +(SYSTEM)\s +#{ SYSTEMLITERAL } \s *>/um
102
+ EXTERNAL_ID_PUBLIC = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s +#{ SYSTEMLITERAL } \s */um
103
+ EXTERNAL_ID_SYSTEM = /\A \s *SYSTEM\s +#{ SYSTEMLITERAL } \s */um
104
+ PUBLIC_ID = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s */um
106
105
107
106
EREFERENCE = /&(?!#{ NAME } ;)/
108
107
@@ -225,24 +224,37 @@ def pull_event
225
224
when INSTRUCTION_START
226
225
return process_instruction
227
226
when DOCTYPE_START
228
- md = @source . match ( DOCTYPE_PATTERN , true )
227
+ base_error_message = "Malformed DOCTYPE"
228
+ @source . match ( DOCTYPE_START , true )
229
229
@nsstack . unshift ( curr_ns = Set . new )
230
- identity = md [ 1 ]
231
- close = md [ 2 ]
232
- identity =~ IDENTITY
233
- name = $1
234
- raise REXML ::ParseException . new ( "DOCTYPE is missing a name" ) if name . nil?
235
- pub_sys = $2. nil? ? nil : $2. strip
236
- long_name = $4. nil? ? nil : $4. strip
237
- uri = $6. nil? ? nil : $6. strip
238
- args = [ :start_doctype , name , pub_sys , long_name , uri ]
239
- if close == ">"
230
+ name = parse_name ( base_error_message )
231
+ if @source . match ( /\A \s *\[ /um , true )
232
+ id = [ nil , nil , nil ]
233
+ @document_status = :in_doctype
234
+ elsif @source . match ( /\A \s *>/um , true )
235
+ id = [ nil , nil , nil ]
240
236
@document_status = :after_doctype
241
- @source . read if @source . buffer . size <2
242
- md = @source . match ( /^\s */um , true )
243
- @stack << [ :end_doctype ]
244
237
else
245
- @document_status = :in_doctype
238
+ id = parse_id ( base_error_message ,
239
+ accept_external_id : true ,
240
+ accept_public_id : false )
241
+ if id [ 0 ] == "SYSTEM"
242
+ # For backward compatibility
243
+ id [ 1 ] , id [ 2 ] = id [ 2 ] , nil
244
+ end
245
+ if @source . match ( /\A \s *\[ /um , true )
246
+ @document_status = :in_doctype
247
+ elsif @source . match ( /\A \s *>/um , true )
248
+ @document_status = :after_doctype
249
+ else
250
+ message = "#{ base_error_message } : garbage after external ID"
251
+ raise REXML ::ParseException . new ( message , @source )
252
+ end
253
+ end
254
+ args = [ :start_doctype , name , *id ]
255
+ if @document_status == :after_doctype
256
+ @source . match ( /\A \s */um , true )
257
+ @stack << [ :end_doctype ]
246
258
end
247
259
return args
248
260
when /^\s +/
@@ -313,27 +325,24 @@ def pull_event
313
325
end
314
326
return [ :attlistdecl , element , pairs , contents ]
315
327
when NOTATIONDECL_START
316
- md = nil
317
- if @source . match ( PUBLIC )
318
- md = @source . match ( PUBLIC , true )
319
- pubid = system = nil
320
- pubid_literal = md [ 3 ]
321
- pubid = pubid_literal [ 1 ..-2 ] if pubid_literal # Remove quote
322
- system_literal = md [ 4 ]
323
- system = system_literal [ 1 ..-2 ] if system_literal # Remove quote
324
- vals = [ md [ 1 ] , md [ 2 ] , pubid , system ]
325
- elsif @source . match ( SYSTEM )
326
- md = @source . match ( SYSTEM , true )
327
- system = nil
328
- system_literal = md [ 3 ]
329
- system = system_literal [ 1 ..-2 ] if system_literal # Remove quote
330
- vals = [ md [ 1 ] , md [ 2 ] , nil , system ]
331
- else
332
- details = notation_decl_invalid_details
333
- message = "Malformed notation declaration: #{ details } "
328
+ base_error_message = "Malformed notation declaration"
329
+ unless @source . match ( /\A \s *<!NOTATION\s +/um , true )
330
+ if @source . match ( /\A \s *<!NOTATION\s *>/um )
331
+ message = "#{ base_error_message } : name is missing"
332
+ else
333
+ message = "#{ base_error_message } : invalid declaration name"
334
+ end
335
+ raise REXML ::ParseException . new ( message , @source )
336
+ end
337
+ name = parse_name ( base_error_message )
338
+ id = parse_id ( base_error_message ,
339
+ accept_external_id : true ,
340
+ accept_public_id : true )
341
+ unless @source . match ( /\A \s *>/um , true )
342
+ message = "#{ base_error_message } : garbage before end >"
334
343
raise REXML ::ParseException . new ( message , @source )
335
344
end
336
- return [ :notationdecl , * vals ]
345
+ return [ :notationdecl , name , * id ]
337
346
when DOCTYPE_END
338
347
@document_status = :after_doctype
339
348
@source . match ( DOCTYPE_END , true )
@@ -488,6 +497,85 @@ def need_source_encoding_update?(xml_declaration_encoding)
488
497
true
489
498
end
490
499
500
# Reads a name from the current position of @source, skipping any
# leading whitespace.
#
# base_error_message:: prefix used when building the error message.
#
# Returns the first capture group of the NAME match.
#
# Raises REXML::ParseException with "<prefix>: invalid name" when some
# non-whitespace text is present but is not a NAME, and with
# "<prefix>: name is missing" when nothing but whitespace is left.
def parse_name(base_error_message)
  # NOTE(review): the trailing `true` presumably tells the source to
  # consume the matched text -- confirm against Source#match.
  matched = @source.match(/\A\s*#{NAME}/um, true)
  return matched[1] if matched

  # Peek only (no `true`): we just need to know whether anything
  # other than whitespace is waiting to pick the right diagnostic.
  reason = @source.match(/\A\s*\S/um) ? "invalid name" : "name is missing"
  raise REXML::ParseException.new("#{base_error_message}: #{reason}", @source)
end
512
+
513
# Parses an ID at the current position of @source.
#
# base_error_message:: prefix used when building the error message.
# accept_external_id:: when true, `PUBLIC pubid system` and
#                      `SYSTEM system` forms are tried.
# accept_public_id::   when true, the `PUBLIC pubid` form (without a
#                      system literal) is tried.
#
# Returns a three element array [type, public_id, system_id] where
# type is "PUBLIC" or "SYSTEM", literals have their surrounding quotes
# removed, and absent parts are nil.
#
# Raises REXML::ParseException with
# "<prefix>: <parse_id_invalid_details result>" when no form matches.
def parse_id(base_error_message,
             accept_external_id:,
             accept_public_id:)
  # Drops the first and last character (the quotes) of a matched literal.
  unquote = lambda { |literal| literal[1..-2] if literal }

  # Order matters: the full external PUBLIC form must be tried before
  # the shorter PUBLIC-only form, which is a prefix of it.
  if accept_external_id && (matched = @source.match(EXTERNAL_ID_PUBLIC, true))
    return ["PUBLIC", unquote.call(matched[1]), unquote.call(matched[2])]
  end

  if accept_public_id && (matched = @source.match(PUBLIC_ID, true))
    return ["PUBLIC", unquote.call(matched[1]), nil]
  end

  if accept_external_id && (matched = @source.match(EXTERNAL_ID_SYSTEM, true))
    return ["SYSTEM", nil, unquote.call(matched[1])]
  end

  details = parse_id_invalid_details(accept_external_id: accept_external_id,
                                     accept_public_id: accept_public_id)
  raise REXML::ParseException.new("#{base_error_message}: #{details}", @source)
end
540
+
541
# Works out a human readable reason why parse_id could not match an ID
# at the current position of @source. Only peeks at @source; every
# match here is made without the consume flag.
#
# accept_external_id:: same flag that was given to parse_id.
# accept_public_id::   same flag that was given to parse_id.
#
# Returns the reason as a String (for example "invalid public ID
# literal" or "invalid ID type").
def parse_id_invalid_details(accept_external_id:,
                             accept_public_id:)
  public_head = /\A\s*PUBLIC/um
  system_head = /\A\s*SYSTEM/um
  public_allowed = accept_external_id || accept_public_id

  if public_allowed && @source.match(/#{public_head}/um)
    # PUBLIC followed by something that is not a quote, or directly by
    # `[` / `>`: there is no public ID literal at all.
    if @source.match(/#{public_head}(?:\s+[^'"]|\s*[\[>])/um)
      return "public ID literal is missing"
    end
    unless @source.match(/#{public_head}\s+#{PUBIDLITERAL}/um)
      return "invalid public ID literal"
    end
    return "garbage after public ID literal" unless accept_public_id

    if @source.match(/#{public_head}\s+#{PUBIDLITERAL}\s+[^'"]/um)
      return "system ID literal is missing"
    end
    unless @source.match(/#{public_head}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
      return "invalid system literal"
    end
    return "garbage after system literal"
  end

  if accept_external_id && @source.match(/#{system_head}/um)
    if @source.match(/#{system_head}(?:\s+[^'"]|\s*[\[>])/um)
      return "system literal is missing"
    end
    unless @source.match(/#{system_head}\s+#{SYSTEMLITERAL}/um)
      return "invalid system literal"
    end
    return "garbage after system literal"
  end

  # Neither keyword branch applied.
  return "invalid ID type" unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)

  "ID type is missing"
end
578
+
491
579
def process_instruction
492
580
match_data = @source . match ( INSTRUCTION_PATTERN , true )
493
581
unless match_data
@@ -580,42 +668,6 @@ def parse_attributes(prefixes, curr_ns)
580
668
end
581
669
return attributes , closed
582
670
end
583
-
584
# Works out a human readable reason why the text at the current
# position of @source is not a well-formed <!NOTATION ...> declaration.
# Only peeks at @source; nothing is consumed.
#
# Returns the reason as a String; "end > is missing" is the fallback
# when none of the more specific checks applies.
def notation_decl_invalid_details
  named_decl = /#{NOTATIONDECL_START}\s+#{NAME}/um
  public_decl = /#{named_decl}\s+PUBLIC/um
  system_decl = /#{named_decl}\s+SYSTEM/um

  # Checks run from the start of the declaration towards its end so the
  # first problem encountered is the one reported.
  return "name is missing" if @source.match(/#{NOTATIONDECL_START}\s*>/um)
  return "invalid name" unless @source.match(/#{named_decl}[\s>]/um)
  return "ID type is missing" if @source.match(/#{named_decl}\s*>/um)
  unless @source.match(/#{named_decl}\s+(?:PUBLIC|SYSTEM)[\s>]/um)
    return "invalid ID type"
  end

  if @source.match(/#{public_decl}/um)
    return "public ID literal is missing" if @source.match(/#{public_decl}\s*>/um)
    unless @source.match(/#{public_decl}\s+#{PUBIDLITERAL}/um)
      return "invalid public ID literal"
    end
    if @source.match(/#{public_decl}\s+#{PUBIDLITERAL}[^\s>]/um)
      return "garbage after public ID literal"
    end
    unless @source.match(/#{public_decl}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
      return "invalid system literal"
    end
    unless @source.match(/#{public_decl}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*>/um)
      return "garbage after system literal"
    end
  elsif @source.match(/#{system_decl}/um)
    return "system literal is missing" if @source.match(/#{system_decl}\s*>/um)
    unless @source.match(/#{system_decl}\s+#{SYSTEMLITERAL}/um)
      return "invalid system literal"
    end
    unless @source.match(/#{system_decl}\s+#{SYSTEMLITERAL}\s*>/um)
      return "garbage after system literal"
    end
  end

  "end > is missing"
end
619
671
end
620
672
end
621
673
end
0 commit comments