Skip to content

Commit

Permalink
Forbid whitespace before XML prolog
Browse files Browse the repository at this point in the history
  • Loading branch information
kornelski committed May 22, 2023
1 parent 9ca52b9 commit 535914e
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 12 deletions.
15 changes: 9 additions & 6 deletions src/reader/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,11 @@ pub(crate) struct PullParser {
/// Records which kind of top-level document content the parser has seen so far.
///
/// The derived `PartialOrd`/`Ord` follow declaration order, and the parser
/// compares values with `<` (e.g. `self.encountered < Encountered::Declaration`),
/// so the order of the variants is significant.
// NOTE(review): this hunk is shown without +/- markers, so lines from both sides
// of the diff appear together here — confirm the final variant list against the
// real file before relying on it.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Encountered {
None = 0,
Declaration = 1,
Comment = 2,
Doctype = 3,
Element = 4,
AnyChars, // whitespace before <?xml is not allowed
Declaration,
Comment,
Doctype,
Element,
}

impl PullParser {
Expand All @@ -117,7 +118,7 @@ impl PullParser {
PullParser {
config,
lexer,
st: State::OutsideTag,
st: State::DocumentStart,
state_after_reference: State::OutsideTag,
buf: String::new(),
entities: HashMap::new(),
Expand Down Expand Up @@ -159,7 +160,7 @@ impl PullParser {

// If declaration was not parsed and we have encountered an element,
// emit this declaration as the next event.
if prev_enc < Encountered::Declaration {
if prev_enc == Encountered::None {
self.push_pos();
Some(Ok(XmlEvent::StartDocument {
version: DEFAULT_VERSION,
Expand Down Expand Up @@ -191,6 +192,7 @@ pub enum State {
InsideDeclaration(DeclarationSubstate),
InsideDoctype(DoctypeSubstate),
InsideReference,
DocumentStart,
}

#[derive(Copy, Clone, PartialEq)]
Expand Down Expand Up @@ -423,6 +425,7 @@ impl PullParser {
State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
State::InsideDoctype(s) => self.inside_doctype(t, s),
State::InsideDeclaration(s) => self.inside_declaration(t, s),
State::DocumentStart => self.document_start(t),
}
}

Expand Down
52 changes: 51 additions & 1 deletion src/reader/parser/outside_tag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ impl PullParser {
Token::ProcessingInstructionStart =>
self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),


Token::CDataStart if self.depth() > 0 => {
self.into_state(State::InsideCData, next_event)
},
Expand All @@ -138,4 +137,55 @@ impl PullParser {
}
}
}

/// Handles a token received while the parser is in `State::DocumentStart`.
///
/// Whitespace before `<?xml` is not allowed, so any leading character is
/// recorded as `Encountered::AnyChars` before it is examined. Comments,
/// opening tags and doctypes record their own `Encountered` value and move
/// to the matching state. Any token not listed below is reported as
/// `SyntaxError::UnexpectedToken`.
pub fn document_start(&mut self, t: Token) -> Option<Result> {
    debug_assert!(self.encountered < Encountered::Declaration);

    match t {
        Token::Character(ch) => {
            // Must be recorded before the whitespace check, including on the error path.
            let pending = self.set_encountered(Encountered::AnyChars);

            if !is_whitespace_char(ch) {
                return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
            }
            self.inside_whitespace = true;

            // Whitespace outside of the root element is dropped when the
            // configuration asks for it; otherwise it starts the text buffer.
            let discard = (self.config.c.trim_whitespace && self.buf.is_empty())
                || (self.depth() == 0 && self.config.c.ignore_root_level_whitespace);
            if !discard {
                self.push_pos();
                self.buf.push(ch);
            }

            // Both outcomes continue in the regular outside-tag state.
            self.into_state(State::OutsideTag, pending)
        },

        Token::CommentStart => {
            let pending = self.set_encountered(Encountered::Comment);
            self.into_state(State::InsideComment, pending)
        },

        Token::OpeningTagStart => {
            let pending = self.set_encountered(Encountered::Element);
            self.nst.push_empty();
            let inside_name = State::InsideOpeningTag(OpeningTagSubstate::InsideName);
            self.into_state(inside_name, pending)
        },

        Token::DoctypeStart => {
            let pending = self.set_encountered(Encountered::Doctype);
            // We don't have a doctype event so skip this position
            // FIXME: update when we have a doctype event
            self.next_pos();
            let doctype = State::InsideDoctype(DoctypeSubstate::Outside);
            self.into_state(doctype, pending)
        },

        Token::ProcessingInstructionStart => {
            // NOTE(review): no `set_encountered` call here — presumably so that a
            // `<?xml ...?>` declaration can still be recognised by the
            // processing-instruction state; confirm against that handler.
            self.push_pos();
            let pi_name =
                State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName);
            self.into_state_continue(pi_name)
        },

        _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
    }
}
}
2 changes: 1 addition & 1 deletion tests/event_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ fn tabs_1() {
test(
b"\t<a>\t<b/></a>",
br#"
|1:2 StartDocument(1.0, UTF-8)
|1:1 StartDocument(1.0, UTF-8)
|1:2 StartElement(a)
|1:6 StartElement(b)
|1:6 EndElement(b)
Expand Down
2 changes: 0 additions & 2 deletions tests/oasis.fail.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
o-p04pass1 p04pass1.xml names with all valid ASCII characters, and one from each other class in NameChar ; 5:8 Element A.-:̀· prefix is unbound
o-p05pass1 p05pass1.xml various valid Name constructions ; 2:8 Element A:._-0 prefix is unbound
o-p01fail1 p01fail1.xml S cannot occur before the prolog
o-p09fail1 p09fail1.xml EntityValue excludes '%'
o-p09fail2 p09fail2.xml EntityValue excludes '&'
o-p12fail1 p12fail1.xml '"' excluded
Expand All @@ -10,7 +9,6 @@ o-p12fail4 p12fail4.xml '>' excluded
o-p12fail5 p12fail5.xml '<' excluded
o-p12fail6 p12fail6.xml built-in entity refs excluded
o-p12fail7 p12fail7.xml The public ID has a tab character, which is disallowed
o-p22fail1 p22fail1.xml prolog must start with XML decl
o-p30fail1 p30fail1.xml An XML declaration is not the same as a TextDecl
o-p31fail1 p31fail1.xml external subset excludes doctypedecl
o-p32fail3 p32fail3.xml initial S is required
Expand Down
1 change: 0 additions & 1 deletion tests/sun-not-wf.fail.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ pubid02 pubid02.xml Illegal characters in public ID
pubid03 pubid03.xml Illegal characters in public ID
pubid04 pubid04.xml Illegal characters in public ID
pubid05 pubid05.xml SGML-ism: public ID without system ID
sgml02 sgml02.xml XML declaration must be at the very beginning of a document; it"s not a processing instruction
sgml04 sgml04.xml ATTLIST declarations apply to only one element, unlike SGML
sgml05 sgml05.xml ELEMENT declarations apply to only one element, unlike SGML
sgml06 sgml06.xml ATTLIST declarations are never global, unlike in SGML
Expand Down
1 change: 0 additions & 1 deletion tests/xmltest.fail.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ not-wf-sa-136 136.xml Tag omission is invalid in XML.
not-wf-sa-137 137.xml Space is required before a content model.
not-wf-sa-138 138.xml Invalid syntax for content particle.
not-wf-sa-139 139.xml The element-content model should not be empty.
not-wf-sa-147 147.xml XML Declaration may not be preceded by whitespace.
not-wf-sa-149 149.xml XML Declaration may not be within a DTD.
not-wf-sa-158 158.xml SGML-ism: "#NOTATION gif" can't have attributes.
not-wf-sa-159 159.xml Uses '&' unquoted in an entity declaration, which is illegal syntax for an entity reference.
Expand Down

0 comments on commit 535914e

Please sign in to comment.