diff --git a/src/core/parser/PDFObjectParser.ts b/src/core/parser/PDFObjectParser.ts index 15d093ec7..88e603b1d 100644 --- a/src/core/parser/PDFObjectParser.ts +++ b/src/core/parser/PDFObjectParser.ts @@ -27,8 +27,6 @@ import { DigitChars, NumericChars } from 'src/core/syntax/Numeric'; import { WhitespaceChars } from 'src/core/syntax/Whitespace'; import { charFromCode } from 'src/utils'; -const { Newline, CarriageReturn } = CharCodes; - // TODO: Throw error if eof is reached before finishing object parse... class PDFObjectParser extends BaseParser { static forBytes = (bytes: Uint8Array, context: PDFContext) => @@ -204,19 +202,19 @@ class PDFObjectParser extends BaseParser { } } - // TODO: Handle 'stream \r\n' (https://github.com/Hopding/pdf-lib/issues/119) protected parseDictOrStream(): PDFDict | PDFStream { const dict = this.parseDict(); this.skipWhitespaceAndComments(); - if (!this.matchKeyword(Keywords.stream)) return dict; - // Move past the EOL marker following `stream` (\r\n or \r or \n) - const byte = this.bytes.peek(); - if (byte === Newline) this.bytes.next(); - if (byte === CarriageReturn) { - this.bytes.next(); - if (this.bytes.peek() === Newline) this.bytes.next(); + if ( + !this.matchKeyword(Keywords.streamEOF1) && + !this.matchKeyword(Keywords.streamEOF2) && + !this.matchKeyword(Keywords.streamEOF3) && + !this.matchKeyword(Keywords.streamEOF4) && + !this.matchKeyword(Keywords.stream) + ) { + return dict; } const start = this.bytes.offset(); @@ -225,26 +223,30 @@ class PDFObjectParser extends BaseParser { // Move to end of stream, while handling nested streams let nestingLvl = 1; + let end = this.bytes.offset(); + while (!this.bytes.done()) { - if (this.matchKeyword(Keywords.stream)) nestingLvl += 1; - if (this.matchKeyword(Keywords.endstream)) nestingLvl -= 1; + end = this.bytes.offset(); + + if (this.matchKeyword(Keywords.stream)) { + nestingLvl += 1; + } else if ( + this.matchKeyword(Keywords.EOF1endstream) || + this.matchKeyword(Keywords.EOF2endstream) || + this.matchKeyword(Keywords.EOF3endstream) || + this.matchKeyword(Keywords.endstream) + ) { + nestingLvl -= 1; + } else { + this.bytes.next(); + } + if (nestingLvl === 0) break; - this.bytes.next(); } // TODO: Create proper error object for this if (nestingLvl !== 0) throw new Error('FIX ME!'); - let end = this.bytes.offset() - Keywords.endstream.length; - - // Move back our `end` marker to account for the EOL marker that should - // be in front of `endstream` (\r\n or \r or \n) - const twoBack = this.bytes.peekAt(end - 2); - const oneBack = this.bytes.peekAt(end - 1); - if (twoBack === CarriageReturn && oneBack === Newline) end -= 2; - else if (oneBack === CarriageReturn) end -= 1; - else if (oneBack === Newline) end -= 1; - const contents = this.bytes.slice(start, end); return PDFRawStream.of(dict, contents); diff --git a/src/core/syntax/Keywords.ts b/src/core/syntax/Keywords.ts index 596a19259..12e2f957c 100644 --- a/src/core/syntax/Keywords.ts +++ b/src/core/syntax/Keywords.ts @@ -1,5 +1,28 @@ import CharCodes from 'src/core/syntax/CharCodes'; +const { Space, CarriageReturn, Newline } = CharCodes; + +const stream = [ + CharCodes.s, + CharCodes.t, + CharCodes.r, + CharCodes.e, + CharCodes.a, + CharCodes.m, +]; + +const endstream = [ + CharCodes.e, + CharCodes.n, + CharCodes.d, + CharCodes.s, + CharCodes.t, + CharCodes.r, + CharCodes.e, + CharCodes.a, + CharCodes.m, +]; + export const Keywords = { header: [ CharCodes.Percent, @@ -48,23 +71,13 @@ export const Keywords = { true: [CharCodes.t, CharCodes.r, CharCodes.u, CharCodes.e], false: [CharCodes.f, CharCodes.a, CharCodes.l, CharCodes.s, CharCodes.e], null: [CharCodes.n, CharCodes.u, CharCodes.l, CharCodes.l], - stream: [ - CharCodes.s, - CharCodes.t, - CharCodes.r, - CharCodes.e, - CharCodes.a, - CharCodes.m, - ], - endstream: [ - CharCodes.e, - CharCodes.n, - CharCodes.d, - CharCodes.s, - CharCodes.t, - CharCodes.r, - CharCodes.e, - CharCodes.a, - CharCodes.m, - ], + stream, + streamEOF1: [...stream, Space, CarriageReturn, Newline], + streamEOF2: [...stream, CarriageReturn, Newline], + streamEOF3: [...stream, CarriageReturn], + streamEOF4: [...stream, Newline], + endstream, + EOF1endstream: [CarriageReturn, Newline, ...endstream], + EOF2endstream: [CarriageReturn, ...endstream], + EOF3endstream: [Newline, ...endstream], }; diff --git a/tests/core/parser/PDFObjectParser.spec.ts b/tests/core/parser/PDFObjectParser.spec.ts index f4ecae968..3d89ec1b8 100644 --- a/tests/core/parser/PDFObjectParser.spec.ts +++ b/tests/core/parser/PDFObjectParser.spec.ts @@ -509,6 +509,16 @@ describe(`PDFObjectParser`, () => { }); }); + // Note that the ' \r\n' sequence following the 'stream' keyword is + // technically invalid (per the specification). But some PDFs have it, so + // we will support it anyways. + it(`handles streams with a space, carriage return, and a newline following the 'stream' keyword`, () => { + expectParse(`<<>>\r\nstream \r\n Stuff and Things \nendstream`); + expectParseStr(`<<>>\r\nstream \r\n Stuff and Things \nendstream`).toBe( + '<<\n/Length 18\n>>\nstream\n Stuff and Things \nendstream', + ); + }); + it(`handles streams with a carriage return and a newline following the 'stream' keyword`, () => { expectParse(`<<>>\r\nstream\r\n Stuff and Things \nendstream`); expectParseStr(`<<>>\r\nstream\r\n Stuff and Things \nendstream`).toBe(