Adapt parser to handle #119

Hopding · Jun 8, 2019 · 2ffb88f
1 parent 9e6148b
commit 2ffb88f
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 42 deletions.
diff --git a/src/core/parser/PDFObjectParser.ts b/src/core/parser/PDFObjectParser.ts
@@ -27,8 +27,6 @@ import { DigitChars, NumericChars } from 'src/core/syntax/Numeric';
 import { WhitespaceChars } from 'src/core/syntax/Whitespace';
 import { charFromCode } from 'src/utils';
 
-const { Newline, CarriageReturn } = CharCodes;
-
 // TODO: Throw error if eof is reached before finishing object parse...
 class PDFObjectParser extends BaseParser {
   static forBytes = (bytes: Uint8Array, context: PDFContext) =>
@@ -204,19 +202,19 @@ class PDFObjectParser extends BaseParser {
     }
   }
 
-  // TODO: Handle 'stream \r\n' (https://github.com/Hopding/pdf-lib/issues/119)
   protected parseDictOrStream(): PDFDict | PDFStream {
     const dict = this.parseDict();
 
     this.skipWhitespaceAndComments();
-    if (!this.matchKeyword(Keywords.stream)) return dict;
 
-    // Move past the EOL marker following `stream` (\r\n or \r or \n)
-    const byte = this.bytes.peek();
-    if (byte === Newline) this.bytes.next();
-    if (byte === CarriageReturn) {
-      this.bytes.next();
-      if (this.bytes.peek() === Newline) this.bytes.next();
+    if (
+      !this.matchKeyword(Keywords.streamEOF1) &&
+      !this.matchKeyword(Keywords.streamEOF2) &&
+      !this.matchKeyword(Keywords.streamEOF3) &&
+      !this.matchKeyword(Keywords.streamEOF4) &&
+      !this.matchKeyword(Keywords.stream)
+    ) {
+      return dict;
     }
 
     const start = this.bytes.offset();
@@ -225,26 +223,30 @@ class PDFObjectParser extends BaseParser {
 
     // Move to end of stream, while handling nested streams
     let nestingLvl = 1;
+    let end = this.bytes.offset();
+
     while (!this.bytes.done()) {
-      if (this.matchKeyword(Keywords.stream)) nestingLvl += 1;
-      if (this.matchKeyword(Keywords.endstream)) nestingLvl -= 1;
+      end = this.bytes.offset();
+
+      if (this.matchKeyword(Keywords.stream)) {
+        nestingLvl += 1;
+      } else if (
+        this.matchKeyword(Keywords.EOF1endstream) ||
+        this.matchKeyword(Keywords.EOF2endstream) ||
+        this.matchKeyword(Keywords.EOF3endstream) ||
+        this.matchKeyword(Keywords.endstream)
+      ) {
+        nestingLvl -= 1;
+      } else {
+        this.bytes.next();
+      }
+
       if (nestingLvl === 0) break;
-      this.bytes.next();
     }
 
     // TODO: Create proper error object for this
     if (nestingLvl !== 0) throw new Error('FIX ME!');
 
-    let end = this.bytes.offset() - Keywords.endstream.length;
-
-    // Move back our `end` marker to account for the EOL marker that should
-    // be in front of `endstream` (\r\n or \r or \n)
-    const twoBack = this.bytes.peekAt(end - 2);
-    const oneBack = this.bytes.peekAt(end - 1);
-    if (twoBack === CarriageReturn && oneBack === Newline) end -= 2;
-    else if (oneBack === CarriageReturn) end -= 1;
-    else if (oneBack === Newline) end -= 1;
-
     const contents = this.bytes.slice(start, end);
 
     return PDFRawStream.of(dict, contents);

diff --git a/src/core/syntax/Keywords.ts b/src/core/syntax/Keywords.ts
@@ -1,5 +1,28 @@
 import CharCodes from 'src/core/syntax/CharCodes';
 
+const { Space, CarriageReturn, Newline } = CharCodes;
+
+const stream = [
+  CharCodes.s,
+  CharCodes.t,
+  CharCodes.r,
+  CharCodes.e,
+  CharCodes.a,
+  CharCodes.m,
+];
+
+const endstream = [
+  CharCodes.e,
+  CharCodes.n,
+  CharCodes.d,
+  CharCodes.s,
+  CharCodes.t,
+  CharCodes.r,
+  CharCodes.e,
+  CharCodes.a,
+  CharCodes.m,
+];
+
 export const Keywords = {
   header: [
     CharCodes.Percent,
@@ -48,23 +71,13 @@ export const Keywords = {
   true: [CharCodes.t, CharCodes.r, CharCodes.u, CharCodes.e],
   false: [CharCodes.f, CharCodes.a, CharCodes.l, CharCodes.s, CharCodes.e],
   null: [CharCodes.n, CharCodes.u, CharCodes.l, CharCodes.l],
-  stream: [
-    CharCodes.s,
-    CharCodes.t,
-    CharCodes.r,
-    CharCodes.e,
-    CharCodes.a,
-    CharCodes.m,
-  ],
-  endstream: [
-    CharCodes.e,
-    CharCodes.n,
-    CharCodes.d,
-    CharCodes.s,
-    CharCodes.t,
-    CharCodes.r,
-    CharCodes.e,
-    CharCodes.a,
-    CharCodes.m,
-  ],
+  stream,
+  streamEOF1: [...stream, Space, CarriageReturn, Newline],
+  streamEOF2: [...stream, CarriageReturn, Newline],
+  streamEOF3: [...stream, CarriageReturn],
+  streamEOF4: [...stream, Newline],
+  endstream,
+  EOF1endstream: [CarriageReturn, Newline, ...endstream],
+  EOF2endstream: [CarriageReturn, ...endstream],
+  EOF3endstream: [Newline, ...endstream],
 };
diff --git a/tests/core/parser/PDFObjectParser.spec.ts b/tests/core/parser/PDFObjectParser.spec.ts
@@ -509,6 +509,16 @@ describe(`PDFObjectParser`, () => {
       });
     });
 
+    // Note that the ' \r\n' sequence following the 'stream' keyword is
+    // technically invalid according to the PDF specification. However, some
+    // PDFs contain it, so we support it anyway.
+    it(`handles streams with a space, carriage return, and a newline following the 'stream' keyword`, () => {
+      expectParse(`<<>>\r\nstream \r\n Stuff and Things \nendstream`);
+      expectParseStr(`<<>>\r\nstream \r\n Stuff and Things \nendstream`).toBe(
+        '<<\n/Length 18\n>>\nstream\n Stuff and Things \nendstream',
+      );
+    });
+
     it(`handles streams with a carriage return and a newline following the 'stream' keyword`, () => {
       expectParse(`<<>>\r\nstream\r\n Stuff and Things \nendstream`);
       expectParseStr(`<<>>\r\nstream\r\n Stuff and Things \nendstream`).toBe(