Skip to content

Commit

Permalink
Adapt parser to handle #119
Browse files Browse the repository at this point in the history
  • Loading branch information
Hopding committed Jun 8, 2019
1 parent 9e6148b commit 2ffb88f
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 42 deletions.
48 changes: 25 additions & 23 deletions src/core/parser/PDFObjectParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ import { DigitChars, NumericChars } from 'src/core/syntax/Numeric';
import { WhitespaceChars } from 'src/core/syntax/Whitespace';
import { charFromCode } from 'src/utils';

const { Newline, CarriageReturn } = CharCodes;

// TODO: Throw error if eof is reached before finishing object parse...
class PDFObjectParser extends BaseParser {
static forBytes = (bytes: Uint8Array, context: PDFContext) =>
Expand Down Expand Up @@ -204,19 +202,19 @@ class PDFObjectParser extends BaseParser {
}
}

// TODO: Handle 'stream \r\n' (https://github.com/Hopding/pdf-lib/issues/119)
protected parseDictOrStream(): PDFDict | PDFStream {
const dict = this.parseDict();

this.skipWhitespaceAndComments();
if (!this.matchKeyword(Keywords.stream)) return dict;

// Move past the EOL marker following `stream` (\r\n or \r or \n)
const byte = this.bytes.peek();
if (byte === Newline) this.bytes.next();
if (byte === CarriageReturn) {
this.bytes.next();
if (this.bytes.peek() === Newline) this.bytes.next();
if (
!this.matchKeyword(Keywords.streamEOF1) &&
!this.matchKeyword(Keywords.streamEOF2) &&
!this.matchKeyword(Keywords.streamEOF3) &&
!this.matchKeyword(Keywords.streamEOF4) &&
!this.matchKeyword(Keywords.stream)
) {
return dict;
}

const start = this.bytes.offset();
Expand All @@ -225,26 +223,30 @@ class PDFObjectParser extends BaseParser {

// Move to end of stream, while handling nested streams
let nestingLvl = 1;
let end = this.bytes.offset();

while (!this.bytes.done()) {
if (this.matchKeyword(Keywords.stream)) nestingLvl += 1;
if (this.matchKeyword(Keywords.endstream)) nestingLvl -= 1;
end = this.bytes.offset();

if (this.matchKeyword(Keywords.stream)) {
nestingLvl += 1;
} else if (
this.matchKeyword(Keywords.EOF1endstream) ||
this.matchKeyword(Keywords.EOF2endstream) ||
this.matchKeyword(Keywords.EOF3endstream) ||
this.matchKeyword(Keywords.endstream)
) {
nestingLvl -= 1;
} else {
this.bytes.next();
}

if (nestingLvl === 0) break;
this.bytes.next();
}

// TODO: Create proper error object for this
if (nestingLvl !== 0) throw new Error('FIX ME!');

let end = this.bytes.offset() - Keywords.endstream.length;

// Move back our `end` marker to account for the EOL marker that should
// be in front of `endstream` (\r\n or \r or \n)
const twoBack = this.bytes.peekAt(end - 2);
const oneBack = this.bytes.peekAt(end - 1);
if (twoBack === CarriageReturn && oneBack === Newline) end -= 2;
else if (oneBack === CarriageReturn) end -= 1;
else if (oneBack === Newline) end -= 1;

const contents = this.bytes.slice(start, end);

return PDFRawStream.of(dict, contents);
Expand Down
51 changes: 32 additions & 19 deletions src/core/syntax/Keywords.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,28 @@
import CharCodes from 'src/core/syntax/CharCodes';

const { Space, CarriageReturn, Newline } = CharCodes;

// CharCodes entries for the letters s-t-r-e-a-m, in order. Declared outside
// of `Keywords` so that other keyword sequences can be built from it.
// prettier-ignore
const stream = [
  CharCodes.s, CharCodes.t, CharCodes.r,
  CharCodes.e, CharCodes.a, CharCodes.m,
];

// CharCodes entries for the letters e-n-d followed by the entries of `stream`
// (e-n-d-s-t-r-e-a-m, in order).
const endstream = [CharCodes.e, CharCodes.n, CharCodes.d, ...stream];

export const Keywords = {
header: [
CharCodes.Percent,
Expand Down Expand Up @@ -48,23 +71,13 @@ export const Keywords = {
true: [CharCodes.t, CharCodes.r, CharCodes.u, CharCodes.e],
false: [CharCodes.f, CharCodes.a, CharCodes.l, CharCodes.s, CharCodes.e],
null: [CharCodes.n, CharCodes.u, CharCodes.l, CharCodes.l],
stream: [
CharCodes.s,
CharCodes.t,
CharCodes.r,
CharCodes.e,
CharCodes.a,
CharCodes.m,
],
endstream: [
CharCodes.e,
CharCodes.n,
CharCodes.d,
CharCodes.s,
CharCodes.t,
CharCodes.r,
CharCodes.e,
CharCodes.a,
CharCodes.m,
],
stream,
streamEOF1: [...stream, Space, CarriageReturn, Newline],
streamEOF2: [...stream, CarriageReturn, Newline],
streamEOF3: [...stream, CarriageReturn],
streamEOF4: [...stream, Newline],
endstream,
EOF1endstream: [CarriageReturn, Newline, ...endstream],
EOF2endstream: [CarriageReturn, ...endstream],
EOF3endstream: [Newline, ...endstream],
};
10 changes: 10 additions & 0 deletions tests/core/parser/PDFObjectParser.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,16 @@ describe(`PDFObjectParser`, () => {
});
});

// Note that the ' \r\n' sequence following the 'stream' keyword is
// technically invalid (per the specification). However, some PDFs contain
// it, so we support it anyway.
it(`handles streams with a space, carriage return, and a newline following the 'stream' keyword`, () => {
  // Input where `stream` is followed by ' \r\n' before the stream contents.
  const input = `<<>>\r\nstream \r\n Stuff and Things \nendstream`;
  // Expected serialized form of the parsed object.
  const expected = '<<\n/Length 18\n>>\nstream\n Stuff and Things \nendstream';

  expectParse(input);
  expectParseStr(input).toBe(expected);
});

it(`handles streams with a carriage return and a newline following the 'stream' keyword`, () => {
expectParse(`<<>>\r\nstream\r\n Stuff and Things \nendstream`);
expectParseStr(`<<>>\r\nstream\r\n Stuff and Things \nendstream`).toBe(
Expand Down

0 comments on commit 2ffb88f

Please sign in to comment.