Skip to content

Commit

Permalink
fix: get images from page return value
Browse files (browse the repository at this point in the history)
  • Loading branch information
johannschopplich committed Sep 2, 2023
1 parent 7d9d16f commit fc3d7bf
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 14 deletions.
14 changes: 8 additions & 6 deletions src/image.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,18 @@ export async function getImagesFromPage(
const operatorList = await page.getOperatorList();
const { OPS } = await getResolvedPDFJS();

const images: ArrayBuffer[] = [];
for (const op of operatorList.fnArray) {
const images: Uint8ClampedArray[] = [];

for (let i = 0; i < operatorList.fnArray.length; i++) {
const op = operatorList.fnArray[i];

if (op !== OPS.paintImageXObject) {
continue;
}

const image = await page.objs.get(operatorList.argsArray[op][0]);
if (image.data) {
images.push(image.data.buffer);
}
const imageKey = operatorList.argsArray[i][0];
const image = await page.objs.get(imageKey);
images.push(image.data);
}

return images;
Expand Down
Binary file added test/fixtures/image-sample.pdf
Binary file not shown.
28 changes: 20 additions & 8 deletions test/index.test.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import { join } from "node:path";
import { readFile } from "node:fs/promises";
import { describe, expect, it } from "vitest";
import {
extractPDFText,
getDocumentProxy,
getImagesFromPage,
getPDFMeta,
getResolvedPDFJS,
resolvePDFJSImports,
Expand All @@ -26,18 +28,27 @@ describe("unpdf", () => {
expect(version).toMatchSnapshot();
});

// Metadata extraction: passes the bytes returned by getPDF() to getPDFMeta()
// and checks both parts of the result (`info` and `metadata`).
it("extracts metadata from a PDF", async () => {
const { info, metadata } = await getPDFMeta(await getPDF());

// The `metadata` object is expected to have no keys for this fixture.
expect(Object.keys(metadata).length).toEqual(0);
// `info` is compared against the stored snapshot instead of literal values.
expect(info).toMatchSnapshot();
});

// Text extraction: passes the bytes returned by getPDF() to extractPDFText()
// and checks the extracted text and the reported page count.
it("extracts text from a PDF", async () => {
const { text, totalPages } = await extractPDFText(await getPDF());

// The first entry of `text` must equal the expected string exactly.
expect(text[0]).toEqual("Dummy PDF file");
// Exactly one page is expected to be reported.
expect(totalPages).toEqual(1);
});

it("extracts metadata from a PDF", async () => {
const { info, metadata } = await getPDFMeta(await getPDF());

expect(Object.keys(metadata).length).toEqual(0);
expect(info).toMatchSnapshot();
it("extracts images from a PDF", async () => {
const [image] = await getImagesFromPage(
await getPDF("image-sample.pdf"),
1,
);
const buffer = Buffer.from(image);
expect(buffer.length).toEqual(13_641_540);
});

it("supports PDF passing PDFDocumentProxy", async () => {
Expand All @@ -48,8 +59,9 @@ describe("unpdf", () => {
});
});

export async function getPDF() {
// https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf
const pdf = await readFile(new URL("fixtures/dummy.pdf", import.meta.url));
export async function getPDF(filename = "dummy.pdf") {
const pdf = await readFile(
new URL(join("fixtures", filename), import.meta.url),
);
return new Uint8Array(pdf);
}

0 comments on commit fc3d7bf

Please sign in to comment.