From c18609de978d2aba2d18efa9999ec3b6a0f11024 Mon Sep 17 00:00:00 2001
From: bcapuano <bcapuano@asu.edu>
Date: Sat, 4 Jan 2025 21:07:20 -0700
Subject: [PATCH 1/5] Initial implementation of fuzzing harness

Added pipeline

Fix path in build script

Formatting

Add

Updated changelog to reflect PR number
---
 .github/workflows/cifuzz.yml | 40 ++++++++++++++++
 CHANGELOG.md                 |  1 +
 README.md                    |  1 +
 fuzz/build.sh                | 13 ++++++
 fuzz/fuzz_helpers.py         | 91 ++++++++++++++++++++++++++++++++++++
 fuzz/pdf_load_fuzzer.py      | 59 +++++++++++++++++++++++
 6 files changed, 205 insertions(+)
 create mode 100644 .github/workflows/cifuzz.yml
 create mode 100755 fuzz/build.sh
 create mode 100644 fuzz/fuzz_helpers.py
 create mode 100644 fuzz/pdf_load_fuzzer.py

diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
new file mode 100644
index 00000000..115bd7f1
--- /dev/null
+++ b/.github/workflows/cifuzz.yml
@@ -0,0 +1,40 @@
+name: CIFuzz
+on:
+  push:
+    branches:
+      - stable
+      - develop
+  pull_request:
+permissions: {}
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    permissions:
+      security-events: write
+    steps:
+    - name: Build Fuzzers
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pdfplumber'
+        language: python
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pdfplumber'
+        language: python
+        fuzz-seconds: 800
+        output-sarif: true
+    - name: Upload Crash
+      uses: actions/upload-artifact@v3
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
+    - name: Upload Sarif
+      if: always() && steps.build.outcome == 'success'
+      uses: github/codeql-action/upload-sarif@v2
+      with:
+        # Path to SARIF file relative to the root of the repository
+        sarif_file: cifuzz-sarif/results.sarif
+        checkout_path: cifuzz-sarif
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f02c888..57f9b267 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. The format
 - Add `--format text` options to CLI (in addition to previously-available `csv` and `json`) (h/t @brandonrobertz). ([#1235](https://github.com/jsvine/pdfplumber/pull/1235))
 - Add `raise_unicode_errors: bool` parameter to `pdfplumber.open()` to allow bypassing `UnicodeDecodeError`s in annotation-parsing and generate warnings instead (h/t @stolarczyk). ([#1195](https://github.com/jsvine/pdfplumber/issues/1195))
 - Add `name` property to `image` objects (h/t @djr2015). ([#1201](https://github.com/jsvine/pdfplumber/discussions/1201))
+- Added necessary build scripts, pipelines, and harnesses to integrate with [OSS-Fuzz](https://github.com/google/oss-fuzz). ([#1245](https://github.com/jsvine/pdfplumber/pull/1245))
 
 ### Fixed
 
diff --git a/README.md b/README.md
index 8370475b..e7e1598c 100644
--- a/README.md
+++ b/README.md
@@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [@wodny](https://github.com/wodny)
 - [Michal Stolarczyk](https://github.com/stolarczyk)
 - [Brandon Roberts](https://github.com/brandonrobertz)
+- [@ennamarie19](https://github.com/ennamarie19/)
 
 ## Contributing
 
diff --git a/fuzz/build.sh b/fuzz/build.sh
new file mode 100755
index 00000000..72734874
--- /dev/null
+++ b/fuzz/build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -eu
+# OSS-Fuzz build script: install pdfplumber, build its fuzzers, pack seed PDFs.
+cd "$SRC"/pdfplumber
+pip3 install .
+
+# Build fuzzers in $OUT
+for fuzzer in $(find fuzz -name '*_fuzzer.py');do
+  compile_python_fuzzer "$fuzzer"
+done
+# Use every PDF found in the repository as the seed corpus for pdf_load_fuzzer.
+mkdir -p fuzz/corpus
+find . -name "*.pdf" -exec cp "{}" fuzz/corpus \;
+zip -q "$OUT"/pdf_load_fuzzer_seed_corpus.zip fuzz/corpus/*
diff --git a/fuzz/fuzz_helpers.py b/fuzz/fuzz_helpers.py
new file mode 100644
index 00000000..f5ea91b0
--- /dev/null
+++ b/fuzz/fuzz_helpers.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python3
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+import contextlib
+import io
+import tempfile
+from enum import IntEnum
+from typing import Protocol, Type, TypeVar
+
+import atheris
+
+
+class HasMax(Protocol):  # NOTE(review): not referenced elsewhere in this file
+    MAX: int
+
+
+T = TypeVar("T", bound=IntEnum)
+
+
+class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider):
+    """FuzzedDataProvider with helpers for variable-length data and files."""
+
+    def ConsumeRandomBytes(self) -> bytes:
+        """Consume a fuzzer-chosen number of the remaining bytes."""
+        return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes()))
+
+    def ConsumeRandomString(self) -> str:
+        """Consume a fuzzer-chosen amount of data as a surrogate-free string."""
+        return self.ConsumeUnicodeNoSurrogates(
+            self.ConsumeIntInRange(0, self.remaining_bytes())
+        )
+
+    def ConsumeRemainingString(self) -> str:
+        """Consume everything that is left as a surrogate-free string."""
+        return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes())
+
+    def ConsumeRemainingBytes(self) -> bytes:
+        """Consume everything that is left as bytes."""
+        return self.ConsumeBytes(self.remaining_bytes())
+
+    def _ConsumeFileData(self, all_data: bool, as_bytes: bool):
+        """Consume file contents: all remaining data, or a random amount."""
+        if all_data:
+            return (
+                self.ConsumeRemainingBytes()
+                if as_bytes
+                else self.ConsumeRemainingString()
+            )
+        return self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString()
+
+    @contextlib.contextmanager
+    def ConsumeMemoryFile(
+        self, all_data: bool = False, as_bytes: bool = True
+    ) -> io.BytesIO:
+        """Yield consumed data as BytesIO (or StringIO if not as_bytes)."""
+        file_data = self._ConsumeFileData(all_data, as_bytes)
+        # `with` closes the buffer even if the caller's block raises.
+        with io.BytesIO(file_data) if as_bytes else io.StringIO(file_data) as file:
+            yield file
+
+    @contextlib.contextmanager
+    def ConsumeTemporaryFile(
+        self, suffix: str, all_data: bool = False, as_bytes: bool = True
+    ) -> str:
+        """Yield the path of a named temporary file holding consumed data."""
+        file_data = self._ConsumeFileData(all_data, as_bytes)
+        mode = "w+b" if as_bytes else "w+"
+        # `with` closes the file even if the caller's block raises; closing a
+        # NamedTemporaryFile also deletes it (default delete=True).
+        with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) as tfile:
+            tfile.write(file_data)
+            tfile.seek(0)
+            tfile.flush()
+            yield tfile.name
+
+    def ConsumeEnum(self, enum_type: Type[T]) -> T:
+        """Consume an int between 0 and enum_type.MAX and convert it."""
+        return enum_type(self.ConsumeIntInRange(0, enum_type.MAX))
diff --git a/fuzz/pdf_load_fuzzer.py b/fuzz/pdf_load_fuzzer.py
new file mode 100644
index 00000000..d40800dc
--- /dev/null
+++ b/fuzz/pdf_load_fuzzer.py
@@ -0,0 +1,59 @@
+import sys
+from enum import IntEnum
+
+import atheris
+from fuzz_helpers import EnhancedFuzzedDataProvider
+
+with atheris.instrument_imports(include=["pdfplumber"]):
+    from pdfminer.pdftypes import PDFException
+    from pdfminer.psparser import PSException
+
+    import pdfplumber
+
+
+class CastType(IntEnum):
+    CSV = 0
+    IMAGE = 1
+    JSON = 2
+    DICT = 3
+    MAX = 4  # bound read by fdp.ConsumeEnum; TestOneInput has no branch for it
+
+
+def TestOneInput(data: bytes):
+    """Open fuzz data as a PDF and run one fuzzer-chosen conversion on it.
+
+    Returns -1 when parsing fails in an expected way; other errors propagate.
+    """
+    fdp = EnhancedFuzzedDataProvider(data)
+    try:
+        with fdp.ConsumeMemoryFile(all_data=False, as_bytes=True) as f, \
+                pdfplumber.open(f) as pdf:
+            cast_ty = fdp.ConsumeEnum(CastType)
+            if cast_ty is CastType.CSV:
+                pdf.to_csv()
+            elif cast_ty is CastType.IMAGE and pdf.pages:
+                pdf.pages[0].to_image()
+            elif cast_ty is CastType.JSON:
+                pdf.to_json()
+            elif cast_ty is CastType.DICT:
+                pdf.to_dict()
+
+    except (PDFException, PSException, AssertionError):
+        return -1
+    except ValueError as e:
+        if "invalid literal for int" in str(e):
+            return -1
+        raise
+    except TypeError as e:
+        if "argument must be a string" in str(e):
+            return -1
+        raise
+
+
+def main():
+    atheris.Setup(sys.argv, TestOneInput)  # register the fuzz entry point
+    atheris.Fuzz()  # hand control to the fuzzing engine
+
+
+if __name__ == "__main__":
+    main()

From 580da860fd9e1d552da411301e38387e82fc1b3f Mon Sep 17 00:00:00 2001
From: bcapuano <bcapuano@asu.edu>
Date: Fri, 17 Jan 2025 18:05:20 -0700
Subject: [PATCH 2/5] Added nightly upstream sync

---
 .github/workflows/sync.yml | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 .github/workflows/sync.yml

diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml
new file mode 100644
index 00000000..3ed77882
--- /dev/null
+++ b/.github/workflows/sync.yml
@@ -0,0 +1,28 @@
+name: Sync Fork with Upstream
+
+on:
+  schedule:
+    - cron: '0 0 * * *' # Run daily at midnight UTC
+  workflow_dispatch: # Allow manual triggering
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Fork
+        uses: actions/checkout@v3
+        with:
+          persist-credentials: false # NOTE(review): leaves git without the token; confirm the push below can authenticate
+
+      - name: Set Upstream
+        run: |
+          git remote add upstream https://github.com/jsvine/pdfplumber
+          git fetch upstream
+          git checkout stable
+          git merge upstream/stable
+
+      - name: Push Changes to Fork
+        run: |
+          git push origin stable
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

From c6ebd23318f1750c8be403a632f8073ec6d87fcf Mon Sep 17 00:00:00 2001
From: bcapuano <bcapuano@asu.edu>
Date: Fri, 17 Jan 2025 18:14:26 -0700
Subject: [PATCH 3/5] More work on sync

---
 .github/workflows/sync.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml
index 3ed77882..29f6d2d0 100644
--- a/.github/workflows/sync.yml
+++ b/.github/workflows/sync.yml
@@ -19,7 +19,8 @@ jobs:
           git remote add upstream https://github.com/jsvine/pdfplumber
           git fetch upstream
           git checkout stable
-          git merge upstream/stable
+          git merge upstream/stable --allow-unrelated-histories
+          git status
 
       - name: Push Changes to Fork
         run: |

From 43464b2c970d3edce5ef361987d6afd3414fd127 Mon Sep 17 00:00:00 2001
From: bcapuano <bcapuano@asu.edu>
Date: Fri, 17 Jan 2025 18:17:33 -0700
Subject: [PATCH 4/5] Abandon idea of syncing fork

---
 .github/workflows/cifuzz.yml | 40 ------------------------------------
 1 file changed, 40 deletions(-)
 delete mode 100644 .github/workflows/cifuzz.yml

diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
deleted file mode 100644
index 115bd7f1..00000000
--- a/.github/workflows/cifuzz.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: CIFuzz
-on:
-  push:
-    branches:
-      - stable
-      - develop
-  pull_request:
-permissions: {}
-jobs:
-  Fuzzing:
-    runs-on: ubuntu-latest
-    permissions:
-      security-events: write
-    steps:
-    - name: Build Fuzzers
-      id: build
-      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
-      with:
-        oss-fuzz-project-name: 'pdfplumber'
-        language: python
-    - name: Run Fuzzers
-      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
-      with:
-        oss-fuzz-project-name: 'pdfplumber'
-        language: python
-        fuzz-seconds: 800
-        output-sarif: true
-    - name: Upload Crash
-      uses: actions/upload-artifact@v3
-      if: failure() && steps.build.outcome == 'success'
-      with:
-        name: artifacts
-        path: ./out/artifacts
-    - name: Upload Sarif
-      if: always() && steps.build.outcome == 'success'
-      uses: github/codeql-action/upload-sarif@v2
-      with:
-        # Path to SARIF file relative to the root of the repository
-        sarif_file: cifuzz-sarif/results.sarif
-        checkout_path: cifuzz-sarif

From 2f091f57d53d0f742105e917afcd23e7cf79864b Mon Sep 17 00:00:00 2001
From: bcapuano <bcapuano@asu.edu>
Date: Wed, 12 Feb 2025 13:30:26 -0700
Subject: [PATCH 5/5] Add new exceptions to handler

---
 fuzz/pdf_load_fuzzer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fuzz/pdf_load_fuzzer.py b/fuzz/pdf_load_fuzzer.py
index d40800dc..eaa9d617 100644
--- a/fuzz/pdf_load_fuzzer.py
+++ b/fuzz/pdf_load_fuzzer.py
@@ -7,6 +7,7 @@
 with atheris.instrument_imports(include=["pdfplumber"]):
     from pdfminer.pdftypes import PDFException
     from pdfminer.psparser import PSException
+    from pdfplumber.utils.exceptions import MalformedPDFException, PdfminerException
 
     import pdfplumber
 
@@ -38,7 +39,7 @@ def TestOneInput(data: bytes):
             elif cast_ty is CastType.DICT:
                 pdf.to_dict()
 
-    except (PDFException, PSException, AssertionError):
+    except (PDFException, PSException, AssertionError, MalformedPDFException, PdfminerException):
         return -1
     except ValueError as e:
         if "invalid literal for int" in str(e):