From c18609de978d2aba2d18efa9999ec3b6a0f11024 Mon Sep 17 00:00:00 2001 From: bcapuano Date: Sat, 4 Jan 2025 21:07:20 -0700 Subject: [PATCH 1/5] Initial implementation of fuzzing harness Added pipeline Fix path in build script Formatting Add Updated changelog to reflect PR number --- .github/workflows/cifuzz.yml | 40 ++++++++++++++++ CHANGELOG.md | 1 + README.md | 1 + fuzz/build.sh | 13 ++++++ fuzz/fuzz_helpers.py | 91 ++++++++++++++++++++++++++++++++++++ fuzz/pdf_load_fuzzer.py | 59 +++++++++++++++++++++++ 6 files changed, 205 insertions(+) create mode 100644 .github/workflows/cifuzz.yml create mode 100755 fuzz/build.sh create mode 100644 fuzz/fuzz_helpers.py create mode 100644 fuzz/pdf_load_fuzzer.py diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml new file mode 100644 index 00000000..115bd7f1 --- /dev/null +++ b/.github/workflows/cifuzz.yml @@ -0,0 +1,40 @@ +name: CIFuzz +on: + push: + branches: + - stable + - develop + pull_request: +permissions: {} +jobs: + Fuzzing: + runs-on: ubuntu-latest + permissions: + security-events: write + steps: + - name: Build Fuzzers + id: build + uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master + with: + oss-fuzz-project-name: 'pdfplumber' + language: python + - name: Run Fuzzers + uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master + with: + oss-fuzz-project-name: 'pdfplumber' + language: python + fuzz-seconds: 800 + output-sarif: true + - name: Upload Crash + uses: actions/upload-artifact@v3 + if: failure() && steps.build.outcome == 'success' + with: + name: artifacts + path: ./out/artifacts + - name: Upload Sarif + if: always() && steps.build.outcome == 'success' + uses: github/codeql-action/upload-sarif@v2 + with: + # Path to SARIF file relative to the root of the repository + sarif_file: cifuzz-sarif/results.sarif + checkout_path: cifuzz-sarif diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f02c888..57f9b267 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ All 
notable changes to this project will be documented in this file. The format - Add `--format text` options to CLI (in addition to previously-available `csv` and `json`) (h/t @brandonrobertz). ([#1235](https://github.com/jsvine/pdfplumber/pull/1235)) - Add `raise_unicode_errors: bool` parameter to `pdfplumber.open()` to allow bypassing `UnicodeDecodeError`s in annotation-parsing and generate warnings instead (h/t @stolarczyk). ([#1195](https://github.com/jsvine/pdfplumber/issues/1195)) - Add `name` property to `image` objects (h/t @djr2015). ([#1201](https://github.com/jsvine/pdfplumber/discussions/1201)) +- Added necessary build scripts, pipelines, and harnesses to integrate with [OSS-Fuzz](https://github.com/google/oss-fuzz). ([#1245](https://github.com/jsvine/pdfplumber/pull/1245)) ### Fixed diff --git a/README.md b/README.md index 8370475b..e7e1598c 100644 --- a/README.md +++ b/README.md @@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes - [@wodny](https://github.com/wodny) - [Michal Stolarczyk](https://github.com/stolarczyk) - [Brandon Roberts](https://github.com/brandonrobertz) +- [@ennamarie19](https://github.com/ennamarie19/) ## Contributing diff --git a/fuzz/build.sh b/fuzz/build.sh new file mode 100755 index 00000000..72734874 --- /dev/null +++ b/fuzz/build.sh @@ -0,0 +1,13 @@ +#!/bin/bash -eu + +cd "$SRC"/pdfplumber +pip3 install . + +# Build fuzzers in $OUT +for fuzzer in $(find fuzz -name '*_fuzzer.py');do + compile_python_fuzzer "$fuzzer" +done + +mkdir -p fuzz/corpus +find . 
-name "*.pdf" -exec cp "{}" fuzz/corpus \; +zip -q $OUT/pdf_load_fuzzer_seed_corpus.zip fuzz/corpus/* diff --git a/fuzz/fuzz_helpers.py b/fuzz/fuzz_helpers.py new file mode 100644 index 00000000..f5ea91b0 --- /dev/null +++ b/fuzz/fuzz_helpers.py @@ -0,0 +1,91 @@ +#!/usr/bin/python3 +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ +import contextlib +import io +import tempfile +from enum import IntEnum +from typing import Protocol, Type, TypeVar + +import atheris + + +class HasMax(Protocol): + MAX: int + + +T = TypeVar("T", bound=IntEnum) + + +class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider): + def ConsumeRandomBytes(self) -> bytes: + return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes())) + + def ConsumeRandomString(self) -> str: + return self.ConsumeUnicodeNoSurrogates( + self.ConsumeIntInRange(0, self.remaining_bytes()) + ) + + def ConsumeRemainingString(self) -> str: + return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes()) + + def ConsumeRemainingBytes(self) -> bytes: + return self.ConsumeBytes(self.remaining_bytes()) + + @contextlib.contextmanager + def ConsumeMemoryFile( + self, all_data: bool = False, as_bytes: bool = True + ) -> io.BytesIO: + if all_data: + file_data = ( + self.ConsumeRemainingBytes() + if as_bytes + else self.ConsumeRemainingString() + ) + else: + file_data = ( + self.ConsumeRandomBytes() if 
as_bytes else self.ConsumeRandomString() + ) + + file = io.BytesIO(file_data) if as_bytes else io.StringIO(file_data) + yield file + file.close() + + @contextlib.contextmanager + def ConsumeTemporaryFile( + self, suffix: str, all_data: bool = False, as_bytes: bool = True + ) -> str: + if all_data: + file_data = ( + self.ConsumeRemainingBytes() + if as_bytes + else self.ConsumeRemainingString() + ) + else: + file_data = ( + self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString() + ) + + mode = "w+b" if as_bytes else "w+" + tfile = tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) + tfile.write(file_data) + tfile.seek(0) + tfile.flush() + yield tfile.name + tfile.close() + + def ConsumeEnum(self, enum_type: Type[T]) -> T: + return enum_type(self.ConsumeIntInRange(0, enum_type.MAX)) diff --git a/fuzz/pdf_load_fuzzer.py b/fuzz/pdf_load_fuzzer.py new file mode 100644 index 00000000..d40800dc --- /dev/null +++ b/fuzz/pdf_load_fuzzer.py @@ -0,0 +1,59 @@ +import sys +from enum import IntEnum + +import atheris +from fuzz_helpers import EnhancedFuzzedDataProvider + +with atheris.instrument_imports(include=["pdfplumber"]): + from pdfminer.pdftypes import PDFException + from pdfminer.psparser import PSException + + import pdfplumber + + +class CastType(IntEnum): + CSV = 0 + IMAGE = 1 + JSON = 2 + DICT = 3 + MAX = 4 + + +def TestOneInput(data: bytes): + fdp = EnhancedFuzzedDataProvider(data) + + try: + with fdp.ConsumeMemoryFile(all_data=False, as_bytes=True) as f: + pdf = pdfplumber.open(f) + + # Test casting + cast_ty = fdp.ConsumeEnum(CastType) + + if cast_ty is CastType.CSV: + pdf.to_csv() + elif cast_ty is CastType.IMAGE and pdf.pages: + pdf.pages[0].to_image() + elif cast_ty is CastType.JSON: + pdf.to_json() + elif cast_ty is CastType.DICT: + pdf.to_dict() + + except (PDFException, PSException, AssertionError): + return -1 + except ValueError as e: + if "invalid literal for int" in str(e): + return -1 + raise e + except TypeError as e: + if "argument 
must be a string" in str(e): + return -1 + raise e + + +def main(): + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() From 580da860fd9e1d552da411301e38387e82fc1b3f Mon Sep 17 00:00:00 2001 From: bcapuano Date: Fri, 17 Jan 2025 18:05:20 -0700 Subject: [PATCH 2/5] Added nightly upstream sync --- .github/workflows/sync.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/sync.yml diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml new file mode 100644 index 00000000..3ed77882 --- /dev/null +++ b/.github/workflows/sync.yml @@ -0,0 +1,28 @@ +name: Sync Fork with Upstream + +on: + schedule: + - cron: '0 0 * * *' # Run daily at midnight UTC + workflow_dispatch: # Allow manual triggering + +jobs: + sync: + runs-on: ubuntu-latest + steps: + - name: Checkout Fork + uses: actions/checkout@v3 + with: + persist-credentials: false + + - name: Set Upstream + run: | + git remote add upstream https://github.com/jsvine/pdfplumber + git fetch upstream + git checkout stable + git merge upstream/stable + + - name: Push Changes to Fork + run: | + git push origin stable + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From c6ebd23318f1750c8be403a632f8073ec6d87fcf Mon Sep 17 00:00:00 2001 From: bcapuano Date: Fri, 17 Jan 2025 18:14:26 -0700 Subject: [PATCH 3/5] More work on sync --- .github/workflows/sync.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml index 3ed77882..29f6d2d0 100644 --- a/.github/workflows/sync.yml +++ b/.github/workflows/sync.yml @@ -19,7 +19,8 @@ jobs: git remote add upstream https://github.com/jsvine/pdfplumber git fetch upstream git checkout stable - git merge upstream/stable + git merge upstream/stable --allow-unrelated-histories + git status - name: Push Changes to Fork run: | From 43464b2c970d3edce5ef361987d6afd3414fd127 Mon Sep 17 00:00:00 2001 From: bcapuano Date: 
Fri, 17 Jan 2025 18:17:33 -0700 Subject: [PATCH 4/5] Abandon idea of syncing fork --- .github/workflows/cifuzz.yml | 40 ------------------------------------ 1 file changed, 40 deletions(-) delete mode 100644 .github/workflows/cifuzz.yml diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml deleted file mode 100644 index 115bd7f1..00000000 --- a/.github/workflows/cifuzz.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: CIFuzz -on: - push: - branches: - - stable - - develop - pull_request: -permissions: {} -jobs: - Fuzzing: - runs-on: ubuntu-latest - permissions: - security-events: write - steps: - - name: Build Fuzzers - id: build - uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master - with: - oss-fuzz-project-name: 'pdfplumber' - language: python - - name: Run Fuzzers - uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master - with: - oss-fuzz-project-name: 'pdfplumber' - language: python - fuzz-seconds: 800 - output-sarif: true - - name: Upload Crash - uses: actions/upload-artifact@v3 - if: failure() && steps.build.outcome == 'success' - with: - name: artifacts - path: ./out/artifacts - - name: Upload Sarif - if: always() && steps.build.outcome == 'success' - uses: github/codeql-action/upload-sarif@v2 - with: - # Path to SARIF file relative to the root of the repository - sarif_file: cifuzz-sarif/results.sarif - checkout_path: cifuzz-sarif From 2f091f57d53d0f742105e917afcd23e7cf79864b Mon Sep 17 00:00:00 2001 From: bcapuano Date: Wed, 12 Feb 2025 13:30:26 -0700 Subject: [PATCH 5/5] Add new exceptions to handler --- fuzz/pdf_load_fuzzer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fuzz/pdf_load_fuzzer.py b/fuzz/pdf_load_fuzzer.py index d40800dc..eaa9d617 100644 --- a/fuzz/pdf_load_fuzzer.py +++ b/fuzz/pdf_load_fuzzer.py @@ -7,6 +7,7 @@ with atheris.instrument_imports(include=["pdfplumber"]): from pdfminer.pdftypes import PDFException from pdfminer.psparser import PSException + from pdfplumber.utils.exceptions 
import MalformedPDFException, PdfminerException import pdfplumber @@ -38,7 +39,7 @@ def TestOneInput(data: bytes): elif cast_ty is CastType.DICT: pdf.to_dict() - except (PDFException, PSException, AssertionError): + except (PDFException, PSException, AssertionError, MalformedPDFException, PdfminerException): return -1 except ValueError as e: if "invalid literal for int" in str(e):