Skip to content

Commit

Permalink
MAINT: Quadratic runtime while parsing reduced to linear (#808)
Browse files Browse the repository at this point in the history
When the PdfFileReader tries to find the xref marker, the readNextEndLine method builds a so-called line by reading the stream byte by byte. Every time a new byte is read, it is concatenated with the line read so far. Because Python strings (including byte strings) are immutable, every concatenation has to copy the line, which leads to quadratic O(n²) runtime, where n is the size of the file.
For files where the xref marker cannot be found at the end, this takes an enormous amount of time:

* 1 MB of zeros at the end: 45.54 seconds
* 2 MB of zeros at the end: 357.04 seconds
(measured on a laptop made in 2015)

This pull request changes the relevant section of the code to run in linear time O(n), leading to a runtime of less than a second for both cases mentioned above. Furthermore, this PR adds a regression test.
  • Loading branch information
dsk7 committed Apr 23, 2022
1 parent 9941099 commit c6c56f5
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 4 deletions.
8 changes: 4 additions & 4 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2082,7 +2082,7 @@ def _pairs(self, array):
def readNextEndLine(self, stream, limit_offset=0):
debug = False
if debug: print(">>readNextEndLine")
line = b_("")
line_parts = []
while True:
# Prevent infinite loops in malformed PDFs
if stream.tell() == 0 or stream.tell() == limit_offset:
Expand All @@ -2109,10 +2109,10 @@ def readNextEndLine(self, stream, limit_offset=0):
break
else:
if debug: print(" x is neither")
line = x + line
if debug: print((" RNEL line:", line))
line_parts.append(x)
if debug: print("leaving RNEL")
return line
line_parts.reverse()
return b"".join(line_parts)

def decrypt(self, password):
"""
Expand Down
22 changes: 22 additions & 0 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import io
import os
import time

import pytest

Expand All @@ -9,6 +10,14 @@
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError
from PyPDF2.filters import _xobj_to_image
from sys import version_info

if version_info < ( 3, 0 ):
from cStringIO import StringIO
StreamIO = StringIO
else:
from io import BytesIO
StreamIO = BytesIO

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
Expand Down Expand Up @@ -462,3 +471,16 @@ def test_get_destination_age_number():
for outline in outlines:
if not isinstance(outline, list):
reader.getDestinationPageNumber(outline)


def test_do_not_get_stuck_on_large_files_without_start_xref():
    """Regression test for a denial-of-service bug in startxref discovery.

    A large stream that contains no startxref marker used to make the library
    hang for minutes to hours. Reading such a stream must raise PdfReadError
    and must do so within the time limit asserted below.
    """
    started_at = time.time()
    zero_filled_stream = StreamIO(b"\0" * (5 * 1000 * 1000))
    with pytest.raises(PdfReadError):
        PdfFileReader(zero_filled_stream)
    elapsed_seconds = time.time() - started_at
    # Parsing is expected to take less than a second on a modern CPU; the
    # generous limit leaves room for busy or slow systems.
    assert elapsed_seconds < 60

0 comments on commit c6c56f5

Please sign in to comment.