MAINT: Quadratic runtime while parsing reduced to linear (#808)

When the PdfFileReader tries to find the xref marker, the readNextEndLine method builds a so-called line by reading the stream byte by byte. Every time a new byte is read, it is concatenated with the line read so far. This leads to quadratic runtime, O(n²), where n is the size of the file, because Python strings (including byte strings) are immutable and have to be copied on every concatenation. For files where the xref marker cannot be found at the end, this takes an enormous amount of time:
* 1 MB of zeros at the end: 45.54 seconds
* 2 MB of zeros at the end: 357.04 seconds
(measured on a laptop made in 2015)
This pull request changes the relevant section of the code to run in linear time, O(n), leading to a run time of less than a second for both cases mentioned above. Furthermore, this PR adds a regression test.
py-pdf · Apr 23, 2022 · c6c56f5 · c6c56f5
1 parent 9941099
commit c6c56f5
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 4 deletions.
diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -2082,7 +2082,7 @@ def _pairs(self, array):
     def readNextEndLine(self, stream, limit_offset=0):
         debug = False
         if debug: print(">>readNextEndLine")
-        line = b_("")
+        line_parts = []
         while True:
             # Prevent infinite loops in malformed PDFs
             if stream.tell() == 0 or stream.tell() == limit_offset:
@@ -2109,10 +2109,10 @@ def readNextEndLine(self, stream, limit_offset=0):
                 break
             else:
                 if debug: print("  x is neither")
-                line = x + line
-                if debug: print(("  RNEL line:", line))
+                line_parts.append(x)
         if debug: print("leaving RNEL")
-        return line
+        line_parts.reverse()
+        return b"".join(line_parts)
 
     def decrypt(self, password):
         """

diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -1,5 +1,6 @@
 import io
 import os
+import time
 
 import pytest
 
@@ -9,6 +10,14 @@
 from PyPDF2.constants import Ressources as RES
 from PyPDF2.errors import PdfReadError
 from PyPDF2.filters import _xobj_to_image
+from sys import version_info
+
+if version_info < ( 3, 0 ):
+    from cStringIO import StringIO
+    StreamIO = StringIO
+else:
+    from io import BytesIO
+    StreamIO = BytesIO
 
 TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
 PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
@@ -462,3 +471,16 @@ def test_get_destination_age_number():
     for outline in outlines:
         if not isinstance(outline, list):
             reader.getDestinationPageNumber(outline)
+
+
+def test_do_not_get_stuck_on_large_files_without_start_xref():
+    """Tests for the absence of a DoS bug, where a large file without an startxref mark
+    would cause the library to hang for minutes to hours """
+    start_time = time.time()
+    broken_stream = StreamIO(b"\0" * 5 * 1000 * 1000)
+    with pytest.raises(PdfReadError):
+        PdfFileReader(broken_stream)
+    parse_duration = time.time() - start_time
+    # parsing is expected take less than a second on a modern cpu, but include a large
+    # tolerance to account for busy or slow systems
+    assert parse_duration < 60