ENH: Add incremental capability to PdfWriter (#2811)

Closes #2780.
py-pdf · Sep 11, 2024 · 98d4425 · 98d4425
1 parent b85c171
commit 98d4425
Show file tree

Hide file tree

Showing 13 changed files with 613 additions and 93 deletions.
diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -65,9 +65,7 @@
 from .constants import FieldDictionaryAttributes as FA
 from .constants import PageAttributes as PG
 from .constants import PagesAttributes as PA
-from .errors import (
-    PdfReadError,
-)
+from .errors import PdfReadError, PyPdfError
 from .generic import (
     ArrayObject,
     BooleanObject,
@@ -254,6 +252,8 @@ class PdfDocCommon:
 
     _encryption: Optional[Encryption] = None
 
+    _readonly: bool = False
+
     @property
     @abstractmethod
     def root_object(self) -> DictionaryObject:
@@ -349,7 +349,7 @@ def get_num_pages(self) -> int:
             return self.root_object["/Pages"]["/Count"]  # type: ignore
         else:
             if self.flattened_pages is None:
-                self._flatten()
+                self._flatten(self._readonly)
             assert self.flattened_pages is not None
             return len(self.flattened_pages)
 
@@ -366,10 +366,49 @@ def get_page(self, page_number: int) -> PageObject:
             A :class:`PageObject<pypdf._page.PageObject>` instance.
         """
         if self.flattened_pages is None:
-            self._flatten()
+            self._flatten(self._readonly)
         assert self.flattened_pages is not None, "hint for mypy"
         return self.flattened_pages[page_number]
 
+    def _get_page_in_node(
+        self,
+        page_number: int,
+    ) -> Tuple[DictionaryObject, int]:
+        """
+        Retrieve the node and position within the /Kids containing the page.
+        If page_number is beyond the last page, the top node and -1 are returned.
+        """
+        top = cast(DictionaryObject, self.root_object["/Pages"])
+
+        def recursive_call(
+            node: DictionaryObject, mi: int
+        ) -> Tuple[Optional[PdfObject], int]:
+            ma = cast(int, node.get("/Count", 1))  # default 1 for /Page types
+            if node["/Type"] == "/Page":
+                if page_number == mi:
+                    return node, -1
+                # else
+                return None, mi + 1
+            if (page_number - mi) >= ma:  # not in nodes below
+                if node == top:
+                    return top, -1
+                # else
+                return None, mi + ma
+            for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
+                kid = cast(DictionaryObject, kid.get_object())
+                n, i = recursive_call(kid, mi)
+                if n is not None:  # page has just been found ...
+                    if i < 0:  # ... just below!
+                        return node, idx
+                    # else:  # ... at lower levels
+                    return n, i
+                mi = i
+            raise PyPdfError("Unexpectedly cannot find the node.")
+
+        node, idx = recursive_call(top, 0)
+        assert isinstance(node, DictionaryObject), "mypy"
+        return node, idx
+
     @property
     def named_destinations(self) -> Dict[str, Any]:
         """
@@ -1082,10 +1121,20 @@ def page_mode(self) -> Optional[PagemodeType]:
 
     def _flatten(
         self,
+        list_only: bool = False,
         pages: Union[None, DictionaryObject, PageObject] = None,
         inherit: Optional[Dict[str, Any]] = None,
         indirect_reference: Optional[IndirectObject] = None,
     ) -> None:
+        """
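+        Prepare the document pages to ease searching.
+
+        Args:
+            list_only: Will only list the pages within flattened_pages.
+            pages: Node of the page tree to process; passed on recursive calls.
+            inherit: Inheritable attributes handed down to the recursive calls.
+            indirect_reference: Used recursively to flatten the /Pages object.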
+        """
         inheritable_page_attributes = (
             NameObject(PG.RESOURCES),
             NameObject(PG.MEDIABOX),
@@ -1122,7 +1171,7 @@ def _flatten(
                 if obj:
                     # damaged file may have invalid child in /Pages
                     try:
-                        self._flatten(obj, inherit, **addt)
+                        self._flatten(list_only, obj, inherit, **addt)
                     except RecursionError:
                         raise PdfReadError(
                             "Maximum recursion depth reached during page flattening."
@@ -1134,7 +1183,8 @@ def _flatten(
                 if attr_in not in pages:
                     pages[attr_in] = value
             page_obj = PageObject(self, indirect_reference)
-            page_obj.update(pages)
+            if not list_only:
+                page_obj.update(pages)
 
             # TODO: Could flattened_pages be None at this point?
             self.flattened_pages.append(page_obj)  # type: ignore
@@ -1158,7 +1208,7 @@ def remove_page(
                 or destinations to reference a detached page.
         """
         if self.flattened_pages is None:
-            self._flatten()
+            self._flatten(self._readonly)
         assert self.flattened_pages is not None
         if isinstance(page, IndirectObject):
             p = page.get_object()

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -492,6 +492,22 @@ def __init__(
         self.inline_images: Optional[Dict[str, ImageFile]] = None
         # below Union for mypy but actually Optional[List[str]]
         self.indirect_reference = indirect_reference
+        if indirect_reference is not None:
+            self.update(cast(DictionaryObject, indirect_reference.get_object()))
+
+    def hash_bin(self) -> int:
+        """
+        Used to detect modified objects.
+
+        Note: this method is overridden to return the same result
+        as a DictionaryObject.
+
+        Returns:
+            Hash considering type and value.
+        """
+        return hash(
+            (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items())))
+        )
 
     def hash_value_data(self) -> bytes:
         data = super().hash_value_data()
@@ -2399,27 +2415,33 @@ def __delitem__(self, index: Union[int, slice]) -> None:
             raise IndexError("index out of range")
         ind = self[index].indirect_reference
         assert ind is not None
-        parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
+        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
+            "/Parent", None
+        )
+        first = True
         while parent is not None:
             parent = cast(DictionaryObject, parent.get_object())
             try:
-                i = parent["/Kids"].index(ind)
-                del parent["/Kids"][i]
+                i = cast(ArrayObject, parent["/Kids"]).index(ind)
+                del cast(ArrayObject, parent["/Kids"])[i]
+                first = False
                 try:
                     assert ind is not None
                     del ind.pdf.flattened_pages[index]  # case of page in a Reader
                 except Exception:  # pragma: no cover
                     pass
                 if "/Count" in parent:
-                    parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
-                if len(parent["/Kids"]) == 0:
+                    parent[NameObject("/Count")] = NumberObject(
+                        cast(int, parent["/Count"]) - 1
+                    )
+                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                     # No more objects in this part of this sub tree
                     ind = parent.indirect_reference
-                    parent = cast(DictionaryObject, parent.get("/Parent", None))
-                else:
-                    parent = None
+                parent = parent.get("/Parent", None)
             except ValueError:  # from index
-                raise PdfReadError(f"Page Not Found in Page Tree {ind}")
+                if first:
+                    raise PdfReadError(f"Page not found in page tree: {ind}")
+                break
 
     def __iter__(self) -> Iterator[PageObject]:
         for i in range(len(self)):

diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
@@ -74,6 +74,9 @@ class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
     _objects: List[Any]
     _id_translated: Dict[int, Dict[int, int]]
 
+    incremental: bool
+    _reader: Any  # PdfReader
+
     @abstractmethod
     def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
         ...  # pragma: no cover

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -136,6 +136,7 @@ def __init__(
             with open(stream, "rb") as fh:
                 stream = BytesIO(fh.read())
             self._stream_opened = True
+        self._startxref: int = 0
         self.read(stream)
         self.stream = stream
 
@@ -563,6 +564,7 @@ def read(self, stream: StreamType) -> None:
         self._basic_validation(stream)
         self._find_eof_marker(stream)
         startxref = self._find_startxref_pos(stream)
+        self._startxref = startxref
 
         # check and eventually correct the startxref only in not strict
         xref_issue_nr = self._get_xref_issues(stream, startxref)