Skip to content

Commit

Permalink
ENH: Add incremental capability to PdfWriter (#2811)
Browse files Browse the repository at this point in the history
Closes #2780.
  • Loading branch information
pubpub-zz authored Sep 11, 2024
1 parent b85c171 commit 98d4425
Show file tree
Hide file tree
Showing 13 changed files with 613 additions and 93 deletions.
66 changes: 58 additions & 8 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,7 @@
from .constants import FieldDictionaryAttributes as FA
from .constants import PageAttributes as PG
from .constants import PagesAttributes as PA
from .errors import (
PdfReadError,
)
from .errors import PdfReadError, PyPdfError
from .generic import (
ArrayObject,
BooleanObject,
Expand Down Expand Up @@ -254,6 +252,8 @@ class PdfDocCommon:

_encryption: Optional[Encryption] = None

_readonly: bool = False

@property
@abstractmethod
def root_object(self) -> DictionaryObject:
Expand Down Expand Up @@ -349,7 +349,7 @@ def get_num_pages(self) -> int:
return self.root_object["/Pages"]["/Count"] # type: ignore
else:
if self.flattened_pages is None:
self._flatten()
self._flatten(self._readonly)
assert self.flattened_pages is not None
return len(self.flattened_pages)

Expand All @@ -366,10 +366,49 @@ def get_page(self, page_number: int) -> PageObject:
A :class:`PageObject<pypdf._page.PageObject>` instance.
"""
if self.flattened_pages is None:
self._flatten()
self._flatten(self._readonly)
assert self.flattened_pages is not None, "hint for mypy"
return self.flattened_pages[page_number]

def _get_page_in_node(
    self,
    page_number: int,
) -> Tuple[DictionaryObject, int]:
    """
    Find the page-tree node whose /Kids array directly contains a page.

    Args:
        page_number: Zero-based index of the page to locate.

    Returns:
        A tuple ``(node, index)`` such that ``node["/Kids"][index]`` is the
        requested page. If ``page_number`` is greater than or equal to the
        /Count of the top /Pages node, ``(top node, -1)`` is returned.

    Raises:
        PyPdfError: If a node claims (through /Count) to contain the page
            but none of its /Kids yields it.
    """
    top = cast(DictionaryObject, self.root_object["/Pages"])

    def search(
        current: DictionaryObject, start: int
    ) -> Tuple[Optional[PdfObject], int]:
        # Result is either (found_node, position) when the page has been
        # located, or (None, next_start) where next_start is the index of
        # the first page located after ``current``.
        span = cast(int, current.get("/Count", 1))  # a leaf /Page spans 1 page
        if current["/Type"] == "/Page":
            # Leaf: -1 tells the caller that *it* is the direct parent.
            return (current, -1) if page_number == start else (None, start + 1)
        if (page_number - start) >= span:
            # The requested page lies beyond this subtree.
            return (top, -1) if current == top else (None, start + span)
        children = cast(ArrayObject, current["/Kids"])
        for position, child_ref in enumerate(children):
            child = cast(DictionaryObject, child_ref.get_object())
            found, value = search(child, start)
            if found is None:
                # Not below this child: continue from the next page index.
                start = value
                continue
            if value < 0:
                # The child itself is the page: ``current`` is its parent.
                return current, position
            # Located deeper in the tree: pass the result up unchanged.
            return found, value
        raise PyPdfError("Unexpectedly cannot find the node.")

    node, idx = search(top, 0)
    assert isinstance(node, DictionaryObject), "mypy"
    return node, idx

@property
def named_destinations(self) -> Dict[str, Any]:
"""
Expand Down Expand Up @@ -1082,10 +1121,20 @@ def page_mode(self) -> Optional[PagemodeType]:

def _flatten(
self,
list_only: bool = False,
pages: Union[None, DictionaryObject, PageObject] = None,
inherit: Optional[Dict[str, Any]] = None,
indirect_reference: Optional[IndirectObject] = None,
) -> None:
"""
Prepare the document pages to ease searching
Args:
list_only: Will only list the pages within _flatten_pages.
pages:
inherit:
indirect_reference: Used recursively to flatten the /Pages object.
"""
inheritable_page_attributes = (
NameObject(PG.RESOURCES),
NameObject(PG.MEDIABOX),
Expand Down Expand Up @@ -1122,7 +1171,7 @@ def _flatten(
if obj:
# damaged file may have invalid child in /Pages
try:
self._flatten(obj, inherit, **addt)
self._flatten(list_only, obj, inherit, **addt)
except RecursionError:
raise PdfReadError(
"Maximum recursion depth reached during page flattening."
Expand All @@ -1134,7 +1183,8 @@ def _flatten(
if attr_in not in pages:
pages[attr_in] = value
page_obj = PageObject(self, indirect_reference)
page_obj.update(pages)
if not list_only:
page_obj.update(pages)

# TODO: Could flattened_pages be None at this point?
self.flattened_pages.append(page_obj) # type: ignore
Expand All @@ -1158,7 +1208,7 @@ def remove_page(
or destinations to reference a detached page.
"""
if self.flattened_pages is None:
self._flatten()
self._flatten(self._readonly)
assert self.flattened_pages is not None
if isinstance(page, IndirectObject):
p = page.get_object()
Expand Down
40 changes: 31 additions & 9 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,22 @@ def __init__(
self.inline_images: Optional[Dict[str, ImageFile]] = None
# below Union for mypy but actually Optional[List[str]]
self.indirect_reference = indirect_reference
if indirect_reference is not None:
self.update(cast(DictionaryObject, indirect_reference.get_object()))

def hash_bin(self) -> int:
    """
    Compute a hash used to detect whether the object has been modified.

    Note: this method is overridden so that the type tag in the hash is
    DictionaryObject; the intent is to return the same result as a
    DictionaryObject holding the same items.

    Returns:
        Hash considering type and value.
    """
    # Pair every key with the hash of its value, in the order of items().
    entries = tuple([(key, value.hash_bin()) for key, value in self.items()])
    return hash((DictionaryObject, entries))

def hash_value_data(self) -> bytes:
data = super().hash_value_data()
Expand Down Expand Up @@ -2399,27 +2415,33 @@ def __delitem__(self, index: Union[int, slice]) -> None:
raise IndexError("index out of range")
ind = self[index].indirect_reference
assert ind is not None
parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
"/Parent", None
)
first = True
while parent is not None:
parent = cast(DictionaryObject, parent.get_object())
try:
i = parent["/Kids"].index(ind)
del parent["/Kids"][i]
i = cast(ArrayObject, parent["/Kids"]).index(ind)
del cast(ArrayObject, parent["/Kids"])[i]
first = False
try:
assert ind is not None
del ind.pdf.flattened_pages[index] # case of page in a Reader
except Exception: # pragma: no cover
pass
if "/Count" in parent:
parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
if len(parent["/Kids"]) == 0:
parent[NameObject("/Count")] = NumberObject(
cast(int, parent["/Count"]) - 1
)
if len(cast(ArrayObject, parent["/Kids"])) == 0:
# No more objects in this part of this sub tree
ind = parent.indirect_reference
parent = cast(DictionaryObject, parent.get("/Parent", None))
else:
parent = None
parent = parent.get("/Parent", None)
except ValueError: # from index
raise PdfReadError(f"Page Not Found in Page Tree {ind}")
if first:
raise PdfReadError(f"Page not found in page tree: {ind}")
break

def __iter__(self) -> Iterator[PageObject]:
for i in range(len(self)):
Expand Down
3 changes: 3 additions & 0 deletions pypdf/_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
_objects: List[Any]
_id_translated: Dict[int, Dict[int, int]]

incremental: bool
_reader: Any # PdfReader

@abstractmethod
def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
... # pragma: no cover
Expand Down
2 changes: 2 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def __init__(
with open(stream, "rb") as fh:
stream = BytesIO(fh.read())
self._stream_opened = True
self._startxref: int = 0
self.read(stream)
self.stream = stream

Expand Down Expand Up @@ -563,6 +564,7 @@ def read(self, stream: StreamType) -> None:
self._basic_validation(stream)
self._find_eof_marker(stream)
startxref = self._find_startxref_pos(stream)
self._startxref = startxref

# check and eventually correct the startxref only in not strict
xref_issue_nr = self._get_xref_issues(stream, startxref)
Expand Down
Loading

0 comments on commit 98d4425

Please sign in to comment.