Skip to content

Update table.py #4516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 45 additions & 9 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1332,9 +1332,35 @@ def row_count(self) -> int: # PyMuPDF extension
def col_count(self) -> int:  # PyMuPDF extension
    """Return the number of columns of the table.

    The value is the largest ``len(row.cells)`` over all rows in
    ``self.rows``, so a row with fewer cells does not shrink the count.
    A table without any rows reports 0 columns instead of letting
    ``max()`` raise ``ValueError`` on an empty sequence.
    """
    # Generator + default: no throwaway list, and safe for empty tables.
    return max((len(r.cells) for r in self.rows), default=0)

def extract(self, **kwargs) -> list:
def extract(
self,
*,
footnote: str = "none",
**kwargs,
) -> list | tuple[list, str | None]:
"""
Extract the table’s text content.

Parameters
----------
footnote : {"none", "last_single_cell"}, optional
• "none" (default) – return the table exactly as on page.
• "last_single_cell" – if the final physical row contains exactly
one non-empty cell, treat that row as a foot-note: remove it from
the table and return its text alongside the table data.

Other **kwargs are forwarded to `extract_text()`.

Returns
-------
list
When *footnote="none"* – the table content as a list of rows.
(list, str | None)
When *footnote="last_single_cell"* – the table content **and**
the extracted foot-note text (or *None* if no foot-note found).
"""
chars = CHARS
table_arr = []
table_arr: list[list[str | None]] = []

def char_in_bbox(char, bbox) -> bool:
v_mid = (char["top"] + char["bottom"]) / 2
Expand All @@ -1344,19 +1370,19 @@ def char_in_bbox(char, bbox) -> bool:
(h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
)

# -----------------------------------------
# Build raw rows × columns string matrix
# -----------------------------------------
for row in self.rows:
arr = []
row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]
row_chars = [c for c in chars if char_in_bbox(c, row.bbox)]

for cell in row.cells:
if cell is None:
cell_text = None
else:
cell_chars = [
char for char in row_chars if char_in_bbox(char, cell)
]

if len(cell_chars):
cell_chars = [c for c in row_chars if char_in_bbox(c, cell)]
if cell_chars:
kwargs["x_shift"] = cell[0]
kwargs["y_shift"] = cell[1]
if "layout" in kwargs:
Expand All @@ -1368,7 +1394,17 @@ def char_in_bbox(char, bbox) -> bool:
arr.append(cell_text)
table_arr.append(arr)

return table_arr
# -----------------------------------------
# Optional foot-note post-processing
# -----------------------------------------
footnote_txt: str | None = None
if footnote == "last_single_cell" and table_arr:
non_empty = [c for c in table_arr[-1] if c and str(c).strip()]
if len(non_empty) == 1:
footnote_txt = non_empty[0]
table_arr = table_arr[:-1]

return (table_arr, footnote_txt) if footnote != "none" else table_arr

def to_markdown(self, clean=False, fill_empty=True):
"""Output table content as a string in Github-markdown format.
Expand Down
Loading