From 0bf91088560da3bda5498c71fc81d5d114b4090d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=87=83=E5=A4=8F?= Date: Mon, 15 Jul 2024 14:03:33 +0800 Subject: [PATCH] fix pdf reader --- src/pai_rag/integrations/readers/pai_pdf_reader.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/pai_rag/integrations/readers/pai_pdf_reader.py b/src/pai_rag/integrations/readers/pai_pdf_reader.py index 5f9eccc9..cc073067 100644 --- a/src/pai_rag/integrations/readers/pai_pdf_reader.py +++ b/src/pai_rag/integrations/readers/pai_pdf_reader.py @@ -139,8 +139,10 @@ def merge_page_tables(total_tables: List[PageItem]) -> List[PageItem]: table = total_tables[i] pre_table = total_tables[i - 1] if table["page_number"] == pre_table["page_number"]: + i -= 1 continue if table["page_number"] - pre_table["page_number"] > 1: + i -= 1 continue if ( table["index_id"] <= 1 @@ -196,7 +198,11 @@ def table_to_json(table: List[List]) -> str: for row in range(1, len(table)): single_line_dict = {} for column in range(len(column_name)): - if column_name[column] and len(column_name[column]) > 0: + if ( + column_name[column] + and len(column_name[column]) > 0 + and column < len(table[row]) + ): single_line_dict[column_name[column]] = table[row][column] table_info.append(single_line_dict)