From 636d9007f572a7a86e26439a5499622bc8ed075f Mon Sep 17 00:00:00 2001 From: CaptainB <bin@fit2cloud.com> Date: Mon, 18 Nov 2024 12:42:42 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E5=A4=84=E7=90=86PDF=E4=B8=AD=E5=87=BA?= =?UTF-8?q?=E7=8E=B0=20\0=20=E5=AD=97=E7=AC=A6=E6=8A=A5=20Null=20character?= =?UTF-8?q?s=20are=20not=20allowed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --bug=1048190 --user=刘瑞斌 【知识库】- 上传PDF文档 报错 ,关联issue #1468 https://www.tapd.cn/57709429/s/1611070 --- apps/common/handle/impl/pdf_split_handle.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index 828196b7bab..b759c6d6a1c 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -104,6 +104,9 @@ def handle_pdf_content(file, pdf_document): content += page_content + # Null characters are not allowed. + content = content.replace('\0', '') + elapsed_time = time.time() - start_time max_kb.debug( f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}") @@ -156,6 +159,10 @@ def handle_toc(doc, limit): text = text[:idx] chapter_text += text # 提取文本 + + # Null characters are not allowed. + chapter_text = chapter_text.replace('\0', '') + # 限制章节内容长度 if 0 < limit < len(chapter_text): split_text = PdfSplitHandle.split_text(chapter_text, limit) @@ -228,6 +235,9 @@ def handle_links(doc, pattern_list, with_filter, limit): text = text[:idx] chapter_text += text + # Null characters are not allowed. + chapter_text = chapter_text.replace('\0', '') + # 限制章节内容长度 if 0 < limit < len(chapter_text): split_text = PdfSplitHandle.split_text(chapter_text, limit)