From 636d9007f572a7a86e26439a5499622bc8ed075f Mon Sep 17 00:00:00 2001
From: CaptainB <bin@fit2cloud.com>
Date: Mon, 18 Nov 2024 12:42:42 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E5=A4=84=E7=90=86PDF=E4=B8=AD=E5=87=BA?=
 =?UTF-8?q?=E7=8E=B0=20\0=20=E5=AD=97=E7=AC=A6=E6=8A=A5=20Null=20character?=
 =?UTF-8?q?s=20are=20not=20allowed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

--bug=1048190 --user=刘瑞斌 【知识库】- 上传PDF文档 报错  ,关联issue #1468 https://www.tapd.cn/57709429/s/1611070
---
 apps/common/handle/impl/pdf_split_handle.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
index 828196b7bab..b759c6d6a1c 100644
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@@ -104,6 +104,9 @@ def handle_pdf_content(file, pdf_document):
 
             content += page_content
 
+            # Strip NUL ('\0') characters from the extracted text; they otherwise cause the "Null characters are not allowed" error.
+            content = content.replace('\0', '')
+
             elapsed_time = time.time() - start_time
             max_kb.debug(
                 f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s,   content-length: {len(page_content)}")
@@ -156,6 +159,10 @@ def handle_toc(doc, limit):
                         text = text[:idx]
 
                 chapter_text += text  # 提取文本
+
+            # Strip NUL ('\0') characters from the chapter text; they otherwise cause the "Null characters are not allowed" error.
+            chapter_text = chapter_text.replace('\0', '')
+
             # 限制章节内容长度
             if 0 < limit < len(chapter_text):
                 split_text = PdfSplitHandle.split_text(chapter_text, limit)
@@ -228,6 +235,9 @@ def handle_links(doc, pattern_list, with_filter, limit):
                                 text = text[:idx]
                         chapter_text += text
 
+                    # Strip NUL ('\0') characters from the chapter text; they otherwise cause the "Null characters are not allowed" error.
+                    chapter_text = chapter_text.replace('\0', '')
+
                     # 限制章节内容长度
                     if 0 < limit < len(chapter_text):
                         split_text = PdfSplitHandle.split_text(chapter_text, limit)