Skip to content

Commit e1df4b2

Browse files
committed
fix: 处理PDF中出现 \0 字符报 Null characters are not allowed
--bug=1048190 --user=刘瑞斌 【知识库】- 上传PDF文档 报错 ,关联issue #1468 https://www.tapd.cn/57709429/s/1611070
1 parent 4dd497e commit e1df4b2

File tree

1 file changed

+10
-0
lines changed

1 file changed

+10
-0
lines changed

apps/common/handle/impl/pdf_split_handle.py

+10
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,9 @@ def handle_pdf_content(file, pdf_document):
104 104

105 105
content += page_content
106 106

107+
# Null characters are not allowed.
108+
content = content.replace('\0', '')
109+
107 110
elapsed_time = time.time() - start_time
108 111
max_kb.debug(
109 112
f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}")
@@ -156,6 +159,10 @@ def handle_toc(doc, limit):
156 159
text = text[:idx]
157 160

158 161
chapter_text += text # 提取文本
162+
163+
# Null characters are not allowed.
164+
chapter_text = chapter_text.replace('\0', '')
165+
159 166
# 限制章节内容长度
160 167
if 0 < limit < len(chapter_text):
161 168
split_text = PdfSplitHandle.split_text(chapter_text, limit)
@@ -228,6 +235,9 @@ def handle_links(doc, pattern_list, with_filter, limit):
228 235
text = text[:idx]
229 236
chapter_text += text
230 237

238+
# Null characters are not allowed.
239+
chapter_text = chapter_text.replace('\0', '')
240+
231 241
# 限制章节内容长度
232 242
if 0 < limit < len(chapter_text):
233 243
split_text = PdfSplitHandle.split_text(chapter_text, limit)

0 commit comments

Comments
 (0)