-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproofreading1.py
79 lines (67 loc) · 2.93 KB
/
proofreading1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""一般校对用例
输入切分好的JSON文件、上下文文件【可选】、参考文件【可选】;
输出校对后的JSON文件
"""
import os
import json
import asyncio
from src.proofreader import process_paragraphs_async
# Directory containing the input/output files (relative to the project root).
ROOT_DIR = "example"
# File names to process (without the `.md` suffix); uncomment entries as needed.
file_names = [
    'your_markdown',
    # '1.21 先秦诗.clean',
    # '1.21 汉魏晋六朝(上).clean',
    # '1.21 汉魏晋六朝(下册).clean',
    # '1.21 唐诗上册.clean',
    # '1.21 唐诗中册.clean',
    # '1.21 唐诗下册.clean',
    # '1.21 宋诗.clean',
    # '1.21 宋词上(未转曲).clean',
    # '1.21 宋词中.clean',
    # '1.21 宋词下.clean',
    # '1.21 题画诗.clean',
    # '1.21 元散曲.clean',
    # '1.21 元杂剧.clean',
]
for file_name in file_names:
    # Pre-split JSON file of paragraphs (required input).
    FILE_IN_JSON = f"{ROOT_DIR}/{file_name}.json"
    # Reference file (optional).
    REFERENCE_JSON = f"{ROOT_DIR}/{file_name}.reference.json"
    # Context file (optional).
    CONTEXT_JSON = f"{ROOT_DIR}/{file_name}.context.json"
    # Proofread JSON file that will be generated.
    FILE_PROOFREAD_JSON = f"{ROOT_DIR}/{file_name}.proofread.json"
    # Ensure the input file exists before doing any work.
    if not os.path.exists(FILE_IN_JSON):
        print(f"错误:输入文件 {FILE_IN_JSON} 不存在")
        # raise SystemExit instead of exit(): the exit() helper comes from the
        # `site` module and is not guaranteed in every execution context.
        raise SystemExit(1)
    # Ensure the output directory exists. dirname() returns "" when the path
    # has no directory part, and os.makedirs("") would raise FileNotFoundError.
    out_dir = os.path.dirname(FILE_PROOFREAD_JSON)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Run the async proofreading pipeline for this file.
    try:
        asyncio.run(process_paragraphs_async(
            FILE_IN_JSON,
            FILE_PROOFREAD_JSON,
            start_count=1,
            model="deepseek-chat",
            rpm=15,
            max_concurrent=3,
            context_json=CONTEXT_JSON,
            reference_json=REFERENCE_JSON,
        ))
    except Exception as e:
        print(f"处理文本时出错: {str(e)}")
        raise SystemExit(1)
    # Report progress statistics: entries that are None in the output are
    # considered unprocessed.
    try:
        with open(FILE_IN_JSON, "r", encoding="utf-8") as f:
            input_paragraphs = json.load(f)
        with open(FILE_PROOFREAD_JSON, "r", encoding="utf-8") as f:
            output_paragraphs = json.load(f)
        processed_count = sum(1 for p in output_paragraphs if p is not None)
        total_count = len(input_paragraphs)
        processed_length = sum(len(p) for p in output_paragraphs if p is not None)
        total_length = sum(len(p) for p in input_paragraphs)
        print(f"\n【{file_name}】处理进度统计:")
        print(f"总段落数: {total_count}")
        # Guard the percentage lines so an empty input does not turn the whole
        # stats section into a ZeroDivisionError message.
        if total_count and total_length:
            print(f"已处理段落数、字数: {processed_count} ({processed_count/total_count*100:.2f}%), {processed_length} ({processed_length/total_length*100:.2f}%)")
            print(f"未处理段落数: {total_count - processed_count} ({(total_count-processed_count)/total_count*100:.2f}%)")
        # zip() avoids an IndexError when the output file has fewer entries
        # than the input (e.g. after an interrupted run).
        for i, (paragraph, result) in enumerate(zip(input_paragraphs, output_paragraphs)):
            if result is None:
                # Show the first 20 chars of the first line as a preview.
                print(f"No.{i+1} \n {paragraph.strip().splitlines()[0][:20]}...\n")
    except Exception as e:
        print(f"统计处理进度时出错: {str(e)}")