forked from DS4SD/docling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcustom_convert.py
130 lines (101 loc) · 4.34 KB
/
custom_convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import json
import logging
import time
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/2206.01062.pdf")
###########################################################################
# The following sections contain a combination of PipelineOptions
# and PDF Backends for various configurations.
# Uncomment one section at the time to see the differences in the output.
# PyPdfium without EasyOCR
# --------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = False
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=PyPdfiumDocumentBackend,
# )
# PyPdfium with EasyOCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=PyPdfiumDocumentBackend,
# )
# Docling Parse without EasyOCR
# -------------------------
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
# Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractOcrOptions()
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract CLI
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractCliOcrOptions()
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
###########################################################################
start_time = time.time()
conv_result = doc_converter.convert(input_doc_path)
end_time = time.time() - start_time
_log.info(f"Document converted in {end_time:.2f} seconds.")
## Export results
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
fp.write(json.dumps(conv_result.document.export_to_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_result.document.export_to_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_result.document.export_to_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
fp.write(conv_result.document.export_to_document_tokens())
if __name__ == "__main__":
main()