-
Notifications
You must be signed in to change notification settings - Fork 1
/
PDF Parser.py
37 lines (31 loc) · 1.09 KB
/
PDF Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os
import pandas as pd
import pdfplumber
def parse_pdf(file_path):
"""Parse fields from PDF"""
with pdfplumber.open(file_path) as pdf:
first_page = pdf.pages[0]
text = first_page.extract_text()
# Use regex or string parsing to extract required fields from text
parsed_data = {
'field1': extract_field_1(text),
'Field2': extract_field_2(text),
# Continue for other fields
}
return parsed_data
def extract_field_1(text):
# Custom logic to extract Field 1
pass
def extract_field_2(text):
# Custom logic to extract Field 2
pass
def scan_folder_and_save_to_excel(folder_path, excel_path):
parsed_results = []
for filename in os.listdir(folder_path):
if filename.endswith('.pdf'):
if_path = os.path.join(folder_path, filename)
parsed_results.append(parse_pdf(file_path))
df = pd.DataFrame(parsed_results)
df.to_excel(excel_path, index=False)
# Usage example
scan_folder_and_save_to_excel('path/to/pdf/folder', 'output.xlsx')