-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvoter_participation_parser.py
68 lines (55 loc) · 2.12 KB
/
voter_participation_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""Extracts voter participation info from a City of Portland-provided PDF into a CSV format"""
import logging
import re
import pandas as pd
import pypdfium2 as pdfium
# When True, read_voters_pages() stops after the first parsed page (quick trial run).
TESTMODE = False
# Number of consecutive text lines that make up one voter record on a page.
LINESPERVOTER = 3
# Number of lines at the top of each page's text that precede the first voter record.
HEADERLINES = 6
# Number of lines at the bottom of each page's text after the last voter record
# (the last line is expected to carry the "Page N of ..." footer).
FOOTERLINES = 1
def voter(lines: list) -> list:
    """Turn one voter's group of PDF text lines into a flat row.

    Expected layout of ``lines``:
      * ``lines[0]`` - "<ward-precinct> <voter id> <name> <address> <STATUS>",
        where the address is taken to start at the first space-preceded digit
        after the voter id and the status is a trailing run of capital letters.
      * ``lines[1]`` - the participation history, kept verbatim.
      * ``lines[2]`` - exactly two space-separated tokens: party and ballot type.

    Returns:
        ``[ward, voterid, party, votername, history, address, status, ballot]``

    Raises:
        IndexError: if the first line does not match the expected layout or
            fewer than three lines are supplied.
        ValueError: if the third line does not split into exactly two tokens.
    """
    identity_matches = re.findall(
        r"^(\d+-\d+) (\d+) (.*?) (\d.*?) ([A-Z]+)$", lines[0]
    )
    # No match leaves the list empty, so indexing it raises IndexError.
    first_match = identity_matches[0]
    ward, voterid, votername, address, status = first_match

    history = lines[1]

    party_and_ballot = lines[2].split(" ")
    party, ballot = party_and_ballot

    return [
        ward,
        voterid,
        party,
        votername,
        history,
        address,
        status,
        ballot,
    ]
def voters(pagetext: str) -> list:
    """Parse every voter record found in the text of a single PDF page.

    The page text is split into lines; the first ``HEADERLINES`` lines and the
    last ``FOOTERLINES`` lines are skipped, and the lines in between are
    consumed in consecutive groups of ``LINESPERVOTER`` lines, each of which is
    handed to ``voter()``.

    Args:
        pagetext: Full extracted text of one page. Its last line must contain
            "Page <number> of".

    Returns:
        A list with one row (as returned by ``voter()``) per complete voter
        group on the page, in page order.

    Raises:
        IndexError: if the page has no text or its last line carries no
            "Page <number> of" footer.
    """
    lines = pagetext.splitlines()
    pagenumber = int(re.findall(r"Page (\d+) of", lines[-1])[0])

    # Clamp at zero so a page shorter than header + footer reports 0 voters
    # instead of a negative count.
    body_lines = max(len(lines) - HEADERLINES - FOOTERLINES, 0)
    num_voters, leftover = divmod(body_lines, LINESPERVOTER)
    logging.info("Found %s voters on page %s", num_voters, pagenumber)
    if leftover:
        # Lines that do not form a complete group are never mixed with the
        # footer to build a bogus record; they are reported and skipped.
        logging.warning(
            "Page %s has %s trailing line(s) that do not form a complete voter record",
            pagenumber,
            leftover,
        )

    page = []
    # Drive the loop from num_voters so the records parsed always agree with
    # the count logged above, whatever LINESPERVOTER is set to.
    for index in range(num_voters):
        start = HEADERLINES + index * LINESPERVOTER
        logging.info("Iterating over voter %s", index + 1)
        page.append(voter(lines[start : start + LINESPERVOTER]))
    return page
def read_voters_pages(path: str = "./Voter Participation History.pdf") -> list:
    """Read the PDF page by page and collect the voter rows from each page.

    The first page (index 0) is skipped as the title page. When ``TESTMODE``
    is true, only the first non-title page is processed.

    Args:
        path: Location of the PDF to parse. Defaults to the file name the
            script has always read from the current directory.

    Returns:
        A flat list of voter rows (as produced by ``voters()``) across all
        processed pages, in document order.
    """
    all_voters = []
    pdf = pdfium.PdfDocument(path)
    try:
        page_count = len(pdf)
        logging.info("Found %s pages in PDF", page_count)
        for n in range(1, page_count):  # skip title page
            all_voters.extend(voters(pdf[n].get_textpage().get_text_range()))
            if TESTMODE:
                break
    finally:
        # Release the document handle even if a page fails to parse.
        pdf.close()
    return all_voters
def main():
    """Extract voter participation info from the City of Portland PDF into a CSV file.

    Reads all voter rows via ``read_voters_pages()``, labels the columns, and
    writes ``./Voter Participation History.csv`` (UTF-8) with
    "Voter Record #" as the first column.
    """
    columns = [
        "Ward/Precinct",
        "Voter Record #",
        "Party",
        "Voter Name",
        "History",
        "Residence Address",
        "Status",
        "Ballot Type",
    ]
    df = pd.DataFrame(data=read_voters_pages(), columns=columns)
    df = df.set_index("Voter Record #")
    # The voter record number now lives in the index, so the index must be
    # written out; passing index=False here would silently drop that column
    # from the CSV.
    df.to_csv(r"./Voter Participation History.csv", encoding="utf-8")
# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()