"""
Analyze raw words from inaugural addresses text files.
Usage::
pip install joblib pandas wordcloud matplotlib nltk
python analyze_raw_words.py
"""
import itertools
import logging
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from joblib import Parallel, delayed
from nltk.probability import FreqDist
from wordcloud import WordCloud
DATASET_BASE_PATH = Path("./data")
DATASET_RAW_PATH = DATASET_BASE_PATH / "raw"
DATASET_INTERIM_PATH = DATASET_BASE_PATH / "interim"
DATASET_INAUGURAL_RAW_PATH = DATASET_RAW_PATH / "inaugural-addresses"

# Column order must match the tuple returned by process_inaugural_word.
METADATA_HEADERS = ("word", "frequency", "has_special_chars", "has_nums", "has_contraction")

PARALLEL_NUM_JOBS = -1  # -1 asks joblib to use all available CPU cores.
PARALLEL_VERBOSITY_LEVEL = 10

LOGGING_LEVEL = logging.INFO
LOGGING_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
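
# Expected on-disk layout, inferred from the path constants above (the input
# file names are illustrative; any *.txt files under the raw directory work):
#
#   data/
#       raw/inaugural-addresses/
#           1789-washington.txt
#           ...
#       interim/    <- wordcloud PNG and metadata CSV are written here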


def _configure_logging():
    """Configure basic logging once per process (joblib workers included)."""
    if not logging.getLogger().handlers:
        logging.basicConfig(level=LOGGING_LEVEL, format=LOGGING_FORMAT)


def generate_wordcloud(freq_dist=None):
    """Generate and save the inaugural addresses word cloud."""
    logging.info("Generating inaugural addresses wordcloud ...")
    wordcloud = WordCloud(width=1600, height=800, background_color="white")
    wordcloud = wordcloud.generate_from_frequencies(freq_dist)
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title("Inaugural Addresses - Raw Word Cloud")
    wordcloud_path = DATASET_INTERIM_PATH / "inaugural_address_raw_words_wordcloud.png"
    logging.info(f"Saving inaugural addresses wordcloud at {wordcloud_path}")
    plt.savefig(wordcloud_path, dpi=300)
    plt.close()  # Free the figure so repeated calls do not accumulate memory.
    logging.info("Generating inaugural addresses wordcloud finished.")


def process_inaugural_word(word_freq=None):
    """Enrich an inaugural address (word, frequency) pair with extra metadata."""
    _configure_logging()
    word, frequency = word_freq
    has_special_chars = bool(re.search(r"[^a-zA-Z0-9\s]", word))
    has_contraction = "'" in word
    has_nums = bool(re.search(r"\d", word))
    return word, frequency, has_special_chars, has_nums, has_contraction
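
# Illustration with a hypothetical input pair; the output tuple order matches
# METADATA_HEADERS:
#   process_inaugural_word(("don't", 3)) -> ("don't", 3, True, False, True)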


def process_inaugural_file(inaugural_address_file=None):
    """Read an inaugural address file and return its whitespace-split words."""
    _configure_logging()
    logging.info(f"Processing {inaugural_address_file.name} inaugural address file ...")
    # Assumes the source files are UTF-8 encoded.
    with open(inaugural_address_file, "r", encoding="utf-8") as text_lines:
        words = []
        for text_line in text_lines:
            words.extend(text_line.strip().split())
    return words
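
# Because the split is purely on whitespace, punctuation stays attached to the
# raw words. A hypothetical file containing "Fellow citizens: I greet you."
# would yield ["Fellow", "citizens:", "I", "greet", "you."].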


def glob_inaugural_files(raw_dir=None):
    """Search and lazily yield the *.txt inaugural address files."""
    raw_dir_path = Path(raw_dir).expanduser().resolve()
    yield from raw_dir_path.glob("*.txt")


# Guard the pipeline entry point so joblib worker processes can re-import this
# module without re-running it.
if __name__ == "__main__":
    _configure_logging()
    logging.info("Start.")

    logging.info("Processing inaugural addresses files ...")
    files = glob_inaugural_files(raw_dir=DATASET_INAUGURAL_RAW_PATH)
    file_words = Parallel(n_jobs=PARALLEL_NUM_JOBS, verbose=PARALLEL_VERBOSITY_LEVEL)(
        delayed(process_inaugural_file)(file) for file in files
    )
    # Flatten the per-file word lists into a single stream of words.
    file_words = itertools.chain.from_iterable(file_words)
    logging.info("Processing inaugural addresses files finished.")

    logging.info("Generating inaugural addresses word frequency distribution ...")
    freq_dist = FreqDist(file_words)
    logging.info("Generating inaugural addresses word frequency distribution finished.")

    # Make sure the output directory exists before anything is saved there.
    DATASET_INTERIM_PATH.mkdir(parents=True, exist_ok=True)
    generate_wordcloud(freq_dist=freq_dist)

    logging.info("Processing inaugural addresses words ...")
    words = Parallel(n_jobs=PARALLEL_NUM_JOBS, verbose=PARALLEL_VERBOSITY_LEVEL)(
        delayed(process_inaugural_word)(word) for word in freq_dist.items()
    )
    metadata_df = pd.DataFrame([dict(zip(METADATA_HEADERS, word)) for word in words])
    metadata_df = metadata_df.sort_values(by="frequency", ascending=False)
    metadata_path = DATASET_INTERIM_PATH / "inaugural_address_raw_words_metadata.csv"
    logging.info(f"Saving inaugural addresses words metadata at {metadata_path}")
    metadata_df.to_csv(metadata_path, index=False)
    logging.info("Processing inaugural addresses words finished.")

    logging.info("Done.")