"""A proofreading tool using Google's N-gram corpus."""
import argparse
from cleanstring import *
import codecs
import csv
import os
import sys
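# Note on the on-disk layout (inferred from find_frequency() below): each
# two-word n-gram is stored under <dict>/<first char>/<first three chars>.txt,
# with spaces in the prefix padded by "_", one "<n-gram>\t<frequency>" pair
# per line.
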
def find_frequency(dictionary_location, s):
    """Reports the raw frequency of the two-word string [s] in the specified dictionary location."""
    frequency = 0
    s = clean_string(s)
    if s == "":
        return None
    # See if the appropriate dictionary exists.
    path = dictionary_location + s[0]
    if os.path.exists(path):
        file_name = path + "/" + s[0]
        if len(s) < 2 or s[1] == " ":
            file_name += "_"
        else:
            file_name += s[1]
        if len(s) < 3 or s[2] == " ":
            file_name += "_"
        else:
            file_name += s[2]
        file_name += ".txt"
        if os.path.isfile(file_name):
            # Read through the dictionary until we find a match (or not).
            in_file = codecs.open(file_name, 'r', 'utf-8')
            for data_in in in_file:
                data_list = data_in.split("\t")
                if data_list[0] == s:
                    frequency = int(data_list[1])
                    break
            in_file.close()
    else:
        frequency = None  # No dictionary found for this word pair.
    return frequency

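# A worked example (hypothetical paths, assuming a dictionary built under
# "dictionary/"): find_frequency("dictionary/", "the cat") cleans the string,
# then scans dictionary/t/the.txt for a line starting with "the cat".
# find_frequency("dictionary/", "a boat") pads the short prefix and reads
# dictionary/a/a_b.txt instead.
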
def whitelisted(config, word):
    """Report True if the word is in the whitelist or should otherwise be given a passing grade."""
    for char in word:
        if "0" <= char <= "9":
            return True  # Words containing numbers get a free pass.
    word = strip_word(word)
    if word in config["custom_dict"]:
        return True
    return False

def report_familiarity(config, word_trio, previous_report=None):
    """Takes a three-word list and returns the familiarity rating for the one in the middle."""
    if previous_report is None:
        previous_report = {}
    # How much artificial frequency is added to edge words (the very first and last words).
    edge_frequency = config["maxfreq"] * 0.7
    report = {"word": word_trio[1]}
    if whitelisted(config, word_trio[1]):
        report["frequency_before"] = config["maxfreq"]
        report["frequency_after"] = config["maxfreq"]
        report["score"] = 100
        return report
    if whitelisted(config, word_trio[0]):
        report["frequency_before"] = config["maxfreq"]
    elif isinstance(previous_report.get("frequency_after"), int):
        # Reuse the frequency already looked up for the previous pair.
        report["frequency_before"] = previous_report["frequency_after"]
    else:
        report["frequency_before"] = find_frequency(config["dict"], word_trio[0] + " " + word_trio[1])
    frequency_before = report["frequency_before"]
    if whitelisted(config, word_trio[2]):
        report["frequency_after"] = config["maxfreq"]
    else:
        report["frequency_after"] = find_frequency(config["dict"], word_trio[1] + " " + word_trio[2])
    frequency_after = report["frequency_after"]
    if frequency_before is None and frequency_after is None:
        frequency_before = 0
        frequency_after = 0
    else:
        # A lone None means it's either the very first or very last word.
        # Artificially boost its rating accordingly.
        if frequency_before is None:
            frequency_before = edge_frequency
        if frequency_after is None:
            frequency_after = edge_frequency
    frequency_before = min(frequency_before, config["maxfreq"])
    frequency_after = min(frequency_after, config["maxfreq"])
    report["score"] = frequency_before + frequency_after
    if frequency_before == 0 or frequency_after == 0:
        report["score"] -= report["score"] * (config["missinghit"] / 100)
    # Normalize the score to the 0-100 range.
    report["score"] = int(((report["score"] / 200) * 100) / (config["maxfreq"] / 100))
    return report

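# Scoring sketch with the default maxfreq of 20000: a word whose pairings both
# hit the cap scores int(((40000 / 200) * 100) / 200) = 100. If one pairing is
# missing (frequency 0) and the other is 4000, the raw score of 4000 loses the
# default 55% missing-hit penalty (leaving 1800) and normalizes to 4.
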
def start_report(config):
    """Does the housekeeping necessary before saving/displaying the report."""
    out_string = ""
    if config["out"] is not None:  # Remove any existing file if we're saving one.
        if os.path.isfile(config["out"]):
            os.remove(config["out"])
    if config["type"] == "full_html":
        out_string = """<!doctype html>\n<html lang="en">\n<head>\n\t<meta charset="utf-8">\n\t<title>"""
        out_string += config["in"]
        out_string += """</title>\n\t<meta name="description" content="Ingram processed text.">\n\t<meta name="author" content="Ingram">\n\t<link rel="stylesheet" href="ingram.css">\n</head>\n<body>\n<p>"""
    if config["out"] is not None:
        out_file = codecs.open(config["out"], 'a', 'utf-8')
        out_file.write(out_string)
        out_file.close()

def end_report(config):
    """Does the housekeeping necessary after saving/displaying the report."""
    out_string = ""
    if config["type"] == "full_html":
        out_string = "\n\t</p>\n</body>\n</html>\n"
    if config["out"] is not None:
        out_file = codecs.open(config["out"], 'a', 'utf-8')
        out_file.write(out_string)
        out_file.close()

def show_report(config, report, fragment=""):
    """Display/save a line of data in the requested output format."""
    out_string = ""
    if fragment != "":
        fragment = " " + fragment
    if config["type"] == "text":
        if "score" in report:
            out_string = report["word"] + fragment + "\t" + str(report["score"]) + "\n"
        else:
            out_string = report["word"] + fragment + "\t\n"
    elif config["type"] in ["html", "full_html"]:
        if "score" in report:
            # Bucket the score into a CSS class in steps of 10.
            class_number = round((report["score"] + 9) / 10) * 10
            if config["type"] == "full_html":
                out_string = '<span class="ngram%i ngramPopup">%s%s<span>Score: %i<br>Frequency before: %s<br>Frequency after: %s</span></span> ' % (class_number, report["word"], fragment, report["score"], report["frequency_before"], report["frequency_after"])
            else:
                out_string = '<span class="ngram%i">%s</span>%s ' % (class_number, report["word"], fragment)
        else:
            out_string = report["word"] + " "
        if "\n" in report["word"]:
            out_string += "</p>\n<p>"
    elif config["type"] == "tsv":
        out_string = str(config["word_count"]) + "\t" + report["word"] + fragment + "\t" + str(report["score"]) + "\t" + str(report["frequency_before"]) + "\t" + str(report["frequency_after"]) + "\n"
    if config["type"] == "csv":  # The CSV writer does its own thing.
        if config["out"] is not None:
            out_file = codecs.open(config["out"], 'a', 'utf-8')
            output = csv.writer(out_file, dialect='excel')
            output.writerow([config["word_count"], report["word"] + fragment, report["score"], report["frequency_before"], report["frequency_after"]])
            out_file.close()
        else:
            output = csv.writer(sys.stdout, dialect='excel')
            output.writerow([config["word_count"], report["word"] + fragment, report["score"], report["frequency_before"], report["frequency_after"]])
    else:  # Dump the output to its chosen location.
        if config["out"] is not None:
            out_file = codecs.open(config["out"], 'a', 'utf-8')
            out_file.write(out_string)
            out_file.close()
        else:
            sys.stdout.write(out_string)

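# For example, with -type text each word is emitted as "<word>\t<score>": a
# familiar word might print as something like "cat\t87", while an unfamiliar
# word or likely typo scores near 0 (the numbers here are illustrative).
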
def process_text(config):
    """Processes the input text."""
    if os.path.isfile(config["in"]):
        if os.path.isfile(config["dict"] + config["custom_dict_name"]):
            in_file = codecs.open(config["in"], 'r', 'utf-8')
            word_trio = ["", "", ""]
            config["word_count"] = 0
            last_report = {}
            for data_in in in_file:
                word_list = data_in.split(" ")
                for word in word_list:
                    if len(strip_word(word)) > 0:
                        word = word.strip(" ")
                        if word not in ["", "\t", "\n", "\r", " "]:
                            # Slide the three-word window forward.
                            word_trio.append(word)
                            word_trio.pop(0)
                            if word_trio[1] != "":
                                report = report_familiarity(config, word_trio, last_report)
                                if "fragment" in last_report:
                                    show_report(config, report, last_report["fragment"])
                                else:
                                    show_report(config, report, "")
                                config["word_count"] += 1
                                last_report = report
                    else:
                        if word not in ["", "\t", "\n", "\r", " "]:
                            # There's a fragment of something (probably punctuation); save it for later.
                            report = {"fragment": word}
                            last_report = report
            in_file.close()
            # Process the last word in the file.
            word_trio.append("")
            word_trio.pop(0)
            report = report_familiarity(config, word_trio)
            show_report(config, report)
    else:
        print("Error: the input file [%s] was not found." % config["in"])

def add_custom(config):
    """Add a custom word to the custom dictionary."""
    # Only add it if it doesn't exist yet.
    clean_add_text = config["add"].lower().strip("\n\r\t ")
    if clean_add_text not in config["custom_dict"]:
        f = codecs.open(config["dict"] + config["custom_dict_name"], 'a', 'utf-8')
        f.write("\n" + config["add"])
        f.close()
        load_custom_dict(config)
        print("Word [%s] will now be ignored when using this dictionary." % config["add"])
    else:
        print("Word not added to the dictionary. [%s] is already in the custom dictionary." % config["add"])

def remove_custom(config):
    """Remove a custom word from the dictionary."""
    if os.path.isfile(config["dict"] + config["custom_dict_name"]):
        in_file = codecs.open(config["dict"] + config["custom_dict_name"], 'r', 'utf-8')
        out_file = codecs.open(config["dict"] + config["custom_dict_name"] + ".tmp", 'w', 'utf-8')
        found_count = 0
        clean_remove_text = config["remove"].lower().strip("\n\r\t ")
        # Copy every line except the ones that match the word being removed.
        for data_in in in_file:
            if clean_remove_text != data_in.lower().strip("\n\r\t "):
                out_file.write(data_in)
            else:
                found_count += 1
        in_file.close()
        out_file.close()
        os.remove(config["dict"] + config["custom_dict_name"])
        os.rename(config["dict"] + config["custom_dict_name"] + ".tmp", config["dict"] + config["custom_dict_name"])
        load_custom_dict(config)
        print("Found and removed %i instances of [%s]." % (found_count, config["remove"]))

def strip_word(word):
    """Converts a word to lowercase and strips out all non-alphabetic characters."""
    stripped = ""
    word = word.lower()
    for char in word:
        if "a" <= char <= "z":
            stripped += char
    return stripped

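# e.g. strip_word("Hello,") returns "hello" and strip_word("42") returns "".
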
def load_custom_dict(config):
    """Load the custom dictionary. Returns False if the dictionary folder does not exist."""
    config["custom_dict"] = []
    if not os.path.exists(config["dict"]):
        print("Error: No dictionary found in path [%s]." % config["dict"])
        return False
    # Create the whitelist file if it doesn't exist.
    if not os.path.isfile(config["dict"] + config["custom_dict_name"]):
        f = codecs.open(config["dict"] + config["custom_dict_name"], 'w', 'utf-8')
        f.write("# Custom dictionary white list.\n")
        f.write("# Each entry should be on a line by itself.\n")
        f.close()
    # Load the file; each line that doesn't start with "#" is a word for the dictionary.
    in_file = codecs.open(config["dict"] + config["custom_dict_name"], 'r', 'utf-8')
    for data_in in in_file:
        if not data_in.startswith("#"):
            data_in = strip_word(data_in)
            if len(data_in) > 0:
                config["custom_dict"].append(data_in)
    in_file.close()
    return True

def get_config():
    """Parse the command line arguments."""
    parser = argparse.ArgumentParser(description="A tool for checking spelling and grammar using Google's n-gram corpus.")
    parser.add_argument('-in', help='Text file to process.', required=False, default="", metavar="FILE")
    parser.add_argument('-out', help='File name for saving output. Will be overwritten if it exists. If not given, output is echoed to stdout.', required=False, metavar="FILE")
    parser.add_argument('-type', help='[text, csv, tsv, html, full_html] Type of output to produce.', required=False, default="text", metavar="TYPE")
    parser.add_argument('-dict', help="Dictionary to use. (Default: dictionary/)", required=False, default="dictionary/", metavar="PATH")
    parser.add_argument('-add', help="Add a word to the custom whitelist.", required=False, default="", metavar="STRING")
    parser.add_argument('-remove', help="Remove a word from the custom whitelist.", required=False, default="", metavar="STRING")
    parser.add_argument('-maxfreq', help="[Advanced] Frequency hits above this will not improve the familiarity score. Higher = more sensitive. (Default: 20000.)", default=20000, type=int, required=False, metavar="INT")
    parser.add_argument('-missinghit', help="[Advanced] Percentage points removed from a word's score if there's no record of a pairing. Higher = missing matches are more visible. (Default: 55.)", default=55, type=int, required=False, metavar="INT")
    config = vars(parser.parse_args())
    # Add some useful things to the config.
    config["custom_dict_name"] = "custom.txt"
    # Some super basic verification.
    if config["type"] not in ["text", "html", "csv", "full_html", "tsv"]:
        print("Error: Output type [%s] not recognized." % config["type"])
        sys.exit(1)
    return config

# Begin Main
if __name__ == "__main__":
    config = get_config()
    if not load_custom_dict(config):
        config = None
    if config is not None:
        if config["add"] != "":
            add_custom(config)
        if config["remove"] != "":
            remove_custom(config)
        if config["in"] != "":
            start_report(config)
            process_text(config)
            end_report(config)
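
# Typical invocations (a sketch; assumes an n-gram dictionary has already been
# built under ./dictionary/, and draft.txt, report.html, and "colophon" are
# placeholder names):
#   python ingram.py -in draft.txt
#   python ingram.py -in draft.txt -type full_html -out report.html
#   python ingram.py -add colophon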