-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_coding.py
71 lines (62 loc) · 1.44 KB
/
make_coding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import sys
import pickle
from collections import Counter
def codes(what, prefix=""):
    """Yield ``(bit_string, symbol)`` pairs for every leaf of a binary code tree.

    A node is a sequence. A 2-element node is a leaf whose symbol is stored
    at index 1. A node with more than two elements is an internal node whose
    children are stored at indexes 1 and 2; the child at index 1 extends the
    code with "0" and the child at index 2 extends it with "1". Index 0 is
    never read here.

    Args:
        what: The (sub)tree to walk.
        prefix: Bits accumulated on the path from the root to ``what``.

    Yields:
        ``(code, symbol)`` tuples, children at index 1 before those at index 2.
    """
    if len(what) == 2:
        # A leaf reached with an empty prefix means the whole tree is a single
        # symbol. An empty code carries no bits, so give that symbol "0".
        yield (prefix or "0", what[1])
    elif len(what) > 2:
        yield from codes(what[1], prefix + "0")
        yield from codes(what[2], prefix + "1")
    else:
        # Malformed node (fewer than two elements): report its length and
        # yield nothing for it.
        print(len(what))
# Read the whole corpus from the file named by the first command-line argument.
with open(sys.argv[1]) as corpus_file:
    corpus = corpus_file.read()

# Multi-character tokens that receive their own code in addition to the
# individual characters of the corpus.
# NOTE(review): the last two entries are identical single spaces, which also
# duplicate the single-character ' ' symbol counted from the corpus. They may
# originally have been longer whitespace runs -- confirm against upstream.
extra_tokens = [
    'def ',
    'return',
    'lambda ',
    'from ',
    'import ',
    "''.join(",
    'print(',
    'map(',
    'min(',
    'max(',
    'input()',
    'range(',
    'len',
    'sorted(',
    'set(',
    'zip(',
    'list(',
    'for ',
    ' in ',
    'if ',
    'else',
    'elif ',
    '.count',
    'not ',
    'any(',
    'all(',
    'sum(',
    ' and ',
    ' or ',
    '.split(',
    '.replace(',
    '.find(',
    ' ',
    ' ',
]

# Leaves are (weight, symbol): one per distinct corpus character, then the
# 'END' marker with weight 1, then one per extra token weighted by how often
# it occurs in the corpus.
remaining = [(count, char) for char, count in Counter(corpus).most_common()]
remaining.append((1, 'END'))
for token in extra_tokens:
    remaining.append((corpus.count(token), token))

# Repeatedly merge the two lightest nodes into (weight, first, second) until
# a single root is left. Sorting heaviest-first puts the lightest nodes at the
# end of the list, where pop() takes them.
while len(remaining) > 1:
    remaining.sort(key=lambda node: node[0], reverse=True)
    lightest = remaining.pop()
    next_lightest = remaining.pop()
    merged_weight = lightest[0] + next_lightest[0]
    remaining.append((merged_weight, lightest, next_lightest))

result = remaining[0]

# Map each symbol to its bit string; a symbol that appears on more than one
# leaf keeps the code of the last leaf visited.
coding = {}
for bits, symbol in codes(result):
    coding[symbol] = bits

# Persist the table for later use.
with open("coding.dat", "wb") as out_file:
    pickle.dump(coding, out_file)

# Print the table as tab-separated "repr(symbol)<TAB>bits" lines.
for letter, bits in coding.items():
    print(repr(letter), bits, sep="\t")