-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyzeRefactor4More.py
138 lines (114 loc) · 4.77 KB
/
analyzeRefactor4More.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import sys
import re
import os
import psutil # if this makes an error, you need to install the psutil package on your system
import time
maxmem = 0


def showMemTime(when='Resources'):
    """Write a one-line memory and CPU-time report for this process to stderr.

    The line shows the current memory figure in MB, the largest figure seen
    by any call so far (tracked in the module-level ``maxmem``), and the user
    and system CPU seconds reported by psutil for the current process.

    Args:
        when: Label printed at the start of the line, saying where in the
            program the measurement was taken.
    """
    global maxmem
    process = psutil.Process(os.getpid())
    # psutil 2.0 renamed get_memory_info()/get_cpu_times() to
    # memory_info()/cpu_times(), and later releases dropped the old names.
    # Prefer the current API and fall back only for very old installations.
    read_memory = getattr(process, 'memory_info', None) or process.get_memory_info
    read_cpu_times = getattr(process, 'cpu_times', None) or process.get_cpu_times
    # Field 0 of the memory tuple is a byte count (the resident set size
    # according to the psutil documentation); convert it to MB.
    mem = read_memory()[0] / float(2 ** 20)
    maxmem = max(maxmem, mem)
    ts = read_cpu_times()
    sys.stderr.write("{when:<20}: {mb:4.0f} MB (max {maxmb:4.0f} MB), {user:4.1f} s user, {system:4.1f} s system\n".format(
        when=when, mb=mem, maxmb=maxmem, user=ts.user, system=ts.system))
class NGramCounter(object):
    """Count n-tuples of word endings, each ending at most m characters long.

    Public attributes:
      n      -- number of words that make up one n-gram
      m      -- number of trailing characters kept from every word
      ngrams -- dict mapping a tuple of n endings to its occurrence count
    """

    def __init__(self, n, m):
        """Create an empty counter for n-grams of m-character endings."""
        self.ngrams = {}
        self.n = n
        self.m = m

    @staticmethod
    def _as_native_str(ending):
        """Return `ending` as the interpreter's native `str` for printing.

        On Python 2 the stored endings are `unicode` objects and are encoded
        to UTF-8 bytes; on Python 3 they already are `str` and pass through.
        """
        if isinstance(ending, str):
            return ending
        return ending.encode('utf-8')

    def count(self, *words):
        """Record one occurrence of the n-gram formed by the given words.

        Args:
            *words: Exactly `self.n` words. Byte strings are decoded as
                UTF-8; text that is already decoded is accepted unchanged.

        If the number of words differs from `self.n`, an error message is
        printed and nothing is counted.
        """
        if len(words) != self.n:
            print("Error : number of given words is not equal to required number of ngram")
            return
        endings = []
        for word in words:
            # Decode first so the slice below counts characters, not UTF-8
            # bytes (a multi-byte character must not be cut in half).
            if isinstance(word, bytes):
                word = word.decode('utf8')
            endings.append(word[-self.m:])
        ngram = tuple(endings)
        self.ngrams[ngram] = self.ngrams.get(ngram, 0) + 1

    def display(self, k):
        """Print the k most frequent n-grams, most frequent first.

        The collected counts are released while doing so: afterwards the
        counter is empty (but still usable), which keeps peak memory down
        during sorting.

        Args:
            k: Maximum number of n-grams to print.
        """
        showMemTime('begin display')
        # build list of all (ngram, frequency) pairs
        ngram_freq = self.ngrams.items()
        if not isinstance(ngram_freq, list):
            # Python 3 returns a view; the in-place sort needs a real list.
            ngram_freq = list(ngram_freq)
        # Release the dictionary now that its content lives in the list.
        # Rebinding (instead of `del`) keeps later count()/display() calls
        # from failing with AttributeError.
        self.ngrams = {}
        showMemTime('after items')
        # sort that list by frequency (i.e., second field), descending
        print("sorting ...")
        ngram_freq.sort(key=lambda item: item[1], reverse=True)
        # iterate over the first k (or fewer) elements
        print("creating output ...")
        for ngram, occurred in ngram_freq[0:k]:
            endings = "' '".join([self._as_native_str(e) for e in ngram])
            print("%d-ending ngram '%s' occured %d times" % (self.m, endings, occurred))
def displayKMostFrequentNMGramsInFile(k, n, m, filename):
    """Count the n-grams of m-character word endings in a file; print the top k.

    Lines are processed from the last to the first. Each non-empty line is
    extended with the first n-1 words of the text that follows it, so that
    n-grams spanning a line break are counted as well. Words are separated
    by spaces and newlines only.

    Args:
        k: Number of most frequent n-grams to print.
        n: Number of words per n-gram (expected to be at least 1).
        m: Number of trailing characters kept from every word.
        filename: Path of the text file to analyse.
    """
    showMemTime('begin')  # let's observe where we use our memory and time
    # initialize ngram counter
    nc = NGramCounter(n, m)
    # first n-1 words of the text following the line currently processed;
    # empty while the last line of the file is being handled
    firstnwords = []
    print("reading from file " + filename)
    # The context manager guarantees the file is closed again. The line list
    # is created inline so that it is released as soon as the loop finishes.
    with open(filename, 'r') as f:
        # reading back to front requires all lines to be loaded first
        for line in reversed(list(f)):
            # split on spaces and newlines, dropping empty strings
            inputwords = [word for word in re.split(r' |\n', line) if word != '']
            if not inputwords:
                continue
            # append the leading words of the following text (no-op if empty)
            inputwords += firstnwords
            # remember the leading words for the line processed next
            firstnwords = inputwords[0:n - 1]
            # visit every start index that still has n words after it,
            # back to front; the range is empty if the line is too short
            for idx in reversed(range(len(inputwords) - n + 1)):
                nc.count(*inputwords[idx:idx + n])
    showMemTime('after counting')
    print("%d grams:" % n)
    nc.display(k)
def main():
    """Entry point: analyse the file named by the single command-line argument.

    Prints a usage message and exits with status -1 unless exactly one
    argument is given. Otherwise runs the analysis once for every
    (k, n, m) combination listed below, in order.
    """
    arguments = sys.argv
    # make sure the user gave us a file to read
    if len(arguments) != 2:
        print("need one argument! (file to read from)")
        sys.exit(-1)
    filename = arguments[1]
    # each entry: (k most frequent, n words per n-gram, m ending characters)
    runs = (
        (30, 2, 2),
        (30, 2, 3),
        (30, 2, 4),
        (20, 3, 2),
        (20, 3, 3),
        (15, 4, 2),
        (15, 4, 3),
    )
    # every run opens the file, counts the n-grams and prints the top k
    for k, n, m in runs:
        displayKMostFrequentNMGramsInFile(k, n, m, filename)
# Script entry: run the analysis, then write a final resource report to stderr.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so these two
# calls also execute when this file is imported as a module.
main()
showMemTime('at the end')