-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_frequency.py
85 lines (74 loc) · 2.13 KB
/
word_frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from collections import Counter
import heapq
import nltk
import re
class WordFrequency:
    """Frequency-based extractive summarizer.

    Words are counted (ignoring stopwords), the counts are normalized by the
    highest count, and every sentence is scored by the sum of its word
    weights divided by its total word count. The ``num`` best-scoring
    sentences form the summary; the two heaviest words form a title.
    """

    # Word pattern used for tokenizing. ``findall`` with this pattern yields
    # the same tokens as ``nltk.tokenize.RegexpTokenizer(r'\w+')`` without
    # building a tokenizer object on every call.
    _WORD_PATTERN = re.compile(r"\w+")

    def __init__(self, text, num):
        """Store the input text and the number of summary sentences.

        Args:
            text: Raw text to summarize.
            num: Number of sentences to keep; anything ``int()`` accepts.

        Raises:
            ValueError / TypeError: If ``num`` cannot be converted to ``int``.
        """
        self.text = text
        self.num = int(num)
        # Loaded through NLTK; presumably requires the 'stopwords' corpus to
        # be installed locally.
        self.stopwords = nltk.corpus.stopwords.words('english')

    def _tokenize_words(self, text):
        """Return every run of word characters in ``text``, in order."""
        return self._WORD_PATTERN.findall(text)

    def word_sent_tokenizer(self, text):
        """Split ``text`` into words and sentences.

        Returns:
            A ``(words, sentences)`` tuple of two lists of strings.
        """
        words = self._tokenize_words(text)
        sentences = nltk.sent_tokenize(text)
        return words, sentences

    def create_wordfrequency(self, words):
        """Build a normalized frequency table for the non-stopword words.

        Words are lower-cased before filtering and counting so that the
        table agrees with ``score_sentences``, which looks up lower-cased
        tokens, and so that capitalised stopwords are filtered too.

        Args:
            words: Iterable of word strings.

        Returns:
            Dict mapping each lower-cased word to ``count / max_count``.
            Empty when ``words`` is empty or holds only stopwords.
        """
        # Set membership is O(1); built per call so that reassigning
        # ``self.stopwords`` is always honoured.
        stopwords = set(self.stopwords)
        counts = Counter(
            word
            for word in (raw.lower() for raw in words)
            if word not in stopwords
        )
        if not counts:
            # Nothing to normalize; max() of an empty sequence would raise.
            return {}
        max_freq = max(counts.values())
        return {word: count / max_freq for word, count in counts.items()}

    def maxWordFreq(self, word_freq, top_n=2):
        """Return the highest-weighted words as one string.

        Args:
            word_freq: Mapping of word to weight.
            top_n: How many words to include (defaults to 2).

        Returns:
            The words in descending weight order, each followed by a single
            space (so a non-empty result ends with a space); ``''`` when
            ``word_freq`` is empty.
        """
        top = Counter(word_freq).most_common(top_n)
        return "".join(word + " " for word, _ in top)

    def score_sentences(self, sentences, word_freq):
        """Score each sentence by its average word weight.

        Args:
            sentences: Iterable of sentence strings.
            word_freq: Mapping of lower-cased word to weight.

        Returns:
            Dict mapping each sentence to the sum of the weights of its
            non-stopword words divided by its total word count. Sentences
            with no words, or with stopwords only, are left out. Words
            missing from ``word_freq`` contribute 0.
        """
        stopwords = set(self.stopwords)
        sent_freq = {}
        for sentence in sentences:
            tokens = self._tokenize_words(sentence.lower())
            if not tokens:
                continue
            content_words = [w for w in tokens if w not in stopwords]
            if not content_words:
                continue
            # Compute the total locally and assign once, so a sentence that
            # appears more than once is not added onto its own already
            # normalized score.
            total = sum(word_freq.get(w, 0) for w in content_words)
            sent_freq[sentence] = total / len(tokens)
        return sent_freq

    def clean_text(self):
        """Return ``self.text`` lower-cased with bracketed numeric markers
        such as ``[12]`` (and empty ``[]``) removed."""
        cleaned_text = re.sub(r'\[[0-9]*\]', '', self.text)
        return cleaned_text.lower()

    def generate_summary(self, text):
        """Summarize ``text``.

        Returns:
            A ``(summary, title)`` tuple. ``summary`` is the ``self.num``
            best-scoring sentences in descending score order, each
            capitalized and preceded by a space. ``title`` is the
            upper-cased result of ``maxWordFreq``.
        """
        words, sentences = self.word_sent_tokenizer(text)
        word_frequency = self.create_wordfrequency(words)
        sentence_scores = self.score_sentences(sentences, word_frequency)
        best = heapq.nlargest(
            self.num, sentence_scores, key=sentence_scores.get
        )
        summary = "".join(" " + sentence.capitalize() for sentence in best)
        title = self.maxWordFreq(word_frequency).upper()
        return summary, title

    def summarize_text(self):
        """Clean ``self.text`` and summarize it.

        Returns:
            A ``(cleaned_text, title, summary)`` tuple.
        """
        text = self.clean_text()
        summary, title = self.generate_summary(text)
        return text, title, summary