Skip to content

Commit 64519a5

Browse files
authored
Merge pull request #131 from bact/dev
summarize: Small variable rename and handle engine not found case
2 parents 3307699 + c1296d7 commit 64519a5

File tree

1 file changed

+58
-41
lines changed

1 file changed

+58
-41
lines changed

pythainlp/summarize/__init__.py

+58-41
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,68 @@
11
# -*- coding: utf-8 -*-
2-
from __future__ import absolute_import,unicode_literals
3-
from pythainlp.corpus import stopwords
4-
from string import punctuation
2+
3+
from __future__ import absolute_import, unicode_literals
4+
55
from collections import defaultdict
6-
from pythainlp.tokenize import sent_tokenize, word_tokenize
76
from heapq import nlargest
7+
from string import punctuation
8+
9+
from pythainlp.corpus import stopwords
10+
from pythainlp.tokenize import sent_tokenize, word_tokenize
11+
12+
813
class FrequencySummarizer:
    """
    Frequency-based extractive summarizer.

    Scores each sentence by the summed normalized frequency of its
    non-stopword tokens and selects the top-n sentences as the summary.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        :param float min_cut: words with normalized frequency <= min_cut are dropped
        :param float max_cut: words with normalized frequency >= max_cut are dropped
        """
        self.__min_cut = min_cut
        self.__max_cut = max_cut
        # Thai stopwords plus ASCII punctuation are excluded from frequency counts.
        self.__stopwords = set(stopwords.words("thai") + list(punctuation))

    def __compute_frequencies(self, word_tokenized_sents):
        """
        Compute normalized word frequencies over the tokenized sentences.

        :param list word_tokenized_sents: list of token lists, one per sentence
        :return dict: word -> frequency normalized by the most frequent word,
            with words outside (min_cut, max_cut) removed
        """
        word_freqs = defaultdict(int)
        for sent in word_tokenized_sents:
            for word in sent:
                if word not in self.__stopwords:
                    word_freqs[word] += 1

        # Guard: with no countable words (empty text, or text made only of
        # stopwords/punctuation) max() below would raise ValueError.
        if not word_freqs:
            return word_freqs

        max_freq = float(max(word_freqs.values()))
        # Iterate over a snapshot of the keys, since entries may be deleted.
        for w in list(word_freqs):
            word_freqs[w] = word_freqs[w] / max_freq
            if word_freqs[w] >= self.__max_cut or word_freqs[w] <= self.__min_cut:
                del word_freqs[w]

        return word_freqs

    def __rank(self, ranking, n):
        """Return the indices of the n highest-scoring sentences, best first."""
        return nlargest(n, ranking, key=ranking.get)

    def summarize(self, text, n, tokenizer):
        """
        Summarize text by selecting its n highest-ranked sentences.

        :param str text: text to be summarized
        :param int n: number of sentences to include in the summary
        :param str tokenizer: word tokenizer engine name passed to word_tokenize
        :return list: selected sentences, ordered by rank (best first)
        """
        sents = sent_tokenize(text)
        word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
        self.__freq = self.__compute_frequencies(word_tokenized_sents)
        ranking = defaultdict(int)

        # Score each sentence as the sum of its surviving word frequencies.
        for i, sent in enumerate(word_tokenized_sents):
            for w in sent:
                if w in self.__freq:
                    ranking[i] += self.__freq[w]
        summaries_idx = self.__rank(ranking, n)

        return [sents[j] for j in summaries_idx]
50+
51+
52+
def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
    """
    Summarize a Thai text into its n most relevant sentences.

    :param str text: text to be summarized
    :param int n: number of sentences to include in the summary
    :param str engine: text summarization engine ("frequency")
    :param str tokenizer: word tokenizer engine name
    :return List[str] summary: list of selected sentences
    """
    if engine == "frequency":
        return FrequencySummarizer().summarize(text, n, tokenizer)

    # Unknown engine: fall back to the first n sentences of the text.
    return sent_tokenize(text)[:n]

0 commit comments

Comments
 (0)