|
1 | 1 | # -*- coding: utf-8 -*-
|
2 |
| -from __future__ import absolute_import,unicode_literals |
3 |
| -from pythainlp.corpus import stopwords |
4 |
| -from string import punctuation |
| 2 | + |
| 3 | +from __future__ import absolute_import, unicode_literals |
| 4 | + |
5 | 5 | from collections import defaultdict
|
6 |
| -from pythainlp.tokenize import sent_tokenize, word_tokenize |
7 | 6 | from heapq import nlargest
|
| 7 | +from string import punctuation |
| 8 | + |
| 9 | +from pythainlp.corpus import stopwords |
| 10 | +from pythainlp.tokenize import sent_tokenize, word_tokenize |
| 11 | + |
| 12 | + |
class FrequencySummarizer:
    """Summarize Thai text by word-frequency sentence ranking.

    Words are scored by their frequency relative to the most common word
    (stopwords and punctuation excluded); words whose relative frequency
    falls outside ``(min_cut, max_cut)`` are discarded as uninformative.
    Sentences are then ranked by the sum of their word scores and the
    top-``n`` sentences are returned.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        :param float min_cut: words with relative frequency <= min_cut are dropped
            (too rare to be informative)
        :param float max_cut: words with relative frequency >= max_cut are dropped
            (too common, likely boilerplate)
        """
        self.__min_cut = min_cut
        self.__max_cut = max_cut
        self.__stopwords = set(stopwords.words("thai") + list(punctuation))

    def __compute_frequencies(self, word_tokenized_sents):
        """Return a dict of word -> relative frequency, filtered by the cuts.

        :param word_tokenized_sents: list of sentences, each a list of word tokens
        """
        word_freqs = defaultdict(int)
        for sent in word_tokenized_sents:
            for word in sent:
                if word not in self.__stopwords:
                    word_freqs[word] += 1

        # Guard: if the input is empty or contains only stopwords/punctuation,
        # max() over an empty sequence would raise ValueError.
        if not word_freqs:
            return word_freqs

        max_freq = float(max(word_freqs.values()))
        # Iterate over a snapshot of the keys because entries may be deleted.
        for w in list(word_freqs):
            word_freqs[w] = word_freqs[w] / max_freq
            if word_freqs[w] >= self.__max_cut or word_freqs[w] <= self.__min_cut:
                del word_freqs[w]

        return word_freqs

    def __rank(self, ranking, n):
        """Return the indexes of the ``n`` highest-scoring sentences."""
        return nlargest(n, ranking, key=ranking.get)

    def summarize(self, text, n, tokenizer):
        """Return a list of the ``n`` most significant sentences of ``text``.

        :param str text: Thai text to summarize
        :param int n: number of sentences to include in the summary
        :param str tokenizer: word-tokenizer engine name, forwarded to
            ``word_tokenize``
        :return: list of selected sentences, ordered by descending score
        """
        sents = sent_tokenize(text)
        word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
        self.__freq = self.__compute_frequencies(word_tokenized_sents)
        ranking = defaultdict(int)

        # Score each sentence by summing the scores of its surviving words.
        for i, sent in enumerate(word_tokenized_sents):
            for w in sent:
                if w in self.__freq:
                    ranking[i] += self.__freq[w]
        summaries_idx = self.__rank(ranking, n)

        return [sents[j] for j in summaries_idx]
| 50 | + |
| 51 | + |
def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
    """
    Thai text summarization
    :param str text: text to be summarized
    :param int n: number of sentences to be included in the summary
    :param str engine: text summarization engine
    :param str tokenizer: word tokenizer
    :return List[str] summary: list of selected sentences
    """
    if engine == "frequency":
        return FrequencySummarizer().summarize(text, n, tokenizer)

    # Unknown engine: fall back to the first n sentences of the text.
    return sent_tokenize(text)[:n]
0 commit comments