|
1 | 1 | # -*- coding: utf-8 -*-
|
2 |
| -from __future__ import absolute_import,unicode_literals |
3 |
| -from pythainlp.corpus import stopwords |
4 |
| -from string import punctuation |
| 2 | + |
| 3 | +from __future__ import absolute_import, unicode_literals |
| 4 | + |
5 | 5 | from collections import defaultdict
|
6 |
| -from pythainlp.tokenize import sent_tokenize, word_tokenize |
7 | 6 | from heapq import nlargest
|
| 7 | +from string import punctuation |
| 8 | + |
| 9 | +from pythainlp.corpus import stopwords |
| 10 | +from pythainlp.tokenize import sent_tokenize, word_tokenize |
| 11 | + |
| 12 | + |
class FrequencySummarizer:
    """Summarize Thai text by word-frequency sentence ranking.

    Words are scored by their frequency relative to the most common word
    (stopwords and punctuation excluded); words whose relative frequency
    falls outside ``(min_cut, max_cut)`` are discarded as uninformative.
    Sentences are then ranked by the sum of their word scores and the
    top-``n`` sentences are returned.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        :param float min_cut: words with relative frequency <= min_cut are dropped
            (too rare to be informative)
        :param float max_cut: words with relative frequency >= max_cut are dropped
            (too common, likely boilerplate)
        """
        self.__min_cut = min_cut
        self.__max_cut = max_cut
        self.__stopwords = set(stopwords.words("thai") + list(punctuation))

    def __compute_frequencies(self, word_tokenized_sents):
        """Return a dict of word -> relative frequency, filtered by the cuts.

        :param word_tokenized_sents: list of sentences, each a list of word tokens
        """
        word_freqs = defaultdict(int)
        for sent in word_tokenized_sents:
            for word in sent:
                if word not in self.__stopwords:
                    word_freqs[word] += 1

        # Guard: if the input is empty or contains only stopwords/punctuation,
        # max() over an empty sequence would raise ValueError.
        if not word_freqs:
            return word_freqs

        max_freq = float(max(word_freqs.values()))
        # Iterate over a snapshot of the keys because entries may be deleted.
        for w in list(word_freqs):
            word_freqs[w] = word_freqs[w] / max_freq
            if word_freqs[w] >= self.__max_cut or word_freqs[w] <= self.__min_cut:
                del word_freqs[w]

        return word_freqs

    def __rank(self, ranking, n):
        """Return the indexes of the ``n`` highest-scoring sentences."""
        return nlargest(n, ranking, key=ranking.get)

    def summarize(self, text, n, tokenizer):
        """Return a list of the ``n`` most significant sentences of ``text``.

        :param str text: Thai text to summarize
        :param int n: number of sentences to include in the summary
        :param str tokenizer: word-tokenizer engine name, forwarded to
            ``word_tokenize``
        :return: list of selected sentences, ordered by descending score
        """
        sents = sent_tokenize(text)
        word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
        self.__freq = self.__compute_frequencies(word_tokenized_sents)
        ranking = defaultdict(int)

        # Score each sentence by summing the scores of its surviving words.
        for i, sent in enumerate(word_tokenized_sents):
            for w in sent:
                if w in self.__freq:
                    ranking[i] += self.__freq[w]
        summaries_idx = self.__rank(ranking, n)

        return [sents[j] for j in summaries_idx]
| 50 | + |
| 51 | + |
def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
    """
    Thai text summarization
    :param str text: text to be summarized
    :param int n: number of sentences to be included in the summary
    :param str engine: text summarization engine
    :param str tokenizer: word tokenizer
    :return List[str] summary: list of selected sentences
    """
    if engine == "frequency":
        return FrequencySummarizer().summarize(text, n, tokenizer)

    # Unknown engine: fall back to the first n sentences of the text.
    return sent_tokenize(text)[:n]
0 commit comments