forked from rohanag/StockMarketSentimentAnalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrslp.py
145 lines (114 loc) · 5.38 KB
/
rslp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -*- coding: utf-8 -*-
# Natural Language Toolkit: RSLP Stemmer
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Tiago Tresoldi <tresoldi@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
# This code is based on the algorithm presented in the paper "A Stemming
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
# Christian Huyck, which unfortunately I had no access to. The code is a
# Python version, with some minor modifications of mine, to the description
# presented at http://www.webcitation.org/5NnvdIzOb and to the C source code
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
# Please note that this stemmer is intended for demonstration and educational
# purposes only. Feel free to write me for any comments, including the
# development of a different and/or better stemmer for Portuguese. I also
# suggest using NLTK's mailing list for Portuguese for any discussion.
# Este código é baseado no algoritmo apresentado no artigo "A Stemming
# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e
# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O
# código é uma conversão para Python, com algumas pequenas modificações
# minhas, daquele apresentado em http://www.webcitation.org/5NnvdIzOb e do
# código para linguagem C disponível em
# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,
# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente
# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer
# comentário, inclusive sobre o desenvolvimento de um stemmer diferente
# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
# do NLTK para o português para qualquer debate.
from nltk.data import load
from api import StemmerI
class RSLPStemmer(StemmerI):
    """
    A stemmer for Portuguese.

    Stemming is a fixed sequence of suffix-rewriting steps.  Each step is
    driven by a rule table read from the NLTK data package
    (``stemmers/rslp/step0.pt`` to ``step6.pt``).

        >>> from nltk.stem import RSLPStemmer
        >>> st = RSLPStemmer()
        >>> # opening lines of Erico Verissimo's "Música ao Longe"
        >>> text = u'''
        ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
        ... devem copiar . Uma casinha de porta e janela , em cima duma
        ... coxilha .'''
        >>> for token in text.split():
        ...     print st.stem(token),
        clariss risc com giz no quadro-negr a pais que os alun dev copi .
        uma cas de port e janel , em cim dum coxilh .
    """

    def __init__(self):
        """
        Load the seven rule tables into ``self._model``.

        The position of a table in ``self._model`` is the ``rule_index``
        that ``stem`` passes to ``apply_rule``: 0 plural, 1 feminine,
        2 adverb, 3 augmentative, 4 noun, 5 verb, 6 vowel removal.
        """
        self._model = [
            self.read_rule("step%d.pt" % step) for step in range(7)
        ]

    def read_rule(self, filename):
        """
        Load and parse one rule file from the NLTK data package.

        Blank lines and lines starting with ``#`` are skipped.  Every other
        line is expected to hold four tab-separated fields: the quoted
        suffix, the minimum stem size (an integer), the quoted replacement
        and a comma-separated list of quoted exception words.

        :param filename: file name below ``nltk:stemmers/rslp/``.
        :return: a list of rules, each of the form
            ``[suffix, min_stem_size, replacement, exceptions]`` where
            ``exceptions`` is a list of strings.
        :raise IndexError: if a rule line has fewer than four fields.
        :raise ValueError: if the minimum stem size is not an integer.
        """
        raw = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")

        rules = []
        for line in raw.split("\n"):
            # Tolerate CRLF files: a trailing '\r' would otherwise end up in
            # the last field and break the quote stripping below.
            line = line.rstrip("\r")
            if not line or line[0] == "#":  # blank line or comment
                continue

            # NOTE: a simple but ugly hack to make this parser happy with
            # double '\t's
            tokens = line.replace("\t\t", "\t").split("\t")

            rules.append([
                tokens[0][1:-1],  # suffix to look for (quotes removed)
                int(tokens[1]),   # minimum stem size for the replacement
                tokens[2][1:-1],  # replacement text (quotes removed)
                # words this rule must not be applied to (quotes removed)
                [token[1:-1] for token in tokens[3].split(",")],
            ])

        return rules

    def stem(self, word):
        """
        Return the stem of ``word``.

        The word is lower-cased and then run through the reduction steps in
        this order: plural (only if it ends in "s"), feminine (only if it
        ends in "a"), augmentative, adverb and noun.  Verb reduction is
        tried only when noun reduction changed nothing, and vowel removal
        only when verb reduction changed nothing either.

        :param word: the word to stem; an empty string is returned as is.
        :return: the stemmed, lower-cased word.
        """
        word = word.lower()

        # ``endswith`` rather than ``word[-1]`` so that an empty word does
        # not raise IndexError.
        # the word ends in 's'? apply rule for plural reduction
        if word.endswith("s"):
            word = self.apply_rule(word, 0)

        # the word ends in 'a'? apply rule for feminine reduction
        if word.endswith("a"):
            word = self.apply_rule(word, 1)

        # augmentative reduction
        word = self.apply_rule(word, 3)

        # adverb reduction
        word = self.apply_rule(word, 2)

        # noun reduction
        before_noun = word
        word = self.apply_rule(word, 4)

        if word == before_noun:
            # verb reduction
            word = self.apply_rule(word, 5)

            if word == before_noun:
                # vowel removal
                word = self.apply_rule(word, 6)

        return word

    def apply_rule(self, word, rule_index):
        """
        Apply the first applicable rule of one rule table to ``word``.

        A rule applies when the word ends with the rule's suffix, the word
        is at least as long as the suffix plus the rule's minimum stem size,
        and the word is not listed among the rule's exceptions.  Rules are
        tried in table order and at most one is applied.

        :param word: the word to rewrite.
        :param rule_index: index of the rule table in ``self._model``.
        :return: the rewritten word, or ``word`` unchanged if no rule
            applies.
        """
        for suffix, min_stem_size, replacement, exceptions in self._model[rule_index]:
            suffix_length = len(suffix)
            if (word[-suffix_length:] == suffix
                    and len(word) >= suffix_length + min_stem_size
                    and word not in exceptions):
                return word[:-suffix_length] + replacement
        return word
if __name__ == "__main__":
    # When executed as a script, run this module's doctests.  Whitespace
    # differences are ignored when comparing the expected output.
    import doctest

    flags = doctest.NORMALIZE_WHITESPACE
    doctest.testmod(optionflags=flags)