-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathexp_replace.py
106 lines (94 loc) · 2.48 KB
/
exp_replace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
Adapted from Mathieu Cliche's GitHub: https://github.com/MathieuCliche/Sarcasm_detector
Creator of thesarcasmdetector.com, our base project.
"""
""" These functions are used to replace tweet slang by words which are more easy
to be recognized by the sentiment analysis and to reduce the number of dimensions of
the features dictionary. """
import nltk
import re
# Emoticon/interjection -> sentiment word table used for sentiment analysis.
# Every replacement is padded with spaces so it stays a separate token.
emo_repl = {
    # positive emoticons and interjections
    "<3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",

    # negative emoticons
    ":/": " bad ",
    ":>": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
}
# Emoticon -> descriptive word table for general use (i.e. the topic modeler).
# Every replacement is padded with spaces so it stays a separate token.
emo_repl2 = {
    # positive emoticons
    "<3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",

    # negative emoticons
    ":/": " worry ",
    ":>": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
}
# General-purpose regex substitutions: each key is a word-bounded pattern,
# each value the plain text that replaces a match (slang and contractions).
re_repl = {
    # chat shorthand
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",

    # negative contractions, expanded to two words
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
}
# Keys of emo_repl, longest first (ties broken in reverse lexicographic
# order), so that a longer emoticon is replaced before any shorter one it
# contains (e.g. ":dd" before ":d").
emo_repl_order = sorted(
    emo_repl,
    key=lambda emoticon: (len(emoticon), emoticon),
    reverse=True,
)
# Keys of emo_repl2, longest first (ties broken in reverse lexicographic
# order), so that a longer emoticon is replaced before any shorter one it
# contains (e.g. ":dd" before ":d").
emo_repl_order2 = sorted(
    emo_repl2,
    key=lambda emoticon: (len(emoticon), emoticon),
    reverse=True,
)
def replace_emo(sentence):
    """Return `sentence` with emoticons and slang normalized for sentiment analysis.

    Emoticons are substituted with the words from `emo_repl`, visiting the
    keys in `emo_repl_order`; afterwards every pattern in `re_repl` is
    applied with `re.sub`. The input string itself is not modified.
    """
    text = sentence

    # Plain substring replacement, in the precomputed longest-first key order.
    for emoticon in emo_repl_order:
        text = text.replace(emoticon, emo_repl[emoticon])

    # Regex pass for slang and contractions.
    for pattern in re_repl:
        text = re.sub(pattern, re_repl[pattern], text)

    return text
def replace_reg(sentence):
    """Return `sentence` with emoticons and slang normalized for general use.

    Emoticons are substituted with the words from `emo_repl2`, visiting the
    keys in `emo_repl_order2`; afterwards every pattern in `re_repl` is
    applied with `re.sub`. The input string itself is not modified.
    """
    sentence2 = sentence

    # Plain substring replacement, in the precomputed longest-first key order.
    for k in emo_repl_order2:
        sentence2 = sentence2.replace(k, emo_repl2[k])

    # `.items()` exists on both Python 2 and Python 3 dicts; the previous
    # `.iteritems()` call raises AttributeError on Python 3 and was
    # inconsistent with replace_emo.
    for r, repl in re_repl.items():
        sentence2 = re.sub(r, repl, sentence2)

    return sentence2