-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathNaiveBayesModel.py
124 lines (106 loc) · 3.47 KB
/
NaiveBayesModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split
from EngineFiles import TweetClean as tc
from EngineFiles import TweetFormat as tf
def countTweets(tweets, y):
    """
    Count how often each word occurs under each sentiment label.

    Input:
        tweets: a list of tweets (each one is split on whitespace)
        y: a list with the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each (word, label) pair to its frequency
    """
    counts = {}
    for label, text in zip(y, tweets):
        for token in text.split():
            key = (token, label)
            # Unseen pairs start from zero before being incremented.
            counts[key] = counts.get(key, 0) + 1
    return counts
def splitSet(dataframe=pd.DataFrame(), test_size=0.2, rand_state=0):
    """
    Binarise the 'label' column and split the data into train/test sets,
    stratified on the binarised label.

    Input:
        dataframe: a DataFrame with a 'Tweet' column and a 'label' column;
            it is copied first, so the caller's frame is not modified
        test_size: forwarded to train_test_split as test_size
        rand_state: forwarded to train_test_split as random_state
    Output:
        x_tr, x_ts: the 'Tweet' values of the train and test partitions
        y_tr, y_ts: the matching labels (1 for 'positive', 0 otherwise)
    """
    def encode(value):
        # Every label other than the exact string 'positive' becomes 0.
        if value == 'positive':
            return 1
        return 0

    frame = dataframe.copy()
    frame['label'] = frame['label'].apply(encode)
    tweets = frame['Tweet'].values
    labels = frame['label'].values
    x_tr, x_ts, y_tr, y_ts = train_test_split(
        tweets,
        labels,
        test_size=test_size,
        random_state=rand_state,
        stratify=labels,
    )
    return x_tr, x_ts, y_tr, y_ts
def lookup(freqs, word, label):
    """
    Fetch the frequency recorded for a (word, label) pair.

    Input:
        freqs: a dictionary with the frequency of each (word, label) pair
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word appears with that label,
           or 0 when the pair is not in the dictionary
    """
    return freqs.get((word, label), 0)
def naiveBayesTrain(freqs, x, y):
    """
    Fit a binary Naive Bayes model with add-one smoothing.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        x: a list of tweets (not read by this function)
        y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: log(number of positive tweets) - log(number of negative tweets)
        loglikelihood: dictionary from each word in the vocabulary to
            log(P(word | positive) / P(word | negative))
    """
    vocabulary = {key[0] for key in freqs}
    vocab_size = len(vocabulary)

    # Total word occurrences per class; any label greater than 0 is counted
    # as positive, everything else as negative.
    total_pos = 0
    total_neg = 0
    for key, count in freqs.items():
        if key[1] > 0:
            total_pos += count
        else:
            total_neg += count

    # Document-level counts: summing the labels gives the number of positives.
    num_docs = len(y)
    num_pos_docs = sum(y)
    num_neg_docs = num_docs - num_pos_docs
    logprior = numpy.log(num_pos_docs) - numpy.log(num_neg_docs)

    # The smoothing denominators do not depend on the word, so compute once.
    denom_pos = total_pos + vocab_size
    denom_neg = total_neg + vocab_size

    loglikelihood = {}
    for word in vocabulary:
        smoothed_pos = (lookup(freqs, word, 1) + 1) / denom_pos
        smoothed_neg = (lookup(freqs, word, 0) + 1) / denom_neg
        loglikelihood[word] = numpy.log(smoothed_pos / smoothed_neg)

    return logprior, loglikelihood
def NB_predictTweet(tweet, logprior, loglikelihood):
    """
    Score a tweet with a trained Naive Bayes model.

    Input:
        tweet: a string (split on whitespace into words)
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the loglikelihood of every word of the tweet that
           is present in the dictionary (repeated words count each time)
    """
    score = logprior
    for token in tweet.split():
        if token not in loglikelihood:
            # Words missing from the dictionary contribute nothing.
            continue
        score += loglikelihood[token]
    return score
def naiveBayesAccuracy(x, y, logprior, loglikelihood):
    """
    Measure how accurately the Naive Bayes model classifies a set of tweets.

    Input:
        x: A list of tweets
        y: the corresponding labels (0 or 1) for the list of tweets; a plain
           list works as well as a NumPy array
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets);
                  nan when x and y are both empty
        y_preds: the list of predicted labels (1 when the tweet's score is
                 greater than 0, otherwise 0), in the order of x
    """
    y_preds = [
        1 if NB_predictTweet(t, logprior, loglikelihood) > 0 else 0
        for t in x
    ]

    # Convert both sides to arrays before subtracting: y_preds is a Python
    # list, and list - list raises TypeError when y is passed as a list too.
    predicted = numpy.asarray(y_preds)
    expected = numpy.asarray(y)

    # With 0/1 labels the mean absolute difference is the misclassified share.
    error = numpy.mean(numpy.absolute(predicted - expected))
    accuracy = 1 - error
    return accuracy, y_preds