-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cpp
261 lines (225 loc) · 7.84 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#include <iostream>
#include <string>
#include <fstream>
#include <algorithm>
#include <unordered_set>
#include <unordered_map>
#include <stdio.h>
#include <sys/types.h>
#include <dirent.h>
#include <sstream>
#include <vector>
#include <cmath>
#include <float.h>
#include <limits.h>
#include <iterator>
using namespace std;
#define g(x) (1.0/(1.0+exp(-x)))
#define gprime(x) (g(x)*(1-g(x)))
// Print out progress bar
#define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
#define PBWIDTH 60
void printProgress(double percentage, string info) {
int val = (int) (percentage * 100);
int lpad = (int) (percentage * PBWIDTH);
int rpad = PBWIDTH - lpad;
printf("%s \r%3d%% [%.*s%*s] ", info.c_str(), val, lpad, PBSTR, rpad, "");
fflush(stdout);
}
// Process words to match words in the vocabulary file
string ProcessWord(string s)
{
string t;
// Remove punctuation.
for (int i = 0; i < s.size(); i++)
if (!ispunct(s[i]))
t += s[i];
// Convert to lower case.
std::transform(t.begin(), t.end(), t.begin(), ::tolower);
return t;
}
// The feature we use is the number of occurances of a word in the file
vector<double> GetFeature(string filename, unordered_map<string, int> &word_to_idx)
{
/*
Args:
- filename: path name to the data file
- word_to_idx: a mapping where a word maps to a preceptron weight's index.
*/
// Open file.
ifstream in;
in.open(filename.c_str());
if (!in.is_open())
{
cerr << "File not found: " << filename << endl;
exit(1);
}
// TODO: obtain feature
// ---- START ENTERING CODE ---- //
// ENTER CODE HERE //
string line;
getline(in, line);
stringstream ss(line);
string word;
vector<string> rawWords;
while(getline(ss, word, ' ')) {
rawWords.push_back(word);
}
for (unsigned int i = 0; i < rawWords.size(); i++) {
string tmp = ProcessWord(rawWords[i]);
rawWords[i] = tmp;
}
vector<double> feature(word_to_idx.size(), 0.0);
// cout << "size: " << rawWords.size() << endl;
for (auto word : rawWords) {
// cout << word << endl;
// if it is a part of the vocab dictionary
if (word_to_idx.find(word) != word_to_idx.end()) {
int ind = word_to_idx[word];
feature[ind] += 1.0;
// cout << "ind : " << ind << endl;
// cout << word << " : " << feature[ind] << endl;
}
}
// ---- STOP ENTERING CODE ---- //
return feature;
}
void save_trained_weights(vector<double> &weight)
{
ofstream output_file("trained_weights");
ostream_iterator<double> output_iterator(output_file, "\n");
copy(weight.begin(), weight.end(), output_iterator);
output_file.close();
}
int main()
{
// Open vocabulary file
ifstream dict_file;
dict_file.open("imdb.vocab");
if (!dict_file.is_open())
{
cerr << "Dictionary not found." << endl;
exit(1);
}
// TODO: Create word to weight index map dictionary based on the imdb.vocab
// ---- START ENTERING CODE ---- //
// ENTER CODE HERE //
string line;
unordered_map<string, int> word_to_idx;
int i = 0;
int N = 0;
while (getline(dict_file, line)) {
word_to_idx.insert(make_pair(line, i));
i++;
N++;
}
// cout << word_to_idx.size() << endl;
// ---- STOP ENTERING CODE ---- //
// cout << weightVec.size() << endl;
dict_file.close();
// initialize accuracy output file
ofstream output_acc_file;
output_acc_file.open("accuracy.txt");
// TODO: Initialize perceptron weights and training parameters.
// Hint: check parameters in the AND perceptron code example
// ---- START ENTERING CODE ---- //
// ENTER CODE HERE //
// random generate weights for each word
vector<double> weights;
for (int i = 0; i < N; i++) {
// double x = (double)rand() / RAND_MAX;
// weights.push_back(x * (1.0 - 0.0));
weights.push_back(0.01);
}
vector<double> classResult;
unordered_map<string, vector<double>> features;
// ---- STOP ENTERING CODE ---- //
float alpha = 1;
int total_epoch = 20;
// Train the weights
for (int epoch = 0; epoch < total_epoch; ++epoch)
{
// Read in train_list
string file_name;
string delimiter = "\t";
ifstream train_file;
train_file.open("training_list");
float line_count = 0.0;
cout << "Epoch " << epoch;
// TODO: Update perceptron weights based on each data instance
while (getline(train_file, file_name))
{
// Hints (not required to follow):
// (1) retrieve the data and class of the data
// (2) obtain feature
// (3) update weights
// ---- START ENTERING CODE ---- //
// ENTER CODE HERE //
// retrieving data and class
stringstream ss(file_name);
string file_dir;
double c;
ss >> file_dir >> c;
classResult.push_back(c);
// obtain feature
vector<double> curFeature = GetFeature(file_dir, word_to_idx);
// features.insert(make_pair(file_dir, curFeature));
// update weights
// cout << curFeature.size() << endl;
double weightedSum = 0.0;
for (unsigned int i = 0; i < curFeature.size(); i++) {
weightedSum += weights[i] * curFeature[i];
}
for (unsigned int i = 0; i < curFeature.size(); i++) {
weights[i] -= (double)alpha * (g(weightedSum) - c) * gprime(weightedSum) * curFeature[i];
}
// ---- STOP ENTERING CODE ---- //
// Output current progress on training with progress bar
line_count += 1;
printProgress(line_count/10000, "Epoch "+ to_string(epoch));
}
train_file.close();
cout << endl;
// Save trained weights
save_trained_weights(weights);
// Evaluate trained weights with test data
ifstream test_file;
test_file.open("test_list");
float correct_pred = 0.0; // should update during evaluation
line_count = 0.0;
// TODO: Predict negative or positive review with current trained preceptron weights
while (getline(test_file, file_name))
{
// Hint: similar to training, but without updating the weights. Should also compute prediction results.
// ---- START ENTERING CODE ---- //
// ENTER CODE HERE //
stringstream ss(file_name);
string file_dir;
double c;
ss >> file_dir >> c;
// vector<double> curFeature = features[file_dir];
vector<double> curFeature = GetFeature(file_dir, word_to_idx);
double weightedSum = 0.0;
for (unsigned int i = 0; i < curFeature.size(); i++) {
weightedSum += weights[i] * curFeature[i];
}
// ---- STOP ENTERING CODE ---- //
// TODO: Check if the prediction is correct. If so, increase correct_pred by one.
// ---- START ENTERING CODE ---- //
// ENTER CODE HERE //
if ((g(weightedSum) > 0.5 && c == 1) || (g(weightedSum) <= 0.5 && c == 0)) {
correct_pred += 1.0;
}
// ---- STOP ENTERING CODE ---- //
// Output current progress on evaluation with progress bar
line_count += 1.0;
printProgress(line_count/1000, "Evaluating...");
}
test_file.close();
// Compute prediction accuracy
float acc = (correct_pred/line_count)*100.0;
printf("Accuracy: %.2f \n", acc);
output_acc_file << "Epoch " << epoch << ": Accuracy = " << acc << "%" << endl;
}
output_acc_file.close();
}