#!/usr/bin/env python
# In-class CS224u bake-off, April 8, 2015
# Chris Potts
from distributedwordreps import *
import random
import pickle
import numpy as np
from collections import defaultdict
import sklearn.metrics
# Class labels:
SUBSET = 1.0 # Left word entails right, as in (hippo, mammal)
SUPERSET = -1.0 # Right word entails left, as in (mammal, hippo)
# In case you want to make use of GloVe vectors somehow ...
# It's worth checking on higher dimensionality versions too:
# http://nlp.stanford.edu/projects/glove/
#
# GLOVE_MAT, GLOVE_VOCAB, _ = build('distributedwordreps-data/glove.6B.50d.txt', delimiter=' ', header=False, quoting=csv.QUOTE_NONE)
def randvec(w, n=40, lower=-0.5, upper=0.5):
"""Returns a random vector of length n. w is ignored."""
return np.array([random.uniform(lower, upper) for i in range(n)])
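# A sketch of a GloVe-based alternative to randvec, assuming the
# build(...) call above has been uncommented so that GLOVE_MAT and
# GLOVE_VOCAB (a matrix and its row vocabulary) are defined
# (uncommenting it also requires `import csv`). The random fallback
# for out-of-vocabulary words is an illustrative choice, not part of
# the assignment:
def glove_vec(w, n=50):
    """Returns the GloVe vector for w, or a random vector of the
    same dimensionality if w is not in the GloVe vocabulary."""
    if w in GLOVE_VOCAB:
        return np.array(GLOVE_MAT[GLOVE_VOCAB.index(w)])
    return randvec(w, n=n)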
def vec_concatenate(u, v):
return np.concatenate((u, v))
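# One illustrative alternative to vec_concatenate (not prescribed by
# the assignment): concatenating u, v, and their elementwise
# difference, which hands the network an explicitly asymmetric signal
# for distinguishing SUBSET from SUPERSET pairs:
def vec_concatenate_with_diff(u, v):
    return np.concatenate((u, v, u - v))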
def data_prep(
src_filename='wordentail_data.pickle',
vector_func=None, # Should be a map from the strings in vocab to vectors (e.g., randvec).
vector_combo_func=None): # Use vec_concatenate or write something better!
# Load in the dataset:
    vocab, d = pickle.load(open(src_filename, 'rb'))
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Make vectors a mapping from words (as strings) to their vector
# representations, as determined by vector_func.
    vectors = {w: vector_func(w) for w in vocab}
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Here, we create a dataset in the format required by the neural
# network:
#
# {'train': [(vec, [cls]), (vec, [cls]), ...],
# 'test': [(vec, [cls]), (vec, [cls]), ...],
# 'disjoint_vocab_test': [(vec, [cls]), (vec, [cls]), ...]}
dataset = defaultdict(list)
for split, data in d.items():
for clsname, word_pairs in data.items():
for w1, w2 in word_pairs:
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Use vector_combo_func to combine the word vectors
# for w1 and w2, as given by the vectors dictionary
# above, and pair it with the singleton array containing
# clsname. item should be a pair consisting of a single
# vector and a list containing only clsname:
                item = (vector_combo_func(vectors[w1], vectors[w2]), [clsname])
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
dataset[split].append(item)
return dataset
def train_and_evaluate(dataset, hidden_dim=20, maxiter=1000):
train = dataset['train']
test = dataset['test']
disjoint_vocab_test = dataset['disjoint_vocab_test']
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    # Set up the neural network so that input_dim is the length of
    # your training inputs, hidden_dim is set by you (make it a
    # keyword argument to this function), and output_dim is 1:
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
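    # A minimal completion sketch: ShallowNeuralNetwork is assumed to
    # be the class provided by distributedwordreps (imported above),
    # and hidden_dim is the keyword argument added to this function
    # (its default is just an illustrative starting point):
    net = ShallowNeuralNetwork(
        input_dim=len(train[0][0]), hidden_dim=hidden_dim, output_dim=1)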
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    # Train the network, with the number of iterations set by you
# (make it a keyword argument to this function). You might want
# to use display_progress=True to track errors and speed.
# USE ONLY train FOR THE TRAINING!!!
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
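    # Training sketch under the same assumption about the network
    # class (its train method is assumed to take maxiter and
    # display_progress keyword arguments); maxiter is the keyword
    # argument added to this function, and, per the instructions
    # above, only train is used here:
    net.train(train, maxiter=maxiter, display_progress=True)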
# The following is evaluation code. You won't have to alter it
# unless you did something unanticipated like transform the output
# variables before training.
for typ, data in (('train', train), ('test', test), ('disjoint_vocab_test', disjoint_vocab_test)):
predictions = []
cats = []
for ex, cat in data:
# The raw prediction is a singleton list containing a float in (-1,1).
# We want only its contents:
prediction = net.predict(ex)[0]
# Categorize the prediction for accuracy comparison:
prediction = SUPERSET if prediction <= 0.0 else SUBSET
predictions.append(prediction)
# Store the gold label for the classification report:
cats.append(cat[0])
# Report:
print "======================================================================"
print typ
print sklearn.metrics.classification_report(cats, predictions, target_names=['SUPERSET', 'SUBSET'])
if __name__ == '__main__':
# This is a complete run. You'll probably want to make keyword
# arguments available here to tune good networks heuristically.
    dataset = data_prep(vector_func=randvec, vector_combo_func=vec_concatenate)
train_and_evaluate(dataset)