-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathaccuracy_seq.py
131 lines (95 loc) · 4.28 KB
/
accuracy_seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
This calculates accuracy sequences and writes them to STDERR
as tab-separated fields
"""
from experiments import get_accuracy, est_gp, est_majority_vote, est_merge_enough_votes, est_majority_vote_with_nn
from data import texts_vote_lists_truths_by_topic_id
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys
import random
def get_indexes_of_smallest_elements(l):
    """
    Return the positions of every occurrence of the minimum value in `l`.

    `l` may be any iterable of mutually comparable items; it is copied into a
    list first so that one-shot iterators are handled correctly. The returned
    indexes are in ascending order. An empty input yields an empty list
    instead of raising ValueError from min().

    >>> get_indexes_of_smallest_elements([1,2,3,1,1,1])
    [0, 3, 4, 5]
    >>> get_indexes_of_smallest_elements([0,2,3,-1,-1,100])
    [3, 4]
    >>> get_indexes_of_smallest_elements([0,0,0,0,0,0])
    [0, 1, 2, 3, 4, 5]
    >>> get_indexes_of_smallest_elements([])
    []
    """
    # Materialise once: min() would otherwise exhaust an iterator before
    # enumerate() gets to see any element.
    values = list(l)
    if not values:
        return []
    min_element = min(values)
    return [i for i, el in enumerate(values) if el == min_element]
def get_accuracy_sequences(estimator_dict, sequence_length, texts, vote_lists, truths, X, text_similarity):
    """
    Simulate one random sequence of vote draws and score each estimator on it.

    A sequence of `sequence_length` (document index, vote) draws is generated
    first. Each draw goes to a document chosen at random among those that
    currently have the fewest drawn votes, and the vote itself is picked at
    random from that document's entry in `vote_lists` (the lists are not
    modified, so the same vote may be drawn more than once).

    The same draw sequence is then replayed for every estimator. After each
    draw the estimator is called as
    `estimator(texts, known_votes, X, text_similarity, *args)` and its result
    is scored with `get_accuracy(estimates, truths)`.

    `estimator_dict` maps an estimator name to an `(estimator, args)` pair.

    Returns a dict mapping each estimator name to its list of accuracies (one
    per draw), or None as soon as `get_accuracy` raises OSError so that the
    caller can retry with a fresh sequence.
    """
    # No argument: seed from the current time or an OS randomness source.
    random.seed()

    # Phase 1: randomly sample votes for documents.
    document_vote_counts = [0] * len(vote_lists)
    document_idx_vote_seq = []
    for _ in range(sequence_length):
        # Pick a document randomly from the ones that have the fewest votes
        min_vote_doc_idxs = get_indexes_of_smallest_elements(document_vote_counts)
        updated_doc_idx = random.choice(min_vote_doc_idxs)
        document_vote_counts[updated_doc_idx] += 1

        # Randomly pick a vote for this document
        vote_idx = random.randrange(len(vote_lists[updated_doc_idx]))
        vote = vote_lists[updated_doc_idx][vote_idx]
        document_idx_vote_seq.append((updated_doc_idx, vote))

    # Phase 2: measure the accuracy of every estimator along that sequence.
    accuracy_sequences = {}
    for estimator_name, (estimator, args) in estimator_dict.items():
        accuracies = accuracy_sequences[estimator_name] = []

        # Every estimator starts from scratch and sees the same draws.
        known_votes = [[] for _ in vote_lists]

        for document_idx, vote in document_idx_vote_seq:
            known_votes[document_idx].append(vote)

            # Recalculate all the estimates for the sake of consistency
            estimates = estimator(texts, known_votes, X, text_similarity, *args)

            # Calculate the accuracy after this draw
            try:
                accuracy = get_accuracy(estimates, truths)
            except OSError:
                print('#OS ERROR')
                # Abandon the whole sequence; the caller decides what to do.
                return None

            accuracies.append(accuracy)

    return accuracy_sequences
def print_accuracy_sequences_to_stderr(estimator_dict, votes_per_doc, topic_id, n_sequesnces_per_estimator):
    """
    Generate accuracy sequences for one topic and write them to STDERR.

    The texts, vote lists and truths for `topic_id` are looked up in
    `texts_vote_lists_truths_by_topic_id`. The texts are turned into a TF-IDF
    matrix and a cosine-similarity matrix, both of which are handed to
    `get_accuracy_sequences`.

    `votes_per_doc` is a `(min_votes_per_doc, max_votes_per_doc)` pair. Each
    value is multiplied by the number of documents: the maximum gives the
    length of every sequence, the minimum gives the total vote count at which
    reporting starts.

    `n_sequesnces_per_estimator` is the number of successful runs to produce.
    A run is retried until `get_accuracy_sequences` returns something other
    than None; each attempt is announced on STDOUT as `#ATTEMPT<TAB>n`.

    For every reported point one line is written to STDERR with the
    tab-separated fields: the literal `AC`, total vote count, run id,
    estimator name, topic id, and the accuracy formatted with four decimals.
    """
    texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
    n_documents = len(texts)

    X = TfidfVectorizer().fit_transform(texts)
    text_similarity = cosine_similarity(X)

    min_votes_per_doc, max_votes_per_doc = votes_per_doc
    sequence_length = int(max_votes_per_doc * n_documents)

    # In an accuracy sequence, element 0 corresponds to the vote count of 1.
    # Clamp at 0: a minimum below one vote in total would otherwise give
    # index -1, which slices off everything except the last element.
    start_idx = max(int(min_votes_per_doc * n_documents) - 1, 0)
    first_vote_count = start_idx + 1

    # sys.maxint only exists on Python 2; fall back to sys.maxsize elsewhere.
    max_run_id = getattr(sys, 'maxint', sys.maxsize)

    for _ in range(n_sequesnces_per_estimator):
        # Get accuracies for all estimators; retry until a run succeeds.
        sequences = None
        attempt = 0
        while sequences is None:
            attempt += 1
            print('#ATTEMPT\t%s' % attempt)
            sequences = get_accuracy_sequences(
                estimator_dict, sequence_length, texts, vote_lists, truths, X, text_similarity)

        # Random identifier shared by all lines written for this run.
        run_id = random.randint(0, max_run_id)

        for estimator_name, accuracy_sequence in sequences.items():
            for offset, accuracy in enumerate(accuracy_sequence[start_idx:]):
                sys.stderr.write("AC\t%s\t%s\t%s\t%s\t%s\n" % (
                    first_vote_count + offset, run_id, estimator_name, topic_id, "%.4f" % accuracy))
if __name__ == "__main__":
    # The topic id is the one required command-line argument.
    if len(sys.argv) < 2:
        raise Exception("Please supply the topic id")
    topic_id = sys.argv[1]

    # Number of accuracy-sequence runs requested from the printing routine.
    N_SEQS_PER_EST = 15

    # Estimator name -> (estimator function, extra positional arguments).
    estimators = {
        'GP': (est_gp, []),
        'MV': (est_majority_vote, []),
        'MEV(1)': (est_merge_enough_votes, [1]),
        'MVNN(0.5)': (est_majority_vote_with_nn, [0.5]),
    }

    # (minimum, maximum) votes per document covered by the reported sequences.
    votes_per_doc_range = (1.0, 3.0)

    print_accuracy_sequences_to_stderr(
        estimators, votes_per_doc_range, topic_id, N_SEQS_PER_EST)