"""
Foundations of Natural Language Processing
Assignment 1
Please complete functions, based on their doc_string description
and instructions of the assignment.
To test your code run:
```
[hostname]s1234567 python3 s1234567.py
```
Before submission executed your code with ``--answers`` flag
```
[hostname]s1234567 python3 s1234567.py --answers
```
include generated answers.py file.
Best of Luck!
"""
from collections import defaultdict, Counter
from operator import itemgetter
from string import punctuation

import numpy as np  # for np.mean() and np.std()
import nltk, sys, inspect
import nltk.corpus.util
from nltk import MaxentClassifier
from nltk.corpus import brown, ppattach  # import corpora

# Import the Twitter corpus and LgramModel
from nltk_model import *  # See the README inside the nltk_model folder for more information
from twitter.twitter import *

twitter_file_ids = "20100128.txt"
assert twitter_file_ids in xtwc.fileids()
# Some helper functions
def ppEandT(eAndTs):
'''
Pretty print a list of entropy-tweet pairs
:type eAndTs: list(tuple(float,list(str)))
:param eAndTs: entropies and tweets
:return: None
'''
for entropy, tweet in eAndTs:
print("{:.3f} [{}]".format(entropy, ", ".join(tweet)))
def compute_accuracy(classifier, data):
"""
Computes accuracy (range 0 - 1) of a classifier.
:type classifier: NltkClassifierWrapper or NaiveBayes
:param classifier: the classifier whose accuracy we compute.
:type data: list(tuple(list(any), str))
:param data: A list with tuples of the form (list with features, label)
    :rtype: float
    :return: accuracy (range 0 - 1).
"""
correct = 0
for d, gold in data:
predicted = classifier.classify(d)
correct += predicted == gold
return correct/len(data)
def apply_extractor(extractor_f, data):
"""
Helper function:
Apply a feature extraction method to a labeled dataset.
:type extractor_f: (str, str, str, str) -> list(any)
:param extractor_f: the feature extractor, that takes as input V, N1, P, N2 (all strings) and returns a list of features
:type data: list(tuple(str))
:param data: a list with tuples of the form (id, V, N1, P, N2, label)
    :rtype: list(tuple(list(any), str))
    :return: a list with tuples of the form (list with features, label)
"""
r = []
for d in data:
r.append((extractor_f(*d[1:-1]), d[-1]))
return r
class NltkClassifierWrapper:
"""
This is a little wrapper around the nltk classifiers so that we can interact with them
in the same way as the Naive Bayes classifier.
"""
def __init__(self, classifier_class, train_features, **kwargs):
"""
:type classifier_class: a class object of nltk.classify.api.ClassifierI
:param classifier_class: the kind of classifier we want to create an instance of.
:type train_features: list(tuple(list(any), str))
:param train_features: A list with tuples of the form (list with features, label)
:param kwargs: additional keyword arguments for the classifier, e.g. number of training iterations.
:return None
"""
self.classifier_obj = classifier_class.train(
[(NltkClassifierWrapper.list_to_freq_dict(d), c) for d, c in train_features], **kwargs)
@staticmethod
def list_to_freq_dict(d):
"""
        :type d: list(any)
:param d: list of features
:rtype dict(any, int)
:return: dictionary with feature counts.
"""
return Counter(d)
def classify(self, d):
"""
        :type d: list(any)
:param d: list of features
:rtype str
:return: most likely class
"""
return self.classifier_obj.classify(NltkClassifierWrapper.list_to_freq_dict(d))
def show_most_informative_features(self, n = 10):
self.classifier_obj.show_most_informative_features(n)
# End helper functions
# ==============================================
# Section I: Language Identification [60 marks]
# ==============================================
# Question 1 [7 marks]
def train_LM(corpus):
'''
Build a bigram letter language model using LgramModel
    based on the all-alpha subset of the entire corpus
:type corpus: nltk.corpus.CorpusReader
:param corpus: An NLTK corpus
:rtype: LgramModel
:return: A padded letter bigram model based on nltk.model.NgramModel
'''
# subset the corpus to only include all-alpha tokens,
# converted to lower-case (_after_ the all-alpha check)
corpus_tokens = [w.lower() for w in corpus.words(corpus.fileids()) if w.isalpha()]
# Return a smoothed (using the default estimator) padded bigram
# letter language model
return LgramModel(2, corpus_tokens, pad_left=True, pad_right=True)
# Question 2 [7 marks]
def tweet_ent(file_name, bigram_model):
'''
Using a character bigram model, compute sentence entropies
for a subset of the tweet corpus, removing all non-alpha tokens and
    tweets with fewer than 5 all-alpha tokens
:type file_name: str
:param file_name: twitter file to process
:rtype: list(tuple(float,list(str)))
:return: ordered list of average entropies and tweets'''
    # Clean up the tweet corpus: keep only all-alpha tokens (lower-cased)
    # and drop tweets with fewer than 5 remaining tokens
list_of_tweets = xtwc.sents(file_name)
alpha_tweets = [[token.lower() for token in tweet if token.isalpha()] for tweet in list_of_tweets]
cleaned_list_of_tweets = [alpha_tweet for alpha_tweet in alpha_tweets if len(alpha_tweet) >= 5]
    # Construct a list of tuples of the form: (entropy,tweet)
    # for each tweet in the cleaned corpus, where entropy is the
    # average per-word entropy of the tweet, and return the list of
    # (entropy,tweet) tuples sorted by entropy
ents = {idx: np.mean([bigram_model.entropy(word, pad_left=True, pad_right=True, perItem=True) for word in tweet]) for idx, tweet in enumerate(cleaned_list_of_tweets)}
sorted_ents = sorted(ents.items(), key=lambda item: item[1])
list_of_tuples = [(item[1], cleaned_list_of_tweets[item[0]]) for item in sorted_ents]
return list_of_tuples
# Question 3 [8 marks]
def open_question_3():
'''
Question: What differentiates the beginning and end of the list
of tweets and their entropies?
:rtype: str
:return: your answer [500 chars max]
'''
return inspect.cleandoc("""
        The entropy values represent the average per-character uncertainty
        the model assigns to the words of a given tweet.
        The lowest-entropy tweets consist entirely of common English words,
        most frequently conjunctions ("and"), articles ("the") and nouns
        ("weather", "love").
        The highest-entropy tweets consist mainly of non-ASCII logograms from
        other languages. This is to be expected, since such characters are
        very unlikely to appear in the English text the model was trained on.""")[0:500]
# Question 4 [8 marks]
def open_question_4() -> str:
'''
Problem: noise in Twitter data
:rtype: str
:return: your answer [500 chars max]
'''
return inspect.cleandoc("""
        We should remove all non-English tweets from the corpus, since their
        characters/words are not relevant for developing an English language model.
        We can identify likely non-English tweets by checking whether they contain
        non-ASCII characters, because English tweets are typically written
        entirely in ASCII.""")[0:500]
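
# Illustrative sketch only: one way to apply the ASCII heuristic described in
# open_question_4 above. The helper name `is_ascii_tweet` and the 128 code-point
# cutoff are assumptions for illustration, not part of the marked answer.
def is_ascii_tweet(tweet):
    """Return True if every character of every token in the tweet is ASCII."""
    return all(ord(ch) < 128 for token in tweet for ch in token)
# Example: keep only tweets this heuristic considers potentially English:
#   english_candidates = [t for t in tweets if is_ascii_tweet(t)]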
# Question 5 [15 marks]
def tweet_filter(list_of_tweets_and_entropies):
    '''
    Compute the mean and standard deviation of the entropies and, using them,
    identify likely non-English tweets in the all-ASCII subset of the list
    of tweets and their letter bigram entropies

    :type list_of_tweets_and_entropies: list(tuple(float,list(str)))
    :param list_of_tweets_and_entropies: tweets and their
        English (Brown) average letter bigram entropy
    :rtype: tuple(float, float, list(tuple(float,list(str))), list(tuple(float,list(str))))
    :return: mean, standard deviation, ASCII tweets and entropies,
        non-English tweets and entropies
    '''
# Find the "ascii" tweets - those in the lowest-entropy 90%
# of list_of_tweets_and_entropies
idx = int(0.9*len(list_of_tweets_and_entropies))
list_of_ascii_tweets_and_entropies = list_of_tweets_and_entropies[0:idx]
# Extract a list of just the entropy values
list_of_entropies = [tweet[0] for tweet in list_of_ascii_tweets_and_entropies]
# Compute the mean of entropy values for "ascii" tweets
mean = np.mean(list_of_entropies)
# Compute their standard deviation
standard_deviation = np.std(list_of_entropies)
    # Get a list of "probably not English" tweets, that is
    # "ascii" tweets with an entropy greater than (mean + std_dev)
threshold = mean + standard_deviation
list_of_not_English_tweets_and_entropies = [tweet for tweet in list_of_ascii_tweets_and_entropies if tweet[0] > threshold]
# Return mean, standard_deviation,
# list_of_ascii_tweets_and_entropies,
# list_of_not_English_tweets_and_entropies
return mean, standard_deviation, list_of_ascii_tweets_and_entropies, list_of_not_English_tweets_and_entropies
# Question 6 [15 marks]
def open_question_6():
"""
Suppose you are asked to find out what the average per word entropy of English is.
- Name 3 problems with this question, and make a simplifying assumption for each of them.
- What kind of experiment would you perform to estimate the entropy after you have these simplifying assumptions?
Justify the main design decisions you make in your experiment.
:rtype: str
:return: your answer [1000 chars max]
"""
return inspect.cleandoc("""
        This question is rather vague because:
        1. It does not specify the era of English (e.g. 1500s-2000 vs. the 21st century).
        2. It does not specify the dialect(s) of English (e.g. British vs. American English).
        3. It does not specify the data source. Corpora always have a genre reflecting where
        the data was collected, and this has a large effect on the kind of English used
        (e.g. news articles vs. Twitter data).
        I will therefore assume 21st-century British English from a balanced Web corpus.
        Experiment:
        1. Obtain a 21st-century British English corpus with a balanced Web genre.
        2. Tokenise the corpus.
        3. Compute word frequencies for all the words in the corpus.
        4. Compute word priors by dividing each word frequency by the sum of all frequencies.
        These priors should be smoothed to avoid zero probabilities.
        5. Calculate the surprisal (-log2 prior) of each word.
        6. Take the prior-weighted mean of these surprisals to obtain the average per-word entropy.
        """)[:1000]
#############################################
# SECTION II - RESOLVING PP ATTACHMENT AMBIGUITY
#############################################
# Question 7 [15 marks]
class NaiveBayes:
"""
Naive Bayes model with Lidstone smoothing (parameter alpha).
"""
def __init__(self, data, alpha):
"""
:type data: list(tuple(list(any), str))
:param data: A list with tuples of the form (list with features, label)
:type alpha: float
        :param alpha: alpha value for Lidstone smoothing
"""
self.vocab = self.get_vocab(data)
self.alpha = alpha
self.prior, self.likelihood = self.train(data, alpha, self.vocab)
@staticmethod
def get_vocab(data):
"""
Compute the set of all possible features from the (training) data.
:type data: list(tuple(list(any), str))
:param data: A list with tuples of the form (list with features, label)
:rtype: set(any)
:return: The set of all features used in the training data for all classes.
"""
return {ftr for el in data for ftr in el[0]}
@staticmethod
def train(data, alpha, vocab):
"""
Estimates the prior and likelihood from the data with Lidstone smoothing.
:type data: list(tuple(list(any), str))
:param data: A list of tuples ([f1, f2, ... ], c) with the first element
being a list of features and the second element being its class.
:type alpha: float
        :param alpha: alpha value for Lidstone smoothing
:type vocab: set(any)
:param vocab: The set of all features used in the training data for all classes.
:rtype: tuple(dict(str, float), dict(str, dict(any, float)))
:return: Two dictionaries: the prior and the likelihood (in that order).
We expect the returned values to relate as follows to the probabilities:
prior[c] = P(c)
likelihood[c][f] = P(f|c)
"""
assert alpha >= 0.0
likelihood, prior = {}, {}
dclasses, dftrs = [], []
# Compute raw frequency distributions
        cfdist = {}
        for el in data:
            dclasses.append(el[1])
            if el[1] not in cfdist:
                cfdist[el[1]] = {}
            for ftr in el[0]:
                dftrs.append(ftr)
                cfdist[el[1]][ftr] = cfdist[el[1]].get(ftr, 0) + 1
classes = set(dclasses)
ftrs = set(dftrs)
class_counts = {c: dclasses.count(c) for c in classes}
ftr_counts = {f: dftrs.count(f) for f in ftrs}
# Compute prior (MLE). Compute likelihood with smoothing.
num_ftrs = np.sum(list(ftr_counts.values()))
for c in classes:
prior[c] = class_counts[c]/len(data)
likelihood[c] = {}
# Calculate the sum of class prior probabilities
tot_cprior_prob = np.sum(list(prior.values()))
for c in classes:
# Divide each prior probability by the sum of prior probabilities over all classes.
# This is done to ensure that:
# SUM_from(i=1)_to(k) P(c_i) = 1
# Which helps negate the effect of floating point errors
prior[c] = prior[c]/tot_cprior_prob
for v in vocab:
if not v in cfdist[c].keys():
cfdist[c][v] = 0
prob_cv = cfdist[c][v]/ftr_counts[v]
likelihood[c][v] = (prob_cv + alpha)/(prior[c] + alpha*len(vocab))
assert likelihood[c][v] >= 0
# Calculate the sum of feature likelihood probabilities
tot_lh_prob = np.sum(list(likelihood[c].values()))
            # Divide each likelihood by the sum of the likelihoods over all features for this class.
            # This is done to ensure that:
            #   SUM_from(i=1)_to(n) P(f_i|c) = 1
            # which helps negate the effect of floating point errors
for v in vocab:
likelihood[c][v] = likelihood[c][v]/tot_lh_prob
assert abs(np.sum(list(likelihood[c].values())) - 1) <= 1e-12
assert prior[c] >= 0
assert abs(np.sum(list(prior.values())) - 1) <= 1e-12
return prior, likelihood
def prob_classify(self, d):
"""
Compute the probability P(c|d) for all classes.
:type d: list(any)
:param d: A list of features.
:rtype: dict(str, float)
:return: The probability p(c|d) for all classes as a dictionary.
"""
classes = set(self.likelihood.keys())
c_probs = {}
# Calculate the sum of feature likelihood probabilities for every feature over all classes
cftr_lh_count = {}
for ftr in d:
if ftr in self.vocab:
cftr_lh_count[ftr] = 0
for c in classes:
cftr_lh_count[ftr] += self.likelihood[c][ftr]
        # Divide each likelihood by the sum of that feature's likelihoods over all classes.
        # This rescaling cancels out when the posteriors are normalised below, but it keeps
        # the products away from floating point underflow.
for c in classes:
ftr_likelihoods = [self.likelihood[c][ftr]/cftr_lh_count[ftr] for ftr in d if ftr in self.vocab]
c_probs[c] = np.prod(ftr_likelihoods)
assert c_probs[c] >= 0
# Calculate the sum of class posterior probabilities
tot_prob = np.sum(list(c_probs.values()))
# Divide each posterior probability by the sum of posterior probabilities.
# This is done to ensure that:
# SUM_from(i=1)_to(m) P(c_i|d) = 1
# Which helps negate the effect of floating point errors
for c in classes:
c_probs[c] = c_probs[c]/tot_prob
assert abs(np.sum(list(c_probs.values())) - 1) <= 1e-12
return c_probs
def classify(self, d):
"""
Compute the most likely class of the given "document" with ties broken arbitrarily.
:type d: list(any)
:param d: A list of features.
:rtype: str
:return: The most likely class.
"""
probs = self.prob_classify(d)
return max(probs, key=probs.get)
# Question 8 [10 marks]
def open_question_8() -> str:
"""
How do you interpret the differences in accuracy between the different ways to extract features?
:rtype: str
:return: Your answer of 500 characters maximum.
"""
return inspect.cleandoc("""
        The best accuracy was achieved using a sequence of words with labels rather than a single
        unlabelled word, indicating the model performs best when given a sequence of labelled features.
        My NB model beat the LR results for every extractor in table 1 except the last one. This is
        likely due to the multi-feature nature of that extractor and the fact that, unlike NB, LR
        does not assume the features are conditionally independent given the class.
        """)[:500]
# Feature extractors used in the table:
# see your_feature_extractor for documentation on arguments and types.
def feature_extractor_1(v, n1, p, n2):
return [v]
def feature_extractor_2(v, n1, p, n2):
return [n1]
def feature_extractor_3(v, n1, p, n2):
return [p]
def feature_extractor_4(v, n1, p, n2):
return [n2]
def feature_extractor_5(v, n1, p, n2):
return [("v", v), ("n1", n1), ("p", p), ("n2", n2)]
# Question 9.1 [5 marks]
def your_feature_extractor(v, n1, p, n2):
"""vsumvsum
Takes the head words and produces a list of features. The features may
be of any type as long as they are hashable.
:type v: str
:param v: The verb.
:type n1: str
:param n1: Head of the object NP (Noun Phrase).
:type p: str
:param p: The preposition.
:type n2: str
:param n2: Head of the NP embedded in the PP (Prepositional Phrase).
:rtype: list(any)
:return: A list of features produced by you.
"""
data = [v, n1, p, n2]
features = []
for i in range(4):
# Singleton feature
features.append(data[i])
for j in range(4):
if i != j:
# Tuple of features
features.append((data[i],data[j]))
ptags = [ptag[1] for ptag in nltk.pos_tag(data)]
features = features + ptags
    # Verb features: three boolean flags marking the suffixes -ing / -ed / -s
    # (in that order), followed by a crude stem of the verb.
    if "ing" == v[-3:]:
        features.append(True)
        features.append(False)
        features.append(False)
        if len(v) > 3 and v[-4] != "y":
            features.append(v[:-3] + "e")
        else:
            features.append(v[:-3])
    elif "ed" == v[-2:]:
        features.append(False)
        features.append(True)
        features.append(False)
        if len(v) > 4 and v[-4] in "aeiou":
            features.append(v[:-3])
        else:
            features.append(v[:-2] + "e")
    elif "s" == v[-1] and len(v) > 2:
        features.append(False)
        features.append(False)
        features.append(True)
        features.append(v[:-1])
    else:
        features.append(False)
        features.append(False)
        features.append(False)
        features.append(v)
#Noun features
features.append(n1[-1] == "s")
features.append(n2[-1] == "s")
features.append("?" in n2)
features.append(n1 == "%")
features.append(n1 == "million")
#Converting to dictionary resulted in improved accuracy
dic = {}
for i, ftr in enumerate(features):
dic[i] = ftr
return dic
# Question 9.2 [10 marks]
def open_question_9():
"""
Briefly describe your feature templates and your reasoning for them.
Pick 3 examples of informative features and discuss why they make sense or why they do not make sense
and why you think the model relies on them.
:rtype: str
:return: Your answer of 1000 characters maximum.
"""
return inspect.cleandoc("""
        I first included the four raw head words as individual features, so that the model can fit
        classes to specific values of each feature (like a unigram). This proved particularly
        useful for prepositions such as "of".
        Next, I wanted features representing combinations of the head words, so I added every
        unique ordered pair of them as a tuple feature (i.e. (f1, f2)). This helped the model
        identify common feature combinations.
        Lastly, I manually created features for common suffixes/values. For the verb, I separated
        the suffix (e.g. "ing", "ed") from the rest of the word to obtain a tense indicator and a
        verb stem.
        For the nouns, I checked whether they were plural (ended in "s"), formed a question
        (contained "?"), or equalled common values (e.g. "million" or "%").
        I did not need to do this for the prepositions because the tuple feature combinations
        already cover them.
""")[:1000]
"""
Format the output of your submission for both development and automarking.
!!!!! DO NOT MODIFY THIS PART !!!!!
"""
def answers():
# Global variables for answers that will be used by automarker
global ents, lm
global best10_ents, worst10_ents, mean, std, best10_ascci_ents, worst10_ascci_ents
global best10_non_eng_ents, worst10_non_eng_ents
global answer_open_question_4, answer_open_question_3, answer_open_question_6,\
answer_open_question_8, answer_open_question_9
global ascci_ents, non_eng_ents
global naive_bayes
global acc_extractor_1, naive_bayes_acc, lr_acc, logistic_regression_model, dev_features
print("*** Part I***\n")
print("*** Question 1 ***")
print('Building brown bigram letter model ... ')
lm = train_LM(brown)
print('Letter model built')
print("*** Question 2 ***")
ents = tweet_ent(twitter_file_ids, lm)
print("Best 10 english entropies:")
best10_ents = ents[:10]
ppEandT(best10_ents)
print("Worst 10 english entropies:")
worst10_ents = ents[-10:]
ppEandT(worst10_ents)
print("*** Question 3 ***")
answer_open_question_3 = open_question_3()
print(answer_open_question_3)
print("*** Question 4 ***")
answer_open_question_4 = open_question_4()
print(answer_open_question_4)
print("*** Question 5 ***")
mean, std, ascci_ents, non_eng_ents = tweet_filter(ents)
print('Mean: {}'.format(mean))
print('Standard Deviation: {}'.format(std))
print('ASCII tweets ')
print("Best 10 English entropies:")
best10_ascci_ents = ascci_ents[:10]
ppEandT(best10_ascci_ents)
print("Worst 10 English entropies:")
worst10_ascci_ents = ascci_ents[-10:]
ppEandT(worst10_ascci_ents)
print('--------')
print('Tweets considered non-English')
print("Best 10 English entropies:")
best10_non_eng_ents = non_eng_ents[:10]
ppEandT(best10_non_eng_ents)
print("Worst 10 English entropies:")
worst10_non_eng_ents = non_eng_ents[-10:]
ppEandT(worst10_non_eng_ents)
print("*** Question 6 ***")
answer_open_question_6 = open_question_6()
print(answer_open_question_6)
print("*** Part II***\n")
print("*** Question 7 ***")
naive_bayes = NaiveBayes(apply_extractor(feature_extractor_5, ppattach.tuples("training")), 0.1)
naive_bayes_acc = compute_accuracy(naive_bayes, apply_extractor(feature_extractor_5, ppattach.tuples("devset")))
print(f"Accuracy on the devset: {naive_bayes_acc * 100}%")
print("*** Question 8 ***")
answer_open_question_8 = open_question_8()
print(answer_open_question_8)
# This is the code that generated the results in the table of the CW:
    # A small number of iterations suffices for logistic regression with the simple feature extractors.
    #
    # extractors = [feature_extractor_1, feature_extractor_2, feature_extractor_3, feature_extractor_4, feature_extractor_5]
    #
    # print("Extractor | Accuracy")
    # print("------------------------")
    #
    # for i, ex_f in enumerate(extractors, start=1):
    #     training_features = apply_extractor(ex_f, ppattach.tuples("training"))
    #     dev_features = apply_extractor(ex_f, ppattach.tuples("devset"))
    #
    #     a_logistic_regression_model = NltkClassifierWrapper(MaxentClassifier, training_features, max_iter=6, trace=0)
    #     lr_acc = compute_accuracy(a_logistic_regression_model, dev_features)
    #     print(f"Extractor {i} | {lr_acc*100}")
print("*** Question 9 ***")
training_features = apply_extractor(your_feature_extractor, ppattach.tuples("training"))
dev_features = apply_extractor(your_feature_extractor, ppattach.tuples("devset"))
logistic_regression_model = NltkClassifierWrapper(MaxentClassifier, training_features, max_iter=10)
lr_acc = compute_accuracy(logistic_regression_model, dev_features)
print("30 features with highest absolute weights")
logistic_regression_model.show_most_informative_features(30)
print(f"Accuracy on the devset: {lr_acc*100}")
answer_open_question_9 = open_question_9()
print("Answer to open question:")
print(answer_open_question_9)
if __name__ == "__main__":
if len(sys.argv) > 1 and sys.argv[1] == '--answers':
from autodrive_embed import run, carefulBind
import adrive1
with open("userErrs.txt", "w") as errlog:
run(globals(), answers, adrive1.extract_answers, errlog)
else:
answers()