-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathambiguity_vectorizer.py
79 lines (65 loc) · 3.75 KB
/
ambiguity_vectorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from nltk import pos_tag_sents
from math import log
from utils.wordnet_utils import get_similar_words_from_wordnet, get_similarity_from_pathes, get_wn_tag_from_nltk_tag
import numpy as np
class AmbiguityVectorizer:
    """
    Turn tokenized sentences into per-sentence ambiguity features.

    The input is a list of list of strings, i.e. every sentence is already
    split into words. The sentences are POS-tagged on construction; the
    feature computation itself is delegated to
    ``create_sense_for_each_sentence``, which looks up WordNet entries for
    the tagged words (per the original author's notes: only for nouns, verbs,
    adjectives and adverbs) and summarizes how many senses the words have and
    how similar those senses are to each other.
    """

    def __init__(self, sentences):
        """POS-tag the given tokenized sentences and keep the tagged result."""
        tagged_sentences = get_post_tags_from_nltk(sentences)
        self.pos_tagged_documents = tagged_sentences

    def get_features_vector(self):
        """Return the feature rows computed from the stored POS-tagged sentences."""
        tagged_sentences = self.pos_tagged_documents
        return create_sense_for_each_sentence(tagged_sentences)
# Get the POS tags for every sentence; the expected input is a list of lists of str.
def get_post_tags_from_nltk(sentences):
    """
    POS-tag a batch of tokenized sentences with NLTK.

    :param sentences: list of list of strings (one inner list per sentence)
    :return: whatever ``nltk.pos_tag_sents`` produces for the input, i.e. the
             POS-tagged words of each sentence
    """
    tagged_sentences = pos_tag_sents(sentences)
    return tagged_sentences
def create_sense_for_each_sentence(pos_tagged_sentences):
    """
    Build one 3-value ambiguity feature row per POS-tagged sentence.

    :param pos_tagged_sentences: list of list of pairs, pair of (str, str) = (word, tag)
    :return: 2-D numpy array with one row per sentence and three columns, in
             this order: (log of the product of the per-word sense counts,
             maximum similarity, minimum similarity). The minimum is reported
             as 0 when no word of the sentence produced any senses. An empty
             input yields an empty array of shape (0, 3) instead of raising.
    """
    feature_vectors = []
    for pos_tagged_sentence in pos_tagged_sentences:
        senses_seen_in_sentence = set()
        sense_combination_product = 1
        max_similarity = 0
        # 100 acts as a "nothing recorded yet" marker; it is mapped back to 0
        # when the row is emitted below.
        min_similarity = 100
        for word, tag in pos_tagged_sentence:
            tag_for_wn = get_wn_tag_from_nltk_tag(tag)
            if not tag_for_wn:
                # The helper gave no usable WordNet tag for this NLTK tag.
                continue
            # Presumably a list of WordNet synsets for the word - confirm in
            # utils.wordnet_utils.
            word_senses = get_similar_words_from_wordnet(word, tag_for_wn)
            if not word_senses:
                continue
            sense_combination_product *= len(word_senses)
            # Compare against the senses of earlier words only: the set is
            # extended *after* the comparison so a word is never compared
            # with its own senses.
            word_max, word_min = get_farmost_and_closest_paths_for_pathes(
                word_senses, senses_seen_in_sentence)
            senses_seen_in_sentence.update(word_senses)
            min_similarity = min(min_similarity, word_min)
            max_similarity = max(max_similarity, word_max)
        feature_vectors.append((
            log(sense_combination_product),
            max_similarity,
            0 if min_similarity == 100 else min_similarity,
        ))
    if not feature_vectors:
        # np.vstack raises ValueError on an empty sequence; return an empty
        # feature matrix with the regular column count instead.
        return np.empty((0, 3))
    return np.vstack(feature_vectors)
def get_farmost_and_closest_paths_for_pathes(list_of_similar_words_from_wordnet, pathes_for_words_in_sentence):
    """
    Find the highest and lowest similarity between a word's WordNet entries
    and the entries already collected for the sentence.

    :param list_of_similar_words_from_wordnet: WordNet entries of the current word
    :param pathes_for_words_in_sentence: WordNet entries gathered from the earlier words of the sentence
    :return: (max similarity score, min similarity score); stays at (0, 1)
             when no pair yields a truthy similarity
    """
    highest_similarity = 0
    lowest_similarity = 1
    for seen_entry in pathes_for_words_in_sentence:
        for candidate_entry in list_of_similar_words_from_wordnet:
            if seen_entry != candidate_entry:
                score = get_similarity_from_pathes(candidate_entry, seen_entry)
                # Falsy scores (None or 0) leave both bounds untouched.
                if score:
                    highest_similarity = max(highest_similarity, score)
                    lowest_similarity = min(lowest_similarity, score)
    return highest_similarity, lowest_similarity