-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfast_text.py
79 lines (65 loc) · 2.4 KB
/
fast_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from os.path import isfile
from os import listdir
from typing import Dict, List, Tuple
import fasttext
import numpy as np
from numpy import linalg as LA
from topic_queries import subject_queries
# Path of the cached fastText model: the module loads it when the file exists
# and otherwise trains a new model and saves it under this name.
MODEL_FILENAME = 'fasttext_model.bin'
# Expected dimensionality of the fastText word vectors.
# NOTE(review): 100 is fastText's default `dim`; confirm it matches the model
# actually trained/loaded.
VECTOR_SIZE = 100
# Directory containing the raw biography text files. File paths are built by
# plain string concatenation, so the trailing slash is required.
DATA_DIR = './raw_biographies/'
def merge_biographies():
    """Concatenate every file in ``DATA_DIR`` into ``./merged.txt``.

    The merged file is the corpus that ``create_model`` hands to fastText.
    Files are read and written as UTF-8 (the encoding fastText expects)
    instead of the platform default, and each file is guaranteed to end with
    a newline in the output so that the last word of one biography is never
    fused with the first word of the next.
    """
    filenames = listdir(DATA_DIR)
    with open('./merged.txt', 'w', encoding='utf-8') as outfile:
        for fname in filenames:
            last_line = ''
            with open(f'{DATA_DIR}{fname}', encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line)
                    last_line = line
            # Without this separator a file lacking a trailing newline would
            # glue its final token onto the next file's first token.
            if last_line and not last_line.endswith('\n'):
                outfile.write('\n')
def create_model():
    """Build the training corpus and train an unsupervised skip-gram model.

    Calls ``merge_biographies`` to (re)write ``./merged.txt`` and trains
    fastText on it.

    Returns:
        The trained fastText model.
    """
    merge_biographies()
    # Pin the embedding size to VECTOR_SIZE explicitly so the constant and the
    # trained model cannot drift apart if fastText's default ever differs.
    model = fasttext.train_unsupervised(
        './merged.txt',
        model='skipgram',
        dim=VECTOR_SIZE,
    )
    return model
def get_biography_vector(filname: str, model: fasttext.FastText._FastText) -> np.ndarray:
    """Return the mean fastText word vector of one biography file.

    Args:
        filname: Name of the file inside ``DATA_DIR``. The parameter spelling
            is kept as-is because callers pass it by keyword.
        model: fastText model used to embed every whitespace-separated token.

    Returns:
        A float64 array of length ``model.get_dimension()`` holding the
        average of all token vectors. For a file with no tokens the zero
        vector is returned instead of dividing by zero (which would yield an
        all-NaN vector).
    """
    # Size the accumulator from the model itself so a model trained with a
    # non-default dimension still works.
    total = np.zeros(model.get_dimension(), dtype='float64')
    count = 0
    with open(f'{DATA_DIR}{filname}', 'r', encoding='utf-8') as f:
        for line in f:
            for word in line.split():
                total += model.get_word_vector(word)
                count += 1
    if count == 0:
        return total
    return total / count
def get_biographies_vector(model: fasttext.FastText._FastText) -> Dict[str, np.ndarray]:
    """Map every file name in ``DATA_DIR`` to its averaged biography vector.

    Args:
        model: fastText model forwarded to ``get_biography_vector``.

    Returns:
        Dictionary keyed by file name, in ``listdir`` order.
    """
    return {
        name: get_biography_vector(filname=name, model=model)
        for name in listdir(DATA_DIR)
    }
def get_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        v1: First vector.
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a float. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        NaN so that downstream sorting stays well-defined.
    """
    denominator = LA.norm(v1) * LA.norm(v2)
    if denominator == 0:
        return 0.0
    return float(np.dot(v1, v2) / denominator)
def get_similarities(word: np.ndarray, vectors: Dict[str, np.ndarray]) -> List[Tuple[float, str]]:
    """Score every entry of ``vectors`` against ``word``.

    Args:
        word: Query vector.
        vectors: Mapping from a key (e.g. a file name) to its vector.

    Returns:
        A list of ``(similarity, key)`` tuples in the mapping's iteration
        order, where similarity comes from ``get_similarity``.
    """
    return [(get_similarity(word, vec), key) for key, vec in vectors.items()]
def get_most_relevant_mathematicians(similarities: List[Tuple[float, str]], number_of_mathematicians: int = 5):
    """Return the names with the highest similarity scores, best first.

    Args:
        similarities: ``(score, name)`` tuples.
        number_of_mathematicians: Maximum number of names to return.

    Returns:
        Up to ``number_of_mathematicians`` names ordered by descending score.
        A non-positive count yields an empty list.
    """
    # Guard needed because a slice of [-0:] selects the WHOLE list, so asking
    # for zero results used to return every name.
    if number_of_mathematicians <= 0:
        return []
    ranked = sorted(similarities, key=lambda x: x[0])
    top = ranked[-number_of_mathematicians:]
    return [name for _, name in reversed(top)]
def print_result(res):
    """Print each element of ``res`` on its own line, in iteration order."""
    for text in map(str, res):
        print(text)
# Module-level bootstrap: runs at import time and provides the `model` and
# `biography_vectors` globals that get_query() relies on.
file_exists = isfile(MODEL_FILENAME)
# Not read anywhere in the visible part of this file.
READ_FROM_STD = False

if not file_exists:
    # No cached model on disk: train one and cache it for the next run.
    model = create_model()
    model.save_model(MODEL_FILENAME)
else:
    model = fasttext.load_model(MODEL_FILENAME)

# One averaged vector per file in DATA_DIR, keyed by file name.
biography_vectors = get_biographies_vector(model=model)
def get_query(query):
    """Return the ten biography keys most similar to ``query``.

    The query string is embedded with the module-level ``model`` via
    ``get_word_vector`` and compared against the module-level
    ``biography_vectors``.

    Args:
        query: Text to look up.

    Returns:
        Up to ten keys of ``biography_vectors``, most similar first.
    """
    scored = get_similarities(model.get_word_vector(query), biography_vectors)
    return get_most_relevant_mathematicians(scored, 10)