_bow_pp.py · executable file · 103 lines (75 loc) · 2.91 KB
#!/usr/bin/env python
# -*- coding:utf-8 -*-
###
# Created Date: 2022-03-20 17:28:52
# Author: Bin Wang
# -----
# Copyright (c) 2022 National University of Singapore
#
# -----
# HISTORY:
# Date&Time By Comments
# ---------- --- ----------------------------------------------------------
###
import copy
import logging
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
from w_model import Word_embedding_model
# Bag-of-words feature with post-processing following https://openreview.net/pdf?id=SyK00v5xx (without idf weighting)
def embedder_init(self, config):
    ''' initialize for bow sentence embedding '''
    config = copy.deepcopy(config)  # deep copy so the caller's config is not modified
    assert config.bow_we_path is not None, "Must specify the word embedding path when using the BOW model"
    logging.info('BOW sentence embedding')
    config.word_emb_model = config.bow_we_path
    config.normalization = False    # word embedding normalization off
    config.post_process = 'False'   # word embedding post-processing off
    config.centralization = False   # word embedding centralization off
    self.our_word_emb_model = Word_embedding_model(config)
def embedder_infer_all(self, sent_list, normalization, centralization):
    ''' inference package for bow embedding for all needed sentences '''
    sent2id = {}
    sents_embs = []
    count = 0
    for sent in tqdm(sent_list, leave=False):
        # skip if already computed
        if sent not in sent2id:
            sent2id[sent] = count
            count += 1
        else:
            continue

        # average word vectors; skip words not in the vocabulary
        sent_split = sent.lower().split()
        sentvec = []
        for word in sent_split:
            if word in self.our_word_emb_model.vocab:
                sentvec.append(self.our_word_emb_model.compute_embedding(word))
            else:
                continue

        # if no words are found, use a (near-)zero vector as the representation
        if not sentvec:
            vec = np.zeros(self.our_word_emb_model.wvec_dim) + 1e-9
            sentvec.append(vec)
        sentvec = np.mean(sentvec, 0)
        sents_embs.append(sentvec)

    sents_embs = np.stack(sents_embs)
    sents_embs = bow_embedder_postprocess(sents_embs)

    self.sent2id = sent2id
    self.sents_embs = sents_embs

    if centralization:
        if self.sents_embs is not None:
            self.sents_embs = self.sents_embs - self.sents_embs.mean(axis=0, keepdims=True)

    if normalization:
        self.normalizing_sent_vectors()
def bow_embedder_postprocess(sents_embs):
    ''' perform post-processing: remove the top principal components '''
    pp_comp = 2
    # compute the top principal components
    svd = TruncatedSVD(n_components=pp_comp, n_iter=7, random_state=0)
    svd.fit(sents_embs)
    components = svd.components_
    # project them out: X_new = X - (X C^T) C
    sents_embs_new = sents_embs - sents_embs.dot(components.transpose()).dot(components)
    return sents_embs_new
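# --------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): the functions
# above appear to be bound onto a sentence-embedder class elsewhere in the
# repo, so they are not demonstrated here.  `bow_embedder_postprocess` is
# self-contained, however, and can be sanity-checked on random vectors; the
# demo data below is hypothetical.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    fake_embs = rng.normal(size=(100, 50))           # 100 mock sentence vectors, dim 50
    processed = bow_embedder_postprocess(fake_embs)  # removes the top-2 principal components
    print(processed.shape)                           # -> (100, 50)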