-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2-doc2vec_InsuranceQA.py
106 lines (76 loc) · 3.71 KB
/
2-doc2vec_InsuranceQA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#### word2vec from gensim
#### shervin minaee
import os, sys, math, nltk, json, pickle, pprint, time
import numpy as np
import gensim, logging
from nltk.tokenize import sent_tokenize
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.doc2vec import LabeledSentence
start = time.time()
##################### Reading the data
##### sentence = LabeledSentence(words=[u'some', u'words', u'here'], labels=[u'SENT_1'])
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
labeled_ques_insurance = pickle.load( open( "labeled_ques_insurance100.p", "rb" ) )
#labeled_ans_insurance = pickle.load( open( "labeled_ans_insurance.p", "rb" ) )
#labeled_all_insurance= labeled_ques_insurance+labeled_ans_insurance
###################### Training the model for both question and answer together
## model parameters here: https://radimrehurek.com/gensim/models/word2vec.html
## model = Doc2Vec( labeled_sentences, size= 300, window=9, min_count= 1, workers=5, iter= 20 )
'''
model = Doc2Vec( size=200, window= 5, min_count=2, workers= 5, alpha=0.025, min_alpha=0.025, batch_words= 2000) # use fixed learning rate
num_epochs= 10
model.build_vocab(labeled_all_insurance)
for epoch in range(num_epochs):
print("%d epoch of training" % (epoch+1) )
model.train(labeled_all_insurance)
model.alpha -= 0.001 # decrease the learning rate
model.min_alpha = model.alpha # fix the learning rate, no decay
model.train(labeled_all_insurance)
new_sent= "ice is cold"
new_sent= 'Please wait while I transfer this chat to the tech department.'
new_sent_token = sent_tokenize(new_sent)
#print(model.vocab.keys())
new_doc_vec = model.infer_vector(new_sent_token)
print( model.docvecs.most_similar([new_doc_vec]) )
model.save('doc2vec_insurance_all.doc2vec')
#model_loaded = Doc2Vec.load('doc2vec_20days.doc2vec')
'''
########################## Training the model for questions
model = Doc2Vec( size=200, window= 5, min_count=2, workers= 5, alpha=0.025, min_alpha=0.025, batch_words= 2000) # use fixed learning rate
num_epochs= 10
model.build_vocab(labeled_ques_insurance)
for epoch in range(num_epochs):
print("%d epoch of training" % (epoch+1) )
model.train(labeled_ques_insurance)
model.alpha -= 0.001 # decrease the learning rate
model.min_alpha = model.alpha # fix the learning rate, no decay
model.train(labeled_ques_insurance)
new_sent= "ice is cold"
new_sent= 'Please wait while I transfer this chat to the tech department.'
new_sent_token = sent_tokenize(new_sent)
#print(model.vocab.keys())
new_doc_vec = model.infer_vector(new_sent_token)
print( model.docvecs.most_similar([new_doc_vec]) )
model.save('doc2vec_insurance_ques100.doc2vec')
########################## Training the model for answers
model = Doc2Vec( size=200, window= 5, min_count=2, workers= 5, alpha=0.025, min_alpha=0.025, batch_words= 2000) # use fixed learning rate
num_epochs= 10
model.build_vocab(labeled_ans_insurance)
for epoch in range(num_epochs):
print("%d epoch of training" % (epoch+1) )
model.train(labeled_ans_insurance)
model.alpha -= 0.001 # decrease the learning rate
model.min_alpha = model.alpha # fix the learning rate, no decay
model.train(labeled_ans_insurance)
new_sent= "ice is cold"
new_sent= 'Please wait while I transfer this chat to the tech department.'
new_sent_token = sent_tokenize(new_sent)
#print(model.vocab.keys())
new_doc_vec = model.infer_vector(new_sent_token)
print( model.docvecs.most_similar([new_doc_vec]) )
model.save('doc2vec_insurance_ans.doc2vec')
##################################################
end = time.time()
tot_time_word= end-start
print("time of word2vec:", tot_time_word)