# util.py
import codecs
import json
import sqlite3
import warnings

import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from gensim.corpora import Dictionary, MmCorpus
from gensim.matutils import sparse2full
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import LineSentence

conn = sqlite3.connect('../Dataset/yelp.db')
# shared spaCy pipeline, used by lemmatized_sentence_corpus and insert_trigram_review
nlp = spacy.load('en_core_web_sm')
def get_LV_restaurants(limit=1000000):
    # all Las Vegas businesses categorized as restaurants
    sql = """
        select * from business
        where categories like '%Restaurants%'
        and city like '%Las Vegas%'
        limit {}
    """.format(limit)
    return pd.read_sql(sql, conn)

def get_LV_reviews(limit=100000000):
    # all reviews belonging to Las Vegas restaurants
    sql = """
        select * from review
        where business_id in (
            select business_id from business
            where categories like '%Restaurants%'
            and city = 'Las Vegas')
        limit {};
    """.format(limit)
    return pd.read_sql(sql, conn)

def get_parsed_review(limit=100000000):
    sql = "select * from parsed_review limit {}".format(limit)
    return pd.read_sql(sql, conn)
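def _demo_queries():
    # a minimal usage sketch of the query helpers above; `_demo_queries` is
    # illustrative only and is not called anywhere else in this module
    restaurants = get_LV_restaurants(limit=100)
    reviews = get_LV_reviews(limit=1000)
    print(restaurants.head())
    print('reviews fetched: {}'.format(len(reviews)))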
def run_sql(sql):
    # execute a single write statement against the yelp database
    with sqlite3.connect('../Dataset/yelp.db') as con:
        c = con.cursor()
        c.execute(sql)
        con.commit()
"""
Below are data prep functions for Unigram, Bigram, Trigram
one time job to generate the files and then save the models
"""
def punct_space(token):
return token.is_punct or token.is_space
def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])
def review_json_txt():
    # one-time: flatten review.json into one review per line of plain text
    review_count = 0
    with codecs.open('../Dataset/review_text_all.txt', 'w', encoding='utf_8') as review_txt_file:
        # open the existing review json file
        with codecs.open('../Dataset/review.json', encoding='utf_8') as review_json_file:
            # loop through all reviews in the existing file and convert to dict
            for review_json in review_json_file:
                review = json.loads(review_json)
                review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
                review_count += 1
    print('Text from {} restaurant reviews written to the new txt file.'.format(review_count))
def unigram():
    # one time use, write one lemmatized sentence per line
    unigram_sentences_filepath = '../Dataset/unigram_sentences_all.txt'
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus('../Dataset/review_text_all.txt'):
            f.write(sentence + '\n')
def bigram():
    # one time use, fit and save the bigram phrase model
    unigram_sentences = LineSentence('../Dataset/unigram_sentences_all.txt')
    bigram_model_filepath = '../Models/bigram_model_all'
    bigram_sentences_filepath = '../Dataset/bigram_sentences_all.txt'
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')
def trigram():
    # one time use, fit and save the trigram phrase model
    bigram_sentences = LineSentence('../Dataset/bigram_sentences_all.txt')
    trigram_model_filepath = '../Models/trigram_model_all'
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)
    trigram_sentences_filepath = '../Dataset/trigram_sentences_all.txt'
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')
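def load_phrase_models():
    # a sketch, not part of the original module: wrap the saved Phrases
    # models in Phraser for a smaller, faster transform-only object; the
    # functions below apply the raw Phrases models directly instead
    bigram = Phraser(Phrases.load('../Models/bigram_model_all'))
    trigram = Phraser(Phrases.load('../Models/trigram_model_all'))
    return bigram, trigram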
def insert_trigram_review():
    # one time use: clean every review through the phrase models and
    # batch-insert the results into the parsed_review table, 100 rows at a time
    review = get_LV_reviews()
    bigram_model = Phrases.load('../Models/bigram_model_all')
    trigram_model = Phrases.load('../Models/trigram_model_all')
    values0 = ''
    for i in range(len(review)):
        parsed_review = nlp(review['text'][i])
        unigram_review = [token.lemma_.replace('"', '') for token in parsed_review
                          if not punct_space(token) and token.lemma_ != '-PRON-']
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]
        clean_review = [term for term in trigram_review
                        if term not in STOP_WORDS]
        values0 += '("{}","{}","{}"),'.format(review['review_id'][i],
                                              review['business_id'][i],
                                              ' '.join(clean_review))
        if (i + 1) % 100 == 0:
            print(i + 1)
            run_sql('insert into parsed_review (review_id, business_id, clean_review) '
                    'values {};'.format(values0[:-1]))
            values0 = ''
    if values0:
        run_sql('insert into parsed_review (review_id, business_id, clean_review) '
                'values {};'.format(values0[:-1]))
# create 'trigram_transformed_reviews_all.txt'
def trigram_transform():
    nlp = spacy.load('en_core_web_sm')
    bigram_model = Phrases.load('../Models/bigram_model_all')
    trigram_model = Phrases.load('../Models/trigram_model_all')
    with codecs.open('../Dataset/trigram_transformed_reviews_all.txt', 'w', encoding='utf_8') as f:
        for parsed_review in nlp.pipe(line_review('../Dataset/review_text_all.txt'),
                                      batch_size=10000, n_threads=8):
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in STOP_WORDS]
            # write the transformed review as a line in the new file
            f.write(u' '.join(trigram_review) + '\n')
# create and save LDA model
def trigram_bow_generator(filepath, trigram_dictionary):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)
def create_LDA_dict():
    # ONE TIME USE, to create and save the LDA dictionary, corpus, and model
    trigram_dictionary_filepath = '../Dataset/trigram_dict_all.dict'
    trigram_reviews = LineSentence('../Dataset/trigram_transformed_reviews_all.txt')
    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    # filter tokens that are very rare or too common from the dictionary
    # (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
    print('LDA dict saved.')
    trigram_bow_filepath = '../Models/trigram_bow_corpus_all.mm'
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator('../Dataset/trigram_transformed_reviews_all.txt',
                                             trigram_dictionary))
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    lda_model_filepath = '../Models/lda_model_all'  # lda_model_all_30, lda_model_10topic
    # trained LDA models with 10, 30, and 50 topics; 30 gave the best result
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=30,  # 10, 30, 50
                           id2word=trigram_dictionary,
                           workers=8)
    lda.save(lda_model_filepath)
    print('LDA model saved.')
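def run_onetime_pipeline():
    # a sketch of the intended one-time ordering: each step reads the file
    # the previous step wrote; this wrapper itself is illustrative and is
    # not called anywhere else in this module
    review_json_txt()       # review.json -> review_text_all.txt
    unigram()               # -> unigram_sentences_all.txt
    bigram()                # fits/saves bigram model, writes bigram_sentences_all.txt
    trigram()               # fits/saves trigram model, writes trigram_sentences_all.txt
    trigram_transform()     # -> trigram_transformed_reviews_all.txt
    create_LDA_dict()       # -> trigram dictionary, bow corpus, LDA model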
# subtopics
def get_topic_name():
    # return subtopic names for the 30-topic LDA model
    topic_names_30 = {
        0: u'experience', 1: u'sandwich', 2: u'customer service', 3: u'asian',
        4: u'breakfast', 5: u'discount', 6: u'value', 7: u'pizza', 8: u'burger', 9: u'menu',
        10: u'chinese', 11: u'food quality', 12: u'thai', 13: u'buffet', 14: u'hotel', 15: u'steak',
        16: u'sushi', 17: u'location', 18: u'bar', 19: u'feeling', 20: u'customer service',
        21: u'italian', 22: u'fine dining', 23: u'dessert', 24: u'wing', 25: u'kid', 26: u'BBQ',
        27: u'mexican', 28: u'price', 29: u'environment'
    }
    return topic_names_30
def lda_show_topic(i=(1,)):
    # take an iterable of topic ids; print and return each topic's name and top terms
    lda = LdaMulticore.load('../Models/lda_model_all_30')
    name = get_topic_name()
    lst = []
    for x in i:
        print('subtopic = {}'.format(name[x]))
        print(lda.show_topic(x, topn=25))
        lst.append(lda.show_topic(x, topn=25))
    return lst
# load LDA model
lda = LdaMulticore.load('../Models/lda_model_all_30')
def lda_review(txt):
    # given a cleaned review, return the sub-topics it contains
    # as (topic_id, probability) pairs
    trigram_dictionary = Dictionary.load('../Dataset/trigram_dict_all.dict')
    review_bow = trigram_dictionary.doc2bow(txt.split())
    return lda[review_bow]
def compare_review(txt1, txt2):
    # compute topic-space cosine similarity of two texts; the sparse
    # (topic_id, prob) lists from lda_review are densified into 30-d vectors
    lda1, lda2 = lda_review(txt1), lda_review(txt2)
    v1, v2 = sparse2full(lda1, 30), sparse2full(lda2, 30)
    return cosine_similarity([v1], [v2])[0][0]
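def _demo_compare():
    # a usage sketch with illustrative strings (not from the dataset); in
    # practice the inputs would be phrase-model-transformed review text:
    # similar wording should score near 1.0, unrelated wording lower
    a = 'great sushi fresh fish friendly staff'
    b = 'amazing sushi roll fish be fresh'
    c = 'terrible pizza cold slow delivery'
    print('a~b: {:.3f}'.format(compare_review(a, b)))
    print('a~c: {:.3f}'.format(compare_review(a, c)))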
def recom(user_id):
    # rank businesses by topic similarity between the user's past reviews and
    # each business's reviews (the sort is an assumed completion of this stub)
    value = []
    bus_id = get_LV_restaurants().business_id
    usr_review = get_review_for_person(user_id)
    for i in bus_id:
        value.append((i, compare_review(usr_review, get_review_for_buss(i))))
    return sorted(value, key=lambda x: x[1], reverse=True)
# business_id = '8mIrX_LrOnAqWsB5JrOojQ'
def get_review_for_buss(business_id):
    # concat all reviews for given business_id
    sql = """
        select * from parsed_review
        where business_id = '{}'
    """.format(business_id)
    all_review_df = pd.read_sql(sql, conn)
    all_review = all_review_df['clean_review'].str.cat(sep=' ')
    return all_review
def get_review_for_person(user_id):
    # concat all reviews for given user_id
    sql = """
        select * from parsed_review
        where user_id = '{}';""".format(user_id)
    all_review_df = pd.read_sql(sql, conn)
    all_review = all_review_df['clean_review'].str.cat(sep=' ')
    return all_review
def get_buss_attr(business_id):
    # compute subtopic scores given all reviews for a specific business_id
    review_buss = get_review_for_buss(business_id)
    lda_rst = lda_review(review_buss)
    sql = "select stars from business where business_id = '{}'".format(business_id)
    avg_star = pd.read_sql(sql, conn)['stars'].values[0]
    buss_attr = [business_id, avg_star]
    lst = [0] * 30
    for topic_id, prob in lda_rst:
        lst[topic_id] = prob
    buss_attr.extend(lst)
    return buss_attr
def get_user_attr(user_id):
    # compute subtopic scores given all reviews from a specific user_id
    review_user = get_review_for_person(user_id)
    lda_rst = lda_review(review_user)
    sql = "select average_stars from user where user_id = '{}'".format(user_id)
    avg_star = pd.read_sql(sql, conn)['average_stars'].values[0]
    user_attr = [user_id, avg_star]
    lst = [0] * 30
    for topic_id, prob in lda_rst:
        lst[topic_id] = prob
    user_attr.extend(lst)
    return user_attr
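def cluster_businesses(business_ids, n_clusters=5):
    # a sketch, not part of the original module: cluster businesses by their
    # average stars plus 30 subtopic scores using the KMeans import above;
    # `business_ids` is any iterable of ids from the business table (no
    # feature scaling here, so stars dominate unless normalized first)
    attrs = [get_buss_attr(b) for b in business_ids]
    X = [row[1:] for row in attrs]  # drop the business_id column
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    return dict(zip([row[0] for row in attrs], km.labels_))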
sql = "select count(*) from business where city like '%Vegas%' and categories like '%estaurant%'"
sql = "select count(*) as restaurants, count(distinct(user_id)) from review where business_id in (select business_id from business where city like '%Vegas%' and categories like '%estaurant%')"
a = pd.read_sql(sql,conn)