topicModellingParamScan.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 22 13:08:26 2022
@author: chris
"""
import pandas as pd
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
import gensim
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Phrases
import re
import logging
import pickle
import json
from docx import Document
from docx.shared import Cm, Pt
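

# Overview: this script topic-models parenting posts scraped from r/Parenting
# and Mumsnet. It cleans and normalises the raw text, tokenises and stems it
# with gensim's default preprocessing, detects frequent bigrams, builds a
# dictionary and bag-of-words corpus, and then runs a parameter scan that
# trains LdaMulticore models over a grid of topic counts and random seeds,
# recording the average topic coherence of each run.


# Note: saveTable is never called in this file and expects `df`, `data_dict`
# and `fname` to be defined in the enclosing scope when it is used; it writes
# a regression-results table to a Word document via python-docx.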
def saveTable(univariablemodels, multivariablemodel, sig=0.05):
    word_document = Document()
    table = word_document.add_table(0, 0)
    table.style = 'TableGrid'
    first_column_width = 20
    column_with = 8
    table.add_column(Cm(first_column_width))
    table.add_column(Cm(column_with))
    table.add_column(Cm(column_with))
    table.add_column(Cm(column_with))
    table.add_column(Cm(column_with))
    table.add_column(Cm(column_with))
    table.add_column(Cm(column_with))
    first_row = table.add_row()
    first_row.cells[1].text = 'Univariable Regression (n=228)'
    first_row.cells[2].text = 'Multiple Regression (n=228) '
    second_row = table.add_row()
    second_row.cells[0].text = 'Independent Variable'
    second_row.cells[1].text = 'Coefficient (standardised) [95% confidence interval]'
    second_row.cells[2].text = 'p-value'
    second_row.cells[3].text = '$R^2$'
    second_row.cells[4].text = 'p-value'
    second_row.cells[5].text = 'p-value (adj)'
    second_row.cells[6].text = 'f'
    for w in range(1,6):
        curr_wave = df[(df.wave==w) & (df.pvals_adj<=sig)]
        next_row = table.add_row()
        next_row.cells[0].text = 'Wave ' + str(w)
        print(curr_wave)
        for index, row in curr_wave.iterrows():
            next_row = table.add_row()
            next_row.cells[0].text = data_dict[row['value']]
            next_row.cells[1].text = str(row['medians_r1'])
            next_row.cells[2].text = str(row['medians_r2'])
            next_row.cells[3].text = str(row['medians_w'])
            next_row.cells[4].text = str(round(row['pvals'],3))
            next_row.cells[5].text = str(round(row['pvals_adj'],3))
            next_row.cells[6].text = str(round(row['uvals'],3))
    margin = 1
    sections = word_document.sections
    for section in sections:
        section.top_margin = Cm(margin)
        section.bottom_margin = Cm(margin)
        section.left_margin = Cm(margin)
        section.right_margin = Cm(margin)
    word_document.save(fname + '.docx')
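

# britishise rewrites American spellings as their British equivalents using
# the substring lookup loaded from american_spellings.json, so that spelling
# variants of the same word collapse onto a single token before modelling.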
def britishise(string, american_to_british):
    for american_spelling, british_spelling in american_to_british.items():
        string = string.replace(american_spelling, british_spelling)
    return string


with open("./american_spellings.json", 'r') as url:
    american_to_british = json.load(url)
#%%
lemmatizer = WordNetLemmatizer()
logging.basicConfig(filename='./gensim.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)
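
# Load the Reddit r/Parenting and Mumsnet exports and align the Mumsnet
# columns (titles/post) with the Reddit schema (title/selftext) before
# combining the two sources. Note that DataFrame.append was removed in
# pandas 2.0; on newer pandas the usual replacement is
# data = pd.concat([data, mnet]).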
data = pd.read_feather('./reddit_posts.feather')
mnet = pd.read_feather('./mumsnetparenting.feather')
mnet['title'] = mnet.titles
mnet['selftext'] = mnet.post
mnet = mnet.drop(columns=['titles', 'post'])
mnet['source'] = 'mumsnet'
data = data.drop(columns=['num_comments', 'score'])
data['source'] = 'reddit_Parenting'
data = data.append(mnet)
posts = data.title + ' ' + data.selftext
#%%
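# Drop spam and deleted posts, then normalise spelling (American -> British,
# 'diaper' -> 'nappy') and curly apostrophes so that later token matching is
# consistent.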
posts_cleaned = []
for p in posts:
    if ('PENIS PENIS' not in p) and ('[removed]' not in p) and ('Message withdrawn at poster\'s request.' not in p):
        pi = britishise(p, american_to_british)
        # Replace the plural form first: once 'diaper' has been rewritten,
        # 'diapers' would no longer match.
        pi = pi.replace('diapers', 'nappies')
        pi = pi.replace('diaper', 'nappy')
        posts_cleaned.append(pi.replace('’', '\''))
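
# Extend NLTK's English stop words with corpus-specific terms. Several of the
# extra entries are given in their Porter-stemmed form (e.g. 'babi', 'encod')
# because gensim's preprocess_documents stems tokens before this filter runs.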
stop_words = stopwords.words('english') + ['think', 'thing', 'said', 'want', 'know', 'toddler', 'kid',
                                           'babi', 'old', 'year', 'utf', 'keyword', 'ref', 'encod', 'month',
                                           'com', 'edu', 'subject', 'lines', 'organization', 'article',
                                           'amp', 'www', 'com', 'amazon', 'http', 'message', 'withdrawn',
                                           'poster', 'request', 'removed', 'daughter', 'she\'s', 'he\'s',
                                           'child', 'children']
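
# preprocess_documents runs gensim's default preprocessing chain (strip
# punctuation and numerics, remove gensim's built-in stop words, drop very
# short tokens, Porter-stem). The loop below then removes the custom stop
# words and any remaining token of three characters or fewer.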
posts_preprocessed = gensim.parsing.preprocessing.preprocess_documents(posts_cleaned)
posts_processed = []
for post in posts_preprocessed:
    post = [p for p in post if p not in stop_words and (len(p)>3)]
    posts_processed.append(post)
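
# Phrases learns frequent token pairs (seen at least 20 times); the loop below
# appends any detected bigram (tokens joined with '_') to its source document
# so that both the unigrams and the bigram are available to the LDA model.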
bigram = Phrases(posts_processed, min_count=20)
#%%
for idx in range(len(posts_processed)):
    for token in bigram[posts_processed[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            posts_processed[idx].append(token)
with open('./texts.pkl', 'wb') as f:
    pickle.dump(posts_processed, f)
#%%
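# Build the token dictionary and bag-of-words corpus: filter_extremes drops
# tokens that appear in fewer than 20 documents or in more than half of all
# documents, and doc2bow converts each tokenised post into (token id, count)
# pairs.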
cp = gensim.corpora.Dictionary(posts_processed)
cp.filter_extremes(no_below=20, no_above=0.5)
restricted_words = stopwords.words('english')
# Note: `line` is a list of tokens, so `line not in restricted_words` is
# always True and this condition filters nothing; stop words were already
# removed from posts_processed above.
corpus = [cp.doc2bow(line) for line in posts_processed if line not in restricted_words]
#%%
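# Parameter scan: train an LdaMulticore model for every combination of topic
# count (3-34) and random seed (1-9), and record the mean topic coherence of
# each fitted model (top_topics returns (topic, coherence) pairs, using the
# u_mass coherence measure by default).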
ldamodels = []
coherence = []
for numtopics in range(3,35):
    ldamods = []
    for r_s in range(1,10):
        lda_model = LdaMulticore(corpus=corpus,
                                 id2word=cp,
                                 random_state=r_s,
                                 num_topics=numtopics,
                                 passes=20,
                                 chunksize=1000,
                                 batch=False,
                                 decay=0.5,
                                 offset=64,
                                 eval_every=10,
                                 iterations=1000,
                                 gamma_threshold=0.001,
                                 per_word_topics=True)
        top_topics = lda_model.top_topics(corpus)  # , num_words=20)
        ldamods.append(lda_model)
        avg_topic_coherence = sum([t[1] for t in top_topics]) / numtopics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        coherence.append(avg_topic_coherence)
    ldamodels.append(ldamods)
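
# Persist the full grid of fitted models; ldamodels is a nested list indexed
# first by topic count (outer loop) and then by random seed (inner loop).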
with open('./ldamodels.pkl', 'wb') as f:
    pickle.dump(ldamodels, f)