-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathRmixmod.habitat.R
79 lines (73 loc) · 2.93 KB
/
Rmixmod.habitat.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Rmixmod.habitat.R
# using Rmixmod to find latent classes in Habitat missing data
# Feb 2017
library(Rmixmod)
library(doBy)
# load data of pure zeros and ones and information on question order
load('Missing.habitat.data.RData')
# convert missing zero/one to factors
mm = missing.matrix
mm[mm==0] = '0'
mm[mm==1] = '1'
to.use = as.data.frame(mm)
### Run model
# run up to 20 clusters (this takes a while)
max.clust = 20
mres = mixmodCluster(to.use, nbCluster=2:max.clust, model=mixmodMultinomialModel(listModels=c('Binary_pk_Ekj')), criterion='BIC')
### BIC
# get BIC results, flag if any groups under 1%
BIC = NULL
for (i in 1:(max.clust-1)){
tab = table(mres@results[[i]]@partition)
less = any(100*tab/sum(tab) < 1) # any groups with less than 1% (too small)
frame = data.frame(i=i, any.less=less, n.cluster=mres@results[[i]]@nbCluster, criterion=mres@results[[i]]@criterion, criterionV=mres@results[[i]]@criterionValue)
BIC = rbind(BIC, frame)
}
bic.plot = ggplot(data=BIC, aes(x=n.cluster, y=criterionV))+
geom_line()+
geom_point(size=6, data=BIC, aes(pch=any.less))+
scale_shape_manual('Any groups sizes\nunder 1%', values=c(15,19), labels=c('No','Yes'))+
scale_x_continuous(breaks=2:20)+
xlab('Maximum number of latent groups')+
ylab('BIC')+
theme_bw()+
theme(legend.position=c(0.8, 0.8), text=element_text(size=20), panel.grid.minor = element_blank())
bic.plot
### Plot estimated probabilities by group
# examine groups for smallest BIC
bindex = which(BIC$criterionV == min(BIC$criterionV))
best = mres@results[[bindex]] #
props = best@parameters@proportions # proportions in each group
n.cluster = best@nbCluster
preds = best@partition # predicted group for each person
# arrange into plot
to.plot = NULL
for (k in 1:n.cluster){
r = best@parameters@scatter[[k]][,1]
frame = data.frame(cluster=k, i=1:length(r), prob=r)
to.plot = rbind(to.plot, frame)
}
# merge with question order
to.plot = merge(to.plot, qorder, by='i')
# numbers per group
n.group = as.numeric(table(best@partition))
cmeans = summaryBy(prob ~ cluster, data=to.plot)
cmeans$n.group = n.group
cmeans$percent = round(100*cmeans$n.group / sum(cmeans$n.group))
cmeans = cmeans[order(cmeans$prob.mean),] # arrange smallest to largest
cmeans$clustero = 1:nrow(cmeans)
cmeans
# re-order latent groups by missing probability
to.ploto = merge(to.plot, cmeans, by='cluster') # add new label
labels = paste(1:n.cluster, ' (', cmeans$n.group, ')', sep='') # use numbers
to.ploto$flabel = factor(to.ploto$clustero, levels=1:n.cluster, labels=labels)
# plot
gplot = ggplot(data=to.ploto, aes(x=order, y=prob))+
geom_point()+
xlab('Question order')+
ylab('Probability of missing')+
scale_x_continuous(limits=c(1, ncol(mm)), breaks=c(1,50,100,150,200,250))+
theme_bw()+
theme(legend.position='none', text=element_text(size=15), panel.grid.minor = element_blank())+
facet_wrap(~flabel, as.table=T)
gplot