-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmcar.R
105 lines (86 loc) · 3.08 KB
/
mcar.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
mcar <- function(x){
if(!require(norm)) {
stop("You must have norm installed to use LittleMCAR")
}
# if(!require(data.table)) {
# stop("Please install the R-package data.table to use mcar")
# }
if(!(is.matrix(x) | is.data.frame(x))) {
stop("Data should be a matrix or dataframe")
}
if (is.data.frame(x)){
x <- data.matrix(x)
}
# delete rows of complete missingness
foo <- function(x) return(any(!is.na(x)))
dd <- apply(X = x, MARGIN = 1L, FUN = foo)
dd <- which(!dd, arr.ind = TRUE)
if(length(dd) > 0)
x <- x[-dd,]
# define variables
n.var <- ncol(x) # number of variables
n <- nrow(x) #number of respondents
var.names <- colnames(x)
r <- 1 * is.na(x)
nmis <- as.integer(apply(r, 2, sum)) #number of missing data for each variable REWRITE
mdp <- (r %*% (2^((1:n.var - 1)))) + 1 #missing data patterns
x.mp <- data.frame(cbind(x,mdp)) # add column indicating pattern
colnames(x.mp) <- c(var.names,"MisPat") # set name of new column to MisPat
n.mis.pat <- length(unique(x.mp$MisPat)) # number of missing data patterns
p <- n.mis.pat-1 # number of Missing Data patterns minus 1 (complete data row)
s <- prelim.norm(x)
ll <- em.norm(s)
fit <- getparam.norm(s = s, theta = ll)
# gmean<-mlest(x)$muhat #ML estimate of grand mean (assumes Normal dist)
gmean <- fit$mu
# gcov<-mlest(x)$sigmahat #ML estimate of grand covariance (assumes Normal dist)
gcov <- fit$sigma
colnames(gcov) <- rownames(gcov) <- colnames(x)
#recode MisPat variable to go from 1 through n.mis.pat
x.mp$MisPat2 <- rep(NA,n)
for (i in 1:n.mis.pat){
x.mp$MisPat2[x.mp$MisPat == sort(unique(x.mp$MisPat), partial=(i))[i]]<- i
}
x.mp$MisPat<-x.mp$MisPat2
x.mp<-x.mp[ , -which(names(x.mp) %in% "MisPat2")]
#make list of datasets for each pattern of missing data
datasets <- list()
for (i in 1:n.mis.pat){
datasets[[paste("DataSet",i,sep="")]]<-x.mp[which(x.mp$MisPat==i),1:n.var]
}
#degrees of freedom
kj<-0
for (i in 1:n.mis.pat){
no.na<-as.matrix(1* !is.na(colSums(datasets[[i]])))
kj<-kj+colSums(no.na)
}
df<-kj -n.var
#Little's chi-square
d2<-0
cat("this could take a while")
# this crashes at the missingness pattern where every column is missing
# this for-loop can be handled faster with plyr-function
for (i in 1:n.mis.pat){
mean <- (colMeans(datasets[[i]])-gmean)
mean <- mean[!is.na(mean)]
keep <- 1* !is.na(colSums(datasets[[i]]))
keep <- keep[which(keep[1:n.var]!=0)]
cov <- gcov
cov <- cov[which(rownames(cov) %in% names(keep)) , which(colnames(cov) %in% names(keep))]
d2 <- as.numeric(d2+(sum(x.mp$MisPat==i)*(t(mean)%*%solve(cov)%*%mean)))
}
#p-value for chi-square
p.value<-1-pchisq(d2,df)
#descriptives of missing data
amount.missing <- matrix(nmis, 1, length(nmis))
percent.missing <- amount.missing/n
amount.missing <- rbind(amount.missing,percent.missing)
colnames(amount.missing) <- var.names
rownames(amount.missing) <- c("Number Missing", "Percent Missing")
list(chi.square = d2,
df = df,
p.value = p.value,
missing.patterns = n.mis.pat,
amount.missing = amount.missing,
data = datasets)
}