real_data/goodness_of_fit_patel.Rmd

---
title: "Goodness-of-fit Patel"
author: "Fanny Perraudeau"
date: "03/24/2017"
output: 
  html_document: 
    fig_height: 10
    fig_width: 10
    toc: yes
    code_folding: hide
    toc_float: yes
---
```{r options, echo=FALSE, results="hide",message=FALSE, error=FALSE, include=FALSE, autodep=TRUE}
# Global chunk defaults for the whole document.
# Caching is enabled for every chunk, so autodep is enabled globally as well:
# without it a cached chunk is not re-run when a chunk it depends on changes
# (setting autodep only in this chunk's header does not affect later chunks).
knitr::opts_chunk$set(
  fig.align = "center",
  cache = TRUE,
  autodep = TRUE,
  error = FALSE,
  message = FALSE,
  warning = TRUE,
  fig.width = 6,
  fig.height = 6
)
library(zinbwave)
library(cluster)
library(RColorBrewer)
library(ggplot2)
library(reshape)
library(EDASeq)
library(edgeR)
library(dplyr)
library(DESeq2)
library(cowplot)
library(matrixStats)
library(scRNAseq)
# Custom 12-colour palette: entries 8-11 then 1-4 of the 11-class "RdYlGn"
# palette, followed by entries 8-11 of the 11-class "RdYlBu" palette.
mycol <- c(
  brewer.pal(11, "RdYlGn")[c(8:11, 1:4)],
  brewer.pal(11, "RdYlBu")[8:11]
)
```


```{r datain}
# Dataset tag; it is interpolated into the names of the PDF figures below.
df <- 'patel'

# Raw count table. Its first data row is not counts: it is pulled out as
# `info` and then removed from the table.
counts <- read.table(
  "../sims/datasets/patel/glioblastoma_raw_rnaseq_SCandbulk_counts_withannots.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Per-sample annotation strings (row 1, skipping the three leading columns).
info <- as.matrix(counts)[1, -seq_len(3)]
# Drop the annotation row and the first column.
counts <- counts[-1, -1]

# After the drop, columns 1 and 2 are taken as gene symbols and Ensembl ids
# (per the variable names); every remaining column is one sample.
gene_symbols <- counts[, 1]
ensembl_ids <- counts[, 2]
sample_names <- colnames(counts)[-seq_len(2)]

all.counts <- counts[, -seq_len(2)]
rownames(all.counts) <- ensembl_ids

# Sample metadata, keyed by column 5 of the run table and reordered so its
# rows line up with the count-matrix columns.
metadata <- read.table(
  "../sims/datasets/patel/SraRunTable.txt",
  sep = '\t',
  header = TRUE,
  row.names = 5,
  stringsAsFactors = FALSE,
  na.strings = "<not provided>"
)
metadata <- metadata[sample_names, ]

# Batch label = second "_"-delimited token of the annotation string.
batch <- stringr::str_split_fixed(info, "_", 2)[, 2]
batch <- stringr::str_split_fixed(batch, "_", 2)[, 1]

# select only single-cell samples from patients
keep <- which(
  grepl("^Single cell", metadata$source_name_s) &
    !is.na(metadata$patient_id_s) &
    !is.na(metadata$subtype_s)
)
metadata <- metadata[keep, ]
batch <- as.factor(batch[keep])
all.counts <- as.matrix(all.counts[, keep])
# Coerce the matrix entries to numeric in place (dimensions are kept);
# presumably the columns were read as text because of the annotation row.
class(all.counts) <- "numeric"

# Metadata rows and count columns must describe the same samples, in order.
stopifnot(all(rownames(metadata) == colnames(all.counts)))

# Colour palettes.
col1 <- brewer.pal(9, "Set1")
col2 <- c(brewer.pal(8, "Set2"), brewer.pal(8, "Set3"), brewer.pal(8, "Set1"))

# Per-sample QC: number of detected genes and total count.
detection_rate <- colSums(all.counts > 0)
coverage <- colSums(all.counts)

qc <- cbind(detection_rate, coverage)

# Sample-level factors: patient and subtype.
level1 <- as.factor(metadata$patient_id_s[!is.na(metadata$subtype_s)])
level2 <- as.factor(metadata$subtype_s[!is.na(metadata$subtype_s)])

# Keep genes with more than 50 counts in at least 50 samples.
filter <- rowSums(all.counts > 50) >= 50
raw <- all.counts[filter, ]

# Total-count normalisation.
#
# Rescales every column of `ei` so that each column sums to the mean of the
# original column sums.
#
# ei: numeric matrix (columns are the units being normalised).
# Returns a matrix with the same dimensions as `ei`.
totalcount <- function(ei) {
  lib_sizes <- colSums(ei)
  # Work on the transpose so the per-column factors recycle down the rows.
  scaled_t <- t(ei) * mean(lib_sizes) / lib_sizes
  t(scaled_t)
}
# Rank the filtered genes by the variance of their log1p total-count
# normalised expression and keep the most variable ones.
tc <- totalcount(raw)
vars <- rowVars(log1p(tc))
names(vars) <- rownames(tc)
vars <- sort(vars, decreasing = TRUE)
# head() caps the selection at the number of available genes; indexing with
# 1:1000 would produce NA names (and a failing subset below) when fewer than
# 1000 genes pass the filter.
vargenes <- head(names(vars), 1000)

# drop = FALSE keeps a genes x samples matrix even if a single gene remains.
core <- as.matrix(raw[vargenes, , drop = FALSE])
```

```{r zinb}
# Fit the ZINB model to the selected genes; the fitted object is reused by
# every goodness-of-fit summary below.
zinb <- zinbFit(
  core,
  ncores = 2,
  K = 2,
  epsilon = 1e3,
  commondispersion = FALSE,
  verbose = TRUE
)
```

```{r functions}
# Expected count under a fitted ZINB model: (1 - pi) * mu, elementwise.
#
# zinbModel: fitted model understood by getPi() and getMu().
# Returns the transposed layout of what getPi()/getMu() return.
computeExp <- function(zinbModel) {
  zero_prob <- t(getPi(zinbModel))
  nb_mean <- t(getMu(zinbModel))
  (1 - zero_prob) * nb_mean
}

# Variance of the count under a fitted ZINB model:
#   (1 - pi) * mu * (1 + mu * (phi + pi)),  with phi = exp(-zeta).
#
# zinbModel: fitted model understood by getMu(), getPi() and getZeta().
# Returns a matrix in the transposed layout of getMu()/getPi(); phi is a
# vector that recycles down the rows of that matrix.
computeVar <- function(zinbModel) {
  nb_mean <- t(getMu(zinbModel))
  zero_prob <- t(getPi(zinbModel))
  dispersion <- exp(-getZeta(zinbModel))
  (1 - zero_prob) * nb_mean * (1 + nb_mean * (dispersion + zero_prob))
}

# Probability of observing a zero under a fitted ZINB model:
#   pi + (1 - pi) * (1 + phi * mu)^(-1 / phi),  with phi = exp(-zeta).
#
# zinbModel: fitted model understood by getMu(), getPi() and getZeta().
# Returns a matrix in the transposed layout of getMu()/getPi().
computeP0 <- function(zinbModel) {
  nb_mean <- t(getMu(zinbModel))
  zero_prob <- t(getPi(zinbModel))
  dispersion <- exp(-getZeta(zinbModel))
  zero_prob + (1 - zero_prob) * (1 + dispersion * nb_mean) ^ (-1/dispersion)
}

# Mean-difference (MD) plot of two paired numeric vectors with a loess trend.
#
# Draws a smoothed density scatter of the pairwise mean (x + y) / 2 against
# the difference x - y on the current graphics device, adds a gray reference
# line at zero and overlays a red loess fit of the difference on the mean.
#
# x, y : numeric vectors of equal length (e.g. estimated and observed values).
# xlim : x-axis limits; also the range over which the loess trend is drawn.
# ylim : y-axis limits.
# main : plot title.
#
# Returns (invisibly) a list with the trend grid `x` and loess predictions `y`.
#
# NOTE(review): this name may mask a plotMD function provided by an attached
# package - confirm that is intended.
plotMD <- function(x, y, xlim = c(0,10), ylim = c(-5, 5),
                   main = 'ZINB: MD-plot estimated vs. observed mean count, log scale'){
  mm <- .5 * (x + y)
  dd <- x - y
  smoothScatter(mm, dd, xlim = xlim, ylim = ylim, xlab = 'Mean', ylab = 'Difference',
                main = main)
  abline(h = 0, col = 'gray')
  fit <- loess(dd ~ mm)
  # Evaluate the trend on a grid spanning the plotted x-range. A fixed
  # seq(0, 10, .1) grid leaves only a handful of points when xlim is narrow
  # (e.g. c(0, .7)); for the default xlim this grid is the same 0, 0.1, ..., 10.
  xpred <- seq(xlim[1], xlim[2], length.out = 101)
  # loess does not extrapolate: grid points outside the data range give NA
  # and are simply not drawn.
  pred <- predict(fit, xpred)
  lines(xpred, pred, col = 'red', type = 'l', lwd = 2)
  invisible(list(x = xpred, y = pred))
}

# Fit a negative binomial model with tagwise dispersions to a count matrix.
#
# counts: count matrix passed to newSeqExpressionSet().
# Returns a list with
#   fitted: fitted values of the GLM fit,
#   disp  : tagwise dispersion estimates.
fitNB <- function(counts) {
  expr_set <- newSeqExpressionSet(counts)
  # Full-quantile between-lane normalisation, kept as an offset
  # (offset = TRUE).
  normalized <- betweenLaneNormalization(expr_set, which = "full", offset = TRUE)
  # `counts(...)` resolves to the accessor function even though a parameter is
  # also called `counts`: R skips non-function bindings when looking up a call.
  count_mat <- counts(normalized)
  # The stored offset is negated before being handed to the dispersion and
  # GLM fitting functions.
  neg_offset <- -offst(normalized)
  disp_est <- estimateDisp(count_mat, offset = neg_offset)
  tagwise <- disp_est$tagwise.dispersion
  glm_fit <- glmFit(count_mat, dispersion = tagwise, offset = neg_offset)
  list(fitted = glm_fit$fitted.values, disp = tagwise)
}
```

```{r nbzinb}
# ZINB: per-gene log1p of the mean expected count, and mean zero probability.
zz <- zinb
zinbEY <- log1p(rowMeans(computeExp(zz)))
zinbPY0 <- rowMeans(computeP0(zz))

# Observed: per-gene log1p of the mean count, and fraction of zero counts.
logAveCount <- log1p(rowMeans(core))
prop0 <- rowMeans(core == 0)

# NB fit from fitNB(): per-gene log1p of the mean fitted value, and mean zero
# probability (1 + mu * phi)^(-1 / phi). The dispersion vector recycles down
# the rows of the fitted-value matrix.
nb <- fitNB(core)
nbEY <- log1p(rowMeans(nb$fitted))
nbPY0 <- rowMeans((1 + nb$fitted * nb$disp)^(-1 / nb$disp))
```

MD-plot estimated vs. observed mean count, log scale
```{r meanCount25}
# Side-by-side MD-plots of estimated vs. observed log mean count (ZINB on the
# left, NB on the right), written to a PDF named after the dataset tag.
pdf(sprintf('../paper/6680489mtyrjx/gof_mu_%s.pdf', df), width = 10)
par(mfrow = c(1, 2))
# Shared axis limits so the two panels are directly comparable.
xlim <- c(3.5, 10)
ylim <- c(-1, 3)
plotMD(zinbEY, logAveCount, xlim = xlim, ylim = ylim, main = 'ZINB')
plotMD(nbEY, logAveCount, xlim = xlim, ylim = ylim, main = 'NB, edgeR')
dev.off()
```

MD-plot estimated vs. observed zero probability
```{r}
# Side-by-side MD-plots of estimated vs. observed zero probability (ZINB on
# the left, NB on the right), written to a PDF named after the dataset tag.
pdf(sprintf('../paper/6680489mtyrjx/gof_pi_%s.pdf', df), width = 10)
par(mfrow = c(1, 2))
plotMD(zinbPY0, prop0,
       xlim = c(0, 0.7), ylim = c(-0.2, 0.2),
       main = 'ZINB')
plotMD(nbPY0, prop0,
       xlim = c(0, 0.7), ylim = c(-0.2, 0.2),
       main = 'NB, edgeR')
dev.off()
```

Estimated dispersion versus observed zero probability
```{r}
# Estimated dispersion against the observed fraction of zeros per gene, for
# the ZINB fit (left) and the NB fit (right), each with a red loess trend.
# Written to a PDF named after the dataset tag.
pdf(sprintf('../paper/6680489mtyrjx/gof_disp_%s.pdf', df), width = 10)
par(mfrow = c(1, 2))

# Quantities shared by both panels, computed once.
obs_p0 <- rowMeans(core == 0)
# ZINB dispersion phi = exp(-zeta); zeta is read through the getZeta()
# accessor, as in computeVar()/computeP0(), rather than from the slot.
zinb_disp <- exp(-getZeta(zz))
nb_disp <- nb$disp

# Grid on which the loess trends are evaluated.
xpred <- seq(0, 1, .05)
fitzinb <- loess(zinb_disp ~ obs_p0)
predzinb <- predict(fitzinb, xpred)
fitnb <- loess(nb_disp ~ obs_p0)
prednb <- predict(fitnb, xpred)

smoothScatter(obs_p0, zinb_disp,
              xlab = 'Observed zero probability', ylim = c(0, 15),
              ylab = 'Estimated dispersion', xlim = c(0, .7),
              main = 'ZINB')
lines(xpred, predzinb, col = 'red', type = 'l', lwd = 2)

smoothScatter(obs_p0, nb_disp,
              xlab = 'Observed zero probability', ylim = c(0, 15),
              ylab = 'Estimated dispersion', xlim = c(0, .7),
              main = 'NB, edgeR')
lines(xpred, prednb, col = 'red', type = 'l', lwd = 2)
dev.off()
```