# Gene_Mutation.R

#Gene Mutation

#Data Loading
# Reads the 0/1 gene-mutation matrix, keeps only FA and FTC samples, and turns
# the mutation columns into logical features with the class label in column 1.
library(readr)
Matrix <- as.data.frame(read_csv("Gene_Mutation/MFT_RAS_66gene_matrix0or1_analysis20240707.csv"))
Matrix <- Matrix[which(Matrix$Type %in% c("FA","FTC")),]
# Drops rows 610-611 of the filtered table by position.
# NOTE(review): the reason for excluding these two rows is not recorded here - confirm.
Matrix <- Matrix[-c(610:611),]
# The column `...1` (presumably the unnamed first CSV column holding sample IDs)
# becomes the row names and is then removed from the data.
rownames(Matrix) <- Matrix$...1
Matrix <- Matrix[,-1]

library(dplyr)
Matrix$Type <- as.factor(Matrix$Type)
# Recode "0"/"1" cells to FALSE/TRUE before the as.logical() conversion below.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
Matrix[which(Matrix == "0", arr.ind = TRUE)] <- FALSE
Matrix[which(Matrix == "1", arr.ind = TRUE)] <- TRUE

# Columns 2-42 are converted to logical; column 1 (expected to be Type) is kept
# as the class label and renamed to "Label".
A <- Matrix[,c(2:42)] %>% mutate_all(as.logical)
Matrix <- cbind(Matrix[,1], A)
rm(A)
colnames(Matrix)[1] <- c("Label")

#Training-Test Split According to Cohort
# Row names are split on "_" and the second token is used as the cohort code.
# vapply() guarantees a character vector even for empty or irregular input,
# which sapply() does not.
cohort <- vapply(
  strsplit(rownames(Matrix), "_"),
  function(parts) parts[2],
  character(1)
)

# "AGR" rows form the training set, every other cohort the test set.
# Row names without a second token give NA and end up in neither index,
# because which() drops NA comparisons.
TrainingIndex <- which(cohort == "AGR")
TestIndex <- which(cohort != "AGR")
rm(cohort)

# write.csv(Matrix[TrainingIndex,], "Gene_Mutation/Gene_TrainingMatrix.csv")
# write.csv(Matrix[TestIndex,], "Gene_Mutation/Gene_TestMatrix.csv")

#Package Loading
library(mlr3)
library(mlr3verse)
library(mlr3viz)
library(mlr3learners)
library(mlr3tuning)
library(mlr3extralearners)

# #CV
# 
# #Data Preparation
# set.seed(1)
# Data_ML_XG <- as.data.table(Matrix[TrainingIndex,], keep.rownames = TRUE)
# task_PD_XG <- as_task_classif(Data_ML_XG, target = "Label", positive = "FTC")
# task_PD_XG$set_col_roles("rn", roles = "name")
# print(task_PD_XG)
# print(task_PD_XG$col_roles)
# 
# ValidationAUC <- 1:1000
# ValidationACC <- 1:1000
# ValidationSEN <- 1:1000
# ValidationSPE <- 1:1000
# ValidationPPV <- 1:1000
# ValidationNPV <- 1:1000
# for (i in 1:1000) {
#   set.seed(i)
#   learner_XG <- lrn("classif.xgboost")
#   learner_XG$predict_type <- "prob"
#   learner_XG
#   learner_XG$param_set
#   rr <- resample(task_PD_XG, learner_XG, rsmp("cv", folds = 5))
#   ValidationAUC[i] <- rr$aggregate(msr("classif.auc"))
#   ValidationACC[i] <- rr$aggregate(msr("classif.acc"))
#   ValidationSEN[i] <- rr$aggregate(msr("classif.sensitivity"))
#   ValidationSPE[i] <- rr$aggregate(msr("classif.specificity"))
#   ValidationPPV[i] <- rr$aggregate(msr("classif.ppv"))
#   ValidationNPV[i] <- rr$aggregate(msr("classif.npv"))
# }
# rm(i)
# ValidationMeasures <- data.frame(ValidationAUC, ValidationACC, ValidationSEN, ValidationSPE, ValidationPPV, ValidationNPV)
# 
# write.csv(ValidationMeasures, "Gene_Mutation/ValidationMeasures.csv")
# #Manually delete the first row (column names) and the first column (row names) of ValidationMeasures.csv before it is read back in

# Print mean - 1.96 * SD and mean + 1.96 * SD for each of the six stored
# validation measures (columns 1-6 of the header-less CSV).
# NOTE(review): column order is presumably AUC, ACC, SEN, SPE, PPV, NPV, as
# written by the commented-out CV loop above - confirm against the file.
ValidationMeasures <- read.csv("Gene_Mutation/ValidationMeasures.csv", header = FALSE)
for (i in seq_len(6)) {
  measure_mean <- mean(ValidationMeasures[, i], na.rm = TRUE)
  measure_halfwidth <- 1.96 * sd(ValidationMeasures[, i], na.rm = TRUE)
  print(measure_mean - measure_halfwidth)
  print(measure_mean + measure_halfwidth)
}
rm(i, measure_mean, measure_halfwidth)
rm(ValidationMeasures)

# One 5-fold cross-validation run of XGBoost on the training cohort only.
# The seed is fixed first so that the fold assignment and model fits below are reproducible.
set.seed(3)
# keep.rownames = TRUE moves the sample IDs into a column named "rn"
Data_ML_XG <- as.data.table(Matrix[TrainingIndex,], keep.rownames = TRUE)
task_PD_XG <- as_task_classif(Data_ML_XG, target = "Label", positive = "FTC")
# Give "rn" the "name" role so the sample ID is not used as a feature
task_PD_XG$set_col_roles("rn", roles = "name")
print(task_PD_XG)
print(task_PD_XG$col_roles)

# XGBoost learner that returns class probabilities (needed for AUC / ROC);
# the learner and its parameter set are printed for inspection
learner_XG <- lrn("classif.xgboost")
learner_XG$predict_type <- "prob"
learner_XG
learner_XG$param_set

# Aggregated performance over the 5 folds.
# The numbers in the comments below are presumably the values recorded from a previous run.
rr <- resample(task_PD_XG, learner_XG, rsmp("cv", folds = 5))
rr$aggregate(msr("classif.auc"))
#0.6044218
rr$aggregate(msr("classif.acc"))
#0.6690395
rr$aggregate(msr("classif.sensitivity"))
#0.1755942
rr$aggregate(msr("classif.specificity"))
#0.9504627
rr$aggregate(msr("classif.ppv"))
#0.6606061
rr$aggregate(msr("classif.npv"))
#0.6772011
# Per-fold prediction objects, used below to build one ROC curve per fold
V <- rr$predictions()

library(pROC)
# Per-fold AUC, for comparison with the pROC curves computed below
rr$score(msr("classif.auc"))

# Build one pROC curve per CV fold (roc_v1 ... roc_v5) and print its AUC.
# The predictor is the second probability column of each fold's prediction and
# the labels are looked up in Data_ML_XG through the fold's row ids.
for (fold in 1:5) {
  validation_prob <- c(V[[fold]]$prob[, 2])
  validation_rowids <- c(V[[fold]]$row_ids)
  fold_roc <- roc(Data_ML_XG$Label[validation_rowids], validation_prob, direction = ">")
  print(fold_roc$auc)
  # Store the curve under the name the later steps expect
  assign(paste0("roc_v", fold), fold_roc)
}
rm(fold, fold_roc)

# Look up sensitivity and 1 - specificity of a ROC step function at given cutoffs.
#
# Input:
#   x              numeric vector of cutoffs
#   revthresholds  thresholds in increasing order
#   revsensitivity sensitivities aligned with revthresholds
#   revspecificity specificities aligned with revthresholds
#   The last three are generated by the roc function and need to be reversed
#   by the caller before being passed in.
# Output:
#   an unnamed list of two numeric vectors, each exactly as long as x:
#   [[1]] sensitivities and [[2]] 1 - specificities for the cutoffs.
#   A cutoff x[i] takes the values stored at position j when
#   revthresholds[j] <= x[i] < revthresholds[j + 1]; a cutoff that falls in no
#   interval yields NA, so the output always stays aligned with x.
f <- function(x, revthresholds, revsensitivity, revspecificity)
{
  # Preallocate so every cutoff has a slot, even when it matches no interval
  se <- rep(NA_real_, length(x))
  sp <- rep(NA_real_, length(x))
  # Loop over the threshold intervals and fill all matching cutoffs at once;
  # max(..., 0) keeps the loop empty when fewer than two thresholds are given
  for (j in seq_len(max(length(revthresholds) - 1, 0))) {
    hit <- which(x >= revthresholds[j] & x < revthresholds[j + 1])
    se[hit] <- revsensitivity[j]
    sp[hit] <- revspecificity[j]
  }
  list(se, 1 - sp)
}

# Evaluate every fold's ROC step function on one common grid of 100000 cutoffs
cutoff_grid <- seq(0, 1, length.out = 100000)
fold_curves <- lapply(
  list(roc_v1, roc_v2, roc_v3, roc_v4, roc_v5),
  function(fold_roc) {
    f(cutoff_grid, rev(fold_roc$thresholds), rev(fold_roc$sensitivities), rev(fold_roc$specificities))
  }
)
V1 <- fold_curves[[1]]
V2 <- fold_curves[[2]]
V3 <- fold_curves[[3]]
V4 <- fold_curves[[4]]
V5 <- fold_curves[[5]]

# Average the five folds at each cutoff, then append the end point (1, 1)
fpr_by_fold <- data.frame(V1[[2]], V2[[2]], V3[[2]], V4[[2]], V5[[2]])
tpr_by_fold <- data.frame(V1[[1]], V2[[1]], V3[[1]], V4[[1]], V5[[1]])
oneminussp <- c(as.numeric(apply(fpr_by_fold, 1, mean)), 1)
se <- c(as.numeric(apply(tpr_by_fold, 1, mean)), 1)
rm(cutoff_grid, fold_curves, fpr_by_fold, tpr_by_fold)

library(ggplot2)
# Plot the fold-averaged CV ROC curve; both axes are fixed to [0, 1] without padding
ggplot(mapping = aes(x = oneminussp, y = se)) +
  geom_line(colour = "#0072B5FF", size = 2) +
  annotate("text", x = 0.8, y = 0.1, label = "AUC = 0.604", size = 10, fontface = "bold") +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 1)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  labs(x = "1 - Specificity", y = "Sensitivity", title = "CV ROC") +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.x = element_text(size = 14, color = "black"),
    axis.text.y = element_text(size = 14, color = "black"),
    plot.title = element_text(size = 25, face = "bold")
  )

# Remove every object created by the cross-validation section
rm(
  learner_XG, rr, V,
  validation_prob, validation_rowids,
  roc_v1, roc_v2, roc_v3, roc_v4, roc_v5,
  Data_ML_XG, task_PD_XG,
  V1, V2, V3, V4, V5, f, se, oneminussp
)

#XG Training Test
# Final model: trained on the training cohort, then evaluated on both cohorts below.

#Data Preparation
# The task holds ALL samples (training and test); keep.rownames = TRUE moves
# the sample IDs into a column named "rn"
Data_ML <- as.data.table(Matrix, keep.rownames = TRUE)
task_PD <- as_task_classif(Data_ML, target = "Label", positive = "FTC")
# Give "rn" the "name" role so the sample ID is not used as a feature
task_PD$set_col_roles("rn", roles = "name")
print(task_PD)
print(task_PD$col_roles)

#XGBoost Learner
learner_XG <- lrn("classif.xgboost")
learner_XG$predict_type <- "prob"
# Seed fixed immediately before fitting so the trained model is reproducible
set.seed(456)
# Fit on the training rows only.
# NOTE(review): assumes task row ids equal row positions in Matrix, so TrainingIndex can be used directly - confirm.
learner_XG$train(task_PD, row_ids = TrainingIndex)

#Performance on Training Set
prediction_XG1 <- learner_XG$predict(task_PD, row_ids = TrainingIndex)
prediction_XG1$set_threshold(0.499999)

library(fastR2)

# The confusion-matrix counts are read from the prediction object instead of
# being typed in by hand, so every ratio and Wilson interval below stays in
# sync with the model. Per the mlr3 documentation, $confusion has the predicted
# response in rows and the truth in columns; "FTC" is the positive class.
train_cm <- prediction_XG1$confusion
train_tp <- train_cm["FTC", "FTC"]
train_fp <- train_cm["FTC", "FA"]
train_fn <- train_cm["FA", "FTC"]
train_tn <- train_cm["FA", "FA"]
train_n <- sum(train_cm)

#Prevalence
prediction_XG1$confusion
(train_tp + train_fn) / train_n

#Accuracy
prediction_XG1$score(msr("classif.acc"))
(train_tp + train_tn) / train_n
(train_tp + train_tn)
train_n
wilson.ci(x = train_tp + train_tn, n = train_n, conf.level = 0.95)

#AUC with confidence interval; the ROC object is computed once and reused
prediction_XG1$score(msr("classif.auc"))
library(pROC)
train_roc <- roc(Data_ML$Label[TrainingIndex], prediction_XG1$prob[,1], ci = TRUE)
train_roc$auc
train_roc$ci

#Sensitivity
prediction_XG1$score(msr("classif.sensitivity"))
train_tp / (train_tp + train_fn)
(train_tp + train_fn)
wilson.ci(x = train_tp, n = train_tp + train_fn, conf.level = 0.95)

#Specificity
prediction_XG1$score(msr("classif.specificity"))
train_tn / (train_tn + train_fp)
(train_tn + train_fp)
wilson.ci(x = train_tn, n = train_tn + train_fp, conf.level = 0.95)

#PPV
prediction_XG1$score(msr("classif.ppv"))
train_tp / (train_tp + train_fp)
(train_tp + train_fp)
wilson.ci(x = train_tp, n = train_tp + train_fp, conf.level = 0.95)

#NPV
prediction_XG1$score(msr("classif.npv"))
train_tn / (train_tn + train_fn)
(train_tn + train_fn)
wilson.ci(x = train_tn, n = train_tn + train_fn, conf.level = 0.95)

rm(train_cm, train_tp, train_fp, train_fn, train_tn, train_n, train_roc)

#Performance on Test Set
set.seed(4)
prediction_XG2 <- learner_XG$predict(task_PD, row_ids = TestIndex)
prediction_XG2$set_threshold(0.499999)

# The confusion-matrix counts are read from the prediction object instead of
# being typed in by hand, so every ratio and Wilson interval below stays in
# sync with the model. Per the mlr3 documentation, $confusion has the predicted
# response in rows and the truth in columns; "FTC" is the positive class.
test_cm <- prediction_XG2$confusion
test_tp <- test_cm["FTC", "FTC"]
test_fp <- test_cm["FTC", "FA"]
test_fn <- test_cm["FA", "FTC"]
test_tn <- test_cm["FA", "FA"]
test_n <- sum(test_cm)

#Prevalence
prediction_XG2$confusion
(test_tp + test_fn) / test_n

#Accuracy
prediction_XG2$score(msr("classif.acc"))
(test_tp + test_tn) / test_n
(test_tp + test_tn)
test_n
wilson.ci(x = test_tp + test_tn, n = test_n, conf.level = 0.95)

#AUC with confidence interval; the ROC object is computed once and reused
prediction_XG2$score(msr("classif.auc"))
library(pROC)
test_roc <- roc(Data_ML$Label[TestIndex], prediction_XG2$prob[,1], ci = TRUE)
test_roc$auc
test_roc$ci

#Sensitivity
prediction_XG2$score(msr("classif.sensitivity"))
test_tp / (test_tp + test_fn)
(test_tp + test_fn)
wilson.ci(x = test_tp, n = test_tp + test_fn, conf.level = 0.95)

#Specificity
prediction_XG2$score(msr("classif.specificity"))
test_tn / (test_tn + test_fp)
(test_tn + test_fp)
wilson.ci(x = test_tn, n = test_tn + test_fp, conf.level = 0.95)

#PPV
prediction_XG2$score(msr("classif.ppv"))
test_tp / (test_tp + test_fp)
test_tp + test_fp
wilson.ci(x = test_tp, n = test_tp + test_fp, conf.level = 0.95)

#NPV
prediction_XG2$score(msr("classif.npv"))
test_tn / (test_tn + test_fn)
test_tn + test_fn
wilson.ci(x = test_tn, n = test_tn + test_fn, conf.level = 0.95)

rm(test_cm, test_tp, test_fp, test_fn, test_tn, test_n, test_roc)

#ROC

#Train
library(pROC)
library(ggplot2)
# The ROC object gets its own name so it does not mask the pROC function roc()
roc_train <- roc(Data_ML$Label[TrainingIndex], prediction_XG1$prob[,1], ci = TRUE)
roc_train$ci
# The AUC label is formatted from the ROC object so it cannot drift from the plotted curve
ggroc(roc_train, colour = "#0072B5FF", size = 4, legacy.axes = TRUE) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  theme(axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15)) +
  theme(axis.text.x = element_text(size = 14, color = "black"), axis.text.y = element_text(size = 14, color = "black")) +
  labs(x = "1 - Specificity", y = "Sensitivity") +
  theme(plot.title = element_text(size = 25, face = "bold")) +
  ggtitle("Training ROC") +
  scale_y_continuous(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0)) +
  annotate("text", x = 0.8, y = 0.1, label = sprintf("AUC = %.3f", as.numeric(roc_train$auc)), size = 10, fontface = "bold")
rm(roc_train)

#Test
library(pROC)
library(ggplot2)
# The ROC object gets its own name so it does not mask the pROC function roc()
roc_test <- roc(Data_ML$Label[TestIndex], prediction_XG2$prob[,1], ci = TRUE)
roc_test$ci
# The AUC label is formatted from the ROC object so it cannot drift from the plotted curve
ggroc(roc_test, colour = "#0072B5FF", size = 4, legacy.axes = TRUE) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  theme(axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15)) +
  theme(axis.text.x = element_text(size = 14, color = "black"), axis.text.y = element_text(size = 14, color = "black")) +
  labs(x = "1 - Specificity", y = "Sensitivity") +
  theme(plot.title = element_text(size = 25, face = "bold")) +
  ggtitle("Test ROC") +
  scale_y_continuous(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0)) +
  annotate("text", x = 0.8, y = 0.1, label = sprintf("AUC = %.3f", as.numeric(roc_test$auc)), size = 10, fontface = "bold")
rm(roc_test)

#Confusion Matrix

#Train
prediction_XG1$confusion
# Long-format counts: Var1 is the true label, Var2 the predicted class
train_counts <- as.data.frame(table(Matrix$Label[TrainingIndex], prediction_XG1$response))
# Heatmap of the counts, truth on the x axis and prediction on the y axis
ggplot(train_counts, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), size = 10) +
  scale_fill_gradient(low = "#C6DBEFFF", high = "#3182BDFF") +
  scale_x_discrete(limits = c("FTC", "FA"), expand = c(0,0)) +
  scale_y_discrete(limits = c("FA", "FTC"), expand = c(0,0)) +
  labs(x = "Truth", y = "Prediction", title = "Training confusion matrix", fill = "Count") +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.line = element_blank(),
    axis.text.x = element_text(size = 15, color = "black"),
    axis.text.y = element_text(size = 15, color = "black"),
    plot.title = element_text(size = 25, hjust = 0.5, margin = margin(20, 0, 20, 0), face = "bold"),
    axis.title.x = element_text(margin = margin(20, 20, 20, 20), size = 20),
    axis.title.y = element_text(margin = margin(0, 20, 0, 10), size = 20),
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  )
rm(train_counts)

#Test
prediction_XG2$confusion
# Long-format counts: Var1 is the true label, Var2 the predicted class
test_counts <- as.data.frame(table(Matrix$Label[TestIndex], prediction_XG2$response))
# Heatmap of the counts, truth on the x axis and prediction on the y axis
ggplot(test_counts, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), size = 10) +
  scale_fill_gradient(low = "#C6DBEFFF", high = "#3182BDFF") +
  scale_x_discrete(limits = c("FTC", "FA"), expand = c(0,0)) +
  scale_y_discrete(limits = c("FA", "FTC"), expand = c(0,0)) +
  labs(x = "Truth", y = "Prediction", title = "Test confusion matrix", fill = "Count") +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.line = element_blank(),
    axis.text.x = element_text(size = 15, color = "black"),
    axis.text.y = element_text(size = 15, color = "black"),
    plot.title = element_text(size = 25, hjust = 0.5, margin = margin(20, 0, 20, 0), face = "bold"),
    axis.title.x = element_text(margin = margin(20, 20, 20, 20), size = 20),
    axis.title.y = element_text(margin = margin(0, 20, 0, 10), size = 20),
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  )
rm(test_counts)

#Feature Importance
# learner_XG$importance() is turned into a two-column table (feature name,
# importance score) and the top 40 features are drawn as a bar chart.
importance_scores <- learner_XG$importance()
Importance_DF <- data.frame(
  Features = names(importance_scores),
  Importance = unname(importance_scores)
)
library(ggcharts)
bar_chart(Importance_DF, Features, Importance, top_n = 40) +
  labs(x = "Feature", y = "Importance") +
  theme_classic() +
  theme(
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.x = element_text(size = 14, color = "black"),
    axis.text.y = element_text(size = 14, color = "black")
  )
rm(importance_scores, Importance_DF)

# Final cleanup: drop the remaining model, prediction and data objects, then
# run the garbage collector
rm(
  Data_ML, learner_XG, prediction_XG2, prediction_XG1, task_PD,
  TrainingIndex, TestIndex, Matrix
)
gc()