--- a
+++ b/RRS_Predictive_V3.Rmd
@@ -0,0 +1,588 @@
+---
+title: "Predictive Signature"
+author: "Pranjal"
+date: "2/21/2019"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+## Initial Setup and Package Loads in R 
+
+Packages used for the analysis.
+```{r initial_setup, cache=FALSE, message = FALSE, warning = FALSE}
+library(glmnet);library(survival);library(survminer);library(readxl);library(ggplot2); library(GGally)
+library(knitr); library(rmdformats); library(magrittr)
+library(skimr); library(Hmisc); library(Epi); library(vcd)
+library(tidyverse) 
+
+source("Love-boost.R")
+
+## Global options
+
+options(max.print="75")
+opts_chunk$set(comment=NA,
+               message=FALSE,
+               warning=FALSE)
+opts_knit$set(width=75)
+
+
+skimr::skim_with(numeric = list(hist = NULL),
+                 integer = list(hist = NULL))
+```
+
+## Loading the Raw Data into R 
+
+Loading raw dataset into R.
+
+Training Data from CCF with minimum and max. survival time.
+```{r}
+train <- read.csv("dataset_train.csv")
+
+train %>%
+  select(time) %>%
+  summary()
+```
+
+
+```{r}
+#train2 <- with(train, subset(train, !train$Cases %in% train$Cases[is.na(time) | therapy == 1]))
+```
+```{r}
+#df_sorted_names_asc <- train[with(train, order(RRS)), ]
+```
+
+
+```{r}
+#threshold_analysis <- lapply(i<- 20:240, function(i) {
+ # threshold <- df_sorted_names_asc$RRS[i]
+  #train$group <- ifelse(train$RRS >= threshold,1,2)
+#  subset1 <- subset(train, group == 1)
+#  res.cox <- coxph(Surv(time, status) ~ therapy, data = subset1)
+#  HR1 <- exp(res.cox$coefficients)
+  
+  
+ # groupss2 <- subset(train, group == 2)
+  
+  #for (k in 4:length(groupss2-8)) {
+   # threshold2 <- groupss2$RRS[k]
+    #groupss2$new <- ifelse(groupss2$RRS > threshold2, 1,2)
+  #  subset2 <- subset(groupss2, new == 2)
+  #  res.cox2 <- coxph(Surv(time, status) ~ therapy, data = subset2)
+   # HR2 <- exp(res.cox2$coefficients)
+  #}
+#})
+
+#res <- t(as.data.frame(threshold_analysis, check.names = FALSE))
+#as.data.frame(res)
+#y_min <- min(res[,"therapy"])
+```
+
+
+```{r}
+#threshold_analysis <- lapply(i<- 15:320, function(i) {
+ #  threshold <- df_sorted_names_asc$RRS[i]
+ #  train$group <- ifelse(train$RRS >= threshold,1,2)
+ #  subset1 <- subset(train, group == 1)
+ #  data.survdiff <- survdiff(Surv(time, status) ~ therapy, data= subset1)
+ #  p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
+ #  HR = (data.survdiff$obs[2]/data.survdiff$exp[2])/(data.survdiff$obs[1]/data.survdiff$exp[1])
+ #  tbl = table(train$time,train$therapy)
+ #  y <- chisq.test(tbl)
+ #  chisq <- y[["statistic"]][["X-squared"]]
+ #  subset2 <- subset(train, group == 2)
+ #  data.survdiff1 <- survdiff(Surv(time, status) ~ therapy, data= subset2)
+ #  p.val2 = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
+ #  HR2= (data.survdiff$obs[2]/data.survdiff$exp[2])/(data.survdiff$obs[1]/data.survdiff$exp[1])
+ #  tbl = table(train$time,train$therapy)
+ #  y2 <- chisq.test(tbl)
+  # chisq2 <- y[["statistic"]][["X-squared"]]
+   
+   #res <- c(HR, p.val,chisq, threshold)
+   
+#   res <- c(HR, p.val,chisq, threshold)
+#   return(res)
+
+#})
+
+#res2 <- t(as.data.frame(threshold_analysis, check.names = FALSE))
+#as.data.frame(res2)
+#y_min <- min(res[,"therapy"])
+```
+
+
+
+```{r}
+
+threshold1 <- -0.09316364
+#threshold1 <- 0.0280
+
+threshold1
+#threshold2 <- 0.0987032923
+#threshold2 <- 0.1324831611
+threshold2 <- 0.0987032923
+```
+
+```{r}
+train$group <- 'Group3'
+train$group[train$QuRiS < threshold2 & train$QuRiS >= threshold1] <- 'Group2'
+train$group[train$QuRiS  < threshold1] <- 'Group1'
+
+```
+
+
+```{r}
+subset1 <- subset(train, group == "Group1")
+dim(subset1)
+aggregate(subset1[,], list(subset1$therapy), mean)$time
+```
+
+```{r}
+res.cox <- coxph(Surv(time, status) ~ therapy, data = subset1)
+summary(res.cox)
+```
+
+```{r}
+variables <- c("therapy")
+univ_formulas <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset1)})
+# Extract data 
+univ_results <- lapply(univ_models,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         wald.test<-signif(x$wald["test"], digits=4)
+                         beta<-signif(x$coef[1], digits=2);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, wald.test, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
+                                       "p.value")
+                         return(res)
+                         #return(exp(cbind(coef(x),confint(x))))
+                       })
+res <- t(as.data.frame(univ_results, check.names = FALSE))
+as.data.frame(res)
+```
+
+
+
+```{r}
+subset2 <- subset(train, group == "Group2")
+dim(subset2)
+aggregate(subset2[,], list(subset2$therapy), mean)$time
+```
+
+
+
+```{r}
+res.cox <- coxph(Surv(time, status) ~ therapy, data = subset2)
+summary(res.cox)
+```
+
+```{r}
+variables <- c("therapy")
+univ_formulas <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset2)})
+# Extract data 
+univ_results <- lapply(univ_models,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         wald.test<-signif(x$wald["test"], digits=4)
+                         beta<-signif(x$coef[1], digits=2);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, wald.test, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
+                                       "p.value")
+                         return(res)
+                         #return(exp(cbind(coef(x),confint(x))))
+                       })
+res <- t(as.data.frame(univ_results, check.names = FALSE))
+as.data.frame(res)
+```
+
+
+```{r}
+subset31 <- subset(train, group == "Group3")
+dim(subset31)
+aggregate(subset31[,], list(subset31$therapy), mean)$time
+```
+
+```{r}
+subset3 <- with(subset31, subset(subset31, !subset31$Cases %in% subset31$Cases[is.na(time)]))
+```
+
+
+
+```{r}
+res.cox <- coxph(Surv(time, status) ~ therapy, data = subset3)
+summary(res.cox)
+```
+```{r}
+variables <- c("therapy")
+univ_formulas <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset3)})
+# Extract data 
+univ_results <- lapply(univ_models,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         wald.test<-signif(x$wald["test"], digits=4)
+                         beta<-signif(x$coef[1], digits=2);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, wald.test, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
+                                       "p.value")
+                         return(res)
+                         #return(exp(cbind(coef(x),confint(x))))
+                       })
+res <- t(as.data.frame(univ_results, check.names = FALSE))
+as.data.frame(res)
+```
+
+
+```{r}
+fit2 <- survfit(Surv(time, status) ~ therapy, data = subset3)
+
+ggsurvplot(
+   fit2,                     # survfit object with calculated statistics.
+   data = subset3,             # data used to fit survival curves.
+   size = 1.2,
+ #  palette = "jco", 
+   palette = c("#0080FF","#FFCC33"),
+   risk.table = TRUE,       # show risk table.
+   pval = TRUE,             # show p-value of log-rank test..
+   xlab = "Time in months"   # customize X axis label.
+)
+```
+
+```{r}
+data.survdiff <- survdiff(Surv(time, status) ~ therapy, data= subset3)
+
+p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
+HR = (data.survdiff$obs[2]/data.survdiff$exp[2])/(data.survdiff$obs[1]/data.survdiff$exp[1])
+up95 = exp(log(HR) + qnorm(0.975)*sqrt(1/data.survdiff$exp[2]+1/data.survdiff$exp[1]))
+low95 = exp(log(HR) - qnorm(0.975)*sqrt(1/data.survdiff$exp[2]+1/data.survdiff$exp[1]))
+
+data.survdiff
+HR
+up95
+low95
+p.val
+```
+
+# TCGA DATASET
+```{r train_set}
+TCGA1 <- read.csv("dataset_nsclc1.csv")
+UPeNN1 <- read.csv("dataset_upenn1.csv")
+
+TCGA <- rbind(TCGA1, UPeNN1)
+#train <- na.omit(train1)
+TCGA %>%
+  select(time) %>%
+  summary()
+```
+
+
+```{r}
+TCGA$group <- 'Group1'
+TCGA$group[TCGA$RRS < threshold2 & TCGA$RRS >= threshold1] <- 'Group2'
+TCGA$group[TCGA$RRS  >= threshold2] <- 'Group3'
+
+```
+
+
+```{r}
+subset1 <- subset(TCGA, group == "Group1")
+dim(subset1)
+aggregate(subset1[,], list(subset1$therapy), mean)$time
+```
+
+```{r}
+res.cox <- coxph(Surv(time, status) ~ therapy, data = subset1)
+summary(res.cox)
+```
+```{r}
+variables <- c("therapy")
+univ_formulas <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset1)})
+# Extract data 
+univ_results <- lapply(univ_models,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         wald.test<-signif(x$wald["test"], digits=4)
+                         beta<-signif(x$coef[1], digits=2);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, wald.test, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
+                                       "p.value")
+                         return(res)
+                         #return(exp(cbind(coef(x),confint(x))))
+                       })
+res <- t(as.data.frame(univ_results, check.names = FALSE))
+as.data.frame(res)
+```
+
+
+```{r}
+subset2 <- subset(TCGA, group == "Group2")
+dim(subset2)
+aggregate(subset2[,], list(subset2$therapy), mean)$time
+```
+
+
+
+```{r}
+res.cox <- coxph(Surv(time, status) ~ therapy, data = subset2)
+summary(res.cox)
+```
+```{r}
+variables <- c("therapy")
+univ_formulas <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset2)})
+# Extract data 
+univ_results <- lapply(univ_models,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         wald.test<-signif(x$wald["test"], digits=4)
+                         beta<-signif(x$coef[1], digits=2);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, wald.test, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
+                                       "p.value")
+                         return(res)
+                         #return(exp(cbind(coef(x),confint(x))))
+                       })
+res <- t(as.data.frame(univ_results, check.names = FALSE))
+as.data.frame(res)
+```
+
+
+```{r}
+subset31 <- subset(TCGA, group == "Group3")
+subset3 <- with(subset31, subset(subset31, !subset31$Cases %in% subset31$Cases[is.na(time)]))
+dim(subset3)
+aggregate(subset3[,], list(subset3$therapy), mean)$time
+```
+
+```{r}
+res.cox <- coxph(Surv(time, status) ~ therapy, data = subset3)
+summary(res.cox)
+```
+
+```{r}
+variables <- c("therapy")
+univ_formulas <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset3)})
+# Extract data 
+univ_results <- lapply(univ_models,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         wald.test<-signif(x$wald["test"], digits=4)
+                         beta<-signif(x$coef[1], digits=2);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, wald.test, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
+                                       "p.value")
+                         return(res)
+                         #return(exp(cbind(coef(x),confint(x))))
+                       })
+res <- t(as.data.frame(univ_results, check.names = FALSE))
+as.data.frame(res)
+```
+
+
+```{r}
+fit2 <- survfit(Surv(time, status) ~ therapy, data = subset3)
+
+ggsurvplot(
+   fit2,                     # survfit object with calculated statistics.
+   data = subset3,             # data used to fit survival curves.
+   size = 1.2,
+   palette = c("#0080FF","#FFCC33"),
+   risk.table = TRUE,       # show risk table.
+   pval = TRUE,             # show p-value of log-rank test..
+   xlab = "Time in months"   # customize X axis label.
+)
+```
+```{r}
+data.survdiff <- survdiff(Surv(time, status) ~ therapy, data= subset3)
+
+p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
+HR = (data.survdiff$obs[2]/data.survdiff$exp[2])/(data.survdiff$obs[1]/data.survdiff$exp[1])
+up95 = exp(log(HR) + qnorm(0.975)*sqrt(1/data.survdiff$exp[2]+1/data.survdiff$exp[1]))
+low95 = exp(log(HR) - qnorm(0.975)*sqrt(1/data.survdiff$exp[2]+1/data.survdiff$exp[1]))
+
+data.survdiff
+HR
+up95
+low95
+p.val
+```
+
+
+```{r}
+setEPS()
+postscript("QuRiS_ForestPlot_validation.eps")
+library(forestplot)
+# Cochrane data from the 'rmeta'-package
+cochrane_from_rmeta <- 
+  structure(list(
+    mean  = c(NA, NA, 0.07, 2.13, 2.24),
+    lower = c(NA, NA, 0.01, 0.99, 0.95),
+    upper = c(NA, NA, 0.41, 4.54, 5.30)),
+    .Names = c("mean", "lower", "upper"), 
+    row.names = c(NA, -11L), 
+    class = "data.frame")
+
+tabletext<-cbind(
+  c("", "QuRiS","QH","QI", "QL"),
+  c("Treatment", "Chemo", "33.55", "24.15", "32.61"),
+  c("Treatment","Non-Chemo","10.86", "37.04", "34.08"),
+  c("", "P Value", "<0.001", "0.05","0.06"))
+
+forestplot(tabletext,
+                   cochrane_from_rmeta$mean,
+                   cochrane_from_rmeta$lower,
+                   cochrane_from_rmeta$upper,new_page = TRUE,
+           txt_gp = fpTxtGp(label = gpar(fontfamily = "Arial"), cex = 0.8),
+           hrzl_lines = gpar(col="#444444"),
+           graph.pos = 3,
+          colgap=unit(c(2),"mm"),
+          lineheight = unit(c(15),"mm"),
+      #     cochrane_from_rmeta,new_page = TRUE,
+           is.summary=c(TRUE,TRUE,rep(FALSE,4)),
+           align = c("c", "c", "c", "c"),
+           clip=c(0.05,2.5), 
+           xlog=TRUE,
+         #xticks = c(0.1,1,1.1,1.2,1.3,1.4) ,
+           col=fpColors(box="blue",line="darkblue", hrz_lines = "#444444"),
+           vertices=FALSE,
+          fn.ci_norm=c("fpDrawDiamondCI"),
+                 boxsize=0.2,
+           cex.axis=8,
+          xlab="Hazard Ratios")
+
+x <- unit(0.5, 'npc'); y <- unit(.96, 'npc')
+grid.text('D2+D3:Disease Free Survival', x, y, gp = gpar(fontsize=15, font=2))
+
+# Fix manually the position of the HR numbers
+#xhr <- unit(0.68, 'npc'); yhr <- unit(0.65, 'npc')
+#grid.text('0.09', xhr, yhr, gp = gpar(fontsize=10, font=1))
+
+#xh2r <- unit(0.75, 'npc'); yh2r <- unit(0.52, 'npc')
+#grid.text('0.19', xh2r, yh2r, gp = #gpar(fontsize=10, font=1))
+
+#xl2r <- unit(0.80, 'npc'); yl2r <- unit(0.38, 'npc')
+#grid.text('0.57', xl2r, yl2r, gp = gpar(fontsize=10, font=1))
+
+#xlr <- unit(0.82, 'npc'); ylr <- unit(0.25, 'npc')
+#grid.text('1.9', xlr, ylr, gp = gpar(fontsize=10, font=1))
+
+#xl2r <- unit(0.92, 'npc'); yl2r <- unit(0.25, 'npc')
+#grid.text('13', xl2r, yl2r, gp = gpar(fontsize=10, font=1))
+dev.off()
+```
+```{r}
+setEPS()
+postscript("QuRiS_ForestPlot_training.eps")
+library(forestplot)
+# Cochrane data from the 'rmeta'-package
+cochrane_from_rmeta <- 
+  structure(list(
+    mean  = c(NA, NA, 0.27, 1.32, 2.15),
+    lower = c(NA, NA, 0.07, 0.73, 1.13),
+    upper = c(NA, NA, 0.95, 2.38, 4.08)),
+    .Names = c("mean", "lower", "upper"), 
+    row.names = c(NA, -11L), 
+    class = "data.frame")
+
+tabletext<-cbind(
+  c("", "QuRiS","QH","QI", "QL"),
+  c("Treatment", "Chemo", "46.19", "34.32", "40.03"),
+  c("Treatment","Non-Chemo","39.78", "37.07", "44.26"),
+  c("", "P Value", "0.03", "0.4","0.03"))
+
+forestplot(tabletext,
+                   cochrane_from_rmeta$mean,
+                   cochrane_from_rmeta$lower,
+                   cochrane_from_rmeta$upper,new_page = TRUE,
+           txt_gp = fpTxtGp(label = gpar(fontfamily = "Arial"), cex = 0.8),
+           hrzl_lines = gpar(col="#444444"),
+           graph.pos = 3,
+          colgap=unit(c(2),"mm"),
+          lineheight = unit(c(15),"mm"),
+      #     cochrane_from_rmeta,new_page = TRUE,
+           is.summary=c(TRUE,TRUE,rep(FALSE,4)),
+           align = c("c", "c", "c", "c"),
+           clip=c(0.05,2.5), 
+           xlog=TRUE,
+         #xticks = c(0.1,1,1.1,1.2,1.3,1.4) ,
+           col=fpColors(box="blue",line="darkblue", hrz_lines = "#444444"),
+           vertices=FALSE,
+          fn.ci_norm=c("fpDrawDiamondCI"),
+                 boxsize=0.2,
+           cex.axis=8,
+          xlab="Hazard Ratios")
+
+x <- unit(0.5, 'npc'); y <- unit(.96, 'npc')
+grid.text('D1:Disease Free Survival', x, y, gp = gpar(fontsize=15, font=2))
+
+# Fix manually the position of the HR numbers
+#xhr <- unit(0.68, 'npc'); yhr <- unit(0.65, 'npc')
+#grid.text('0.09', xhr, yhr, gp = gpar(fontsize=10, font=1))
+
+#xh2r <- unit(0.75, 'npc'); yh2r <- unit(0.52, 'npc')
+#grid.text('0.19', xh2r, yh2r, gp = #gpar(fontsize=10, font=1))
+
+#xl2r <- unit(0.80, 'npc'); yl2r <- unit(0.38, 'npc')
+#grid.text('0.57', xl2r, yl2r, gp = gpar(fontsize=10, font=1))
+
+#xlr <- unit(0.82, 'npc'); ylr <- unit(0.25, 'npc')
+#grid.text('1.9', xlr, ylr, gp = gpar(fontsize=10, font=1))
+
+#xl2r <- unit(0.92, 'npc'); yl2r <- unit(0.25, 'npc')
+#grid.text('13', xl2r, yl2r, gp = gpar(fontsize=10, font=1))
+dev.off()
+```
+