[aa8877]: / RRS_Predictive_V3.Rmd

Download this file

589 lines (486 with data), 20.3 kB

---
title: "Predictive Signature"
author: "Pranjal"
date: "2/21/2019"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Initial Setup and Package Loads in R 

Packages used for the analysis.
```{r initial_setup, cache=FALSE, message = FALSE, warning = FALSE}
library(glmnet);library(survival);library(survminer);library(readxl);library(ggplot2); library(GGally)
library(knitr); library(rmdformats); library(magrittr)
library(skimr); library(Hmisc); library(Epi); library(vcd)
library(tidyverse) 

source("Love-boost.R")

## Global options

options(max.print="75")
opts_chunk$set(comment=NA,
               message=FALSE,
               warning=FALSE)
opts_knit$set(width=75)


skimr::skim_with(numeric = list(hist = NULL),
                 integer = list(hist = NULL))
```

## Loading the Raw Data into R 

Loading raw dataset into R.

Training Data from CCF with minimum and max. survival time.
```{r}
train <- read.csv("dataset_train.csv")

train %>%
  select(time) %>%
  summary()
```


```{r}
#train2 <- with(train, subset(train, !train$Cases %in% train$Cases[is.na(time) | therapy == 1]))
```
```{r}
#df_sorted_names_asc <- train[with(train, order(RRS)), ]
```


```{r}
#threshold_analysis <- lapply(i<- 20:240, function(i) {
 # threshold <- df_sorted_names_asc$RRS[i]
  #train$group <- ifelse(train$RRS >= threshold,1,2)
#  subset1 <- subset(train, group == 1)
#  res.cox <- coxph(Surv(time, status) ~ therapy, data = subset1)
#  HR1 <- exp(res.cox$coefficients)
  
  
 # groupss2 <- subset(train, group == 2)
  
  #for (k in 4:length(groupss2-8)) {
   # threshold2 <- groupss2$RRS[k]
    #groupss2$new <- ifelse(groupss2$RRS > threshold2, 1,2)
  #  subset2 <- subset(groupss2, new == 2)
  #  res.cox2 <- coxph(Surv(time, status) ~ therapy, data = subset2)
   # HR2 <- exp(res.cox2$coefficients)
  #}
#})

#res <- t(as.data.frame(threshold_analysis, check.names = FALSE))
#as.data.frame(res)
#y_min <- min(res[,"therapy"])
```


```{r}
#threshold_analysis <- lapply(i<- 15:320, function(i) {
 #  threshold <- df_sorted_names_asc$RRS[i]
 #  train$group <- ifelse(train$RRS >= threshold,1,2)
 #  subset1 <- subset(train, group == 1)
 #  data.survdiff <- survdiff(Surv(time, status) ~ therapy, data= subset1)
 #  p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
 #  HR = (data.survdiff$obs[2]/data.survdiff$exp[2])/(data.survdiff$obs[1]/data.survdiff$exp[1])
 #  tbl = table(train$time,train$therapy)
 #  y <- chisq.test(tbl)
 #  chisq <- y[["statistic"]][["X-squared"]]
 #  subset2 <- subset(train, group == 2)
 #  data.survdiff1 <- survdiff(Surv(time, status) ~ therapy, data= subset2)
 #  p.val2 = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
 #  HR2= (data.survdiff$obs[2]/data.survdiff$exp[2])/(data.survdiff$obs[1]/data.survdiff$exp[1])
 #  tbl = table(train$time,train$therapy)
 #  y2 <- chisq.test(tbl)
  # chisq2 <- y[["statistic"]][["X-squared"]]
   
   #res <- c(HR, p.val,chisq, threshold)
   
#   res <- c(HR, p.val,chisq, threshold)
#   return(res)

#})

#res2 <- t(as.data.frame(threshold_analysis, check.names = FALSE))
#as.data.frame(res2)
#y_min <- min(res[,"therapy"])
```



```{r}

threshold1 <- -0.09316364
#threshold1 <- 0.0280

threshold1
#threshold2 <- 0.0987032923
#threshold2 <- 0.1324831611
threshold2 <- 0.0987032923
```

```{r}
train$group <- 'Group3'
train$group[train$QuRiS < threshold2 & train$QuRiS >= threshold1] <- 'Group2'
train$group[train$QuRiS  < threshold1] <- 'Group1'

```


```{r}
subset1 <- subset(train, group == "Group1")
dim(subset1)
aggregate(subset1[,], list(subset1$therapy), mean)$time
```

```{r}
res.cox <- coxph(Surv(time, status) ~ therapy, data = subset1)
summary(res.cox)
```

```{r}
variables <- c("therapy")
univ_formulas <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset1)})
# Extract data 
univ_results <- lapply(univ_models,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         wald.test<-signif(x$wald["test"], digits=4)
                         beta<-signif(x$coef[1], digits=2);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, wald.test, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
                                       "p.value")
                         return(res)
                         #return(exp(cbind(coef(x),confint(x))))
                       })
res <- t(as.data.frame(univ_results, check.names = FALSE))
as.data.frame(res)
```



```{r}
subset2 <- subset(train, group == "Group2")
dim(subset2)
aggregate(subset2[,], list(subset2$therapy), mean)$time
```



```{r}
res.cox <- coxph(Surv(time, status) ~ therapy, data = subset2)
summary(res.cox)
```

```{r}
variables <- c("therapy")
univ_formulas <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset2)})
# Extract data 
univ_results <- lapply(univ_models,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         wald.test<-signif(x$wald["test"], digits=4)
                         beta<-signif(x$coef[1], digits=2);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, wald.test, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
                                       "p.value")
                         return(res)
                         #return(exp(cbind(coef(x),confint(x))))
                       })
res <- t(as.data.frame(univ_results, check.names = FALSE))
as.data.frame(res)
```


```{r}
subset31 <- subset(train, group == "Group3")
dim(subset31)
aggregate(subset31[,], list(subset31$therapy), mean)$time
```

```{r}
subset3 <- with(subset31, subset(subset31, !subset31$Cases %in% subset31$Cases[is.na(time)]))
```



```{r}
res.cox <- coxph(Surv(time, status) ~ therapy, data = subset3)
summary(res.cox)
```
```{r}
variables <- c("therapy")
univ_formulas <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset3)})
# Extract data 
univ_results <- lapply(univ_models,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         wald.test<-signif(x$wald["test"], digits=4)
                         beta<-signif(x$coef[1], digits=2);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, wald.test, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
                                       "p.value")
                         return(res)
                         #return(exp(cbind(coef(x),confint(x))))
                       })
res <- t(as.data.frame(univ_results, check.names = FALSE))
as.data.frame(res)
```


```{r}
fit2 <- survfit(Surv(time, status) ~ therapy, data = subset3)

ggsurvplot(
   fit2,                     # survfit object with calculated statistics.
   data = subset3,             # data used to fit survival curves.
   size = 1.2,
 #  palette = "jco", 
   palette = c("#0080FF","#FFCC33"),
   risk.table = TRUE,       # show risk table.
   pval = TRUE,             # show p-value of log-rank test..
   xlab = "Time in months"   # customize X axis label.
)
```

```{r}
data.survdiff <- survdiff(Surv(time, status) ~ therapy, data= subset3)

p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
HR = (data.survdiff$obs[2]/data.survdiff$exp[2])/(data.survdiff$obs[1]/data.survdiff$exp[1])
up95 = exp(log(HR) + qnorm(0.975)*sqrt(1/data.survdiff$exp[2]+1/data.survdiff$exp[1]))
low95 = exp(log(HR) - qnorm(0.975)*sqrt(1/data.survdiff$exp[2]+1/data.survdiff$exp[1]))

data.survdiff
HR
up95
low95
p.val
```

# TCGA DATASET
```{r train_set}
TCGA1 <- read.csv("dataset_nsclc1.csv")
UPeNN1 <- read.csv("dataset_upenn1.csv")

TCGA <- rbind(TCGA1, UPeNN1)
#train <- na.omit(train1)
TCGA %>%
  select(time) %>%
  summary()
```


```{r}
TCGA$group <- 'Group1'
TCGA$group[TCGA$RRS < threshold2 & TCGA$RRS >= threshold1] <- 'Group2'
TCGA$group[TCGA$RRS  >= threshold2] <- 'Group3'

```


```{r}
subset1 <- subset(TCGA, group == "Group1")
dim(subset1)
aggregate(subset1[,], list(subset1$therapy), mean)$time
```

```{r}
res.cox <- coxph(Surv(time, status) ~ therapy, data = subset1)
summary(res.cox)
```
```{r}
variables <- c("therapy")
univ_formulas <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset1)})
# Extract data 
univ_results <- lapply(univ_models,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         wald.test<-signif(x$wald["test"], digits=4)
                         beta<-signif(x$coef[1], digits=2);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, wald.test, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
                                       "p.value")
                         return(res)
                         #return(exp(cbind(coef(x),confint(x))))
                       })
res <- t(as.data.frame(univ_results, check.names = FALSE))
as.data.frame(res)
```


```{r}
subset2 <- subset(TCGA, group == "Group2")
dim(subset2)
aggregate(subset2[,], list(subset2$therapy), mean)$time
```



```{r}
res.cox <- coxph(Surv(time, status) ~ therapy, data = subset2)
summary(res.cox)
```
```{r}
variables <- c("therapy")
univ_formulas <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset2)})
# Extract data 
univ_results <- lapply(univ_models,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         wald.test<-signif(x$wald["test"], digits=4)
                         beta<-signif(x$coef[1], digits=2);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, wald.test, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
                                       "p.value")
                         return(res)
                         #return(exp(cbind(coef(x),confint(x))))
                       })
res <- t(as.data.frame(univ_results, check.names = FALSE))
as.data.frame(res)
```


```{r}
subset31 <- subset(TCGA, group == "Group3")
subset3 <- with(subset31, subset(subset31, !subset31$Cases %in% subset31$Cases[is.na(time)]))
dim(subset3)
aggregate(subset3[,], list(subset3$therapy), mean)$time
```

```{r}
res.cox <- coxph(Surv(time, status) ~ therapy, data = subset3)
summary(res.cox)
```

```{r}
variables <- c("therapy")
univ_formulas <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univ_models <- lapply(univ_formulas, function(x){coxph(x, data = subset3)})
# Extract data 
univ_results <- lapply(univ_models,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         wald.test<-signif(x$wald["test"], digits=4)
                         beta<-signif(x$coef[1], digits=2);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 4)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, wald.test, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", "wald.test", 
                                       "p.value")
                         return(res)
                         #return(exp(cbind(coef(x),confint(x))))
                       })
res <- t(as.data.frame(univ_results, check.names = FALSE))
as.data.frame(res)
```


```{r}
fit2 <- survfit(Surv(time, status) ~ therapy, data = subset3)

ggsurvplot(
   fit2,                     # survfit object with calculated statistics.
   data = subset3,             # data used to fit survival curves.
   size = 1.2,
   palette = c("#0080FF","#FFCC33"),
   risk.table = TRUE,       # show risk table.
   pval = TRUE,             # show p-value of log-rank test..
   xlab = "Time in months"   # customize X axis label.
)
```
```{r}
data.survdiff <- survdiff(Surv(time, status) ~ therapy, data= subset3)

p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
HR = (data.survdiff$obs[2]/data.survdiff$exp[2])/(data.survdiff$obs[1]/data.survdiff$exp[1])
up95 = exp(log(HR) + qnorm(0.975)*sqrt(1/data.survdiff$exp[2]+1/data.survdiff$exp[1]))
low95 = exp(log(HR) - qnorm(0.975)*sqrt(1/data.survdiff$exp[2]+1/data.survdiff$exp[1]))

data.survdiff
HR
up95
low95
p.val
```


```{r}
setEPS()
postscript("QuRiS_ForestPlot_validation.eps")
library(forestplot)
# Cochrane data from the 'rmeta'-package
cochrane_from_rmeta <- 
  structure(list(
    mean  = c(NA, NA, 0.07, 2.13, 2.24),
    lower = c(NA, NA, 0.01, 0.99, 0.95),
    upper = c(NA, NA, 0.41, 4.54, 5.30)),
    .Names = c("mean", "lower", "upper"), 
    row.names = c(NA, -11L), 
    class = "data.frame")

tabletext<-cbind(
  c("", "QuRiS","QH","QI", "QL"),
  c("Treatment", "Chemo", "33.55", "24.15", "32.61"),
  c("Treatment","Non-Chemo","10.86", "37.04", "34.08"),
  c("", "P Value", "<0.001", "0.05","0.06"))

forestplot(tabletext,
                   cochrane_from_rmeta$mean,
                   cochrane_from_rmeta$lower,
                   cochrane_from_rmeta$upper,new_page = TRUE,
           txt_gp = fpTxtGp(label = gpar(fontfamily = "Arial"), cex = 0.8),
           hrzl_lines = gpar(col="#444444"),
           graph.pos = 3,
          colgap=unit(c(2),"mm"),
          lineheight = unit(c(15),"mm"),
      #     cochrane_from_rmeta,new_page = TRUE,
           is.summary=c(TRUE,TRUE,rep(FALSE,4)),
           align = c("c", "c", "c", "c"),
           clip=c(0.05,2.5), 
           xlog=TRUE,
         #xticks = c(0.1,1,1.1,1.2,1.3,1.4) ,
           col=fpColors(box="blue",line="darkblue", hrz_lines = "#444444"),
           vertices=FALSE,
          fn.ci_norm=c("fpDrawDiamondCI"),
                 boxsize=0.2,
           cex.axis=8,
          xlab="Hazard Ratios")

x <- unit(0.5, 'npc'); y <- unit(.96, 'npc')
grid.text('D2+D3:Disease Free Survival', x, y, gp = gpar(fontsize=15, font=2))

# Fix manually the position of the HR numbers
#xhr <- unit(0.68, 'npc'); yhr <- unit(0.65, 'npc')
#grid.text('0.09', xhr, yhr, gp = gpar(fontsize=10, font=1))

#xh2r <- unit(0.75, 'npc'); yh2r <- unit(0.52, 'npc')
#grid.text('0.19', xh2r, yh2r, gp = #gpar(fontsize=10, font=1))

#xl2r <- unit(0.80, 'npc'); yl2r <- unit(0.38, 'npc')
#grid.text('0.57', xl2r, yl2r, gp = gpar(fontsize=10, font=1))

#xlr <- unit(0.82, 'npc'); ylr <- unit(0.25, 'npc')
#grid.text('1.9', xlr, ylr, gp = gpar(fontsize=10, font=1))

#xl2r <- unit(0.92, 'npc'); yl2r <- unit(0.25, 'npc')
#grid.text('13', xl2r, yl2r, gp = gpar(fontsize=10, font=1))
dev.off()
```
```{r}
setEPS()
postscript("QuRiS_ForestPlot_training.eps")
library(forestplot)
# Cochrane data from the 'rmeta'-package
cochrane_from_rmeta <- 
  structure(list(
    mean  = c(NA, NA, 0.27, 1.32, 2.15),
    lower = c(NA, NA, 0.07, 0.73, 1.13),
    upper = c(NA, NA, 0.95, 2.38, 4.08)),
    .Names = c("mean", "lower", "upper"), 
    row.names = c(NA, -11L), 
    class = "data.frame")

tabletext<-cbind(
  c("", "QuRiS","QH","QI", "QL"),
  c("Treatment", "Chemo", "46.19", "34.32", "40.03"),
  c("Treatment","Non-Chemo","39.78", "37.07", "44.26"),
  c("", "P Value", "0.03", "0.4","0.03"))

forestplot(tabletext,
                   cochrane_from_rmeta$mean,
                   cochrane_from_rmeta$lower,
                   cochrane_from_rmeta$upper,new_page = TRUE,
           txt_gp = fpTxtGp(label = gpar(fontfamily = "Arial"), cex = 0.8),
           hrzl_lines = gpar(col="#444444"),
           graph.pos = 3,
          colgap=unit(c(2),"mm"),
          lineheight = unit(c(15),"mm"),
      #     cochrane_from_rmeta,new_page = TRUE,
           is.summary=c(TRUE,TRUE,rep(FALSE,4)),
           align = c("c", "c", "c", "c"),
           clip=c(0.05,2.5), 
           xlog=TRUE,
         #xticks = c(0.1,1,1.1,1.2,1.3,1.4) ,
           col=fpColors(box="blue",line="darkblue", hrz_lines = "#444444"),
           vertices=FALSE,
          fn.ci_norm=c("fpDrawDiamondCI"),
                 boxsize=0.2,
           cex.axis=8,
          xlab="Hazard Ratios")

x <- unit(0.5, 'npc'); y <- unit(.96, 'npc')
grid.text('D1:Disease Free Survival', x, y, gp = gpar(fontsize=15, font=2))

# Fix manually the position of the HR numbers
#xhr <- unit(0.68, 'npc'); yhr <- unit(0.65, 'npc')
#grid.text('0.09', xhr, yhr, gp = gpar(fontsize=10, font=1))

#xh2r <- unit(0.75, 'npc'); yh2r <- unit(0.52, 'npc')
#grid.text('0.19', xh2r, yh2r, gp = #gpar(fontsize=10, font=1))

#xl2r <- unit(0.80, 'npc'); yl2r <- unit(0.38, 'npc')
#grid.text('0.57', xl2r, yl2r, gp = gpar(fontsize=10, font=1))

#xlr <- unit(0.82, 'npc'); ylr <- unit(0.25, 'npc')
#grid.text('1.9', xlr, ylr, gp = gpar(fontsize=10, font=1))

#xl2r <- unit(0.92, 'npc'); yl2r <- unit(0.25, 'npc')
#grid.text('13', xl2r, yl2r, gp = gpar(fontsize=10, font=1))
dev.off()
```