--- a
+++ b/QuRiS_construction.Rmd
@@ -0,0 +1,314 @@
+---
+title: "Prognostic Predictive Signature"
+author: "Pranjal Vaidya"
+date: "`r Sys.Date()`"
+linkcolor: blue
+output:
+  rmdformats::readthedown:
+    highlight: kate
+    number_sections: true
+    code_folding: show
+---
+
+# Introduction 
+
+Generating Radiomic Signature based on the LASSO GLMNET model for predicting Disease-Free Survival in Early-Stage NSCLC. 
+
+# Data Load/Merge
+
+Initial Setup and Package Loads in R 
+
+Packages used for the analysis.
+```{r initial_setup, cache=FALSE, message = FALSE, warning = FALSE}
+library(glmnet);library(survival);library(survminer);library(readxl);library(ggplot2); library(GGally);library(knitr); library(rmdformats); library(magrittr);
+library(skimr); library(Hmisc); library(Epi); library(vcd); library(tidyverse) 
+
+## Global options
+
+options(max.print="75")
+opts_chunk$set(comment=NA,
+               message=FALSE,
+               warning=FALSE)
+opts_knit$set(width=75)
+
+
+skimr::skim_with(numeric = list(hist = NULL),
+                 integer = list(hist = NULL))
+```
+
+## Loading the Raw Data into R 
+
+Loading raw dataset into R.
+
+```{r load_train_set}
+train <- read.csv("sample.csv")
+train %>%
+  select(time) %>%
+  summary()
+```
+
+## Feature Selection 
+
+LASSO Feature Selection Method.
+alpha=0 gives ridge regression and alpha =1 is LASSO regression.
+
+input to the cross validation LASSO model
+x1: Feature Matrix (Rows: patients and col: features)
+y1: time (months/days/years), status_vector(1:censor, 2:dead)
+
+output: 
+10 fold cross-validation LASSO plot 
+```{r Feature Selection}
+x1 <- train[,(8:length(train))]
+x <- data.matrix(x1, rownames.force = NA)
+y1 <- train[,(2:3)]
+y <- data.matrix(y1, rownames.force = NA)
+
+
+library(survival)
+cvfit = cv.glmnet(x, y,family = 'cox',alpha=0.1)
+plot(cvfit)
+```
+
+Seelcting top features.
+output:
+coeffs_arranges: matrix of name of features and corresponding coefficients
+names: top feature names
+my_coefficients: coefficients of top features
+
+```{r top_features}
+tmp_coeffs <- coef(cvfit, s = "lambda.min")
+non_zero_coefs <- data.frame(name = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1], coefficient = (tmp_coeffs@x))
+coeffs_arranges <- non_zero_coefs[order(-non_zero_coefs$coefficient),]
+names <- coeffs_arranges[,1]
+my_coefficients <- (coeffs_arranges[,2])
+my_coefficients
+```
+
+
+# Contruction of QuRiS
+
+Multivariate Analysis with LASSO top features on training cohort. 
+```{r Mulivariate_all_features}
+varnames = sapply(1:length(names), function(i){
+  (paste0(names[i]))
+})
+
+iformula <- as.formula(sprintf("Surv(time, status) ~ %s ", paste(varnames, collapse='+')))  
+res.cox <- coxph(iformula, data = train)
+summary(res.cox)
+```
+
+Creating a signature using LASSO coefficients: 
+
+```{r creating_signature}
+train_set <- lapply(1:length(varnames), function(i) {
+  ifor <-  my_coefficients[i] 
+  k <- (sprintf("%s", varnames[i]))
+  feature_list <- train[,k]
+  value11 <- feature_list*ifor
+  df <- data.frame(value11)
+})
+
+
+store <- data.frame(train_set)
+QuRiS <- rowSums(store)
+```
+
+
+# Entire dataset
+
+## Multivariate Model
+Multivariate analysis with the signature and calculating CI with signature alone:
+
+```{r signature}
+train <- cbind(train, QuRiS)
+
+res.cox <- coxph(Surv(time, status) ~ QuRiS, data = train)
+summary(res.cox)
+res.cox[["coefficients"]]
+```
+
+Threshold was selected usign the MATLAB function in the repository. For example here, we used just the median value of the constructed QuRiS 
+```{r dividing_data_based_median}
+threshold <- median(train$QuRiS)
+train$group <- 'Group1'
+train$group[train$QuRiS >= threshold] <- 'Group2'
+threshold
+```
+
+## Kaplan_Meier Survival Curve
+Kaplan-Meier plot based on predicted high and low risk groups
+```{r KM plot for training}
+fit2 <- survfit(Surv(time, status) ~ group, data = train)
+
+ggsurvplot(
+   fit2,                     # survfit object with calculated statistics.
+   data = train,             # data used to fit survival curves.
+   size = 1.2,
+   palette = c("#FFCC33", "#0080FF"),
+   risk.table = TRUE,       # show risk table.
+   pval = TRUE,             # show p-value of log-rank test..
+   xlab = "Time in months" ,  # customize X axis label.
+   cumcensor = TRUE      # plot the number of censored subjects at time t
+ #  ncensor.plot.height = 0.25
+)
+
+```
+
+## Hazard Ratio
+```{r hazard ratio}
+variables <- c("group")
+formula <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univariate_analysis <- lapply(formula, function(x){coxph(x, data = train)})
+# Extract data 
+results <- lapply(univariate_analysis,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         beta<-signif(x$coef[1], digits=4);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", 
+                                       "p.value")
+                         return(res)
+                       })
+res <- t(as.data.frame(results, check.names = FALSE))
+as.data.frame(res)
+```
+
+```{r}
+write.csv(train, file = "dataset_train.csv")
+```
+
+# Surgery Alone group (therapy = 0)
+
+
+Taking only cases for which therapy is euqal to 0 (i.e. has not received any additional treatment)
+
+```{r surgery_group}
+surgery <- with(train, subset(train, !train$Cases %in% train$Cases[is.na(time) | therapy == 1]))
+```
+
+## Multivariate Model
+```{r multivariate_surgery}
+res.cox <- coxph(Surv(time, status) ~ QuRiS, data = surgery)
+summary(res.cox)
+res.cox[["coefficients"]]
+```
+
+## Kaplan-Meier Curve
+Kaplan-Meier plot based on predicted high and low risk groups
+```{r KM plot for surgery}
+fit2 <- survfit(Surv(time, status) ~ group, data = surgery)
+
+ggsurvplot(
+   fit2,                     # survfit object with calculated statistics.
+   data = surgery,             # data used to fit survival curves.
+   size = 1.2,
+   palette = c("#FFCC33", "#0080FF"),
+   risk.table = TRUE,       # show risk table.
+   pval = TRUE,             # show p-value of log-rank test..
+   xlab = "Time in months" ,  # customize X axis label.
+   cumcensor = TRUE      # plot the number of censored subjects at time t
+ #  ncensor.plot.height = 0.25
+)
+
+```
+
+## Hazard Ratio
+```{r hazard ratio surgery}
+variables <- c("group")
+formula <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univariate_analysis <- lapply(formula, function(x){coxph(x, data = surgery)})
+# Extract data 
+results <- lapply(univariate_analysis,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         beta<-signif(x$coef[1], digits=4);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", 
+                                       "p.value")
+                         return(res)
+                       })
+res <- t(as.data.frame(results, check.names = FALSE))
+as.data.frame(res)
+```
+
+
+# Adjuvant-Chemotherapy Group (therapy = 1)
+
+Taking only cases for whcih therapy is euqal to 1 (i.e. They received any additional treatment)
+
+```{r adj_chemo_group}
+adj_chemo <- with(train, subset(train, !train$Cases %in% train$Cases[is.na(time) | therapy == 0]))
+```
+
+## Multivariate Model
+```{r train_signature}
+res.cox <- coxph(Surv(time, status) ~ QuRiS, data = adj_chemo)
+summary(res.cox)
+res.cox[["coefficients"]]
+```
+
+## Kaplan-Meier Curve
+Kaplan-Meier plot based on predicted high and low risk groups
+```{r KM plot for adj_chemo}
+fit2 <- survfit(Surv(time, status) ~ group, data = adj_chemo)
+
+ggsurvplot(
+   fit2,                     # survfit object with calculated statistics.
+   data = adj_chemo,             # data used to fit survival curves.
+   size = 1.2,
+   palette = c("#FFCC33", "#0080FF"),
+   risk.table = TRUE,       # show risk table.
+   pval = TRUE,             # show p-value of log-rank test..
+   xlab = "Time in months" ,  # customize X axis label.
+   cumcensor = TRUE      # plot the number of censored subjects at time t
+ #  ncensor.plot.height = 0.25
+)
+
+```
+
+## Hazard Ratio
+
+```{r hazard_ratio_adj.chemo}
+variables <- c("group")
+formula <- sapply(variables,
+                        function(x) as.formula(paste('Surv(time, status)~', x)))
+
+univariate_analysis <- lapply(formula, function(x){coxph(x, data = surgery)})
+# Extract data 
+results <- lapply(univariate_analysis,
+                       function(x){ 
+                         x <- summary(x)
+                         p.value<-signif(x$wald["pvalue"], digits=4)
+                         beta<-signif(x$coef[1], digits=4);#coeficient beta
+                         HR <-signif(x$coef[2], digits=4);#exp(beta)
+                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
+                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
+                         HR <- paste0(HR, " (", 
+                                      HR.confint.lower, "-", HR.confint.upper, ")")
+                         res<-c(beta, HR, p.value)
+                         names(res)<-c("beta", "HR (95% CI for HR)", 
+                                       "p.value")
+                         return(res)
+                       })
+res <- t(as.data.frame(results, check.names = FALSE))
+as.data.frame(res)
+```
+