Prognostic Predictive Signature

1 Introduction
2 Data Load/Merge
- 2.1 Loading the Raw Data into R
- 2.2 Feature Selection
3 Contruction of QuRiS
4 Entire dataset
5 Surgery Alone group (therapy = 0)
6 Adjuvant-Chemotherapy Group (therapy = 1)

1 Introduction

Generating Radiomic Signature based on the LASSO GLMNET model for predicting Disease-Free Survival in Early-Stage NSCLC.

2 Data Load/Merge

Initial Setup and Package Loads in R

Packages used for the analysis.

library(glmnet);library(survival);library(survminer);library(readxl);library(ggplot2); library(GGally);library(knitr); library(rmdformats); library(magrittr);
library(skimr); library(Hmisc); library(Epi); library(vcd); library(tidyverse) 

## Global options

options(max.print="75")
opts_chunk$set(comment=NA,
               message=FALSE,
               warning=FALSE)
opts_knit$set(width=75)


skimr::skim_with(numeric = list(hist = NULL),
                 integer = list(hist = NULL))

2.1 Loading the Raw Data into R

Loading raw dataset into R.

train <- read.csv("sample.csv")
train %>%
  select(time) %>%
  summary()

      time       
 Min.   : 2.895  
 1st Qu.:16.179  
 Median :26.123  
 Mean   :29.014  
 3rd Qu.:38.921  
 Max.   :93.239

2.2 Feature Selection

LASSO Feature Selection Method. alpha=0 gives ridge regression and alpha =1 is LASSO regression.

input to the cross validation LASSO model x1: Feature Matrix (Rows: patients and col: features) y1: time (months/days/years), status_vector(1:censor, 2:dead)

output: 10 fold cross-validation LASSO plot

x1 <- train[,(8:length(train))]
x <- data.matrix(x1, rownames.force = NA)
y1 <- train[,(2:3)]
y <- data.matrix(y1, rownames.force = NA)


library(survival)
cvfit = cv.glmnet(x, y,family = 'cox',alpha=0.1)
plot(cvfit)

Seelcting top features. output: coeffs_arranges: matrix of name of features and corresponding coefficients names: top feature names my_coefficients: coefficients of top features

tmp_coeffs <- coef(cvfit, s = "lambda.min")
non_zero_coefs <- data.frame(name = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1], coefficient = (tmp_coeffs@x))
coeffs_arranges <- non_zero_coefs[order(-non_zero_coefs$coefficient),]
names <- coeffs_arranges[,1]
my_coefficients <- (coeffs_arranges[,2])
my_coefficients

 [1]  0.1021445885  0.0783834209  0.0217510233  0.0004785898  0.0003860738
 [6] -0.0006585577 -0.0008374606 -0.0361923820 -0.0467877817 -0.0490544736
[11] -0.0606422928 -0.1182823462

3 Contruction of QuRiS

Multivariate Analysis with LASSO top features on training cohort.

varnames = sapply(1:length(names), function(i){
  (paste0(names[i]))
})

iformula <- as.formula(sprintf("Surv(time, status) ~ %s ", paste(varnames, collapse='+')))  
res.cox <- coxph(iformula, data = train)
summary(res.cox)

Call:
coxph(formula = iformula, data = train)

  n= 100, number of events= 27 

                 coef  exp(coef)   se(coef)      z Pr(>|z|)  
feature73   4.140e+01  9.533e+17  5.000e+01  0.828   0.4077  
feature74  -2.259e+01  1.547e-10  3.888e+01 -0.581   0.5613  
feature71  -3.829e+00  2.174e-02  2.314e+00 -1.655   0.0980 .
feature129  2.581e-01  1.294e+00  2.525e-01  1.022   0.3068  
feature132 -2.385e-01  7.878e-01  2.453e-01 -0.972   0.3309  
feature93   4.821e-03  1.005e+00  1.448e-02  0.333   0.7392  
feature95  -2.690e-02  9.735e-01  3.396e-02 -0.792   0.4282  
feature76   7.715e-01  2.163e+00  1.078e+00  0.716   0.4741  
feature72  -1.600e+00  2.020e-01  1.074e+00 -1.489   0.1365  
feature70  -2.065e+01  1.073e-09  1.571e+01 -1.315   0.1885  
feature28  -3.555e-01  7.008e-01  2.121e-01 -1.676   0.0937 .
feature69   2.006e+01  5.141e+08  1.760e+01  1.140   0.2544  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

           exp(coef) exp(-coef) lower .95 upper .95
feature73  9.533e+17  1.049e-18 2.610e-25 3.482e+60
feature74  1.547e-10  6.463e+09 1.236e-43 1.937e+23
feature71  2.174e-02  4.600e+01 2.333e-04 2.026e+00
feature129 1.294e+00  7.725e-01 7.891e-01 2.123e+00
feature132 7.878e-01  1.269e+00 4.871e-01 1.274e+00
feature93  1.005e+00  9.952e-01 9.767e-01 1.034e+00
feature95  9.735e-01  1.027e+00 9.108e-01 1.040e+00
feature76  2.163e+00  4.623e-01 2.616e-01 1.789e+01
feature72  2.020e-01  4.951e+00 2.460e-02 1.659e+00
feature70  1.073e-09  9.320e+08 4.595e-23 2.505e+04
feature28  7.008e-01  1.427e+00 4.625e-01 1.062e+00
feature69  5.141e+08  1.945e-09 5.378e-07 4.914e+23

Concordance= 0.802  (se = 0.041 )
Likelihood ratio test= 23.28  on 12 df,   p=0.03
Wald test            = 20.93  on 12 df,   p=0.05
Score (logrank) test = 26.95  on 12 df,   p=0.008

Creating a signature using LASSO coefficients:

train_set <- lapply(1:length(varnames), function(i) {
  ifor <-  my_coefficients[i] 
  k <- (sprintf("%s", varnames[i]))
  feature_list <- train[,k]
  value11 <- feature_list*ifor
  df <- data.frame(value11)
})


store <- data.frame(train_set)
QuRiS <- rowSums(store)

4 Entire dataset

4.1 Multivariate Model

Multivariate analysis with the signature and calculating CI with signature alone:

train <- cbind(train, QuRiS)

res.cox <- coxph(Surv(time, status) ~ QuRiS, data = train)
summary(res.cox)

Call:
coxph(formula = Surv(time, status) ~ QuRiS, data = train)

  n= 100, number of events= 27 

        coef exp(coef) se(coef)    z Pr(>|z|)    
QuRiS  4.077    58.971    1.024 3.98  6.9e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

      exp(coef) exp(-coef) lower .95 upper .95
QuRiS     58.97    0.01696     7.919     439.2

Concordance= 0.782  (se = 0.044 )
Likelihood ratio test= 17.41  on 1 df,   p=3e-05
Wald test            = 15.84  on 1 df,   p=7e-05
Score (logrank) test = 17.59  on 1 df,   p=3e-05

res.cox[["coefficients"]]

   QuRiS 
4.077039

Threshold was selected usign the MATLAB function in the repository. For example here, we used just the median value of the constructed QuRiS

threshold <- median(train$QuRiS)
train$group <- 'Group1'
train$group[train$QuRiS >= threshold] <- 'Group2'
threshold

[1] -0.5661455

4.2 Kaplan_Meier Survival Curve

Kaplan-Meier plot based on predicted high and low risk groups

fit2 <- survfit(Surv(time, status) ~ group, data = train)

ggsurvplot(
   fit2,                     # survfit object with calculated statistics.
   data = train,             # data used to fit survival curves.
   size = 1.2,
   palette = c("#FFCC33", "#0080FF"),
   risk.table = TRUE,       # show risk table.
   pval = TRUE,             # show p-value of log-rank test..
   xlab = "Time in months" ,  # customize X axis label.
   cumcensor = TRUE      # plot the number of censored subjects at time t
 #  ncensor.plot.height = 0.25
)

4.3 Hazard Ratio

variables <- c("group")
formula <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univariate_analysis <- lapply(formula, function(x){coxph(x, data = train)})
# Extract data 
results <- lapply(univariate_analysis,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         beta<-signif(x$coef[1], digits=4);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", 
                                       "p.value")
                         return(res)
                       })
res <- t(as.data.frame(results, check.names = FALSE))
as.data.frame(res)

       beta HR (95% CI for HR)  p.value
group 1.463      4.32 (1.8-10) 0.001152

write.csv(train, file = "dataset_train.csv")

5 Surgery Alone group (therapy = 0)

Taking only cases for which therapy is euqal to 0 (i.e. has not received any additional treatment)

surgery <- with(train, subset(train, !train$Cases %in% train$Cases[is.na(time) | therapy == 1]))

5.1 Multivariate Model

res.cox <- coxph(Surv(time, status) ~ QuRiS, data = surgery)
summary(res.cox)

Call:
coxph(formula = Surv(time, status) ~ QuRiS, data = surgery)

  n= 67, number of events= 16 

         coef exp(coef) se(coef)     z Pr(>|z|)    
QuRiS   5.249   190.301    1.527 3.438 0.000585 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

      exp(coef) exp(-coef) lower .95 upper .95
QuRiS     190.3   0.005255     9.551      3792

Concordance= 0.792  (se = 0.08 )
Likelihood ratio test= 12  on 1 df,   p=5e-04
Wald test            = 11.82  on 1 df,   p=6e-04
Score (logrank) test = 12.9  on 1 df,   p=3e-04

res.cox[["coefficients"]]

   QuRiS 
5.248607

5.2 Kaplan-Meier Curve

Kaplan-Meier plot based on predicted high and low risk groups

fit2 <- survfit(Surv(time, status) ~ group, data = surgery)

ggsurvplot(
   fit2,                     # survfit object with calculated statistics.
   data = surgery,             # data used to fit survival curves.
   size = 1.2,
   palette = c("#FFCC33", "#0080FF"),
   risk.table = TRUE,       # show risk table.
   pval = TRUE,             # show p-value of log-rank test..
   xlab = "Time in months" ,  # customize X axis label.
   cumcensor = TRUE      # plot the number of censored subjects at time t
 #  ncensor.plot.height = 0.25
)

5.3 Hazard Ratio

variables <- c("group")
formula <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univariate_analysis <- lapply(formula, function(x){coxph(x, data = surgery)})
# Extract data 
results <- lapply(univariate_analysis,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         beta<-signif(x$coef[1], digits=4);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", 
                                       "p.value")
                         return(res)
                       })
res <- t(as.data.frame(results, check.names = FALSE))
as.data.frame(res)

       beta HR (95% CI for HR)  p.value
group 1.812       6.123 (2-19) 0.001542

6 Adjuvant-Chemotherapy Group (therapy = 1)

Taking only cases for whcih therapy is euqal to 1 (i.e. They received any additional treatment)

adj_chemo <- with(train, subset(train, !train$Cases %in% train$Cases[is.na(time) | therapy == 0]))

6.1 Multivariate Model

res.cox <- coxph(Surv(time, status) ~ QuRiS, data = adj_chemo)
summary(res.cox)

Call:
coxph(formula = Surv(time, status) ~ QuRiS, data = adj_chemo)

  n= 33, number of events= 11 

        coef exp(coef) se(coef)     z Pr(>|z|)  
QuRiS  4.595    98.963    2.475 1.856   0.0634 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

      exp(coef) exp(-coef) lower .95 upper .95
QuRiS     98.96     0.0101     0.774     12653

Concordance= 0.644  (se = 0.073 )
Likelihood ratio test= 4.12  on 1 df,   p=0.04
Wald test            = 3.45  on 1 df,   p=0.06
Score (logrank) test = 3.66  on 1 df,   p=0.06

res.cox[["coefficients"]]

  QuRiS 
4.59475

6.2 Kaplan-Meier Curve

Kaplan-Meier plot based on predicted high and low risk groups

fit2 <- survfit(Surv(time, status) ~ group, data = adj_chemo)

ggsurvplot(
   fit2,                     # survfit object with calculated statistics.
   data = adj_chemo,             # data used to fit survival curves.
   size = 1.2,
   palette = c("#FFCC33", "#0080FF"),
   risk.table = TRUE,       # show risk table.
   pval = TRUE,             # show p-value of log-rank test..
   xlab = "Time in months" ,  # customize X axis label.
   cumcensor = TRUE      # plot the number of censored subjects at time t
 #  ncensor.plot.height = 0.25
)

6.3 Hazard Ratio

variables <- c("group")
formula <- sapply(variables,
                        function(x) as.formula(paste('Surv(time, status)~', x)))

univariate_analysis <- lapply(formula, function(x){coxph(x, data = surgery)})
# Extract data 
results <- lapply(univariate_analysis,
                       function(x){ 
                         x <- summary(x)
                         p.value<-signif(x$wald["pvalue"], digits=4)
                         beta<-signif(x$coef[1], digits=4);#coeficient beta
                         HR <-signif(x$coef[2], digits=4);#exp(beta)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res<-c(beta, HR, p.value)
                         names(res)<-c("beta", "HR (95% CI for HR)", 
                                       "p.value")
                         return(res)
                       })
res <- t(as.data.frame(results, check.names = FALSE))
as.data.frame(res)

       beta HR (95% CI for HR)  p.value
group 1.812       6.123 (2-19) 0.001542