Prognostic Predictive Signature
Prognostic Predictive Signature
1 Introduction
Generating Radiomic Signature based on the LASSO GLMNET model for predicting Disease-Free Survival in Early-Stage NSCLC.
2 Data Load/Merge
Initial Setup and Package Loads in R
Packages used for the analysis.
library(glmnet);library(survival);library(survminer);library(readxl);library(ggplot2); library(GGally);library(knitr); library(rmdformats); library(magrittr);
library(skimr); library(Hmisc); library(Epi); library(vcd); library(tidyverse)
## Global options
options(max.print="75")
opts_chunk$set(comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=75)
skimr::skim_with(numeric = list(hist = NULL),
integer = list(hist = NULL))
2.1 Loading the Raw Data into R
Loading raw dataset into R.
time
Min. : 2.895
1st Qu.:16.179
Median :26.123
Mean :29.014
3rd Qu.:38.921
Max. :93.239
2.2 Feature Selection
LASSO Feature Selection Method. alpha=0 gives ridge regression and alpha =1 is LASSO regression.
input to the cross validation LASSO model x1: Feature Matrix (Rows: patients and col: features) y1: time (months/days/years), status_vector(1:censor, 2:dead)
output: 10 fold cross-validation LASSO plot
x1 <- train[,(8:length(train))]
x <- data.matrix(x1, rownames.force = NA)
y1 <- train[,(2:3)]
y <- data.matrix(y1, rownames.force = NA)
library(survival)
cvfit = cv.glmnet(x, y,family = 'cox',alpha=0.1)
plot(cvfit)
Seelcting top features. output: coeffs_arranges: matrix of name of features and corresponding coefficients names: top feature names my_coefficients: coefficients of top features
tmp_coeffs <- coef(cvfit, s = "lambda.min")
non_zero_coefs <- data.frame(name = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1], coefficient = (tmp_coeffs@x))
coeffs_arranges <- non_zero_coefs[order(-non_zero_coefs$coefficient),]
names <- coeffs_arranges[,1]
my_coefficients <- (coeffs_arranges[,2])
my_coefficients
[1] 0.1021445885 0.0783834209 0.0217510233 0.0004785898 0.0003860738
[6] -0.0006585577 -0.0008374606 -0.0361923820 -0.0467877817 -0.0490544736
[11] -0.0606422928 -0.1182823462
3 Contruction of QuRiS
Multivariate Analysis with LASSO top features on training cohort.
varnames = sapply(1:length(names), function(i){
(paste0(names[i]))
})
iformula <- as.formula(sprintf("Surv(time, status) ~ %s ", paste(varnames, collapse='+')))
res.cox <- coxph(iformula, data = train)
summary(res.cox)
Call:
coxph(formula = iformula, data = train)
n= 100, number of events= 27
coef exp(coef) se(coef) z Pr(>|z|)
feature73 4.140e+01 9.533e+17 5.000e+01 0.828 0.4077
feature74 -2.259e+01 1.547e-10 3.888e+01 -0.581 0.5613
feature71 -3.829e+00 2.174e-02 2.314e+00 -1.655 0.0980 .
feature129 2.581e-01 1.294e+00 2.525e-01 1.022 0.3068
feature132 -2.385e-01 7.878e-01 2.453e-01 -0.972 0.3309
feature93 4.821e-03 1.005e+00 1.448e-02 0.333 0.7392
feature95 -2.690e-02 9.735e-01 3.396e-02 -0.792 0.4282
feature76 7.715e-01 2.163e+00 1.078e+00 0.716 0.4741
feature72 -1.600e+00 2.020e-01 1.074e+00 -1.489 0.1365
feature70 -2.065e+01 1.073e-09 1.571e+01 -1.315 0.1885
feature28 -3.555e-01 7.008e-01 2.121e-01 -1.676 0.0937 .
feature69 2.006e+01 5.141e+08 1.760e+01 1.140 0.2544
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
feature73 9.533e+17 1.049e-18 2.610e-25 3.482e+60
feature74 1.547e-10 6.463e+09 1.236e-43 1.937e+23
feature71 2.174e-02 4.600e+01 2.333e-04 2.026e+00
feature129 1.294e+00 7.725e-01 7.891e-01 2.123e+00
feature132 7.878e-01 1.269e+00 4.871e-01 1.274e+00
feature93 1.005e+00 9.952e-01 9.767e-01 1.034e+00
feature95 9.735e-01 1.027e+00 9.108e-01 1.040e+00
feature76 2.163e+00 4.623e-01 2.616e-01 1.789e+01
feature72 2.020e-01 4.951e+00 2.460e-02 1.659e+00
feature70 1.073e-09 9.320e+08 4.595e-23 2.505e+04
feature28 7.008e-01 1.427e+00 4.625e-01 1.062e+00
feature69 5.141e+08 1.945e-09 5.378e-07 4.914e+23
Concordance= 0.802 (se = 0.041 )
Likelihood ratio test= 23.28 on 12 df, p=0.03
Wald test = 20.93 on 12 df, p=0.05
Score (logrank) test = 26.95 on 12 df, p=0.008
Creating a signature using LASSO coefficients:
4 Entire dataset
4.1 Multivariate Model
Multivariate analysis with the signature and calculating CI with signature alone:
train <- cbind(train, QuRiS)
res.cox <- coxph(Surv(time, status) ~ QuRiS, data = train)
summary(res.cox)
Call:
coxph(formula = Surv(time, status) ~ QuRiS, data = train)
n= 100, number of events= 27
coef exp(coef) se(coef) z Pr(>|z|)
QuRiS 4.077 58.971 1.024 3.98 6.9e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
QuRiS 58.97 0.01696 7.919 439.2
Concordance= 0.782 (se = 0.044 )
Likelihood ratio test= 17.41 on 1 df, p=3e-05
Wald test = 15.84 on 1 df, p=7e-05
Score (logrank) test = 17.59 on 1 df, p=3e-05
QuRiS
4.077039
Threshold was selected usign the MATLAB function in the repository. For example here, we used just the median value of the constructed QuRiS
threshold <- median(train$QuRiS)
train$group <- 'Group1'
train$group[train$QuRiS >= threshold] <- 'Group2'
threshold
[1] -0.5661455
4.2 Kaplan_Meier Survival Curve
Kaplan-Meier plot based on predicted high and low risk groups
fit2 <- survfit(Surv(time, status) ~ group, data = train)
ggsurvplot(
fit2, # survfit object with calculated statistics.
data = train, # data used to fit survival curves.
size = 1.2,
palette = c("#FFCC33", "#0080FF"),
risk.table = TRUE, # show risk table.
pval = TRUE, # show p-value of log-rank test..
xlab = "Time in months" , # customize X axis label.
cumcensor = TRUE # plot the number of censored subjects at time t
# ncensor.plot.height = 0.25
)
4.3 Hazard Ratio
variables <- c("group")
formula <- sapply(variables,
function(x) as.formula(paste('Surv(time, status)~', x)))
univariate_analysis <- lapply(formula, function(x){coxph(x, data = train)})
# Extract data
results <- lapply(univariate_analysis,
function(x){
x <- summary(x)
p.value<-signif(x$wald["pvalue"], digits=4)
beta<-signif(x$coef[1], digits=4);#coeficient beta
HR <-signif(x$coef[2], digits=4);#exp(beta)
HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
HR <- paste0(HR, " (",
HR.confint.lower, "-", HR.confint.upper, ")")
res<-c(beta, HR, p.value)
names(res)<-c("beta", "HR (95% CI for HR)",
"p.value")
return(res)
})
res <- t(as.data.frame(results, check.names = FALSE))
as.data.frame(res)
beta HR (95% CI for HR) p.value
group 1.463 4.32 (1.8-10) 0.001152
5 Surgery Alone group (therapy = 0)
Taking only cases for which therapy is euqal to 0 (i.e. has not received any additional treatment)
5.1 Multivariate Model
Call:
coxph(formula = Surv(time, status) ~ QuRiS, data = surgery)
n= 67, number of events= 16
coef exp(coef) se(coef) z Pr(>|z|)
QuRiS 5.249 190.301 1.527 3.438 0.000585 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
QuRiS 190.3 0.005255 9.551 3792
Concordance= 0.792 (se = 0.08 )
Likelihood ratio test= 12 on 1 df, p=5e-04
Wald test = 11.82 on 1 df, p=6e-04
Score (logrank) test = 12.9 on 1 df, p=3e-04
QuRiS
5.248607
5.2 Kaplan-Meier Curve
Kaplan-Meier plot based on predicted high and low risk groups
fit2 <- survfit(Surv(time, status) ~ group, data = surgery)
ggsurvplot(
fit2, # survfit object with calculated statistics.
data = surgery, # data used to fit survival curves.
size = 1.2,
palette = c("#FFCC33", "#0080FF"),
risk.table = TRUE, # show risk table.
pval = TRUE, # show p-value of log-rank test..
xlab = "Time in months" , # customize X axis label.
cumcensor = TRUE # plot the number of censored subjects at time t
# ncensor.plot.height = 0.25
)
5.3 Hazard Ratio
variables <- c("group")
formula <- sapply(variables,
function(x) as.formula(paste('Surv(time, status)~', x)))
univariate_analysis <- lapply(formula, function(x){coxph(x, data = surgery)})
# Extract data
results <- lapply(univariate_analysis,
function(x){
x <- summary(x)
p.value<-signif(x$wald["pvalue"], digits=4)
beta<-signif(x$coef[1], digits=4);#coeficient beta
HR <-signif(x$coef[2], digits=4);#exp(beta)
HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
HR <- paste0(HR, " (",
HR.confint.lower, "-", HR.confint.upper, ")")
res<-c(beta, HR, p.value)
names(res)<-c("beta", "HR (95% CI for HR)",
"p.value")
return(res)
})
res <- t(as.data.frame(results, check.names = FALSE))
as.data.frame(res)
beta HR (95% CI for HR) p.value
group 1.812 6.123 (2-19) 0.001542
6 Adjuvant-Chemotherapy Group (therapy = 1)
Taking only cases for whcih therapy is euqal to 1 (i.e. They received any additional treatment)
6.1 Multivariate Model
Call:
coxph(formula = Surv(time, status) ~ QuRiS, data = adj_chemo)
n= 33, number of events= 11
coef exp(coef) se(coef) z Pr(>|z|)
QuRiS 4.595 98.963 2.475 1.856 0.0634 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
QuRiS 98.96 0.0101 0.774 12653
Concordance= 0.644 (se = 0.073 )
Likelihood ratio test= 4.12 on 1 df, p=0.04
Wald test = 3.45 on 1 df, p=0.06
Score (logrank) test = 3.66 on 1 df, p=0.06
QuRiS
4.59475
6.2 Kaplan-Meier Curve
Kaplan-Meier plot based on predicted high and low risk groups
fit2 <- survfit(Surv(time, status) ~ group, data = adj_chemo)
ggsurvplot(
fit2, # survfit object with calculated statistics.
data = adj_chemo, # data used to fit survival curves.
size = 1.2,
palette = c("#FFCC33", "#0080FF"),
risk.table = TRUE, # show risk table.
pval = TRUE, # show p-value of log-rank test..
xlab = "Time in months" , # customize X axis label.
cumcensor = TRUE # plot the number of censored subjects at time t
# ncensor.plot.height = 0.25
)
6.3 Hazard Ratio
variables <- c("group")
formula <- sapply(variables,
function(x) as.formula(paste('Surv(time, status)~', x)))
univariate_analysis <- lapply(formula, function(x){coxph(x, data = surgery)})
# Extract data
results <- lapply(univariate_analysis,
function(x){
x <- summary(x)
p.value<-signif(x$wald["pvalue"], digits=4)
beta<-signif(x$coef[1], digits=4);#coeficient beta
HR <-signif(x$coef[2], digits=4);#exp(beta)
HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
HR.confint.upper <- signif(x$conf.int[,"upper .95"],2)
HR <- paste0(HR, " (",
HR.confint.lower, "-", HR.confint.upper, ")")
res<-c(beta, HR, p.value)
names(res)<-c("beta", "HR (95% CI for HR)",
"p.value")
return(res)
})
res <- t(as.data.frame(results, check.names = FALSE))
as.data.frame(res)
beta HR (95% CI for HR) p.value
group 1.812 6.123 (2-19) 0.001542