|
a |
|
b/QuRiS_construction.Rmd |
|
|
1 |
--- |
|
|
2 |
title: "Prognostic Predictive Signature" |
|
|
3 |
author: "Pranjal Vaidya" |
|
|
4 |
date: "`r Sys.Date()`" |
|
|
5 |
linkcolor: blue |
|
|
6 |
output: |
|
|
7 |
rmdformats::readthedown: |
|
|
8 |
highlight: kate |
|
|
9 |
number_sections: true |
|
|
10 |
code_folding: show |
|
|
11 |
--- |
|
|
12 |
|
|
|
13 |
# Introduction |
|
|
14 |
|
|
|
15 |
Generating Radiomic Signature based on the LASSO GLMNET model for predicting Disease-Free Survival in Early-Stage NSCLC. |
|
|
16 |
|
|
|
17 |
# Data Load/Merge |
|
|
18 |
|
|
|
19 |
Initial Setup and Package Loads in R |
|
|
20 |
|
|
|
21 |
Packages used for the analysis. |
|
|
22 |
```{r initial_setup, cache=FALSE, message = FALSE, warning = FALSE} |
|
|
23 |
library(glmnet);library(survival);library(survminer);library(readxl);library(ggplot2); library(GGally);library(knitr); library(rmdformats); library(magrittr); |
|
|
24 |
library(skimr); library(Hmisc); library(Epi); library(vcd); library(tidyverse) |
|
|
25 |
|
|
|
26 |
## Global options |
|
|
27 |
|
|
|
28 |
options(max.print="75") |
|
|
29 |
opts_chunk$set(comment=NA, |
|
|
30 |
message=FALSE, |
|
|
31 |
warning=FALSE) |
|
|
32 |
opts_knit$set(width=75) |
|
|
33 |
|
|
|
34 |
|
|
|
35 |
skimr::skim_with(numeric = list(hist = NULL), |
|
|
36 |
integer = list(hist = NULL)) |
|
|
37 |
``` |
|
|
38 |
|
|
|
39 |
## Loading the Raw Data into R |
|
|
40 |
|
|
|
41 |
Loading raw dataset into R. |
|
|
42 |
|
|
|
43 |
```{r load_train_set} |
|
|
44 |
train <- read.csv("sample.csv") |
|
|
45 |
train %>% |
|
|
46 |
select(time) %>% |
|
|
47 |
summary() |
|
|
48 |
``` |
|
|
49 |
|
|
|
50 |
## Feature Selection |
|
|
51 |
|
|
|
52 |
LASSO Feature Selection Method. |
|
|
53 |
alpha=0 gives ridge regression and alpha =1 is LASSO regression. |
|
|
54 |
|
|
|
55 |
input to the cross validation LASSO model |
|
|
56 |
x1: Feature Matrix (Rows: patients and col: features) |
|
|
57 |
y1: time (months/days/years), status_vector(1:censor, 2:dead) |
|
|
58 |
|
|
|
59 |
output: |
|
|
60 |
10 fold cross-validation LASSO plot |
|
|
61 |
```{r Feature Selection} |
|
|
62 |
x1 <- train[,(8:length(train))] |
|
|
63 |
x <- data.matrix(x1, rownames.force = NA) |
|
|
64 |
y1 <- train[,(2:3)] |
|
|
65 |
y <- data.matrix(y1, rownames.force = NA) |
|
|
66 |
|
|
|
67 |
|
|
|
68 |
library(survival) |
|
|
69 |
cvfit = cv.glmnet(x, y,family = 'cox',alpha=0.1) |
|
|
70 |
plot(cvfit) |
|
|
71 |
``` |
|
|
72 |
|
|
|
73 |
Seelcting top features. |
|
|
74 |
output: |
|
|
75 |
coeffs_arranges: matrix of name of features and corresponding coefficients |
|
|
76 |
names: top feature names |
|
|
77 |
my_coefficients: coefficients of top features |
|
|
78 |
|
|
|
79 |
```{r top_features} |
|
|
80 |
tmp_coeffs <- coef(cvfit, s = "lambda.min") |
|
|
81 |
non_zero_coefs <- data.frame(name = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1], coefficient = (tmp_coeffs@x)) |
|
|
82 |
coeffs_arranges <- non_zero_coefs[order(-non_zero_coefs$coefficient),] |
|
|
83 |
names <- coeffs_arranges[,1] |
|
|
84 |
my_coefficients <- (coeffs_arranges[,2]) |
|
|
85 |
my_coefficients |
|
|
86 |
``` |
|
|
87 |
|
|
|
88 |
|
|
|
89 |
# Contruction of QuRiS |
|
|
90 |
|
|
|
91 |
Multivariate Analysis with LASSO top features on training cohort. |
|
|
92 |
```{r Mulivariate_all_features} |
|
|
93 |
varnames = sapply(1:length(names), function(i){ |
|
|
94 |
(paste0(names[i])) |
|
|
95 |
}) |
|
|
96 |
|
|
|
97 |
iformula <- as.formula(sprintf("Surv(time, status) ~ %s ", paste(varnames, collapse='+'))) |
|
|
98 |
res.cox <- coxph(iformula, data = train) |
|
|
99 |
summary(res.cox) |
|
|
100 |
``` |
|
|
101 |
|
|
|
102 |
Creating a signature using LASSO coefficients: |
|
|
103 |
|
|
|
104 |
```{r creating_signature} |
|
|
105 |
train_set <- lapply(1:length(varnames), function(i) { |
|
|
106 |
ifor <- my_coefficients[i] |
|
|
107 |
k <- (sprintf("%s", varnames[i])) |
|
|
108 |
feature_list <- train[,k] |
|
|
109 |
value11 <- feature_list*ifor |
|
|
110 |
df <- data.frame(value11) |
|
|
111 |
}) |
|
|
112 |
|
|
|
113 |
|
|
|
114 |
store <- data.frame(train_set) |
|
|
115 |
QuRiS <- rowSums(store) |
|
|
116 |
``` |
|
|
117 |
|
|
|
118 |
|
|
|
119 |
# Entire dataset |
|
|
120 |
|
|
|
121 |
## Multivariate Model |
|
|
122 |
Multivariate analysis with the signature and calculating CI with signature alone: |
|
|
123 |
|
|
|
124 |
```{r signature} |
|
|
125 |
train <- cbind(train, QuRiS) |
|
|
126 |
|
|
|
127 |
res.cox <- coxph(Surv(time, status) ~ QuRiS, data = train) |
|
|
128 |
summary(res.cox) |
|
|
129 |
res.cox[["coefficients"]] |
|
|
130 |
``` |
|
|
131 |
|
|
|
132 |
Threshold was selected usign the MATLAB function in the repository. For example here, we used just the median value of the constructed QuRiS |
|
|
133 |
```{r dividing_data_based_median} |
|
|
134 |
threshold <- median(train$QuRiS) |
|
|
135 |
train$group <- 'Group1' |
|
|
136 |
train$group[train$QuRiS >= threshold] <- 'Group2' |
|
|
137 |
threshold |
|
|
138 |
``` |
|
|
139 |
|
|
|
140 |
## Kaplan_Meier Survival Curve |
|
|
141 |
Kaplan-Meier plot based on predicted high and low risk groups |
|
|
142 |
```{r KM plot for training} |
|
|
143 |
fit2 <- survfit(Surv(time, status) ~ group, data = train) |
|
|
144 |
|
|
|
145 |
ggsurvplot( |
|
|
146 |
fit2, # survfit object with calculated statistics. |
|
|
147 |
data = train, # data used to fit survival curves. |
|
|
148 |
size = 1.2, |
|
|
149 |
palette = c("#FFCC33", "#0080FF"), |
|
|
150 |
risk.table = TRUE, # show risk table. |
|
|
151 |
pval = TRUE, # show p-value of log-rank test.. |
|
|
152 |
xlab = "Time in months" , # customize X axis label. |
|
|
153 |
cumcensor = TRUE # plot the number of censored subjects at time t |
|
|
154 |
# ncensor.plot.height = 0.25 |
|
|
155 |
) |
|
|
156 |
|
|
|
157 |
``` |
|
|
158 |
|
|
|
159 |
## Hazard Ratio |
|
|
160 |
```{r hazard ratio} |
|
|
161 |
variables <- c("group") |
|
|
162 |
formula <- sapply(variables, |
|
|
163 |
function(x) as.formula(paste('Surv(time, status)~', x))) |
|
|
164 |
|
|
|
165 |
univariate_analysis <- lapply(formula, function(x){coxph(x, data = train)}) |
|
|
166 |
# Extract data |
|
|
167 |
results <- lapply(univariate_analysis, |
|
|
168 |
function(x){ |
|
|
169 |
x <- summary(x) |
|
|
170 |
p.value<-signif(x$wald["pvalue"], digits=4) |
|
|
171 |
beta<-signif(x$coef[1], digits=4);#coeficient beta |
|
|
172 |
HR <-signif(x$coef[2], digits=4);#exp(beta) |
|
|
173 |
HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2) |
|
|
174 |
HR.confint.upper <- signif(x$conf.int[,"upper .95"],2) |
|
|
175 |
HR <- paste0(HR, " (", |
|
|
176 |
HR.confint.lower, "-", HR.confint.upper, ")") |
|
|
177 |
res<-c(beta, HR, p.value) |
|
|
178 |
names(res)<-c("beta", "HR (95% CI for HR)", |
|
|
179 |
"p.value") |
|
|
180 |
return(res) |
|
|
181 |
}) |
|
|
182 |
res <- t(as.data.frame(results, check.names = FALSE)) |
|
|
183 |
as.data.frame(res) |
|
|
184 |
``` |
|
|
185 |
|
|
|
186 |
```{r} |
|
|
187 |
write.csv(train, file = "dataset_train.csv") |
|
|
188 |
``` |
|
|
189 |
|
|
|
190 |
# Surgery Alone group (therapy = 0) |
|
|
191 |
|
|
|
192 |
|
|
|
193 |
Taking only cases for which therapy is euqal to 0 (i.e. has not received any additional treatment) |
|
|
194 |
|
|
|
195 |
```{r surgery_group} |
|
|
196 |
surgery <- with(train, subset(train, !train$Cases %in% train$Cases[is.na(time) | therapy == 1])) |
|
|
197 |
``` |
|
|
198 |
|
|
|
199 |
## Multivariate Model |
|
|
200 |
```{r multivariate_surgery} |
|
|
201 |
res.cox <- coxph(Surv(time, status) ~ QuRiS, data = surgery) |
|
|
202 |
summary(res.cox) |
|
|
203 |
res.cox[["coefficients"]] |
|
|
204 |
``` |
|
|
205 |
|
|
|
206 |
## Kaplan-Meier Curve |
|
|
207 |
Kaplan-Meier plot based on predicted high and low risk groups |
|
|
208 |
```{r KM plot for surgery} |
|
|
209 |
fit2 <- survfit(Surv(time, status) ~ group, data = surgery) |
|
|
210 |
|
|
|
211 |
ggsurvplot( |
|
|
212 |
fit2, # survfit object with calculated statistics. |
|
|
213 |
data = surgery, # data used to fit survival curves. |
|
|
214 |
size = 1.2, |
|
|
215 |
palette = c("#FFCC33", "#0080FF"), |
|
|
216 |
risk.table = TRUE, # show risk table. |
|
|
217 |
pval = TRUE, # show p-value of log-rank test.. |
|
|
218 |
xlab = "Time in months" , # customize X axis label. |
|
|
219 |
cumcensor = TRUE # plot the number of censored subjects at time t |
|
|
220 |
# ncensor.plot.height = 0.25 |
|
|
221 |
) |
|
|
222 |
|
|
|
223 |
``` |
|
|
224 |
|
|
|
225 |
## Hazard Ratio |
|
|
226 |
```{r hazard ratio surgery} |
|
|
227 |
variables <- c("group") |
|
|
228 |
formula <- sapply(variables, |
|
|
229 |
function(x) as.formula(paste('Surv(time, status)~', x))) |
|
|
230 |
|
|
|
231 |
univariate_analysis <- lapply(formula, function(x){coxph(x, data = surgery)}) |
|
|
232 |
# Extract data |
|
|
233 |
results <- lapply(univariate_analysis, |
|
|
234 |
function(x){ |
|
|
235 |
x <- summary(x) |
|
|
236 |
p.value<-signif(x$wald["pvalue"], digits=4) |
|
|
237 |
beta<-signif(x$coef[1], digits=4);#coeficient beta |
|
|
238 |
HR <-signif(x$coef[2], digits=4);#exp(beta) |
|
|
239 |
HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2) |
|
|
240 |
HR.confint.upper <- signif(x$conf.int[,"upper .95"],2) |
|
|
241 |
HR <- paste0(HR, " (", |
|
|
242 |
HR.confint.lower, "-", HR.confint.upper, ")") |
|
|
243 |
res<-c(beta, HR, p.value) |
|
|
244 |
names(res)<-c("beta", "HR (95% CI for HR)", |
|
|
245 |
"p.value") |
|
|
246 |
return(res) |
|
|
247 |
}) |
|
|
248 |
res <- t(as.data.frame(results, check.names = FALSE)) |
|
|
249 |
as.data.frame(res) |
|
|
250 |
``` |
|
|
251 |
|
|
|
252 |
|
|
|
253 |
# Adjuvant-Chemotherapy Group (therapy = 1) |
|
|
254 |
|
|
|
255 |
Taking only cases for whcih therapy is euqal to 1 (i.e. They received any additional treatment) |
|
|
256 |
|
|
|
257 |
```{r adj_chemo_group} |
|
|
258 |
adj_chemo <- with(train, subset(train, !train$Cases %in% train$Cases[is.na(time) | therapy == 0])) |
|
|
259 |
``` |
|
|
260 |
|
|
|
261 |
## Multivariate Model |
|
|
262 |
```{r train_signature} |
|
|
263 |
res.cox <- coxph(Surv(time, status) ~ QuRiS, data = adj_chemo) |
|
|
264 |
summary(res.cox) |
|
|
265 |
res.cox[["coefficients"]] |
|
|
266 |
``` |
|
|
267 |
|
|
|
268 |
## Kaplan-Meier Curve |
|
|
269 |
Kaplan-Meier plot based on predicted high and low risk groups |
|
|
270 |
```{r KM plot for adj_chemo} |
|
|
271 |
fit2 <- survfit(Surv(time, status) ~ group, data = adj_chemo) |
|
|
272 |
|
|
|
273 |
ggsurvplot( |
|
|
274 |
fit2, # survfit object with calculated statistics. |
|
|
275 |
data = adj_chemo, # data used to fit survival curves. |
|
|
276 |
size = 1.2, |
|
|
277 |
palette = c("#FFCC33", "#0080FF"), |
|
|
278 |
risk.table = TRUE, # show risk table. |
|
|
279 |
pval = TRUE, # show p-value of log-rank test.. |
|
|
280 |
xlab = "Time in months" , # customize X axis label. |
|
|
281 |
cumcensor = TRUE # plot the number of censored subjects at time t |
|
|
282 |
# ncensor.plot.height = 0.25 |
|
|
283 |
) |
|
|
284 |
|
|
|
285 |
``` |
|
|
286 |
|
|
|
287 |
## Hazard Ratio |
|
|
288 |
|
|
|
289 |
```{r hazard_ratio_adj.chemo} |
|
|
290 |
variables <- c("group") |
|
|
291 |
formula <- sapply(variables, |
|
|
292 |
function(x) as.formula(paste('Surv(time, status)~', x))) |
|
|
293 |
|
|
|
294 |
univariate_analysis <- lapply(formula, function(x){coxph(x, data = surgery)}) |
|
|
295 |
# Extract data |
|
|
296 |
results <- lapply(univariate_analysis, |
|
|
297 |
function(x){ |
|
|
298 |
x <- summary(x) |
|
|
299 |
p.value<-signif(x$wald["pvalue"], digits=4) |
|
|
300 |
beta<-signif(x$coef[1], digits=4);#coeficient beta |
|
|
301 |
HR <-signif(x$coef[2], digits=4);#exp(beta) |
|
|
302 |
HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2) |
|
|
303 |
HR.confint.upper <- signif(x$conf.int[,"upper .95"],2) |
|
|
304 |
HR <- paste0(HR, " (", |
|
|
305 |
HR.confint.lower, "-", HR.confint.upper, ")") |
|
|
306 |
res<-c(beta, HR, p.value) |
|
|
307 |
names(res)<-c("beta", "HR (95% CI for HR)", |
|
|
308 |
"p.value") |
|
|
309 |
return(res) |
|
|
310 |
}) |
|
|
311 |
res <- t(as.data.frame(results, check.names = FALSE)) |
|
|
312 |
as.data.frame(res) |
|
|
313 |
``` |
|
|
314 |
|