[16eabd]: / 4-Multi-Omic Integration / scripts / 6.random_forest.r

Download this file

319 lines (227 with data), 12.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
# files required:
# 1) "4_MetaG.MetaB.modules.NEU.linked.txt": resulting data generated from script "4_MetaG.MetaB.link.r"
# 2) "4_MetaB.HostT.modules.NEU.linked.txt": resulting data generated from script "4_MetaB.HostT.link.r"
# 3) "meta.mediation.NEU.txt": meta data containing clinical variable NEU
# 4) "1_metaG-combined.gct": MetaG module abundance generated by ssGSEA2
# 5) "1_metaB.module_eigengene.txt": MetaB module abundance generated by WGCNA
# 6) "1_hostT.module_eigengene.txt": HostT module abundance generated by WGCNA
# output:
# "Output/prediction.performance_byLinks.rf.txt": prediction performance of MetaG-MetaB-HostT links to predict clinical variable NEU
library(data.table)
library(dplyr)
library(tidymodels)
library(tidyverse)
library(workflows)
library(tune)
library(ranger)
# first identify the MetaG-MetaB-HostT links ---------
MetaG.MetaB.links <-
fread("4_MetaG.MetaB.modules.NEU.linked.txt")
MetaB.HostT.links <-
fread("4_MetaB.HostT.modules.NEU.linked.txt") %>%
mutate(MetaB.module = sapply(strsplit(MetaB.HostT_modulePair, "_", fixed = T), "[[", 1),
HostT.module = sapply(strsplit(MetaB.HostT_modulePair, "_", fixed = T), "[[", 2))
MetaG.MetaB.HostT.links <- NULL
for(i in c(1:nrow(MetaG.MetaB.links))){
Gm = strsplit(MetaG.MetaB.links$module_pair[i],"_",fixed = T)[[1]][1]
Bm = strsplit(MetaG.MetaB.links$module_pair[i],"_",fixed = T)[[1]][2]
gb.fpairs = strsplit(MetaG.MetaB.links$feature_connections[i],";",fixed = T)[[1]]
if(any(grepl("-", gb.fpairs, fixed = T))) gb.fpairs <- gsub("-","_",gb.fpairs)
Gf = sapply(strsplit(gb.fpairs,"_",fixed = T),"[[", 1)
Bf = sapply(strsplit(gb.fpairs,"_",fixed = T),"[[", 2)
BT.link_sub <- MetaB.HostT.links[which(MetaB.HostT.links$MetaB.module == Bm & MetaB.HostT.links$MetaB.feature %in% Bf),]
Tms <- MetaB.HostT.links$HostT.module[which(MetaB.HostT.links$MetaB.module == Bm & MetaB.HostT.links$MetaB.feature %in% Bf)]
if(length(Tms) == 0 ) next
tmp.links <- cbind.data.frame(MetaG = Gm, MetaB = Bm, HostT =Tms, stringsAsFactors=F ) %>% unique()
# add feature links
gb.fpairs_df <- cbind.data.frame(Gf, Bf, stringsAsFactors=F)
feature.links <- vector("character", length = nrow(tmp.links))
for(i_2 in c(1:nrow(tmp.links))){
# i_2 = 1
Tm = tmp.links$HostT[i_2]
# Tf = BT.link_sub$HostT.feature[which(BT.link_sub$HostT.module == Tm)]
bt.fpairs_df <- BT.link_sub %>%
filter(HostT.module == Tm) %>%
mutate(Bf=MetaB.feature, Tf=HostT.feature) %>% select(Bf,Tf)
feature.links_df <- merge(gb.fpairs_df,bt.fpairs_df,by="Bf") %>% relocate(Gf)
flinks <-
paste(paste(feature.links_df$Gf, feature.links_df$Bf, feature.links_df$Tf, sep = "_"), collapse = ";")
feature.links[i_2] <- flinks
}
tmp.links$feature.links <- feature.links
# bind rows
MetaG.MetaB.HostT.links <- bind_rows(MetaG.MetaB.HostT.links, tmp.links)
}
# then create a predicted variable data frame -----
meta <- fread("meta.mediation.NEU.txt") %>% select(SampleID, NUE)
meta$SampleID <- sapply(meta$SampleID, function(x)if(grepl("^\\d",x,perl = T) ) paste("X",x,sep = "") else x )
# load omic data --------
MetaB.Mod.dat <- fread("1_metaB.module_eigengene.txt",data.table = F) %>%
dplyr::filter(!grepl("#",`#NAME`,fixed=T))
MetaB.Mod.dat[-1] <- sapply(MetaB.Mod.dat[-1], as.numeric)
MetaB.Mod.dat <- MetaB.Mod.dat %>% tibble::column_to_rownames("#NAME")
colnames(MetaB.Mod.dat) <- sapply(colnames(MetaB.Mod.dat),
function(x)if(grepl("^\\d",x,perl = T) ) paste("X",x,sep = "") else x)
HostT.Mod.dat <- fread("1_hostT.module_eigengene.txt",data.table = F) %>%
dplyr::filter(!grepl("#",`#NAME`,fixed=T))
HostT.Mod.dat[-1] <- sapply(HostT.Mod.dat[-1], as.numeric)
HostT.Mod.dat <- HostT.Mod.dat %>% tibble::column_to_rownames("#NAME")
colnames(HostT.Mod.dat) <- sapply(colnames(HostT.Mod.dat),
function(x)if(grepl("^\\d",x,perl = T) ) paste("X",x,sep = "") else x)
# last, perform random forest analysis -----------
## Import data
log.file = "rf_by.links.log"
cat(paste("\n\n",as.character(Sys.time()), '\n'), file=log.file, append=T)
cat("Importing data : \n", file=log.file, append=T)
# metagenomic data ------
m1 <- parse.gctx("1_metaG-combined.gct")@mat %>% t() %>% data.frame()
#colnames(m1) <- paste("MetaG.", colnames(m1),sep = "")
# metabolomic data -----
m2 = MetaB.Mod.dat
feature.abb_df1 <- cbind.data.frame(feature = rownames(m2),
abb = paste("feature",seq(1,nrow(m2),1),sep = ""),
stringsAsFactors = F)
rownames(m2) <- sapply(rownames(m2), function(x) feature.abb_df1$abb[which(feature.abb_df1$feature == x)])
m2 <- t(m2) %>% as.data.frame(stringsAsFactors=F)
colnames(m2) <- sapply(colnames(m2), function(x) feature.abb_df1$feature[which(feature.abb_df1$abb == x)])
#colnames(m2) <- paste("MetaB.", colnames(m2),sep = "")
# host transcriptomic data -----
m3 = HostT.Mod.dat
feature.abb_df1 <- cbind.data.frame(feature = rownames(m3),
abb = paste("feature",seq(1,nrow(m3),1),sep = ""),
stringsAsFactors = F)
rownames(m3) <- sapply(rownames(m3), function(x) feature.abb_df1$abb[which(feature.abb_df1$feature == x)])
m3 <- t(m3) %>% as.data.frame(stringsAsFactors=F)
colnames(m3) <- sapply(colnames(m3), function(x) feature.abb_df1$feature[which(feature.abb_df1$abb == x)])
#colnames(m3) <- paste("HostT.", colnames(m3),sep = "")
# clinical variable of interest -------
Predicted.dat <- meta
Y = colnames(Predicted.dat)[colnames(Predicted.dat) != "SampleID"]
colnames(Predicted.dat)[which(colnames(Predicted.dat) == Y)] <- "Y"
head(Predicted.dat)
# match rows of all the data frames ------
completeSps <- intersect(intersect(intersect(rownames(m1), rownames(m2)), rownames(m3)), Predicted.dat$SampleID)
m1 <- m1[match(completeSps, rownames(m1)),]
m2 <- m2[match(completeSps, rownames(m2)),]
m3 <- m3[match(completeSps, rownames(m3)),]
Predicted.dat <- Predicted.dat[match(completeSps, Predicted.dat$SampleID),]
# MetaG.MetaB.HostT.link -------
link.df <- MetaG.MetaB.HostT.links
## ############################################################################
## ##
## integrate data for random forest ##
## ##
## ############################################################################
cat("Performing random forest modeling : \n", file=log.file, append=T)
Performance <- NULL
for(i in c(1:nrow(link.df))){ # nrow(link.df)
# i=1
if(i %% 100 == 0) cat(paste("----progress: predicting ", Y, "with link ", i ," out of the total ", nrow(link.df), " links ----------", sep = "") , file=log.file, append=T )
Gm <- sub("MetaG\\.","", link.df$MetaG[i])
Bm <- sub("MetaB\\.","",link.df$MetaB[i])
Tm <- sub("HostT\\.", "", link.df$HostT[i])
rf_dat <- cbind.data.frame(SampleID = rownames(m1),
MetaG = m1[,Gm],
MetaB = m2[,Bm],
HostT = m3[,Tm],
Y = Predicted.dat$Y,
stringsAsFactors=F)
rf_dat <- rf_dat[complete.cases(rf_dat),]
# split the data into traing and testing sets
GZ.sp <-meta$SampleID[!grepl("^Z", meta$SampleID)]
Training.samples = GZ.sp
set.seed(100)
if( is.null(Training.samples) ){
rf_dat <- rf_dat[,-which(colnames(rf_dat) == "SampleID")]
my_split <- initial_split(rf_dat, prop = 2/4, strata = Y)
my_train <- training(my_split)
my_test <- testing(my_split)
}else {
tmp.ids <- unlist(sapply(Training.samples, function(x) which(rf_dat$SampleID == x)))
rf_dat <- rf_dat[,-which(colnames(rf_dat) == "SampleID")]
my_split <- initial_split(rf_dat)
my_split$in_id <- unname(tmp.ids) # manually adjust training sample ids
my_train <- training(my_split)
my_test <- testing(my_split)
}
# create a cross validation version of the training set for parameter tuning
my_cv <- vfold_cv(my_train, v = 5)
# define the recipe
my_recipe <-
# which consists of the formula (outcome ~ predictors)
recipe(Y ~ ., data = rf_dat) # %>%
#step_knnimpute(all_predictors()) # missing value
# Specify the model --------------------------------------
PredictionType = "regression"
rf_model <-
# specify that the model is a random forest
rand_forest() %>% # ?rand_forest shows the tunable parameters
# specify that the `mtry` parameter needs to be tuned
set_args(mtry = tune(), trees=tune(),min_n=10) %>%
# select the engine/package that underlies the model
set_engine("ranger", importance = "impurity") %>% # if to examine the variable importance of your final model need to set importance =
# choose either the continuous regression or binary classification mode
set_mode(PredictionType)
# Put it all together in a workflow -----------------------------------
# set the workflow
rf_workflow <- workflow() %>%
# add the recipe
add_recipe(my_recipe) %>%
# add the model
add_model(rf_model)
# Tune the parameters ----------------------------------
# specify which values eant to try
rf_grid <- expand.grid(mtry = c(1,2,3), trees=c(500,1000,1500,2000,2500))
# extract results
if(PredictionType == "regression"){
rf_tune_results <- rf_workflow %>%
tune_grid(resamples = my_cv, #CV object
grid = rf_grid, # grid of values to try
metrics = metric_set(rmse) # metrics we care about
)
resultSelect = "rmse"
}else{
rf_tune_results <- rf_workflow %>%
tune_grid(resamples = my_cv, #CV object
grid = rf_grid, # grid of values to try
metrics = metric_set(accuracy, roc_auc) # metrics we care about
)
resultSelect = "accuracy"
}
# autoplot(rf_tune_results)
# print results
results_df <- rf_tune_results %>%
collect_metrics()
# Finalize the workflow ------------------------------------------
param_final <- rf_tune_results %>%
select_best(metric = resultSelect)
param_final
rf_workflow <- rf_workflow %>%
finalize_workflow(param_final)
# Evaluate the model on the test set --------------------------------
rf_fit <- rf_workflow %>%
# fit on the training set and evaluate on test set
last_fit(my_split) # train the training dataset and evaluate the test dataset
# check performance
performance <- rf_fit %>% collect_metrics()
performance
if(PredictionType == "regression"){
# manually calculate mae
predictions <- rf_fit %>% collect_predictions()
MAE <- predictions %>% mae(Y,`.pred` )
# bind MAE
performance <- bind_rows(performance,MAE )
}
# bind MAE and transfer performance to wide
performance <- performance %>% select(-`.estimator`, -`.config`) # %>%
# as.data.frame() %>%
# reshape2::dcast(0 ~ `.metric`) %>%
# select(-`0`)
performance$MetaG.MetaB.HostT.link = paste(Gm, Bm, Tm, sep = "_")
performance$mtry = param_final$mtry
performance$trees = param_final$trees
performance <- performance %>%
relocate(MetaG.MetaB.HostT.link, mtry, trees)
Performance <- bind_rows(Performance, performance)
}
write.table(Performance, file = "6_NEU_prediction.performance_byLinks.rf.txt", sep = "\t", quote = F, row.names = F)