Load packages
library(Seurat)
library(tidyverse)
library(caret)
library(rsample)
source("R/Functions.R")
Read data
sobj <- readRDS("output/Tcells_Integrated.rds")
sobj$IdentI <- factor(sobj$IdentI, levels = sobj$IdentI %>% unique %>% sort())
Idents(sobj) <- "IdentI"
DimPlot(sobj, label = T, reduction = "wnn.umap", raster = TRUE,
raster.dpi = c(500,500), group.by = "IdentI")+
NoLegend()

Split data using rsample package
set.seed(1)
split <- sobj@assays$integratedADT@scale.data %>% t %>% data.frame() %>%
cbind(., FetchData(sobj, vars = c("integratedrna_MKI67", "integratedrna_IKZF3", "tfactivity_FOXP3-E", "IdentI"))) %>%
select(-IdentI) %>%
initial_split(prop = 0.35)
train <- training(split) %>% as.matrix()
test <- testing(split) %>% as.matrix()
classes_split <- initial_split(as_tibble(Idents(sobj)), prop = 0.35)
classes_split[["in_id"]] <- split[["in_id"]] # Ensures corresponding celltypes
classes_training <- training(classes_split) %>% .$value %>% as.factor()
classes_test <- testing(classes_split) %>% .$value %>% as.factor()
# Ensure all classes are represented
all(levels(classes_training) == levels(classes_test))
## [1] TRUE
Train model
trControl <- trainControl(method = "cv", number = 10, search = "random",
verboseIter = FALSE, allowParallel = TRUE)
set.seed(23)
gb_fit <- train(x = train,
y = classes_training,
method = "xgbTree",
importance = TRUE,
metric = "Accuracy",
maximize = TRUE,
trControl = trControl)
Print model
## eXtreme Gradient Boosting
##
## 25452 samples
## 73 predictor
## 14 classes: '1', '2', '3', '5', '6', '8', '9', '11', '12', '13', '14', '15', '16', '19'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 22907, 22907, 22906, 22905, 22907, 22905, ...
## Resampling results across tuning parameters:
##
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds Accuracy Kappa
## 0.2348934 2 1.392785 0.6769847 9 0.7637030 456 0.7427325 0.712033
## 0.4211516 7 8.459473 0.5374203 12 0.8653380 499 0.7286658 0.696139
## 0.5197095 9 3.147697 0.5072482 5 0.5398074 248 0.7130682 0.678977
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 456, max_depth = 2, eta = 0.2348934, gamma = 1.392785, colsample_bytree
## = 0.6769847, min_child_weight = 9 and subsample = 0.763703.
Predict
predict_test <- predict(gb_fit, newdata = test)
confusionMatrix(predict_test, classes_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 5 6 8 9 11 12 13 14 15 16 19
## 1 4000 211 57 24 60 86 193 2 43 10 3 24 12 4
## 2 209 3004 195 17 74 124 384 9 22 16 3 111 16 0
## 3 31 195 4660 586 45 17 31 7 151 7 29 4 190 29
## 5 31 16 916 6750 141 13 31 136 23 24 129 82 359 63
## 6 67 73 62 112 6279 43 533 353 7 25 60 209 184 6
## 8 76 110 13 5 16 1062 44 27 0 204 0 133 10 0
## 9 126 261 49 15 297 48 1290 54 4 16 3 75 11 0
## 11 6 7 7 103 176 39 46 1467 0 230 20 189 10 1
## 12 51 14 200 10 4 1 3 0 2047 1 2 0 8 7
## 13 2 10 1 11 19 208 5 238 0 1183 12 103 1 0
## 14 4 1 12 92 49 4 5 24 1 3 1398 17 19 13
## 15 15 40 8 64 149 135 76 169 1 133 13 1229 15 1
## 16 1 7 128 127 57 4 8 4 3 2 24 9 734 10
## 19 8 0 2 10 2 1 1 0 5 1 9 1 7 401
##
## Overall Statistics
##
## Accuracy : 0.7511
## 95% CI : (0.7472, 0.755)
## No Information Rate : 0.1677
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7216
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 8 Class: 9 Class: 11 Class: 12 Class: 13 Class: 14
## Sensitivity 0.86449 0.76070 0.73851 0.8516 0.8522 0.59496 0.48679 0.58916 0.88730 0.63774 0.81994
## Specificity 0.98290 0.97276 0.96772 0.9501 0.9565 0.98597 0.97851 0.98138 0.99331 0.98657 0.99464
## Pos Pred Value 0.84584 0.71797 0.77900 0.7746 0.7836 0.62471 0.57359 0.63755 0.87181 0.65979 0.85140
## Neg Pred Value 0.98526 0.97807 0.96004 0.9695 0.9723 0.98413 0.96979 0.97725 0.99421 0.98522 0.99327
## Prevalence 0.09789 0.08354 0.13349 0.1677 0.1559 0.03776 0.05606 0.05268 0.04881 0.03924 0.03607
## Detection Rate 0.08462 0.06355 0.09858 0.1428 0.1328 0.02247 0.02729 0.03104 0.04331 0.02503 0.02958
## Detection Prevalence 0.10004 0.08851 0.12655 0.1843 0.1695 0.03596 0.04758 0.04868 0.04967 0.03793 0.03474
## Balanced Accuracy 0.92370 0.86673 0.85312 0.9009 0.9044 0.79047 0.73265 0.78527 0.94030 0.81215 0.90729
## Class: 15 Class: 16 Class: 19
## Sensitivity 0.56221 0.46574 0.749533
## Specificity 0.98183 0.99160 0.998994
## Pos Pred Value 0.60010 0.65653 0.895089
## Neg Pred Value 0.97884 0.98176 0.997138
## Prevalence 0.04625 0.03334 0.011318
## Detection Rate 0.02600 0.01553 0.008483
## Detection Prevalence 0.04333 0.02365 0.009478
## Balanced Accuracy 0.77202 0.72867 0.874264
Store classes
names(classes_training) <- rownames(train)
names(classes_test) <- rownames(test)
names(predict_test) <- rownames(test)
classes <- list(classes_training, classes_test, predict_test)
names(classes) <- c("train", "test", "predict")
Show most important markers
varImp(gb_fit, scale = TRUE, top=40)
## xgbTree variable importance
##
## only 20 most important variables shown (out of 73)
##
## Overall
## .CD279 100.00
## .CD4 82.45
## .CD95 82.43
## integratedrna_MKI67 80.96
## .CD127 63.16
## tfactivity_FOXP3-E 59.19
## .CD244 57.89
## .CD31 54.26
## .CD25 54.21
## .CD38 47.40
## .CD45RA 44.08
## .KLRG1 42.99
## .CD69 35.90
## integratedrna_IKZF3 34.93
## .CD366 33.49
## .CD185 33.10
## .CD278 27.66
## .CD8a 27.58
## .CD29 22.08
## .CD45RO 20.02
Save objects
saveRDS(gb_fit, file = "output/GBresults_Combined_T_surfaceplus.rds")
saveRDS(classes, file = "output/GBclasses_Combined_T_surfaceplus.rds")
Session info
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /g/easybuild/x86_64/CentOS/7/haswell/software/FlexiBLAS/3.0.4-GCC-11.2.0/lib64/libflexiblas.so.3.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rsample_0.1.1 caret_6.0-90 lattice_0.20-45 ggrastr_1.0.1 ggpubr_0.4.0 readxl_1.4.1
## [7] forcats_0.5.1 stringr_1.4.1 dplyr_1.0.10 purrr_0.3.4 readr_2.1.2 tidyr_1.2.1
## [13] tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.1 SeuratObject_4.0.4 Seurat_4.1.0 knitr_1.40
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 reticulate_1.24 tidyselect_1.1.2 htmlwidgets_1.5.4 grid_4.1.2
## [6] Rtsne_0.16 pROC_1.18.0 munsell_0.5.0 codetools_0.2-18 ica_1.0-2
## [11] xgboost_1.4.1.1 future_1.23.0 miniUI_0.1.1.1 withr_2.5.0 spatstat.random_2.1-0
## [16] colorspace_2.0-3 highr_0.9 rstudioapi_0.13 stats4_4.1.2 ROCR_1.0-11
## [21] ggsignif_0.6.3 tensor_1.5 listenv_0.8.0 labeling_0.4.2 polyclip_1.10-0
## [26] farver_2.1.1 pheatmap_1.0.12 parallelly_1.30.0 vctrs_0.4.2 generics_0.1.3
## [31] ipred_0.9-12 xfun_0.33 R6_2.5.1 ggbeeswarm_0.6.0 rmdformats_1.0.4
## [36] spatstat.utils_2.3-0 cachem_1.0.6 assertthat_0.2.1 promises_1.2.0.1 scales_1.2.1
## [41] nnet_7.3-16 beeswarm_0.4.0 gtable_0.3.1 Cairo_1.5-12.2 globals_0.14.0
## [46] goftest_1.2-3 timeDate_3043.102 rlang_1.0.6 splines_4.1.2 rstatix_0.7.0
## [51] lazyeval_0.2.2 ModelMetrics_1.2.2.2 spatstat.geom_2.3-2 broom_1.0.1 yaml_2.3.5
## [56] reshape2_1.4.4 abind_1.4-5 modelr_0.1.8 backports_1.4.1 httpuv_1.6.6
## [61] tools_4.1.2 lava_1.6.10 bookdown_0.29 ellipsis_0.3.2 spatstat.core_2.4-0
## [66] jquerylib_0.1.4 RColorBrewer_1.1-3 proxy_0.4-26 ggridges_0.5.3 Rcpp_1.0.9
## [71] plyr_1.8.7 rpart_4.1-15 deldir_1.0-6 pbapply_1.5-0 cowplot_1.1.1
## [76] zoo_1.8-9 haven_2.4.3 ggrepel_0.9.1 cluster_2.1.2 fs_1.5.2
## [81] furrr_0.2.3 magrittr_2.0.3 data.table_1.14.2 scattermore_0.8 lmtest_0.9-39
## [86] reprex_2.0.1 RANN_2.6.1 fitdistrplus_1.1-6 matrixStats_0.61.0 hms_1.1.2
## [91] patchwork_1.1.2 mime_0.12 evaluate_0.16 xtable_1.8-4 gridExtra_2.3
## [96] compiler_4.1.2 KernSmooth_2.23-20 crayon_1.5.2 htmltools_0.5.3 mgcv_1.8-38
## [101] later_1.3.0 tzdb_0.3.0 lubridate_1.8.0 DBI_1.1.2 dbplyr_2.1.1
## [106] MASS_7.3-54 Matrix_1.5-1 car_3.1-0 cli_3.4.1 parallel_4.1.2
## [111] gower_0.2.2 igraph_1.3.5 pkgconfig_2.0.3 plotly_4.10.0 spatstat.sparse_2.1-0
## [116] recipes_0.1.17 xml2_1.3.2 foreach_1.5.2 vipor_0.4.5 bslib_0.4.0
## [121] prodlim_2019.11.13 rvest_1.0.2 digest_0.6.29 sctransform_0.3.3 RcppAnnoy_0.0.19
## [126] spatstat.data_2.1-2 rmarkdown_2.17 cellranger_1.1.0 leiden_0.3.9 uwot_0.1.11
## [131] shiny_1.7.2 lifecycle_1.0.2 nlme_3.1-153 jsonlite_1.8.0 carData_3.0-5
## [136] viridisLite_0.4.1 fansi_1.0.3 pillar_1.8.1 fastmap_1.1.0 httr_1.4.2
## [141] survival_3.2-13 glue_1.6.1 png_0.1-7 iterators_1.0.14 class_7.3-19
## [146] stringi_1.7.8 sass_0.4.2 irlba_2.3.5 e1071_1.7-9 future.apply_1.8.1