Load packages
library(Seurat)
library(tidyverse)
library(caret)
library(rsample)
source("R/Functions.R")
Read data
sobj <- readRDS("output/Tcells_Integrated.rds")
sobj$IdentI <- factor(sobj$IdentI, levels = sobj$IdentI %>% unique %>% sort())
Idents(sobj) <- "IdentI"
DimPlot(sobj, label = T, reduction = "wnn.umap", raster = TRUE,
raster.dpi = c(500,500), group.by = "IdentI")+
NoLegend()

Split data using rsample package
set.seed(1)
split <- sobj@assays$integratedADT@scale.data %>% t %>% data.frame() %>%
cbind(., FetchData(sobj, vars = c("IdentI"))) %>%
select(-IdentI) %>%
initial_split(prop = 0.35)
train <- training(split) %>% as.matrix()
test <- testing(split) %>% as.matrix()
classes_split <- initial_split(as_tibble(Idents(sobj)), prop = 0.35)
classes_split[["in_id"]] <- split[["in_id"]] # Ensures corresponding celltypes
classes_training <- training(classes_split) %>% .$value %>% as.factor()
classes_test <- testing(classes_split) %>% .$value %>% as.factor()
# Ensure all classes are represented
all(levels(classes_training) == levels(classes_test))
## [1] TRUE
Train model
trControl <- trainControl(method = "cv", number = 10, allowParallel = TRUE,
search = "random", verboseIter = FALSE)
set.seed(23)
gb_fit <- train(x = train,
y = classes_training,
method = "xgbTree",
importance = TRUE,
metric = "Accuracy",
maximize = TRUE,
trControl = trControl)
Print model
## eXtreme Gradient Boosting
##
## 25452 samples
## 70 predictor
## 14 classes: '1', '2', '3', '5', '6', '8', '9', '11', '12', '13', '14', '15', '16', '19'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 22907, 22907, 22906, 22905, 22907, 22905, ...
## Resampling results across tuning parameters:
##
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds Accuracy Kappa
## 0.2348934 2 1.392785 0.6769847 9 0.7637030 456 0.7024597 0.6663547
## 0.4211516 7 8.459473 0.5374203 12 0.8653380 499 0.6908706 0.6530974
## 0.5197095 9 3.147697 0.5072482 5 0.5398074 248 0.6667058 0.6265636
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 456, max_depth = 2, eta = 0.2348934, gamma = 1.392785, colsample_bytree
## = 0.6769847, min_child_weight = 9 and subsample = 0.763703.
Predict
predict_test <- predict(gb_fit, newdata = test)
confusionMatrix(predict_test, classes_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 5 6 8 9 11 12 13 14 15 16 19
## 1 3975 202 62 27 71 113 202 7 40 15 10 33 14 9
## 2 210 2946 192 19 90 132 400 14 25 22 31 128 21 1
## 3 33 191 4601 588 49 24 29 7 168 7 72 10 210 28
## 5 40 19 937 6646 137 19 43 132 22 27 483 82 326 55
## 6 75 85 67 106 6189 58 554 376 8 30 278 259 189 6
## 8 83 129 12 6 26 993 51 39 0 215 8 111 8 1
## 9 121 278 43 20 294 53 1244 55 5 17 6 77 13 0
## 11 10 13 8 84 175 64 46 1304 0 249 103 254 8 1
## 12 48 16 209 12 3 1 4 1 2026 1 9 2 9 9
## 13 2 12 4 17 17 217 8 258 0 1107 52 106 3 1
## 14 7 10 27 224 56 7 3 84 2 27 482 50 56 36
## 15 11 40 11 52 176 96 61 207 2 132 51 1059 20 0
## 16 3 8 134 118 84 7 3 6 5 6 62 15 695 13
## 19 9 0 3 7 1 1 2 0 4 0 58 0 4 375
##
## Overall Statistics
##
## Accuracy : 0.7117
## 95% CI : (0.7076, 0.7158)
## No Information Rate : 0.1677
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.677
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 8 Class: 9 Class: 11 Class: 12 Class: 13 Class: 14
## Sensitivity 0.85909 0.74601 0.72916 0.8385 0.8400 0.55630 0.46943 0.52369 0.87820 0.59677 0.28270
## Specificity 0.98112 0.97034 0.96543 0.9410 0.9476 0.98485 0.97799 0.97733 0.99279 0.98465 0.98707
## Pos Pred Value 0.83159 0.69629 0.76467 0.7411 0.7475 0.59037 0.55885 0.56231 0.86213 0.61364 0.45005
## Neg Pred Value 0.98465 0.97670 0.95857 0.9666 0.9698 0.98263 0.96879 0.97362 0.99374 0.98355 0.97353
## Prevalence 0.09789 0.08354 0.13349 0.1677 0.1559 0.03776 0.05606 0.05268 0.04881 0.03924 0.03607
## Detection Rate 0.08409 0.06232 0.09734 0.1406 0.1309 0.02101 0.02632 0.02759 0.04286 0.02342 0.01020
## Detection Prevalence 0.10112 0.08951 0.12729 0.1897 0.1752 0.03558 0.04709 0.04906 0.04972 0.03816 0.02266
## Balanced Accuracy 0.92010 0.85817 0.84729 0.8897 0.8938 0.77058 0.72371 0.75051 0.93550 0.79071 0.63489
## Class: 15 Class: 16 Class: 19
## Sensitivity 0.48445 0.44099 0.700935
## Specificity 0.98095 0.98985 0.998096
## Pos Pred Value 0.55214 0.59965 0.808190
## Neg Pred Value 0.97515 0.98089 0.996582
## Prevalence 0.04625 0.03334 0.011318
## Detection Rate 0.02240 0.01470 0.007933
## Detection Prevalence 0.04058 0.02452 0.009816
## Balanced Accuracy 0.73270 0.71542 0.849515
Store classes
names(classes_training) <- rownames(train)
names(classes_test) <- rownames(test)
names(predict_test) <- rownames(test)
classes <- list(classes_training, classes_test, predict_test)
names(classes) <- c("train", "test", "predict")
Show most important markers
varImp(gb_fit, scale = TRUE, top=40)
## xgbTree variable importance
##
## only 20 most important variables shown (out of 70)
##
## Overall
## .CD279 100.00
## .CD4 80.48
## .CD95 67.22
## .CD244 59.53
## .CD31 56.51
## .CD25 54.87
## .CD127 54.32
## .CD45RA 44.26
## .CD38 41.19
## .CD69 35.36
## .CD278 30.08
## .KLRG1 29.01
## .CD366 27.31
## .CD185 25.16
## .CD8a 20.72
## .CD29 18.90
## .CD47 15.83
## .CD39 15.67
## .TIGIT 15.10
## .CD161 14.21
Save objects
saveRDS(gb_fit, file = "output/GBresults_Combined_T_surface.rds")
saveRDS(classes, file = "output/GBclasses_Combined_T_surface.rds")
Session info
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /g/easybuild/x86_64/CentOS/7/haswell/software/FlexiBLAS/3.0.4-GCC-11.2.0/lib64/libflexiblas.so.3.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rsample_0.1.1 caret_6.0-90 lattice_0.20-45 ggrastr_1.0.1 ggpubr_0.4.0 readxl_1.4.1
## [7] forcats_0.5.1 stringr_1.4.1 dplyr_1.0.10 purrr_0.3.4 readr_2.1.2 tidyr_1.2.1
## [13] tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.1 SeuratObject_4.0.4 Seurat_4.1.0 knitr_1.40
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 reticulate_1.24 tidyselect_1.1.2 htmlwidgets_1.5.4 grid_4.1.2
## [6] Rtsne_0.16 pROC_1.18.0 munsell_0.5.0 codetools_0.2-18 ica_1.0-2
## [11] xgboost_1.4.1.1 future_1.23.0 miniUI_0.1.1.1 withr_2.5.0 spatstat.random_2.1-0
## [16] colorspace_2.0-3 highr_0.9 rstudioapi_0.13 stats4_4.1.2 ROCR_1.0-11
## [21] ggsignif_0.6.3 tensor_1.5 listenv_0.8.0 labeling_0.4.2 polyclip_1.10-0
## [26] farver_2.1.1 pheatmap_1.0.12 parallelly_1.30.0 vctrs_0.4.2 generics_0.1.3
## [31] ipred_0.9-12 xfun_0.33 R6_2.5.1 ggbeeswarm_0.6.0 rmdformats_1.0.4
## [36] spatstat.utils_2.3-0 cachem_1.0.6 assertthat_0.2.1 promises_1.2.0.1 scales_1.2.1
## [41] nnet_7.3-16 beeswarm_0.4.0 gtable_0.3.1 Cairo_1.5-12.2 globals_0.14.0
## [46] goftest_1.2-3 timeDate_3043.102 rlang_1.0.6 splines_4.1.2 rstatix_0.7.0
## [51] lazyeval_0.2.2 ModelMetrics_1.2.2.2 spatstat.geom_2.3-2 broom_1.0.1 yaml_2.3.5
## [56] reshape2_1.4.4 abind_1.4-5 modelr_0.1.8 backports_1.4.1 httpuv_1.6.6
## [61] tools_4.1.2 lava_1.6.10 bookdown_0.29 ellipsis_0.3.2 spatstat.core_2.4-0
## [66] jquerylib_0.1.4 RColorBrewer_1.1-3 proxy_0.4-26 ggridges_0.5.3 Rcpp_1.0.9
## [71] plyr_1.8.7 rpart_4.1-15 deldir_1.0-6 pbapply_1.5-0 cowplot_1.1.1
## [76] zoo_1.8-9 haven_2.4.3 ggrepel_0.9.1 cluster_2.1.2 fs_1.5.2
## [81] furrr_0.2.3 magrittr_2.0.3 data.table_1.14.2 scattermore_0.8 lmtest_0.9-39
## [86] reprex_2.0.1 RANN_2.6.1 fitdistrplus_1.1-6 matrixStats_0.61.0 hms_1.1.2
## [91] patchwork_1.1.2 mime_0.12 evaluate_0.16 xtable_1.8-4 gridExtra_2.3
## [96] compiler_4.1.2 KernSmooth_2.23-20 crayon_1.5.2 htmltools_0.5.3 mgcv_1.8-38
## [101] later_1.3.0 tzdb_0.3.0 lubridate_1.8.0 DBI_1.1.2 dbplyr_2.1.1
## [106] MASS_7.3-54 Matrix_1.5-1 car_3.1-0 cli_3.4.1 parallel_4.1.2
## [111] gower_0.2.2 igraph_1.3.5 pkgconfig_2.0.3 plotly_4.10.0 spatstat.sparse_2.1-0
## [116] recipes_0.1.17 xml2_1.3.2 foreach_1.5.2 vipor_0.4.5 bslib_0.4.0
## [121] prodlim_2019.11.13 rvest_1.0.2 digest_0.6.29 sctransform_0.3.3 RcppAnnoy_0.0.19
## [126] spatstat.data_2.1-2 rmarkdown_2.17 cellranger_1.1.0 leiden_0.3.9 uwot_0.1.11
## [131] shiny_1.7.2 lifecycle_1.0.2 nlme_3.1-153 jsonlite_1.8.0 carData_3.0-5
## [136] viridisLite_0.4.1 fansi_1.0.3 pillar_1.8.1 fastmap_1.1.0 httr_1.4.2
## [141] survival_3.2-13 glue_1.6.1 png_0.1-7 iterators_1.0.14 class_7.3-19
## [146] stringi_1.7.8 sass_0.4.2 irlba_2.3.5 e1071_1.7-9 future.apply_1.8.1