Run GB on Combined T cells - surfaceplus

Load packages

library(Seurat)
library(tidyverse)
library(caret)
library(rsample)
source("R/Functions.R")

Read data

sobj <- readRDS("output/Tcells_Integrated.rds")
sobj$IdentI <- factor(sobj$IdentI, levels = sobj$IdentI %>% unique %>% sort())

Idents(sobj) <- "IdentI"
DimPlot(sobj, label = T, reduction = "wnn.umap", raster = TRUE, 
        raster.dpi = c(500,500), group.by = "IdentI")+
  NoLegend()

Split data using rsample package

set.seed(1)
split <- sobj@assays$integratedADT@scale.data %>% t %>% data.frame() %>% 
  cbind(., FetchData(sobj, vars = c("integratedrna_MKI67", "integratedrna_IKZF3", "tfactivity_FOXP3-E", "IdentI"))) %>% 
  select(-IdentI) %>% 
  initial_split(prop = 0.35)

train <- training(split) %>% as.matrix()
test <- testing(split) %>% as.matrix()
  
classes_split <- initial_split(as_tibble(Idents(sobj)), prop = 0.35)
classes_split[["in_id"]] <- split[["in_id"]] # Ensures corresponding celltypes
classes_training <- training(classes_split) %>% .$value %>% as.factor()
classes_test  <- testing(classes_split) %>% .$value %>% as.factor()
  
# Ensure all classes are represented
all(levels(classes_training) == levels(classes_test))

## [1] TRUE

Train model

trControl <- trainControl(method = "cv", number = 10, search = "random", 
                          verboseIter = FALSE, allowParallel = TRUE)
  
set.seed(23)
gb_fit <- train(x = train,
                y = classes_training,
                method = "xgbTree",
                importance = TRUE,
                metric = "Accuracy",
                maximize = TRUE,
                trControl = trControl)

Print model

print(gb_fit)

## eXtreme Gradient Boosting 
## 
## 25452 samples
##    73 predictor
##    14 classes: '1', '2', '3', '5', '6', '8', '9', '11', '12', '13', '14', '15', '16', '19' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 22907, 22907, 22906, 22905, 22907, 22905, ... 
## Resampling results across tuning parameters:
## 
##   eta        max_depth  gamma     colsample_bytree  min_child_weight  subsample  nrounds  Accuracy   Kappa   
##   0.2348934  2          1.392785  0.6769847          9                0.7637030  456      0.7427325  0.712033
##   0.4211516  7          8.459473  0.5374203         12                0.8653380  499      0.7286658  0.696139
##   0.5197095  9          3.147697  0.5072482          5                0.5398074  248      0.7130682  0.678977
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 456, max_depth = 2, eta = 0.2348934, gamma = 1.392785, colsample_bytree
##  = 0.6769847, min_child_weight = 9 and subsample = 0.763703.

Predict

predict_test <- predict(gb_fit, newdata = test)

confusionMatrix(predict_test, classes_test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3    5    6    8    9   11   12   13   14   15   16   19
##         1  4000  211   57   24   60   86  193    2   43   10    3   24   12    4
##         2   209 3004  195   17   74  124  384    9   22   16    3  111   16    0
##         3    31  195 4660  586   45   17   31    7  151    7   29    4  190   29
##         5    31   16  916 6750  141   13   31  136   23   24  129   82  359   63
##         6    67   73   62  112 6279   43  533  353    7   25   60  209  184    6
##         8    76  110   13    5   16 1062   44   27    0  204    0  133   10    0
##         9   126  261   49   15  297   48 1290   54    4   16    3   75   11    0
##         11    6    7    7  103  176   39   46 1467    0  230   20  189   10    1
##         12   51   14  200   10    4    1    3    0 2047    1    2    0    8    7
##         13    2   10    1   11   19  208    5  238    0 1183   12  103    1    0
##         14    4    1   12   92   49    4    5   24    1    3 1398   17   19   13
##         15   15   40    8   64  149  135   76  169    1  133   13 1229   15    1
##         16    1    7  128  127   57    4    8    4    3    2   24    9  734   10
##         19    8    0    2   10    2    1    1    0    5    1    9    1    7  401
## 
## Overall Statistics
##                                          
##                Accuracy : 0.7511         
##                  95% CI : (0.7472, 0.755)
##     No Information Rate : 0.1677         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7216         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 8 Class: 9 Class: 11 Class: 12 Class: 13 Class: 14
## Sensitivity           0.86449  0.76070  0.73851   0.8516   0.8522  0.59496  0.48679   0.58916   0.88730   0.63774   0.81994
## Specificity           0.98290  0.97276  0.96772   0.9501   0.9565  0.98597  0.97851   0.98138   0.99331   0.98657   0.99464
## Pos Pred Value        0.84584  0.71797  0.77900   0.7746   0.7836  0.62471  0.57359   0.63755   0.87181   0.65979   0.85140
## Neg Pred Value        0.98526  0.97807  0.96004   0.9695   0.9723  0.98413  0.96979   0.97725   0.99421   0.98522   0.99327
## Prevalence            0.09789  0.08354  0.13349   0.1677   0.1559  0.03776  0.05606   0.05268   0.04881   0.03924   0.03607
## Detection Rate        0.08462  0.06355  0.09858   0.1428   0.1328  0.02247  0.02729   0.03104   0.04331   0.02503   0.02958
## Detection Prevalence  0.10004  0.08851  0.12655   0.1843   0.1695  0.03596  0.04758   0.04868   0.04967   0.03793   0.03474
## Balanced Accuracy     0.92370  0.86673  0.85312   0.9009   0.9044  0.79047  0.73265   0.78527   0.94030   0.81215   0.90729
##                      Class: 15 Class: 16 Class: 19
## Sensitivity            0.56221   0.46574  0.749533
## Specificity            0.98183   0.99160  0.998994
## Pos Pred Value         0.60010   0.65653  0.895089
## Neg Pred Value         0.97884   0.98176  0.997138
## Prevalence             0.04625   0.03334  0.011318
## Detection Rate         0.02600   0.01553  0.008483
## Detection Prevalence   0.04333   0.02365  0.009478
## Balanced Accuracy      0.77202   0.72867  0.874264

Store classes

names(classes_training) <- rownames(train)
names(classes_test) <- rownames(test)
names(predict_test) <- rownames(test)
classes <- list(classes_training, classes_test, predict_test)
names(classes) <- c("train", "test", "predict")

Show most important markers

varImp(gb_fit, scale = TRUE, top=40)

## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 73)
## 
##                     Overall
## .CD279               100.00
## .CD4                  82.45
## .CD95                 82.43
## integratedrna_MKI67   80.96
## .CD127                63.16
## tfactivity_FOXP3-E    59.19
## .CD244                57.89
## .CD31                 54.26
## .CD25                 54.21
## .CD38                 47.40
## .CD45RA               44.08
## .KLRG1                42.99
## .CD69                 35.90
## integratedrna_IKZF3   34.93
## .CD366                33.49
## .CD185                33.10
## .CD278                27.66
## .CD8a                 27.58
## .CD29                 22.08
## .CD45RO               20.02

Save objects

saveRDS(gb_fit, file = "output/GBresults_Combined_T_surfaceplus.rds")
saveRDS(classes, file = "output/GBclasses_Combined_T_surfaceplus.rds")

Session info

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /g/easybuild/x86_64/CentOS/7/haswell/software/FlexiBLAS/3.0.4-GCC-11.2.0/lib64/libflexiblas.so.3.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rsample_0.1.1      caret_6.0-90       lattice_0.20-45    ggrastr_1.0.1      ggpubr_0.4.0       readxl_1.4.1      
##  [7] forcats_0.5.1      stringr_1.4.1      dplyr_1.0.10       purrr_0.3.4        readr_2.1.2        tidyr_1.2.1       
## [13] tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.1    SeuratObject_4.0.4 Seurat_4.1.0       knitr_1.40        
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2            reticulate_1.24       tidyselect_1.1.2      htmlwidgets_1.5.4     grid_4.1.2           
##   [6] Rtsne_0.16            pROC_1.18.0           munsell_0.5.0         codetools_0.2-18      ica_1.0-2            
##  [11] xgboost_1.4.1.1       future_1.23.0         miniUI_0.1.1.1        withr_2.5.0           spatstat.random_2.1-0
##  [16] colorspace_2.0-3      highr_0.9             rstudioapi_0.13       stats4_4.1.2          ROCR_1.0-11          
##  [21] ggsignif_0.6.3        tensor_1.5            listenv_0.8.0         labeling_0.4.2        polyclip_1.10-0      
##  [26] farver_2.1.1          pheatmap_1.0.12       parallelly_1.30.0     vctrs_0.4.2           generics_0.1.3       
##  [31] ipred_0.9-12          xfun_0.33             R6_2.5.1              ggbeeswarm_0.6.0      rmdformats_1.0.4     
##  [36] spatstat.utils_2.3-0  cachem_1.0.6          assertthat_0.2.1      promises_1.2.0.1      scales_1.2.1         
##  [41] nnet_7.3-16           beeswarm_0.4.0        gtable_0.3.1          Cairo_1.5-12.2        globals_0.14.0       
##  [46] goftest_1.2-3         timeDate_3043.102     rlang_1.0.6           splines_4.1.2         rstatix_0.7.0        
##  [51] lazyeval_0.2.2        ModelMetrics_1.2.2.2  spatstat.geom_2.3-2   broom_1.0.1           yaml_2.3.5           
##  [56] reshape2_1.4.4        abind_1.4-5           modelr_0.1.8          backports_1.4.1       httpuv_1.6.6         
##  [61] tools_4.1.2           lava_1.6.10           bookdown_0.29         ellipsis_0.3.2        spatstat.core_2.4-0  
##  [66] jquerylib_0.1.4       RColorBrewer_1.1-3    proxy_0.4-26          ggridges_0.5.3        Rcpp_1.0.9           
##  [71] plyr_1.8.7            rpart_4.1-15          deldir_1.0-6          pbapply_1.5-0         cowplot_1.1.1        
##  [76] zoo_1.8-9             haven_2.4.3           ggrepel_0.9.1         cluster_2.1.2         fs_1.5.2             
##  [81] furrr_0.2.3           magrittr_2.0.3        data.table_1.14.2     scattermore_0.8       lmtest_0.9-39        
##  [86] reprex_2.0.1          RANN_2.6.1            fitdistrplus_1.1-6    matrixStats_0.61.0    hms_1.1.2            
##  [91] patchwork_1.1.2       mime_0.12             evaluate_0.16         xtable_1.8-4          gridExtra_2.3        
##  [96] compiler_4.1.2        KernSmooth_2.23-20    crayon_1.5.2          htmltools_0.5.3       mgcv_1.8-38          
## [101] later_1.3.0           tzdb_0.3.0            lubridate_1.8.0       DBI_1.1.2             dbplyr_2.1.1         
## [106] MASS_7.3-54           Matrix_1.5-1          car_3.1-0             cli_3.4.1             parallel_4.1.2       
## [111] gower_0.2.2           igraph_1.3.5          pkgconfig_2.0.3       plotly_4.10.0         spatstat.sparse_2.1-0
## [116] recipes_0.1.17        xml2_1.3.2            foreach_1.5.2         vipor_0.4.5           bslib_0.4.0          
## [121] prodlim_2019.11.13    rvest_1.0.2           digest_0.6.29         sctransform_0.3.3     RcppAnnoy_0.0.19     
## [126] spatstat.data_2.1-2   rmarkdown_2.17        cellranger_1.1.0      leiden_0.3.9          uwot_0.1.11          
## [131] shiny_1.7.2           lifecycle_1.0.2       nlme_3.1-153          jsonlite_1.8.0        carData_3.0-5        
## [136] viridisLite_0.4.1     fansi_1.0.3           pillar_1.8.1          fastmap_1.1.0         httr_1.4.2           
## [141] survival_3.2-13       glue_1.6.1            png_0.1-7             iterators_1.0.14      class_7.3-19         
## [146] stringi_1.7.8         sass_0.4.2            irlba_2.3.5           e1071_1.7-9           future.apply_1.8.1