Run GB on Combined T cells - surface

Load packages

library(Seurat)
library(tidyverse)
library(caret)
library(rsample)
source("R/Functions.R")

Read data

sobj <- readRDS("output/Tcells_Integrated.rds")
sobj$IdentI <- factor(sobj$IdentI, levels = sobj$IdentI %>% unique %>% sort())

Idents(sobj) <- "IdentI"
DimPlot(sobj, label = T, reduction = "wnn.umap", raster = TRUE, 
        raster.dpi = c(500,500), group.by = "IdentI")+
  NoLegend()

Split data using rsample package

set.seed(1)
split <- sobj@assays$integratedADT@scale.data %>% t %>% data.frame() %>% 
  cbind(., FetchData(sobj, vars = c("IdentI"))) %>% 
  select(-IdentI) %>% 
  initial_split(prop = 0.35)

train <- training(split) %>% as.matrix()
test <- testing(split) %>% as.matrix()
  
classes_split <- initial_split(as_tibble(Idents(sobj)), prop = 0.35)
classes_split[["in_id"]] <- split[["in_id"]] # Ensures corresponding celltypes
classes_training <- training(classes_split) %>% .$value %>% as.factor()
classes_test  <- testing(classes_split) %>% .$value %>% as.factor()
  
# Ensure all classes are represented
all(levels(classes_training) == levels(classes_test))

## [1] TRUE

Train model

trControl <- trainControl(method = "cv", number = 10, allowParallel = TRUE,
                          search = "random", verboseIter = FALSE)
  
set.seed(23)
gb_fit <- train(x = train,
                y = classes_training,
                method = "xgbTree",
                importance = TRUE,
                metric = "Accuracy",
                maximize = TRUE,
                trControl = trControl)

Print model

print(gb_fit)

## eXtreme Gradient Boosting 
## 
## 25452 samples
##    70 predictor
##    14 classes: '1', '2', '3', '5', '6', '8', '9', '11', '12', '13', '14', '15', '16', '19' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 22907, 22907, 22906, 22905, 22907, 22905, ... 
## Resampling results across tuning parameters:
## 
##   eta        max_depth  gamma     colsample_bytree  min_child_weight  subsample  nrounds  Accuracy   Kappa    
##   0.2348934  2          1.392785  0.6769847          9                0.7637030  456      0.7024597  0.6663547
##   0.4211516  7          8.459473  0.5374203         12                0.8653380  499      0.6908706  0.6530974
##   0.5197095  9          3.147697  0.5072482          5                0.5398074  248      0.6667058  0.6265636
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 456, max_depth = 2, eta = 0.2348934, gamma = 1.392785, colsample_bytree
##  = 0.6769847, min_child_weight = 9 and subsample = 0.763703.

Predict

predict_test <- predict(gb_fit, newdata = test)

confusionMatrix(predict_test, classes_test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3    5    6    8    9   11   12   13   14   15   16   19
##         1  3975  202   62   27   71  113  202    7   40   15   10   33   14    9
##         2   210 2946  192   19   90  132  400   14   25   22   31  128   21    1
##         3    33  191 4601  588   49   24   29    7  168    7   72   10  210   28
##         5    40   19  937 6646  137   19   43  132   22   27  483   82  326   55
##         6    75   85   67  106 6189   58  554  376    8   30  278  259  189    6
##         8    83  129   12    6   26  993   51   39    0  215    8  111    8    1
##         9   121  278   43   20  294   53 1244   55    5   17    6   77   13    0
##         11   10   13    8   84  175   64   46 1304    0  249  103  254    8    1
##         12   48   16  209   12    3    1    4    1 2026    1    9    2    9    9
##         13    2   12    4   17   17  217    8  258    0 1107   52  106    3    1
##         14    7   10   27  224   56    7    3   84    2   27  482   50   56   36
##         15   11   40   11   52  176   96   61  207    2  132   51 1059   20    0
##         16    3    8  134  118   84    7    3    6    5    6   62   15  695   13
##         19    9    0    3    7    1    1    2    0    4    0   58    0    4  375
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7117          
##                  95% CI : (0.7076, 0.7158)
##     No Information Rate : 0.1677          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.677           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 8 Class: 9 Class: 11 Class: 12 Class: 13 Class: 14
## Sensitivity           0.85909  0.74601  0.72916   0.8385   0.8400  0.55630  0.46943   0.52369   0.87820   0.59677   0.28270
## Specificity           0.98112  0.97034  0.96543   0.9410   0.9476  0.98485  0.97799   0.97733   0.99279   0.98465   0.98707
## Pos Pred Value        0.83159  0.69629  0.76467   0.7411   0.7475  0.59037  0.55885   0.56231   0.86213   0.61364   0.45005
## Neg Pred Value        0.98465  0.97670  0.95857   0.9666   0.9698  0.98263  0.96879   0.97362   0.99374   0.98355   0.97353
## Prevalence            0.09789  0.08354  0.13349   0.1677   0.1559  0.03776  0.05606   0.05268   0.04881   0.03924   0.03607
## Detection Rate        0.08409  0.06232  0.09734   0.1406   0.1309  0.02101  0.02632   0.02759   0.04286   0.02342   0.01020
## Detection Prevalence  0.10112  0.08951  0.12729   0.1897   0.1752  0.03558  0.04709   0.04906   0.04972   0.03816   0.02266
## Balanced Accuracy     0.92010  0.85817  0.84729   0.8897   0.8938  0.77058  0.72371   0.75051   0.93550   0.79071   0.63489
##                      Class: 15 Class: 16 Class: 19
## Sensitivity            0.48445   0.44099  0.700935
## Specificity            0.98095   0.98985  0.998096
## Pos Pred Value         0.55214   0.59965  0.808190
## Neg Pred Value         0.97515   0.98089  0.996582
## Prevalence             0.04625   0.03334  0.011318
## Detection Rate         0.02240   0.01470  0.007933
## Detection Prevalence   0.04058   0.02452  0.009816
## Balanced Accuracy      0.73270   0.71542  0.849515

Store classes

names(classes_training) <- rownames(train)
names(classes_test) <- rownames(test)
names(predict_test) <- rownames(test)
classes <- list(classes_training, classes_test, predict_test)
names(classes) <- c("train", "test", "predict")

Show most important markers

varImp(gb_fit, scale = TRUE, top=40)

## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 70)
## 
##         Overall
## .CD279   100.00
## .CD4      80.48
## .CD95     67.22
## .CD244    59.53
## .CD31     56.51
## .CD25     54.87
## .CD127    54.32
## .CD45RA   44.26
## .CD38     41.19
## .CD69     35.36
## .CD278    30.08
## .KLRG1    29.01
## .CD366    27.31
## .CD185    25.16
## .CD8a     20.72
## .CD29     18.90
## .CD47     15.83
## .CD39     15.67
## .TIGIT    15.10
## .CD161    14.21

Save objects

saveRDS(gb_fit, file = "output/GBresults_Combined_T_surface.rds")
saveRDS(classes, file = "output/GBclasses_Combined_T_surface.rds")

Session info

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /g/easybuild/x86_64/CentOS/7/haswell/software/FlexiBLAS/3.0.4-GCC-11.2.0/lib64/libflexiblas.so.3.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rsample_0.1.1      caret_6.0-90       lattice_0.20-45    ggrastr_1.0.1      ggpubr_0.4.0       readxl_1.4.1      
##  [7] forcats_0.5.1      stringr_1.4.1      dplyr_1.0.10       purrr_0.3.4        readr_2.1.2        tidyr_1.2.1       
## [13] tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.1    SeuratObject_4.0.4 Seurat_4.1.0       knitr_1.40        
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2            reticulate_1.24       tidyselect_1.1.2      htmlwidgets_1.5.4     grid_4.1.2           
##   [6] Rtsne_0.16            pROC_1.18.0           munsell_0.5.0         codetools_0.2-18      ica_1.0-2            
##  [11] xgboost_1.4.1.1       future_1.23.0         miniUI_0.1.1.1        withr_2.5.0           spatstat.random_2.1-0
##  [16] colorspace_2.0-3      highr_0.9             rstudioapi_0.13       stats4_4.1.2          ROCR_1.0-11          
##  [21] ggsignif_0.6.3        tensor_1.5            listenv_0.8.0         labeling_0.4.2        polyclip_1.10-0      
##  [26] farver_2.1.1          pheatmap_1.0.12       parallelly_1.30.0     vctrs_0.4.2           generics_0.1.3       
##  [31] ipred_0.9-12          xfun_0.33             R6_2.5.1              ggbeeswarm_0.6.0      rmdformats_1.0.4     
##  [36] spatstat.utils_2.3-0  cachem_1.0.6          assertthat_0.2.1      promises_1.2.0.1      scales_1.2.1         
##  [41] nnet_7.3-16           beeswarm_0.4.0        gtable_0.3.1          Cairo_1.5-12.2        globals_0.14.0       
##  [46] goftest_1.2-3         timeDate_3043.102     rlang_1.0.6           splines_4.1.2         rstatix_0.7.0        
##  [51] lazyeval_0.2.2        ModelMetrics_1.2.2.2  spatstat.geom_2.3-2   broom_1.0.1           yaml_2.3.5           
##  [56] reshape2_1.4.4        abind_1.4-5           modelr_0.1.8          backports_1.4.1       httpuv_1.6.6         
##  [61] tools_4.1.2           lava_1.6.10           bookdown_0.29         ellipsis_0.3.2        spatstat.core_2.4-0  
##  [66] jquerylib_0.1.4       RColorBrewer_1.1-3    proxy_0.4-26          ggridges_0.5.3        Rcpp_1.0.9           
##  [71] plyr_1.8.7            rpart_4.1-15          deldir_1.0-6          pbapply_1.5-0         cowplot_1.1.1        
##  [76] zoo_1.8-9             haven_2.4.3           ggrepel_0.9.1         cluster_2.1.2         fs_1.5.2             
##  [81] furrr_0.2.3           magrittr_2.0.3        data.table_1.14.2     scattermore_0.8       lmtest_0.9-39        
##  [86] reprex_2.0.1          RANN_2.6.1            fitdistrplus_1.1-6    matrixStats_0.61.0    hms_1.1.2            
##  [91] patchwork_1.1.2       mime_0.12             evaluate_0.16         xtable_1.8-4          gridExtra_2.3        
##  [96] compiler_4.1.2        KernSmooth_2.23-20    crayon_1.5.2          htmltools_0.5.3       mgcv_1.8-38          
## [101] later_1.3.0           tzdb_0.3.0            lubridate_1.8.0       DBI_1.1.2             dbplyr_2.1.1         
## [106] MASS_7.3-54           Matrix_1.5-1          car_3.1-0             cli_3.4.1             parallel_4.1.2       
## [111] gower_0.2.2           igraph_1.3.5          pkgconfig_2.0.3       plotly_4.10.0         spatstat.sparse_2.1-0
## [116] recipes_0.1.17        xml2_1.3.2            foreach_1.5.2         vipor_0.4.5           bslib_0.4.0          
## [121] prodlim_2019.11.13    rvest_1.0.2           digest_0.6.29         sctransform_0.3.3     RcppAnnoy_0.0.19     
## [126] spatstat.data_2.1-2   rmarkdown_2.17        cellranger_1.1.0      leiden_0.3.9          uwot_0.1.11          
## [131] shiny_1.7.2           lifecycle_1.0.2       nlme_3.1-153          jsonlite_1.8.0        carData_3.0-5        
## [136] viridisLite_0.4.1     fansi_1.0.3           pillar_1.8.1          fastmap_1.1.0         httr_1.4.2           
## [141] survival_3.2-13       glue_1.6.1            png_0.1-7             iterators_1.0.14      class_7.3-19         
## [146] stringi_1.7.8         sass_0.4.2            irlba_2.3.5           e1071_1.7-9           future.apply_1.8.1