a b/notebooks/scRNAseq_compilation.Rmd
1
---
2
title: "Analysis of scRNA-seq data (compilation)"
3
date: "Last modified: `r format(Sys.time(), '%B %d, %Y')`"
4
tags: [scRNA-seq, melanoma, immunotherapy, PBMCs] 
5
output:
6
  html_document:
7
    theme: flatly
8
    highlight: zenburn
9
    toc: true
10
    number_sections: false
11
    toc_depth: 2
12
    toc_float:
13
      collapsed: false
14
      smooth_scroll: true
15
    code_folding: hide
16
    self_contained: yes
17
---
18
19
# Project Description
20
21
This project aims to identify therapeutic targets for melanoma patients based on scRNAseq data analysis.  
22
  
23
## Datasets
24
25
Disease datasets:  
26
27
  * Dataset 1 (Treatment Response Dataset)
28
    - Contains treatment response data
29
    - Includes responder/non-responder classifications
30
    - Focus on immunotherapy outcomes
31
32
  * Dataset 2 (Tumor Cell Dataset)
33
    - Characterizes tumor cell heterogeneity
34
    - Includes cell type annotations
35
    - Focus on tumor cell populations
36
37
  * Dataset 3 (Tumor Microenvironment Dataset)
38
    - Maps the tumor microenvironment
39
    - Contains malignant/non-malignant classifications
40
    - Includes detailed immune cell typing
41
  
42
Control datasets:  
43
44
  * Dataset 4 (Control Melanocyte Dataset)
45
    - Normal melanocyte control data
46
    - Reference for non-malignant state
47
    - Baseline expression profiles
48
49
  * Dataset 5 (Control Immune Dataset)
50
    - Normal immune cell control data
51
    - Reference for immune cell states
52
    - Baseline immune profiles
53
  
54
## Processes/Analyses
55
56
The following processing or analytical steps were conducted:  
57
58
  0. Environment initialization
59
  1. scRNAseq data consolidation and quality control  
60
  2. Cell clustering based on UMAP dimensionality reduction  
61
  3. Biomarker identification based on UMAP cluster groups  
62
  4. Differential gene expression analysis  
63
  5. Gene set enrichment analysis  
64
  6. Cell classification using Random Forests  
65
  7. Output saving (if prompted)  
66
  
67
  
68
# 0. Environment initialization
69
70
Install required packages.
71
72
```{r install_packages, results="hide", message=F, warning=F, error=F}
73
74
# Initiate Renv (for reproducibility)
75
renv::init()
76
77
# Define packages to install
78
pkg.list = c('renv', 'svDialogs', 'vroom', 'dplyr', 'DT', 'openxlsx', 'progress',
79
             'ggplot2', 'ggrepel', 'ggvenn', 'ggfortify', 'pheatmap', 
80
             'patchwork', 'Seurat', 'harmony', 'MAST', 'enrichR', 'UpSetR', 
81
             'singleCellNet')
82
83
# Install MAST (if needed)
84
if ('MAST' %in% pkg.install) {
85
  if (!requireNamespace('BiocManager', quietly=T)) {
86
    install.packages('BiocManager')
87
  }
88
  BiocManager::install('MAST')
89
  pkg.install <- pkg.install[!('MAST' %in% pkg.install)]
90
}
91
92
# Install SingleCellNet (if needed)
93
if ('singleCellNet' %in% pkg.install) {
94
  install.packages("devtools")
95
  devtools::install_github("pcahan1/singleCellNet")
96
  pkg.install <- pkg.install[!('singleCellNet' %in% pkg.install)]
97
}
98
99
# Define packages not already installed
100
pkg.install <- pkg.list[!(pkg.list %in% installed.packages()[, 'Package'])]
101
102
# Install uninstalled packages
103
if (length(pkg.install) > 0) {
104
  install.packages(pkg.install)
105
}
106
107
```
108
109
Load installed packages.  
110
111
```{r load_packages, results="hide", message=F, warning=F, error=F}
112
113
# Load packages
114
library(renv)           # for version control + reproducibility
115
library(svDialogs)      # for prompting user-input
116
library(vroom)          # for quickly reading data
117
library(dplyr)          # for data processing
118
library(DT)             # to display datatables
119
library(rio)            # to load all worksheets in a workbook
120
library(openxlsx)       # to write data to excel
121
library(progress)       # to display progress bar
122
library(ggplot2)        # for data visualization
123
library(ggrepel)        # to use geom_pont_repel()
124
library(ggvenn)         # to visualize venn diagrams
125
library(ggfortify)      # to visualize PCA plots
126
library(pheatmap)       # to visualize heatmaps
127
library(patchwork)      # for combining plots
128
library(Seurat)         # for scRNA-seq analysis
129
library(harmony)        # to integration of scRNA-seq data
130
library(MAST)           # for scRNAseq DEG analysis
131
library(enrichR)        # to perform GSEA
132
library(UpSetR)         # to visualize multiple set comparisons
133
library(singleCellNet)  # for RF implementation
134
135
```
136
137
## Custom functions
138
139
`qc_filter()`: ilter data based on QC for scRNA-seq data. 
140
141
```{r qc_filter, results="hide", message=F, warning=F, error=F}
142
143
qc_filter <- function(obj, feat.t=c(200, 2500), pct.mt.t=5, var.method='vst', 
144
                      feat.n=2000, qc.plot=T, top.n=10, title='') {
145
  
146
  ############################ FUNCTION DESCRIPTION ############################
147
  # feat.t = lower and upper limits on unique gene counts
148
  # pct.mt.t = threshold of level in mitochondrial contamination
149
  # var.method = method for selecting highly variable genes
150
  # feat.n = number of variable genes to select
151
  # qc.plot = boolean whether to generate plots to decide downstream analyses
152
  # title = string to use for plot title
153
  ############################## BEGIN FUNCTION ################################
154
  
155
  # determine percentage of mitochondrial contamination
156
  obj[['pct.mt']] <- PercentageFeatureSet(obj, pattern='^MT-')
157
  # filter + nomalize + scale data
158
  obj <- obj %>% 
159
    subset(., subset=(nFeature_RNA > feat.t[1]) & (nFeature_RNA < feat.t[2]) & 
160
             (pct.mt < pct.mt.t)) %>% NormalizeData(.) %>% 
161
    FindVariableFeatures(., selection.method=var.method) %>% 
162
    ScaleData(.) %>% RunPCA(., features=VariableFeatures(object=.))
163
  # generate follow-up QC plots (if prompted)
164
  if (qc.plot) { 
165
    p1 <- VariableFeaturePlot(obj) %>% 
166
      LabelPoints(plot=., points=head(VariableFeatures(obj), top.n), repel=T)
167
    p2 <- ElbowPlot(obj)
168
    plot(p1 + p2 + plot_annotation(title=title))
169
  }
170
  # return output
171
  return(obj)
172
}
173
```
174
175
`de_analyze()`: conduct differential expression (DE) analysis based on unpaired Student's t-test. 
176
177
```{r de_analyze, results="hide", message=F, warning=F, error=F}
178
179
de_analyze <- function(m1, m2, alt='two.sided', paired=F, var.equal=F, 
180
                       adj.method='bonferroni', t=0.05, de.plot=F, title='') { 
181
  
182
  ############################ FUNCTION DESCRIPTION ############################
183
  # m1, m2 = expression matrices to compare
184
  # alt, paired, var.equal = arguments for t.test() function
185
  # adj.method = method for calculating adjusted p-value
186
  # t = threshold for significance 
187
  # de.plot = boolean whether to generate a volcano plot
188
  ############################## BEGIN FUNCTION ################################
189
  
190
  # make sure two matrices have same number of rows
191
  if (nrow(m1) != nrow(m2)) { 
192
    stop('Row length does not match between the provided matrices.')
193
  }
194
  # make sure gene names align between matrices
195
  if (!all(rownames(m1) == rownames(m2))) { 
196
    stop('Gene names do not align between provided matrices.')
197
    }
198
  # instantiate output variable
199
  results <- data.frame(gene=rownames(m1), 
200
                        t.stat=vector(mode='numeric', length=nrow(m1)), 
201
                        p.val=vector(mode='numeric', length=nrow(m1)))
202
  # conduct unpaired t-test with unequal variance for each gene
203
  pb <- progress_bar$new(
204
    format='  analyzing [:bar] :percent time left: :eta', total=nrow(m1))
205
  for (i in 1:nrow(m1)) { 
206
    pb$tick()
207
    x <- m1[i, ]; y <- m2[i, ]
208
    r <- t.test(x, y, alternative=alt, paired=paired, var.equal=var.equal)
209
    results$t.stat[i] <- r$statistic
210
    results$p.val[i] <- r$p.value
211
  }
212
  # determine adjusted p-values
213
  results$q.val <- p.adjust(results$p.val, method=adj.method)
214
  # add additional fields
215
  results <- results %>%
216
    mutate(Significance=case_when(q.val < t & t.stat > 0 ~ 'Up',
217
                                  q.val < t & t.stat < 0 ~ 'Down',
218
                                  T ~ 'NS')) %>% arrange(q.val)
219
  # generate volcano plot (if prompted)
220
  if (de.plot) { 
221
    p <- results %>% arrange(t.stat) %>% ggplot(data=., 
222
           aes(x=t.stat, y=-log10(q.val), col=Significance, label=gene)) +
223
      geom_point() + geom_text_repel() + theme_minimal() + 
224
      scale_color_manual(values=c('blue', 'black', 'red')) + ggtitle(title)
225
    plot(p)
226
  }
227
  # return output
228
  return(results)
229
}
230
231
```
232
233
## Additional settings.  
234
235
```{r settings}
236
237
# Adjust system settings
238
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
239
240
# Save plots? (default: F)
241
save.plots <- dlgInput('Save all plots? (T/F)', F)$res
242
243
# Set seed (for reproducibility)
244
set.seed(123)
245
246
```
247
248
249
# 1. scRNAseq data consolidation and quality control
250
251
Load all scRNA-seq files or load already saved workspace.  
252
253
```{r load_data, warning=F, message=F}
254
255
f <- list.files()
256
if (any(endsWith(f, '.RData'))) {
257
  load(f[endsWith(f, '.RData')][1])
258
} else { 
259
  # Define variables to store data
260
  obj <- list(); meta <- list()
261
  
262
  # Dataset 1: Treatment Response Dataset
263
  meta$dataset1 <- read.delim(file='data/dataset1/metadata.txt')
264
  x <- vroom('data/dataset1/counts.txt', col_names=F)
265
  x <- x[, 1:nrow(meta$dataset1)] %>% as.matrix(.)
266
  y <- read.delim('data/dataset1/genes.txt', header=F)
267
  rownames(x) <- make.unique(y$V1)
268
  colnames(x) <- meta$dataset1$ID
269
  obj$dataset1 <- CreateSeuratObject(x, project='dataset1',
270
                                      min.cells=3, min.features=200)
271
  obj$dataset1$Treatment <- as.factor(meta$dataset1$Treatment)
272
  obj$dataset1$Response <- as.factor(meta$dataset1$Response)
273
  rm(x, y); gc()
274
  
275
  # Dataset 2: Tumor Cell Dataset
276
  meta$dataset2 <- read.csv('data/dataset2/metadata.csv')
277
  x <- vroom('data/dataset2/expression_matrix.csv')
278
  y <- as.matrix(x[, -1]); rownames(y) <- x$...1
279
  obj$dataset2 <- CreateSeuratObject(y, project='dataset2',
280
                                      min.cells=3, min.features=200)
281
  obj$dataset2$type <- as.factor(meta$dataset2$cell.types)
282
  
283
  # Dataset 3: Tumor Microenvironment Dataset
284
  x <- vroom('data/dataset3/expression_matrix.txt')
285
  y <- as.matrix(x[-c(1:3), -1]); rownames(y) <- x$Cell[-c(1:3)]
286
  obj$dataset3 <- CreateSeuratObject(y, project='dataset3',
287
                                     min.cells=3, min.features=200)
288
  meta$dataset3 <- data.frame(Tumor=t(x[1, -1]),
289
                              Malignant=case_when(x[2, -1] == 1 ~ 'No',
290
                                                  x[2, -1] == 2 ~ 'Yes',
291
                                                  x[2, -1] == 0 ~ 'Unresolved'),
292
                              NMType=case_when(x[3, -1] == 1 ~ 'T',
293
                                               x[3, -1] == 2 ~ 'B',
294
                                               x[3, -1] == 3 ~ 'Macro',
295
                                               x[3, -1] == 4 ~ 'Endo',
296
                                               x[3, -1] == 5 ~ 'CAF',
297
                                               x[3, -1] == 6 ~ 'NK',
298
                                               x[3, -1] == 0 ~ 'NA'))
299
  obj$dataset3$Malignant <- meta$dataset3$Malignant
300
  obj$dataset3$NMType <- meta$dataset3$NMType
301
302
  # Dataset 4: Control Melanocyte Dataset
303
  x <- vroom('data/dataset4/expression_matrix.csv')
304
  y <- as.matrix(x[, -1]); rownames(y) <- x$...1
305
  obj$dataset4 <- CreateSeuratObject(y, project='dataset4',
306
                                      min.cells=3, min.features=200)
307
  
308
  # Dataset 5: Control Immune Dataset
309
  meta$dataset5 <- read.table('data/dataset5/metadata.tsv',
310
                               sep='\t', header=T)
311
  x <- vroom('data/dataset5/matrix.tsv')
312
  y <- as.matrix(x[, -1]); rownames(y) <- x$Gene.Name
313
  obj$dataset5 <- CreateSeuratObject(y, project='dataset5',
314
                                      min.cells=3, min.features=200)
315
  obj$dataset5$sample <- as.factor(meta$dataset5$sample)
316
  
317
  # Clear unnecessary memory
318
  rm(f, x, y, meta); gc()
319
}
320
321
```
322
323
Combine all datasets together into single Seurat object. Also apply `harmony` to remove clustering bias based on dataset source.  
324
325
```{r combine_data, warning=F, message=F} 
326
327
# Combine all datasets
328
if (!exists('data.all')) { 
329
  data.all <- list()
330
}
331
if (!('raw' %in% names(data.all))) { 
332
  data.all$raw <- merge(obj$dataset1, obj$dataset2) %>%
333
    merge(., obj$dataset3) %>% merge(., obj$dataset4) %>% merge(., obj$dataset5)
334
}
335
336
# Add source information
337
if (!('Source' %in% names(data.all$raw@meta.data))) { 
338
  data.all$raw$Source <- c(rep(names(obj)[1], ncol(obj$dataset1)),
339
                           rep(names(obj)[2], ncol(obj$dataset2)),
340
                           rep(names(obj)[3], ncol(obj$dataset3)),
341
                           rep(names(obj)[4], ncol(obj$dataset4)),
342
                           rep(names(obj)[5], ncol(obj$dataset5)))
343
}
344
345
# Apply QC
346
if (!('proc' %in% names(data.all))) { 
347
  data.all$proc <- qc_filter(data.all$raw)
348
}
349
350
# Visualize integration results
351
p1 <- DimPlot(object=data.all$proc, reduction='pca', pt.size=0.1, 
352
              group.by='Source')
353
p2 <- VlnPlot(object=data.all$proc, features='PC_1', pt.size=0.1, 
354
              group.by='Source')
355
p1 + p2
356
if (save.plots) { 
357
  ggsave('plots/QC_no_harmony.pdf', width=10, height=5, units='in')
358
}
359
360
# Apply harmony (to remove clustering based on dataset source)
361
if (!('harmony' %in% names(data.all$proc@reductions))) { 
362
  data.all$proc <- data.all$proc %>% RunHarmony('Source', plot_convergence=T)
363
}
364
365
# Visualize integration results (after harmony)
366
p1 <- DimPlot(object=data.all$proc, reduction='harmony', pt.size=0.1, 
367
              group.by='Source')
368
p2 <- VlnPlot(object=data.all$proc, features='harmony_1', pt.size=0.1, 
369
              group.by='Source')
370
p1 + p2
371
if (save.plots) { 
372
  ggsave('plots/QC_w_harmony.pdf', width=10, height=5, units='in')
373
}
374
375
```
376
377
378
# 2. Cell clustering based on UMAP dimensionality reduction
379
380
Cluster cells based on uniform manifold approximation and projection (UMAP).  
381
382
```{r UMAP_clustering, warning=F, message=T}
383
384
# Visualize UMAP plots (runtime: ~5 minutes)
385
if (!('umap' %in% names(data.all$proc@reductions))) { 
386
  data.all$proc <- data.all$proc %>% 
387
      RunUMAP(reduction='harmony', dims=1:20) %>% 
388
      FindNeighbors(reduction='harmony', dims=1:20) %>% 
389
      FindClusters(resolution=0.5) %>% identity()
390
}
391
392
#   by dataset source
393
p <- DimPlot(data.all$proc, reduction='umap', group.by='Source', pt.size=0.1, 
394
        split.by='Source') + ggtitle('UMAP split by dataset source'); plot(p)
395
#   by cluster (unlabeled)
396
p <- DimPlot(data.all$proc, reduction='umap', label=T, pt.size=0.1) + 
397
  ggtitle('UMAP of combined scRNA-seq data (unlabeled)'); plot(p)
398
if (save.plots) { 
399
  ggsave('plots/UMAP_unlabeled.pdf', width=10, height=5, units='in')
400
}
401
402
#   by cluster (labeled)
403
lab <- c('Tumor cell', 'Technical error 1', 'Cytotoxic CD8 T cell 1', 'B cell', 
404
         'Melanocytes 1', 'Macrophage', 'Cytotoxic CD8 T cell 2', 'Technical error 2', 
405
         'Memory T cell', 'Melanocytes 2', 'Dysfunctional CD8 T cell', 
406
         'Cytotoxic CD8 T cell 3', '? 1', 'Melanocytes 3', 'Activated cell', 
407
         'Exhausted T cell', 'Cytotoxic T cell', '? 2', '? 3', 'Melanocytes 4', '? 4')
408
names(lab) <- levels(data.all$proc)
409
plot.data <- data.all$proc %>% RenameIdents(., lab)
410
p <- DimPlot(plot.data, reduction='umap', label=T, pt.size=0.1) + 
411
  ggtitle('UMAP of combined scRNA-seq data (labeled)'); plot(p)
412
if (save.plots) { 
413
  ggsave('plots/UMAP_labeled.pdf', width=10, height=5, units='in')
414
}
415
416
```
417
418
419
# 3. Biomarker identification based on UMAP cluster groups
420
421
Determine biomarkers based on UMAP cluster assignment. 
422
423
```{r biomarkers, warning=F, message=T}
424
425
# Find all biomarkers based on clustering (runtime: ~30 minutes)
426
if (!('bm' %in% names(data.all))) { 
427
  data.all$bm <- FindAllMarkers(data.all$proc, min.pct=0.25, logfc.threshold=0.25)
428
}
429
430
# View table of top 3 biomarkers for each cluster
431
d <- data.all$bm %>% group_by(cluster) %>% slice_max(n=3, order_by=avg_log2FC)
432
datatable(d)
433
434
# Visualize ridge plots based on biomarkers of interest
435
ridge.feat <- dlg_list(sort(unique(d$gene)), multiple=T)$res
436
p <- RidgePlot(data.all$proc, features=ridge.feat, 
437
          ncol=ceiling(length(ridge.feat) / 2)); plot(p)
438
if (save.plots) { 
439
  ggsave('plots/biomarker_ridge_plots.pdf', width=10, height=10, units='in')
440
}
441
442
```
443
444
445
# 4. Differential gene expression analysis
446
447
Determine differentially expressed genes (DEGs) for the following group comparisons:  
448
449
  * Disease vs. control (tumor)  
450
  * Disease vs. control (immune cells, bulk)  
451
  * Disease vs. control (immune cells, cluster-based)  
452
  * Responder vs. non-responder (specific to GSE120575)  
453
  
454
Of note, DEGs were determined using a custom function for unpaired Student's t-test and [MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html). 
455
```{r DE_initialization, warning=F, message=T, eval=!('data' %in% de)}
456
457
# Instantiate DE variable + relevant fields
458
de <- list(); de$ttest <- list(); de$mast <- list()
459
460
# Define data to work with
461
de$data <- GetAssayData(object=data.all$proc, slot='data')
462
463
```
464
465
## Case: disease vs. control (tumor)
466
467
Comparison between benign vs. tumor skin cells. 
468
Estimated runtime: ~15 minutes. 
469
470
```{r DE_tumor, warning=F, message=T, eval=!('tumor' %in% de$ttest)}
471
472
# Start timer
473
t.start <- Sys.time()
474
475
# Define cell groups to compare
476
ix <- intersect(rownames(obj$dataset3), rownames(obj$dataset2)) %>%
477
  intersect(., rownames(obj$dataset4))
478
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes'
479
jx2 <- data.all$proc$Source == names(obj)[4]
480
481
# DE analysis (t-test)
482
x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
483
de$ttest$tumor <- de_analyze(x, y) %>% na.omit
484
485
# DE analysis (MAST)
486
x <- cbind(de$data[ix, jx1], de$data[ix, jx2]) %>% CreateSeuratObject(.)
487
Idents(object=x, cells=1:sum(jx1)) <- 'Disease'
488
Idents(object=x, cells=sum(jx1)+1:ncol(x)) <- 'Control'
489
de$mast$tumor <- FindMarkers(object=x, ident.1='Disease', ident.2='Control', 
490
                             test.use='MAST')
491
492
# End timer + log time elapsed
493
t.end <- Sys.time()
494
t.end - t.start
495
496
```
497
498
## Case: disease vs. control (immune cells, bulk)
499
500
Bulk comparison between healthy vs. diseased immune cells. 
501
Estimated runtime: ~20 minutes. 
502
503
```{r DE_immune_bulk,warning=F, message=T, eval=!('immune' %in% de$ttest)}
504
505
# Start timer
506
t.start <- Sys.time()
507
508
# Define cell groups to compare
509
ix <- intersect(rownames(obj$dataset1), rownames(obj$dataset2)) %>% 
510
  intersect(., rownames(obj$dataset5))
511
jx1 <- data.all$proc$Source == names(obj)[1] | data.all$proc$Malignant %in% 'No'
512
jx2 <- data.all$proc$Source == names(obj)[5]
513
514
# DE analysis (t-test)
515
x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
516
de$ttest$immune <- de_analyze(x, y) %>% na.omit()
517
518
# DE analysis (MAST)
519
x <- cbind(de$data[ix, jx1], de$data[ix, jx2]) %>% CreateSeuratObject(.)
520
Idents(object=x, cells=1:sum(jx1)) <- 'Tumor'
521
Idents(object=x, cells=sum(jx1)+1:ncol(x)) <- 'Benign'
522
de$mast$immune <- FindMarkers(object=x, ident.1='Tumor', ident.2='Benign', 
523
                              test.use='MAST')
524
525
# End timer + log time elapsed
526
t.end <- Sys.time()
527
t.end - t.start
528
529
```
530
531
## Case: disease vs. control (immune cells, cluster-based)
532
533
Immune cell-specific comparisons between healthy vs. diseased cells. 
534
Estimated runtime: ~15 minutes. 
535
536
```{r DE_immune_cluster, warning=F, message=T, eval=!('B' %in% de$ttest)}
537
538
# Start timer
539
t.start <- Sys.time()
540
541
# Define labeled scRNAseq data
542
lab <- c('Tumor cell', 'Technical error 1', 'Cytotoxic CD8 T cell 1', 'B cell', 
543
         'Melanocytes 1', 'Macrophage', 'Cytotoxic CD8 T cell 2', 'Technical error 2', 
544
         'Memory T cell', 'Melanocytes 2', 'Dysfunctional CD8 T cell', 
545
         'Cytotoxic CD8 T cell 3', '? 1', 'Melanocytes 3', 'Activated cell', 
546
         'Exhausted T cell', 'Cytotoxic T cell', '? 2', '? 3', 'Melanocytes 4', '? 4')
547
names(lab) <- levels(data.all$proc)
548
plot.data <- data.all$proc %>% RenameIdents(., lab)
549
550
# Iterate through each immune cell group (runtime: ~5 minutes)
551
immune.cells <- c('B', 'CD8', 'Macrophage', 'Memory', 'Activated')
552
for (i in 1:length(immune.cells)) { 
553
  
554
  # Define row + column indices
555
  ix <- intersect(rownames(obj$dataset1), rownames(obj$dataset2)) %>% 
556
    intersect(., rownames(obj$dataset5))
557
  c <- grepl(immune.cells[i], Idents(plot.data))
558
  jx1 <- (plot.data$Source == names(obj)[1] | 
559
            data.all$proc$Malignant %in% 'No') & c
560
  jx2 <- data.all$proc$Source == names(obj)[5] & c
561
  
562
  # # Print number of genes + cells
563
  # cat(sprintf('No. of genes: %d\tNo. of G1: %d\tNo. of G2: %d\n', 
564
  #             length(ix), sum(jx1), sum(jx2)))
565
  
566
  # DE analysis (t-test)
567
  x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
568
  de$ttest[[immune.cells[i]]] <- de_analyze(x, y) %>% na.omit()
569
570
  # DE analysis (MAST)
571
  x <- cbind(de$data[ix, jx1], de$data[ix, jx2]) %>% CreateSeuratObject(.)
572
  Idents(object=x, cells=1:sum(jx1)) <- 'Tumor'
573
  Idents(object=x, cells=sum(jx1)+1:ncol(x)) <- 'Benign'
574
  de$mast[[immune.cells[i]]] <- FindMarkers(object=x, ident.1='Tumor',
575
                                            ident.2='Benign', test.use='MAST')
576
}
577
578
# End timer + log time elapsed
579
t.end <- Sys.time()
580
t.end - t.start
581
582
```
583
584
## Case: responder vs. non-responder (GSE120575)
585
586
Comparison between responder vs. non-responder cells to immunotherapy. 
587
Estimated runtime: ~1 hour.  
588
589
```{r DE_response, warning=F, message=T, eval=!('response' %in% de$ttest)}
590
591
# Start timer
592
t.start <- Sys.time()
593
594
# Define cell groups to compare
595
ix <- rownames(obj$dataset1)
596
jx1 <- data.all$proc$Response %in% 'Responder'
597
jx2 <- data.all$proc$Response %in% 'Non-responder'
598
599
# DE analysis (t-test)
600
x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
601
de$ttest$response <- de_analyze(x, y) %>% na.omit
602
603
# DE analysis (MAST)
604
x <- cbind(de$data[ix, jx1], de$data[ix, jx2]) %>% CreateSeuratObject(.)
605
Idents(object=x, cells=1:sum(jx1)) <- 'Responder'
606
Idents(object=x, cells=sum(jx1)+1:ncol(x)) <- 'Non-responder'
607
de$mast$response <- FindMarkers(object=x, ident.1='Responder', 
608
                                ident.2='Non-responder', test.use='MAST')
609
610
# End timer + log time elapsed
611
t.end <- Sys.time()
612
t.end - t.start
613
614
```
615
616
## Visualizations
617
618
**Description**: visual inspection of DE analysis results. 
619
620
### T-test vs. MAST DEGs
621
622
Visual comparison of DEGs determined with t-test vs. MAST (venn diagrams).
623
624
```{r DE_venn, warning=F, message=T}
625
626
# Compare DEGs b/t the two methods
627
f <- names(de$mast); p <- list()
628
for (i in 1:length(f)) { 
629
  x <- de$ttest[[f[i]]]; y <- de$mast[[f[i]]]
630
  g <- intersect(x$gene[x$q.val < 0.05], row.names(y)[y$p_val_adj < 0.05])
631
  # g <- intersect(x$gene[x$q.val < 0.05], y$gene[y$p_val_adj < 0.05]) 
632
  cat(sprintf('No. of intersecting DEGs for %s: %d\n', f[i], length(g)))
633
  ggvenn(list('t-test'=x$gene[x$q.val < 0.05], 
634
              'MAST'=row.names(y)[y$p_val_adj < 0.05])) + ggtitle(f[i])
635
  ggsave(paste0('plots/DE_venn_', f[i], '.pdf'), width=5, height=5, units='in')
636
}
637
638
```
639
640
### Responder vs. non-responder DEGs
641
642
Clustergram (or heatmap) of DEGs identified for responder vs. non-responder comparison. 
643
644
```{r DE_response_heat, warning=F, message=T}
645
646
# Define data to plot
647
n <- 50
648
g <- de$mast$response$gene[1:n]
649
jx1 <- which(data.all$proc$Response %in% 'Responder')[1:(n/2)]
650
jx2 <- which(data.all$proc$Response %in% 'Non-responder')[1:(n/2)]
651
x <- de$data[g, c(jx1, jx2)] %>% as.data.frame(.)
652
653
# Specify heatmap arguments
654
#   color palette
655
col.pal <- colorRampPalette(colors=c('white', 'red'), space='Lab')(100)
656
#   column annotations (response)
657
col.annot <- data.frame(Response=rep(c('Responder', 'Non-responder'), each=n/2), 
658
                        row.names=colnames(x))
659
  
660
# Heatmap
661
p <- pheatmap(x, cluster_rows=T, cluster_cols=F, scale='none', color=col.pal, 
662
              annotation_col=col.annot, cellheight=(500/n), cellwidth=(500/n), 
663
              show_rownames=T, show_colnames=F, gaps_col=n/2, 
664
              main='DEG heatmap (response)')
665
  
666
# Save plot
667
tiff('plots/DE_heatmap_response.tiff', units='in', width=11, height=8, res=300)
668
print({p}); dev.off()
669
670
```
671
672
PCA of DEGs identified for responder vs. non-responder comparison. 
673
674
```{r DE_response_PCA, warning=F, message=T}
675
676
# Define data to plot
677
g <- de$mast$response$gene[de$mast$response$p_val_adj < 0.05]
678
jx <- data.all$proc$Response %in% c('Responder', 'Non-responder')
679
x <- de$data[g, jx] %>% as.data.frame(.) %>% t(.)
680
m <- data.frame(Response=data.all$proc$Response[jx])
681
682
# Apply PCA
683
response_pca <- prcomp(x, scale.=T)
684
  
685
# Visualize PCA plot
686
autoplot(response_pca, data=m, colour='Response') + ggtitle('DEG PCA (response)')
687
  
688
# Save plot
689
ggsave('plots/DE_PCA_response.tiff', units='in', width=7, height=5, dpi=300)
690
691
```
692
693
### Tumor vs. benign DEGs
694
695
Clustergram (or heatmap) of DEGs identified for tumor vs. benign comparison. 
696
697
```{r DE_tumor_heat, warning=F, message=T}
698
699
# Define data to plot
700
n <- 50
701
g <- de$mast$tumor$gene[1:n]
702
jx1 <- which(data.all$proc$Source == names(obj)[3] | 
703
  data.all$proc$Malignant %in% 'Yes')[1:(n/2)]
704
jx2 <- which(data.all$proc$Source == names(obj)[4])[1:(n/2)]
705
x <- de$data[g, c(jx1, jx2)] %>% as.data.frame(.)
706
707
# Specify heatmap arguments
708
#   color palette
709
col.pal <- colorRampPalette(colors=c('white', 'red'), space='Lab')(100)
710
#   column annotations (response)
711
col.annot <- data.frame(Malignant=rep(c('Yes', 'No'), each=n/2), 
712
                        row.names=colnames(x))
713
  
714
# Heatmap
715
p <- pheatmap(x, cluster_rows=T, cluster_cols=F, scale='none', color=col.pal, 
716
              annotation_col=col.annot, cellheight=(500/n), cellwidth=(500/n), 
717
              show_rownames=T, show_colnames=F, gaps_col=n/2, 
718
              main='DEG heatmap (tumor)')
719
  
720
# Save plot
721
tiff('plots/DE_heatmap_tumor.tiff', units='in', width=11, height=8, res=300)
722
print({p}); dev.off()
723
724
```
725
726
PCA of DEGs identified for tumor vs. benign comparison. 
727
728
```{r DE_tumor_PCA, warning=F, message=T}
729
730
# Define data to plot
731
g <- de$mast$tumor$gene[1:100]
732
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes'
733
jx2 <- data.all$proc$Source == names(obj)[4]
734
x <- cbind(de$data[g, jx1], de$data[g, jx2]) %>% as.data.frame(.) %>% t(.)
735
m <- data.frame(Malignant=rep(c('Yes', 'No'), c(sum(jx1), sum(jx2))))
736
737
# Apply PCA
738
pca.tumor <- prcomp(x, scale.=T)
739
  
740
# Visualize PCA plot
741
autoplot(pca.tumor, data=m, colour='Malignant') + ggtitle('DEG PCA (tumor)')
742
  
743
# Save plot
744
ggsave('plots/DE_PCA_tumor.tiff', units='in', width=7, height=5, dpi=300)
745
746
```
747
748
749
# 5. Gene set enrichment analysis
750
751
Determine enriched pathways based on the DEG results (from MAST) using [enrichR](https://cran.r-project.org/web/packages/enrichR/vignettes/enrichR.html)..
752
Total runtime: ~2 minutes.  
753
754
```{r GSEA_analysis, warning=F, message=T}
755
756
# Start timer
757
t.start <- Sys.time()
758
759
# Instantiate GSEA variable
760
gsea <- list()
761
762
# Ensure comparison to human genes
763
setEnrichrSite('Enrichr')
764
765
# Select databases to inquire
766
dbs <- dlg_list(sort(listEnrichrDbs()$libraryName), multiple=T)$res
767
768
# Loop through all DE fields
769
f <- names(de$mast)
770
for (i in 1:length(f)) {
771
  x <- de$mast[[f[i]]]
772
  y <- enrichr(x$gene[x$p_val_adj < 0.05], dbs)
773
  z <- y$GO_Biological_Process_2021
774
  n <- sum(z$Adjusted.P.value < 0.05)
775
  gsea[[f[i]]] <- z[z$Adjusted.P.value < 0.05, ]
776
  cat(sprintf('No. of enriched pathways for %s: %d\n', f[i], n))
777
  plotEnrich(z, orderBy='Adjusted.P.value') + 
778
    ggtitle(sprintf('Case: %s (Total = %d)', f[i], n))
779
  ggsave(paste0('plots/GSEA_', f[i], '.pdf'), width=7, height=5, units='in')
780
}
781
782
# End timer + log time elapsed
783
t.end <- Sys.time()
784
t.end - t.start
785
786
```
787
788
Visualize intersecting pathways between comparisons.  
789
790
```{r GSEA_intersect, warning=F, message=T}
791
792
# Define plots data
793
df <- list('tumor'=gsea$tumor$Term, 
794
           'immune'=gsea$immune$Term, 
795
           'B'=gsea$B$Term, 
796
           'CD8'=gsea$CD8$Term, 
797
           'Macrophage'=gsea$Macrophage$Term, 
798
           'Memory'=gsea$Memory$Term, 
799
           'Activated T'=gsea$Activated$Term, 
800
           'response'=gsea$response$Term)
801
802
# Generate and save UpSet plot
803
pdf(file='plots/GSEA_intersect.pdf', width=11, height=8)
804
upset(fromList(df), sets=names(df), order.by='freq')
805
806
```
807
808
809
# 6. Cell classification using Random Forests
810
811
Apply Random Forests to construct a cell type classifier using [SingleCellNet](https://github.com/pcahan1/singleCellNet).
812
813
Define training + testing data. 
814
815
```{r ML_train_test, warning=F, message=T}
816
817
# Pull required data
818
s <- extractSeurat(data.all$proc, exp_slot_name='counts')
819
rm(data.all); gc()
820
st <- s$sampTab %>% mutate(ID=rownames(.))
821
exp <- s$expDat %>% as(., 'dgCMatrix')
822
823
# Define label field
824
label <- dlg_list(title='Choose label: ', sort(colnames(st)), multiple=F)$res
825
ID <- dlg_list(title='Choose cell ID: ', sort(colnames(st)), multiple=F)$res
826
827
# Re-order by label field
828
ix <- order(st[[label]])
829
st <- st[ix, ]
830
exp <- exp[, ix]
831
832
# Train/Test split
833
stList1 <- splitCommon(sampTab=st, ncells=100, dLevel=label)
834
stTrain <- stList1[[1]]
835
expTrain <- exp[, rownames(stTrain)]
836
stList2 <- splitCommon(sampTab=stList1[[2]], ncells=100, dLevel=label)
837
stTest <- stList2[[1]]
838
expTest <- exp[, rownames(stTest)]
839
840
```
841
842
Apply SCN (runtime: ~5 minutes). 
843
844
```{r SCN, warning=F, message=T}
845
846
# Start timer
847
t.start <- Sys.time()
848
849
# Train model
850
x <- scn_train(stTrain=stTrain, expTrain=expTrain, nTopGenes=10, nRand=70, 
851
               nTrees=1000, nTopGenePairs=25, dLevel=label)
852
853
# Test model
854
y <- scn_predict(cnProc=x[['cnProc']], expDat=expTest, nrand=50)
855
856
# End timer + log time elapsed
857
t.end <- Sys.time()
858
t.end - t.start
859
860
```
861
862
Model assessment and visualization of results.  
863
864
```{r ML_evaluation, warning=F, message=F}
865
866
# Assess model
867
z <- assess_comm(ct_scores=y, stTrain=stTrain, stQuery=stTest, dLevelSID=ID,
868
                 classTrain=label, classQuery=label)
869
plot_PRs(z) + ggtitle('Model performance (test set)')
870
if (save.plots) {
871
  ggsave('plots/ML_performance.pdf', width=5, height=5, units='in')
872
}
873
874
# Visualize results
875
#   classification results
876
nrand <- 50
877
sla <- as.vector(stTest[[label]])
878
names(sla) <- as.vector(rownames(stTest))
879
slaRand <- rep('rand', nrand)
880
names(slaRand) <- paste('rand_', 1:nrand, sep='')
881
sla <- append(sla, slaRand)
882
p <- sc_hmClass(classMat=y, grps=sla, max=300, isBig=T)
883
if (save.plots) {
884
  pdf('plots/ML_classification.pdf', width=7, height=5,
885
      title='Classification results (test set)')
886
  p
887
}
888
#   attribution plot
889
plot_attr(classRes=y, sampTab=stTest, nrand=nrand, sid=ID, dLevel=label) + 
890
  xlab('Predicted group') + ylab('True class ratio') + 
891
  ggtitle('Attribution plot (test set)')
892
if (save.plots) {
893
  ggsave('plots/ML_attribution.pdf', width=7, height=5, units='in')
894
}
895
896
```
897
898
899
# 7. Output saving (if prompted)
900
901
```{r save_outputs, warning=F, message=T}
902
903
# Save outputs
904
save.data <- dlgInput('Save all results? (T/F)', F)$res %>% as.logical(.)
905
if (save.data) { 
906
  # Biomarker results
907
  xl.list <- list('biomarkers'=data.all$bm)
908
  write.xlsx(xl.list, 'results/biomarkers.xlsx', row.names=F)
909
  # DEG results
910
  write.xlsx(de$ttest, 'results/DE_ttest.xlsx', row.names=F)
911
  write.xlsx(de$mast, 'results/DE_MAST.xlsx', row.names=T)
912
  # GSEA results
913
  write.xlsx(gsea, 'results/GSEA.xlsx', row.names=F)
914
}
915
916
# Save workspace
917
save.wksp <- dlgInput('Save R workspace? (T/F)', F)$res %>% as.logical(.)
918
if (save.wksp) { 
919
  # Save workspace (runtime: ~2 hours)
920
  save.image(file='scRNAseq_wksp.RData')
921
}
922
923
# Save project state (for reproducibility)
924
renv::snapshot()
925
926
```
927
928
# Visualize data
929
930
```{r visualize_data, warning=F, message=F, fig.width=12, fig.height=6}
931
932
# Create UMAP plots
933
p1 <- DimPlot(data.all$proc, reduction='umap', group.by='Source',
934
              label=T, repel=T) +
935
  ggtitle('Dataset Source')
936
937
p2 <- DimPlot(data.all$proc, reduction='harmony', group.by='Source',
938
              label=T, repel=T) +
939
  ggtitle('Dataset Source (Harmony)')
940
941
# Combine plots
942
p1 + p2
943
944
# Save plot
945
ggsave('plots/UMAP_integration.pdf', width=12, height=6)
946
947
```
948
949
# Visualize tumor vs melanocyte comparison
950
951
```{r visualize_tumor, warning=F, message=F, fig.width=12, fig.height=6}
952
953
# Create volcano plot
954
p1 <- EnhancedVolcano(tumor.vs.melanocyte,
955
                      lab=rownames(tumor.vs.melanocyte),
956
                      x='avg_log2FC', y='p_val_adj',
957
                      pCutoff=0.05, FCcutoff=1,
958
                      title='Tumor vs Melanocyte DEGs')
959
960
# Create heatmap
961
top.genes <- rownames(tumor.vs.melanocyte)[tumor.vs.melanocyte$p_val_adj < 0.05]
962
top.genes <- top.genes[order(abs(tumor.vs.melanocyte$avg_log2FC[
963
  tumor.vs.melanocyte$p_val_adj < 0.05]), decreasing=T)][1:50]
964
965
plot.data <- subset(data.all$proc,
966
                    subset=Source %in% c(names(obj)[3:4]))
967
plot.data <- ScaleData(plot.data, features=top.genes)
968
969
p2 <- DoHeatmap(plot.data, features=top.genes,
970
                group.by='Source') +
971
  ggtitle('Top 50 Tumor vs Melanocyte DEGs')
972
973
# Combine plots
974
p1 + p2
975
976
# Save plot
977
ggsave('plots/tumor_vs_melanocyte.pdf', width=12, height=6)
978
979
```
980
981
# Visualize immune vs non-malignant comparison
982
983
```{r visualize_immune, warning=F, message=F, fig.width=12, fig.height=6}
984
985
# Create volcano plot
986
p1 <- EnhancedVolcano(immune.vs.nonmalignant,
987
                      lab=rownames(immune.vs.nonmalignant),
988
                      x='avg_log2FC', y='p_val_adj',
989
                      pCutoff=0.05, FCcutoff=1,
990
                      title='Immune vs Non-malignant DEGs')
991
992
# Create heatmap
993
top.genes <- rownames(immune.vs.nonmalignant)[
994
  immune.vs.nonmalignant$p_val_adj < 0.05]
995
top.genes <- top.genes[order(abs(immune.vs.nonmalignant$avg_log2FC[
996
  immune.vs.nonmalignant$p_val_adj < 0.05]), decreasing=T)][1:50]
997
998
plot.data <- subset(data.all$proc,
999
                    subset=Source %in% c(names(obj)[c(1,2,5)]))
1000
plot.data <- ScaleData(plot.data, features=top.genes)
1001
1002
p2 <- DoHeatmap(plot.data, features=top.genes,
1003
                group.by='Source') +
1004
  ggtitle('Top 50 Immune vs Non-malignant DEGs')
1005
1006
# Combine plots
1007
p1 + p2
1008
1009
# Save plot
1010
ggsave('plots/immune_vs_nonmalignant.pdf', width=12, height=6)
1011
1012
```
1013
1014
# Visualize immune cell type comparisons
1015
1016
```{r visualize_immune_types, warning=F, message=F, fig.width=12, fig.height=6}
1017
1018
# Create plots for each cell type
1019
for (i in 1:length(immune.cells)) {
1020
  # Read results
1021
  immune.cell.type <- read.csv(paste0('results/',
1022
                                     immune.cells[i],
1023
                                     '_cell_DEGs.csv'))
1024
  
1025
  # Create volcano plot
1026
  p1 <- EnhancedVolcano(immune.cell.type,
1027
                        lab=rownames(immune.cell.type),
1028
                        x='avg_log2FC', y='p_val_adj',
1029
                        pCutoff=0.05, FCcutoff=1,
1030
                        title=paste0(immune.cells[i], ' Cell DEGs'))
1031
  
1032
  # Create heatmap
1033
  top.genes <- rownames(immune.cell.type)[immune.cell.type$p_val_adj < 0.05]
1034
  top.genes <- top.genes[order(abs(immune.cell.type$avg_log2FC[
1035
    immune.cell.type$p_val_adj < 0.05]), decreasing=T)][1:50]
1036
  
1037
  plot.data <- subset(data.all$proc,
1038
                      subset=Source %in% c(names(obj)[c(1:3,5)]))
1039
  plot.data <- ScaleData(plot.data, features=top.genes)
1040
  
1041
  p2 <- DoHeatmap(plot.data, features=top.genes,
1042
                  group.by='Source') +
1043
    ggtitle(paste0('Top 50 ', immune.cells[i], ' Cell DEGs'))
1044
  
1045
  # Combine plots
1046
  p <- p1 + p2
1047
  
1048
  # Save plot
1049
  ggsave(paste0('plots/', immune.cells[i], '_cell_DEGs.pdf'),
1050
         plot=p, width=12, height=6)
1051
}
1052
1053
```
1054
1055
# Visualize responder vs non-responder comparison
1056
1057
```{r visualize_response, warning=F, message=F, fig.width=12, fig.height=6}
1058
1059
# Create volcano plot
1060
p1 <- EnhancedVolcano(response.vs.nonresponse,
1061
                      lab=rownames(response.vs.nonresponse),
1062
                      x='avg_log2FC', y='p_val_adj',
1063
                      pCutoff=0.05, FCcutoff=1,
1064
                      title='Response vs Non-response DEGs')
1065
1066
# Create heatmap
1067
top.genes <- rownames(response.vs.nonresponse)[
1068
  response.vs.nonresponse$p_val_adj < 0.05]
1069
top.genes <- top.genes[order(abs(response.vs.nonresponse$avg_log2FC[
1070
  response.vs.nonresponse$p_val_adj < 0.05]), decreasing=T)][1:50]
1071
1072
plot.data <- subset(data.all$proc,
1073
                    subset=Source %in% names(obj)[1])
1074
plot.data <- ScaleData(plot.data, features=top.genes)
1075
1076
p2 <- DoHeatmap(plot.data, features=top.genes,
1077
                group.by='Response') +
1078
  ggtitle('Top 50 Response vs Non-response DEGs')
1079
1080
# Combine plots
1081
p1 + p2
1082
1083
# Save plot
1084
ggsave('plots/response_vs_nonresponse.pdf', width=12, height=6)
1085
1086
```