a b/notebooks/scRNAseq_analysis.Rmd
1
---
2
title: "Analysis of scRNA-seq data"
3
date: "Last modified: `r format(Sys.time(), '%B %d, %Y')`"
4
tags: [scRNA-seq, seurat, melanoma, immunotherapy, PBMCs] 
5
output:
6
  html_document:
7
    theme: flatly
8
    highlight: zenburn
9
    toc: true
10
    number_sections: false
11
    toc_depth: 2
12
    toc_float:
13
      collapsed: false
14
      smooth_scroll: true
15
    code_folding: hide
16
    self_contained: yes
17
---
18
19
# Project Description
20
21
This project aims to identify therapeutic targets for melanoma patients by following a 2-phase approach:
22
23
-   Phase I: Analyze scRNA-seq data to identify DE genes and infer enriched pathways\
24
-   Phase II: Compare responders vs. non-responders to identify therapeutic targets
25
26
## Dataset Description
27
28
This analysis integrates five complementary single-cell RNA sequencing datasets:
29
30
1. Dataset 1 (Treatment Response Dataset)
31
   - Contains treatment response data
32
   - Includes responder/non-responder classifications
33
   - Focus on immunotherapy outcomes
34
35
2. Dataset 2 (Tumor Cell Dataset)
36
   - Characterizes tumor cell heterogeneity
37
   - Includes cell type annotations
38
   - Focus on tumor cell populations
39
40
3. Dataset 3 (Tumor Microenvironment Dataset)
41
   - Maps the tumor microenvironment
42
   - Contains malignant/non-malignant classifications
43
   - Includes detailed immune cell typing
44
45
4. Dataset 4 (Control Melanocyte Dataset)
46
   - Normal melanocyte control data
47
   - Reference for non-malignant state
48
   - Baseline expression profiles
49
50
5. Dataset 5 (Control Immune Dataset)
51
   - Normal immune cell control data
52
   - Reference for immune cell states
53
   - Baseline immune profiles
54
55
## Strategy
56
57
Use `seurat` and `harmony` to complete the following tasks:
58
59
1.  Load/process/combine data\
60
2.  Clustering and biomarker determination\
61
3.  Comparisons: disease vs. control
62
63
# Initialize environment
64
65
Install required packages.
66
67
```{r install_packages, results="hide", message=F, warning=F, error=F}
68
69
# Define packages to install
70
pkg.list = c('svDialogs', 'vroom', 'dplyr', 'DT', 'Seurat', 'harmony', 
71
             'patchwork', 'ggplot2', 'ggrepel', 'openxlsx', 'progress')
72
73
# Define packages not already installed
74
pkg.install <- pkg.list[!(pkg.list %in% installed.packages()[, 'Package'])]
75
76
# Install uninstalled packages
77
if (length(pkg.install) > 0) {
78
  install.packages(pkg.install)
79
}
80
81
```
82
83
Load installed packages.
84
85
```{r load_packages, results="hide", message=F, warning=F, error=F}
86
87
# Load packages
88
library(svDialogs)    # for prompting user-input
89
library(vroom)        # for quickly reading data
90
library(dplyr)        # for data processing
91
library(DT)           # to display datatables
92
library(Seurat)       # for scRNA-seq analysis
93
library(harmony)      # to integration scRNA-seq data
94
library(patchwork)    # for combining plots
95
library(ggplot2)      # for data visualization
96
library(ggrepel)      # to use geom_pont_repel()
97
library(openxlsx)     # to write data to excel
98
library(progress)     # to display progress bar
99
100
```
101
102
Define custom functions.
103
104
```{r custom_functions, results="hide", message=F, warning=F, error=F}
105
106
# qc_filter(): filter data based on QC for scRNA-seq data
107
qc_filter <- function(obj, feat.t=c(200, 2500), pct.mt.t=5, var.method='vst', 
108
                      feat.n=2000, qc.plot=T, top.n=10, title='') {
109
  
110
  ############################ FUNCTION DESCRIPTION ############################
111
  # feat.t = lower and upper limits on unique gene counts
112
  # pct.mt.t = threshold of level in mitochondrial contamination
113
  # var.method = method for selecting highly variable genes
114
  # feat.n = number of variable genes to select
115
  # qc.plot = boolean whether to generate plots to decide downstream analyses
116
  # title = string to use for plot title
117
  ############################## BEGIN FUNCTION ################################
118
  
119
  # determine percentage of mitochondrial contamination
120
  obj[['pct.mt']] <- PercentageFeatureSet(obj, pattern='^MT-')
121
  # filter + nomalize + scale data
122
  obj <- obj %>% 
123
    subset(., subset=(nFeature_RNA > feat.t[1]) & (nFeature_RNA < feat.t[2]) & 
124
             (pct.mt < pct.mt.t)) %>% NormalizeData(.) %>% 
125
    FindVariableFeatures(., selection.method=var.method) %>% 
126
    ScaleData(.) %>% RunPCA(., features=VariableFeatures(object=.))
127
  # generate follow-up QC plots (if prompted)
128
  if (qc.plot) { 
129
    p1 <- VariableFeaturePlot(obj) %>% 
130
      LabelPoints(plot=., points=head(VariableFeatures(obj), top.n), repel=T)
131
    p2 <- ElbowPlot(obj)
132
    plot(p1 + p2 + plot_annotation(title=title))
133
  }
134
  # return output
135
  return(obj)
136
}
137
138
# de_analyze(): conduct differential expression (DE) analysis
139
de_analyze <- function(m1, m2, alt='two.sided', paired=F, var.equal=F, 
140
                       adj.method='bonferroni', t=0.05, de.plot=F, title='') { 
141
  
142
  ############################ FUNCTION DESCRIPTION ############################
143
  # m1, m2 = expression matrices to compare
144
  # alt, paired, var.equal = arguments for t.test() function
145
  # adj.method = method for calculating adjusted p-value
146
  # t = threshold for significance 
147
  # de.plot = boolean whether to generate a volcano plot
148
  ############################## BEGIN FUNCTION ################################
149
  
150
  # make sure two matrices have same number of rows
151
  if (nrow(m1) != nrow(m2)) { 
152
    stop('Row length does not match between the provided matrices.')
153
  }
154
  # make sure gene names align between matrices
155
  if (!all(rownames(m1) == rownames(m2))) { 
156
    stop('Gene names do not align between provided matrices.')
157
    }
158
  # instantiate output variable
159
  results <- data.frame(gene=rownames(m1), 
160
                        t.stat=vector(mode='numeric', length=nrow(m1)), 
161
                        p.val=vector(mode='numeric', length=nrow(m1)))
162
  # conduct unpaired t-test with unequal variance for each gene
163
  pb <- progress_bar$new(
164
    format='  analyzing [:bar] :percent time left: :eta', total=nrow(m1))
165
  for (i in 1:nrow(m1)) { 
166
    pb$tick()
167
    x <- m1[i, ]; y <- m2[i, ]
168
    r <- t.test(x, y, alternative=alt, paired=paired, var.equal=var.equal)
169
    results$t.stat[i] <- r$statistic
170
    results$p.val[i] <- r$p.value
171
  }
172
  # determine adjusted p-values
173
  results$q.val <- p.adjust(results$p.val, method=adj.method)
174
  # add additional fields
175
  results <- results %>%
176
    mutate(Significance=case_when(q.val < t & t.stat > 0 ~ 'Up',
177
                                  q.val < t & t.stat < 0 ~ 'Down',
178
                                  T ~ 'NS')) %>% arrange(q.val)
179
  # generate volcano plot (if prompted)
180
  if (de.plot) { 
181
    p <- results %>% arrange(t.stat) %>% ggplot(data=., 
182
           aes(x=t.stat, y=-log10(q.val), col=Significance, label=gene)) +
183
      geom_point() + geom_text_repel() + theme_minimal() + 
184
      scale_color_manual(values=c('blue', 'black', 'red')) + ggtitle(title)
185
    plot(p)
186
  }
187
  # return output
188
  return(results)
189
}
190
191
```
192
193
Additional settings.
194
195
```{r settings}
196
197
# Adjust system settings
198
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
199
200
# Save plots? (default: F)
201
save.plots <- dlgInput('Save all outputs? (T/F)', F)$res
202
203
```
204
205
# Tasks
206
207
## 1. Load/process/combine data
208
209
**Description**: load all scRNA-seq files.
210
211
```{r load_data, warning=F, message=F}
212
setwd("..")
213
# Load workspace (if it exists)
214
f <- list.files()
215
if (any(endsWith(f, '.RData'))) {
216
  load(f[endsWith(f, '.RData')][1])
217
}
218
```
219
220
```{r load_data, warning=F, message=F}
221
##################### ONLY RUN IF THERE'S NO WORKSPACE #########################
222
# # Load disease datasets
223
# obj <- list(); meta <- list()
224
# #   Dataset 1: Treatment response dataset
225
# meta$dataset1 <- read.delim(file='data/dataset1/metadata.txt')
226
# x <- vroom('data/dataset1/counts.txt', col_names=F)
227
# x <- x[, 1:nrow(meta$dataset1)] %>% as.matrix(.)
228
# y <- read.delim('data/dataset1/genes.txt', header=F)
229
# rownames(x) <- make.unique(y$V1)
230
# colnames(x) <- meta$dataset1$ID
231
# obj$dataset1 <- CreateSeuratObject(x, project='dataset1',
232
#                                     min.cells=3, min.features=200)
233
# obj$dataset1$Treatment <- as.factor(meta$dataset1$Treatment)
234
# obj$dataset1$Response <- as.factor(meta$dataset1$Response)
235
# rm(x, y); gc()
236
# 
237
# #   Dataset 2: Tumor cell dataset
238
# meta$dataset2 <- read.csv('data/dataset2/metadata.csv')
239
# x <- vroom('data/dataset2/expression_matrix.csv')
240
# y <- as.matrix(x[, -1]); rownames(y) <- x$...1
241
# obj$dataset2 <- CreateSeuratObject(y, project='dataset2',
242
#                                     min.cells=3, min.features=200)
243
# obj$dataset2$type <- as.factor(meta$dataset2$cell.types)
244
# 
245
# #   Dataset 3: Tumor microenvironment dataset
246
# x <- vroom('data/dataset3/expression_matrix.txt')
247
# y <- as.matrix(x[-c(1:3), -1]); rownames(y) <- x$Cell[-c(1:3)]
248
# obj$dataset3 <- CreateSeuratObject(y, project='dataset3',
249
#                                    min.cells=3, min.features=200)
250
# meta$dataset3 <- data.frame(Tumor=t(x[1, -1]),
251
#                             Malignant=case_when(x[2, -1] == 1 ~ 'No',
252
#                                                 x[2, -1] == 2 ~ 'Yes',
253
#                                                 x[2, -1] == 0 ~ 'Unresolved'),
254
#                             NMType=case_when(x[3, -1] == 1 ~ 'T',
255
#                                              x[3, -1] == 2 ~ 'B',
256
#                                              x[3, -1] == 3 ~ 'Macro',
257
#                                              x[3, -1] == 4 ~ 'Endo',
258
#                                              x[3, -1] == 5 ~ 'CAF',
259
#                                              x[3, -1] == 6 ~ 'NK',
260
#                                              x[3, -1] == 0 ~ 'NA'))
261
# obj$dataset3$Malignant <- meta$dataset3$Malignant
262
# obj$dataset3$NMType <- meta$dataset3$NMType
263
# 
264
# # Load control datasets
265
# #   Dataset 4: Control melanocyte dataset
266
# x <- vroom('data/dataset4/expression_matrix.csv')
267
# y <- as.matrix(x[, -1]); rownames(y) <- x$...1
268
# obj$dataset4 <- CreateSeuratObject(y, project='dataset4',
269
#                                     min.cells=3, min.features=200)
270
# 
271
# #   Dataset 5: Control immune dataset
272
# meta$dataset5 <- read.table('data/dataset5/metadata.tsv',
273
#                              sep='\t', header=T)
274
# x <- vroom('data/dataset5/matrix.tsv')
275
# y <- as.matrix(x[, -1]); rownames(y) <- x$Gene.Name
276
# obj$dataset5 <- CreateSeuratObject(y, project='dataset5',
277
#                                     min.cells=3, min.features=200)
278
# obj$dataset5$sample <- as.factor(meta$dataset5$sample)
279
# rm(f, x, y, meta); gc()
280
281
```
282
283
*Description*: Combine all datasets together into single Seurat object. Also apply `harmony` to remove clustering bias based on dataset source.
284
285
```{r combine_data, warning=F, message=F}
286
file_path <- "/Users/scampit/Projects/torchstack/cancer-biomarker-discovery"
287
setwd(file_path)
288
289
# Combine all datasets
290
if (!exists('data.all')) { 
291
  data.all <- list()
292
}
293
if (!('raw' %in% names(data.all))) { 
294
  data.all$raw <- merge(obj$dataset1, obj$dataset2) %>%
295
    merge(., obj$dataset3) %>% merge(., obj$dataset4) %>% merge(., obj$dataset5)
296
}
297
298
# Add source information
299
if (!('Source' %in% names(data.all$raw@meta.data))) { 
300
  data.all$raw$Source <- c(rep(names(obj)[1], ncol(obj$dataset1)),
301
                           rep(names(obj)[2], ncol(obj$dataset2)),
302
                           rep(names(obj)[3], ncol(obj$dataset3)),
303
                           rep(names(obj)[4], ncol(obj$dataset4)),
304
                           rep(names(obj)[5], ncol(obj$dataset5)))
305
}
306
307
# Apply QC
308
if (!('proc' %in% names(data.all))) { 
309
  data.all$proc <- qc_filter(data.all$raw)
310
}
311
312
# Visualize integration results
313
p1 <- DimPlot(object=data.all$proc, reduction='pca', pt.size=0.1, 
314
              group.by='Source')
315
p2 <- VlnPlot(object=data.all$proc, features='PC_1', pt.size=0.1, 
316
              group.by='Source')
317
p1 + p2
318
if (save.plots) { 
319
  ggsave('plots/QC_no_harmony.pdf', width=10, height=5, units='in')
320
}
321
322
# Apply harmony (to remove clustering based on dataset source)
323
if (!('harmony' %in% names(data.all$proc@reductions))) { 
324
  data.all$proc <- data.all$proc %>% RunHarmony('Source', plot_convergence=T)
325
}
326
327
# Visualize integration results (after harmony)
328
p1 <- DimPlot(object=data.all$proc, reduction='harmony', pt.size=0.1, 
329
              group.by='Source')
330
p2 <- VlnPlot(object=data.all$proc, features='harmony_1', pt.size=0.1, 
331
              group.by='Source')
332
p1 + p2
333
if (save.plots) { 
334
  ggsave('plots/QC_w_harmony.pdf', width=10, height=5, units='in')
335
}
336
337
```
338
339
## 2. Clustering and biomarker determination
340
341
**Description**: cluster cells based on UMAP and determine biomarkers based on cluster assignment.
342
343
### Clustering
344
345
```{r clustering, warning=F, message=T}
346
347
# Visualize UMAP plots (runtime: ~5 minutes)
348
if (!('umap' %in% names(data.all$proc@reductions))) { 
349
  data.all$proc <- data.all$proc %>% 
350
      RunUMAP(reduction='harmony', dims=1:20) %>% 
351
      FindNeighbors(reduction='harmony', dims=1:20) %>% 
352
      FindClusters(resolution=0.5) %>% identity()
353
}
354
355
#   by dataset source
356
p <- DimPlot(data.all$proc, reduction='umap', group.by='Source', pt.size=0.1, 
357
        split.by='Source') + ggtitle('UMAP split by dataset source'); plot(p)
358
#   by cluster (unlabeled)
359
p <- DimPlot(data.all$proc, reduction='umap', label=T, pt.size=0.1) + 
360
  ggtitle('UMAP of combined scRNA-seq data (unlabeled)'); plot(p)
361
if (save.plots) { 
362
  ggsave('plots/UMAP_unlabeled.pdf', width=10, height=5, units='in')
363
}
364
365
#   by cluster (labeled)
366
lab <- c('Tumor cell', 'Technical error 1', 'Cytotoxic CD8 T cell 1', 'B cell', 
367
         'Melanocytes 1', 'Macrophage', 'Cytotoxic CD8 T cell 2', 'Technical error 2', 
368
         'Memory T cell', 'Melanocytes 2', 'Dysfunctional CD8 T cell', 
369
         'Cytotoxic CD8 T cell 3', '? 1', 'Melanocytes 3', 'Activated cell', 
370
         'Exhausted T cell', 'Cytotoxic T cell', '? 2', '? 3', 'Melanocytes 4', '? 4')
371
names(lab) <- levels(data.all$proc)
372
plot.data <- data.all$proc %>% RenameIdents(., lab)
373
p <- DimPlot(plot.data, reduction='umap', label=T, pt.size=0.1) + 
374
  ggtitle('UMAP of combined scRNA-seq data (labeled)'); plot(p)
375
if (save.plots) { 
376
  ggsave('plots/UMAP_labeled.pdf', width=10, height=5, units='in')
377
}
378
379
```
380
381
### Biomarkers
382
383
```{r biomarkers, warning=F, message=T}
384
385
# Find all biomarkers based on clustering (runtime: ~30 minutes)
386
if (!('bm' %in% names(data.all))) { 
387
  data.all$bm <- FindAllMarkers(data.all$proc, min.pct=0.25, logfc.threshold=0.25)
388
}
389
390
# View table of top 3 biomarkers for each cluster
391
d <- data.all$bm %>% group_by(cluster) %>% slice_max(n=3, order_by=avg_log2FC)
392
datatable(d)
393
394
# Visualize ridge plots based on biomarkers of interest
395
ridge.feat <- dlg_list(sort(unique(d$gene)), multiple=T)$res
396
p <- RidgePlot(data.all$proc, features=ridge.feat, 
397
          ncol=ceiling(length(ridge.feat) / 2)); plot(p)
398
if (save.plots) { 
399
  ggsave('plots/biomarker_ridge_plots.pdf', width=10, height=10, units='in')
400
}
401
402
```
403
404
## 3. Comparisons: disease vs. control
405
406
**Description**: determine differentially expressed genes (DEGs) between disease vs. control groups.
407
408
### Case: tumor
409
410
```{r DE_tumor, warning=F, message=T}
411
412
# Instantiate DE variable
413
if (!exists('de')) { 
414
  de <- list()
415
}
416
if (!('data' %in% names(de))) { 
417
  de$data <- GetAssayData(object=data.all$proc, slot='data')
418
}
419
420
# Define datasets to compare
421
ix <- intersect(rownames(obj$dataset3), rownames(obj$dataset2)) %>%
422
  intersect(., rownames(obj$dataset4))
423
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes'
424
jx2 <- data.all$proc$Source == names(obj)[4]
425
x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
426
427
# DE analysis (runtime: ~15 minutes)
428
if (!('tumor' %in% names(de))) { 
429
  de$tumor <- de_analyze(x, y) %>% na.omit
430
}
431
432
# Visualize dot plot
433
dot.feat <- dlg_list(de$tumor$gene[de$tumor$q.val < 0.05], multiple=T)$res
434
p.data <- data.all$proc 
435
Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA'))
436
p <- p.data %>% 
437
  DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + 
438
  ggtitle('Case: tumor')
439
plot(p)
440
if (save.plots) { 
441
  ggsave('plots/DE_tumor.pdf', width=10, height=5, units='in')
442
}
443
444
```
445
446
### Case: immune cells (bulk)
447
448
```{r DE_immune_bulk,warning=F, message=T}
449
450
# Define datasets to compare
451
ix <- intersect(rownames(obj$dataset1), rownames(obj$dataset2)) %>% 
452
  intersect(., rownames(obj$dataset5))
453
jx1 <- data.all$proc$Source == names(obj)[1] | data.all$proc$Malignant %in% 'No'
454
jx2 <- data.all$proc$Source == names(obj)[5]
455
x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
456
457
# DE analysis (runtime: ~20 minutes)
458
if (!('immune' %in% names(de))) { 
459
  de$immune <- de_analyze(x, y) %>% na.omit()
460
}
461
462
# Visualize dot plot
463
dot.feat <- dlg_list(de$immune$gene[de$immune$q.val < 0.05], multiple=T)$res
464
p.data <- data.all$proc 
465
Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA'))
466
p <- p.data %>% 
467
  DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + 
468
  ggtitle('Case: immune cells')
469
plot(p)
470
if (save.plots) { 
471
  ggsave('plots/DE_immune_bulk.pdf', width=10, height=5, units='in')
472
}
473
474
```
475
476
### Case: immune cells (cluster-based)
477
478
```{r DE_immune_cluster, warning=F, message=T}
479
480
# Define datasets to compare (runtime: ~5 minutes)
481
immune.cells <- c('B', 'CD8', 'Macrophage', 'Memory', 'Activated')
482
for (i in 1:length(immune.cells)) { 
483
  # Define row + column indices
484
  ix <- intersect(rownames(obj$dataset1), rownames(obj$dataset2)) %>% 
485
    intersect(., rownames(obj$dataset5))
486
  c <- grepl(immune.cells[i], Idents(plot.data))
487
  jx1 <- (plot.data$Source == names(obj)[1] | 
488
            data.all$proc$Malignant %in% 'No') & c
489
  jx2 <- data.all$proc$Source == names(obj)[5] & c
490
  x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
491
  # DE analysis
492
  if (!(immune.cells[i] %in% names(de))) { 
493
    de[[immune.cells[i]]] <- de_analyze(x, y) %>% na.omit()
494
  }
495
  # Visualize dot plot
496
  x <- de[[immune.cells[[i]]]]
497
  dot.feat <- x$gene[x$q.val < 0.05][1:5]
498
  p.data <- data.all$proc 
499
  Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA'))
500
  p <- p.data %>% 
501
    DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + 
502
    ggtitle(paste0('Case: ', immune.cells[i]))
503
  plot(p)
504
  if (save.plots) { 
505
    ggsave(paste0('plots/DE_', immune.cells[i], '.pdf'), 
506
           width=10, height=5, units='in')
507
  }
508
}
509
510
```
511
512
# Case: responder vs. non-responder
513
ix <- rownames(obj$dataset1)
514
jx1 <- data.all$proc$Response %in% 'Responder'
515
jx2 <- data.all$proc$Response %in% 'Non-responder'
516
517
# Save outputs and working space (if prompted)
518
519
```
520
521
```{r glycolysis_visualization, warning=F, message=T}
522
# Define key glycolytic genes
523
glycolysis_genes <- c(
524
  "HK1", "HK2", "GPI", "PFKL", "PFKM", "PFKP",  # Glucose uptake and early glycolysis
525
  "ALDOA", "ALDOB", "ALDOC",                     # Aldolase isoforms
526
  "TPI1",                                        # Triose phosphate isomerase
527
  "GAPDH",                                       # Glyceraldehyde phosphate dehydrogenase
528
  "PGK1",                                        # Phosphoglycerate kinase
529
  "PGAM1",                                       # Phosphoglycerate mutase
530
  "ENO1", "ENO2",                               # Enolase isoforms
531
  "PKM", "PKLR",                                # Pyruvate kinase isoforms
532
  "LDHA", "LDHB",                               # Lactate dehydrogenase
533
  "SLC2A1", "SLC2A3"                            # Glucose transporters (GLUT1, GLUT3)
534
)
535
536
# Filter for genes present in the dataset
537
glycolysis_genes_present <- intersect(glycolysis_genes, rownames(de$data))
538
539
# Create anonymous gene IDs
540
anon_genes <- paste0("gene_", seq_along(glycolysis_genes_present))
541
names(anon_genes) <- glycolysis_genes_present
542
543
# Create mapping dataframe for reference
544
gene_mapping <- data.frame(
545
  Original_Gene = glycolysis_genes_present,
546
  Anonymous_ID = anon_genes[glycolysis_genes_present]
547
)
548
549
# Write mapping to file if saving is enabled
550
if (save.plots) {
551
  write.csv(gene_mapping, 'results/glycolysis_gene_mapping.csv', row.names = FALSE)
552
}
553
554
# Create dot plot for glycolytic genes
555
p.data <- data.all$proc
556
Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA'))
557
558
# Generate dot plot with anonymous IDs and custom colors
559
p_glycolysis <- DotPlot(p.data, 
560
                        features = glycolysis_genes_present,
561
                        idents = c('Disease', 'Control')) +
562
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
563
  xlab('Genes') +
564
  ylab('Condition') +
565
  scale_color_gradient2(low = "navy", mid = "gray90", high = "red", 
566
                       midpoint = 0, name = "Average\nExpression") +
567
  scale_x_discrete(labels = anon_genes[glycolysis_genes_present])
568
569
# Display plot
570
print(p_glycolysis)
571
572
# Save plot if requested
573
if (save.plots) {
574
  ggsave('plots/DE_tumor_glycolysis.pdf', 
575
         plot = p_glycolysis, 
576
         width = 12, 
577
         height = 6, 
578
         units = 'in')
579
}
580
581
# Calculate statistics for glycolytic genes
582
glycolysis_stats <- de$tumor[de$tumor$gene %in% glycolysis_genes_present, ]
583
glycolysis_stats <- glycolysis_stats[order(glycolysis_stats$q.val), ]
584
585
# Add anonymous IDs to statistics
586
glycolysis_stats$anonymous_id <- anon_genes[glycolysis_stats$gene]
587
588
# Display statistics with anonymous IDs
589
print("Differential Expression Statistics for De-identified Glycolytic Genes:")
590
glycolysis_stats$gene <- glycolysis_stats$anonymous_id
591
print(glycolysis_stats)
592
593
# Count significantly different genes
594
sig_glyco_genes <- sum(glycolysis_stats$q.val < 0.05)
595
print(sprintf("Number of significantly different glycolytic genes (q < 0.05): %d", sig_glyco_genes))
596
```
597
598
```{r glycolysis_boxplots, warning=F, message=T}
599
# Get expression data for glycolytic genes
600
expr_data <- GetAssayData(p.data, slot = "data")[glycolysis_genes_present,]
601
602
# Filter out genes with all zeros
603
nonzero_genes <- rownames(expr_data)[rowSums(expr_data > 0) > 0]
604
expr_data <- expr_data[nonzero_genes,]
605
606
# Update anonymous gene IDs for non-zero genes
607
anon_genes <- anon_genes[nonzero_genes]
608
609
target_genes <- c("g10", "g21", "g03")
610
selected_genes <- names(anon_genes)[anon_genes %in% target_genes]
611
612
# Create a data frame for plotting and store statistical results
613
plot_data <- data.frame()
614
stat_results <- data.frame(
615
  Gene = character(),
616
  P_value = numeric(),
617
  T_statistic = numeric(),
618
  Mean_diff = numeric(),
619
  stringsAsFactors = FALSE
620
)
621
622
for (gene in selected_genes) {
623
  # Get expression values for current gene
624
  expr_values <- expr_data[gene,]
625
  
626
  # Split values by condition before centering
627
  disease_vals <- expr_values[colnames(expr_data) %in% colnames(de$data)[jx1]]
628
  control_vals <- expr_values[colnames(expr_data) %in% colnames(de$data)[jx2]]
629
  
630
  # Perform t-test on raw values
631
  t_test_result <- t.test(disease_vals, control_vals)
632
  
633
  # Center the expression values by subtracting the overall mean
634
  overall_mean <- mean(expr_values)
635
  disease_vals_centered <- disease_vals - overall_mean
636
  control_vals_centered <- control_vals - overall_mean
637
  
638
  # Store statistical results
639
  stat_results <- rbind(stat_results, data.frame(
640
    Gene = anon_genes[gene],
641
    P_value = t_test_result$p.value,
642
    T_statistic = t_test_result$statistic,
643
    Mean_diff = mean(disease_vals) - mean(control_vals),
644
    Mean_centered_diff = mean(disease_vals_centered) - mean(control_vals_centered),
645
    stringsAsFactors = FALSE
646
  ))
647
  
648
  # Create temporary data frames for each condition with centered values
649
  temp_df <- data.frame(
650
    Expression = c(disease_vals_centered, control_vals_centered),
651
    Gene = anon_genes[gene],
652
    Condition = c(rep("Disease", length(disease_vals_centered)), 
653
                 rep("Control", length(control_vals_centered)))
654
  )
655
  
656
  # Append to main data frame
657
  plot_data <- rbind(plot_data, temp_df)
658
}
659
660
# Add FDR correction
661
stat_results$FDR <- p.adjust(stat_results$P_value, method = "BH")
662
663
# Sort results by p-value
664
stat_results <- stat_results[order(stat_results$P_value), ]
665
666
# Create boxplot with swarm plot
667
p_boxplot <- ggplot(plot_data, aes(x = Gene, y = Expression, fill = Condition)) +
668
  # Add swarm plot layer first (so it appears behind)
669
  geom_jitter(aes(color = Condition), position = position_jitterdodge(jitter.width = 0.2), 
670
              size = 0.3, alpha = 0.3) +
671
  # Add boxplot layer second (so it appears in front)
672
  geom_boxplot(outlier.shape = NA, alpha = 0.4, width = 0.7) +
673
  # Colors for boxplot fill and point colors
674
  scale_fill_manual(values = c("Disease" = "#E41A1C", "Control" = "#377EB8")) +
675
  scale_color_manual(values = c("Disease" = "#E41A1C", "Control" = "#377EB8")) +
676
  # Set y-axis limits
677
  ylim(-2, max(plot_data$Expression)) +
678
  # Theme customization
679
  theme_minimal() +
680
  theme(
681
    axis.text.x = element_text(angle = 45, hjust = 1),
682
    panel.grid.major.y = element_blank(),  # Remove y-axis major grid lines
683
    panel.grid.minor.y = element_blank(),  # Remove y-axis minor grid lines
684
    panel.grid.major.x = element_line(color = "gray90"),  # Keep x-axis major grid lines
685
    panel.grid.minor.x = element_blank(),  # Remove x-axis minor grid lines
686
    legend.position = c(0.95, 0.95),  # Position legend inside top-right
687
    legend.justification = c(1, 1),    # Anchor point for legend
688
    legend.box.just = "right",
689
    legend.margin = margin(6, 6, 6, 6),
690
    legend.background = element_rect(fill = "white", color = NA),
691
    legend.box.background = element_rect(color = "gray90")
692
  ) +
693
  # Labels
694
  xlab("Genes") +
695
  ylab("Centered Expression") +
696
  ggtitle("Gene Expression Relative to Mean") +
697
  # Add horizontal line at y=0
698
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5)
699
700
# Display plot
701
print(p_boxplot)
702
703
# Save plot if requested
704
if (save.plots) {
705
  ggsave('plots/DE_tumor_glycolysis_boxplots.pdf', 
706
         plot = p_boxplot, 
707
         width = 12, 
708
         height = 6, 
709
         units = 'in')
710
}
711
712
# Calculate summary statistics for centered data
713
summary_stats <- plot_data %>%
714
  group_by(Gene, Condition) %>%
715
  summarise(
716
    Mean = mean(Expression),
717
    Median = median(Expression),
718
    SD = sd(Expression),
719
    Q1 = quantile(Expression, 0.25),
720
    Q3 = quantile(Expression, 0.75),
721
    n = n(),
722
    .groups = 'drop'
723
  )
724
725
# Display summary statistics
726
print("Summary Statistics for Centered Expression by Condition:")
727
print(summary_stats)
728
729
# Display statistical test results
730
print("\nStatistical Test Results (Disease vs Control):")
731
print(stat_results)
732
733
# Print number of genes removed due to zero expression
734
n_removed <- length(glycolysis_genes_present) - length(nonzero_genes)
735
if (n_removed > 0) {
736
  print(sprintf("\nRemoved %d genes with zero expression", n_removed))
737
  print("Removed genes:")
738
  print(setdiff(glycolysis_genes_present, nonzero_genes))
739
}
740
741
# Print number of significantly different genes
742
sig_genes <- sum(stat_results$FDR < 0.05)
743
if (sig_genes > 0) {
744
  print(sprintf("\nNumber of significantly different genes (FDR < 0.05): %d", sig_genes))
745
  print("Significant genes:")
746
  print(stat_results[stat_results$FDR < 0.05, ])
747
}
748
```