|
a |
|
b/notebooks/scRNAseq_analysis.Rmd |
|
|
1 |
--- |
|
|
2 |
title: "Analysis of scRNA-seq data" |
|
|
3 |
date: "Last modified: `r format(Sys.time(), '%B %d, %Y')`" |
|
|
4 |
tags: [scRNA-seq, seurat, melanoma, immunotherapy, PBMCs] |
|
|
5 |
output: |
|
|
6 |
html_document: |
|
|
7 |
theme: flatly |
|
|
8 |
highlight: zenburn |
|
|
9 |
toc: true |
|
|
10 |
number_sections: false |
|
|
11 |
toc_depth: 2 |
|
|
12 |
toc_float: |
|
|
13 |
collapsed: false |
|
|
14 |
smooth_scroll: true |
|
|
15 |
code_folding: hide |
|
|
16 |
self_contained: yes |
|
|
17 |
--- |
|
|
18 |
|
|
|
19 |
# Project Description |
|
|
20 |
|
|
|
21 |
This project aims to identify therapeutic targets for melanoma patients by following a 2-phase approach: |
|
|
22 |
|
|
|
23 |
- Phase I: Analyze scRNA-seq data to identify DE genes and infer enriched pathways\ |
|
|
24 |
- Phase II: Compare responders vs. non-responders to identify therapeutic targets |
|
|
25 |
|
|
|
26 |
## Dataset Description |
|
|
27 |
|
|
|
28 |
This analysis integrates five complementary single-cell RNA sequencing datasets: |
|
|
29 |
|
|
|
30 |
1. Dataset 1 (Treatment Response Dataset) |
|
|
31 |
- Contains treatment response data |
|
|
32 |
- Includes responder/non-responder classifications |
|
|
33 |
- Focus on immunotherapy outcomes |
|
|
34 |
|
|
|
35 |
2. Dataset 2 (Tumor Cell Dataset) |
|
|
36 |
- Characterizes tumor cell heterogeneity |
|
|
37 |
- Includes cell type annotations |
|
|
38 |
- Focus on tumor cell populations |
|
|
39 |
|
|
|
40 |
3. Dataset 3 (Tumor Microenvironment Dataset) |
|
|
41 |
- Maps the tumor microenvironment |
|
|
42 |
- Contains malignant/non-malignant classifications |
|
|
43 |
- Includes detailed immune cell typing |
|
|
44 |
|
|
|
45 |
4. Dataset 4 (Control Melanocyte Dataset) |
|
|
46 |
- Normal melanocyte control data |
|
|
47 |
- Reference for non-malignant state |
|
|
48 |
- Baseline expression profiles |
|
|
49 |
|
|
|
50 |
5. Dataset 5 (Control Immune Dataset) |
|
|
51 |
- Normal immune cell control data |
|
|
52 |
- Reference for immune cell states |
|
|
53 |
- Baseline immune profiles |
|
|
54 |
|
|
|
55 |
## Strategy |
|
|
56 |
|
|
|
57 |
Use `seurat` and `harmony` to complete the following tasks: |
|
|
58 |
|
|
|
59 |
1. Load/process/combine data\ |
|
|
60 |
2. Clustering and biomarker determination\ |
|
|
61 |
3. Comparisons: disease vs. control |
|
|
62 |
|
|
|
63 |
# Initialize environment |
|
|
64 |
|
|
|
65 |
Install required packages. |
|
|
66 |
|
|
|
67 |
```{r install_packages, results="hide", message=F, warning=F, error=F} |
|
|
68 |
|
|
|
69 |
# Define packages to install |
|
|
70 |
pkg.list = c('svDialogs', 'vroom', 'dplyr', 'DT', 'Seurat', 'harmony', |
|
|
71 |
'patchwork', 'ggplot2', 'ggrepel', 'openxlsx', 'progress') |
|
|
72 |
|
|
|
73 |
# Define packages not already installed |
|
|
74 |
pkg.install <- pkg.list[!(pkg.list %in% installed.packages()[, 'Package'])] |
|
|
75 |
|
|
|
76 |
# Install uninstalled packages |
|
|
77 |
if (length(pkg.install) > 0) { |
|
|
78 |
install.packages(pkg.install) |
|
|
79 |
} |
|
|
80 |
|
|
|
81 |
``` |
|
|
82 |
|
|
|
83 |
Load installed packages. |
|
|
84 |
|
|
|
85 |
```{r load_packages, results="hide", message=F, warning=F, error=F} |
|
|
86 |
|
|
|
87 |
# Load packages |
|
|
88 |
library(svDialogs) # for prompting user-input |
|
|
89 |
library(vroom) # for quickly reading data |
|
|
90 |
library(dplyr) # for data processing |
|
|
91 |
library(DT) # to display datatables |
|
|
92 |
library(Seurat) # for scRNA-seq analysis |
|
|
93 |
library(harmony) # to integration scRNA-seq data |
|
|
94 |
library(patchwork) # for combining plots |
|
|
95 |
library(ggplot2) # for data visualization |
|
|
96 |
library(ggrepel) # to use geom_pont_repel() |
|
|
97 |
library(openxlsx) # to write data to excel |
|
|
98 |
library(progress) # to display progress bar |
|
|
99 |
|
|
|
100 |
``` |
|
|
101 |
|
|
|
102 |
Define custom functions. |
|
|
103 |
|
|
|
104 |
```{r custom_functions, results="hide", message=F, warning=F, error=F} |
|
|
105 |
|
|
|
106 |
# qc_filter(): filter data based on QC for scRNA-seq data |
|
|
107 |
qc_filter <- function(obj, feat.t=c(200, 2500), pct.mt.t=5, var.method='vst', |
|
|
108 |
feat.n=2000, qc.plot=T, top.n=10, title='') { |
|
|
109 |
|
|
|
110 |
############################ FUNCTION DESCRIPTION ############################ |
|
|
111 |
# feat.t = lower and upper limits on unique gene counts |
|
|
112 |
# pct.mt.t = threshold of level in mitochondrial contamination |
|
|
113 |
# var.method = method for selecting highly variable genes |
|
|
114 |
# feat.n = number of variable genes to select |
|
|
115 |
# qc.plot = boolean whether to generate plots to decide downstream analyses |
|
|
116 |
# title = string to use for plot title |
|
|
117 |
############################## BEGIN FUNCTION ################################ |
|
|
118 |
|
|
|
119 |
# determine percentage of mitochondrial contamination |
|
|
120 |
obj[['pct.mt']] <- PercentageFeatureSet(obj, pattern='^MT-') |
|
|
121 |
# filter + nomalize + scale data |
|
|
122 |
obj <- obj %>% |
|
|
123 |
subset(., subset=(nFeature_RNA > feat.t[1]) & (nFeature_RNA < feat.t[2]) & |
|
|
124 |
(pct.mt < pct.mt.t)) %>% NormalizeData(.) %>% |
|
|
125 |
FindVariableFeatures(., selection.method=var.method) %>% |
|
|
126 |
ScaleData(.) %>% RunPCA(., features=VariableFeatures(object=.)) |
|
|
127 |
# generate follow-up QC plots (if prompted) |
|
|
128 |
if (qc.plot) { |
|
|
129 |
p1 <- VariableFeaturePlot(obj) %>% |
|
|
130 |
LabelPoints(plot=., points=head(VariableFeatures(obj), top.n), repel=T) |
|
|
131 |
p2 <- ElbowPlot(obj) |
|
|
132 |
plot(p1 + p2 + plot_annotation(title=title)) |
|
|
133 |
} |
|
|
134 |
# return output |
|
|
135 |
return(obj) |
|
|
136 |
} |
|
|
137 |
|
|
|
138 |
# de_analyze(): conduct differential expression (DE) analysis |
|
|
139 |
de_analyze <- function(m1, m2, alt='two.sided', paired=F, var.equal=F, |
|
|
140 |
adj.method='bonferroni', t=0.05, de.plot=F, title='') { |
|
|
141 |
|
|
|
142 |
############################ FUNCTION DESCRIPTION ############################ |
|
|
143 |
# m1, m2 = expression matrices to compare |
|
|
144 |
# alt, paired, var.equal = arguments for t.test() function |
|
|
145 |
# adj.method = method for calculating adjusted p-value |
|
|
146 |
# t = threshold for significance |
|
|
147 |
# de.plot = boolean whether to generate a volcano plot |
|
|
148 |
############################## BEGIN FUNCTION ################################ |
|
|
149 |
|
|
|
150 |
# make sure two matrices have same number of rows |
|
|
151 |
if (nrow(m1) != nrow(m2)) { |
|
|
152 |
stop('Row length does not match between the provided matrices.') |
|
|
153 |
} |
|
|
154 |
# make sure gene names align between matrices |
|
|
155 |
if (!all(rownames(m1) == rownames(m2))) { |
|
|
156 |
stop('Gene names do not align between provided matrices.') |
|
|
157 |
} |
|
|
158 |
# instantiate output variable |
|
|
159 |
results <- data.frame(gene=rownames(m1), |
|
|
160 |
t.stat=vector(mode='numeric', length=nrow(m1)), |
|
|
161 |
p.val=vector(mode='numeric', length=nrow(m1))) |
|
|
162 |
# conduct unpaired t-test with unequal variance for each gene |
|
|
163 |
pb <- progress_bar$new( |
|
|
164 |
format=' analyzing [:bar] :percent time left: :eta', total=nrow(m1)) |
|
|
165 |
for (i in 1:nrow(m1)) { |
|
|
166 |
pb$tick() |
|
|
167 |
x <- m1[i, ]; y <- m2[i, ] |
|
|
168 |
r <- t.test(x, y, alternative=alt, paired=paired, var.equal=var.equal) |
|
|
169 |
results$t.stat[i] <- r$statistic |
|
|
170 |
results$p.val[i] <- r$p.value |
|
|
171 |
} |
|
|
172 |
# determine adjusted p-values |
|
|
173 |
results$q.val <- p.adjust(results$p.val, method=adj.method) |
|
|
174 |
# add additional fields |
|
|
175 |
results <- results %>% |
|
|
176 |
mutate(Significance=case_when(q.val < t & t.stat > 0 ~ 'Up', |
|
|
177 |
q.val < t & t.stat < 0 ~ 'Down', |
|
|
178 |
T ~ 'NS')) %>% arrange(q.val) |
|
|
179 |
# generate volcano plot (if prompted) |
|
|
180 |
if (de.plot) { |
|
|
181 |
p <- results %>% arrange(t.stat) %>% ggplot(data=., |
|
|
182 |
aes(x=t.stat, y=-log10(q.val), col=Significance, label=gene)) + |
|
|
183 |
geom_point() + geom_text_repel() + theme_minimal() + |
|
|
184 |
scale_color_manual(values=c('blue', 'black', 'red')) + ggtitle(title) |
|
|
185 |
plot(p) |
|
|
186 |
} |
|
|
187 |
# return output |
|
|
188 |
return(results) |
|
|
189 |
} |
|
|
190 |
|
|
|
191 |
``` |
|
|
192 |
|
|
|
193 |
Additional settings. |
|
|
194 |
|
|
|
195 |
```{r settings} |
|
|
196 |
|
|
|
197 |
# Adjust system settings |
|
|
198 |
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2) |
|
|
199 |
|
|
|
200 |
# Save plots? (default: F) |
|
|
201 |
save.plots <- dlgInput('Save all outputs? (T/F)', F)$res |
|
|
202 |
|
|
|
203 |
``` |
|
|
204 |
|
|
|
205 |
# Tasks |
|
|
206 |
|
|
|
207 |
## 1. Load/process/combine data |
|
|
208 |
|
|
|
209 |
**Description**: load all scRNA-seq files. |
|
|
210 |
|
|
|
211 |
```{r load_data, warning=F, message=F} |
|
|
212 |
setwd("..") |
|
|
213 |
# Load workspace (if it exists) |
|
|
214 |
f <- list.files() |
|
|
215 |
if (any(endsWith(f, '.RData'))) { |
|
|
216 |
load(f[endsWith(f, '.RData')][1]) |
|
|
217 |
} |
|
|
218 |
``` |
|
|
219 |
|
|
|
220 |
```{r load_data, warning=F, message=F} |
|
|
221 |
##################### ONLY RUN IF THERE'S NO WORKSPACE ######################### |
|
|
222 |
# # Load disease datasets |
|
|
223 |
# obj <- list(); meta <- list() |
|
|
224 |
# # Dataset 1: Treatment response dataset |
|
|
225 |
# meta$dataset1 <- read.delim(file='data/dataset1/metadata.txt') |
|
|
226 |
# x <- vroom('data/dataset1/counts.txt', col_names=F) |
|
|
227 |
# x <- x[, 1:nrow(meta$dataset1)] %>% as.matrix(.) |
|
|
228 |
# y <- read.delim('data/dataset1/genes.txt', header=F) |
|
|
229 |
# rownames(x) <- make.unique(y$V1) |
|
|
230 |
# colnames(x) <- meta$dataset1$ID |
|
|
231 |
# obj$dataset1 <- CreateSeuratObject(x, project='dataset1', |
|
|
232 |
# min.cells=3, min.features=200) |
|
|
233 |
# obj$dataset1$Treatment <- as.factor(meta$dataset1$Treatment) |
|
|
234 |
# obj$dataset1$Response <- as.factor(meta$dataset1$Response) |
|
|
235 |
# rm(x, y); gc() |
|
|
236 |
# |
|
|
237 |
# # Dataset 2: Tumor cell dataset |
|
|
238 |
# meta$dataset2 <- read.csv('data/dataset2/metadata.csv') |
|
|
239 |
# x <- vroom('data/dataset2/expression_matrix.csv') |
|
|
240 |
# y <- as.matrix(x[, -1]); rownames(y) <- x$...1 |
|
|
241 |
# obj$dataset2 <- CreateSeuratObject(y, project='dataset2', |
|
|
242 |
# min.cells=3, min.features=200) |
|
|
243 |
# obj$dataset2$type <- as.factor(meta$dataset2$cell.types) |
|
|
244 |
# |
|
|
245 |
# # Dataset 3: Tumor microenvironment dataset |
|
|
246 |
# x <- vroom('data/dataset3/expression_matrix.txt') |
|
|
247 |
# y <- as.matrix(x[-c(1:3), -1]); rownames(y) <- x$Cell[-c(1:3)] |
|
|
248 |
# obj$dataset3 <- CreateSeuratObject(y, project='dataset3', |
|
|
249 |
# min.cells=3, min.features=200) |
|
|
250 |
# meta$dataset3 <- data.frame(Tumor=t(x[1, -1]), |
|
|
251 |
# Malignant=case_when(x[2, -1] == 1 ~ 'No', |
|
|
252 |
# x[2, -1] == 2 ~ 'Yes', |
|
|
253 |
# x[2, -1] == 0 ~ 'Unresolved'), |
|
|
254 |
# NMType=case_when(x[3, -1] == 1 ~ 'T', |
|
|
255 |
# x[3, -1] == 2 ~ 'B', |
|
|
256 |
# x[3, -1] == 3 ~ 'Macro', |
|
|
257 |
# x[3, -1] == 4 ~ 'Endo', |
|
|
258 |
# x[3, -1] == 5 ~ 'CAF', |
|
|
259 |
# x[3, -1] == 6 ~ 'NK', |
|
|
260 |
# x[3, -1] == 0 ~ 'NA')) |
|
|
261 |
# obj$dataset3$Malignant <- meta$dataset3$Malignant |
|
|
262 |
# obj$dataset3$NMType <- meta$dataset3$NMType |
|
|
263 |
# |
|
|
264 |
# # Load control datasets |
|
|
265 |
# # Dataset 4: Control melanocyte dataset |
|
|
266 |
# x <- vroom('data/dataset4/expression_matrix.csv') |
|
|
267 |
# y <- as.matrix(x[, -1]); rownames(y) <- x$...1 |
|
|
268 |
# obj$dataset4 <- CreateSeuratObject(y, project='dataset4', |
|
|
269 |
# min.cells=3, min.features=200) |
|
|
270 |
# |
|
|
271 |
# # Dataset 5: Control immune dataset |
|
|
272 |
# meta$dataset5 <- read.table('data/dataset5/metadata.tsv', |
|
|
273 |
# sep='\t', header=T) |
|
|
274 |
# x <- vroom('data/dataset5/matrix.tsv') |
|
|
275 |
# y <- as.matrix(x[, -1]); rownames(y) <- x$Gene.Name |
|
|
276 |
# obj$dataset5 <- CreateSeuratObject(y, project='dataset5', |
|
|
277 |
# min.cells=3, min.features=200) |
|
|
278 |
# obj$dataset5$sample <- as.factor(meta$dataset5$sample) |
|
|
279 |
# rm(f, x, y, meta); gc() |
|
|
280 |
|
|
|
281 |
``` |
|
|
282 |
|
|
|
283 |
*Description*: Combine all datasets together into single Seurat object. Also apply `harmony` to remove clustering bias based on dataset source. |
|
|
284 |
|
|
|
285 |
```{r combine_data, warning=F, message=F} |
|
|
286 |
file_path <- "/Users/scampit/Projects/torchstack/cancer-biomarker-discovery" |
|
|
287 |
setwd(file_path) |
|
|
288 |
|
|
|
289 |
# Combine all datasets |
|
|
290 |
if (!exists('data.all')) { |
|
|
291 |
data.all <- list() |
|
|
292 |
} |
|
|
293 |
if (!('raw' %in% names(data.all))) { |
|
|
294 |
data.all$raw <- merge(obj$dataset1, obj$dataset2) %>% |
|
|
295 |
merge(., obj$dataset3) %>% merge(., obj$dataset4) %>% merge(., obj$dataset5) |
|
|
296 |
} |
|
|
297 |
|
|
|
298 |
# Add source information |
|
|
299 |
if (!('Source' %in% names(data.all$raw@meta.data))) { |
|
|
300 |
data.all$raw$Source <- c(rep(names(obj)[1], ncol(obj$dataset1)), |
|
|
301 |
rep(names(obj)[2], ncol(obj$dataset2)), |
|
|
302 |
rep(names(obj)[3], ncol(obj$dataset3)), |
|
|
303 |
rep(names(obj)[4], ncol(obj$dataset4)), |
|
|
304 |
rep(names(obj)[5], ncol(obj$dataset5))) |
|
|
305 |
} |
|
|
306 |
|
|
|
307 |
# Apply QC |
|
|
308 |
if (!('proc' %in% names(data.all))) { |
|
|
309 |
data.all$proc <- qc_filter(data.all$raw) |
|
|
310 |
} |
|
|
311 |
|
|
|
312 |
# Visualize integration results |
|
|
313 |
p1 <- DimPlot(object=data.all$proc, reduction='pca', pt.size=0.1, |
|
|
314 |
group.by='Source') |
|
|
315 |
p2 <- VlnPlot(object=data.all$proc, features='PC_1', pt.size=0.1, |
|
|
316 |
group.by='Source') |
|
|
317 |
p1 + p2 |
|
|
318 |
if (save.plots) { |
|
|
319 |
ggsave('plots/QC_no_harmony.pdf', width=10, height=5, units='in') |
|
|
320 |
} |
|
|
321 |
|
|
|
322 |
# Apply harmony (to remove clustering based on dataset source) |
|
|
323 |
if (!('harmony' %in% names(data.all$proc@reductions))) { |
|
|
324 |
data.all$proc <- data.all$proc %>% RunHarmony('Source', plot_convergence=T) |
|
|
325 |
} |
|
|
326 |
|
|
|
327 |
# Visualize integration results (after harmony) |
|
|
328 |
p1 <- DimPlot(object=data.all$proc, reduction='harmony', pt.size=0.1, |
|
|
329 |
group.by='Source') |
|
|
330 |
p2 <- VlnPlot(object=data.all$proc, features='harmony_1', pt.size=0.1, |
|
|
331 |
group.by='Source') |
|
|
332 |
p1 + p2 |
|
|
333 |
if (save.plots) { |
|
|
334 |
ggsave('plots/QC_w_harmony.pdf', width=10, height=5, units='in') |
|
|
335 |
} |
|
|
336 |
|
|
|
337 |
``` |
|
|
338 |
|
|
|
339 |
## 2. Clustering and biomarker determination |
|
|
340 |
|
|
|
341 |
**Description**: cluster cells based on UMAP and determine biomarkers based on cluster assignment. |
|
|
342 |
|
|
|
343 |
### Clustering |
|
|
344 |
|
|
|
345 |
```{r clustering, warning=F, message=T} |
|
|
346 |
|
|
|
347 |
# Visualize UMAP plots (runtime: ~5 minutes) |
|
|
348 |
if (!('umap' %in% names(data.all$proc@reductions))) { |
|
|
349 |
data.all$proc <- data.all$proc %>% |
|
|
350 |
RunUMAP(reduction='harmony', dims=1:20) %>% |
|
|
351 |
FindNeighbors(reduction='harmony', dims=1:20) %>% |
|
|
352 |
FindClusters(resolution=0.5) %>% identity() |
|
|
353 |
} |
|
|
354 |
|
|
|
355 |
# by dataset source |
|
|
356 |
p <- DimPlot(data.all$proc, reduction='umap', group.by='Source', pt.size=0.1, |
|
|
357 |
split.by='Source') + ggtitle('UMAP split by dataset source'); plot(p) |
|
|
358 |
# by cluster (unlabeled) |
|
|
359 |
p <- DimPlot(data.all$proc, reduction='umap', label=T, pt.size=0.1) + |
|
|
360 |
ggtitle('UMAP of combined scRNA-seq data (unlabeled)'); plot(p) |
|
|
361 |
if (save.plots) { |
|
|
362 |
ggsave('plots/UMAP_unlabeled.pdf', width=10, height=5, units='in') |
|
|
363 |
} |
|
|
364 |
|
|
|
365 |
# by cluster (labeled) |
|
|
366 |
lab <- c('Tumor cell', 'Technical error 1', 'Cytotoxic CD8 T cell 1', 'B cell', |
|
|
367 |
'Melanocytes 1', 'Macrophage', 'Cytotoxic CD8 T cell 2', 'Technical error 2', |
|
|
368 |
'Memory T cell', 'Melanocytes 2', 'Dysfunctional CD8 T cell', |
|
|
369 |
'Cytotoxic CD8 T cell 3', '? 1', 'Melanocytes 3', 'Activated cell', |
|
|
370 |
'Exhausted T cell', 'Cytotoxic T cell', '? 2', '? 3', 'Melanocytes 4', '? 4') |
|
|
371 |
names(lab) <- levels(data.all$proc) |
|
|
372 |
plot.data <- data.all$proc %>% RenameIdents(., lab) |
|
|
373 |
p <- DimPlot(plot.data, reduction='umap', label=T, pt.size=0.1) + |
|
|
374 |
ggtitle('UMAP of combined scRNA-seq data (labeled)'); plot(p) |
|
|
375 |
if (save.plots) { |
|
|
376 |
ggsave('plots/UMAP_labeled.pdf', width=10, height=5, units='in') |
|
|
377 |
} |
|
|
378 |
|
|
|
379 |
``` |
|
|
380 |
|
|
|
381 |
### Biomarkers |
|
|
382 |
|
|
|
383 |
```{r biomarkers, warning=F, message=T} |
|
|
384 |
|
|
|
385 |
# Find all biomarkers based on clustering (runtime: ~30 minutes) |
|
|
386 |
if (!('bm' %in% names(data.all))) { |
|
|
387 |
data.all$bm <- FindAllMarkers(data.all$proc, min.pct=0.25, logfc.threshold=0.25) |
|
|
388 |
} |
|
|
389 |
|
|
|
390 |
# View table of top 3 biomarkers for each cluster |
|
|
391 |
d <- data.all$bm %>% group_by(cluster) %>% slice_max(n=3, order_by=avg_log2FC) |
|
|
392 |
datatable(d) |
|
|
393 |
|
|
|
394 |
# Visualize ridge plots based on biomarkers of interest |
|
|
395 |
ridge.feat <- dlg_list(sort(unique(d$gene)), multiple=T)$res |
|
|
396 |
p <- RidgePlot(data.all$proc, features=ridge.feat, |
|
|
397 |
ncol=ceiling(length(ridge.feat) / 2)); plot(p) |
|
|
398 |
if (save.plots) { |
|
|
399 |
ggsave('plots/biomarker_ridge_plots.pdf', width=10, height=10, units='in') |
|
|
400 |
} |
|
|
401 |
|
|
|
402 |
``` |
|
|
403 |
|
|
|
404 |
## 3. Comparisons: disease vs. control |
|
|
405 |
|
|
|
406 |
**Description**: determine differentially expressed genes (DEGs) between disease vs. control groups. |
|
|
407 |
|
|
|
408 |
### Case: tumor |
|
|
409 |
|
|
|
410 |
```{r DE_tumor, warning=F, message=T} |
|
|
411 |
|
|
|
412 |
# Instantiate DE variable |
|
|
413 |
if (!exists('de')) { |
|
|
414 |
de <- list() |
|
|
415 |
} |
|
|
416 |
if (!('data' %in% names(de))) { |
|
|
417 |
de$data <- GetAssayData(object=data.all$proc, slot='data') |
|
|
418 |
} |
|
|
419 |
|
|
|
420 |
# Define datasets to compare |
|
|
421 |
ix <- intersect(rownames(obj$dataset3), rownames(obj$dataset2)) %>% |
|
|
422 |
intersect(., rownames(obj$dataset4)) |
|
|
423 |
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes' |
|
|
424 |
jx2 <- data.all$proc$Source == names(obj)[4] |
|
|
425 |
x <- de$data[ix, jx1]; y <- de$data[ix, jx2] |
|
|
426 |
|
|
|
427 |
# DE analysis (runtime: ~15 minutes) |
|
|
428 |
if (!('tumor' %in% names(de))) { |
|
|
429 |
de$tumor <- de_analyze(x, y) %>% na.omit |
|
|
430 |
} |
|
|
431 |
|
|
|
432 |
# Visualize dot plot |
|
|
433 |
dot.feat <- dlg_list(de$tumor$gene[de$tumor$q.val < 0.05], multiple=T)$res |
|
|
434 |
p.data <- data.all$proc |
|
|
435 |
Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA')) |
|
|
436 |
p <- p.data %>% |
|
|
437 |
DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + |
|
|
438 |
ggtitle('Case: tumor') |
|
|
439 |
plot(p) |
|
|
440 |
if (save.plots) { |
|
|
441 |
ggsave('plots/DE_tumor.pdf', width=10, height=5, units='in') |
|
|
442 |
} |
|
|
443 |
|
|
|
444 |
``` |
|
|
445 |
|
|
|
446 |
### Case: immune cells (bulk) |
|
|
447 |
|
|
|
448 |
```{r DE_immune_bulk,warning=F, message=T} |
|
|
449 |
|
|
|
450 |
# Define datasets to compare |
|
|
451 |
ix <- intersect(rownames(obj$dataset1), rownames(obj$dataset2)) %>% |
|
|
452 |
intersect(., rownames(obj$dataset5)) |
|
|
453 |
jx1 <- data.all$proc$Source == names(obj)[1] | data.all$proc$Malignant %in% 'No' |
|
|
454 |
jx2 <- data.all$proc$Source == names(obj)[5] |
|
|
455 |
x <- de$data[ix, jx1]; y <- de$data[ix, jx2] |
|
|
456 |
|
|
|
457 |
# DE analysis (runtime: ~20 minutes) |
|
|
458 |
if (!('immune' %in% names(de))) { |
|
|
459 |
de$immune <- de_analyze(x, y) %>% na.omit() |
|
|
460 |
} |
|
|
461 |
|
|
|
462 |
# Visualize dot plot |
|
|
463 |
dot.feat <- dlg_list(de$immune$gene[de$immune$q.val < 0.05], multiple=T)$res |
|
|
464 |
p.data <- data.all$proc |
|
|
465 |
Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA')) |
|
|
466 |
p <- p.data %>% |
|
|
467 |
DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + |
|
|
468 |
ggtitle('Case: immune cells') |
|
|
469 |
plot(p) |
|
|
470 |
if (save.plots) { |
|
|
471 |
ggsave('plots/DE_immune_bulk.pdf', width=10, height=5, units='in') |
|
|
472 |
} |
|
|
473 |
|
|
|
474 |
``` |
|
|
475 |
|
|
|
476 |
### Case: immune cells (cluster-based) |
|
|
477 |
|
|
|
478 |
```{r DE_immune_cluster, warning=F, message=T} |
|
|
479 |
|
|
|
480 |
# Define datasets to compare (runtime: ~5 minutes) |
|
|
481 |
immune.cells <- c('B', 'CD8', 'Macrophage', 'Memory', 'Activated') |
|
|
482 |
for (i in 1:length(immune.cells)) { |
|
|
483 |
# Define row + column indices |
|
|
484 |
ix <- intersect(rownames(obj$dataset1), rownames(obj$dataset2)) %>% |
|
|
485 |
intersect(., rownames(obj$dataset5)) |
|
|
486 |
c <- grepl(immune.cells[i], Idents(plot.data)) |
|
|
487 |
jx1 <- (plot.data$Source == names(obj)[1] | |
|
|
488 |
data.all$proc$Malignant %in% 'No') & c |
|
|
489 |
jx2 <- data.all$proc$Source == names(obj)[5] & c |
|
|
490 |
x <- de$data[ix, jx1]; y <- de$data[ix, jx2] |
|
|
491 |
# DE analysis |
|
|
492 |
if (!(immune.cells[i] %in% names(de))) { |
|
|
493 |
de[[immune.cells[i]]] <- de_analyze(x, y) %>% na.omit() |
|
|
494 |
} |
|
|
495 |
# Visualize dot plot |
|
|
496 |
x <- de[[immune.cells[[i]]]] |
|
|
497 |
dot.feat <- x$gene[x$q.val < 0.05][1:5] |
|
|
498 |
p.data <- data.all$proc |
|
|
499 |
Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA')) |
|
|
500 |
p <- p.data %>% |
|
|
501 |
DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + |
|
|
502 |
ggtitle(paste0('Case: ', immune.cells[i])) |
|
|
503 |
plot(p) |
|
|
504 |
if (save.plots) { |
|
|
505 |
ggsave(paste0('plots/DE_', immune.cells[i], '.pdf'), |
|
|
506 |
width=10, height=5, units='in') |
|
|
507 |
} |
|
|
508 |
} |
|
|
509 |
|
|
|
510 |
``` |
|
|
511 |
|
|
|
512 |
# Case: responder vs. non-responder |
|
|
513 |
ix <- rownames(obj$dataset1) |
|
|
514 |
jx1 <- data.all$proc$Response %in% 'Responder' |
|
|
515 |
jx2 <- data.all$proc$Response %in% 'Non-responder' |
|
|
516 |
|
|
|
517 |
# Save outputs and working space (if prompted) |
|
|
518 |
|
|
|
519 |
``` |
|
|
520 |
|
|
|
521 |
```{r glycolysis_visualization, warning=F, message=T} |
|
|
522 |
# Define key glycolytic genes |
|
|
523 |
glycolysis_genes <- c( |
|
|
524 |
"HK1", "HK2", "GPI", "PFKL", "PFKM", "PFKP", # Glucose uptake and early glycolysis |
|
|
525 |
"ALDOA", "ALDOB", "ALDOC", # Aldolase isoforms |
|
|
526 |
"TPI1", # Triose phosphate isomerase |
|
|
527 |
"GAPDH", # Glyceraldehyde phosphate dehydrogenase |
|
|
528 |
"PGK1", # Phosphoglycerate kinase |
|
|
529 |
"PGAM1", # Phosphoglycerate mutase |
|
|
530 |
"ENO1", "ENO2", # Enolase isoforms |
|
|
531 |
"PKM", "PKLR", # Pyruvate kinase isoforms |
|
|
532 |
"LDHA", "LDHB", # Lactate dehydrogenase |
|
|
533 |
"SLC2A1", "SLC2A3" # Glucose transporters (GLUT1, GLUT3) |
|
|
534 |
) |
|
|
535 |
|
|
|
536 |
# Filter for genes present in the dataset |
|
|
537 |
glycolysis_genes_present <- intersect(glycolysis_genes, rownames(de$data)) |
|
|
538 |
|
|
|
539 |
# Create anonymous gene IDs |
|
|
540 |
anon_genes <- paste0("gene_", seq_along(glycolysis_genes_present)) |
|
|
541 |
names(anon_genes) <- glycolysis_genes_present |
|
|
542 |
|
|
|
543 |
# Create mapping dataframe for reference |
|
|
544 |
gene_mapping <- data.frame( |
|
|
545 |
Original_Gene = glycolysis_genes_present, |
|
|
546 |
Anonymous_ID = anon_genes[glycolysis_genes_present] |
|
|
547 |
) |
|
|
548 |
|
|
|
549 |
# Write mapping to file if saving is enabled |
|
|
550 |
if (save.plots) { |
|
|
551 |
write.csv(gene_mapping, 'results/glycolysis_gene_mapping.csv', row.names = FALSE) |
|
|
552 |
} |
|
|
553 |
|
|
|
554 |
# Create dot plot for glycolytic genes |
|
|
555 |
p.data <- data.all$proc |
|
|
556 |
Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA')) |
|
|
557 |
|
|
|
558 |
# Generate dot plot with anonymous IDs and custom colors |
|
|
559 |
p_glycolysis <- DotPlot(p.data, |
|
|
560 |
features = glycolysis_genes_present, |
|
|
561 |
idents = c('Disease', 'Control')) + |
|
|
562 |
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + |
|
|
563 |
xlab('Genes') + |
|
|
564 |
ylab('Condition') + |
|
|
565 |
scale_color_gradient2(low = "navy", mid = "gray90", high = "red", |
|
|
566 |
midpoint = 0, name = "Average\nExpression") + |
|
|
567 |
scale_x_discrete(labels = anon_genes[glycolysis_genes_present]) |
|
|
568 |
|
|
|
569 |
# Display plot |
|
|
570 |
print(p_glycolysis) |
|
|
571 |
|
|
|
572 |
# Save plot if requested |
|
|
573 |
if (save.plots) { |
|
|
574 |
ggsave('plots/DE_tumor_glycolysis.pdf', |
|
|
575 |
plot = p_glycolysis, |
|
|
576 |
width = 12, |
|
|
577 |
height = 6, |
|
|
578 |
units = 'in') |
|
|
579 |
} |
|
|
580 |
|
|
|
581 |
# Calculate statistics for glycolytic genes |
|
|
582 |
glycolysis_stats <- de$tumor[de$tumor$gene %in% glycolysis_genes_present, ] |
|
|
583 |
glycolysis_stats <- glycolysis_stats[order(glycolysis_stats$q.val), ] |
|
|
584 |
|
|
|
585 |
# Add anonymous IDs to statistics |
|
|
586 |
glycolysis_stats$anonymous_id <- anon_genes[glycolysis_stats$gene] |
|
|
587 |
|
|
|
588 |
# Display statistics with anonymous IDs |
|
|
589 |
print("Differential Expression Statistics for De-identified Glycolytic Genes:") |
|
|
590 |
glycolysis_stats$gene <- glycolysis_stats$anonymous_id |
|
|
591 |
print(glycolysis_stats) |
|
|
592 |
|
|
|
593 |
# Count significantly different genes |
|
|
594 |
sig_glyco_genes <- sum(glycolysis_stats$q.val < 0.05) |
|
|
595 |
print(sprintf("Number of significantly different glycolytic genes (q < 0.05): %d", sig_glyco_genes)) |
|
|
596 |
``` |
|
|
597 |
|
|
|
598 |
```{r glycolysis_boxplots, warning=F, message=T} |
|
|
599 |
# Get expression data for glycolytic genes |
|
|
600 |
expr_data <- GetAssayData(p.data, slot = "data")[glycolysis_genes_present,] |
|
|
601 |
|
|
|
602 |
# Filter out genes with all zeros |
|
|
603 |
nonzero_genes <- rownames(expr_data)[rowSums(expr_data > 0) > 0] |
|
|
604 |
expr_data <- expr_data[nonzero_genes,] |
|
|
605 |
|
|
|
606 |
# Update anonymous gene IDs for non-zero genes |
|
|
607 |
anon_genes <- anon_genes[nonzero_genes] |
|
|
608 |
|
|
|
609 |
target_genes <- c("g10", "g21", "g03") |
|
|
610 |
selected_genes <- names(anon_genes)[anon_genes %in% target_genes] |
|
|
611 |
|
|
|
612 |
# Create a data frame for plotting and store statistical results |
|
|
613 |
plot_data <- data.frame() |
|
|
614 |
stat_results <- data.frame( |
|
|
615 |
Gene = character(), |
|
|
616 |
P_value = numeric(), |
|
|
617 |
T_statistic = numeric(), |
|
|
618 |
Mean_diff = numeric(), |
|
|
619 |
stringsAsFactors = FALSE |
|
|
620 |
) |
|
|
621 |
|
|
|
622 |
for (gene in selected_genes) { |
|
|
623 |
# Get expression values for current gene |
|
|
624 |
expr_values <- expr_data[gene,] |
|
|
625 |
|
|
|
626 |
# Split values by condition before centering |
|
|
627 |
disease_vals <- expr_values[colnames(expr_data) %in% colnames(de$data)[jx1]] |
|
|
628 |
control_vals <- expr_values[colnames(expr_data) %in% colnames(de$data)[jx2]] |
|
|
629 |
|
|
|
630 |
# Perform t-test on raw values |
|
|
631 |
t_test_result <- t.test(disease_vals, control_vals) |
|
|
632 |
|
|
|
633 |
# Center the expression values by subtracting the overall mean |
|
|
634 |
overall_mean <- mean(expr_values) |
|
|
635 |
disease_vals_centered <- disease_vals - overall_mean |
|
|
636 |
control_vals_centered <- control_vals - overall_mean |
|
|
637 |
|
|
|
638 |
# Store statistical results |
|
|
639 |
stat_results <- rbind(stat_results, data.frame( |
|
|
640 |
Gene = anon_genes[gene], |
|
|
641 |
P_value = t_test_result$p.value, |
|
|
642 |
T_statistic = t_test_result$statistic, |
|
|
643 |
Mean_diff = mean(disease_vals) - mean(control_vals), |
|
|
644 |
Mean_centered_diff = mean(disease_vals_centered) - mean(control_vals_centered), |
|
|
645 |
stringsAsFactors = FALSE |
|
|
646 |
)) |
|
|
647 |
|
|
|
648 |
# Create temporary data frames for each condition with centered values |
|
|
649 |
temp_df <- data.frame( |
|
|
650 |
Expression = c(disease_vals_centered, control_vals_centered), |
|
|
651 |
Gene = anon_genes[gene], |
|
|
652 |
Condition = c(rep("Disease", length(disease_vals_centered)), |
|
|
653 |
rep("Control", length(control_vals_centered))) |
|
|
654 |
) |
|
|
655 |
|
|
|
656 |
# Append to main data frame |
|
|
657 |
plot_data <- rbind(plot_data, temp_df) |
|
|
658 |
} |
|
|
659 |
|
|
|
660 |
# Add FDR correction |
|
|
661 |
stat_results$FDR <- p.adjust(stat_results$P_value, method = "BH") |
|
|
662 |
|
|
|
663 |
# Sort results by p-value |
|
|
664 |
stat_results <- stat_results[order(stat_results$P_value), ] |
|
|
665 |
|
|
|
666 |
# Create boxplot with swarm plot |
|
|
667 |
p_boxplot <- ggplot(plot_data, aes(x = Gene, y = Expression, fill = Condition)) + |
|
|
668 |
# Add swarm plot layer first (so it appears behind) |
|
|
669 |
geom_jitter(aes(color = Condition), position = position_jitterdodge(jitter.width = 0.2), |
|
|
670 |
size = 0.3, alpha = 0.3) + |
|
|
671 |
# Add boxplot layer second (so it appears in front) |
|
|
672 |
geom_boxplot(outlier.shape = NA, alpha = 0.4, width = 0.7) + |
|
|
673 |
# Colors for boxplot fill and point colors |
|
|
674 |
scale_fill_manual(values = c("Disease" = "#E41A1C", "Control" = "#377EB8")) + |
|
|
675 |
scale_color_manual(values = c("Disease" = "#E41A1C", "Control" = "#377EB8")) + |
|
|
676 |
# Set y-axis limits |
|
|
677 |
ylim(-2, max(plot_data$Expression)) + |
|
|
678 |
# Theme customization |
|
|
679 |
theme_minimal() + |
|
|
680 |
theme( |
|
|
681 |
axis.text.x = element_text(angle = 45, hjust = 1), |
|
|
682 |
panel.grid.major.y = element_blank(), # Remove y-axis major grid lines |
|
|
683 |
panel.grid.minor.y = element_blank(), # Remove y-axis minor grid lines |
|
|
684 |
panel.grid.major.x = element_line(color = "gray90"), # Keep x-axis major grid lines |
|
|
685 |
panel.grid.minor.x = element_blank(), # Remove x-axis minor grid lines |
|
|
686 |
legend.position = c(0.95, 0.95), # Position legend inside top-right |
|
|
687 |
legend.justification = c(1, 1), # Anchor point for legend |
|
|
688 |
legend.box.just = "right", |
|
|
689 |
legend.margin = margin(6, 6, 6, 6), |
|
|
690 |
legend.background = element_rect(fill = "white", color = NA), |
|
|
691 |
legend.box.background = element_rect(color = "gray90") |
|
|
692 |
) + |
|
|
693 |
# Labels |
|
|
694 |
xlab("Genes") + |
|
|
695 |
ylab("Centered Expression") + |
|
|
696 |
ggtitle("Gene Expression Relative to Mean") + |
|
|
697 |
# Add horizontal line at y=0 |
|
|
698 |
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) |
|
|
699 |
|
|
|
700 |
# Display plot |
|
|
701 |
print(p_boxplot) |
|
|
702 |
|
|
|
703 |
# Save plot if requested |
|
|
704 |
if (save.plots) { |
|
|
705 |
ggsave('plots/DE_tumor_glycolysis_boxplots.pdf', |
|
|
706 |
plot = p_boxplot, |
|
|
707 |
width = 12, |
|
|
708 |
height = 6, |
|
|
709 |
units = 'in') |
|
|
710 |
} |
|
|
711 |
|
|
|
712 |
# Calculate summary statistics for centered data |
|
|
713 |
summary_stats <- plot_data %>% |
|
|
714 |
group_by(Gene, Condition) %>% |
|
|
715 |
summarise( |
|
|
716 |
Mean = mean(Expression), |
|
|
717 |
Median = median(Expression), |
|
|
718 |
SD = sd(Expression), |
|
|
719 |
Q1 = quantile(Expression, 0.25), |
|
|
720 |
Q3 = quantile(Expression, 0.75), |
|
|
721 |
n = n(), |
|
|
722 |
.groups = 'drop' |
|
|
723 |
) |
|
|
724 |
|
|
|
725 |
# Display summary statistics |
|
|
726 |
print("Summary Statistics for Centered Expression by Condition:") |
|
|
727 |
print(summary_stats) |
|
|
728 |
|
|
|
729 |
# Display statistical test results |
|
|
730 |
print("\nStatistical Test Results (Disease vs Control):") |
|
|
731 |
print(stat_results) |
|
|
732 |
|
|
|
733 |
# Print number of genes removed due to zero expression |
|
|
734 |
n_removed <- length(glycolysis_genes_present) - length(nonzero_genes) |
|
|
735 |
if (n_removed > 0) { |
|
|
736 |
print(sprintf("\nRemoved %d genes with zero expression", n_removed)) |
|
|
737 |
print("Removed genes:") |
|
|
738 |
print(setdiff(glycolysis_genes_present, nonzero_genes)) |
|
|
739 |
} |
|
|
740 |
|
|
|
741 |
# Print number of significantly different genes |
|
|
742 |
sig_genes <- sum(stat_results$FDR < 0.05) |
|
|
743 |
if (sig_genes > 0) { |
|
|
744 |
print(sprintf("\nNumber of significantly different genes (FDR < 0.05): %d", sig_genes)) |
|
|
745 |
print("Significant genes:") |
|
|
746 |
print(stat_results[stat_results$FDR < 0.05, ]) |
|
|
747 |
} |
|
|
748 |
``` |