|
a |
|
b/notebooks/DE_analysis.Rmd |
|
|
1 |
--- |
|
|
2 |
title: "DE Analysis of scRNA-seq data" |
|
|
3 |
date: "Last modified: `r format(Sys.time(), '%B %d, %Y')`" |
|
|
4 |
tags: [scRNA-seq, seurat, melanoma, immunotherapy, PBMCs] |
|
|
5 |
output: |
|
|
6 |
html_document: |
|
|
7 |
theme: flatly |
|
|
8 |
highlight: zenburn |
|
|
9 |
toc: true |
|
|
10 |
number_sections: false |
|
|
11 |
toc_depth: 2 |
|
|
12 |
toc_float: |
|
|
13 |
collapsed: false |
|
|
14 |
smooth_scroll: true |
|
|
15 |
code_folding: hide |
|
|
16 |
self_contained: yes |
|
|
17 |
--- |
|
|
18 |
|
|
|
19 |
# Notebook Description |
|
|
20 |
|
|
|
21 |
This notebook compiles the code and outputs for differential expression (DE) analysis. Specifically, two methods are covered: |
|
|
22 |
* unpaired Student's t-test (custom function) |
|
|
23 |
* [MAST](https://doi.org/10.1186/s13059-015-0844-5) |
|
|
24 |
|
|
|
25 |
Of note, pre-processed data stored in the `scRNAseq_analysis.RData` file is used for the analyses detailed below. |
|
|
26 |
|
|
|
27 |
# Initialize environment |
|
|
28 |
|
|
|
29 |
```{r load_packages, results="hide", message=F, warning=F, error=F} |
|
|
30 |
# Install required packages if not already installed |
|
|
31 |
if (!require("BiocManager", quietly = TRUE)) { |
|
|
32 |
install.packages("BiocManager") |
|
|
33 |
} |
|
|
34 |
if (!require("MAST", quietly = TRUE)) { |
|
|
35 |
BiocManager::install("MAST") |
|
|
36 |
} |
|
|
37 |
if (!require("presto", quietly = TRUE)) { |
|
|
38 |
install.packages("presto") |
|
|
39 |
} |
|
|
40 |
|
|
|
41 |
# Memory management settings |
|
|
42 |
options(future.globals.maxSize = 16000 * 1024^2) # Set maximum global size to 8GB |
|
|
43 |
|
|
|
44 |
# Load core packages first |
|
|
45 |
library(Matrix) # for sparse matrix operations |
|
|
46 |
library(Seurat) # for scRNA-seq analysis |
|
|
47 |
library(MAST) # for scRNAseq DEG analysis |
|
|
48 |
|
|
|
49 |
# Load other packages |
|
|
50 |
library(svDialogs) # for prompting user-input |
|
|
51 |
library(vroom) # for quickly reading data |
|
|
52 |
library(dplyr) # for data processing |
|
|
53 |
library(DT) # to display datatables |
|
|
54 |
library(harmony) # to integration scRNA-seq data |
|
|
55 |
library(patchwork) # for combining plots |
|
|
56 |
library(ggplot2) # for data visualization |
|
|
57 |
library(ggrepel) # to use geom_pont_repel() |
|
|
58 |
library(ggvenn) # to visualize venn diagrams |
|
|
59 |
library(openxlsx) # to write data to excel |
|
|
60 |
library(progress) # to display progress bar |
|
|
61 |
library(rio) # to load all worksheets in a workbook |
|
|
62 |
library(pheatmap) # to visualize heatmaps |
|
|
63 |
library(ggfortify) # to visualize PCA plots |
|
|
64 |
library(presto) # for faster Wilcoxon test implementation |
|
|
65 |
library(gridExtra) # for arranging multiple plots |
|
|
66 |
|
|
|
67 |
# Set up parallel processing |
|
|
68 |
library(future) |
|
|
69 |
plan(multisession, workers = 2) # Adjust based on available memory |
|
|
70 |
options(future.globals.maxSize = 16000 * 1024^2) |
|
|
71 |
``` |
|
|
72 |
|
|
|
73 |
System settings. |
|
|
74 |
|
|
|
75 |
```{r settings} |
|
|
76 |
|
|
|
77 |
# Adjust system settings |
|
|
78 |
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2) |
|
|
79 |
|
|
|
80 |
# Enhanced memory management functions |
|
|
81 |
clear_memory <- function(deep_clean = FALSE) { |
|
|
82 |
# Standard cleanup |
|
|
83 |
gc(verbose = FALSE) |
|
|
84 |
if(exists(".Random.seed")) rm(.Random.seed) |
|
|
85 |
|
|
|
86 |
# Deep cleanup if requested |
|
|
87 |
if(deep_clean) { |
|
|
88 |
# Clear package caches |
|
|
89 |
if(exists(".__C__")) rm(list=".__C__", envir=.GlobalEnv) |
|
|
90 |
# Clear hidden objects |
|
|
91 |
rm(list = ls(all.names = TRUE, envir = .GlobalEnv), envir = .GlobalEnv) |
|
|
92 |
# Force garbage collection multiple times |
|
|
93 |
replicate(3, gc(verbose = FALSE)) |
|
|
94 |
} |
|
|
95 |
|
|
|
96 |
invisible(gc()) |
|
|
97 |
} |
|
|
98 |
|
|
|
99 |
# Function to get data efficiently with batching support |
|
|
100 |
get_assay_data <- function(object, data_type = "data", batch_size = 1000) { |
|
|
101 |
tryCatch({ |
|
|
102 |
# Get data using appropriate method |
|
|
103 |
if (packageVersion("Seurat") >= "5.0.0") { |
|
|
104 |
data <- GetAssayData(object = object, layer = data_type) |
|
|
105 |
} else { |
|
|
106 |
data <- GetAssayData(object = object, slot = data_type) |
|
|
107 |
} |
|
|
108 |
|
|
|
109 |
# Convert to sparse matrix if dense |
|
|
110 |
if (!is(data, "dgCMatrix")) { |
|
|
111 |
data <- as(data, "dgCMatrix") |
|
|
112 |
} |
|
|
113 |
|
|
|
114 |
return(data) |
|
|
115 |
}, error = function(e) { |
|
|
116 |
stop(sprintf("Failed to get assay data: %s", e$message)) |
|
|
117 |
}) |
|
|
118 |
} |
|
|
119 |
|
|
|
120 |
# Function to process data in batches |
|
|
121 |
process_in_batches <- function(data, batch_size = 1000, FUN, ...) { |
|
|
122 |
n_cells <- ncol(data) |
|
|
123 |
n_batches <- ceiling(n_cells / batch_size) |
|
|
124 |
results <- vector("list", n_batches) |
|
|
125 |
|
|
|
126 |
pb <- progress_bar$new(total = n_batches) |
|
|
127 |
|
|
|
128 |
for(i in 1:n_batches) { |
|
|
129 |
start_idx <- (i-1) * batch_size + 1 |
|
|
130 |
end_idx <- min(i * batch_size, n_cells) |
|
|
131 |
|
|
|
132 |
# Process batch |
|
|
133 |
batch_data <- data[, start_idx:end_idx] |
|
|
134 |
results[[i]] <- FUN(batch_data, ...) |
|
|
135 |
|
|
|
136 |
# Clean up |
|
|
137 |
rm(batch_data) |
|
|
138 |
clear_memory() |
|
|
139 |
pb$tick() |
|
|
140 |
} |
|
|
141 |
|
|
|
142 |
# Combine results |
|
|
143 |
combined_results <- do.call(rbind, results) |
|
|
144 |
rm(results) |
|
|
145 |
clear_memory() |
|
|
146 |
|
|
|
147 |
return(combined_results) |
|
|
148 |
} |
|
|
149 |
|
|
|
150 |
# Add gene anonymization function after the settings chunk |
|
|
151 |
anonymize_genes <- function(gene_list, prefix = "GENE") { |
|
|
152 |
# Create mapping of real genes to anonymous IDs |
|
|
153 |
anon_ids <- paste0(prefix, "_", sprintf("%04d", seq_along(gene_list))) |
|
|
154 |
names(anon_ids) <- gene_list |
|
|
155 |
return(anon_ids) |
|
|
156 |
} |
|
|
157 |
|
|
|
158 |
``` |
|
|
159 |
|
|
|
160 |
Load pre-processed data. |
|
|
161 |
|
|
|
162 |
```{r load_data, warning=F, message=F} |
|
|
163 |
file_path <- "/Users/scampit/Projects/torchstack/cancer-biomarker-discovery" |
|
|
164 |
setwd(file_path) |
|
|
165 |
|
|
|
166 |
# Start with clean environment |
|
|
167 |
#clear_memory(deep_clean = TRUE) |
|
|
168 |
|
|
|
169 |
# Load data from saved workspace |
|
|
170 |
f <- list.files() |
|
|
171 |
if (any(endsWith(f, '.RData'))) { |
|
|
172 |
load(f[endsWith(f, '.RData')][1]) |
|
|
173 |
} else { |
|
|
174 |
stop("No .RData file found. Please ensure scRNAseq_wksp.RData is in the working directory.") |
|
|
175 |
} |
|
|
176 |
|
|
|
177 |
# Initialize DE lists |
|
|
178 |
de <- list() |
|
|
179 |
|
|
|
180 |
# Get data from Seurat object efficiently |
|
|
181 |
tryCatch({ |
|
|
182 |
message("Loading expression data from Seurat object...") |
|
|
183 |
de$data <- get_assay_data(data.all$proc) |
|
|
184 |
|
|
|
185 |
# Verify data loaded correctly |
|
|
186 |
if (is.null(de$data) || !is(de$data, "dgCMatrix")) { |
|
|
187 |
stop("Failed to load expression data properly") |
|
|
188 |
} |
|
|
189 |
message(sprintf("Loaded expression matrix with %d genes and %d cells", |
|
|
190 |
nrow(de$data), ncol(de$data))) |
|
|
191 |
}, error = function(e) { |
|
|
192 |
stop(sprintf("Error loading data: %s", e$message)) |
|
|
193 |
}) |
|
|
194 |
|
|
|
195 |
#clear_memory() |
|
|
196 |
|
|
|
197 |
# Load or initialize results |
|
|
198 |
de$ttest <- if(file.exists('results/tables/DE_ttest.xlsx')) { |
|
|
199 |
import_list('results/tables/DE_ttest.xlsx') |
|
|
200 |
} else { |
|
|
201 |
list() |
|
|
202 |
} |
|
|
203 |
|
|
|
204 |
de$mast <- if(file.exists('results/tables/DE_mast.xlsx')) { |
|
|
205 |
import_list('results/tables/DE_mast.xlsx') |
|
|
206 |
} else { |
|
|
207 |
list() |
|
|
208 |
} |
|
|
209 |
|
|
|
210 |
#clear_memory(deep_clean = TRUE) |
|
|
211 |
|
|
|
212 |
# Verify data structure |
|
|
213 |
str(de, max.level = 1) |
|
|
214 |
|
|
|
215 |
``` |
|
|
216 |
|
|
|
217 |
```{r summary_stats, warning=F, message=F} |
|
|
218 |
|
|
|
219 |
# Assign row names to MAST results |
|
|
220 |
f <- names(de$ttest) |
|
|
221 |
for (i in 1:length(f)) { |
|
|
222 |
x <- de$ttest[[f[i]]]; y <- de$mast[[f[i]]] |
|
|
223 |
g <- intersect(x$gene[x$q.val < 0.05], y$gene[y$p_val_adj < 0.05]) |
|
|
224 |
cat(sprintf('%s\tDEGs (t-test): %d\tDEGs (MAST): %d\tDEGs (overlap): %d\n', |
|
|
225 |
f[i], sum(x$q.val < 0.05), sum(y$p_val_adj < 0.05), length(g))) |
|
|
226 |
} |
|
|
227 |
|
|
|
228 |
``` |
|
|
229 |
|
|
|
230 |
# DE analyses |
|
|
231 |
|
|
|
232 |
**Description**: determine differentially expressed genes (DEGs) between different comparisons. |
|
|
233 |
|
|
|
234 |
```{r DE_initialize, warning=F, message=T, eval=!('data' %in% de)} |
|
|
235 |
|
|
|
236 |
# Instantiate DE variable + relevant fields |
|
|
237 |
de <- list(); de$ttest <- list(); de$mast <- list() |
|
|
238 |
|
|
|
239 |
# Define data to work with - update to use layer |
|
|
240 |
tryCatch({ |
|
|
241 |
de$data <- GetAssayData(object=data.all$proc, layer='data') |
|
|
242 |
}, error = function(e) { |
|
|
243 |
# Fallback for older Seurat versions |
|
|
244 |
message("Attempting fallback for older Seurat version...") |
|
|
245 |
de$data <- GetAssayData(object=data.all$proc, slot='data') |
|
|
246 |
}) |
|
|
247 |
|
|
|
248 |
# Add validation |
|
|
249 |
if (is.null(de$data) || ncol(de$data) == 0 || nrow(de$data) == 0) { |
|
|
250 |
stop("Failed to retrieve data from Seurat object") |
|
|
251 |
} |
|
|
252 |
|
|
|
253 |
``` |
|
|
254 |
|
|
|
255 |
## Case: disease vs. control (tumor) |
|
|
256 |
|
|
|
257 |
Comparison between benign vs. tumor skin cells. |
|
|
258 |
Estimated runtime: ~15 minutes. |
|
|
259 |
|
|
|
260 |
```{r DE_tumor, warning=F, message=T, eval=!('tumor' %in% de$ttest)} |
|
|
261 |
|
|
|
262 |
# Start timer |
|
|
263 |
t.start <- Sys.time() |
|
|
264 |
|
|
|
265 |
# Define cell groups to compare |
|
|
266 |
ix <- intersect(rownames(obj$GSE72056), rownames(obj$GSE115978)) %>% |
|
|
267 |
intersect(., rownames(obj$GSE151091)) |
|
|
268 |
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes' |
|
|
269 |
jx2 <- data.all$proc$Source == names(obj)[4] |
|
|
270 |
|
|
|
271 |
# Subsample cells if too many |
|
|
272 |
max_cells_per_group <- 1000 |
|
|
273 |
if(sum(jx1) > max_cells_per_group) { |
|
|
274 |
set.seed(42) |
|
|
275 |
cells_g1 <- sample(which(jx1), max_cells_per_group) |
|
|
276 |
jx1[setdiff(which(jx1), cells_g1)] <- FALSE |
|
|
277 |
} |
|
|
278 |
if(sum(jx2) > max_cells_per_group) { |
|
|
279 |
set.seed(42) |
|
|
280 |
cells_g2 <- sample(which(jx2), max_cells_per_group) |
|
|
281 |
jx2[setdiff(which(jx2), cells_g2)] <- FALSE |
|
|
282 |
} |
|
|
283 |
|
|
|
284 |
# DE analysis (t-test) with memory management |
|
|
285 |
x <- de$data[ix, jx1] |
|
|
286 |
y <- de$data[ix, jx2] |
|
|
287 |
#clear_memory() |
|
|
288 |
|
|
|
289 |
de$ttest$tumor <- de_analyze(x, y) %>% na.omit |
|
|
290 |
rm(x, y) |
|
|
291 |
#clear_memory(deep_clean = TRUE) |
|
|
292 |
|
|
|
293 |
# DE analysis (MAST) |
|
|
294 |
message("Creating Seurat object...") |
|
|
295 |
combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2]) |
|
|
296 |
if (!is(combined_data, "dgCMatrix")) { |
|
|
297 |
combined_data <- as(combined_data, "dgCMatrix") |
|
|
298 |
} |
|
|
299 |
x <- CreateSeuratObject(counts = combined_data) |
|
|
300 |
rm(combined_data) |
|
|
301 |
#clear_memory() |
|
|
302 |
|
|
|
303 |
# Process data in steps with memory management |
|
|
304 |
message("Normalizing data...") |
|
|
305 |
x <- NormalizeData(x, verbose = FALSE) |
|
|
306 |
#clear_memory() |
|
|
307 |
|
|
|
308 |
message("Finding variable features...") |
|
|
309 |
x <- FindVariableFeatures(x, verbose = FALSE) |
|
|
310 |
#clear_memory() |
|
|
311 |
|
|
|
312 |
message("Scaling data...") |
|
|
313 |
x <- ScaleData(x, verbose = FALSE) |
|
|
314 |
#clear_memory() |
|
|
315 |
|
|
|
316 |
message("Running PCA...") |
|
|
317 |
x <- RunPCA(x, verbose = FALSE) |
|
|
318 |
#clear_memory() |
|
|
319 |
|
|
|
320 |
# Set identities |
|
|
321 |
cells_g1 <- 1:sum(jx1) |
|
|
322 |
cells_g2 <- (sum(jx1) + 1):ncol(x) |
|
|
323 |
Idents(object = x, cells = cells_g1) <- 'Disease' |
|
|
324 |
Idents(object = x, cells = cells_g2) <- 'Control' |
|
|
325 |
rm(cells_g1, cells_g2) |
|
|
326 |
#clear_memory() |
|
|
327 |
|
|
|
328 |
# Run MAST analysis with error handling and memory management |
|
|
329 |
message("Running differential expression analysis...") |
|
|
330 |
tryCatch({ |
|
|
331 |
de$mast$tumor <- FindMarkers( |
|
|
332 |
object = x, |
|
|
333 |
ident.1 = 'Disease', |
|
|
334 |
ident.2 = 'Control', |
|
|
335 |
test.use = 'MAST', |
|
|
336 |
verbose = TRUE, |
|
|
337 |
max.cells.per.ident = 1000, |
|
|
338 |
min.pct = 0.1, # Filter low-expressed genes |
|
|
339 |
logfc.threshold = 0.25 # Filter low effect size |
|
|
340 |
) |
|
|
341 |
}, error = function(e) { |
|
|
342 |
message("Error in MAST analysis: ", e$message) |
|
|
343 |
message("Attempting alternative analysis method...") |
|
|
344 |
de$mast$tumor <- FindMarkers( |
|
|
345 |
object = x, |
|
|
346 |
ident.1 = 'Disease', |
|
|
347 |
ident.2 = 'Control', |
|
|
348 |
test.use = 'wilcox', |
|
|
349 |
verbose = TRUE, |
|
|
350 |
max.cells.per.ident = 1000, |
|
|
351 |
min.pct = 0.1, |
|
|
352 |
logfc.threshold = 0.25 |
|
|
353 |
) |
|
|
354 |
}) |
|
|
355 |
|
|
|
356 |
# Clean up |
|
|
357 |
rm(x) |
|
|
358 |
#clear_memory(deep_clean = TRUE) |
|
|
359 |
|
|
|
360 |
# End timer + log time elapsed |
|
|
361 |
t.end <- Sys.time() |
|
|
362 |
message("Analysis completed in: ", difftime(t.end, t.start, units = "mins"), " minutes") |
|
|
363 |
|
|
|
364 |
``` |
|
|
365 |
|
|
|
366 |
## Case: disease vs. control (immune cells, bulk) |
|
|
367 |
|
|
|
368 |
Bulk comparison between healthy vs. diseased immune cells. |
|
|
369 |
Estimated runtime: ~20 minutes. |
|
|
370 |
|
|
|
371 |
```{r DE_immune_bulk,warning=F, message=T, eval=!('immune' %in% de$ttest)} |
|
|
372 |
|
|
|
373 |
# Start timer |
|
|
374 |
t.start <- Sys.time() |
|
|
375 |
|
|
|
376 |
# Define cell groups to compare |
|
|
377 |
ix <- intersect(rownames(obj$GSE120575), rownames(obj$GSE115978)) %>% |
|
|
378 |
intersect(., rownames(obj$GSE183212)) |
|
|
379 |
jx1 <- data.all$proc$Source == names(obj)[1] | data.all$proc$Malignant %in% 'No' |
|
|
380 |
jx2 <- data.all$proc$Source == names(obj)[5] |
|
|
381 |
|
|
|
382 |
# DE analysis (t-test) |
|
|
383 |
x <- de$data[ix, jx1]; y <- de$data[ix, jx2] |
|
|
384 |
de$ttest$immune <- de_analyze(x, y) %>% na.omit() |
|
|
385 |
|
|
|
386 |
# DE analysis (MAST) |
|
|
387 |
# Create Seurat object with proper assay |
|
|
388 |
x <- CreateSeuratObject(counts = cbind(de$data[ix, jx1], de$data[ix, jx2])) |
|
|
389 |
# Normalize the data |
|
|
390 |
x <- NormalizeData(x) |
|
|
391 |
# Find variable features |
|
|
392 |
x <- FindVariableFeatures(x) |
|
|
393 |
# Scale the data |
|
|
394 |
x <- ScaleData(x) |
|
|
395 |
# Run PCA |
|
|
396 |
x <- RunPCA(x) |
|
|
397 |
|
|
|
398 |
# Set identities |
|
|
399 |
Idents(object=x, cells=1:sum(jx1)) <- 'Tumor' |
|
|
400 |
Idents(object=x, cells=sum(jx1)+1:ncol(x)) <- 'Benign' |
|
|
401 |
|
|
|
402 |
# Run MAST analysis |
|
|
403 |
de$mast$immune <- FindMarkers(object=x, |
|
|
404 |
ident.1='Tumor', |
|
|
405 |
ident.2='Benign', |
|
|
406 |
test.use='MAST', |
|
|
407 |
verbose = TRUE) |
|
|
408 |
|
|
|
409 |
# End timer + log time elapsed |
|
|
410 |
t.end <- Sys.time() |
|
|
411 |
t.end - t.start |
|
|
412 |
|
|
|
413 |
``` |
|
|
414 |
|
|
|
415 |
## Case: disease vs. control (immune cells, cluster-based) |
|
|
416 |
|
|
|
417 |
Immune cell-specific comparisons between healthy vs. diseased cells. |
|
|
418 |
Estimated runtime: ~15 minutes. |
|
|
419 |
|
|
|
420 |
```{r DE_immune_cluster, warning=F, message=T} |
|
|
421 |
# Start timer |
|
|
422 |
t.start <- Sys.time() |
|
|
423 |
|
|
|
424 |
# Define labeled scRNAseq data |
|
|
425 |
lab <- c('Tumor cell', 'Technical error 1', 'Cytotoxic CD8 T cell 1', 'B cell', |
|
|
426 |
'Melanocytes 1', 'Macrophage', 'Cytotoxic CD8 T cell 2', 'Technical error 2', |
|
|
427 |
'Memory T cell', 'Melanocytes 2', 'Dysfunctional CD8 T cell', |
|
|
428 |
'Cytotoxic CD8 T cell 3', '? 1', 'Melanocytes 3', 'Activated cell', |
|
|
429 |
'Exhausted T cell', 'Cytotoxic T cell', '? 2', '? 3', 'Melanocytes 4', '? 4') |
|
|
430 |
names(lab) <- levels(data.all$proc) |
|
|
431 |
plot.data <- data.all$proc %>% RenameIdents(., lab) |
|
|
432 |
|
|
|
433 |
# Make sure we have the expression data |
|
|
434 |
if (!exists("de") || is.null(de$data) || !is(de$data, "dgCMatrix")) { |
|
|
435 |
message("Reloading expression data...") |
|
|
436 |
de <- list() |
|
|
437 |
de$data <- GetAssayData(object = data.all$proc, layer = "data") |
|
|
438 |
if (!is(de$data, "dgCMatrix")) { |
|
|
439 |
de$data <- as(de$data, "dgCMatrix") |
|
|
440 |
} |
|
|
441 |
} |
|
|
442 |
|
|
|
443 |
# Initialize results lists if they don't exist |
|
|
444 |
if (!"ttest" %in% names(de)) de$ttest <- list() |
|
|
445 |
if (!"mast" %in% names(de)) de$mast <- list() |
|
|
446 |
|
|
|
447 |
# Iterate through each immune cell group |
|
|
448 |
immune.cells <- c('B', 'CD8', 'Macrophage', 'Memory', 'Activated') |
|
|
449 |
for (i in 1:length(immune.cells)) { |
|
|
450 |
message(sprintf("\nProcessing %s cells...", immune.cells[i])) |
|
|
451 |
|
|
|
452 |
# Define row + column indices |
|
|
453 |
ix <- intersect(rownames(obj$GSE120575), rownames(obj$GSE115978)) %>% |
|
|
454 |
intersect(., rownames(obj$GSE183212)) |
|
|
455 |
c <- grepl(immune.cells[i], Idents(plot.data)) |
|
|
456 |
jx1 <- (plot.data$Source == names(obj)[1] | |
|
|
457 |
data.all$proc$Malignant %in% 'No') & c |
|
|
458 |
jx2 <- data.all$proc$Source == names(obj)[5] & c |
|
|
459 |
|
|
|
460 |
# Print number of genes + cells |
|
|
461 |
message(sprintf('No. of genes: %d\tNo. of G1: %d\tNo. of G2: %d', |
|
|
462 |
length(ix), sum(jx1), sum(jx2))) |
|
|
463 |
|
|
|
464 |
# Skip if not enough cells |
|
|
465 |
if (sum(jx1) < 3 || sum(jx2) < 3) { |
|
|
466 |
message(sprintf("Skipping %s - insufficient cells", immune.cells[i])) |
|
|
467 |
next |
|
|
468 |
} |
|
|
469 |
|
|
|
470 |
# DE analysis (t-test) |
|
|
471 |
message("Running t-test analysis...") |
|
|
472 |
x <- de$data[ix, jx1] |
|
|
473 |
y <- de$data[ix, jx2] |
|
|
474 |
de$ttest[[immune.cells[i]]] <- de_analyze(x, y) %>% na.omit() |
|
|
475 |
rm(x, y) |
|
|
476 |
clear_memory() |
|
|
477 |
|
|
|
478 |
# DE analysis (MAST) |
|
|
479 |
message("Running MAST analysis...") |
|
|
480 |
tryCatch({ |
|
|
481 |
# Create Seurat object |
|
|
482 |
combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2]) |
|
|
483 |
if (!is(combined_data, "dgCMatrix")) { |
|
|
484 |
combined_data <- as(combined_data, "dgCMatrix") |
|
|
485 |
} |
|
|
486 |
x <- CreateSeuratObject(counts = combined_data) |
|
|
487 |
rm(combined_data) |
|
|
488 |
clear_memory() |
|
|
489 |
|
|
|
490 |
# Process data |
|
|
491 |
x <- x %>% |
|
|
492 |
NormalizeData(verbose = FALSE) %>% |
|
|
493 |
FindVariableFeatures(verbose = FALSE) %>% |
|
|
494 |
ScaleData(verbose = FALSE) |
|
|
495 |
|
|
|
496 |
# Set identities |
|
|
497 |
Idents(object = x, cells = 1:sum(jx1)) <- 'Tumor' |
|
|
498 |
Idents(object = x, cells = (sum(jx1) + 1):ncol(x)) <- 'Benign' |
|
|
499 |
|
|
|
500 |
# Run MAST |
|
|
501 |
de$mast[[immune.cells[i]]] <- FindMarkers( |
|
|
502 |
object = x, |
|
|
503 |
ident.1 = 'Tumor', |
|
|
504 |
ident.2 = 'Benign', |
|
|
505 |
test.use = 'MAST', |
|
|
506 |
verbose = TRUE, |
|
|
507 |
max.cells.per.ident = 1000 |
|
|
508 |
) |
|
|
509 |
|
|
|
510 |
rm(x) |
|
|
511 |
clear_memory() |
|
|
512 |
|
|
|
513 |
}, error = function(e) { |
|
|
514 |
message(sprintf("Error in MAST analysis for %s: %s", immune.cells[i], e$message)) |
|
|
515 |
message("Attempting Wilcoxon test instead...") |
|
|
516 |
|
|
|
517 |
# Try Wilcoxon as fallback |
|
|
518 |
de$mast[[immune.cells[i]]] <- FindMarkers( |
|
|
519 |
object = x, |
|
|
520 |
ident.1 = 'Tumor', |
|
|
521 |
ident.2 = 'Benign', |
|
|
522 |
test.use = 'wilcox', |
|
|
523 |
verbose = TRUE, |
|
|
524 |
max.cells.per.ident = 1000 |
|
|
525 |
) |
|
|
526 |
}) |
|
|
527 |
|
|
|
528 |
message(sprintf("Completed analysis for %s", immune.cells[i])) |
|
|
529 |
} |
|
|
530 |
|
|
|
531 |
# End timer + log time elapsed |
|
|
532 |
t.end <- Sys.time() |
|
|
533 |
message("\nTotal analysis time: ", difftime(t.end, t.start, units = "mins"), " minutes") |
|
|
534 |
|
|
|
535 |
# Print summary of results |
|
|
536 |
message("\nSummary of analyses:") |
|
|
537 |
for (cell_type in immune.cells) { |
|
|
538 |
if (cell_type %in% names(de$ttest)) { |
|
|
539 |
n_ttest <- nrow(de$ttest[[cell_type]]) |
|
|
540 |
n_mast <- if(cell_type %in% names(de$mast)) nrow(de$mast[[cell_type]]) else 0 |
|
|
541 |
message(sprintf("%s: %d t-test DEGs, %d MAST DEGs", cell_type, n_ttest, n_mast)) |
|
|
542 |
} |
|
|
543 |
} |
|
|
544 |
|
|
|
545 |
``` |
|
|
546 |
|
|
|
547 |
## Case: responder vs. non-responder (GSE120575) |
|
|
548 |
|
|
|
549 |
Comparison between responder vs. non-responder cells to immunotherapy. |
|
|
550 |
Estimated runtime: ~1 hour. |
|
|
551 |
|
|
|
552 |
```{r DE_response, warning=F, message=T, eval=!('response' %in% de$ttest)} |
|
|
553 |
# Start timer |
|
|
554 |
t.start <- Sys.time() |
|
|
555 |
|
|
|
556 |
# Make sure we have the expression data |
|
|
557 |
if (!exists("de") || is.null(de$data) || !is(de$data, "dgCMatrix")) { |
|
|
558 |
message("Reloading expression data...") |
|
|
559 |
de <- list() |
|
|
560 |
de$data <- GetAssayData(object = data.all$proc, layer = "data") |
|
|
561 |
if (!is(de$data, "dgCMatrix")) { |
|
|
562 |
de$data <- as(de$data, "dgCMatrix") |
|
|
563 |
} |
|
|
564 |
} |
|
|
565 |
|
|
|
566 |
# Define cell groups to compare |
|
|
567 |
ix <- rownames(obj$GSE120575) |
|
|
568 |
jx1 <- data.all$proc$Response %in% 'Responder' |
|
|
569 |
jx2 <- data.all$proc$Response %in% 'Non-responder' |
|
|
570 |
|
|
|
571 |
# Print cell counts |
|
|
572 |
message(sprintf("Number of responder cells: %d", sum(jx1))) |
|
|
573 |
message(sprintf("Number of non-responder cells: %d", sum(jx2))) |
|
|
574 |
|
|
|
575 |
# Subsample cells if too many |
|
|
576 |
max_cells_per_group <- 1000 |
|
|
577 |
if(sum(jx1) > max_cells_per_group) { |
|
|
578 |
set.seed(42) |
|
|
579 |
cells_g1 <- sample(which(jx1), max_cells_per_group) |
|
|
580 |
jx1[setdiff(which(jx1), cells_g1)] <- FALSE |
|
|
581 |
} |
|
|
582 |
if(sum(jx2) > max_cells_per_group) { |
|
|
583 |
set.seed(42) |
|
|
584 |
cells_g2 <- sample(which(jx2), max_cells_per_group) |
|
|
585 |
jx2[setdiff(which(jx2), cells_g2)] <- FALSE |
|
|
586 |
} |
|
|
587 |
|
|
|
588 |
# DE analysis (t-test) |
|
|
589 |
message("Running t-test analysis...") |
|
|
590 |
x <- de$data[ix, jx1] |
|
|
591 |
y <- de$data[ix, jx2] |
|
|
592 |
de$ttest$response <- de_analyze(x, y) %>% na.omit |
|
|
593 |
rm(x, y) |
|
|
594 |
#clear_memory() |
|
|
595 |
|
|
|
596 |
# DE analysis (MAST) |
|
|
597 |
message("Running MAST analysis...") |
|
|
598 |
tryCatch({ |
|
|
599 |
# Create Seurat object |
|
|
600 |
message("Creating Seurat object...") |
|
|
601 |
combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2]) |
|
|
602 |
if (!is(combined_data, "dgCMatrix")) { |
|
|
603 |
combined_data <- as(combined_data, "dgCMatrix") |
|
|
604 |
} |
|
|
605 |
x <- CreateSeuratObject(counts = combined_data) |
|
|
606 |
rm(combined_data) |
|
|
607 |
clear_memory() |
|
|
608 |
|
|
|
609 |
# Process data |
|
|
610 |
message("Processing data...") |
|
|
611 |
x <- x %>% |
|
|
612 |
NormalizeData(verbose = FALSE) %>% |
|
|
613 |
FindVariableFeatures(verbose = FALSE) %>% |
|
|
614 |
ScaleData(verbose = FALSE) %>% |
|
|
615 |
RunPCA(verbose = FALSE) |
|
|
616 |
|
|
|
617 |
# Set identities |
|
|
618 |
message("Setting identities...") |
|
|
619 |
cells_g1 <- 1:sum(jx1) |
|
|
620 |
cells_g2 <- (sum(jx1) + 1):ncol(x) |
|
|
621 |
Idents(object = x, cells = cells_g1) <- 'Responder' |
|
|
622 |
Idents(object = x, cells = cells_g2) <- 'Non-responder' |
|
|
623 |
|
|
|
624 |
# Verify identities |
|
|
625 |
if (!all(levels(Idents(x)) %in% c('Responder', 'Non-responder'))) { |
|
|
626 |
stop("Failed to set identities correctly") |
|
|
627 |
} |
|
|
628 |
|
|
|
629 |
# Run MAST |
|
|
630 |
message("Running differential expression analysis...") |
|
|
631 |
de$mast$response <- FindMarkers( |
|
|
632 |
object = x, |
|
|
633 |
ident.1 = 'Responder', |
|
|
634 |
ident.2 = 'Non-responder', |
|
|
635 |
test.use = 'MAST', |
|
|
636 |
verbose = TRUE, |
|
|
637 |
max.cells.per.ident = 1000, |
|
|
638 |
min.pct = 0.1, |
|
|
639 |
logfc.threshold = 0.25 |
|
|
640 |
) |
|
|
641 |
|
|
|
642 |
# Clean up |
|
|
643 |
rm(x, cells_g1, cells_g2) |
|
|
644 |
clear_memory() |
|
|
645 |
|
|
|
646 |
}, error = function(e) { |
|
|
647 |
message("Error in MAST analysis: ", e$message) |
|
|
648 |
message("Attempting Wilcoxon test instead...") |
|
|
649 |
|
|
|
650 |
# Try Wilcoxon as fallback |
|
|
651 |
de$mast$response <- FindMarkers( |
|
|
652 |
object = x, |
|
|
653 |
ident.1 = 'Responder', |
|
|
654 |
ident.2 = 'Non-responder', |
|
|
655 |
test.use = 'wilcox', |
|
|
656 |
verbose = TRUE, |
|
|
657 |
max.cells.per.ident = 1000, |
|
|
658 |
min.pct = 0.1, |
|
|
659 |
logfc.threshold = 0.25 |
|
|
660 |
) |
|
|
661 |
}) |
|
|
662 |
|
|
|
663 |
# End timer + log time elapsed |
|
|
664 |
t.end <- Sys.time() |
|
|
665 |
message("\nAnalysis completed in: ", difftime(t.end, t.start, units = "mins"), " minutes") |
|
|
666 |
|
|
|
667 |
# Print summary |
|
|
668 |
if (!is.null(de$mast$response)) { |
|
|
669 |
message("\nFound ", sum(de$mast$response$p_val_adj < 0.05), |
|
|
670 |
" significant DEGs (FDR < 0.05)") |
|
|
671 |
} |
|
|
672 |
|
|
|
673 |
``` |
|
|
674 |
|
|
|
675 |
# Visualizations |
|
|
676 |
|
|
|
677 |
**Description**: visual inspection of DE analysis results. |
|
|
678 |
|
|
|
679 |
## T-test vs. MAST DEGs |
|
|
680 |
|
|
|
681 |
Visual comparison of DEGs determined with t-test vs. MAST (venn diagrams). |
|
|
682 |
|
|
|
683 |
```{r DE_venn, warning=F, message=T} |
|
|
684 |
|
|
|
685 |
# Compare DEGs b/t the two methods |
|
|
686 |
f <- names(de$mast); p <- list() |
|
|
687 |
for (i in 1:length(f)) { |
|
|
688 |
x <- de$ttest[[f[i]]]; y <- de$mast[[f[i]]] |
|
|
689 |
g <- intersect(x$gene[x$q.val < 0.05], row.names(y)[y$p_val_adj < 0.05]) |
|
|
690 |
# g <- intersect(x$gene[x$q.val < 0.05], y$gene[y$p_val_adj < 0.05]) |
|
|
691 |
cat(sprintf('No. of intersecting DEGs for %s: %d\n', f[i], length(g))) |
|
|
692 |
ggvenn(list('t-test'=x$gene[x$q.val < 0.05], |
|
|
693 |
'MAST'=row.names(y)[y$p_val_adj < 0.05])) + ggtitle(f[i]) |
|
|
694 |
ggsave(paste0('plots/DE_venn_', f[i], '.pdf'), width=5, height=5, units='in') |
|
|
695 |
} |
|
|
696 |
|
|
|
697 |
``` |
|
|
698 |
|
|
|
699 |
## Responder vs. non-responder DEGs |
|
|
700 |
|
|
|
701 |
Clustergram (or heatmap) of DEGs identified for responder vs. non-responder comparison. |
|
|
702 |
|
|
|
703 |
```{r DE_response_heat, warning=F, message=T} |
|
|
704 |
# Modify DE_response_heat chunk |
|
|
705 |
tryCatch({ |
|
|
706 |
# Get significant genes |
|
|
707 |
if (!exists("de") || is.null(de$mast$response)) { |
|
|
708 |
stop("MAST results not found. Please run the DE analysis first.") |
|
|
709 |
} |
|
|
710 |
|
|
|
711 |
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05] |
|
|
712 |
if (length(sig_genes) == 0) { |
|
|
713 |
stop("No significant genes found (FDR < 0.05)") |
|
|
714 |
} |
|
|
715 |
message(sprintf("Found %d significant genes", length(sig_genes))) |
|
|
716 |
|
|
|
717 |
# Take top N genes by absolute log fold change |
|
|
718 |
n <- min(50, length(sig_genes)) |
|
|
719 |
g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]), |
|
|
720 |
decreasing = TRUE)][1:n] |
|
|
721 |
|
|
|
722 |
# Create anonymous gene IDs |
|
|
723 |
anon_genes <- anonymize_genes(g) |
|
|
724 |
|
|
|
725 |
# Get all cells |
|
|
726 |
jx1 <- which(data.all$proc$Response %in% 'Responder') |
|
|
727 |
jx2 <- which(data.all$proc$Response %in% 'Non-responder') |
|
|
728 |
|
|
|
729 |
# Get expression data |
|
|
730 |
x <- de$data[g, c(jx1, jx2)] |
|
|
731 |
if (!is(x, "dgCMatrix")) { |
|
|
732 |
x <- as(x, "dgCMatrix") |
|
|
733 |
} |
|
|
734 |
x <- as.matrix(x) |
|
|
735 |
|
|
|
736 |
# Replace gene names with anonymous IDs |
|
|
737 |
rownames(x) <- anon_genes[rownames(x)] |
|
|
738 |
|
|
|
739 |
# Calculate mean expression for each group |
|
|
740 |
x_resp <- rowMeans(x[, 1:length(jx1), drop=FALSE]) |
|
|
741 |
x_nonresp <- rowMeans(x[, (length(jx1)+1):ncol(x), drop=FALSE]) |
|
|
742 |
|
|
|
743 |
# Combine into matrix |
|
|
744 |
x_means <- cbind(x_resp, x_nonresp) |
|
|
745 |
colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)") |
|
|
746 |
|
|
|
747 |
# Calculate standard errors for error bars |
|
|
748 |
x_resp_se <- apply(x[, 1:length(jx1), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) |
|
|
749 |
x_nonresp_se <- apply(x[, (length(jx1)+1):ncol(x), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) |
|
|
750 |
|
|
|
751 |
# Add SE information to rownames |
|
|
752 |
rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)", |
|
|
753 |
rownames(x_means), |
|
|
754 |
x_resp_se, |
|
|
755 |
x_nonresp_se) |
|
|
756 |
|
|
|
757 |
# Scale data |
|
|
758 |
x_scaled <- t(scale(t(x_means))) |
|
|
759 |
|
|
|
760 |
# Create annotation |
|
|
761 |
col.annot <- data.frame( |
|
|
762 |
Group = c('Responder', 'Non-responder'), |
|
|
763 |
row.names = colnames(x_means) |
|
|
764 |
) |
|
|
765 |
|
|
|
766 |
# Color palette |
|
|
767 |
col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100) |
|
|
768 |
|
|
|
769 |
# Create heatmap |
|
|
770 |
message("Generating heatmap...") |
|
|
771 |
p <- pheatmap( |
|
|
772 |
x_scaled, |
|
|
773 |
cluster_rows = TRUE, |
|
|
774 |
cluster_cols = FALSE, |
|
|
775 |
scale = "none", # already scaled above |
|
|
776 |
color = col.pal, |
|
|
777 |
annotation_col = col.annot, |
|
|
778 |
show_rownames = TRUE, |
|
|
779 |
show_colnames = TRUE, |
|
|
780 |
main = sprintf('Top %d De-identified DEGs (Mean Expression)', nrow(x_scaled)), |
|
|
781 |
fontsize_row = 8, |
|
|
782 |
annotation_colors = list( |
|
|
783 |
Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8') |
|
|
784 |
) |
|
|
785 |
) |
|
|
786 |
|
|
|
787 |
# Save plot |
|
|
788 |
message("Saving plot...") |
|
|
789 |
pdf('plots/DE_heatmap_response_collapsed.pdf', width = 11, height = 8) |
|
|
790 |
print(p) |
|
|
791 |
dev.off() |
|
|
792 |
|
|
|
793 |
# Save gene ID mapping |
|
|
794 |
write.csv(data.frame( |
|
|
795 |
Anonymous_ID = names(anon_genes), |
|
|
796 |
Original_Gene = anon_genes, |
|
|
797 |
Mean_Responder = x_resp, |
|
|
798 |
SE_Responder = x_resp_se, |
|
|
799 |
Mean_NonResponder = x_nonresp, |
|
|
800 |
SE_NonResponder = x_nonresp_se |
|
|
801 |
), './response_gene_mapping_with_stats.csv', row.names = FALSE) |
|
|
802 |
|
|
|
803 |
message("Heatmap generated successfully") |
|
|
804 |
message("Gene mapping with statistics saved to results/response_gene_mapping_with_stats.csv") |
|
|
805 |
|
|
|
806 |
}, error = function(e) { |
|
|
807 |
message(sprintf("Error generating heatmap: %s", e$message)) |
|
|
808 |
}) |
|
|
809 |
``` |
|
|
810 |
# Add new chunk after DE_response_heat |
|
|
811 |
```{r DE_response_heat_individual, warning=F, message=T} |
|
|
812 |
# Show individual cell heatmap alongside collapsed view |
|
|
813 |
tryCatch({ |
|
|
814 |
# Get significant genes |
|
|
815 |
if (!exists("de") || is.null(de$mast$response)) { |
|
|
816 |
stop("MAST results not found. Please run the DE analysis first.") |
|
|
817 |
} |
|
|
818 |
|
|
|
819 |
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05] |
|
|
820 |
if (length(sig_genes) == 0) { |
|
|
821 |
stop("No significant genes found (FDR < 0.05)") |
|
|
822 |
} |
|
|
823 |
message(sprintf("Found %d significant genes", length(sig_genes))) |
|
|
824 |
|
|
|
825 |
# Take top N genes by absolute log fold change |
|
|
826 |
n <- min(50, length(sig_genes)) |
|
|
827 |
g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]), |
|
|
828 |
decreasing = TRUE)][1:n] |
|
|
829 |
|
|
|
830 |
# Create anonymous gene IDs |
|
|
831 |
anon_genes <- anonymize_genes(g) |
|
|
832 |
|
|
|
833 |
# Get cells (subsample to make visualization manageable) |
|
|
834 |
jx1 <- which(data.all$proc$Response %in% 'Responder') |
|
|
835 |
jx2 <- which(data.all$proc$Response %in% 'Non-responder') |
|
|
836 |
|
|
|
837 |
# Subsample cells |
|
|
838 |
cells_per_group <- min(25, min(length(jx1), length(jx2))) |
|
|
839 |
set.seed(42) |
|
|
840 |
sampled_jx1 <- sample(jx1, cells_per_group) |
|
|
841 |
sampled_jx2 <- sample(jx2, cells_per_group) |
|
|
842 |
|
|
|
843 |
# Get expression data for individual cells |
|
|
844 |
x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)] |
|
|
845 |
if (!is(x_individual, "dgCMatrix")) { |
|
|
846 |
x_individual <- as(x_individual, "dgCMatrix") |
|
|
847 |
} |
|
|
848 |
x_individual <- as.matrix(x_individual) |
|
|
849 |
|
|
|
850 |
# Clean individual cell data |
|
|
851 |
message("Cleaning individual cell data...") |
|
|
852 |
# Replace Inf/-Inf with NA |
|
|
853 |
x_individual[is.infinite(x_individual)] <- NA |
|
|
854 |
# Remove rows (genes) with any NA values |
|
|
855 |
complete_rows <- complete.cases(x_individual) |
|
|
856 |
if (!all(complete_rows)) { |
|
|
857 |
message(sprintf("Removing %d genes with NA values", sum(!complete_rows))) |
|
|
858 |
x_individual <- x_individual[complete_rows, ] |
|
|
859 |
g <- g[complete_rows] |
|
|
860 |
anon_genes <- anon_genes[complete_rows] |
|
|
861 |
} |
|
|
862 |
|
|
|
863 |
# Check for zero variance genes |
|
|
864 |
gene_var <- apply(x_individual, 1, var) |
|
|
865 |
if (any(gene_var == 0)) { |
|
|
866 |
message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0))) |
|
|
867 |
x_individual <- x_individual[gene_var > 0, ] |
|
|
868 |
g <- g[gene_var > 0] |
|
|
869 |
anon_genes <- anon_genes[gene_var > 0] |
|
|
870 |
} |
|
|
871 |
|
|
|
872 |
# Replace gene names with anonymous IDs |
|
|
873 |
rownames(x_individual) <- anon_genes[rownames(x_individual)] |
|
|
874 |
|
|
|
875 |
# Scale individual cell data with validation |
|
|
876 |
message("Scaling individual cell data...") |
|
|
877 |
x_individual_scaled <- t(scale(t(x_individual))) |
|
|
878 |
# Check for NA/Inf values after scaling |
|
|
879 |
if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) { |
|
|
880 |
message("Found NA/Inf values after scaling, capping extreme values at ±10") |
|
|
881 |
x_individual_scaled[x_individual_scaled > 10] <- 10 |
|
|
882 |
x_individual_scaled[x_individual_scaled < -10] <- -10 |
|
|
883 |
x_individual_scaled[is.na(x_individual_scaled)] <- 0 |
|
|
884 |
} |
|
|
885 |
|
|
|
886 |
# Create annotation for individual cells |
|
|
887 |
col.annot.individual <- data.frame( |
|
|
888 |
Response = rep(c('Responder', 'Non-responder'), each = cells_per_group), |
|
|
889 |
row.names = colnames(x_individual) |
|
|
890 |
) |
|
|
891 |
|
|
|
892 |
# Get mean expression data (for side-by-side comparison) |
|
|
893 |
x_resp <- rowMeans(de$data[g, jx1, drop=FALSE]) |
|
|
894 |
x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE]) |
|
|
895 |
x_means <- cbind(x_resp, x_nonresp) |
|
|
896 |
colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)") |
|
|
897 |
|
|
|
898 |
# Calculate standard errors |
|
|
899 |
x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) |
|
|
900 |
x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) |
|
|
901 |
|
|
|
902 |
# Clean mean data |
|
|
903 |
message("Cleaning mean data...") |
|
|
904 |
x_means[is.infinite(x_means)] <- NA |
|
|
905 |
complete_rows_means <- complete.cases(x_means) |
|
|
906 |
if (!all(complete_rows_means)) { |
|
|
907 |
message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means))) |
|
|
908 |
x_means <- x_means[complete_rows_means, ] |
|
|
909 |
x_resp_se <- x_resp_se[complete_rows_means] |
|
|
910 |
x_nonresp_se <- x_nonresp_se[complete_rows_means] |
|
|
911 |
} |
|
|
912 |
|
|
|
913 |
# Add SE information to rownames for mean data |
|
|
914 |
rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)", |
|
|
915 |
anon_genes[rownames(x_means)], |
|
|
916 |
x_resp_se, |
|
|
917 |
x_nonresp_se) |
|
|
918 |
|
|
|
919 |
# Scale mean data with validation |
|
|
920 |
message("Scaling mean data...") |
|
|
921 |
x_means_scaled <- t(scale(t(x_means))) |
|
|
922 |
if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) { |
|
|
923 |
message("Found NA/Inf values after scaling means, capping extreme values at ±10") |
|
|
924 |
x_means_scaled[x_means_scaled > 10] <- 10 |
|
|
925 |
x_means_scaled[x_means_scaled < -10] <- -10 |
|
|
926 |
x_means_scaled[is.na(x_means_scaled)] <- 0 |
|
|
927 |
} |
|
|
928 |
|
|
|
929 |
# Create annotation for means |
|
|
930 |
col.annot.means <- data.frame( |
|
|
931 |
Group = c('Responder', 'Non-responder'), |
|
|
932 |
row.names = colnames(x_means) |
|
|
933 |
) |
|
|
934 |
|
|
|
935 |
# Color palette |
|
|
936 |
col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100) |
|
|
937 |
|
|
|
938 |
message("Generating plots...") |
|
|
939 |
# Create individual cells heatmap |
|
|
940 |
p1 <- pheatmap( |
|
|
941 |
x_individual_scaled, |
|
|
942 |
cluster_rows = TRUE, |
|
|
943 |
cluster_cols = FALSE, |
|
|
944 |
scale = "none", |
|
|
945 |
color = col.pal, |
|
|
946 |
annotation_col = col.annot.individual, |
|
|
947 |
show_rownames = TRUE, |
|
|
948 |
show_colnames = FALSE, |
|
|
949 |
gaps_col = cells_per_group, |
|
|
950 |
main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group), |
|
|
951 |
fontsize_row = 8, |
|
|
952 |
annotation_colors = list( |
|
|
953 |
Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8') |
|
|
954 |
), |
|
|
955 |
legend = FALSE, # Hide colorbar |
|
|
956 |
treeheight_row = 0, # Hide row dendrogram |
|
|
957 |
cellwidth = 10, # Force square cells |
|
|
958 |
cellheight = 10 # Force square cells |
|
|
959 |
) |
|
|
960 |
|
|
|
961 |
# Create mean expression heatmap |
|
|
962 |
p2 <- pheatmap( |
|
|
963 |
x_means_scaled, |
|
|
964 |
cluster_rows = TRUE, |
|
|
965 |
cluster_cols = FALSE, |
|
|
966 |
scale = "none", |
|
|
967 |
color = col.pal, |
|
|
968 |
annotation_col = col.annot.means, |
|
|
969 |
show_rownames = TRUE, |
|
|
970 |
show_colnames = TRUE, |
|
|
971 |
main = 'Group Means\n(all cells)', |
|
|
972 |
fontsize_row = 8, |
|
|
973 |
annotation_colors = list( |
|
|
974 |
Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8') |
|
|
975 |
), |
|
|
976 |
legend = FALSE, # Hide colorbar |
|
|
977 |
treeheight_row = 0, # Hide row dendrogram |
|
|
978 |
cellwidth = 20, # Force square cells (larger since fewer columns) |
|
|
979 |
cellheight = 20, # Force square cells (larger since fewer columns) |
|
|
980 |
labels_row = sub("\n.*", "", rownames(x_means)) # Only show gene IDs without SE info |
|
|
981 |
) |
|
|
982 |
|
|
|
983 |
# Save plots to PDF |
|
|
984 |
pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8) |
|
|
985 |
grid::grid.newpage() |
|
|
986 |
grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2)) |
|
|
987 |
dev.off() |
|
|
988 |
|
|
|
989 |
# Display plots in notebook |
|
|
990 |
grid::grid.newpage() |
|
|
991 |
grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2)) |
|
|
992 |
|
|
|
993 |
# Save detailed statistics |
|
|
994 |
write.csv(data.frame( |
|
|
995 |
Anonymous_ID = names(anon_genes), |
|
|
996 |
Original_Gene = anon_genes, |
|
|
997 |
Mean_Responder = x_resp, |
|
|
998 |
SE_Responder = x_resp_se, |
|
|
999 |
Mean_NonResponder = x_nonresp, |
|
|
1000 |
SE_NonResponder = x_nonresp_se, |
|
|
1001 |
N_Responder = length(jx1), |
|
|
1002 |
N_NonResponder = length(jx2) |
|
|
1003 |
), 'results/response_gene_mapping_detailed.csv', row.names = FALSE) |
|
|
1004 |
|
|
|
1005 |
message("Side-by-side heatmaps generated successfully") |
|
|
1006 |
message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv") |
|
|
1007 |
|
|
|
1008 |
}, error = function(e) { |
|
|
1009 |
message(sprintf("Error generating heatmaps: %s", e$message)) |
|
|
1010 |
message("\nDebugging information:") |
|
|
1011 |
message("1. Check if expression data contains extreme values") |
|
|
1012 |
message("2. Verify gene filtering and scaling steps") |
|
|
1013 |
message("3. Ensure all matrices have proper dimensions") |
|
|
1014 |
message("4. Look for NA/Inf values in the data") |
|
|
1015 |
}) |
|
|
1016 |
``` |
|
|
1017 |
|
|
|
1018 |
PCA of DEGs identified for responder vs. non-responder comparison. |
|
|
1019 |
|
|
|
1020 |
```{r DE_response_PCA, warning=F, message=T} |
|
|
1021 |
# Modify DE_response_PCA chunk |
|
|
1022 |
tryCatch({ |
|
|
1023 |
# Validate MAST results exist |
|
|
1024 |
if (!exists("de") || is.null(de$mast$response)) { |
|
|
1025 |
stop("MAST results not found. Please run the DE analysis first.") |
|
|
1026 |
} |
|
|
1027 |
|
|
|
1028 |
# Get significant genes |
|
|
1029 |
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05] |
|
|
1030 |
if (length(sig_genes) == 0) { |
|
|
1031 |
stop("No significant genes found (FDR < 0.05)") |
|
|
1032 |
} |
|
|
1033 |
message(sprintf("Found %d significant genes", length(sig_genes))) |
|
|
1034 |
|
|
|
1035 |
# Get responder/non-responder cells |
|
|
1036 |
jx <- data.all$proc$Response %in% c('Responder', 'Non-responder') |
|
|
1037 |
if (sum(jx) == 0) { |
|
|
1038 |
stop("No cells found matching response criteria") |
|
|
1039 |
} |
|
|
1040 |
message(sprintf("Found %d cells (%d Responders, %d Non-responders)", |
|
|
1041 |
sum(jx), |
|
|
1042 |
sum(data.all$proc$Response[jx] == "Responder"), |
|
|
1043 |
sum(data.all$proc$Response[jx] == "Non-responder"))) |
|
|
1044 |
|
|
|
1045 |
# Subsample cells if too many |
|
|
1046 |
max_cells_per_group <- 1000 |
|
|
1047 |
responder_idx <- which(data.all$proc$Response[jx] == "Responder") |
|
|
1048 |
nonresponder_idx <- which(data.all$proc$Response[jx] == "Non-responder") |
|
|
1049 |
|
|
|
1050 |
if (length(responder_idx) > max_cells_per_group) { |
|
|
1051 |
set.seed(42) |
|
|
1052 |
responder_idx <- sample(responder_idx, max_cells_per_group) |
|
|
1053 |
} |
|
|
1054 |
if (length(nonresponder_idx) > max_cells_per_group) { |
|
|
1055 |
set.seed(42) |
|
|
1056 |
nonresponder_idx <- sample(nonresponder_idx, max_cells_per_group) |
|
|
1057 |
} |
|
|
1058 |
|
|
|
1059 |
selected_cells <- c(responder_idx, nonresponder_idx) |
|
|
1060 |
|
|
|
1061 |
# Get expression data |
|
|
1062 |
x <- de$data[sig_genes, jx][, selected_cells] |
|
|
1063 |
if (!is(x, "dgCMatrix")) { |
|
|
1064 |
x <- as(x, "dgCMatrix") |
|
|
1065 |
} |
|
|
1066 |
x <- as.matrix(x) |
|
|
1067 |
|
|
|
1068 |
# Create anonymous gene IDs |
|
|
1069 |
anon_genes <- anonymize_genes(rownames(x)) |
|
|
1070 |
rownames(x) <- anon_genes[rownames(x)] |
|
|
1071 |
|
|
|
1072 |
# Handle zero variance genes |
|
|
1073 |
gene_var <- apply(x, 1, var) |
|
|
1074 |
if (any(gene_var == 0)) { |
|
|
1075 |
message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0))) |
|
|
1076 |
x <- x[gene_var > 0, ] |
|
|
1077 |
} |
|
|
1078 |
|
|
|
1079 |
# Clean data |
|
|
1080 |
x[is.infinite(x)] <- NA |
|
|
1081 |
x[is.na(x)] <- 0 |
|
|
1082 |
|
|
|
1083 |
# Transpose and convert to data frame |
|
|
1084 |
x_t <- t(x) |
|
|
1085 |
|
|
|
1086 |
# Create metadata |
|
|
1087 |
m <- data.frame( |
|
|
1088 |
Response = data.all$proc$Response[jx][selected_cells], |
|
|
1089 |
row.names = rownames(x_t) |
|
|
1090 |
) |
|
|
1091 |
|
|
|
1092 |
# Apply PCA with validation |
|
|
1093 |
if (ncol(x_t) < 2) { |
|
|
1094 |
stop("Need at least 2 features for PCA") |
|
|
1095 |
} |
|
|
1096 |
if (nrow(x_t) < 3) { |
|
|
1097 |
stop("Need at least 3 samples for PCA") |
|
|
1098 |
} |
|
|
1099 |
|
|
|
1100 |
message("Running PCA...") |
|
|
1101 |
response_pca <- prcomp(x_t, scale. = TRUE) |
|
|
1102 |
|
|
|
1103 |
# Calculate variance explained |
|
|
1104 |
var_explained <- summary(response_pca)$importance[2, 1:2] * 100 |
|
|
1105 |
|
|
|
1106 |
# Calculate group centroids |
|
|
1107 |
centroids <- aggregate(cbind(PC1, PC2) ~ Response, |
|
|
1108 |
data = cbind(as.data.frame(response_pca$x[,1:2]), m), |
|
|
1109 |
FUN = mean) |
|
|
1110 |
|
|
|
1111 |
# Create enhanced PCA plot |
|
|
1112 |
p <- ggplot(data = as.data.frame(response_pca$x[,1:2]), |
|
|
1113 |
aes(x = PC1, y = PC2, color = m$Response)) + |
|
|
1114 |
# Add points with reduced alpha for transparency |
|
|
1115 |
geom_point(alpha = 0.3, size = 2) + |
|
|
1116 |
# Add confidence ellipses |
|
|
1117 |
stat_ellipse(level = 0.95, size = 1) + |
|
|
1118 |
# Add centroids |
|
|
1119 |
geom_point(data = centroids, aes(x = PC1, y = PC2), |
|
|
1120 |
size = 5, shape = 18) + |
|
|
1121 |
# Add centroid labels |
|
|
1122 |
geom_text(data = centroids, |
|
|
1123 |
aes(x = PC1, y = PC2, label = Response), |
|
|
1124 |
vjust = -1.5, size = 4, color = "black") + |
|
|
1125 |
# Customize appearance |
|
|
1126 |
ggtitle('De-identified DEG PCA with Group Trends') + |
|
|
1127 |
xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) + |
|
|
1128 |
ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) + |
|
|
1129 |
theme_minimal() + |
|
|
1130 |
scale_color_manual(values = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')) + |
|
|
1131 |
theme( |
|
|
1132 |
legend.position = "bottom", |
|
|
1133 |
plot.title = element_text(hjust = 0.5, size = 14), |
|
|
1134 |
axis.title = element_text(size = 12), |
|
|
1135 |
legend.title = element_text(size = 12), |
|
|
1136 |
legend.text = element_text(size = 10) |
|
|
1137 |
) |
|
|
1138 |
|
|
|
1139 |
# Save plot |
|
|
1140 |
message("Saving plot...") |
|
|
1141 |
ggsave('plots/DE_PCA_response_collapsed.pdf', plot = p, width = 8, height = 8, units = 'in') |
|
|
1142 |
|
|
|
1143 |
# Display plot |
|
|
1144 |
print(p) |
|
|
1145 |
|
|
|
1146 |
message("PCA visualization completed successfully") |
|
|
1147 |
|
|
|
1148 |
}, error = function(e) { |
|
|
1149 |
message(sprintf("Error in PCA analysis: %s", e$message)) |
|
|
1150 |
message("Please ensure:") |
|
|
1151 |
message("1. DE analysis has been run successfully") |
|
|
1152 |
message("2. There are significant DEGs to analyze") |
|
|
1153 |
message("3. There are enough cells in each response group") |
|
|
1154 |
message("4. Expression data is valid and contains sufficient variation") |
|
|
1155 |
}) |
|
|
1156 |
``` |
|
|
1157 |
|
|
|
1158 |
## Tumor vs. benign DEGs |
|
|
1159 |
|
|
|
1160 |
Clustergram (or heatmap) of DEGs identified for tumor vs. benign comparison. |
|
|
1161 |
|
|
|
1162 |
```{r DE_tumor_heat, warning=F, message=T} |
|
|
1163 |
|
|
|
1164 |
# Define data to plot |
|
|
1165 |
n <- 50 |
|
|
1166 |
g <- de$mast$tumor$gene[1:n] |
|
|
1167 |
jx1 <- which(data.all$proc$Source == names(obj)[3] | |
|
|
1168 |
data.all$proc$Malignant %in% 'Yes')[1:(n/2)] |
|
|
1169 |
jx2 <- which(data.all$proc$Source == names(obj)[4])[1:(n/2)] |
|
|
1170 |
x <- de$data[g, c(jx1, jx2)] %>% as.data.frame(.) |
|
|
1171 |
|
|
|
1172 |
# Specify heatmap arguments |
|
|
1173 |
# color palette |
|
|
1174 |
col.pal <- colorRampPalette(colors=c('white', 'red'), space='Lab')(100) |
|
|
1175 |
# column annotations (response) |
|
|
1176 |
col.annot <- data.frame(Malignant=rep(c('Yes', 'No'), each=n/2), |
|
|
1177 |
row.names=colnames(x)) |
|
|
1178 |
|
|
|
1179 |
# Heatmap |
|
|
1180 |
p <- pheatmap(x, cluster_rows=T, cluster_cols=F, scale='none', color=col.pal, |
|
|
1181 |
annotation_col=col.annot, cellheight=(500/n), cellwidth=(500/n), |
|
|
1182 |
show_rownames=T, show_colnames=F, gaps_col=n/2, |
|
|
1183 |
main='DEG heatmap (tumor)') |
|
|
1184 |
|
|
|
1185 |
# Save plot |
|
|
1186 |
#tiff('plots/DE_heatmap_tumor.tiff', units='in', width=11, height=8, res=300) |
|
|
1187 |
print({p}); dev.off() |
|
|
1188 |
|
|
|
1189 |
``` |
|
|
1190 |
|
|
|
1191 |
PCA of DEGs identified for tumor vs. benign comparison. |
|
|
1192 |
|
|
|
1193 |
```{r DE_tumor_PCA, warning=F, message=T} |
|
|
1194 |
# Define data to plot |
|
|
1195 |
tryCatch({ |
|
|
1196 |
# Validate MAST results exist |
|
|
1197 |
if (!exists("de") || is.null(de$mast$tumor)) { |
|
|
1198 |
stop("MAST results not found. Please run the tumor DE analysis first.") |
|
|
1199 |
} |
|
|
1200 |
|
|
|
1201 |
# Get significant genes |
|
|
1202 |
sig_genes <- rownames(de$mast$tumor)[de$mast$tumor$p_val_adj < 0.05] |
|
|
1203 |
if (length(sig_genes) == 0) { |
|
|
1204 |
stop("No significant genes found (FDR < 0.05)") |
|
|
1205 |
} |
|
|
1206 |
message(sprintf("Found %d significant genes", length(sig_genes))) |
|
|
1207 |
|
|
|
1208 |
# Take top N genes by absolute log fold change |
|
|
1209 |
n <- min(100, length(sig_genes)) |
|
|
1210 |
g <- sig_genes[order(abs(de$mast$tumor$avg_log2FC[match(sig_genes, rownames(de$mast$tumor))]), |
|
|
1211 |
decreasing = TRUE)][1:n] |
|
|
1212 |
|
|
|
1213 |
# Get tumor/benign cells |
|
|
1214 |
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes' |
|
|
1215 |
jx2 <- data.all$proc$Source == names(obj)[4] |
|
|
1216 |
|
|
|
1217 |
if (sum(jx1) == 0 || sum(jx2) == 0) { |
|
|
1218 |
stop("Insufficient cells in one or both groups") |
|
|
1219 |
} |
|
|
1220 |
message(sprintf("Found %d cells (%d Tumor, %d Benign)", |
|
|
1221 |
sum(jx1) + sum(jx2), sum(jx1), sum(jx2))) |
|
|
1222 |
|
|
|
1223 |
# Subsample cells if too many |
|
|
1224 |
max_cells_per_group <- 1000 |
|
|
1225 |
if (sum(jx1) > max_cells_per_group) { |
|
|
1226 |
set.seed(42) |
|
|
1227 |
tumor_cells <- sample(which(jx1), max_cells_per_group) |
|
|
1228 |
jx1[setdiff(which(jx1), tumor_cells)] <- FALSE |
|
|
1229 |
} |
|
|
1230 |
if (sum(jx2) > max_cells_per_group) { |
|
|
1231 |
set.seed(42) |
|
|
1232 |
benign_cells <- sample(which(jx2), max_cells_per_group) |
|
|
1233 |
jx2[setdiff(which(jx2), benign_cells)] <- FALSE |
|
|
1234 |
} |
|
|
1235 |
|
|
|
1236 |
# Get expression data |
|
|
1237 |
x <- de$data[g, c(which(jx1), which(jx2))] |
|
|
1238 |
if (!is(x, "dgCMatrix")) { |
|
|
1239 |
x <- as(x, "dgCMatrix") |
|
|
1240 |
} |
|
|
1241 |
x <- as.matrix(x) |
|
|
1242 |
|
|
|
1243 |
# Handle zero variance genes |
|
|
1244 |
gene_var <- apply(x, 1, var) |
|
|
1245 |
if (any(gene_var == 0)) { |
|
|
1246 |
message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0))) |
|
|
1247 |
x <- x[gene_var > 0, ] |
|
|
1248 |
g <- g[gene_var > 0] |
|
|
1249 |
} |
|
|
1250 |
|
|
|
1251 |
# Clean data |
|
|
1252 |
x[is.infinite(x)] <- NA |
|
|
1253 |
x[is.na(x)] <- 0 |
|
|
1254 |
|
|
|
1255 |
# Transpose and convert to data frame |
|
|
1256 |
x_t <- t(x) |
|
|
1257 |
|
|
|
1258 |
# Create metadata |
|
|
1259 |
m <- data.frame( |
|
|
1260 |
Malignant = rep(c('Yes', 'No'), c(sum(jx1), sum(jx2))), |
|
|
1261 |
row.names = rownames(x_t) |
|
|
1262 |
) |
|
|
1263 |
|
|
|
1264 |
# Apply PCA with validation |
|
|
1265 |
if (ncol(x_t) < 2) { |
|
|
1266 |
stop("Need at least 2 features for PCA") |
|
|
1267 |
} |
|
|
1268 |
if (nrow(x_t) < 3) { |
|
|
1269 |
stop("Need at least 3 samples for PCA") |
|
|
1270 |
} |
|
|
1271 |
|
|
|
1272 |
message("Running PCA...") |
|
|
1273 |
pca.tumor <- prcomp(x_t, scale. = TRUE) |
|
|
1274 |
|
|
|
1275 |
# Calculate variance explained |
|
|
1276 |
var_explained <- summary(pca.tumor)$importance[2, 1:2] * 100 |
|
|
1277 |
|
|
|
1278 |
# Visualize PCA plot with variance explained |
|
|
1279 |
p <- autoplot(pca.tumor, data = m, colour = 'Malignant') + |
|
|
1280 |
ggtitle('DEG PCA (tumor)') + |
|
|
1281 |
xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) + |
|
|
1282 |
ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) + |
|
|
1283 |
theme_minimal() + |
|
|
1284 |
scale_color_manual(values = c('Yes' = '#E41A1C', 'No' = '#377EB8')) |
|
|
1285 |
|
|
|
1286 |
# Save plot |
|
|
1287 |
message("Saving plot...") |
|
|
1288 |
ggsave('plots/DE_PCA_tumor.pdf', plot = p, width = 8, height = 6, units = 'in') |
|
|
1289 |
|
|
|
1290 |
# Display plot |
|
|
1291 |
print(p) |
|
|
1292 |
|
|
|
1293 |
# Return success message |
|
|
1294 |
message("PCA visualization completed successfully") |
|
|
1295 |
|
|
|
1296 |
}, error = function(e) { |
|
|
1297 |
message(sprintf("Error in PCA analysis: %s", e$message)) |
|
|
1298 |
message("Please ensure:") |
|
|
1299 |
message("1. Tumor DE analysis has been run successfully") |
|
|
1300 |
message("2. There are significant DEGs to analyze") |
|
|
1301 |
message("3. There are enough cells in each group") |
|
|
1302 |
message("4. Expression data is valid and contains sufficient variation") |
|
|
1303 |
}) |
|
|
1304 |
``` |
|
|
1305 |
|
|
|
1306 |
# Save results (if prompted) |
|
|
1307 |
|
|
|
1308 |
```{r save_outputs, warning=F, message=T} |
|
|
1309 |
|
|
|
1310 |
#save.data <- dlgInput('Save all results? (T/F)', F)$res %>% as.logical(.) |
|
|
1311 |
#if (save.data) { |
|
|
1312 |
# # t-test results |
|
|
1313 |
# write.xlsx(de$ttest, 'results/DE_ttest.xlsx', row.names=F) |
|
|
1314 |
# # MAST results |
|
|
1315 |
# write.xlsx(de$mast, 'results/DE_MAST.xlsx', row.names=T) |
|
|
1316 |
#} |
|
|
1317 |
|
|
|
1318 |
``` |