1319 lines (1091 with data), 42.0 kB
---
title: "DE Analysis of scRNA-seq data"
date: "Last modified: `r format(Sys.time(), '%B %d, %Y')`"
tags: [scRNA-seq, seurat, melanoma, immunotherapy, PBMCs]
output:
html_document:
theme: flatly
highlight: zenburn
toc: true
number_sections: false
toc_depth: 2
toc_float:
collapsed: false
smooth_scroll: true
code_folding: hide
self_contained: yes
---
# Notebook Description
This notebook compiles the code and outputs for differential expression (DE) analysis. Specifically, two methods are covered:
* unpaired Student's t-test (custom function)
* [MAST](https://doi.org/10.1186/s13059-015-0844-5)
Of note, pre-processed data stored in the `scRNAseq_analysis.RData` file is used for the analyses detailed below.
# Initialize environment
```{r load_packages, results="hide", message=F, warning=F, error=F}
# Install required packages if not already installed
if (!require("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (!require("MAST", quietly = TRUE)) {
BiocManager::install("MAST")
}
if (!require("presto", quietly = TRUE)) {
install.packages("presto")
}
# Memory management settings
options(future.globals.maxSize = 16000 * 1024^2) # Set maximum global size to 8GB
# Load core packages first
library(Matrix) # for sparse matrix operations
library(Seurat) # for scRNA-seq analysis
library(MAST) # for scRNAseq DEG analysis
# Load other packages
library(svDialogs) # for prompting user-input
library(vroom) # for quickly reading data
library(dplyr) # for data processing
library(DT) # to display datatables
library(harmony) # to integration scRNA-seq data
library(patchwork) # for combining plots
library(ggplot2) # for data visualization
library(ggrepel) # to use geom_pont_repel()
library(ggvenn) # to visualize venn diagrams
library(openxlsx) # to write data to excel
library(progress) # to display progress bar
library(rio) # to load all worksheets in a workbook
library(pheatmap) # to visualize heatmaps
library(ggfortify) # to visualize PCA plots
library(presto) # for faster Wilcoxon test implementation
library(gridExtra) # for arranging multiple plots
# Set up parallel processing
library(future)
plan(multisession, workers = 2) # Adjust based on available memory
options(future.globals.maxSize = 16000 * 1024^2)
```
System settings.
```{r settings}
# Adjust system settings
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
# Enhanced memory management functions
clear_memory <- function(deep_clean = FALSE) {
# Standard cleanup
gc(verbose = FALSE)
if(exists(".Random.seed")) rm(.Random.seed)
# Deep cleanup if requested
if(deep_clean) {
# Clear package caches
if(exists(".__C__")) rm(list=".__C__", envir=.GlobalEnv)
# Clear hidden objects
rm(list = ls(all.names = TRUE, envir = .GlobalEnv), envir = .GlobalEnv)
# Force garbage collection multiple times
replicate(3, gc(verbose = FALSE))
}
invisible(gc())
}
# Function to get data efficiently with batching support
get_assay_data <- function(object, data_type = "data", batch_size = 1000) {
tryCatch({
# Get data using appropriate method
if (packageVersion("Seurat") >= "5.0.0") {
data <- GetAssayData(object = object, layer = data_type)
} else {
data <- GetAssayData(object = object, slot = data_type)
}
# Convert to sparse matrix if dense
if (!is(data, "dgCMatrix")) {
data <- as(data, "dgCMatrix")
}
return(data)
}, error = function(e) {
stop(sprintf("Failed to get assay data: %s", e$message))
})
}
# Function to process data in batches
process_in_batches <- function(data, batch_size = 1000, FUN, ...) {
n_cells <- ncol(data)
n_batches <- ceiling(n_cells / batch_size)
results <- vector("list", n_batches)
pb <- progress_bar$new(total = n_batches)
for(i in 1:n_batches) {
start_idx <- (i-1) * batch_size + 1
end_idx <- min(i * batch_size, n_cells)
# Process batch
batch_data <- data[, start_idx:end_idx]
results[[i]] <- FUN(batch_data, ...)
# Clean up
rm(batch_data)
clear_memory()
pb$tick()
}
# Combine results
combined_results <- do.call(rbind, results)
rm(results)
clear_memory()
return(combined_results)
}
# Add gene anonymization function after the settings chunk
anonymize_genes <- function(gene_list, prefix = "GENE") {
# Create mapping of real genes to anonymous IDs
anon_ids <- paste0(prefix, "_", sprintf("%04d", seq_along(gene_list)))
names(anon_ids) <- gene_list
return(anon_ids)
}
```
Load pre-processed data.
```{r load_data, warning=F, message=F}
file_path <- "/Users/scampit/Projects/torchstack/cancer-biomarker-discovery"
setwd(file_path)
# Start with clean environment
#clear_memory(deep_clean = TRUE)
# Load data from saved workspace
f <- list.files()
if (any(endsWith(f, '.RData'))) {
load(f[endsWith(f, '.RData')][1])
} else {
stop("No .RData file found. Please ensure scRNAseq_wksp.RData is in the working directory.")
}
# Initialize DE lists
de <- list()
# Get data from Seurat object efficiently
tryCatch({
message("Loading expression data from Seurat object...")
de$data <- get_assay_data(data.all$proc)
# Verify data loaded correctly
if (is.null(de$data) || !is(de$data, "dgCMatrix")) {
stop("Failed to load expression data properly")
}
message(sprintf("Loaded expression matrix with %d genes and %d cells",
nrow(de$data), ncol(de$data)))
}, error = function(e) {
stop(sprintf("Error loading data: %s", e$message))
})
#clear_memory()
# Load or initialize results
de$ttest <- if(file.exists('results/tables/DE_ttest.xlsx')) {
import_list('results/tables/DE_ttest.xlsx')
} else {
list()
}
de$mast <- if(file.exists('results/tables/DE_mast.xlsx')) {
import_list('results/tables/DE_mast.xlsx')
} else {
list()
}
#clear_memory(deep_clean = TRUE)
# Verify data structure
str(de, max.level = 1)
```
```{r summary_stats, warning=F, message=F}
# Assign row names to MAST results
f <- names(de$ttest)
for (i in 1:length(f)) {
x <- de$ttest[[f[i]]]; y <- de$mast[[f[i]]]
g <- intersect(x$gene[x$q.val < 0.05], y$gene[y$p_val_adj < 0.05])
cat(sprintf('%s\tDEGs (t-test): %d\tDEGs (MAST): %d\tDEGs (overlap): %d\n',
f[i], sum(x$q.val < 0.05), sum(y$p_val_adj < 0.05), length(g)))
}
```
# DE analyses
**Description**: determine differentially expressed genes (DEGs) between different comparisons.
```{r DE_initialize, warning=F, message=T, eval=!('data' %in% de)}
# Instantiate DE variable + relevant fields
de <- list(); de$ttest <- list(); de$mast <- list()
# Define data to work with - update to use layer
tryCatch({
de$data <- GetAssayData(object=data.all$proc, layer='data')
}, error = function(e) {
# Fallback for older Seurat versions
message("Attempting fallback for older Seurat version...")
de$data <- GetAssayData(object=data.all$proc, slot='data')
})
# Add validation
if (is.null(de$data) || ncol(de$data) == 0 || nrow(de$data) == 0) {
stop("Failed to retrieve data from Seurat object")
}
```
## Case: disease vs. control (tumor)
Comparison between benign vs. tumor skin cells.
Estimated runtime: ~15 minutes.
```{r DE_tumor, warning=F, message=T, eval=!('tumor' %in% de$ttest)}
# Start timer
t.start <- Sys.time()
# Define cell groups to compare
ix <- intersect(rownames(obj$GSE72056), rownames(obj$GSE115978)) %>%
intersect(., rownames(obj$GSE151091))
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes'
jx2 <- data.all$proc$Source == names(obj)[4]
# Subsample cells if too many
max_cells_per_group <- 1000
if(sum(jx1) > max_cells_per_group) {
set.seed(42)
cells_g1 <- sample(which(jx1), max_cells_per_group)
jx1[setdiff(which(jx1), cells_g1)] <- FALSE
}
if(sum(jx2) > max_cells_per_group) {
set.seed(42)
cells_g2 <- sample(which(jx2), max_cells_per_group)
jx2[setdiff(which(jx2), cells_g2)] <- FALSE
}
# DE analysis (t-test) with memory management
x <- de$data[ix, jx1]
y <- de$data[ix, jx2]
#clear_memory()
de$ttest$tumor <- de_analyze(x, y) %>% na.omit
rm(x, y)
#clear_memory(deep_clean = TRUE)
# DE analysis (MAST)
message("Creating Seurat object...")
combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2])
if (!is(combined_data, "dgCMatrix")) {
combined_data <- as(combined_data, "dgCMatrix")
}
x <- CreateSeuratObject(counts = combined_data)
rm(combined_data)
#clear_memory()
# Process data in steps with memory management
message("Normalizing data...")
x <- NormalizeData(x, verbose = FALSE)
#clear_memory()
message("Finding variable features...")
x <- FindVariableFeatures(x, verbose = FALSE)
#clear_memory()
message("Scaling data...")
x <- ScaleData(x, verbose = FALSE)
#clear_memory()
message("Running PCA...")
x <- RunPCA(x, verbose = FALSE)
#clear_memory()
# Set identities
cells_g1 <- 1:sum(jx1)
cells_g2 <- (sum(jx1) + 1):ncol(x)
Idents(object = x, cells = cells_g1) <- 'Disease'
Idents(object = x, cells = cells_g2) <- 'Control'
rm(cells_g1, cells_g2)
#clear_memory()
# Run MAST analysis with error handling and memory management
message("Running differential expression analysis...")
tryCatch({
de$mast$tumor <- FindMarkers(
object = x,
ident.1 = 'Disease',
ident.2 = 'Control',
test.use = 'MAST',
verbose = TRUE,
max.cells.per.ident = 1000,
min.pct = 0.1, # Filter low-expressed genes
logfc.threshold = 0.25 # Filter low effect size
)
}, error = function(e) {
message("Error in MAST analysis: ", e$message)
message("Attempting alternative analysis method...")
de$mast$tumor <- FindMarkers(
object = x,
ident.1 = 'Disease',
ident.2 = 'Control',
test.use = 'wilcox',
verbose = TRUE,
max.cells.per.ident = 1000,
min.pct = 0.1,
logfc.threshold = 0.25
)
})
# Clean up
rm(x)
#clear_memory(deep_clean = TRUE)
# End timer + log time elapsed
t.end <- Sys.time()
message("Analysis completed in: ", difftime(t.end, t.start, units = "mins"), " minutes")
```
## Case: disease vs. control (immune cells, bulk)
Bulk comparison between healthy vs. diseased immune cells.
Estimated runtime: ~20 minutes.
```{r DE_immune_bulk,warning=F, message=T, eval=!('immune' %in% de$ttest)}
# Start timer
t.start <- Sys.time()
# Define cell groups to compare
ix <- intersect(rownames(obj$GSE120575), rownames(obj$GSE115978)) %>%
intersect(., rownames(obj$GSE183212))
jx1 <- data.all$proc$Source == names(obj)[1] | data.all$proc$Malignant %in% 'No'
jx2 <- data.all$proc$Source == names(obj)[5]
# DE analysis (t-test)
x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
de$ttest$immune <- de_analyze(x, y) %>% na.omit()
# DE analysis (MAST)
# Create Seurat object with proper assay
x <- CreateSeuratObject(counts = cbind(de$data[ix, jx1], de$data[ix, jx2]))
# Normalize the data
x <- NormalizeData(x)
# Find variable features
x <- FindVariableFeatures(x)
# Scale the data
x <- ScaleData(x)
# Run PCA
x <- RunPCA(x)
# Set identities
Idents(object=x, cells=1:sum(jx1)) <- 'Tumor'
Idents(object=x, cells=sum(jx1)+1:ncol(x)) <- 'Benign'
# Run MAST analysis
de$mast$immune <- FindMarkers(object=x,
ident.1='Tumor',
ident.2='Benign',
test.use='MAST',
verbose = TRUE)
# End timer + log time elapsed
t.end <- Sys.time()
t.end - t.start
```
## Case: disease vs. control (immune cells, cluster-based)
Immune cell-specific comparisons between healthy vs. diseased cells.
Estimated runtime: ~15 minutes.
```{r DE_immune_cluster, warning=F, message=T}
# Start timer
t.start <- Sys.time()
# Define labeled scRNAseq data
lab <- c('Tumor cell', 'Technical error 1', 'Cytotoxic CD8 T cell 1', 'B cell',
'Melanocytes 1', 'Macrophage', 'Cytotoxic CD8 T cell 2', 'Technical error 2',
'Memory T cell', 'Melanocytes 2', 'Dysfunctional CD8 T cell',
'Cytotoxic CD8 T cell 3', '? 1', 'Melanocytes 3', 'Activated cell',
'Exhausted T cell', 'Cytotoxic T cell', '? 2', '? 3', 'Melanocytes 4', '? 4')
names(lab) <- levels(data.all$proc)
plot.data <- data.all$proc %>% RenameIdents(., lab)
# Make sure we have the expression data
if (!exists("de") || is.null(de$data) || !is(de$data, "dgCMatrix")) {
message("Reloading expression data...")
de <- list()
de$data <- GetAssayData(object = data.all$proc, layer = "data")
if (!is(de$data, "dgCMatrix")) {
de$data <- as(de$data, "dgCMatrix")
}
}
# Initialize results lists if they don't exist
if (!"ttest" %in% names(de)) de$ttest <- list()
if (!"mast" %in% names(de)) de$mast <- list()
# Iterate through each immune cell group
immune.cells <- c('B', 'CD8', 'Macrophage', 'Memory', 'Activated')
for (i in 1:length(immune.cells)) {
message(sprintf("\nProcessing %s cells...", immune.cells[i]))
# Define row + column indices
ix <- intersect(rownames(obj$GSE120575), rownames(obj$GSE115978)) %>%
intersect(., rownames(obj$GSE183212))
c <- grepl(immune.cells[i], Idents(plot.data))
jx1 <- (plot.data$Source == names(obj)[1] |
data.all$proc$Malignant %in% 'No') & c
jx2 <- data.all$proc$Source == names(obj)[5] & c
# Print number of genes + cells
message(sprintf('No. of genes: %d\tNo. of G1: %d\tNo. of G2: %d',
length(ix), sum(jx1), sum(jx2)))
# Skip if not enough cells
if (sum(jx1) < 3 || sum(jx2) < 3) {
message(sprintf("Skipping %s - insufficient cells", immune.cells[i]))
next
}
# DE analysis (t-test)
message("Running t-test analysis...")
x <- de$data[ix, jx1]
y <- de$data[ix, jx2]
de$ttest[[immune.cells[i]]] <- de_analyze(x, y) %>% na.omit()
rm(x, y)
clear_memory()
# DE analysis (MAST)
message("Running MAST analysis...")
tryCatch({
# Create Seurat object
combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2])
if (!is(combined_data, "dgCMatrix")) {
combined_data <- as(combined_data, "dgCMatrix")
}
x <- CreateSeuratObject(counts = combined_data)
rm(combined_data)
clear_memory()
# Process data
x <- x %>%
NormalizeData(verbose = FALSE) %>%
FindVariableFeatures(verbose = FALSE) %>%
ScaleData(verbose = FALSE)
# Set identities
Idents(object = x, cells = 1:sum(jx1)) <- 'Tumor'
Idents(object = x, cells = (sum(jx1) + 1):ncol(x)) <- 'Benign'
# Run MAST
de$mast[[immune.cells[i]]] <- FindMarkers(
object = x,
ident.1 = 'Tumor',
ident.2 = 'Benign',
test.use = 'MAST',
verbose = TRUE,
max.cells.per.ident = 1000
)
rm(x)
clear_memory()
}, error = function(e) {
message(sprintf("Error in MAST analysis for %s: %s", immune.cells[i], e$message))
message("Attempting Wilcoxon test instead...")
# Try Wilcoxon as fallback
de$mast[[immune.cells[i]]] <- FindMarkers(
object = x,
ident.1 = 'Tumor',
ident.2 = 'Benign',
test.use = 'wilcox',
verbose = TRUE,
max.cells.per.ident = 1000
)
})
message(sprintf("Completed analysis for %s", immune.cells[i]))
}
# End timer + log time elapsed
t.end <- Sys.time()
message("\nTotal analysis time: ", difftime(t.end, t.start, units = "mins"), " minutes")
# Print summary of results
message("\nSummary of analyses:")
for (cell_type in immune.cells) {
if (cell_type %in% names(de$ttest)) {
n_ttest <- nrow(de$ttest[[cell_type]])
n_mast <- if(cell_type %in% names(de$mast)) nrow(de$mast[[cell_type]]) else 0
message(sprintf("%s: %d t-test DEGs, %d MAST DEGs", cell_type, n_ttest, n_mast))
}
}
```
## Case: responder vs. non-responder (GSE120575)
Comparison between responder vs. non-responder cells to immunotherapy.
Estimated runtime: ~1 hour.
```{r DE_response, warning=F, message=T, eval=!('response' %in% de$ttest)}
# Start timer
t.start <- Sys.time()
# Make sure we have the expression data
if (!exists("de") || is.null(de$data) || !is(de$data, "dgCMatrix")) {
message("Reloading expression data...")
de <- list()
de$data <- GetAssayData(object = data.all$proc, layer = "data")
if (!is(de$data, "dgCMatrix")) {
de$data <- as(de$data, "dgCMatrix")
}
}
# Define cell groups to compare
ix <- rownames(obj$GSE120575)
jx1 <- data.all$proc$Response %in% 'Responder'
jx2 <- data.all$proc$Response %in% 'Non-responder'
# Print cell counts
message(sprintf("Number of responder cells: %d", sum(jx1)))
message(sprintf("Number of non-responder cells: %d", sum(jx2)))
# Subsample cells if too many
max_cells_per_group <- 1000
if(sum(jx1) > max_cells_per_group) {
set.seed(42)
cells_g1 <- sample(which(jx1), max_cells_per_group)
jx1[setdiff(which(jx1), cells_g1)] <- FALSE
}
if(sum(jx2) > max_cells_per_group) {
set.seed(42)
cells_g2 <- sample(which(jx2), max_cells_per_group)
jx2[setdiff(which(jx2), cells_g2)] <- FALSE
}
# DE analysis (t-test)
message("Running t-test analysis...")
x <- de$data[ix, jx1]
y <- de$data[ix, jx2]
de$ttest$response <- de_analyze(x, y) %>% na.omit
rm(x, y)
#clear_memory()
# DE analysis (MAST)
message("Running MAST analysis...")
tryCatch({
# Create Seurat object
message("Creating Seurat object...")
combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2])
if (!is(combined_data, "dgCMatrix")) {
combined_data <- as(combined_data, "dgCMatrix")
}
x <- CreateSeuratObject(counts = combined_data)
rm(combined_data)
clear_memory()
# Process data
message("Processing data...")
x <- x %>%
NormalizeData(verbose = FALSE) %>%
FindVariableFeatures(verbose = FALSE) %>%
ScaleData(verbose = FALSE) %>%
RunPCA(verbose = FALSE)
# Set identities
message("Setting identities...")
cells_g1 <- 1:sum(jx1)
cells_g2 <- (sum(jx1) + 1):ncol(x)
Idents(object = x, cells = cells_g1) <- 'Responder'
Idents(object = x, cells = cells_g2) <- 'Non-responder'
# Verify identities
if (!all(levels(Idents(x)) %in% c('Responder', 'Non-responder'))) {
stop("Failed to set identities correctly")
}
# Run MAST
message("Running differential expression analysis...")
de$mast$response <- FindMarkers(
object = x,
ident.1 = 'Responder',
ident.2 = 'Non-responder',
test.use = 'MAST',
verbose = TRUE,
max.cells.per.ident = 1000,
min.pct = 0.1,
logfc.threshold = 0.25
)
# Clean up
rm(x, cells_g1, cells_g2)
clear_memory()
}, error = function(e) {
message("Error in MAST analysis: ", e$message)
message("Attempting Wilcoxon test instead...")
# Try Wilcoxon as fallback
de$mast$response <- FindMarkers(
object = x,
ident.1 = 'Responder',
ident.2 = 'Non-responder',
test.use = 'wilcox',
verbose = TRUE,
max.cells.per.ident = 1000,
min.pct = 0.1,
logfc.threshold = 0.25
)
})
# End timer + log time elapsed
t.end <- Sys.time()
message("\nAnalysis completed in: ", difftime(t.end, t.start, units = "mins"), " minutes")
# Print summary
if (!is.null(de$mast$response)) {
message("\nFound ", sum(de$mast$response$p_val_adj < 0.05),
" significant DEGs (FDR < 0.05)")
}
```
# Visualizations
**Description**: visual inspection of DE analysis results.
## T-test vs. MAST DEGs
Visual comparison of DEGs determined with t-test vs. MAST (venn diagrams).
```{r DE_venn, warning=F, message=T}
# Compare DEGs b/t the two methods
f <- names(de$mast); p <- list()
for (i in 1:length(f)) {
x <- de$ttest[[f[i]]]; y <- de$mast[[f[i]]]
g <- intersect(x$gene[x$q.val < 0.05], row.names(y)[y$p_val_adj < 0.05])
# g <- intersect(x$gene[x$q.val < 0.05], y$gene[y$p_val_adj < 0.05])
cat(sprintf('No. of intersecting DEGs for %s: %d\n', f[i], length(g)))
ggvenn(list('t-test'=x$gene[x$q.val < 0.05],
'MAST'=row.names(y)[y$p_val_adj < 0.05])) + ggtitle(f[i])
ggsave(paste0('plots/DE_venn_', f[i], '.pdf'), width=5, height=5, units='in')
}
```
## Responder vs. non-responder DEGs
Clustergram (or heatmap) of DEGs identified for responder vs. non-responder comparison.
```{r DE_response_heat, warning=F, message=T}
# Modify DE_response_heat chunk
tryCatch({
# Get significant genes
if (!exists("de") || is.null(de$mast$response)) {
stop("MAST results not found. Please run the DE analysis first.")
}
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
if (length(sig_genes) == 0) {
stop("No significant genes found (FDR < 0.05)")
}
message(sprintf("Found %d significant genes", length(sig_genes)))
# Take top N genes by absolute log fold change
n <- min(50, length(sig_genes))
g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]),
decreasing = TRUE)][1:n]
# Create anonymous gene IDs
anon_genes <- anonymize_genes(g)
# Get all cells
jx1 <- which(data.all$proc$Response %in% 'Responder')
jx2 <- which(data.all$proc$Response %in% 'Non-responder')
# Get expression data
x <- de$data[g, c(jx1, jx2)]
if (!is(x, "dgCMatrix")) {
x <- as(x, "dgCMatrix")
}
x <- as.matrix(x)
# Replace gene names with anonymous IDs
rownames(x) <- anon_genes[rownames(x)]
# Calculate mean expression for each group
x_resp <- rowMeans(x[, 1:length(jx1), drop=FALSE])
x_nonresp <- rowMeans(x[, (length(jx1)+1):ncol(x), drop=FALSE])
# Combine into matrix
x_means <- cbind(x_resp, x_nonresp)
colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
# Calculate standard errors for error bars
x_resp_se <- apply(x[, 1:length(jx1), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
x_nonresp_se <- apply(x[, (length(jx1)+1):ncol(x), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
# Add SE information to rownames
rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)",
rownames(x_means),
x_resp_se,
x_nonresp_se)
# Scale data
x_scaled <- t(scale(t(x_means)))
# Create annotation
col.annot <- data.frame(
Group = c('Responder', 'Non-responder'),
row.names = colnames(x_means)
)
# Color palette
col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
# Create heatmap
message("Generating heatmap...")
p <- pheatmap(
x_scaled,
cluster_rows = TRUE,
cluster_cols = FALSE,
scale = "none", # already scaled above
color = col.pal,
annotation_col = col.annot,
show_rownames = TRUE,
show_colnames = TRUE,
main = sprintf('Top %d De-identified DEGs (Mean Expression)', nrow(x_scaled)),
fontsize_row = 8,
annotation_colors = list(
Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
)
)
# Save plot
message("Saving plot...")
pdf('plots/DE_heatmap_response_collapsed.pdf', width = 11, height = 8)
print(p)
dev.off()
# Save gene ID mapping
write.csv(data.frame(
Anonymous_ID = names(anon_genes),
Original_Gene = anon_genes,
Mean_Responder = x_resp,
SE_Responder = x_resp_se,
Mean_NonResponder = x_nonresp,
SE_NonResponder = x_nonresp_se
), './response_gene_mapping_with_stats.csv', row.names = FALSE)
message("Heatmap generated successfully")
message("Gene mapping with statistics saved to results/response_gene_mapping_with_stats.csv")
}, error = function(e) {
message(sprintf("Error generating heatmap: %s", e$message))
})
```
# Add new chunk after DE_response_heat
```{r DE_response_heat_individual, warning=F, message=T}
# Show individual cell heatmap alongside collapsed view
tryCatch({
# Get significant genes
if (!exists("de") || is.null(de$mast$response)) {
stop("MAST results not found. Please run the DE analysis first.")
}
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
if (length(sig_genes) == 0) {
stop("No significant genes found (FDR < 0.05)")
}
message(sprintf("Found %d significant genes", length(sig_genes)))
# Take top N genes by absolute log fold change
n <- min(50, length(sig_genes))
g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]),
decreasing = TRUE)][1:n]
# Create anonymous gene IDs
anon_genes <- anonymize_genes(g)
# Get cells (subsample to make visualization manageable)
jx1 <- which(data.all$proc$Response %in% 'Responder')
jx2 <- which(data.all$proc$Response %in% 'Non-responder')
# Subsample cells
cells_per_group <- min(25, min(length(jx1), length(jx2)))
set.seed(42)
sampled_jx1 <- sample(jx1, cells_per_group)
sampled_jx2 <- sample(jx2, cells_per_group)
# Get expression data for individual cells
x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)]
if (!is(x_individual, "dgCMatrix")) {
x_individual <- as(x_individual, "dgCMatrix")
}
x_individual <- as.matrix(x_individual)
# Clean individual cell data
message("Cleaning individual cell data...")
# Replace Inf/-Inf with NA
x_individual[is.infinite(x_individual)] <- NA
# Remove rows (genes) with any NA values
complete_rows <- complete.cases(x_individual)
if (!all(complete_rows)) {
message(sprintf("Removing %d genes with NA values", sum(!complete_rows)))
x_individual <- x_individual[complete_rows, ]
g <- g[complete_rows]
anon_genes <- anon_genes[complete_rows]
}
# Check for zero variance genes
gene_var <- apply(x_individual, 1, var)
if (any(gene_var == 0)) {
message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
x_individual <- x_individual[gene_var > 0, ]
g <- g[gene_var > 0]
anon_genes <- anon_genes[gene_var > 0]
}
# Replace gene names with anonymous IDs
rownames(x_individual) <- anon_genes[rownames(x_individual)]
# Scale individual cell data with validation
message("Scaling individual cell data...")
x_individual_scaled <- t(scale(t(x_individual)))
# Check for NA/Inf values after scaling
if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) {
message("Found NA/Inf values after scaling, capping extreme values at ±10")
x_individual_scaled[x_individual_scaled > 10] <- 10
x_individual_scaled[x_individual_scaled < -10] <- -10
x_individual_scaled[is.na(x_individual_scaled)] <- 0
}
# Create annotation for individual cells
col.annot.individual <- data.frame(
Response = rep(c('Responder', 'Non-responder'), each = cells_per_group),
row.names = colnames(x_individual)
)
# Get mean expression data (for side-by-side comparison)
x_resp <- rowMeans(de$data[g, jx1, drop=FALSE])
x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE])
x_means <- cbind(x_resp, x_nonresp)
colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
# Calculate standard errors
x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
# Clean mean data
message("Cleaning mean data...")
x_means[is.infinite(x_means)] <- NA
complete_rows_means <- complete.cases(x_means)
if (!all(complete_rows_means)) {
message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means)))
x_means <- x_means[complete_rows_means, ]
x_resp_se <- x_resp_se[complete_rows_means]
x_nonresp_se <- x_nonresp_se[complete_rows_means]
}
# Add SE information to rownames for mean data
rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)",
anon_genes[rownames(x_means)],
x_resp_se,
x_nonresp_se)
# Scale mean data with validation
message("Scaling mean data...")
x_means_scaled <- t(scale(t(x_means)))
if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) {
message("Found NA/Inf values after scaling means, capping extreme values at ±10")
x_means_scaled[x_means_scaled > 10] <- 10
x_means_scaled[x_means_scaled < -10] <- -10
x_means_scaled[is.na(x_means_scaled)] <- 0
}
# Create annotation for means
col.annot.means <- data.frame(
Group = c('Responder', 'Non-responder'),
row.names = colnames(x_means)
)
# Color palette
col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
message("Generating plots...")
# Create individual cells heatmap
p1 <- pheatmap(
x_individual_scaled,
cluster_rows = TRUE,
cluster_cols = FALSE,
scale = "none",
color = col.pal,
annotation_col = col.annot.individual,
show_rownames = TRUE,
show_colnames = FALSE,
gaps_col = cells_per_group,
main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group),
fontsize_row = 8,
annotation_colors = list(
Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
),
legend = FALSE, # Hide colorbar
treeheight_row = 0, # Hide row dendrogram
cellwidth = 10, # Force square cells
cellheight = 10 # Force square cells
)
# Create mean expression heatmap
p2 <- pheatmap(
x_means_scaled,
cluster_rows = TRUE,
cluster_cols = FALSE,
scale = "none",
color = col.pal,
annotation_col = col.annot.means,
show_rownames = TRUE,
show_colnames = TRUE,
main = 'Group Means\n(all cells)',
fontsize_row = 8,
annotation_colors = list(
Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
),
legend = FALSE, # Hide colorbar
treeheight_row = 0, # Hide row dendrogram
cellwidth = 20, # Force square cells (larger since fewer columns)
cellheight = 20, # Force square cells (larger since fewer columns)
labels_row = sub("\n.*", "", rownames(x_means)) # Only show gene IDs without SE info
)
# Save plots to PDF
pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8)
grid::grid.newpage()
grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2))
dev.off()
# Display plots in notebook
grid::grid.newpage()
grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2))
# Save detailed statistics
write.csv(data.frame(
Anonymous_ID = names(anon_genes),
Original_Gene = anon_genes,
Mean_Responder = x_resp,
SE_Responder = x_resp_se,
Mean_NonResponder = x_nonresp,
SE_NonResponder = x_nonresp_se,
N_Responder = length(jx1),
N_NonResponder = length(jx2)
), 'results/response_gene_mapping_detailed.csv', row.names = FALSE)
message("Side-by-side heatmaps generated successfully")
message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv")
}, error = function(e) {
message(sprintf("Error generating heatmaps: %s", e$message))
message("\nDebugging information:")
message("1. Check if expression data contains extreme values")
message("2. Verify gene filtering and scaling steps")
message("3. Ensure all matrices have proper dimensions")
message("4. Look for NA/Inf values in the data")
})
```
PCA of DEGs identified for responder vs. non-responder comparison.
```{r DE_response_PCA, warning=F, message=T}
# Modify DE_response_PCA chunk
tryCatch({
# Validate MAST results exist
if (!exists("de") || is.null(de$mast$response)) {
stop("MAST results not found. Please run the DE analysis first.")
}
# Get significant genes
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
if (length(sig_genes) == 0) {
stop("No significant genes found (FDR < 0.05)")
}
message(sprintf("Found %d significant genes", length(sig_genes)))
# Get responder/non-responder cells
jx <- data.all$proc$Response %in% c('Responder', 'Non-responder')
if (sum(jx) == 0) {
stop("No cells found matching response criteria")
}
message(sprintf("Found %d cells (%d Responders, %d Non-responders)",
sum(jx),
sum(data.all$proc$Response[jx] == "Responder"),
sum(data.all$proc$Response[jx] == "Non-responder")))
# Subsample cells if too many
max_cells_per_group <- 1000
responder_idx <- which(data.all$proc$Response[jx] == "Responder")
nonresponder_idx <- which(data.all$proc$Response[jx] == "Non-responder")
if (length(responder_idx) > max_cells_per_group) {
set.seed(42)
responder_idx <- sample(responder_idx, max_cells_per_group)
}
if (length(nonresponder_idx) > max_cells_per_group) {
set.seed(42)
nonresponder_idx <- sample(nonresponder_idx, max_cells_per_group)
}
selected_cells <- c(responder_idx, nonresponder_idx)
# Get expression data
x <- de$data[sig_genes, jx][, selected_cells]
if (!is(x, "dgCMatrix")) {
x <- as(x, "dgCMatrix")
}
x <- as.matrix(x)
# Create anonymous gene IDs
anon_genes <- anonymize_genes(rownames(x))
rownames(x) <- anon_genes[rownames(x)]
# Handle zero variance genes
gene_var <- apply(x, 1, var)
if (any(gene_var == 0)) {
message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
x <- x[gene_var > 0, ]
}
# Clean data
x[is.infinite(x)] <- NA
x[is.na(x)] <- 0
# Transpose and convert to data frame
x_t <- t(x)
# Create metadata
m <- data.frame(
Response = data.all$proc$Response[jx][selected_cells],
row.names = rownames(x_t)
)
# Apply PCA with validation
if (ncol(x_t) < 2) {
stop("Need at least 2 features for PCA")
}
if (nrow(x_t) < 3) {
stop("Need at least 3 samples for PCA")
}
message("Running PCA...")
response_pca <- prcomp(x_t, scale. = TRUE)
# Calculate variance explained
var_explained <- summary(response_pca)$importance[2, 1:2] * 100
# Calculate group centroids
centroids <- aggregate(cbind(PC1, PC2) ~ Response,
data = cbind(as.data.frame(response_pca$x[,1:2]), m),
FUN = mean)
# Create enhanced PCA plot
p <- ggplot(data = as.data.frame(response_pca$x[,1:2]),
aes(x = PC1, y = PC2, color = m$Response)) +
# Add points with reduced alpha for transparency
geom_point(alpha = 0.3, size = 2) +
# Add confidence ellipses
stat_ellipse(level = 0.95, size = 1) +
# Add centroids
geom_point(data = centroids, aes(x = PC1, y = PC2),
size = 5, shape = 18) +
# Add centroid labels
geom_text(data = centroids,
aes(x = PC1, y = PC2, label = Response),
vjust = -1.5, size = 4, color = "black") +
# Customize appearance
ggtitle('De-identified DEG PCA with Group Trends') +
xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) +
ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) +
theme_minimal() +
scale_color_manual(values = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')) +
theme(
legend.position = "bottom",
plot.title = element_text(hjust = 0.5, size = 14),
axis.title = element_text(size = 12),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)
# Save plot
message("Saving plot...")
ggsave('plots/DE_PCA_response_collapsed.pdf', plot = p, width = 8, height = 8, units = 'in')
# Display plot
print(p)
message("PCA visualization completed successfully")
}, error = function(e) {
message(sprintf("Error in PCA analysis: %s", e$message))
message("Please ensure:")
message("1. DE analysis has been run successfully")
message("2. There are significant DEGs to analyze")
message("3. There are enough cells in each response group")
message("4. Expression data is valid and contains sufficient variation")
})
```
## Tumor vs. benign DEGs
Clustergram (or heatmap) of DEGs identified for tumor vs. benign comparison.
```{r DE_tumor_heat, warning=F, message=T}
# Define data to plot
n <- 50
g <- de$mast$tumor$gene[1:n]
jx1 <- which(data.all$proc$Source == names(obj)[3] |
data.all$proc$Malignant %in% 'Yes')[1:(n/2)]
jx2 <- which(data.all$proc$Source == names(obj)[4])[1:(n/2)]
x <- de$data[g, c(jx1, jx2)] %>% as.data.frame(.)
# Specify heatmap arguments
# color palette
col.pal <- colorRampPalette(colors=c('white', 'red'), space='Lab')(100)
# column annotations (response)
col.annot <- data.frame(Malignant=rep(c('Yes', 'No'), each=n/2),
row.names=colnames(x))
# Heatmap
p <- pheatmap(x, cluster_rows=T, cluster_cols=F, scale='none', color=col.pal,
annotation_col=col.annot, cellheight=(500/n), cellwidth=(500/n),
show_rownames=T, show_colnames=F, gaps_col=n/2,
main='DEG heatmap (tumor)')
# Save plot
#tiff('plots/DE_heatmap_tumor.tiff', units='in', width=11, height=8, res=300)
print({p}); dev.off()
```
PCA of DEGs identified for tumor vs. benign comparison.
```{r DE_tumor_PCA, warning=F, message=T}
# Define data to plot
tryCatch({
# Validate MAST results exist
if (!exists("de") || is.null(de$mast$tumor)) {
stop("MAST results not found. Please run the tumor DE analysis first.")
}
# Get significant genes
sig_genes <- rownames(de$mast$tumor)[de$mast$tumor$p_val_adj < 0.05]
if (length(sig_genes) == 0) {
stop("No significant genes found (FDR < 0.05)")
}
message(sprintf("Found %d significant genes", length(sig_genes)))
# Take top N genes by absolute log fold change
n <- min(100, length(sig_genes))
g <- sig_genes[order(abs(de$mast$tumor$avg_log2FC[match(sig_genes, rownames(de$mast$tumor))]),
decreasing = TRUE)][1:n]
# Get tumor/benign cells
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes'
jx2 <- data.all$proc$Source == names(obj)[4]
if (sum(jx1) == 0 || sum(jx2) == 0) {
stop("Insufficient cells in one or both groups")
}
message(sprintf("Found %d cells (%d Tumor, %d Benign)",
sum(jx1) + sum(jx2), sum(jx1), sum(jx2)))
# Subsample cells if too many
max_cells_per_group <- 1000
if (sum(jx1) > max_cells_per_group) {
set.seed(42)
tumor_cells <- sample(which(jx1), max_cells_per_group)
jx1[setdiff(which(jx1), tumor_cells)] <- FALSE
}
if (sum(jx2) > max_cells_per_group) {
set.seed(42)
benign_cells <- sample(which(jx2), max_cells_per_group)
jx2[setdiff(which(jx2), benign_cells)] <- FALSE
}
# Get expression data
x <- de$data[g, c(which(jx1), which(jx2))]
if (!is(x, "dgCMatrix")) {
x <- as(x, "dgCMatrix")
}
x <- as.matrix(x)
# Handle zero variance genes
gene_var <- apply(x, 1, var)
if (any(gene_var == 0)) {
message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
x <- x[gene_var > 0, ]
g <- g[gene_var > 0]
}
# Clean data
x[is.infinite(x)] <- NA
x[is.na(x)] <- 0
# Transpose and convert to data frame
x_t <- t(x)
# Create metadata
m <- data.frame(
Malignant = rep(c('Yes', 'No'), c(sum(jx1), sum(jx2))),
row.names = rownames(x_t)
)
# Apply PCA with validation
if (ncol(x_t) < 2) {
stop("Need at least 2 features for PCA")
}
if (nrow(x_t) < 3) {
stop("Need at least 3 samples for PCA")
}
message("Running PCA...")
pca.tumor <- prcomp(x_t, scale. = TRUE)
# Calculate variance explained
var_explained <- summary(pca.tumor)$importance[2, 1:2] * 100
# Visualize PCA plot with variance explained
p <- autoplot(pca.tumor, data = m, colour = 'Malignant') +
ggtitle('DEG PCA (tumor)') +
xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) +
ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) +
theme_minimal() +
scale_color_manual(values = c('Yes' = '#E41A1C', 'No' = '#377EB8'))
# Save plot
message("Saving plot...")
ggsave('plots/DE_PCA_tumor.pdf', plot = p, width = 8, height = 6, units = 'in')
# Display plot
print(p)
# Return success message
message("PCA visualization completed successfully")
}, error = function(e) {
message(sprintf("Error in PCA analysis: %s", e$message))
message("Please ensure:")
message("1. Tumor DE analysis has been run successfully")
message("2. There are significant DEGs to analyze")
message("3. There are enough cells in each group")
message("4. Expression data is valid and contains sufficient variation")
})
```
# Save results (if prompted)
```{r save_outputs, warning=F, message=T}
#save.data <- dlgInput('Save all results? (T/F)', F)$res %>% as.logical(.)
#if (save.data) {
# # t-test results
# write.xlsx(de$ttest, 'results/DE_ttest.xlsx', row.names=F)
# # MAST results
# write.xlsx(de$mast, 'results/DE_MAST.xlsx', row.names=T)
#}
```