--- a +++ b/notebooks/scRNAseq_analysis.Rmd @@ -0,0 +1,748 @@ +--- +title: "Analysis of scRNA-seq data" +date: "Last modified: `r format(Sys.time(), '%B %d, %Y')`" +tags: [scRNA-seq, seurat, melanoma, immunotherapy, PBMCs] +output: + html_document: + theme: flatly + highlight: zenburn + toc: true + number_sections: false + toc_depth: 2 + toc_float: + collapsed: false + smooth_scroll: true + code_folding: hide + self_contained: yes +--- + +# Project Description + +This project aims to identify therapeutic targets for melanoma patients by following a 2-phase approach: + +- Phase I: Analyze scRNA-seq data to identify DE genes and infer enriched pathways\ +- Phase II: Compare responders vs. non-responders to identify therapeutic targets + +## Dataset Description + +This analysis integrates five complementary single-cell RNA sequencing datasets: + +1. Dataset 1 (Treatment Response Dataset) + - Contains treatment response data + - Includes responder/non-responder classifications + - Focus on immunotherapy outcomes + +2. Dataset 2 (Tumor Cell Dataset) + - Characterizes tumor cell heterogeneity + - Includes cell type annotations + - Focus on tumor cell populations + +3. Dataset 3 (Tumor Microenvironment Dataset) + - Maps the tumor microenvironment + - Contains malignant/non-malignant classifications + - Includes detailed immune cell typing + +4. Dataset 4 (Control Melanocyte Dataset) + - Normal melanocyte control data + - Reference for non-malignant state + - Baseline expression profiles + +5. Dataset 5 (Control Immune Dataset) + - Normal immune cell control data + - Reference for immune cell states + - Baseline immune profiles + +## Strategy + +Use `seurat` and `harmony` to complete the following tasks: + +1. Load/process/combine data\ +2. Clustering and biomarker determination\ +3. Comparisons: disease vs. control + +# Initialize environment + +Install required packages. + +```{r install_packages, results="hide", message=F, warning=F, error=F} + +# Define packages to install +pkg.list = c('svDialogs', 'vroom', 'dplyr', 'DT', 'Seurat', 'harmony', + 'patchwork', 'ggplot2', 'ggrepel', 'openxlsx', 'progress') + +# Define packages not already installed +pkg.install <- pkg.list[!(pkg.list %in% installed.packages()[, 'Package'])] + +# Install uninstalled packages +if (length(pkg.install) > 0) { + install.packages(pkg.install) +} + +``` + +Load installed packages. + +```{r load_packages, results="hide", message=F, warning=F, error=F} + +# Load packages +library(svDialogs) # for prompting user-input +library(vroom) # for quickly reading data +library(dplyr) # for data processing +library(DT) # to display datatables +library(Seurat) # for scRNA-seq analysis +library(harmony) # to integration scRNA-seq data +library(patchwork) # for combining plots +library(ggplot2) # for data visualization +library(ggrepel) # to use geom_pont_repel() +library(openxlsx) # to write data to excel +library(progress) # to display progress bar + +``` + +Define custom functions. + +```{r custom_functions, results="hide", message=F, warning=F, error=F} + +# qc_filter(): filter data based on QC for scRNA-seq data +qc_filter <- function(obj, feat.t=c(200, 2500), pct.mt.t=5, var.method='vst', + feat.n=2000, qc.plot=T, top.n=10, title='') { + + ############################ FUNCTION DESCRIPTION ############################ + # feat.t = lower and upper limits on unique gene counts + # pct.mt.t = threshold of level in mitochondrial contamination + # var.method = method for selecting highly variable genes + # feat.n = number of variable genes to select + # qc.plot = boolean whether to generate plots to decide downstream analyses + # title = string to use for plot title + ############################## BEGIN FUNCTION ################################ + + # determine percentage of mitochondrial contamination + obj[['pct.mt']] <- PercentageFeatureSet(obj, pattern='^MT-') + # filter + nomalize + scale data + obj <- obj %>% + subset(., subset=(nFeature_RNA > feat.t[1]) & (nFeature_RNA < feat.t[2]) & + (pct.mt < pct.mt.t)) %>% NormalizeData(.) %>% + FindVariableFeatures(., selection.method=var.method) %>% + ScaleData(.) %>% RunPCA(., features=VariableFeatures(object=.)) + # generate follow-up QC plots (if prompted) + if (qc.plot) { + p1 <- VariableFeaturePlot(obj) %>% + LabelPoints(plot=., points=head(VariableFeatures(obj), top.n), repel=T) + p2 <- ElbowPlot(obj) + plot(p1 + p2 + plot_annotation(title=title)) + } + # return output + return(obj) +} + +# de_analyze(): conduct differential expression (DE) analysis +de_analyze <- function(m1, m2, alt='two.sided', paired=F, var.equal=F, + adj.method='bonferroni', t=0.05, de.plot=F, title='') { + + ############################ FUNCTION DESCRIPTION ############################ + # m1, m2 = expression matrices to compare + # alt, paired, var.equal = arguments for t.test() function + # adj.method = method for calculating adjusted p-value + # t = threshold for significance + # de.plot = boolean whether to generate a volcano plot + ############################## BEGIN FUNCTION ################################ + + # make sure two matrices have same number of rows + if (nrow(m1) != nrow(m2)) { + stop('Row length does not match between the provided matrices.') + } + # make sure gene names align between matrices + if (!all(rownames(m1) == rownames(m2))) { + stop('Gene names do not align between provided matrices.') + } + # instantiate output variable + results <- data.frame(gene=rownames(m1), + t.stat=vector(mode='numeric', length=nrow(m1)), + p.val=vector(mode='numeric', length=nrow(m1))) + # conduct unpaired t-test with unequal variance for each gene + pb <- progress_bar$new( + format=' analyzing [:bar] :percent time left: :eta', total=nrow(m1)) + for (i in 1:nrow(m1)) { + pb$tick() + x <- m1[i, ]; y <- m2[i, ] + r <- t.test(x, y, alternative=alt, paired=paired, var.equal=var.equal) + results$t.stat[i] <- r$statistic + results$p.val[i] <- r$p.value + } + # determine adjusted p-values + results$q.val <- p.adjust(results$p.val, method=adj.method) + # add additional fields + results <- results %>% + mutate(Significance=case_when(q.val < t & t.stat > 0 ~ 'Up', + q.val < t & t.stat < 0 ~ 'Down', + T ~ 'NS')) %>% arrange(q.val) + # generate volcano plot (if prompted) + if (de.plot) { + p <- results %>% arrange(t.stat) %>% ggplot(data=., + aes(x=t.stat, y=-log10(q.val), col=Significance, label=gene)) + + geom_point() + geom_text_repel() + theme_minimal() + + scale_color_manual(values=c('blue', 'black', 'red')) + ggtitle(title) + plot(p) + } + # return output + return(results) +} + +``` + +Additional settings. + +```{r settings} + +# Adjust system settings +Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2) + +# Save plots? (default: F) +save.plots <- dlgInput('Save all outputs? (T/F)', F)$res + +``` + +# Tasks + +## 1. Load/process/combine data + +**Description**: load all scRNA-seq files. + +```{r load_data, warning=F, message=F} +setwd("..") +# Load workspace (if it exists) +f <- list.files() +if (any(endsWith(f, '.RData'))) { + load(f[endsWith(f, '.RData')][1]) +} +``` + +```{r load_data, warning=F, message=F} +##################### ONLY RUN IF THERE'S NO WORKSPACE ######################### +# # Load disease datasets +# obj <- list(); meta <- list() +# # Dataset 1: Treatment response dataset +# meta$dataset1 <- read.delim(file='data/dataset1/metadata.txt') +# x <- vroom('data/dataset1/counts.txt', col_names=F) +# x <- x[, 1:nrow(meta$dataset1)] %>% as.matrix(.) +# y <- read.delim('data/dataset1/genes.txt', header=F) +# rownames(x) <- make.unique(y$V1) +# colnames(x) <- meta$dataset1$ID +# obj$dataset1 <- CreateSeuratObject(x, project='dataset1', +# min.cells=3, min.features=200) +# obj$dataset1$Treatment <- as.factor(meta$dataset1$Treatment) +# obj$dataset1$Response <- as.factor(meta$dataset1$Response) +# rm(x, y); gc() +# +# # Dataset 2: Tumor cell dataset +# meta$dataset2 <- read.csv('data/dataset2/metadata.csv') +# x <- vroom('data/dataset2/expression_matrix.csv') +# y <- as.matrix(x[, -1]); rownames(y) <- x$...1 +# obj$dataset2 <- CreateSeuratObject(y, project='dataset2', +# min.cells=3, min.features=200) +# obj$dataset2$type <- as.factor(meta$dataset2$cell.types) +# +# # Dataset 3: Tumor microenvironment dataset +# x <- vroom('data/dataset3/expression_matrix.txt') +# y <- as.matrix(x[-c(1:3), -1]); rownames(y) <- x$Cell[-c(1:3)] +# obj$dataset3 <- CreateSeuratObject(y, project='dataset3', +# min.cells=3, min.features=200) +# meta$dataset3 <- data.frame(Tumor=t(x[1, -1]), +# Malignant=case_when(x[2, -1] == 1 ~ 'No', +# x[2, -1] == 2 ~ 'Yes', +# x[2, -1] == 0 ~ 'Unresolved'), +# NMType=case_when(x[3, -1] == 1 ~ 'T', +# x[3, -1] == 2 ~ 'B', +# x[3, -1] == 3 ~ 'Macro', +# x[3, -1] == 4 ~ 'Endo', +# x[3, -1] == 5 ~ 'CAF', +# x[3, -1] == 6 ~ 'NK', +# x[3, -1] == 0 ~ 'NA')) +# obj$dataset3$Malignant <- meta$dataset3$Malignant +# obj$dataset3$NMType <- meta$dataset3$NMType +# +# # Load control datasets +# # Dataset 4: Control melanocyte dataset +# x <- vroom('data/dataset4/expression_matrix.csv') +# y <- as.matrix(x[, -1]); rownames(y) <- x$...1 +# obj$dataset4 <- CreateSeuratObject(y, project='dataset4', +# min.cells=3, min.features=200) +# +# # Dataset 5: Control immune dataset +# meta$dataset5 <- read.table('data/dataset5/metadata.tsv', +# sep='\t', header=T) +# x <- vroom('data/dataset5/matrix.tsv') +# y <- as.matrix(x[, -1]); rownames(y) <- x$Gene.Name +# obj$dataset5 <- CreateSeuratObject(y, project='dataset5', +# min.cells=3, min.features=200) +# obj$dataset5$sample <- as.factor(meta$dataset5$sample) +# rm(f, x, y, meta); gc() + +``` + +*Description*: Combine all datasets together into single Seurat object. Also apply `harmony` to remove clustering bias based on dataset source. + +```{r combine_data, warning=F, message=F} +file_path <- "/Users/scampit/Projects/torchstack/cancer-biomarker-discovery" +setwd(file_path) + +# Combine all datasets +if (!exists('data.all')) { + data.all <- list() +} +if (!('raw' %in% names(data.all))) { + data.all$raw <- merge(obj$dataset1, obj$dataset2) %>% + merge(., obj$dataset3) %>% merge(., obj$dataset4) %>% merge(., obj$dataset5) +} + +# Add source information +if (!('Source' %in% names(data.all$raw@meta.data))) { + data.all$raw$Source <- c(rep(names(obj)[1], ncol(obj$dataset1)), + rep(names(obj)[2], ncol(obj$dataset2)), + rep(names(obj)[3], ncol(obj$dataset3)), + rep(names(obj)[4], ncol(obj$dataset4)), + rep(names(obj)[5], ncol(obj$dataset5))) +} + +# Apply QC +if (!('proc' %in% names(data.all))) { + data.all$proc <- qc_filter(data.all$raw) +} + +# Visualize integration results +p1 <- DimPlot(object=data.all$proc, reduction='pca', pt.size=0.1, + group.by='Source') +p2 <- VlnPlot(object=data.all$proc, features='PC_1', pt.size=0.1, + group.by='Source') +p1 + p2 +if (save.plots) { + ggsave('plots/QC_no_harmony.pdf', width=10, height=5, units='in') +} + +# Apply harmony (to remove clustering based on dataset source) +if (!('harmony' %in% names(data.all$proc@reductions))) { + data.all$proc <- data.all$proc %>% RunHarmony('Source', plot_convergence=T) +} + +# Visualize integration results (after harmony) +p1 <- DimPlot(object=data.all$proc, reduction='harmony', pt.size=0.1, + group.by='Source') +p2 <- VlnPlot(object=data.all$proc, features='harmony_1', pt.size=0.1, + group.by='Source') +p1 + p2 +if (save.plots) { + ggsave('plots/QC_w_harmony.pdf', width=10, height=5, units='in') +} + +``` + +## 2. Clustering and biomarker determination + +**Description**: cluster cells based on UMAP and determine biomarkers based on cluster assignment. + +### Clustering + +```{r clustering, warning=F, message=T} + +# Visualize UMAP plots (runtime: ~5 minutes) +if (!('umap' %in% names(data.all$proc@reductions))) { + data.all$proc <- data.all$proc %>% + RunUMAP(reduction='harmony', dims=1:20) %>% + FindNeighbors(reduction='harmony', dims=1:20) %>% + FindClusters(resolution=0.5) %>% identity() +} + +# by dataset source +p <- DimPlot(data.all$proc, reduction='umap', group.by='Source', pt.size=0.1, + split.by='Source') + ggtitle('UMAP split by dataset source'); plot(p) +# by cluster (unlabeled) +p <- DimPlot(data.all$proc, reduction='umap', label=T, pt.size=0.1) + + ggtitle('UMAP of combined scRNA-seq data (unlabeled)'); plot(p) +if (save.plots) { + ggsave('plots/UMAP_unlabeled.pdf', width=10, height=5, units='in') +} + +# by cluster (labeled) +lab <- c('Tumor cell', 'Technical error 1', 'Cytotoxic CD8 T cell 1', 'B cell', + 'Melanocytes 1', 'Macrophage', 'Cytotoxic CD8 T cell 2', 'Technical error 2', + 'Memory T cell', 'Melanocytes 2', 'Dysfunctional CD8 T cell', + 'Cytotoxic CD8 T cell 3', '? 1', 'Melanocytes 3', 'Activated cell', + 'Exhausted T cell', 'Cytotoxic T cell', '? 2', '? 3', 'Melanocytes 4', '? 4') +names(lab) <- levels(data.all$proc) +plot.data <- data.all$proc %>% RenameIdents(., lab) +p <- DimPlot(plot.data, reduction='umap', label=T, pt.size=0.1) + + ggtitle('UMAP of combined scRNA-seq data (labeled)'); plot(p) +if (save.plots) { + ggsave('plots/UMAP_labeled.pdf', width=10, height=5, units='in') +} + +``` + +### Biomarkers + +```{r biomarkers, warning=F, message=T} + +# Find all biomarkers based on clustering (runtime: ~30 minutes) +if (!('bm' %in% names(data.all))) { + data.all$bm <- FindAllMarkers(data.all$proc, min.pct=0.25, logfc.threshold=0.25) +} + +# View table of top 3 biomarkers for each cluster +d <- data.all$bm %>% group_by(cluster) %>% slice_max(n=3, order_by=avg_log2FC) +datatable(d) + +# Visualize ridge plots based on biomarkers of interest +ridge.feat <- dlg_list(sort(unique(d$gene)), multiple=T)$res +p <- RidgePlot(data.all$proc, features=ridge.feat, + ncol=ceiling(length(ridge.feat) / 2)); plot(p) +if (save.plots) { + ggsave('plots/biomarker_ridge_plots.pdf', width=10, height=10, units='in') +} + +``` + +## 3. Comparisons: disease vs. control + +**Description**: determine differentially expressed genes (DEGs) between disease vs. control groups. + +### Case: tumor + +```{r DE_tumor, warning=F, message=T} + +# Instantiate DE variable +if (!exists('de')) { + de <- list() +} +if (!('data' %in% names(de))) { + de$data <- GetAssayData(object=data.all$proc, slot='data') +} + +# Define datasets to compare +ix <- intersect(rownames(obj$dataset3), rownames(obj$dataset2)) %>% + intersect(., rownames(obj$dataset4)) +jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes' +jx2 <- data.all$proc$Source == names(obj)[4] +x <- de$data[ix, jx1]; y <- de$data[ix, jx2] + +# DE analysis (runtime: ~15 minutes) +if (!('tumor' %in% names(de))) { + de$tumor <- de_analyze(x, y) %>% na.omit +} + +# Visualize dot plot +dot.feat <- dlg_list(de$tumor$gene[de$tumor$q.val < 0.05], multiple=T)$res +p.data <- data.all$proc +Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA')) +p <- p.data %>% + DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + + ggtitle('Case: tumor') +plot(p) +if (save.plots) { + ggsave('plots/DE_tumor.pdf', width=10, height=5, units='in') +} + +``` + +### Case: immune cells (bulk) + +```{r DE_immune_bulk,warning=F, message=T} + +# Define datasets to compare +ix <- intersect(rownames(obj$dataset1), rownames(obj$dataset2)) %>% + intersect(., rownames(obj$dataset5)) +jx1 <- data.all$proc$Source == names(obj)[1] | data.all$proc$Malignant %in% 'No' +jx2 <- data.all$proc$Source == names(obj)[5] +x <- de$data[ix, jx1]; y <- de$data[ix, jx2] + +# DE analysis (runtime: ~20 minutes) +if (!('immune' %in% names(de))) { + de$immune <- de_analyze(x, y) %>% na.omit() +} + +# Visualize dot plot +dot.feat <- dlg_list(de$immune$gene[de$immune$q.val < 0.05], multiple=T)$res +p.data <- data.all$proc +Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA')) +p <- p.data %>% + DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + + ggtitle('Case: immune cells') +plot(p) +if (save.plots) { + ggsave('plots/DE_immune_bulk.pdf', width=10, height=5, units='in') +} + +``` + +### Case: immune cells (cluster-based) + +```{r DE_immune_cluster, warning=F, message=T} + +# Define datasets to compare (runtime: ~5 minutes) +immune.cells <- c('B', 'CD8', 'Macrophage', 'Memory', 'Activated') +for (i in 1:length(immune.cells)) { + # Define row + column indices + ix <- intersect(rownames(obj$dataset1), rownames(obj$dataset2)) %>% + intersect(., rownames(obj$dataset5)) + c <- grepl(immune.cells[i], Idents(plot.data)) + jx1 <- (plot.data$Source == names(obj)[1] | + data.all$proc$Malignant %in% 'No') & c + jx2 <- data.all$proc$Source == names(obj)[5] & c + x <- de$data[ix, jx1]; y <- de$data[ix, jx2] + # DE analysis + if (!(immune.cells[i] %in% names(de))) { + de[[immune.cells[i]]] <- de_analyze(x, y) %>% na.omit() + } + # Visualize dot plot + x <- de[[immune.cells[[i]]]] + dot.feat <- x$gene[x$q.val < 0.05][1:5] + p.data <- data.all$proc + Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA')) + p <- p.data %>% + DotPlot(object=., features=dot.feat, idents=c('Disease', 'Control')) + + ggtitle(paste0('Case: ', immune.cells[i])) + plot(p) + if (save.plots) { + ggsave(paste0('plots/DE_', immune.cells[i], '.pdf'), + width=10, height=5, units='in') + } +} + +``` + +# Case: responder vs. non-responder +ix <- rownames(obj$dataset1) +jx1 <- data.all$proc$Response %in% 'Responder' +jx2 <- data.all$proc$Response %in% 'Non-responder' + +# Save outputs and working space (if prompted) + +``` + +```{r glycolysis_visualization, warning=F, message=T} +# Define key glycolytic genes +glycolysis_genes <- c( + "HK1", "HK2", "GPI", "PFKL", "PFKM", "PFKP", # Glucose uptake and early glycolysis + "ALDOA", "ALDOB", "ALDOC", # Aldolase isoforms + "TPI1", # Triose phosphate isomerase + "GAPDH", # Glyceraldehyde phosphate dehydrogenase + "PGK1", # Phosphoglycerate kinase + "PGAM1", # Phosphoglycerate mutase + "ENO1", "ENO2", # Enolase isoforms + "PKM", "PKLR", # Pyruvate kinase isoforms + "LDHA", "LDHB", # Lactate dehydrogenase + "SLC2A1", "SLC2A3" # Glucose transporters (GLUT1, GLUT3) +) + +# Filter for genes present in the dataset +glycolysis_genes_present <- intersect(glycolysis_genes, rownames(de$data)) + +# Create anonymous gene IDs +anon_genes <- paste0("gene_", seq_along(glycolysis_genes_present)) +names(anon_genes) <- glycolysis_genes_present + +# Create mapping dataframe for reference +gene_mapping <- data.frame( + Original_Gene = glycolysis_genes_present, + Anonymous_ID = anon_genes[glycolysis_genes_present] +) + +# Write mapping to file if saving is enabled +if (save.plots) { + write.csv(gene_mapping, 'results/glycolysis_gene_mapping.csv', row.names = FALSE) +} + +# Create dot plot for glycolytic genes +p.data <- data.all$proc +Idents(p.data) <- as.factor(case_when(jx1 ~ 'Disease', jx2 ~ 'Control', T ~ 'NA')) + +# Generate dot plot with anonymous IDs and custom colors +p_glycolysis <- DotPlot(p.data, + features = glycolysis_genes_present, + idents = c('Disease', 'Control')) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + xlab('Genes') + + ylab('Condition') + + scale_color_gradient2(low = "navy", mid = "gray90", high = "red", + midpoint = 0, name = "Average\nExpression") + + scale_x_discrete(labels = anon_genes[glycolysis_genes_present]) + +# Display plot +print(p_glycolysis) + +# Save plot if requested +if (save.plots) { + ggsave('plots/DE_tumor_glycolysis.pdf', + plot = p_glycolysis, + width = 12, + height = 6, + units = 'in') +} + +# Calculate statistics for glycolytic genes +glycolysis_stats <- de$tumor[de$tumor$gene %in% glycolysis_genes_present, ] +glycolysis_stats <- glycolysis_stats[order(glycolysis_stats$q.val), ] + +# Add anonymous IDs to statistics +glycolysis_stats$anonymous_id <- anon_genes[glycolysis_stats$gene] + +# Display statistics with anonymous IDs +print("Differential Expression Statistics for De-identified Glycolytic Genes:") +glycolysis_stats$gene <- glycolysis_stats$anonymous_id +print(glycolysis_stats) + +# Count significantly different genes +sig_glyco_genes <- sum(glycolysis_stats$q.val < 0.05) +print(sprintf("Number of significantly different glycolytic genes (q < 0.05): %d", sig_glyco_genes)) +``` + +```{r glycolysis_boxplots, warning=F, message=T} +# Get expression data for glycolytic genes +expr_data <- GetAssayData(p.data, slot = "data")[glycolysis_genes_present,] + +# Filter out genes with all zeros +nonzero_genes <- rownames(expr_data)[rowSums(expr_data > 0) > 0] +expr_data <- expr_data[nonzero_genes,] + +# Update anonymous gene IDs for non-zero genes +anon_genes <- anon_genes[nonzero_genes] + +target_genes <- c("g10", "g21", "g03") +selected_genes <- names(anon_genes)[anon_genes %in% target_genes] + +# Create a data frame for plotting and store statistical results +plot_data <- data.frame() +stat_results <- data.frame( + Gene = character(), + P_value = numeric(), + T_statistic = numeric(), + Mean_diff = numeric(), + stringsAsFactors = FALSE +) + +for (gene in selected_genes) { + # Get expression values for current gene + expr_values <- expr_data[gene,] + + # Split values by condition before centering + disease_vals <- expr_values[colnames(expr_data) %in% colnames(de$data)[jx1]] + control_vals <- expr_values[colnames(expr_data) %in% colnames(de$data)[jx2]] + + # Perform t-test on raw values + t_test_result <- t.test(disease_vals, control_vals) + + # Center the expression values by subtracting the overall mean + overall_mean <- mean(expr_values) + disease_vals_centered <- disease_vals - overall_mean + control_vals_centered <- control_vals - overall_mean + + # Store statistical results + stat_results <- rbind(stat_results, data.frame( + Gene = anon_genes[gene], + P_value = t_test_result$p.value, + T_statistic = t_test_result$statistic, + Mean_diff = mean(disease_vals) - mean(control_vals), + Mean_centered_diff = mean(disease_vals_centered) - mean(control_vals_centered), + stringsAsFactors = FALSE + )) + + # Create temporary data frames for each condition with centered values + temp_df <- data.frame( + Expression = c(disease_vals_centered, control_vals_centered), + Gene = anon_genes[gene], + Condition = c(rep("Disease", length(disease_vals_centered)), + rep("Control", length(control_vals_centered))) + ) + + # Append to main data frame + plot_data <- rbind(plot_data, temp_df) +} + +# Add FDR correction +stat_results$FDR <- p.adjust(stat_results$P_value, method = "BH") + +# Sort results by p-value +stat_results <- stat_results[order(stat_results$P_value), ] + +# Create boxplot with swarm plot +p_boxplot <- ggplot(plot_data, aes(x = Gene, y = Expression, fill = Condition)) + + # Add swarm plot layer first (so it appears behind) + geom_jitter(aes(color = Condition), position = position_jitterdodge(jitter.width = 0.2), + size = 0.3, alpha = 0.3) + + # Add boxplot layer second (so it appears in front) + geom_boxplot(outlier.shape = NA, alpha = 0.4, width = 0.7) + + # Colors for boxplot fill and point colors + scale_fill_manual(values = c("Disease" = "#E41A1C", "Control" = "#377EB8")) + + scale_color_manual(values = c("Disease" = "#E41A1C", "Control" = "#377EB8")) + + # Set y-axis limits + ylim(-2, max(plot_data$Expression)) + + # Theme customization + theme_minimal() + + theme( + axis.text.x = element_text(angle = 45, hjust = 1), + panel.grid.major.y = element_blank(), # Remove y-axis major grid lines + panel.grid.minor.y = element_blank(), # Remove y-axis minor grid lines + panel.grid.major.x = element_line(color = "gray90"), # Keep x-axis major grid lines + panel.grid.minor.x = element_blank(), # Remove x-axis minor grid lines + legend.position = c(0.95, 0.95), # Position legend inside top-right + legend.justification = c(1, 1), # Anchor point for legend + legend.box.just = "right", + legend.margin = margin(6, 6, 6, 6), + legend.background = element_rect(fill = "white", color = NA), + legend.box.background = element_rect(color = "gray90") + ) + + # Labels + xlab("Genes") + + ylab("Centered Expression") + + ggtitle("Gene Expression Relative to Mean") + + # Add horizontal line at y=0 + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) + +# Display plot +print(p_boxplot) + +# Save plot if requested +if (save.plots) { + ggsave('plots/DE_tumor_glycolysis_boxplots.pdf', + plot = p_boxplot, + width = 12, + height = 6, + units = 'in') +} + +# Calculate summary statistics for centered data +summary_stats <- plot_data %>% + group_by(Gene, Condition) %>% + summarise( + Mean = mean(Expression), + Median = median(Expression), + SD = sd(Expression), + Q1 = quantile(Expression, 0.25), + Q3 = quantile(Expression, 0.75), + n = n(), + .groups = 'drop' + ) + +# Display summary statistics +print("Summary Statistics for Centered Expression by Condition:") +print(summary_stats) + +# Display statistical test results +print("\nStatistical Test Results (Disease vs Control):") +print(stat_results) + +# Print number of genes removed due to zero expression +n_removed <- length(glycolysis_genes_present) - length(nonzero_genes) +if (n_removed > 0) { + print(sprintf("\nRemoved %d genes with zero expression", n_removed)) + print("Removed genes:") + print(setdiff(glycolysis_genes_present, nonzero_genes)) +} + +# Print number of significantly different genes +sig_genes <- sum(stat_results$FDR < 0.05) +if (sig_genes > 0) { + print(sprintf("\nNumber of significantly different genes (FDR < 0.05): %d", sig_genes)) + print("Significant genes:") + print(stat_results[stat_results$FDR < 0.05, ]) +} +```