--- a +++ b/.Rhistory @@ -0,0 +1,512 @@ +data = cbind(as.data.frame(response_pca$x[,1:2]), m), +FUN = mean) +# Create enhanced PCA plot +p <- ggplot(data = as.data.frame(response_pca$x[,1:2]), +aes(x = PC1, y = PC2, color = m$Response)) + +# Add points with reduced alpha for transparency +geom_point(alpha = 0.3, size = 2) + +# Add confidence ellipses +stat_ellipse(level = 0.95, size = 1) + +# Add centroids +geom_point(data = centroids, aes(x = PC1, y = PC2), +size = 5, shape = 18) + +# Add centroid labels +geom_text(data = centroids, +aes(x = PC1, y = PC2, label = Response), +vjust = -1.5, size = 4, color = "black") + +# Customize appearance +ggtitle('De-identified DEG PCA with Group Trends') + +xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) + +ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) + +theme_minimal() + +scale_color_manual(values = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')) + +theme( +legend.position = "bottom", +plot.title = element_text(hjust = 0.5, size = 14), +axis.title = element_text(size = 12), +legend.title = element_text(size = 12), +legend.text = element_text(size = 10) +) +# Save plot +message("Saving plot...") +ggsave('plots/DE_PCA_response_collapsed.pdf', plot = p, width = 8, height = 8, units = 'in') +# Display plot +print(p) +message("PCA visualization completed successfully") +}, error = function(e) { +message(sprintf("Error in PCA analysis: %s", e$message)) +message("Please ensure:") +message("1. DE analysis has been run successfully") +message("2. There are significant DEGs to analyze") +message("3. There are enough cells in each response group") +message("4. Expression data is valid and contains sufficient variation") +}) +# Modify DE_response_heat chunk +tryCatch({ +# Get significant genes +if (!exists("de") || is.null(de$mast$response)) { +stop("MAST results not found. Please run the DE analysis first.") +} +sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05] +if (length(sig_genes) == 0) { +stop("No significant genes found (FDR < 0.05)") +} +message(sprintf("Found %d significant genes", length(sig_genes))) +# Take top N genes by absolute log fold change +n <- min(50, length(sig_genes)) +g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]), +decreasing = TRUE)][1:n] +# Create anonymous gene IDs +anon_genes <- anonymize_genes(g) +# Get all cells +jx1 <- which(data.all$proc$Response %in% 'Responder') +jx2 <- which(data.all$proc$Response %in% 'Non-responder') +# Get expression data +x <- de$data[g, c(jx1, jx2)] +if (!is(x, "dgCMatrix")) { +x <- as(x, "dgCMatrix") +} +x <- as.matrix(x) +# Replace gene names with anonymous IDs +rownames(x) <- anon_genes[rownames(x)] +# Calculate mean expression for each group +x_resp <- rowMeans(x[, 1:length(jx1), drop=FALSE]) +x_nonresp <- rowMeans(x[, (length(jx1)+1):ncol(x), drop=FALSE]) +# Combine into matrix +x_means <- cbind(x_resp, x_nonresp) +colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)") +# Calculate standard errors for error bars +x_resp_se <- apply(x[, 1:length(jx1), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) +x_nonresp_se <- apply(x[, (length(jx1)+1):ncol(x), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) +# Add SE information to rownames +rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)", +rownames(x_means), +x_resp_se, +x_nonresp_se) +# Scale data +x_scaled <- t(scale(t(x_means))) +# Create annotation +col.annot <- data.frame( +Group = c('Responder', 'Non-responder'), +row.names = colnames(x_means) +) +# Color palette +col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100) +# Create heatmap +message("Generating heatmap...") +p <- pheatmap( +x_scaled, +cluster_rows = TRUE, +cluster_cols = FALSE, +scale = "none", # already scaled above +color = col.pal, +annotation_col = col.annot, +show_rownames = TRUE, +show_colnames = TRUE, +main = sprintf('Top %d De-identified DEGs (Mean Expression)', nrow(x_scaled)), +fontsize_row = 8, +annotation_colors = list( +Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8') +) +) +# Save plot +message("Saving plot...") +pdf('plots/DE_heatmap_response_collapsed.pdf', width = 11, height = 8) +print(p) +dev.off() +# Save gene ID mapping +write.csv(data.frame( +Anonymous_ID = names(anon_genes), +Original_Gene = anon_genes, +Mean_Responder = x_resp, +SE_Responder = x_resp_se, +Mean_NonResponder = x_nonresp, +SE_NonResponder = x_nonresp_se +), './response_gene_mapping_with_stats.csv', row.names = FALSE) +message("Heatmap generated successfully") +message("Gene mapping with statistics saved to results/response_gene_mapping_with_stats.csv") +}, error = function(e) { +message(sprintf("Error generating heatmap: %s", e$message)) +}) +# Show individual cell heatmap alongside collapsed view +tryCatch({ +# Get significant genes +if (!exists("de") || is.null(de$mast$response)) { +stop("MAST results not found. Please run the DE analysis first.") +} +sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05] +if (length(sig_genes) == 0) { +stop("No significant genes found (FDR < 0.05)") +} +message(sprintf("Found %d significant genes", length(sig_genes))) +# Take top N genes by absolute log fold change +n <- min(50, length(sig_genes)) +g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]), +decreasing = TRUE)][1:n] +# Create anonymous gene IDs +anon_genes <- anonymize_genes(g) +# Get cells (subsample to make visualization manageable) +jx1 <- which(data.all$proc$Response %in% 'Responder') +jx2 <- which(data.all$proc$Response %in% 'Non-responder') +# Subsample cells +cells_per_group <- min(25, min(length(jx1), length(jx2))) +set.seed(42) +sampled_jx1 <- sample(jx1, cells_per_group) +sampled_jx2 <- sample(jx2, cells_per_group) +# Get expression data for individual cells +x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)] +if (!is(x_individual, "dgCMatrix")) { +x_individual <- as(x_individual, "dgCMatrix") +} +x_individual <- as.matrix(x_individual) +# Clean individual cell data +message("Cleaning individual cell data...") +# Replace Inf/-Inf with NA +x_individual[is.infinite(x_individual)] <- NA +# Remove rows (genes) with any NA values +complete_rows <- complete.cases(x_individual) +if (!all(complete_rows)) { +message(sprintf("Removing %d genes with NA values", sum(!complete_rows))) +x_individual <- x_individual[complete_rows, ] +g <- g[complete_rows] +anon_genes <- anon_genes[complete_rows] +} +# Check for zero variance genes +gene_var <- apply(x_individual, 1, var) +if (any(gene_var == 0)) { +message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0))) +x_individual <- x_individual[gene_var > 0, ] +g <- g[gene_var > 0] +anon_genes <- anon_genes[gene_var > 0] +} +# Replace gene names with anonymous IDs +rownames(x_individual) <- anon_genes[rownames(x_individual)] +# Scale individual cell data with validation +message("Scaling individual cell data...") +x_individual_scaled <- t(scale(t(x_individual))) +# Check for NA/Inf values after scaling +if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) { +message("Found NA/Inf values after scaling, capping extreme values at ±10") +x_individual_scaled[x_individual_scaled > 10] <- 10 +x_individual_scaled[x_individual_scaled < -10] <- -10 +x_individual_scaled[is.na(x_individual_scaled)] <- 0 +} +# Create annotation for individual cells +col.annot.individual <- data.frame( +Response = rep(c('Responder', 'Non-responder'), each = cells_per_group), +row.names = colnames(x_individual) +) +# Get mean expression data (for side-by-side comparison) +x_resp <- rowMeans(de$data[g, jx1, drop=FALSE]) +x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE]) +x_means <- cbind(x_resp, x_nonresp) +colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)") +# Calculate standard errors +x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) +x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) +# Clean mean data +message("Cleaning mean data...") +x_means[is.infinite(x_means)] <- NA +complete_rows_means <- complete.cases(x_means) +if (!all(complete_rows_means)) { +message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means))) +x_means <- x_means[complete_rows_means, ] +x_resp_se <- x_resp_se[complete_rows_means] +x_nonresp_se <- x_nonresp_se[complete_rows_means] +} +# Add SE information to rownames for mean data +rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)", +anon_genes[rownames(x_means)], +x_resp_se, +x_nonresp_se) +# Scale mean data with validation +message("Scaling mean data...") +x_means_scaled <- t(scale(t(x_means))) +if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) { +message("Found NA/Inf values after scaling means, capping extreme values at ±10") +x_means_scaled[x_means_scaled > 10] <- 10 +x_means_scaled[x_means_scaled < -10] <- -10 +x_means_scaled[is.na(x_means_scaled)] <- 0 +} +# Create annotation for means +col.annot.means <- data.frame( +Group = c('Responder', 'Non-responder'), +row.names = colnames(x_means) +) +# Color palette +col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100) +message("Generating plots...") +# Set up PDF device for both plots +pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8) +# Create layout for side-by-side plots +layout(matrix(1:2, ncol = 2)) +# Plot 1: Individual cells +pheatmap( +x_individual_scaled, +cluster_rows = TRUE, +cluster_cols = FALSE, +scale = "none", +color = col.pal, +annotation_col = col.annot.individual, +show_rownames = TRUE, +show_colnames = FALSE, +gaps_col = cells_per_group, +main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group), +fontsize_row = 8, +annotation_colors = list( +Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8') +), +legend = FALSE, # Hide colorbar +treeheight_row = 0, # Hide row dendrogram +cellwidth = 10, # Force square cells +cellheight = 10 # Force square cells +) +# Plot 2: Mean expression +pheatmap( +x_means_scaled, +cluster_rows = TRUE, +cluster_cols = FALSE, +scale = "none", +color = col.pal, +annotation_col = col.annot.means, +show_rownames = TRUE, +show_colnames = TRUE, +main = 'Group Means\n(all cells)', +fontsize_row = 8, +annotation_colors = list( +Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8') +), +legend = FALSE, # Hide colorbar +treeheight_row = 0, # Hide row dendrogram +cellwidth = 20, # Force square cells (larger since fewer columns) +cellheight = 20, # Force square cells (larger since fewer columns) +labels_row = sub("\n.*", "", rownames(x_means)) # Only show gene IDs without SE info +) +dev.off() +# Save detailed statistics +write.csv(data.frame( +Anonymous_ID = names(anon_genes), +Original_Gene = anon_genes, +Mean_Responder = x_resp, +SE_Responder = x_resp_se, +Mean_NonResponder = x_nonresp, +SE_NonResponder = x_nonresp_se, +N_Responder = length(jx1), +N_NonResponder = length(jx2) +), 'results/response_gene_mapping_detailed.csv', row.names = FALSE) +message("Side-by-side heatmaps generated successfully") +message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv") +}, error = function(e) { +message(sprintf("Error generating heatmaps: %s", e$message)) +message("\nDebugging information:") +message("1. Check if expression data contains extreme values") +message("2. Verify gene filtering and scaling steps") +message("3. Ensure all matrices have proper dimensions") +message("4. Look for NA/Inf values in the data") +}) +# Memory management settings +options(future.globals.maxSize = 16000 * 1024^2) # Set maximum global size to 8GB +# Load core packages first +library(Matrix) # for sparse matrix operations +library(Seurat) # for scRNA-seq analysis +library(MAST) # for scRNAseq DEG analysis +# Load other packages +library(svDialogs) # for prompting user-input +library(vroom) # for quickly reading data +library(dplyr) # for data processing +library(DT) # to display datatables +library(harmony) # to integration scRNA-seq data +library(patchwork) # for combining plots +library(ggplot2) # for data visualization +library(ggrepel) # to use geom_pont_repel() +library(ggvenn) # to visualize venn diagrams +library(openxlsx) # to write data to excel +library(progress) # to display progress bar +library(rio) # to load all worksheets in a workbook +library(pheatmap) # to visualize heatmaps +library(ggfortify) # to visualize PCA plots +library(presto) # for faster Wilcoxon test implementation +library(gridExtra) # for arranging multiple plots +# Set up parallel processing +library(future) +plan(multisession, workers = 2) # Adjust based on available memory +options(future.globals.maxSize = 16000 * 1024^2) +# Show individual cell heatmap alongside collapsed view +tryCatch({ +# Get significant genes +if (!exists("de") || is.null(de$mast$response)) { +stop("MAST results not found. Please run the DE analysis first.") +} +sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05] +if (length(sig_genes) == 0) { +stop("No significant genes found (FDR < 0.05)") +} +message(sprintf("Found %d significant genes", length(sig_genes))) +# Take top N genes by absolute log fold change +n <- min(50, length(sig_genes)) +g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]), +decreasing = TRUE)][1:n] +# Create anonymous gene IDs +anon_genes <- anonymize_genes(g) +# Get cells (subsample to make visualization manageable) +jx1 <- which(data.all$proc$Response %in% 'Responder') +jx2 <- which(data.all$proc$Response %in% 'Non-responder') +# Subsample cells +cells_per_group <- min(25, min(length(jx1), length(jx2))) +set.seed(42) +sampled_jx1 <- sample(jx1, cells_per_group) +sampled_jx2 <- sample(jx2, cells_per_group) +# Get expression data for individual cells +x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)] +if (!is(x_individual, "dgCMatrix")) { +x_individual <- as(x_individual, "dgCMatrix") +} +x_individual <- as.matrix(x_individual) +# Clean individual cell data +message("Cleaning individual cell data...") +# Replace Inf/-Inf with NA +x_individual[is.infinite(x_individual)] <- NA +# Remove rows (genes) with any NA values +complete_rows <- complete.cases(x_individual) +if (!all(complete_rows)) { +message(sprintf("Removing %d genes with NA values", sum(!complete_rows))) +x_individual <- x_individual[complete_rows, ] +g <- g[complete_rows] +anon_genes <- anon_genes[complete_rows] +} +# Check for zero variance genes +gene_var <- apply(x_individual, 1, var) +if (any(gene_var == 0)) { +message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0))) +x_individual <- x_individual[gene_var > 0, ] +g <- g[gene_var > 0] +anon_genes <- anon_genes[gene_var > 0] +} +# Replace gene names with anonymous IDs +rownames(x_individual) <- anon_genes[rownames(x_individual)] +# Scale individual cell data with validation +message("Scaling individual cell data...") +x_individual_scaled <- t(scale(t(x_individual))) +# Check for NA/Inf values after scaling +if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) { +message("Found NA/Inf values after scaling, capping extreme values at ±10") +x_individual_scaled[x_individual_scaled > 10] <- 10 +x_individual_scaled[x_individual_scaled < -10] <- -10 +x_individual_scaled[is.na(x_individual_scaled)] <- 0 +} +# Create annotation for individual cells +col.annot.individual <- data.frame( +Response = rep(c('Responder', 'Non-responder'), each = cells_per_group), +row.names = colnames(x_individual) +) +# Get mean expression data (for side-by-side comparison) +x_resp <- rowMeans(de$data[g, jx1, drop=FALSE]) +x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE]) +x_means <- cbind(x_resp, x_nonresp) +colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)") +# Calculate standard errors +x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) +x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x))) +# Clean mean data +message("Cleaning mean data...") +x_means[is.infinite(x_means)] <- NA +complete_rows_means <- complete.cases(x_means) +if (!all(complete_rows_means)) { +message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means))) +x_means <- x_means[complete_rows_means, ] +x_resp_se <- x_resp_se[complete_rows_means] +x_nonresp_se <- x_nonresp_se[complete_rows_means] +} +# Add SE information to rownames for mean data +rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)", +anon_genes[rownames(x_means)], +x_resp_se, +x_nonresp_se) +# Scale mean data with validation +message("Scaling mean data...") +x_means_scaled <- t(scale(t(x_means))) +if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) { +message("Found NA/Inf values after scaling means, capping extreme values at ±10") +x_means_scaled[x_means_scaled > 10] <- 10 +x_means_scaled[x_means_scaled < -10] <- -10 +x_means_scaled[is.na(x_means_scaled)] <- 0 +} +# Create annotation for means +col.annot.means <- data.frame( +Group = c('Responder', 'Non-responder'), +row.names = colnames(x_means) +) +# Color palette +col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100) +message("Generating plots...") +# Create individual cells heatmap +p1 <- pheatmap( +x_individual_scaled, +cluster_rows = TRUE, +cluster_cols = FALSE, +scale = "none", +color = col.pal, +annotation_col = col.annot.individual, +show_rownames = TRUE, +show_colnames = FALSE, +gaps_col = cells_per_group, +main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group), +fontsize_row = 8, +annotation_colors = list( +Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8') +), +legend = FALSE, # Hide colorbar +treeheight_row = 0, # Hide row dendrogram +cellwidth = 10, # Force square cells +cellheight = 10 # Force square cells +) +# Create mean expression heatmap +p2 <- pheatmap( +x_means_scaled, +cluster_rows = TRUE, +cluster_cols = FALSE, +scale = "none", +color = col.pal, +annotation_col = col.annot.means, +show_rownames = TRUE, +show_colnames = TRUE, +main = 'Group Means\n(all cells)', +fontsize_row = 8, +annotation_colors = list( +Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8') +), +legend = FALSE, # Hide colorbar +treeheight_row = 0, # Hide row dendrogram +cellwidth = 20, # Force square cells (larger since fewer columns) +cellheight = 20, # Force square cells (larger since fewer columns) +labels_row = sub("\n.*", "", rownames(x_means)) # Only show gene IDs without SE info +) +# Save plots to PDF +pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8) +grid::grid.newpage() +grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2)) +dev.off() +# Display plots in notebook +grid::grid.newpage() +grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2)) +# Save detailed statistics +write.csv(data.frame( +Anonymous_ID = names(anon_genes), +Original_Gene = anon_genes, +Mean_Responder = x_resp, +SE_Responder = x_resp_se, +Mean_NonResponder = x_nonresp, +SE_NonResponder = x_nonresp_se, +N_Responder = length(jx1), +N_NonResponder = length(jx2) +), 'results/response_gene_mapping_detailed.csv', row.names = FALSE) +message("Side-by-side heatmaps generated successfully") +message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv") +}, error = function(e) { +message(sprintf("Error generating heatmaps: %s", e$message)) +message("\nDebugging information:") +message("1. Check if expression data contains extreme values") +message("2. Verify gene filtering and scaling steps") +message("3. Ensure all matrices have proper dimensions") +message("4. Look for NA/Inf values in the data") +})