cancer-biomarker-discover / Git / Diff of /.Rhistory

Models:
DanielG/
cancer-biomarker-discover
Downloads: 1
Diff of /.Rhistory [000000] .. [6df130]
Switch to side-by-side view

--- a
+++ b/.Rhistory
@@ -0,0 +1,512 @@
+data = cbind(as.data.frame(response_pca$x[,1:2]), m),
+FUN = mean)
+# Create enhanced PCA plot
+p <- ggplot(data = as.data.frame(response_pca$x[,1:2]),
+aes(x = PC1, y = PC2, color = m$Response)) +
+# Add points with reduced alpha for transparency
+geom_point(alpha = 0.3, size = 2) +
+# Add confidence ellipses
+stat_ellipse(level = 0.95, size = 1) +
+# Add centroids
+geom_point(data = centroids, aes(x = PC1, y = PC2),
+size = 5, shape = 18) +
+# Add centroid labels
+geom_text(data = centroids,
+aes(x = PC1, y = PC2, label = Response),
+vjust = -1.5, size = 4, color = "black") +
+# Customize appearance
+ggtitle('De-identified DEG PCA with Group Trends') +
+xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) +
+ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) +
+theme_minimal() +
+scale_color_manual(values = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')) +
+theme(
+legend.position = "bottom",
+plot.title = element_text(hjust = 0.5, size = 14),
+axis.title = element_text(size = 12),
+legend.title = element_text(size = 12),
+legend.text = element_text(size = 10)
+)
+# Save plot
+message("Saving plot...")
+ggsave('plots/DE_PCA_response_collapsed.pdf', plot = p, width = 8, height = 8, units = 'in')
+# Display plot
+print(p)
+message("PCA visualization completed successfully")
+}, error = function(e) {
+message(sprintf("Error in PCA analysis: %s", e$message))
+message("Please ensure:")
+message("1. DE analysis has been run successfully")
+message("2. There are significant DEGs to analyze")
+message("3. There are enough cells in each response group")
+message("4. Expression data is valid and contains sufficient variation")
+})
+# Modify DE_response_heat chunk
+tryCatch({
+# Get significant genes
+if (!exists("de") || is.null(de$mast$response)) {
+stop("MAST results not found. Please run the DE analysis first.")
+}
+sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
+if (length(sig_genes) == 0) {
+stop("No significant genes found (FDR < 0.05)")
+}
+message(sprintf("Found %d significant genes", length(sig_genes)))
+# Take top N genes by absolute log fold change
+n <- min(50, length(sig_genes))
+g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]),
+decreasing = TRUE)][1:n]
+# Create anonymous gene IDs
+anon_genes <- anonymize_genes(g)
+# Get all cells
+jx1 <- which(data.all$proc$Response %in% 'Responder')
+jx2 <- which(data.all$proc$Response %in% 'Non-responder')
+# Get expression data
+x <- de$data[g, c(jx1, jx2)]
+if (!is(x, "dgCMatrix")) {
+x <- as(x, "dgCMatrix")
+}
+x <- as.matrix(x)
+# Replace gene names with anonymous IDs
+rownames(x) <- anon_genes[rownames(x)]
+# Calculate mean expression for each group
+x_resp <- rowMeans(x[, 1:length(jx1), drop=FALSE])
+x_nonresp <- rowMeans(x[, (length(jx1)+1):ncol(x), drop=FALSE])
+# Combine into matrix
+x_means <- cbind(x_resp, x_nonresp)
+colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
+# Calculate standard errors for error bars
+x_resp_se <- apply(x[, 1:length(jx1), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
+x_nonresp_se <- apply(x[, (length(jx1)+1):ncol(x), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
+# Add SE information to rownames
+rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)",
+rownames(x_means),
+x_resp_se,
+x_nonresp_se)
+# Scale data
+x_scaled <- t(scale(t(x_means)))
+# Create annotation
+col.annot <- data.frame(
+Group = c('Responder', 'Non-responder'),
+row.names = colnames(x_means)
+)
+# Color palette
+col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
+# Create heatmap
+message("Generating heatmap...")
+p <- pheatmap(
+x_scaled,
+cluster_rows = TRUE,
+cluster_cols = FALSE,
+scale = "none",  # already scaled above
+color = col.pal,
+annotation_col = col.annot,
+show_rownames = TRUE,
+show_colnames = TRUE,
+main = sprintf('Top %d De-identified DEGs (Mean Expression)', nrow(x_scaled)),
+fontsize_row = 8,
+annotation_colors = list(
+Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
+)
+)
+# Save plot
+message("Saving plot...")
+pdf('plots/DE_heatmap_response_collapsed.pdf', width = 11, height = 8)
+print(p)
+dev.off()
+# Save gene ID mapping
+write.csv(data.frame(
+Anonymous_ID = names(anon_genes),
+Original_Gene = anon_genes,
+Mean_Responder = x_resp,
+SE_Responder = x_resp_se,
+Mean_NonResponder = x_nonresp,
+SE_NonResponder = x_nonresp_se
+), './response_gene_mapping_with_stats.csv', row.names = FALSE)
+message("Heatmap generated successfully")
+message("Gene mapping with statistics saved to results/response_gene_mapping_with_stats.csv")
+}, error = function(e) {
+message(sprintf("Error generating heatmap: %s", e$message))
+})
+# Show individual cell heatmap alongside collapsed view
+tryCatch({
+# Get significant genes
+if (!exists("de") || is.null(de$mast$response)) {
+stop("MAST results not found. Please run the DE analysis first.")
+}
+sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
+if (length(sig_genes) == 0) {
+stop("No significant genes found (FDR < 0.05)")
+}
+message(sprintf("Found %d significant genes", length(sig_genes)))
+# Take top N genes by absolute log fold change
+n <- min(50, length(sig_genes))
+g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]),
+decreasing = TRUE)][1:n]
+# Create anonymous gene IDs
+anon_genes <- anonymize_genes(g)
+# Get cells (subsample to make visualization manageable)
+jx1 <- which(data.all$proc$Response %in% 'Responder')
+jx2 <- which(data.all$proc$Response %in% 'Non-responder')
+# Subsample cells
+cells_per_group <- min(25, min(length(jx1), length(jx2)))
+set.seed(42)
+sampled_jx1 <- sample(jx1, cells_per_group)
+sampled_jx2 <- sample(jx2, cells_per_group)
+# Get expression data for individual cells
+x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)]
+if (!is(x_individual, "dgCMatrix")) {
+x_individual <- as(x_individual, "dgCMatrix")
+}
+x_individual <- as.matrix(x_individual)
+# Clean individual cell data
+message("Cleaning individual cell data...")
+# Replace Inf/-Inf with NA
+x_individual[is.infinite(x_individual)] <- NA
+# Remove rows (genes) with any NA values
+complete_rows <- complete.cases(x_individual)
+if (!all(complete_rows)) {
+message(sprintf("Removing %d genes with NA values", sum(!complete_rows)))
+x_individual <- x_individual[complete_rows, ]
+g <- g[complete_rows]
+anon_genes <- anon_genes[complete_rows]
+}
+# Check for zero variance genes
+gene_var <- apply(x_individual, 1, var)
+if (any(gene_var == 0)) {
+message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
+x_individual <- x_individual[gene_var > 0, ]
+g <- g[gene_var > 0]
+anon_genes <- anon_genes[gene_var > 0]
+}
+# Replace gene names with anonymous IDs
+rownames(x_individual) <- anon_genes[rownames(x_individual)]
+# Scale individual cell data with validation
+message("Scaling individual cell data...")
+x_individual_scaled <- t(scale(t(x_individual)))
+# Check for NA/Inf values after scaling
+if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) {
+message("Found NA/Inf values after scaling, capping extreme values at ±10")
+x_individual_scaled[x_individual_scaled > 10] <- 10
+x_individual_scaled[x_individual_scaled < -10] <- -10
+x_individual_scaled[is.na(x_individual_scaled)] <- 0
+}
+# Create annotation for individual cells
+col.annot.individual <- data.frame(
+Response = rep(c('Responder', 'Non-responder'), each = cells_per_group),
+row.names = colnames(x_individual)
+)
+# Get mean expression data (for side-by-side comparison)
+x_resp <- rowMeans(de$data[g, jx1, drop=FALSE])
+x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE])
+x_means <- cbind(x_resp, x_nonresp)
+colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
+# Calculate standard errors
+x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
+x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
+# Clean mean data
+message("Cleaning mean data...")
+x_means[is.infinite(x_means)] <- NA
+complete_rows_means <- complete.cases(x_means)
+if (!all(complete_rows_means)) {
+message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means)))
+x_means <- x_means[complete_rows_means, ]
+x_resp_se <- x_resp_se[complete_rows_means]
+x_nonresp_se <- x_nonresp_se[complete_rows_means]
+}
+# Add SE information to rownames for mean data
+rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)",
+anon_genes[rownames(x_means)],
+x_resp_se,
+x_nonresp_se)
+# Scale mean data with validation
+message("Scaling mean data...")
+x_means_scaled <- t(scale(t(x_means)))
+if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) {
+message("Found NA/Inf values after scaling means, capping extreme values at ±10")
+x_means_scaled[x_means_scaled > 10] <- 10
+x_means_scaled[x_means_scaled < -10] <- -10
+x_means_scaled[is.na(x_means_scaled)] <- 0
+}
+# Create annotation for means
+col.annot.means <- data.frame(
+Group = c('Responder', 'Non-responder'),
+row.names = colnames(x_means)
+)
+# Color palette
+col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
+message("Generating plots...")
+# Set up PDF device for both plots
+pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8)
+# Create layout for side-by-side plots
+layout(matrix(1:2, ncol = 2))
+# Plot 1: Individual cells
+pheatmap(
+x_individual_scaled,
+cluster_rows = TRUE,
+cluster_cols = FALSE,
+scale = "none",
+color = col.pal,
+annotation_col = col.annot.individual,
+show_rownames = TRUE,
+show_colnames = FALSE,
+gaps_col = cells_per_group,
+main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group),
+fontsize_row = 8,
+annotation_colors = list(
+Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
+),
+legend = FALSE,  # Hide colorbar
+treeheight_row = 0,  # Hide row dendrogram
+cellwidth = 10,  # Force square cells
+cellheight = 10  # Force square cells
+)
+# Plot 2: Mean expression
+pheatmap(
+x_means_scaled,
+cluster_rows = TRUE,
+cluster_cols = FALSE,
+scale = "none",
+color = col.pal,
+annotation_col = col.annot.means,
+show_rownames = TRUE,
+show_colnames = TRUE,
+main = 'Group Means\n(all cells)',
+fontsize_row = 8,
+annotation_colors = list(
+Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
+),
+legend = FALSE,  # Hide colorbar
+treeheight_row = 0,  # Hide row dendrogram
+cellwidth = 20,  # Force square cells (larger since fewer columns)
+cellheight = 20,  # Force square cells (larger since fewer columns)
+labels_row = sub("\n.*", "", rownames(x_means))  # Only show gene IDs without SE info
+)
+dev.off()
+# Save detailed statistics
+write.csv(data.frame(
+Anonymous_ID = names(anon_genes),
+Original_Gene = anon_genes,
+Mean_Responder = x_resp,
+SE_Responder = x_resp_se,
+Mean_NonResponder = x_nonresp,
+SE_NonResponder = x_nonresp_se,
+N_Responder = length(jx1),
+N_NonResponder = length(jx2)
+), 'results/response_gene_mapping_detailed.csv', row.names = FALSE)
+message("Side-by-side heatmaps generated successfully")
+message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv")
+}, error = function(e) {
+message(sprintf("Error generating heatmaps: %s", e$message))
+message("\nDebugging information:")
+message("1. Check if expression data contains extreme values")
+message("2. Verify gene filtering and scaling steps")
+message("3. Ensure all matrices have proper dimensions")
+message("4. Look for NA/Inf values in the data")
+})
+# Memory management settings
+options(future.globals.maxSize = 16000 * 1024^2)  # Set maximum global size to 8GB
+# Load core packages first
+library(Matrix)       # for sparse matrix operations
+library(Seurat)      # for scRNA-seq analysis
+library(MAST)        # for scRNAseq DEG analysis
+# Load other packages
+library(svDialogs)    # for prompting user-input
+library(vroom)        # for quickly reading data
+library(dplyr)        # for data processing
+library(DT)           # to display datatables
+library(harmony)      # to integration scRNA-seq data
+library(patchwork)    # for combining plots
+library(ggplot2)      # for data visualization
+library(ggrepel)      # to use geom_pont_repel()
+library(ggvenn)       # to visualize venn diagrams
+library(openxlsx)     # to write data to excel
+library(progress)     # to display progress bar
+library(rio)          # to load all worksheets in a workbook
+library(pheatmap)     # to visualize heatmaps
+library(ggfortify)    # to visualize PCA plots
+library(presto)       # for faster Wilcoxon test implementation
+library(gridExtra)    # for arranging multiple plots
+# Set up parallel processing
+library(future)
+plan(multisession, workers = 2)  # Adjust based on available memory
+options(future.globals.maxSize = 16000 * 1024^2)
+# Show individual cell heatmap alongside collapsed view
+tryCatch({
+# Get significant genes
+if (!exists("de") || is.null(de$mast$response)) {
+stop("MAST results not found. Please run the DE analysis first.")
+}
+sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
+if (length(sig_genes) == 0) {
+stop("No significant genes found (FDR < 0.05)")
+}
+message(sprintf("Found %d significant genes", length(sig_genes)))
+# Take top N genes by absolute log fold change
+n <- min(50, length(sig_genes))
+g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]),
+decreasing = TRUE)][1:n]
+# Create anonymous gene IDs
+anon_genes <- anonymize_genes(g)
+# Get cells (subsample to make visualization manageable)
+jx1 <- which(data.all$proc$Response %in% 'Responder')
+jx2 <- which(data.all$proc$Response %in% 'Non-responder')
+# Subsample cells
+cells_per_group <- min(25, min(length(jx1), length(jx2)))
+set.seed(42)
+sampled_jx1 <- sample(jx1, cells_per_group)
+sampled_jx2 <- sample(jx2, cells_per_group)
+# Get expression data for individual cells
+x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)]
+if (!is(x_individual, "dgCMatrix")) {
+x_individual <- as(x_individual, "dgCMatrix")
+}
+x_individual <- as.matrix(x_individual)
+# Clean individual cell data
+message("Cleaning individual cell data...")
+# Replace Inf/-Inf with NA
+x_individual[is.infinite(x_individual)] <- NA
+# Remove rows (genes) with any NA values
+complete_rows <- complete.cases(x_individual)
+if (!all(complete_rows)) {
+message(sprintf("Removing %d genes with NA values", sum(!complete_rows)))
+x_individual <- x_individual[complete_rows, ]
+g <- g[complete_rows]
+anon_genes <- anon_genes[complete_rows]
+}
+# Check for zero variance genes
+gene_var <- apply(x_individual, 1, var)
+if (any(gene_var == 0)) {
+message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
+x_individual <- x_individual[gene_var > 0, ]
+g <- g[gene_var > 0]
+anon_genes <- anon_genes[gene_var > 0]
+}
+# Replace gene names with anonymous IDs
+rownames(x_individual) <- anon_genes[rownames(x_individual)]
+# Scale individual cell data with validation
+message("Scaling individual cell data...")
+x_individual_scaled <- t(scale(t(x_individual)))
+# Check for NA/Inf values after scaling
+if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) {
+message("Found NA/Inf values after scaling, capping extreme values at ±10")
+x_individual_scaled[x_individual_scaled > 10] <- 10
+x_individual_scaled[x_individual_scaled < -10] <- -10
+x_individual_scaled[is.na(x_individual_scaled)] <- 0
+}
+# Create annotation for individual cells
+col.annot.individual <- data.frame(
+Response = rep(c('Responder', 'Non-responder'), each = cells_per_group),
+row.names = colnames(x_individual)
+)
+# Get mean expression data (for side-by-side comparison)
+x_resp <- rowMeans(de$data[g, jx1, drop=FALSE])
+x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE])
+x_means <- cbind(x_resp, x_nonresp)
+colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
+# Calculate standard errors
+x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
+x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
+# Clean mean data
+message("Cleaning mean data...")
+x_means[is.infinite(x_means)] <- NA
+complete_rows_means <- complete.cases(x_means)
+if (!all(complete_rows_means)) {
+message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means)))
+x_means <- x_means[complete_rows_means, ]
+x_resp_se <- x_resp_se[complete_rows_means]
+x_nonresp_se <- x_nonresp_se[complete_rows_means]
+}
+# Add SE information to rownames for mean data
+rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)",
+anon_genes[rownames(x_means)],
+x_resp_se,
+x_nonresp_se)
+# Scale mean data with validation
+message("Scaling mean data...")
+x_means_scaled <- t(scale(t(x_means)))
+if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) {
+message("Found NA/Inf values after scaling means, capping extreme values at ±10")
+x_means_scaled[x_means_scaled > 10] <- 10
+x_means_scaled[x_means_scaled < -10] <- -10
+x_means_scaled[is.na(x_means_scaled)] <- 0
+}
+# Create annotation for means
+col.annot.means <- data.frame(
+Group = c('Responder', 'Non-responder'),
+row.names = colnames(x_means)
+)
+# Color palette
+col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
+message("Generating plots...")
+# Create individual cells heatmap
+p1 <- pheatmap(
+x_individual_scaled,
+cluster_rows = TRUE,
+cluster_cols = FALSE,
+scale = "none",
+color = col.pal,
+annotation_col = col.annot.individual,
+show_rownames = TRUE,
+show_colnames = FALSE,
+gaps_col = cells_per_group,
+main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group),
+fontsize_row = 8,
+annotation_colors = list(
+Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
+),
+legend = FALSE,  # Hide colorbar
+treeheight_row = 0,  # Hide row dendrogram
+cellwidth = 10,  # Force square cells
+cellheight = 10  # Force square cells
+)
+# Create mean expression heatmap
+p2 <- pheatmap(
+x_means_scaled,
+cluster_rows = TRUE,
+cluster_cols = FALSE,
+scale = "none",
+color = col.pal,
+annotation_col = col.annot.means,
+show_rownames = TRUE,
+show_colnames = TRUE,
+main = 'Group Means\n(all cells)',
+fontsize_row = 8,
+annotation_colors = list(
+Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
+),
+legend = FALSE,  # Hide colorbar
+treeheight_row = 0,  # Hide row dendrogram
+cellwidth = 20,  # Force square cells (larger since fewer columns)
+cellheight = 20,  # Force square cells (larger since fewer columns)
+labels_row = sub("\n.*", "", rownames(x_means))  # Only show gene IDs without SE info
+)
+# Save plots to PDF
+pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8)
+grid::grid.newpage()
+grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2))
+dev.off()
+# Display plots in notebook
+grid::grid.newpage()
+grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2))
+# Save detailed statistics
+write.csv(data.frame(
+Anonymous_ID = names(anon_genes),
+Original_Gene = anon_genes,
+Mean_Responder = x_resp,
+SE_Responder = x_resp_se,
+Mean_NonResponder = x_nonresp,
+SE_NonResponder = x_nonresp_se,
+N_Responder = length(jx1),
+N_NonResponder = length(jx2)
+), 'results/response_gene_mapping_detailed.csv', row.names = FALSE)
+message("Side-by-side heatmaps generated successfully")
+message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv")
+}, error = function(e) {
+message(sprintf("Error generating heatmaps: %s", e$message))
+message("\nDebugging information:")
+message("1. Check if expression data contains extreme values")
+message("2. Verify gene filtering and scaling steps")
+message("3. Ensure all matrices have proper dimensions")
+message("4. Look for NA/Inf values in the data")
+})