cancer-biomarker-discover / Git / [6df130] /.Rhistory

Models:
DanielG/
cancer-biomarker-discover
Downloads: 1
[6df130]: / .Rhistory
History
Download this file
513 lines (512 with data), 19.9 kB

data = cbind(as.data.frame(response_pca$x[,1:2]), m),
FUN = mean)
# Create enhanced PCA plot
p <- ggplot(data = as.data.frame(response_pca$x[,1:2]),
aes(x = PC1, y = PC2, color = m$Response)) +
# Add points with reduced alpha for transparency
geom_point(alpha = 0.3, size = 2) +
# Add confidence ellipses
stat_ellipse(level = 0.95, size = 1) +
# Add centroids
geom_point(data = centroids, aes(x = PC1, y = PC2),
size = 5, shape = 18) +
# Add centroid labels
geom_text(data = centroids,
aes(x = PC1, y = PC2, label = Response),
vjust = -1.5, size = 4, color = "black") +
# Customize appearance
ggtitle('De-identified DEG PCA with Group Trends') +
xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) +
ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) +
theme_minimal() +
scale_color_manual(values = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')) +
theme(
legend.position = "bottom",
plot.title = element_text(hjust = 0.5, size = 14),
axis.title = element_text(size = 12),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)
# Save plot
message("Saving plot...")
ggsave('plots/DE_PCA_response_collapsed.pdf', plot = p, width = 8, height = 8, units = 'in')
# Display plot
print(p)
message("PCA visualization completed successfully")
}, error = function(e) {
message(sprintf("Error in PCA analysis: %s", e$message))
message("Please ensure:")
message("1. DE analysis has been run successfully")
message("2. There are significant DEGs to analyze")
message("3. There are enough cells in each response group")
message("4. Expression data is valid and contains sufficient variation")
})
# Modify DE_response_heat chunk
tryCatch({
# Get significant genes
if (!exists("de") || is.null(de$mast$response)) {
stop("MAST results not found. Please run the DE analysis first.")
}
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
if (length(sig_genes) == 0) {
stop("No significant genes found (FDR < 0.05)")
}
message(sprintf("Found %d significant genes", length(sig_genes)))
# Take top N genes by absolute log fold change
n <- min(50, length(sig_genes))
g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]),
decreasing = TRUE)][1:n]
# Create anonymous gene IDs
anon_genes <- anonymize_genes(g)
# Get all cells
jx1 <- which(data.all$proc$Response %in% 'Responder')
jx2 <- which(data.all$proc$Response %in% 'Non-responder')
# Get expression data
x <- de$data[g, c(jx1, jx2)]
if (!is(x, "dgCMatrix")) {
x <- as(x, "dgCMatrix")
}
x <- as.matrix(x)
# Replace gene names with anonymous IDs
rownames(x) <- anon_genes[rownames(x)]
# Calculate mean expression for each group
x_resp <- rowMeans(x[, 1:length(jx1), drop=FALSE])
x_nonresp <- rowMeans(x[, (length(jx1)+1):ncol(x), drop=FALSE])
# Combine into matrix
x_means <- cbind(x_resp, x_nonresp)
colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
# Calculate standard errors for error bars
x_resp_se <- apply(x[, 1:length(jx1), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
x_nonresp_se <- apply(x[, (length(jx1)+1):ncol(x), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
# Add SE information to rownames
rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)",
rownames(x_means),
x_resp_se,
x_nonresp_se)
# Scale data
x_scaled <- t(scale(t(x_means)))
# Create annotation
col.annot <- data.frame(
Group = c('Responder', 'Non-responder'),
row.names = colnames(x_means)
)
# Color palette
col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
# Create heatmap
message("Generating heatmap...")
p <- pheatmap(
x_scaled,
cluster_rows = TRUE,
cluster_cols = FALSE,
scale = "none",  # already scaled above
color = col.pal,
annotation_col = col.annot,
show_rownames = TRUE,
show_colnames = TRUE,
main = sprintf('Top %d De-identified DEGs (Mean Expression)', nrow(x_scaled)),
fontsize_row = 8,
annotation_colors = list(
Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
)
)
# Save plot
message("Saving plot...")
pdf('plots/DE_heatmap_response_collapsed.pdf', width = 11, height = 8)
print(p)
dev.off()
# Save gene ID mapping
write.csv(data.frame(
Anonymous_ID = names(anon_genes),
Original_Gene = anon_genes,
Mean_Responder = x_resp,
SE_Responder = x_resp_se,
Mean_NonResponder = x_nonresp,
SE_NonResponder = x_nonresp_se
), './response_gene_mapping_with_stats.csv', row.names = FALSE)
message("Heatmap generated successfully")
message("Gene mapping with statistics saved to results/response_gene_mapping_with_stats.csv")
}, error = function(e) {
message(sprintf("Error generating heatmap: %s", e$message))
})
# Show individual cell heatmap alongside collapsed view
tryCatch({
# Get significant genes
if (!exists("de") || is.null(de$mast$response)) {
stop("MAST results not found. Please run the DE analysis first.")
}
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
if (length(sig_genes) == 0) {
stop("No significant genes found (FDR < 0.05)")
}
message(sprintf("Found %d significant genes", length(sig_genes)))
# Take top N genes by absolute log fold change
n <- min(50, length(sig_genes))
g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]),
decreasing = TRUE)][1:n]
# Create anonymous gene IDs
anon_genes <- anonymize_genes(g)
# Get cells (subsample to make visualization manageable)
jx1 <- which(data.all$proc$Response %in% 'Responder')
jx2 <- which(data.all$proc$Response %in% 'Non-responder')
# Subsample cells
cells_per_group <- min(25, min(length(jx1), length(jx2)))
set.seed(42)
sampled_jx1 <- sample(jx1, cells_per_group)
sampled_jx2 <- sample(jx2, cells_per_group)
# Get expression data for individual cells
x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)]
if (!is(x_individual, "dgCMatrix")) {
x_individual <- as(x_individual, "dgCMatrix")
}
x_individual <- as.matrix(x_individual)
# Clean individual cell data
message("Cleaning individual cell data...")
# Replace Inf/-Inf with NA
x_individual[is.infinite(x_individual)] <- NA
# Remove rows (genes) with any NA values
complete_rows <- complete.cases(x_individual)
if (!all(complete_rows)) {
message(sprintf("Removing %d genes with NA values", sum(!complete_rows)))
x_individual <- x_individual[complete_rows, ]
g <- g[complete_rows]
anon_genes <- anon_genes[complete_rows]
}
# Check for zero variance genes
gene_var <- apply(x_individual, 1, var)
if (any(gene_var == 0)) {
message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
x_individual <- x_individual[gene_var > 0, ]
g <- g[gene_var > 0]
anon_genes <- anon_genes[gene_var > 0]
}
# Replace gene names with anonymous IDs
rownames(x_individual) <- anon_genes[rownames(x_individual)]
# Scale individual cell data with validation
message("Scaling individual cell data...")
x_individual_scaled <- t(scale(t(x_individual)))
# Check for NA/Inf values after scaling
if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) {
message("Found NA/Inf values after scaling, capping extreme values at ±10")
x_individual_scaled[x_individual_scaled > 10] <- 10
x_individual_scaled[x_individual_scaled < -10] <- -10
x_individual_scaled[is.na(x_individual_scaled)] <- 0
}
# Create annotation for individual cells
col.annot.individual <- data.frame(
Response = rep(c('Responder', 'Non-responder'), each = cells_per_group),
row.names = colnames(x_individual)
)
# Get mean expression data (for side-by-side comparison)
x_resp <- rowMeans(de$data[g, jx1, drop=FALSE])
x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE])
x_means <- cbind(x_resp, x_nonresp)
colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
# Calculate standard errors
x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
# Clean mean data
message("Cleaning mean data...")
x_means[is.infinite(x_means)] <- NA
complete_rows_means <- complete.cases(x_means)
if (!all(complete_rows_means)) {
message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means)))
x_means <- x_means[complete_rows_means, ]
x_resp_se <- x_resp_se[complete_rows_means]
x_nonresp_se <- x_nonresp_se[complete_rows_means]
}
# Add SE information to rownames for mean data
rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)",
anon_genes[rownames(x_means)],
x_resp_se,
x_nonresp_se)
# Scale mean data with validation
message("Scaling mean data...")
x_means_scaled <- t(scale(t(x_means)))
if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) {
message("Found NA/Inf values after scaling means, capping extreme values at ±10")
x_means_scaled[x_means_scaled > 10] <- 10
x_means_scaled[x_means_scaled < -10] <- -10
x_means_scaled[is.na(x_means_scaled)] <- 0
}
# Create annotation for means
col.annot.means <- data.frame(
Group = c('Responder', 'Non-responder'),
row.names = colnames(x_means)
)
# Color palette
col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
message("Generating plots...")
# Set up PDF device for both plots
pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8)
# Create layout for side-by-side plots
layout(matrix(1:2, ncol = 2))
# Plot 1: Individual cells
pheatmap(
x_individual_scaled,
cluster_rows = TRUE,
cluster_cols = FALSE,
scale = "none",
color = col.pal,
annotation_col = col.annot.individual,
show_rownames = TRUE,
show_colnames = FALSE,
gaps_col = cells_per_group,
main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group),
fontsize_row = 8,
annotation_colors = list(
Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
),
legend = FALSE,  # Hide colorbar
treeheight_row = 0,  # Hide row dendrogram
cellwidth = 10,  # Force square cells
cellheight = 10  # Force square cells
)
# Plot 2: Mean expression
pheatmap(
x_means_scaled,
cluster_rows = TRUE,
cluster_cols = FALSE,
scale = "none",
color = col.pal,
annotation_col = col.annot.means,
show_rownames = TRUE,
show_colnames = TRUE,
main = 'Group Means\n(all cells)',
fontsize_row = 8,
annotation_colors = list(
Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
),
legend = FALSE,  # Hide colorbar
treeheight_row = 0,  # Hide row dendrogram
cellwidth = 20,  # Force square cells (larger since fewer columns)
cellheight = 20,  # Force square cells (larger since fewer columns)
labels_row = sub("\n.*", "", rownames(x_means))  # Only show gene IDs without SE info
)
dev.off()
# Save detailed statistics
write.csv(data.frame(
Anonymous_ID = names(anon_genes),
Original_Gene = anon_genes,
Mean_Responder = x_resp,
SE_Responder = x_resp_se,
Mean_NonResponder = x_nonresp,
SE_NonResponder = x_nonresp_se,
N_Responder = length(jx1),
N_NonResponder = length(jx2)
), 'results/response_gene_mapping_detailed.csv', row.names = FALSE)
message("Side-by-side heatmaps generated successfully")
message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv")
}, error = function(e) {
message(sprintf("Error generating heatmaps: %s", e$message))
message("\nDebugging information:")
message("1. Check if expression data contains extreme values")
message("2. Verify gene filtering and scaling steps")
message("3. Ensure all matrices have proper dimensions")
message("4. Look for NA/Inf values in the data")
})
# Memory management settings
options(future.globals.maxSize = 16000 * 1024^2)  # Set maximum global size to 8GB
# Load core packages first
library(Matrix)       # for sparse matrix operations
library(Seurat)      # for scRNA-seq analysis
library(MAST)        # for scRNAseq DEG analysis
# Load other packages
library(svDialogs)    # for prompting user-input
library(vroom)        # for quickly reading data
library(dplyr)        # for data processing
library(DT)           # to display datatables
library(harmony)      # to integration scRNA-seq data
library(patchwork)    # for combining plots
library(ggplot2)      # for data visualization
library(ggrepel)      # to use geom_pont_repel()
library(ggvenn)       # to visualize venn diagrams
library(openxlsx)     # to write data to excel
library(progress)     # to display progress bar
library(rio)          # to load all worksheets in a workbook
library(pheatmap)     # to visualize heatmaps
library(ggfortify)    # to visualize PCA plots
library(presto)       # for faster Wilcoxon test implementation
library(gridExtra)    # for arranging multiple plots
# Set up parallel processing
library(future)
plan(multisession, workers = 2)  # Adjust based on available memory
options(future.globals.maxSize = 16000 * 1024^2)
# Show individual cell heatmap alongside collapsed view
tryCatch({
# Get significant genes
if (!exists("de") || is.null(de$mast$response)) {
stop("MAST results not found. Please run the DE analysis first.")
}
sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
if (length(sig_genes) == 0) {
stop("No significant genes found (FDR < 0.05)")
}
message(sprintf("Found %d significant genes", length(sig_genes)))
# Take top N genes by absolute log fold change
n <- min(50, length(sig_genes))
g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]),
decreasing = TRUE)][1:n]
# Create anonymous gene IDs
anon_genes <- anonymize_genes(g)
# Get cells (subsample to make visualization manageable)
jx1 <- which(data.all$proc$Response %in% 'Responder')
jx2 <- which(data.all$proc$Response %in% 'Non-responder')
# Subsample cells
cells_per_group <- min(25, min(length(jx1), length(jx2)))
set.seed(42)
sampled_jx1 <- sample(jx1, cells_per_group)
sampled_jx2 <- sample(jx2, cells_per_group)
# Get expression data for individual cells
x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)]
if (!is(x_individual, "dgCMatrix")) {
x_individual <- as(x_individual, "dgCMatrix")
}
x_individual <- as.matrix(x_individual)
# Clean individual cell data
message("Cleaning individual cell data...")
# Replace Inf/-Inf with NA
x_individual[is.infinite(x_individual)] <- NA
# Remove rows (genes) with any NA values
complete_rows <- complete.cases(x_individual)
if (!all(complete_rows)) {
message(sprintf("Removing %d genes with NA values", sum(!complete_rows)))
x_individual <- x_individual[complete_rows, ]
g <- g[complete_rows]
anon_genes <- anon_genes[complete_rows]
}
# Check for zero variance genes
gene_var <- apply(x_individual, 1, var)
if (any(gene_var == 0)) {
message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
x_individual <- x_individual[gene_var > 0, ]
g <- g[gene_var > 0]
anon_genes <- anon_genes[gene_var > 0]
}
# Replace gene names with anonymous IDs
rownames(x_individual) <- anon_genes[rownames(x_individual)]
# Scale individual cell data with validation
message("Scaling individual cell data...")
x_individual_scaled <- t(scale(t(x_individual)))
# Check for NA/Inf values after scaling
if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) {
message("Found NA/Inf values after scaling, capping extreme values at ±10")
x_individual_scaled[x_individual_scaled > 10] <- 10
x_individual_scaled[x_individual_scaled < -10] <- -10
x_individual_scaled[is.na(x_individual_scaled)] <- 0
}
# Create annotation for individual cells
col.annot.individual <- data.frame(
Response = rep(c('Responder', 'Non-responder'), each = cells_per_group),
row.names = colnames(x_individual)
)
# Get mean expression data (for side-by-side comparison)
x_resp <- rowMeans(de$data[g, jx1, drop=FALSE])
x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE])
x_means <- cbind(x_resp, x_nonresp)
colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
# Calculate standard errors
x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
# Clean mean data
message("Cleaning mean data...")
x_means[is.infinite(x_means)] <- NA
complete_rows_means <- complete.cases(x_means)
if (!all(complete_rows_means)) {
message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means)))
x_means <- x_means[complete_rows_means, ]
x_resp_se <- x_resp_se[complete_rows_means]
x_nonresp_se <- x_nonresp_se[complete_rows_means]
}
# Add SE information to rownames for mean data
rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)",
anon_genes[rownames(x_means)],
x_resp_se,
x_nonresp_se)
# Scale mean data with validation
message("Scaling mean data...")
x_means_scaled <- t(scale(t(x_means)))
if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) {
message("Found NA/Inf values after scaling means, capping extreme values at ±10")
x_means_scaled[x_means_scaled > 10] <- 10
x_means_scaled[x_means_scaled < -10] <- -10
x_means_scaled[is.na(x_means_scaled)] <- 0
}
# Create annotation for means
col.annot.means <- data.frame(
Group = c('Responder', 'Non-responder'),
row.names = colnames(x_means)
)
# Color palette
col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
message("Generating plots...")
# Create individual cells heatmap
p1 <- pheatmap(
x_individual_scaled,
cluster_rows = TRUE,
cluster_cols = FALSE,
scale = "none",
color = col.pal,
annotation_col = col.annot.individual,
show_rownames = TRUE,
show_colnames = FALSE,
gaps_col = cells_per_group,
main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group),
fontsize_row = 8,
annotation_colors = list(
Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
),
legend = FALSE,  # Hide colorbar
treeheight_row = 0,  # Hide row dendrogram
cellwidth = 10,  # Force square cells
cellheight = 10  # Force square cells
)
# Create mean expression heatmap
p2 <- pheatmap(
x_means_scaled,
cluster_rows = TRUE,
cluster_cols = FALSE,
scale = "none",
color = col.pal,
annotation_col = col.annot.means,
show_rownames = TRUE,
show_colnames = TRUE,
main = 'Group Means\n(all cells)',
fontsize_row = 8,
annotation_colors = list(
Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
),
legend = FALSE,  # Hide colorbar
treeheight_row = 0,  # Hide row dendrogram
cellwidth = 20,  # Force square cells (larger since fewer columns)
cellheight = 20,  # Force square cells (larger since fewer columns)
labels_row = sub("\n.*", "", rownames(x_means))  # Only show gene IDs without SE info
)
# Save plots to PDF
pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8)
grid::grid.newpage()
grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2))
dev.off()
# Display plots in notebook
grid::grid.newpage()
grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2))
# Save detailed statistics
write.csv(data.frame(
Anonymous_ID = names(anon_genes),
Original_Gene = anon_genes,
Mean_Responder = x_resp,
SE_Responder = x_resp_se,
Mean_NonResponder = x_nonresp,
SE_NonResponder = x_nonresp_se,
N_Responder = length(jx1),
N_NonResponder = length(jx2)
), 'results/response_gene_mapping_detailed.csv', row.names = FALSE)
message("Side-by-side heatmaps generated successfully")
message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv")
}, error = function(e) {
message(sprintf("Error generating heatmaps: %s", e$message))
message("\nDebugging information:")
message("1. Check if expression data contains extreme values")
message("2. Verify gene filtering and scaling steps")
message("3. Ensure all matrices have proper dimensions")
message("4. Look for NA/Inf values in the data")
})