Diff of /notebooks/DE_analysis.Rmd [000000] .. [6df130]

Switch to unified view

a b/notebooks/DE_analysis.Rmd
1
---
2
title: "DE Analysis of scRNA-seq data"
3
date: "Last modified: `r format(Sys.time(), '%B %d, %Y')`"
4
tags: [scRNA-seq, seurat, melanoma, immunotherapy, PBMCs] 
5
output:
6
  html_document:
7
    theme: flatly
8
    highlight: zenburn
9
    toc: true
10
    number_sections: false
11
    toc_depth: 2
12
    toc_float:
13
      collapsed: false
14
      smooth_scroll: true
15
    code_folding: hide
16
    self_contained: yes
17
---
18
19
# Notebook Description
20
21
This notebook compiles the code and outputs for differential expression (DE) analysis. Specifically, two methods are covered:  
22
  * unpaired Student's t-test (custom function) 
23
  * [MAST](https://doi.org/10.1186/s13059-015-0844-5) 
24
25
Of note, pre-processed data stored in the `scRNAseq_analysis.RData` file is used for the analyses detailed below.  
26
27
# Initialize environment
28
29
```{r load_packages, results="hide", message=F, warning=F, error=F}
30
# Install required packages if not already installed
31
if (!require("BiocManager", quietly = TRUE)) {
32
  install.packages("BiocManager")
33
}
34
if (!require("MAST", quietly = TRUE)) {
35
  BiocManager::install("MAST")
36
}
37
if (!require("presto", quietly = TRUE)) {
38
  install.packages("presto")
39
}
40
41
# Memory management settings
42
options(future.globals.maxSize = 16000 * 1024^2)  # Set maximum global size to 8GB
43
44
# Load core packages first
45
library(Matrix)       # for sparse matrix operations
46
library(Seurat)      # for scRNA-seq analysis
47
library(MAST)        # for scRNAseq DEG analysis
48
49
# Load other packages
50
library(svDialogs)    # for prompting user-input
51
library(vroom)        # for quickly reading data
52
library(dplyr)        # for data processing
53
library(DT)           # to display datatables
54
library(harmony)      # to integration scRNA-seq data
55
library(patchwork)    # for combining plots
56
library(ggplot2)      # for data visualization
57
library(ggrepel)      # to use geom_pont_repel()
58
library(ggvenn)       # to visualize venn diagrams
59
library(openxlsx)     # to write data to excel
60
library(progress)     # to display progress bar
61
library(rio)          # to load all worksheets in a workbook
62
library(pheatmap)     # to visualize heatmaps
63
library(ggfortify)    # to visualize PCA plots
64
library(presto)       # for faster Wilcoxon test implementation
65
library(gridExtra)    # for arranging multiple plots
66
67
# Set up parallel processing
68
library(future)
69
plan(multisession, workers = 2)  # Adjust based on available memory
70
options(future.globals.maxSize = 16000 * 1024^2)
71
```
72
73
System settings.  
74
75
```{r settings}
76
77
# Adjust system settings
78
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
79
80
# Enhanced memory management functions
81
clear_memory <- function(deep_clean = FALSE) {
82
  # Standard cleanup
83
  gc(verbose = FALSE)
84
  if(exists(".Random.seed")) rm(.Random.seed)
85
  
86
  # Deep cleanup if requested
87
  if(deep_clean) {
88
    # Clear package caches
89
    if(exists(".__C__")) rm(list=".__C__", envir=.GlobalEnv)
90
    # Clear hidden objects
91
    rm(list = ls(all.names = TRUE, envir = .GlobalEnv), envir = .GlobalEnv)
92
    # Force garbage collection multiple times
93
    replicate(3, gc(verbose = FALSE))
94
  }
95
  
96
  invisible(gc())
97
}
98
99
# Function to get data efficiently with batching support
100
get_assay_data <- function(object, data_type = "data", batch_size = 1000) {
101
  tryCatch({
102
    # Get data using appropriate method
103
    if (packageVersion("Seurat") >= "5.0.0") {
104
      data <- GetAssayData(object = object, layer = data_type)
105
    } else {
106
      data <- GetAssayData(object = object, slot = data_type)
107
    }
108
    
109
    # Convert to sparse matrix if dense
110
    if (!is(data, "dgCMatrix")) {
111
      data <- as(data, "dgCMatrix")
112
    }
113
    
114
    return(data)
115
  }, error = function(e) {
116
    stop(sprintf("Failed to get assay data: %s", e$message))
117
  })
118
}
119
120
# Function to process data in batches
121
process_in_batches <- function(data, batch_size = 1000, FUN, ...) {
122
  n_cells <- ncol(data)
123
  n_batches <- ceiling(n_cells / batch_size)
124
  results <- vector("list", n_batches)
125
  
126
  pb <- progress_bar$new(total = n_batches)
127
  
128
  for(i in 1:n_batches) {
129
    start_idx <- (i-1) * batch_size + 1
130
    end_idx <- min(i * batch_size, n_cells)
131
    
132
    # Process batch
133
    batch_data <- data[, start_idx:end_idx]
134
    results[[i]] <- FUN(batch_data, ...)
135
    
136
    # Clean up
137
    rm(batch_data)
138
    clear_memory()
139
    pb$tick()
140
  }
141
  
142
  # Combine results
143
  combined_results <- do.call(rbind, results)
144
  rm(results)
145
  clear_memory()
146
  
147
  return(combined_results)
148
}
149
150
# Add gene anonymization function after the settings chunk
151
anonymize_genes <- function(gene_list, prefix = "GENE") {
152
  # Create mapping of real genes to anonymous IDs
153
  anon_ids <- paste0(prefix, "_", sprintf("%04d", seq_along(gene_list)))
154
  names(anon_ids) <- gene_list
155
  return(anon_ids)
156
}
157
158
```
159
160
Load pre-processed data. 
161
162
```{r load_data, warning=F, message=F}
163
file_path <- "/Users/scampit/Projects/torchstack/cancer-biomarker-discovery"
164
setwd(file_path)
165
166
# Start with clean environment
167
#clear_memory(deep_clean = TRUE)
168
169
# Load data from saved workspace
170
f <- list.files()
171
if (any(endsWith(f, '.RData'))) {
172
  load(f[endsWith(f, '.RData')][1])
173
} else {
174
  stop("No .RData file found. Please ensure scRNAseq_wksp.RData is in the working directory.")
175
}
176
177
# Initialize DE lists
178
de <- list()
179
180
# Get data from Seurat object efficiently
181
tryCatch({
182
  message("Loading expression data from Seurat object...")
183
  de$data <- get_assay_data(data.all$proc)
184
  
185
  # Verify data loaded correctly
186
  if (is.null(de$data) || !is(de$data, "dgCMatrix")) {
187
    stop("Failed to load expression data properly")
188
  }
189
  message(sprintf("Loaded expression matrix with %d genes and %d cells", 
190
                 nrow(de$data), ncol(de$data)))
191
}, error = function(e) {
192
  stop(sprintf("Error loading data: %s", e$message))
193
})
194
195
#clear_memory()
196
197
# Load or initialize results
198
de$ttest <- if(file.exists('results/tables/DE_ttest.xlsx')) {
199
  import_list('results/tables/DE_ttest.xlsx')
200
} else {
201
  list()
202
}
203
204
de$mast <- if(file.exists('results/tables/DE_mast.xlsx')) {
205
  import_list('results/tables/DE_mast.xlsx')
206
} else {
207
  list()
208
}
209
210
#clear_memory(deep_clean = TRUE)
211
212
# Verify data structure
213
str(de, max.level = 1)
214
215
```
216
217
```{r summary_stats, warning=F, message=F}
218
219
# Assign row names to MAST results
220
f <- names(de$ttest)
221
for (i in 1:length(f)) { 
222
  x <- de$ttest[[f[i]]]; y <- de$mast[[f[i]]]
223
  g <- intersect(x$gene[x$q.val < 0.05], y$gene[y$p_val_adj < 0.05]) 
224
  cat(sprintf('%s\tDEGs (t-test): %d\tDEGs (MAST): %d\tDEGs (overlap): %d\n', 
225
              f[i], sum(x$q.val < 0.05), sum(y$p_val_adj < 0.05), length(g)))
226
}
227
228
```
229
230
# DE analyses
231
232
**Description**: determine differentially expressed genes (DEGs) between different comparisons.  
233
234
```{r DE_initialize, warning=F, message=T, eval=!('data' %in% de)}
235
236
# Instantiate DE variable + relevant fields
237
de <- list(); de$ttest <- list(); de$mast <- list()
238
239
# Define data to work with - update to use layer
240
tryCatch({
241
  de$data <- GetAssayData(object=data.all$proc, layer='data')
242
}, error = function(e) {
243
  # Fallback for older Seurat versions
244
  message("Attempting fallback for older Seurat version...")
245
  de$data <- GetAssayData(object=data.all$proc, slot='data')
246
})
247
248
# Add validation
249
if (is.null(de$data) || ncol(de$data) == 0 || nrow(de$data) == 0) {
250
  stop("Failed to retrieve data from Seurat object")
251
}
252
253
```
254
255
## Case: disease vs. control (tumor)
256
257
Comparison between benign vs. tumor skin cells. 
258
Estimated runtime: ~15 minutes. 
259
260
```{r DE_tumor, warning=F, message=T, eval=!('tumor' %in% de$ttest)}
261
262
# Start timer
263
t.start <- Sys.time()
264
265
# Define cell groups to compare
266
ix <- intersect(rownames(obj$GSE72056), rownames(obj$GSE115978)) %>%
267
  intersect(., rownames(obj$GSE151091))
268
jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes'
269
jx2 <- data.all$proc$Source == names(obj)[4]
270
271
# Subsample cells if too many
272
max_cells_per_group <- 1000
273
if(sum(jx1) > max_cells_per_group) {
274
  set.seed(42)
275
  cells_g1 <- sample(which(jx1), max_cells_per_group)
276
  jx1[setdiff(which(jx1), cells_g1)] <- FALSE
277
}
278
if(sum(jx2) > max_cells_per_group) {
279
  set.seed(42)
280
  cells_g2 <- sample(which(jx2), max_cells_per_group)
281
  jx2[setdiff(which(jx2), cells_g2)] <- FALSE
282
}
283
284
# DE analysis (t-test) with memory management
285
x <- de$data[ix, jx1]
286
y <- de$data[ix, jx2]
287
#clear_memory()
288
289
de$ttest$tumor <- de_analyze(x, y) %>% na.omit
290
rm(x, y)
291
#clear_memory(deep_clean = TRUE)
292
293
# DE analysis (MAST)
294
message("Creating Seurat object...")
295
combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2])
296
if (!is(combined_data, "dgCMatrix")) {
297
  combined_data <- as(combined_data, "dgCMatrix")
298
}
299
x <- CreateSeuratObject(counts = combined_data)
300
rm(combined_data)
301
#clear_memory()
302
303
# Process data in steps with memory management
304
message("Normalizing data...")
305
x <- NormalizeData(x, verbose = FALSE)
306
#clear_memory()
307
308
message("Finding variable features...")
309
x <- FindVariableFeatures(x, verbose = FALSE)
310
#clear_memory()
311
312
message("Scaling data...")
313
x <- ScaleData(x, verbose = FALSE)
314
#clear_memory()
315
316
message("Running PCA...")
317
x <- RunPCA(x, verbose = FALSE)
318
#clear_memory()
319
320
# Set identities
321
cells_g1 <- 1:sum(jx1)
322
cells_g2 <- (sum(jx1) + 1):ncol(x)
323
Idents(object = x, cells = cells_g1) <- 'Disease'
324
Idents(object = x, cells = cells_g2) <- 'Control'
325
rm(cells_g1, cells_g2)
326
#clear_memory()
327
328
# Run MAST analysis with error handling and memory management
329
message("Running differential expression analysis...")
330
tryCatch({
331
  de$mast$tumor <- FindMarkers(
332
    object = x,
333
    ident.1 = 'Disease',
334
    ident.2 = 'Control',
335
    test.use = 'MAST',
336
    verbose = TRUE,
337
    max.cells.per.ident = 1000,
338
    min.pct = 0.1,        # Filter low-expressed genes
339
    logfc.threshold = 0.25 # Filter low effect size
340
  )
341
}, error = function(e) {
342
  message("Error in MAST analysis: ", e$message)
343
  message("Attempting alternative analysis method...")
344
  de$mast$tumor <- FindMarkers(
345
    object = x,
346
    ident.1 = 'Disease',
347
    ident.2 = 'Control',
348
    test.use = 'wilcox',
349
    verbose = TRUE,
350
    max.cells.per.ident = 1000,
351
    min.pct = 0.1,
352
    logfc.threshold = 0.25
353
  )
354
})
355
356
# Clean up
357
rm(x)
358
#clear_memory(deep_clean = TRUE)
359
360
# End timer + log time elapsed
361
t.end <- Sys.time()
362
message("Analysis completed in: ", difftime(t.end, t.start, units = "mins"), " minutes")
363
364
```
365
366
## Case: disease vs. control (immune cells, bulk)
367
368
Bulk comparison between healthy vs. diseased immune cells. 
369
Estimated runtime: ~20 minutes. 
370
371
```{r DE_immune_bulk,warning=F, message=T, eval=!('immune' %in% de$ttest)}
372
373
# Start timer
374
t.start <- Sys.time()
375
376
# Define cell groups to compare
377
ix <- intersect(rownames(obj$GSE120575), rownames(obj$GSE115978)) %>% 
378
  intersect(., rownames(obj$GSE183212))
379
jx1 <- data.all$proc$Source == names(obj)[1] | data.all$proc$Malignant %in% 'No'
380
jx2 <- data.all$proc$Source == names(obj)[5]
381
382
# DE analysis (t-test)
383
x <- de$data[ix, jx1]; y <- de$data[ix, jx2]
384
de$ttest$immune <- de_analyze(x, y) %>% na.omit()
385
386
# DE analysis (MAST)
387
# Create Seurat object with proper assay
388
x <- CreateSeuratObject(counts = cbind(de$data[ix, jx1], de$data[ix, jx2]))
389
# Normalize the data
390
x <- NormalizeData(x)
391
# Find variable features
392
x <- FindVariableFeatures(x)
393
# Scale the data
394
x <- ScaleData(x)
395
# Run PCA
396
x <- RunPCA(x)
397
398
# Set identities
399
Idents(object=x, cells=1:sum(jx1)) <- 'Tumor'
400
Idents(object=x, cells=sum(jx1)+1:ncol(x)) <- 'Benign'
401
402
# Run MAST analysis
403
de$mast$immune <- FindMarkers(object=x, 
404
                             ident.1='Tumor', 
405
                             ident.2='Benign', 
406
                             test.use='MAST',
407
                             verbose = TRUE)
408
409
# End timer + log time elapsed
410
t.end <- Sys.time()
411
t.end - t.start
412
413
```
414
415
## Case: disease vs. control (immune cells, cluster-based)
416
417
Immune cell-specific comparisons between healthy vs. diseased cells. 
418
Estimated runtime: ~15 minutes. 
419
420
```{r DE_immune_cluster, warning=F, message=T}
421
# Start timer
422
t.start <- Sys.time()
423
424
# Define labeled scRNAseq data
425
lab <- c('Tumor cell', 'Technical error 1', 'Cytotoxic CD8 T cell 1', 'B cell', 
426
         'Melanocytes 1', 'Macrophage', 'Cytotoxic CD8 T cell 2', 'Technical error 2', 
427
         'Memory T cell', 'Melanocytes 2', 'Dysfunctional CD8 T cell', 
428
         'Cytotoxic CD8 T cell 3', '? 1', 'Melanocytes 3', 'Activated cell', 
429
         'Exhausted T cell', 'Cytotoxic T cell', '? 2', '? 3', 'Melanocytes 4', '? 4')
430
names(lab) <- levels(data.all$proc)
431
plot.data <- data.all$proc %>% RenameIdents(., lab)
432
433
# Make sure we have the expression data
434
if (!exists("de") || is.null(de$data) || !is(de$data, "dgCMatrix")) {
435
  message("Reloading expression data...")
436
  de <- list()
437
  de$data <- GetAssayData(object = data.all$proc, layer = "data")
438
  if (!is(de$data, "dgCMatrix")) {
439
    de$data <- as(de$data, "dgCMatrix")
440
  }
441
}
442
443
# Initialize results lists if they don't exist
444
if (!"ttest" %in% names(de)) de$ttest <- list()
445
if (!"mast" %in% names(de)) de$mast <- list()
446
447
# Iterate through each immune cell group
448
immune.cells <- c('B', 'CD8', 'Macrophage', 'Memory', 'Activated')
449
for (i in 1:length(immune.cells)) { 
450
  message(sprintf("\nProcessing %s cells...", immune.cells[i]))
451
  
452
  # Define row + column indices
453
  ix <- intersect(rownames(obj$GSE120575), rownames(obj$GSE115978)) %>% 
454
    intersect(., rownames(obj$GSE183212))
455
  c <- grepl(immune.cells[i], Idents(plot.data))
456
  jx1 <- (plot.data$Source == names(obj)[1] | 
457
            data.all$proc$Malignant %in% 'No') & c
458
  jx2 <- data.all$proc$Source == names(obj)[5] & c
459
  
460
  # Print number of genes + cells
461
  message(sprintf('No. of genes: %d\tNo. of G1: %d\tNo. of G2: %d', 
462
                length(ix), sum(jx1), sum(jx2)))
463
  
464
  # Skip if not enough cells
465
  if (sum(jx1) < 3 || sum(jx2) < 3) {
466
    message(sprintf("Skipping %s - insufficient cells", immune.cells[i]))
467
    next
468
  }
469
  
470
  # DE analysis (t-test)
471
  message("Running t-test analysis...")
472
  x <- de$data[ix, jx1]
473
  y <- de$data[ix, jx2]
474
  de$ttest[[immune.cells[i]]] <- de_analyze(x, y) %>% na.omit()
475
  rm(x, y)
476
  clear_memory()
477
478
  # DE analysis (MAST)
479
  message("Running MAST analysis...")
480
  tryCatch({
481
    # Create Seurat object
482
    combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2])
483
    if (!is(combined_data, "dgCMatrix")) {
484
      combined_data <- as(combined_data, "dgCMatrix")
485
    }
486
    x <- CreateSeuratObject(counts = combined_data)
487
    rm(combined_data)
488
    clear_memory()
489
    
490
    # Process data
491
    x <- x %>%
492
      NormalizeData(verbose = FALSE) %>%
493
      FindVariableFeatures(verbose = FALSE) %>%
494
      ScaleData(verbose = FALSE)
495
    
496
    # Set identities
497
    Idents(object = x, cells = 1:sum(jx1)) <- 'Tumor'
498
    Idents(object = x, cells = (sum(jx1) + 1):ncol(x)) <- 'Benign'
499
    
500
    # Run MAST
501
    de$mast[[immune.cells[i]]] <- FindMarkers(
502
      object = x,
503
      ident.1 = 'Tumor',
504
      ident.2 = 'Benign',
505
      test.use = 'MAST',
506
      verbose = TRUE,
507
      max.cells.per.ident = 1000
508
    )
509
    
510
    rm(x)
511
    clear_memory()
512
    
513
  }, error = function(e) {
514
    message(sprintf("Error in MAST analysis for %s: %s", immune.cells[i], e$message))
515
    message("Attempting Wilcoxon test instead...")
516
    
517
    # Try Wilcoxon as fallback
518
    de$mast[[immune.cells[i]]] <- FindMarkers(
519
      object = x,
520
      ident.1 = 'Tumor',
521
      ident.2 = 'Benign',
522
      test.use = 'wilcox',
523
      verbose = TRUE,
524
      max.cells.per.ident = 1000
525
    )
526
  })
527
  
528
  message(sprintf("Completed analysis for %s", immune.cells[i]))
529
}
530
531
# End timer + log time elapsed
532
t.end <- Sys.time()
533
message("\nTotal analysis time: ", difftime(t.end, t.start, units = "mins"), " minutes")
534
535
# Print summary of results
536
message("\nSummary of analyses:")
537
for (cell_type in immune.cells) {
538
  if (cell_type %in% names(de$ttest)) {
539
    n_ttest <- nrow(de$ttest[[cell_type]])
540
    n_mast <- if(cell_type %in% names(de$mast)) nrow(de$mast[[cell_type]]) else 0
541
    message(sprintf("%s: %d t-test DEGs, %d MAST DEGs", cell_type, n_ttest, n_mast))
542
  }
543
}
544
545
```
546
547
## Case: responder vs. non-responder (GSE120575)
548
549
Comparison between responder vs. non-responder cells to immunotherapy. 
550
Estimated runtime: ~1 hour.  
551
552
```{r DE_response, warning=F, message=T, eval=!('response' %in% de$ttest)}
553
# Start timer
554
t.start <- Sys.time()
555
556
# Make sure we have the expression data
557
if (!exists("de") || is.null(de$data) || !is(de$data, "dgCMatrix")) {
558
  message("Reloading expression data...")
559
  de <- list()
560
  de$data <- GetAssayData(object = data.all$proc, layer = "data")
561
  if (!is(de$data, "dgCMatrix")) {
562
    de$data <- as(de$data, "dgCMatrix")
563
  }
564
}
565
566
# Define cell groups to compare
567
ix <- rownames(obj$GSE120575)
568
jx1 <- data.all$proc$Response %in% 'Responder'
569
jx2 <- data.all$proc$Response %in% 'Non-responder'
570
571
# Print cell counts
572
message(sprintf("Number of responder cells: %d", sum(jx1)))
573
message(sprintf("Number of non-responder cells: %d", sum(jx2)))
574
575
# Subsample cells if too many
576
max_cells_per_group <- 1000
577
if(sum(jx1) > max_cells_per_group) {
578
  set.seed(42)
579
  cells_g1 <- sample(which(jx1), max_cells_per_group)
580
  jx1[setdiff(which(jx1), cells_g1)] <- FALSE
581
}
582
if(sum(jx2) > max_cells_per_group) {
583
  set.seed(42)
584
  cells_g2 <- sample(which(jx2), max_cells_per_group)
585
  jx2[setdiff(which(jx2), cells_g2)] <- FALSE
586
}
587
588
# DE analysis (t-test)
589
message("Running t-test analysis...")
590
x <- de$data[ix, jx1]
591
y <- de$data[ix, jx2]
592
de$ttest$response <- de_analyze(x, y) %>% na.omit
593
rm(x, y)
594
#clear_memory()
595
596
# DE analysis (MAST)
597
message("Running MAST analysis...")
598
tryCatch({
599
  # Create Seurat object
600
  message("Creating Seurat object...")
601
  combined_data <- cbind(de$data[ix, jx1], de$data[ix, jx2])
602
  if (!is(combined_data, "dgCMatrix")) {
603
    combined_data <- as(combined_data, "dgCMatrix")
604
  }
605
  x <- CreateSeuratObject(counts = combined_data)
606
  rm(combined_data)
607
  clear_memory()
608
  
609
  # Process data
610
  message("Processing data...")
611
  x <- x %>%
612
    NormalizeData(verbose = FALSE) %>%
613
    FindVariableFeatures(verbose = FALSE) %>%
614
    ScaleData(verbose = FALSE) %>%
615
    RunPCA(verbose = FALSE)
616
  
617
  # Set identities
618
  message("Setting identities...")
619
  cells_g1 <- 1:sum(jx1)
620
  cells_g2 <- (sum(jx1) + 1):ncol(x)
621
  Idents(object = x, cells = cells_g1) <- 'Responder'
622
  Idents(object = x, cells = cells_g2) <- 'Non-responder'
623
  
624
  # Verify identities
625
  if (!all(levels(Idents(x)) %in% c('Responder', 'Non-responder'))) {
626
    stop("Failed to set identities correctly")
627
  }
628
  
629
  # Run MAST
630
  message("Running differential expression analysis...")
631
  de$mast$response <- FindMarkers(
632
    object = x,
633
    ident.1 = 'Responder',
634
    ident.2 = 'Non-responder',
635
    test.use = 'MAST',
636
    verbose = TRUE,
637
    max.cells.per.ident = 1000,
638
    min.pct = 0.1,
639
    logfc.threshold = 0.25
640
  )
641
  
642
  # Clean up
643
  rm(x, cells_g1, cells_g2)
644
  clear_memory()
645
  
646
}, error = function(e) {
647
  message("Error in MAST analysis: ", e$message)
648
  message("Attempting Wilcoxon test instead...")
649
  
650
  # Try Wilcoxon as fallback
651
  de$mast$response <- FindMarkers(
652
    object = x,
653
    ident.1 = 'Responder',
654
    ident.2 = 'Non-responder',
655
    test.use = 'wilcox',
656
    verbose = TRUE,
657
    max.cells.per.ident = 1000,
658
    min.pct = 0.1,
659
    logfc.threshold = 0.25
660
  )
661
})
662
663
# End timer + log time elapsed
664
t.end <- Sys.time()
665
message("\nAnalysis completed in: ", difftime(t.end, t.start, units = "mins"), " minutes")
666
667
# Print summary
668
if (!is.null(de$mast$response)) {
669
  message("\nFound ", sum(de$mast$response$p_val_adj < 0.05), 
670
          " significant DEGs (FDR < 0.05)")
671
}
672
673
```
674
675
# Visualizations
676
677
**Description**: visual inspection of DE analysis results. 
678
679
## T-test vs. MAST DEGs
680
681
Visual comparison of DEGs determined with t-test vs. MAST (venn diagrams).
682
683
```{r DE_venn, warning=F, message=T}
684
685
# Compare DEGs b/t the two methods
686
f <- names(de$mast); p <- list()
687
for (i in 1:length(f)) { 
688
  x <- de$ttest[[f[i]]]; y <- de$mast[[f[i]]]
689
  g <- intersect(x$gene[x$q.val < 0.05], row.names(y)[y$p_val_adj < 0.05])
690
  # g <- intersect(x$gene[x$q.val < 0.05], y$gene[y$p_val_adj < 0.05]) 
691
  cat(sprintf('No. of intersecting DEGs for %s: %d\n', f[i], length(g)))
692
  ggvenn(list('t-test'=x$gene[x$q.val < 0.05], 
693
              'MAST'=row.names(y)[y$p_val_adj < 0.05])) + ggtitle(f[i])
694
  ggsave(paste0('plots/DE_venn_', f[i], '.pdf'), width=5, height=5, units='in')
695
}
696
697
```
698
699
## Responder vs. non-responder DEGs
700
701
Clustergram (or heatmap) of DEGs identified for responder vs. non-responder comparison. 
702
703
```{r DE_response_heat, warning=F, message=T}
704
# Modify DE_response_heat chunk
705
tryCatch({
706
  # Get significant genes
707
  if (!exists("de") || is.null(de$mast$response)) {
708
    stop("MAST results not found. Please run the DE analysis first.")
709
  }
710
  
711
  sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
712
  if (length(sig_genes) == 0) {
713
    stop("No significant genes found (FDR < 0.05)")
714
  }
715
  message(sprintf("Found %d significant genes", length(sig_genes)))
716
  
717
  # Take top N genes by absolute log fold change
718
  n <- min(50, length(sig_genes))
719
  g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]), 
720
                      decreasing = TRUE)][1:n]
721
  
722
  # Create anonymous gene IDs
723
  anon_genes <- anonymize_genes(g)
724
  
725
  # Get all cells
726
  jx1 <- which(data.all$proc$Response %in% 'Responder')
727
  jx2 <- which(data.all$proc$Response %in% 'Non-responder')
728
  
729
  # Get expression data
730
  x <- de$data[g, c(jx1, jx2)]
731
  if (!is(x, "dgCMatrix")) {
732
    x <- as(x, "dgCMatrix")
733
  }
734
  x <- as.matrix(x)
735
  
736
  # Replace gene names with anonymous IDs
737
  rownames(x) <- anon_genes[rownames(x)]
738
  
739
  # Calculate mean expression for each group
740
  x_resp <- rowMeans(x[, 1:length(jx1), drop=FALSE])
741
  x_nonresp <- rowMeans(x[, (length(jx1)+1):ncol(x), drop=FALSE])
742
  
743
  # Combine into matrix
744
  x_means <- cbind(x_resp, x_nonresp)
745
  colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
746
  
747
  # Calculate standard errors for error bars
748
  x_resp_se <- apply(x[, 1:length(jx1), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
749
  x_nonresp_se <- apply(x[, (length(jx1)+1):ncol(x), drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
750
  
751
  # Add SE information to rownames
752
  rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)", 
753
                              rownames(x_means), 
754
                              x_resp_se, 
755
                              x_nonresp_se)
756
  
757
  # Scale data
758
  x_scaled <- t(scale(t(x_means)))
759
  
760
  # Create annotation
761
  col.annot <- data.frame(
762
    Group = c('Responder', 'Non-responder'),
763
    row.names = colnames(x_means)
764
  )
765
  
766
  # Color palette
767
  col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
768
  
769
  # Create heatmap
770
  message("Generating heatmap...")
771
  p <- pheatmap(
772
    x_scaled,
773
    cluster_rows = TRUE,
774
    cluster_cols = FALSE,
775
    scale = "none",  # already scaled above
776
    color = col.pal,
777
    annotation_col = col.annot,
778
    show_rownames = TRUE,
779
    show_colnames = TRUE,
780
    main = sprintf('Top %d De-identified DEGs (Mean Expression)', nrow(x_scaled)),
781
    fontsize_row = 8,
782
    annotation_colors = list(
783
      Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
784
    )
785
  )
786
  
787
  # Save plot
788
  message("Saving plot...")
789
  pdf('plots/DE_heatmap_response_collapsed.pdf', width = 11, height = 8)
790
  print(p)
791
  dev.off()
792
  
793
  # Save gene ID mapping
794
  write.csv(data.frame(
795
    Anonymous_ID = names(anon_genes),
796
    Original_Gene = anon_genes,
797
    Mean_Responder = x_resp,
798
    SE_Responder = x_resp_se,
799
    Mean_NonResponder = x_nonresp,
800
    SE_NonResponder = x_nonresp_se
801
  ), './response_gene_mapping_with_stats.csv', row.names = FALSE)
802
  
803
  message("Heatmap generated successfully")
804
  message("Gene mapping with statistics saved to results/response_gene_mapping_with_stats.csv")
805
  
806
}, error = function(e) {
807
  message(sprintf("Error generating heatmap: %s", e$message))
808
})
809
```
810
# Add new chunk after DE_response_heat
811
```{r DE_response_heat_individual, warning=F, message=T}
812
# Show individual cell heatmap alongside collapsed view
813
tryCatch({
814
  # Get significant genes
815
  if (!exists("de") || is.null(de$mast$response)) {
816
    stop("MAST results not found. Please run the DE analysis first.")
817
  }
818
  
819
  sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
820
  if (length(sig_genes) == 0) {
821
    stop("No significant genes found (FDR < 0.05)")
822
  }
823
  message(sprintf("Found %d significant genes", length(sig_genes)))
824
  
825
  # Take top N genes by absolute log fold change
826
  n <- min(50, length(sig_genes))
827
  g <- sig_genes[order(abs(de$mast$response$avg_log2FC[match(sig_genes, rownames(de$mast$response))]), 
828
                      decreasing = TRUE)][1:n]
829
  
830
  # Create anonymous gene IDs
831
  anon_genes <- anonymize_genes(g)
832
  
833
  # Get cells (subsample to make visualization manageable)
834
  jx1 <- which(data.all$proc$Response %in% 'Responder')
835
  jx2 <- which(data.all$proc$Response %in% 'Non-responder')
836
  
837
  # Subsample cells
838
  cells_per_group <- min(25, min(length(jx1), length(jx2)))
839
  set.seed(42)
840
  sampled_jx1 <- sample(jx1, cells_per_group)
841
  sampled_jx2 <- sample(jx2, cells_per_group)
842
  
843
  # Get expression data for individual cells
844
  x_individual <- de$data[g, c(sampled_jx1, sampled_jx2)]
845
  if (!is(x_individual, "dgCMatrix")) {
846
    x_individual <- as(x_individual, "dgCMatrix")
847
  }
848
  x_individual <- as.matrix(x_individual)
849
  
850
  # Clean individual cell data
851
  message("Cleaning individual cell data...")
852
  # Replace Inf/-Inf with NA
853
  x_individual[is.infinite(x_individual)] <- NA
854
  # Remove rows (genes) with any NA values
855
  complete_rows <- complete.cases(x_individual)
856
  if (!all(complete_rows)) {
857
    message(sprintf("Removing %d genes with NA values", sum(!complete_rows)))
858
    x_individual <- x_individual[complete_rows, ]
859
    g <- g[complete_rows]
860
    anon_genes <- anon_genes[complete_rows]
861
  }
862
  
863
  # Check for zero variance genes
864
  gene_var <- apply(x_individual, 1, var)
865
  if (any(gene_var == 0)) {
866
    message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
867
    x_individual <- x_individual[gene_var > 0, ]
868
    g <- g[gene_var > 0]
869
    anon_genes <- anon_genes[gene_var > 0]
870
  }
871
  
872
  # Replace gene names with anonymous IDs
873
  rownames(x_individual) <- anon_genes[rownames(x_individual)]
874
  
875
  # Scale individual cell data with validation
876
  message("Scaling individual cell data...")
877
  x_individual_scaled <- t(scale(t(x_individual)))
878
  # Check for NA/Inf values after scaling
879
  if (any(is.na(x_individual_scaled)) || any(is.infinite(x_individual_scaled))) {
880
    message("Found NA/Inf values after scaling, capping extreme values at ±10")
881
    x_individual_scaled[x_individual_scaled > 10] <- 10
882
    x_individual_scaled[x_individual_scaled < -10] <- -10
883
    x_individual_scaled[is.na(x_individual_scaled)] <- 0
884
  }
885
  
886
  # Create annotation for individual cells
887
  col.annot.individual <- data.frame(
888
    Response = rep(c('Responder', 'Non-responder'), each = cells_per_group),
889
    row.names = colnames(x_individual)
890
  )
891
  
892
  # Get mean expression data (for side-by-side comparison)
893
  x_resp <- rowMeans(de$data[g, jx1, drop=FALSE])
894
  x_nonresp <- rowMeans(de$data[g, jx2, drop=FALSE])
895
  x_means <- cbind(x_resp, x_nonresp)
896
  colnames(x_means) <- c("Responder\n(mean)", "Non-responder\n(mean)")
897
  
898
  # Calculate standard errors
899
  x_resp_se <- apply(de$data[g, jx1, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
900
  x_nonresp_se <- apply(de$data[g, jx2, drop=FALSE], 1, function(x) sd(x)/sqrt(length(x)))
901
  
902
  # Clean mean data
903
  message("Cleaning mean data...")
904
  x_means[is.infinite(x_means)] <- NA
905
  complete_rows_means <- complete.cases(x_means)
906
  if (!all(complete_rows_means)) {
907
    message(sprintf("Removing %d genes with NA values from means", sum(!complete_rows_means)))
908
    x_means <- x_means[complete_rows_means, ]
909
    x_resp_se <- x_resp_se[complete_rows_means]
910
    x_nonresp_se <- x_nonresp_se[complete_rows_means]
911
  }
912
  
913
  # Add SE information to rownames for mean data
914
  rownames(x_means) <- sprintf("%s\n(SE: %.2f, %.2f)", 
915
                              anon_genes[rownames(x_means)], 
916
                              x_resp_se, 
917
                              x_nonresp_se)
918
  
919
  # Scale mean data with validation
920
  message("Scaling mean data...")
921
  x_means_scaled <- t(scale(t(x_means)))
922
  if (any(is.na(x_means_scaled)) || any(is.infinite(x_means_scaled))) {
923
    message("Found NA/Inf values after scaling means, capping extreme values at ±10")
924
    x_means_scaled[x_means_scaled > 10] <- 10
925
    x_means_scaled[x_means_scaled < -10] <- -10
926
    x_means_scaled[is.na(x_means_scaled)] <- 0
927
  }
928
  
929
  # Create annotation for means
930
  col.annot.means <- data.frame(
931
    Group = c('Responder', 'Non-responder'),
932
    row.names = colnames(x_means)
933
  )
934
  
935
  # Color palette
936
  col.pal <- colorRampPalette(colors = c('navy', 'white', 'red'))(100)
937
  
938
  message("Generating plots...")
939
  # Create individual cells heatmap
940
  p1 <- pheatmap(
941
    x_individual_scaled,
942
    cluster_rows = TRUE,
943
    cluster_cols = FALSE,
944
    scale = "none",
945
    color = col.pal,
946
    annotation_col = col.annot.individual,
947
    show_rownames = TRUE,
948
    show_colnames = FALSE,
949
    gaps_col = cells_per_group,
950
    main = sprintf('Individual Cells\n(n=%d per group)', cells_per_group),
951
    fontsize_row = 8,
952
    annotation_colors = list(
953
      Response = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
954
    ),
955
    legend = FALSE,  # Hide colorbar
956
    treeheight_row = 0,  # Hide row dendrogram
957
    cellwidth = 10,  # Force square cells
958
    cellheight = 10  # Force square cells
959
  )
960
  
961
  # Create mean expression heatmap
962
  p2 <- pheatmap(
963
    x_means_scaled,
964
    cluster_rows = TRUE,
965
    cluster_cols = FALSE,
966
    scale = "none",
967
    color = col.pal,
968
    annotation_col = col.annot.means,
969
    show_rownames = TRUE,
970
    show_colnames = TRUE,
971
    main = 'Group Means\n(all cells)',
972
    fontsize_row = 8,
973
    annotation_colors = list(
974
      Group = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')
975
    ),
976
    legend = FALSE,  # Hide colorbar
977
    treeheight_row = 0,  # Hide row dendrogram
978
    cellwidth = 20,  # Force square cells (larger since fewer columns)
979
    cellheight = 20,  # Force square cells (larger since fewer columns)
980
    labels_row = sub("\n.*", "", rownames(x_means))  # Only show gene IDs without SE info
981
  )
982
  
983
  # Save plots to PDF
984
  pdf('plots/DE_heatmap_response_comparison.pdf', width = 15, height = 8)
985
  grid::grid.newpage()
986
  grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2))
987
  dev.off()
988
  
989
  # Display plots in notebook
990
  grid::grid.newpage()
991
  grid::grid.draw(gridExtra::grid.arrange(p1$gtable, p2$gtable, ncol=2))
992
  
993
  # Save detailed statistics
994
  write.csv(data.frame(
995
    Anonymous_ID = names(anon_genes),
996
    Original_Gene = anon_genes,
997
    Mean_Responder = x_resp,
998
    SE_Responder = x_resp_se,
999
    Mean_NonResponder = x_nonresp,
1000
    SE_NonResponder = x_nonresp_se,
1001
    N_Responder = length(jx1),
1002
    N_NonResponder = length(jx2)
1003
  ), 'results/response_gene_mapping_detailed.csv', row.names = FALSE)
1004
  
1005
  message("Side-by-side heatmaps generated successfully")
1006
  message("Detailed gene statistics saved to results/response_gene_mapping_detailed.csv")
1007
  
1008
}, error = function(e) {
1009
  message(sprintf("Error generating heatmaps: %s", e$message))
1010
  message("\nDebugging information:")
1011
  message("1. Check if expression data contains extreme values")
1012
  message("2. Verify gene filtering and scaling steps")
1013
  message("3. Ensure all matrices have proper dimensions")
1014
  message("4. Look for NA/Inf values in the data")
1015
})
1016
```
1017
1018
PCA of DEGs identified for responder vs. non-responder comparison. 
1019
1020
```{r DE_response_PCA, warning=F, message=T}
1021
# Modify DE_response_PCA chunk
1022
tryCatch({
1023
  # Validate MAST results exist
1024
  if (!exists("de") || is.null(de$mast$response)) {
1025
    stop("MAST results not found. Please run the DE analysis first.")
1026
  }
1027
  
1028
  # Get significant genes
1029
  sig_genes <- rownames(de$mast$response)[de$mast$response$p_val_adj < 0.05]
1030
  if (length(sig_genes) == 0) {
1031
    stop("No significant genes found (FDR < 0.05)")
1032
  }
1033
  message(sprintf("Found %d significant genes", length(sig_genes)))
1034
  
1035
  # Get responder/non-responder cells
1036
  jx <- data.all$proc$Response %in% c('Responder', 'Non-responder')
1037
  if (sum(jx) == 0) {
1038
    stop("No cells found matching response criteria")
1039
  }
1040
  message(sprintf("Found %d cells (%d Responders, %d Non-responders)", 
1041
                 sum(jx),
1042
                 sum(data.all$proc$Response[jx] == "Responder"),
1043
                 sum(data.all$proc$Response[jx] == "Non-responder")))
1044
  
1045
  # Subsample cells if too many
1046
  max_cells_per_group <- 1000
1047
  responder_idx <- which(data.all$proc$Response[jx] == "Responder")
1048
  nonresponder_idx <- which(data.all$proc$Response[jx] == "Non-responder")
1049
  
1050
  if (length(responder_idx) > max_cells_per_group) {
1051
    set.seed(42)
1052
    responder_idx <- sample(responder_idx, max_cells_per_group)
1053
  }
1054
  if (length(nonresponder_idx) > max_cells_per_group) {
1055
    set.seed(42)
1056
    nonresponder_idx <- sample(nonresponder_idx, max_cells_per_group)
1057
  }
1058
  
1059
  selected_cells <- c(responder_idx, nonresponder_idx)
1060
  
1061
  # Get expression data
1062
  x <- de$data[sig_genes, jx][, selected_cells]
1063
  if (!is(x, "dgCMatrix")) {
1064
    x <- as(x, "dgCMatrix")
1065
  }
1066
  x <- as.matrix(x)
1067
  
1068
  # Create anonymous gene IDs
1069
  anon_genes <- anonymize_genes(rownames(x))
1070
  rownames(x) <- anon_genes[rownames(x)]
1071
  
1072
  # Handle zero variance genes
1073
  gene_var <- apply(x, 1, var)
1074
  if (any(gene_var == 0)) {
1075
    message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
1076
    x <- x[gene_var > 0, ]
1077
  }
1078
  
1079
  # Clean data
1080
  x[is.infinite(x)] <- NA
1081
  x[is.na(x)] <- 0
1082
  
1083
  # Transpose and convert to data frame
1084
  x_t <- t(x)
1085
  
1086
  # Create metadata
1087
  m <- data.frame(
1088
    Response = data.all$proc$Response[jx][selected_cells],
1089
    row.names = rownames(x_t)
1090
  )
1091
  
1092
  # Apply PCA with validation
1093
  if (ncol(x_t) < 2) {
1094
    stop("Need at least 2 features for PCA")
1095
  }
1096
  if (nrow(x_t) < 3) {
1097
    stop("Need at least 3 samples for PCA")
1098
  }
1099
  
1100
  message("Running PCA...")
1101
  response_pca <- prcomp(x_t, scale. = TRUE)
1102
  
1103
  # Calculate variance explained
1104
  var_explained <- summary(response_pca)$importance[2, 1:2] * 100
1105
  
1106
  # Calculate group centroids
1107
  centroids <- aggregate(cbind(PC1, PC2) ~ Response, 
1108
                        data = cbind(as.data.frame(response_pca$x[,1:2]), m), 
1109
                        FUN = mean)
1110
  
1111
  # Create enhanced PCA plot
1112
  p <- ggplot(data = as.data.frame(response_pca$x[,1:2]), 
1113
              aes(x = PC1, y = PC2, color = m$Response)) +
1114
    # Add points with reduced alpha for transparency
1115
    geom_point(alpha = 0.3, size = 2) +
1116
    # Add confidence ellipses
1117
    stat_ellipse(level = 0.95, size = 1) +
1118
    # Add centroids
1119
    geom_point(data = centroids, aes(x = PC1, y = PC2), 
1120
               size = 5, shape = 18) +
1121
    # Add centroid labels
1122
    geom_text(data = centroids, 
1123
              aes(x = PC1, y = PC2, label = Response),
1124
              vjust = -1.5, size = 4, color = "black") +
1125
    # Customize appearance
1126
    ggtitle('De-identified DEG PCA with Group Trends') +
1127
    xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) +
1128
    ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) +
1129
    theme_minimal() +
1130
    scale_color_manual(values = c('Responder' = '#E41A1C', 'Non-responder' = '#377EB8')) +
1131
    theme(
1132
      legend.position = "bottom",
1133
      plot.title = element_text(hjust = 0.5, size = 14),
1134
      axis.title = element_text(size = 12),
1135
      legend.title = element_text(size = 12),
1136
      legend.text = element_text(size = 10)
1137
    )
1138
  
1139
  # Save plot
1140
  message("Saving plot...")
1141
  ggsave('plots/DE_PCA_response_collapsed.pdf', plot = p, width = 8, height = 8, units = 'in')
1142
  
1143
  # Display plot
1144
  print(p)
1145
  
1146
  message("PCA visualization completed successfully")
1147
  
1148
}, error = function(e) {
1149
  message(sprintf("Error in PCA analysis: %s", e$message))
1150
  message("Please ensure:")
1151
  message("1. DE analysis has been run successfully")
1152
  message("2. There are significant DEGs to analyze")
1153
  message("3. There are enough cells in each response group")
1154
  message("4. Expression data is valid and contains sufficient variation")
1155
})
1156
```
1157
1158
## Tumor vs. benign DEGs
1159
1160
Clustergram (or heatmap) of DEGs identified for tumor vs. benign comparison. 
1161
1162
```{r DE_tumor_heat, warning=F, message=T}
1163
1164
# Define data to plot
1165
n <- 50
1166
g <- de$mast$tumor$gene[1:n]
1167
jx1 <- which(data.all$proc$Source == names(obj)[3] | 
1168
  data.all$proc$Malignant %in% 'Yes')[1:(n/2)]
1169
jx2 <- which(data.all$proc$Source == names(obj)[4])[1:(n/2)]
1170
x <- de$data[g, c(jx1, jx2)] %>% as.data.frame(.)
1171
1172
# Specify heatmap arguments
1173
#   color palette
1174
col.pal <- colorRampPalette(colors=c('white', 'red'), space='Lab')(100)
1175
#   column annotations (response)
1176
col.annot <- data.frame(Malignant=rep(c('Yes', 'No'), each=n/2), 
1177
                        row.names=colnames(x))
1178
  
1179
# Heatmap
1180
p <- pheatmap(x, cluster_rows=T, cluster_cols=F, scale='none', color=col.pal, 
1181
              annotation_col=col.annot, cellheight=(500/n), cellwidth=(500/n), 
1182
              show_rownames=T, show_colnames=F, gaps_col=n/2, 
1183
              main='DEG heatmap (tumor)')
1184
  
1185
# Save plot
1186
#tiff('plots/DE_heatmap_tumor.tiff', units='in', width=11, height=8, res=300)
1187
print({p}); dev.off()
1188
1189
```
1190
1191
PCA of DEGs identified for tumor vs. benign comparison. 
1192
1193
```{r DE_tumor_PCA, warning=F, message=T}
1194
# Define data to plot
1195
tryCatch({
1196
  # Validate MAST results exist
1197
  if (!exists("de") || is.null(de$mast$tumor)) {
1198
    stop("MAST results not found. Please run the tumor DE analysis first.")
1199
  }
1200
  
1201
  # Get significant genes
1202
  sig_genes <- rownames(de$mast$tumor)[de$mast$tumor$p_val_adj < 0.05]
1203
  if (length(sig_genes) == 0) {
1204
    stop("No significant genes found (FDR < 0.05)")
1205
  }
1206
  message(sprintf("Found %d significant genes", length(sig_genes)))
1207
  
1208
  # Take top N genes by absolute log fold change
1209
  n <- min(100, length(sig_genes))
1210
  g <- sig_genes[order(abs(de$mast$tumor$avg_log2FC[match(sig_genes, rownames(de$mast$tumor))]), 
1211
                      decreasing = TRUE)][1:n]
1212
  
1213
  # Get tumor/benign cells
1214
  jx1 <- data.all$proc$Source == names(obj)[3] | data.all$proc$Malignant %in% 'Yes'
1215
  jx2 <- data.all$proc$Source == names(obj)[4]
1216
  
1217
  if (sum(jx1) == 0 || sum(jx2) == 0) {
1218
    stop("Insufficient cells in one or both groups")
1219
  }
1220
  message(sprintf("Found %d cells (%d Tumor, %d Benign)", 
1221
                 sum(jx1) + sum(jx2), sum(jx1), sum(jx2)))
1222
  
1223
  # Subsample cells if too many
1224
  max_cells_per_group <- 1000
1225
  if (sum(jx1) > max_cells_per_group) {
1226
    set.seed(42)
1227
    tumor_cells <- sample(which(jx1), max_cells_per_group)
1228
    jx1[setdiff(which(jx1), tumor_cells)] <- FALSE
1229
  }
1230
  if (sum(jx2) > max_cells_per_group) {
1231
    set.seed(42)
1232
    benign_cells <- sample(which(jx2), max_cells_per_group)
1233
    jx2[setdiff(which(jx2), benign_cells)] <- FALSE
1234
  }
1235
  
1236
  # Get expression data
1237
  x <- de$data[g, c(which(jx1), which(jx2))]
1238
  if (!is(x, "dgCMatrix")) {
1239
    x <- as(x, "dgCMatrix")
1240
  }
1241
  x <- as.matrix(x)
1242
  
1243
  # Handle zero variance genes
1244
  gene_var <- apply(x, 1, var)
1245
  if (any(gene_var == 0)) {
1246
    message(sprintf("Removing %d genes with zero variance", sum(gene_var == 0)))
1247
    x <- x[gene_var > 0, ]
1248
    g <- g[gene_var > 0]
1249
  }
1250
  
1251
  # Clean data
1252
  x[is.infinite(x)] <- NA
1253
  x[is.na(x)] <- 0
1254
  
1255
  # Transpose and convert to data frame
1256
  x_t <- t(x)
1257
  
1258
  # Create metadata
1259
  m <- data.frame(
1260
    Malignant = rep(c('Yes', 'No'), c(sum(jx1), sum(jx2))),
1261
    row.names = rownames(x_t)
1262
  )
1263
  
1264
  # Apply PCA with validation
1265
  if (ncol(x_t) < 2) {
1266
    stop("Need at least 2 features for PCA")
1267
  }
1268
  if (nrow(x_t) < 3) {
1269
    stop("Need at least 3 samples for PCA")
1270
  }
1271
  
1272
  message("Running PCA...")
1273
  pca.tumor <- prcomp(x_t, scale. = TRUE)
1274
  
1275
  # Calculate variance explained
1276
  var_explained <- summary(pca.tumor)$importance[2, 1:2] * 100
1277
  
1278
  # Visualize PCA plot with variance explained
1279
  p <- autoplot(pca.tumor, data = m, colour = 'Malignant') +
1280
    ggtitle('DEG PCA (tumor)') +
1281
    xlab(sprintf("PC1 (%.1f%%)", var_explained[1])) +
1282
    ylab(sprintf("PC2 (%.1f%%)", var_explained[2])) +
1283
    theme_minimal() +
1284
    scale_color_manual(values = c('Yes' = '#E41A1C', 'No' = '#377EB8'))
1285
  
1286
  # Save plot
1287
  message("Saving plot...")
1288
  ggsave('plots/DE_PCA_tumor.pdf', plot = p, width = 8, height = 6, units = 'in')
1289
  
1290
  # Display plot
1291
  print(p)
1292
  
1293
  # Return success message
1294
  message("PCA visualization completed successfully")
1295
  
1296
}, error = function(e) {
1297
  message(sprintf("Error in PCA analysis: %s", e$message))
1298
  message("Please ensure:")
1299
  message("1. Tumor DE analysis has been run successfully")
1300
  message("2. There are significant DEGs to analyze")
1301
  message("3. There are enough cells in each group")
1302
  message("4. Expression data is valid and contains sufficient variation")
1303
})
1304
```
1305
1306
# Save results (if prompted)
1307
1308
```{r save_outputs, warning=F, message=T}
1309
1310
#save.data <- dlgInput('Save all results? (T/F)', F)$res %>% as.logical(.)
1311
#if (save.data) { 
1312
#  # t-test results
1313
#  write.xlsx(de$ttest, 'results/DE_ttest.xlsx', row.names=F)
1314
#  # MAST results
1315
#  write.xlsx(de$mast, 'results/DE_MAST.xlsx', row.names=T)
1316
#}
1317
1318
```