Diff of /R/EdgeRAnalyze.R [000000] .. [0f2269]

Switch to unified view

a b/R/EdgeRAnalyze.R
1
#' Differential Gene Expression Analysis using 'edgeR'
2
#'
3
#' This function performs differential gene expression analysis using the 'edgeR' package.
4
#' It reads tumor and normal expression data, merges them, filters low-expressed genes,
5
#' normalizes the data, performs edgeR analysis, and outputs the results along with information
6
#' on gene expression changes.
7
#'
8
#' @importFrom tibble column_to_rownames
9
#' @importFrom dplyr mutate
10
#' @importFrom edgeR DGEList cpm calcNormFactors estimateGLMCommonDisp estimateGLMTrendedDisp estimateGLMTagwiseDisp glmFit glmLRT topTags
11
#' @importFrom stats model.matrix
12
#' @param tumor_file Path to the tumor data file (RDS format).
13
#' @param normal_file Path to the normal data file (RDS format).
14
#' @param output_file Path to save the output DEG data (RDS format).
15
#' @param logFC_threshold Threshold for log fold change for marking up/down-regulated genes.
16
#' @param p_value_threshold Threshold for p-value for filtering significant genes.
17
#' @return A data frame of differential expression results.
18
#' @references
19
#' edgeR: Differential analysis of sequence read count data.
20
#' For more information, visit the edgeR Bioconductor page:
21
#' https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
22
#' @export
23
#'
24
#' @examples
25
#' # Define file paths for tumor and normal data from the data folder
26
#' tumor_file <- system.file("extdata",
27
#'                          "removebatch_SKCM_Skin_TCGA_exp_tumor_test.rds",
28
#'                          package = "TransProR")
29
#' normal_file <- system.file("extdata",
30
#'                            "removebatch_SKCM_Skin_Normal_TCGA_GTEX_count_test.rds",
31
#'                            package = "TransProR")
32
#' output_file <- file.path(tempdir(), "DEG_edgeR.rds")
33
#'
34
#' DEG_edgeR <- edgeR_analyze(
35
#'   tumor_file = tumor_file,
36
#'   normal_file = normal_file,
37
#'   output_file = output_file,
38
#'   2.5,
39
#'   0.01
40
#' )
41
#'
42
#' # View the top 5 rows of the result
43
#' head(DEG_edgeR, 5)
44
#'
45
edgeR_analyze <- function(tumor_file, normal_file, output_file, logFC_threshold = 2.5, p_value_threshold = 0.01) {
46
  tumor <- readRDS(tumor_file)
47
  normal <- readRDS(normal_file)
48
49
  # Merge the datasets, ensuring both have genes as row names
50
  all_count_exp <- merge(tumor, normal, by = "row.names")
51
  all_count_exp <- tibble::column_to_rownames(all_count_exp, var = "Row.names")
52
53
  # Define groups for tumor and normal samples
54
  group <- c(rep('tumor', ncol(tumor)), rep('normal', ncol(normal)))
55
  group <- factor(group, levels = c("normal", "tumor"))
56
  group_table <- table(group)
57
58
  message("Group Table:")
59
  message(paste(names(group_table), group_table, sep = ": ", collapse = "\n"))
60
  # Add a space after the output for separation
61
  message(" ")
62
63
  # Create DGEList object for gene expression data and group information
64
  d <- edgeR::DGEList(counts = all_count_exp, group = group)
65
66
  # Filter lowly expressed genes based on CPM values
67
  keep <- rowSums(edgeR::cpm(d) > 1) >= 2
68
  d <- d[keep, , keep.lib.sizes = FALSE]
69
70
  # Update library size information in the samples
71
  d$samples$lib.size <- colSums(d$counts)
72
73
  # Normalize the data using the TMM method
74
  d <- edgeR::calcNormFactors(d)
75
76
  # Create design matrix for differential analysis model
77
  design <- stats::model.matrix(~0 + factor(group))
78
  rownames(design) <- colnames(d)
79
  colnames(design) <- levels(factor(group))
80
81
  # Estimate dispersions - common dispersion, trended dispersion, tagwise dispersion
82
  d <- edgeR::estimateGLMCommonDisp(d, design)
83
  d <- edgeR::estimateGLMTrendedDisp(d, design)
84
  d <- edgeR::estimateGLMTagwiseDisp(d, design)
85
86
  # Fit the model using Generalized Linear Model (GLM)
87
  fit <- edgeR::glmFit(d, design)
88
89
  # Perform differential expression analysis using Likelihood Ratio Test (LRT)
90
  lrt <- edgeR::glmLRT(fit, contrast = c(-1, 1)) # Note that the 'contrast' here is different from 'DESeq2'. Here, we only need to input c(-1, 1): -1 corresponds to normal, 1 corresponds to tumor.
91
92
  # Retrieve top differentially expressed genes
93
  nrDEG <- edgeR::topTags(lrt, n = nrow(d))
94
  DEG_edgeR <- as.data.frame(nrDEG)
95
96
  # Add 'change' column to mark up/down-regulated genes
97
  k1 <- (DEG_edgeR$PValue < p_value_threshold) & (DEG_edgeR$logFC < -logFC_threshold)
98
  k2 <- (DEG_edgeR$PValue < p_value_threshold) & (DEG_edgeR$logFC > logFC_threshold)
99
  DEG_edgeR <- dplyr::mutate(DEG_edgeR, change = ifelse(k1, "down", ifelse(k2, "up", "stable")))
100
101
  change_table <- table(DEG_edgeR$change)
102
103
  message("Change Table:")
104
  message(paste(names(change_table), change_table, sep = ": ", collapse = "\n"))
105
  # Add a space after the output for separation
106
  message(" ")
107
108
  # Save results to the specified output file
109
  saveRDS(DEG_edgeR, file = output_file)
110
111
  return(DEG_edgeR)
112
}