TransProR / Git / Diff of /R/EdgeRAnalyze.R

Models:
AlyssaS/
TransProR
Downloads: 1
Diff of /R/EdgeRAnalyze.R [000000] .. [0f2269]
Switch to side-by-side view

--- a
+++ b/R/EdgeRAnalyze.R
@@ -0,0 +1,112 @@
+#' Differential Gene Expression Analysis using 'edgeR'
+#'
+#' This function performs differential gene expression analysis using the 'edgeR' package.
+#' It reads tumor and normal expression data, merges them, filters low-expressed genes,
+#' normalizes the data, performs edgeR analysis, and outputs the results along with information
+#' on gene expression changes.
+#'
+#' @importFrom tibble column_to_rownames
+#' @importFrom dplyr mutate
+#' @importFrom edgeR DGEList cpm calcNormFactors estimateGLMCommonDisp estimateGLMTrendedDisp estimateGLMTagwiseDisp glmFit glmLRT topTags
+#' @importFrom stats model.matrix
+#' @param tumor_file Path to the tumor data file (RDS format).
+#' @param normal_file Path to the normal data file (RDS format).
+#' @param output_file Path to save the output DEG data (RDS format).
+#' @param logFC_threshold Threshold for log fold change for marking up/down-regulated genes.
+#' @param p_value_threshold Threshold for p-value for filtering significant genes.
+#' @return A data frame of differential expression results.
+#' @references
+#' edgeR: Differential analysis of sequence read count data.
+#' For more information, visit the edgeR Bioconductor page:
+#' https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
+#' @export
+#'
+#' @examples
+#' # Define file paths for tumor and normal data from the data folder
+#' tumor_file <- system.file("extdata",
+#'                          "removebatch_SKCM_Skin_TCGA_exp_tumor_test.rds",
+#'                          package = "TransProR")
+#' normal_file <- system.file("extdata",
+#'                            "removebatch_SKCM_Skin_Normal_TCGA_GTEX_count_test.rds",
+#'                            package = "TransProR")
+#' output_file <- file.path(tempdir(), "DEG_edgeR.rds")
+#'
+#' DEG_edgeR <- edgeR_analyze(
+#'   tumor_file = tumor_file,
+#'   normal_file = normal_file,
+#'   output_file = output_file,
+#'   2.5,
+#'   0.01
+#' )
+#'
+#' # View the top 5 rows of the result
+#' head(DEG_edgeR, 5)
+#'
+edgeR_analyze <- function(tumor_file, normal_file, output_file, logFC_threshold = 2.5, p_value_threshold = 0.01) {
+  tumor <- readRDS(tumor_file)
+  normal <- readRDS(normal_file)
+
+  # Merge the datasets, ensuring both have genes as row names
+  all_count_exp <- merge(tumor, normal, by = "row.names")
+  all_count_exp <- tibble::column_to_rownames(all_count_exp, var = "Row.names")
+
+  # Define groups for tumor and normal samples
+  group <- c(rep('tumor', ncol(tumor)), rep('normal', ncol(normal)))
+  group <- factor(group, levels = c("normal", "tumor"))
+  group_table <- table(group)
+
+  message("Group Table:")
+  message(paste(names(group_table), group_table, sep = ": ", collapse = "\n"))
+  # Add a space after the output for separation
+  message(" ")
+
+  # Create DGEList object for gene expression data and group information
+  d <- edgeR::DGEList(counts = all_count_exp, group = group)
+
+  # Filter lowly expressed genes based on CPM values
+  keep <- rowSums(edgeR::cpm(d) > 1) >= 2
+  d <- d[keep, , keep.lib.sizes = FALSE]
+
+  # Update library size information in the samples
+  d$samples$lib.size <- colSums(d$counts)
+
+  # Normalize the data using the TMM method
+  d <- edgeR::calcNormFactors(d)
+
+  # Create design matrix for differential analysis model
+  design <- stats::model.matrix(~0 + factor(group))
+  rownames(design) <- colnames(d)
+  colnames(design) <- levels(factor(group))
+
+  # Estimate dispersions - common dispersion, trended dispersion, tagwise dispersion
+  d <- edgeR::estimateGLMCommonDisp(d, design)
+  d <- edgeR::estimateGLMTrendedDisp(d, design)
+  d <- edgeR::estimateGLMTagwiseDisp(d, design)
+
+  # Fit the model using Generalized Linear Model (GLM)
+  fit <- edgeR::glmFit(d, design)
+
+  # Perform differential expression analysis using Likelihood Ratio Test (LRT)
+  lrt <- edgeR::glmLRT(fit, contrast = c(-1, 1)) # Note that the 'contrast' here is different from 'DESeq2'. Here, we only need to input c(-1, 1): -1 corresponds to normal, 1 corresponds to tumor.
+
+  # Retrieve top differentially expressed genes
+  nrDEG <- edgeR::topTags(lrt, n = nrow(d))
+  DEG_edgeR <- as.data.frame(nrDEG)
+
+  # Add 'change' column to mark up/down-regulated genes
+  k1 <- (DEG_edgeR$PValue < p_value_threshold) & (DEG_edgeR$logFC < -logFC_threshold)
+  k2 <- (DEG_edgeR$PValue < p_value_threshold) & (DEG_edgeR$logFC > logFC_threshold)
+  DEG_edgeR <- dplyr::mutate(DEG_edgeR, change = ifelse(k1, "down", ifelse(k2, "up", "stable")))
+
+  change_table <- table(DEG_edgeR$change)
+
+  message("Change Table:")
+  message(paste(names(change_table), change_table, sep = ": ", collapse = "\n"))
+  # Add a space after the output for separation
+  message(" ")
+
+  # Save results to the specified output file
+  saveRDS(DEG_edgeR, file = output_file)
+
+  return(DEG_edgeR)
+}