a b/R/WilcoxonAnalyze.R
1
#' Differential Gene Expression Analysis Using Wilcoxon Rank-Sum Test
2
#'
3
#' This function performs differential gene expression analysis using Wilcoxon rank-sum tests.
4
#' It reads tumor and normal expression data, performs TMM normalization using 'edgeR', and uses Wilcoxon rank-sum tests to identify differentially expressed genes.
5
#'
6
#' @importFrom tibble column_to_rownames
7
#' @importFrom edgeR DGEList filterByExpr calcNormFactors cpm
8
#' @importFrom dplyr mutate
9
#' @importFrom stats wilcox.test p.adjust
10
#' @param tumor_file Path to the tumor data file (RDS format).
11
#' @param normal_file Path to the normal data file (RDS format).
12
#' @param output_file Path to save the output DEG data (RDS format).
13
#' @param logFC_threshold Threshold for log fold change for marking up/down-regulated genes.
14
#' @param fdr_threshold Threshold for FDR for filtering significant genes.
15
#' @return A data frame of differential expression results.
16
#' @references
17
#' Li, Y., Ge, X., Peng, F., Li, W., & Li, J. J. (2022). Exaggerated False Positives by Popular
18
#' Differential Expression Methods When Analyzing Human Population Samples. Genome Biology, 23(1), 79.
19
#' DOI: https://doi.org/10.1186/s13059-022-02648-4.
20
#' @export
21
#'
22
#' @examples
23
#' # Define file paths for tumor and normal data from the data folder
24
#' tumor_file <- system.file("extdata",
25
#'                           "removebatch_SKCM_Skin_TCGA_exp_tumor_test.rds",
26
#'                           package = "TransProR")
27
#' normal_file <- system.file("extdata",
28
#'                            "removebatch_SKCM_Skin_Normal_TCGA_GTEX_count_test.rds",
29
#'                            package = "TransProR")
30
#' output_file <- file.path(tempdir(), "Wilcoxon_rank_sum_testoutRst.rds")
31
#'
32
#' # Run the Wilcoxon rank sum test
33
#' outRst <- Wilcoxon_analyze(
34
#'   tumor_file = tumor_file,
35
#'   normal_file = normal_file,
36
#'   output_file = output_file,
37
#'   logFC_threshold = 2.5,
38
#'   fdr_threshold = 0.01
39
#' )
40
#'
41
#' # View the top 5 rows of the result
42
#' head(outRst, 5)
43
Wilcoxon_analyze <- function(tumor_file,
44
                             normal_file,
45
                             output_file,
46
                             logFC_threshold = 2.5,
47
                             fdr_threshold = 0.05) {
48
  # Read data
49
  tumor <- readRDS(tumor_file)
50
  normal <- readRDS(normal_file)
51
52
  # Merge the datasets and set row names
53
  all_count_exp <- merge(tumor, normal, by = "row.names")
54
  all_count_exp <- tibble::column_to_rownames(all_count_exp, var = "Row.names")
55
56
  # Define groups
57
  group <- c(rep('tumor', ncol(tumor)), rep('normal', ncol(normal)))
58
  group <- factor(group, levels = c("normal", "tumor"))
59
  group_table <- table(group)
60
61
  message("Group Table:")
62
  message(paste(names(group_table), group_table, sep = ": ", collapse = "\n"))
63
  # Add a space after the output for separation
64
  message(" ")
65
66
  # EdgeR TMM normalization
67
  y <- edgeR::DGEList(counts = all_count_exp, group = group)
68
  keep <- edgeR::filterByExpr(y)
69
  y <- y[keep, keep.lib.sizes = FALSE]
70
71
  # Perform TMM normalization and transfer to CPM (Counts Per Million)
72
  y <- edgeR::calcNormFactors(y, method = "TMM")
73
  count_norm <- edgeR::cpm(y)
74
  count_norm <- as.data.frame(count_norm)
75
76
  # Wilcoxon rank-sum test for each gene
77
  pvalues <- sapply(1:nrow(count_norm), function(i) {
78
    data <- cbind.data.frame(gene = as.numeric(t(count_norm[i, ])), group)
79
    stats::wilcox.test(gene ~ group, data)$p.value
80
  })
81
  fdr <- stats::p.adjust(pvalues, method = "fdr")
82
83
  # Calculate fold-change for each gene
84
  conditionsLevel <- levels(group)
85
  dataCon1 <- count_norm[, which(group == conditionsLevel[1])]
86
  dataCon2 <- count_norm[, which(group == conditionsLevel[2])]
87
  # The addition of a pseudo-count allows for robust statistical analysis of genes with low expression levels, while mitigating computational issues caused by zero expression values.
88
  # It prevents the occurrence of negative infinity (-Inf) when the numerator is zero, and positive infinity (Inf) when the denominator is zero.
89
  foldChanges <- log2((rowMeans(dataCon2) + 0.005) / (rowMeans(dataCon1) + 0.005))
90
  #foldChanges <- log2(rowMeans(dataCon2) / rowMeans(dataCon1))
91
92
  # Output results based on FDR threshold
93
  outRst <- data.frame(log2foldChange = foldChanges, pValues = pvalues, FDR = fdr)
94
  rownames(outRst) <- rownames(count_norm)
95
  outRst <- na.omit(outRst)
96
97
  # Mark up/down-regulated genes
98
  k1 <- (outRst$FDR < fdr_threshold) & (outRst$log2foldChange < -logFC_threshold)
99
  k2 <- (outRst$FDR < fdr_threshold) & (outRst$log2foldChange > logFC_threshold)
100
  outRst <- dplyr::mutate(outRst, change = ifelse(k1, "down", ifelse(k2, "up", "stable")))
101
102
  change_table <- table(outRst$change)
103
104
  message("Change Table:")
105
  message(paste(names(change_table), change_table, sep = ": ", collapse = "\n"))
106
  # Add a space after the output for separation
107
  message(" ")
108
109
  # Save results
110
  saveRDS(outRst, file = output_file)
111
112
  return(outRst)
113
}