|
a |
|
b/R/WilcoxonAnalyze.R |
|
|
1 |
#' Differential Gene Expression Analysis Using Wilcoxon Rank-Sum Test |
|
|
2 |
#' |
|
|
3 |
#' This function performs differential gene expression analysis using Wilcoxon rank-sum tests. |
|
|
4 |
#' It reads tumor and normal expression data, performs TMM normalization using 'edgeR', and uses Wilcoxon rank-sum tests to identify differentially expressed genes. |
|
|
5 |
#' |
|
|
6 |
#' @importFrom tibble column_to_rownames |
|
|
7 |
#' @importFrom edgeR DGEList filterByExpr calcNormFactors cpm |
|
|
8 |
#' @importFrom dplyr mutate |
|
|
9 |
#' @importFrom stats wilcox.test p.adjust |
|
|
10 |
#' @param tumor_file Path to the tumor data file (RDS format). |
|
|
11 |
#' @param normal_file Path to the normal data file (RDS format). |
|
|
12 |
#' @param output_file Path to save the output DEG data (RDS format). |
|
|
13 |
#' @param logFC_threshold Threshold for log fold change for marking up/down-regulated genes. |
|
|
14 |
#' @param fdr_threshold Threshold for FDR for filtering significant genes. |
|
|
15 |
#' @return A data frame of differential expression results. |
|
|
16 |
#' @references |
|
|
17 |
#' Li, Y., Ge, X., Peng, F., Li, W., & Li, J. J. (2022). Exaggerated False Positives by Popular |
|
|
18 |
#' Differential Expression Methods When Analyzing Human Population Samples. Genome Biology, 23(1), 79. |
|
|
19 |
#' DOI: https://doi.org/10.1186/s13059-022-02648-4. |
|
|
20 |
#' @export |
|
|
21 |
#' |
|
|
22 |
#' @examples |
|
|
23 |
#' # Define file paths for tumor and normal data from the data folder |
|
|
24 |
#' tumor_file <- system.file("extdata", |
|
|
25 |
#' "removebatch_SKCM_Skin_TCGA_exp_tumor_test.rds", |
|
|
26 |
#' package = "TransProR") |
|
|
27 |
#' normal_file <- system.file("extdata", |
|
|
28 |
#' "removebatch_SKCM_Skin_Normal_TCGA_GTEX_count_test.rds", |
|
|
29 |
#' package = "TransProR") |
|
|
30 |
#' output_file <- file.path(tempdir(), "Wilcoxon_rank_sum_testoutRst.rds") |
|
|
31 |
#' |
|
|
32 |
#' # Run the Wilcoxon rank sum test |
|
|
33 |
#' outRst <- Wilcoxon_analyze( |
|
|
34 |
#' tumor_file = tumor_file, |
|
|
35 |
#' normal_file = normal_file, |
|
|
36 |
#' output_file = output_file, |
|
|
37 |
#' logFC_threshold = 2.5, |
|
|
38 |
#' fdr_threshold = 0.01 |
|
|
39 |
#' ) |
|
|
40 |
#' |
|
|
41 |
#' # View the top 5 rows of the result |
|
|
42 |
#' head(outRst, 5) |
|
|
43 |
Wilcoxon_analyze <- function(tumor_file, |
|
|
44 |
normal_file, |
|
|
45 |
output_file, |
|
|
46 |
logFC_threshold = 2.5, |
|
|
47 |
fdr_threshold = 0.05) { |
|
|
48 |
# Read data |
|
|
49 |
tumor <- readRDS(tumor_file) |
|
|
50 |
normal <- readRDS(normal_file) |
|
|
51 |
|
|
|
52 |
# Merge the datasets and set row names |
|
|
53 |
all_count_exp <- merge(tumor, normal, by = "row.names") |
|
|
54 |
all_count_exp <- tibble::column_to_rownames(all_count_exp, var = "Row.names") |
|
|
55 |
|
|
|
56 |
# Define groups |
|
|
57 |
group <- c(rep('tumor', ncol(tumor)), rep('normal', ncol(normal))) |
|
|
58 |
group <- factor(group, levels = c("normal", "tumor")) |
|
|
59 |
group_table <- table(group) |
|
|
60 |
|
|
|
61 |
message("Group Table:") |
|
|
62 |
message(paste(names(group_table), group_table, sep = ": ", collapse = "\n")) |
|
|
63 |
# Add a space after the output for separation |
|
|
64 |
message(" ") |
|
|
65 |
|
|
|
66 |
# EdgeR TMM normalization |
|
|
67 |
y <- edgeR::DGEList(counts = all_count_exp, group = group) |
|
|
68 |
keep <- edgeR::filterByExpr(y) |
|
|
69 |
y <- y[keep, keep.lib.sizes = FALSE] |
|
|
70 |
|
|
|
71 |
# Perform TMM normalization and transfer to CPM (Counts Per Million) |
|
|
72 |
y <- edgeR::calcNormFactors(y, method = "TMM") |
|
|
73 |
count_norm <- edgeR::cpm(y) |
|
|
74 |
count_norm <- as.data.frame(count_norm) |
|
|
75 |
|
|
|
76 |
# Wilcoxon rank-sum test for each gene |
|
|
77 |
pvalues <- sapply(1:nrow(count_norm), function(i) { |
|
|
78 |
data <- cbind.data.frame(gene = as.numeric(t(count_norm[i, ])), group) |
|
|
79 |
stats::wilcox.test(gene ~ group, data)$p.value |
|
|
80 |
}) |
|
|
81 |
fdr <- stats::p.adjust(pvalues, method = "fdr") |
|
|
82 |
|
|
|
83 |
# Calculate fold-change for each gene |
|
|
84 |
conditionsLevel <- levels(group) |
|
|
85 |
dataCon1 <- count_norm[, which(group == conditionsLevel[1])] |
|
|
86 |
dataCon2 <- count_norm[, which(group == conditionsLevel[2])] |
|
|
87 |
# The addition of a pseudo-count allows for robust statistical analysis of genes with low expression levels, while mitigating computational issues caused by zero expression values. |
|
|
88 |
# It prevents the occurrence of negative infinity (-Inf) when the numerator is zero, and positive infinity (Inf) when the denominator is zero. |
|
|
89 |
foldChanges <- log2((rowMeans(dataCon2) + 0.005) / (rowMeans(dataCon1) + 0.005)) |
|
|
90 |
#foldChanges <- log2(rowMeans(dataCon2) / rowMeans(dataCon1)) |
|
|
91 |
|
|
|
92 |
# Output results based on FDR threshold |
|
|
93 |
outRst <- data.frame(log2foldChange = foldChanges, pValues = pvalues, FDR = fdr) |
|
|
94 |
rownames(outRst) <- rownames(count_norm) |
|
|
95 |
outRst <- na.omit(outRst) |
|
|
96 |
|
|
|
97 |
# Mark up/down-regulated genes |
|
|
98 |
k1 <- (outRst$FDR < fdr_threshold) & (outRst$log2foldChange < -logFC_threshold) |
|
|
99 |
k2 <- (outRst$FDR < fdr_threshold) & (outRst$log2foldChange > logFC_threshold) |
|
|
100 |
outRst <- dplyr::mutate(outRst, change = ifelse(k1, "down", ifelse(k2, "up", "stable"))) |
|
|
101 |
|
|
|
102 |
change_table <- table(outRst$change) |
|
|
103 |
|
|
|
104 |
message("Change Table:") |
|
|
105 |
message(paste(names(change_table), change_table, sep = ": ", collapse = "\n")) |
|
|
106 |
# Add a space after the output for separation |
|
|
107 |
message(" ") |
|
|
108 |
|
|
|
109 |
# Save results |
|
|
110 |
saveRDS(outRst, file = output_file) |
|
|
111 |
|
|
|
112 |
return(outRst) |
|
|
113 |
} |