|
a |
|
b/R/EdgeRAnalyze.R |
|
|
1 |
#' Differential Gene Expression Analysis using 'edgeR' |
|
|
2 |
#' |
|
|
3 |
#' This function performs differential gene expression analysis using the 'edgeR' package. |
|
|
4 |
#' It reads tumor and normal expression data, merges them, filters low-expressed genes, |
|
|
5 |
#' normalizes the data, performs edgeR analysis, and outputs the results along with information |
|
|
6 |
#' on gene expression changes. |
|
|
7 |
#' |
|
|
8 |
#' @importFrom tibble column_to_rownames |
|
|
9 |
#' @importFrom dplyr mutate |
|
|
10 |
#' @importFrom edgeR DGEList cpm calcNormFactors estimateGLMCommonDisp estimateGLMTrendedDisp estimateGLMTagwiseDisp glmFit glmLRT topTags |
|
|
11 |
#' @importFrom stats model.matrix |
|
|
12 |
#' @param tumor_file Path to the tumor data file (RDS format). |
|
|
13 |
#' @param normal_file Path to the normal data file (RDS format). |
|
|
14 |
#' @param output_file Path to save the output DEG data (RDS format). |
|
|
15 |
#' @param logFC_threshold Threshold for log fold change for marking up/down-regulated genes. |
|
|
16 |
#' @param p_value_threshold Threshold for p-value for filtering significant genes. |
|
|
17 |
#' @return A data frame of differential expression results. |
|
|
18 |
#' @references |
|
|
19 |
#' edgeR: Differential analysis of sequence read count data. |
|
|
20 |
#' For more information, visit the edgeR Bioconductor page: |
|
|
21 |
#' https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf |
|
|
22 |
#' @export |
|
|
23 |
#' |
|
|
24 |
#' @examples |
|
|
25 |
#' # Define file paths for tumor and normal data from the data folder |
|
|
26 |
#' tumor_file <- system.file("extdata", |
|
|
27 |
#' "removebatch_SKCM_Skin_TCGA_exp_tumor_test.rds", |
|
|
28 |
#' package = "TransProR") |
|
|
29 |
#' normal_file <- system.file("extdata", |
|
|
30 |
#' "removebatch_SKCM_Skin_Normal_TCGA_GTEX_count_test.rds", |
|
|
31 |
#' package = "TransProR") |
|
|
32 |
#' output_file <- file.path(tempdir(), "DEG_edgeR.rds") |
|
|
33 |
#' |
|
|
34 |
#' DEG_edgeR <- edgeR_analyze( |
|
|
35 |
#' tumor_file = tumor_file, |
|
|
36 |
#' normal_file = normal_file, |
|
|
37 |
#' output_file = output_file, |
|
|
38 |
#' 2.5, |
|
|
39 |
#' 0.01 |
|
|
40 |
#' ) |
|
|
41 |
#' |
|
|
42 |
#' # View the top 5 rows of the result |
|
|
43 |
#' head(DEG_edgeR, 5) |
|
|
44 |
#' |
|
|
45 |
edgeR_analyze <- function(tumor_file, normal_file, output_file, logFC_threshold = 2.5, p_value_threshold = 0.01) { |
|
|
46 |
tumor <- readRDS(tumor_file) |
|
|
47 |
normal <- readRDS(normal_file) |
|
|
48 |
|
|
|
49 |
# Merge the datasets, ensuring both have genes as row names |
|
|
50 |
all_count_exp <- merge(tumor, normal, by = "row.names") |
|
|
51 |
all_count_exp <- tibble::column_to_rownames(all_count_exp, var = "Row.names") |
|
|
52 |
|
|
|
53 |
# Define groups for tumor and normal samples |
|
|
54 |
group <- c(rep('tumor', ncol(tumor)), rep('normal', ncol(normal))) |
|
|
55 |
group <- factor(group, levels = c("normal", "tumor")) |
|
|
56 |
group_table <- table(group) |
|
|
57 |
|
|
|
58 |
message("Group Table:") |
|
|
59 |
message(paste(names(group_table), group_table, sep = ": ", collapse = "\n")) |
|
|
60 |
# Add a space after the output for separation |
|
|
61 |
message(" ") |
|
|
62 |
|
|
|
63 |
# Create DGEList object for gene expression data and group information |
|
|
64 |
d <- edgeR::DGEList(counts = all_count_exp, group = group) |
|
|
65 |
|
|
|
66 |
# Filter lowly expressed genes based on CPM values |
|
|
67 |
keep <- rowSums(edgeR::cpm(d) > 1) >= 2 |
|
|
68 |
d <- d[keep, , keep.lib.sizes = FALSE] |
|
|
69 |
|
|
|
70 |
# Update library size information in the samples |
|
|
71 |
d$samples$lib.size <- colSums(d$counts) |
|
|
72 |
|
|
|
73 |
# Normalize the data using the TMM method |
|
|
74 |
d <- edgeR::calcNormFactors(d) |
|
|
75 |
|
|
|
76 |
# Create design matrix for differential analysis model |
|
|
77 |
design <- stats::model.matrix(~0 + factor(group)) |
|
|
78 |
rownames(design) <- colnames(d) |
|
|
79 |
colnames(design) <- levels(factor(group)) |
|
|
80 |
|
|
|
81 |
# Estimate dispersions - common dispersion, trended dispersion, tagwise dispersion |
|
|
82 |
d <- edgeR::estimateGLMCommonDisp(d, design) |
|
|
83 |
d <- edgeR::estimateGLMTrendedDisp(d, design) |
|
|
84 |
d <- edgeR::estimateGLMTagwiseDisp(d, design) |
|
|
85 |
|
|
|
86 |
# Fit the model using Generalized Linear Model (GLM) |
|
|
87 |
fit <- edgeR::glmFit(d, design) |
|
|
88 |
|
|
|
89 |
# Perform differential expression analysis using Likelihood Ratio Test (LRT) |
|
|
90 |
lrt <- edgeR::glmLRT(fit, contrast = c(-1, 1)) # Note that the 'contrast' here is different from 'DESeq2'. Here, we only need to input c(-1, 1): -1 corresponds to normal, 1 corresponds to tumor. |
|
|
91 |
|
|
|
92 |
# Retrieve top differentially expressed genes |
|
|
93 |
nrDEG <- edgeR::topTags(lrt, n = nrow(d)) |
|
|
94 |
DEG_edgeR <- as.data.frame(nrDEG) |
|
|
95 |
|
|
|
96 |
# Add 'change' column to mark up/down-regulated genes |
|
|
97 |
k1 <- (DEG_edgeR$PValue < p_value_threshold) & (DEG_edgeR$logFC < -logFC_threshold) |
|
|
98 |
k2 <- (DEG_edgeR$PValue < p_value_threshold) & (DEG_edgeR$logFC > logFC_threshold) |
|
|
99 |
DEG_edgeR <- dplyr::mutate(DEG_edgeR, change = ifelse(k1, "down", ifelse(k2, "up", "stable"))) |
|
|
100 |
|
|
|
101 |
change_table <- table(DEG_edgeR$change) |
|
|
102 |
|
|
|
103 |
message("Change Table:") |
|
|
104 |
message(paste(names(change_table), change_table, sep = ": ", collapse = "\n")) |
|
|
105 |
# Add a space after the output for separation |
|
|
106 |
message(" ") |
|
|
107 |
|
|
|
108 |
# Save results to the specified output file |
|
|
109 |
saveRDS(DEG_edgeR, file = output_file) |
|
|
110 |
|
|
|
111 |
return(DEG_edgeR) |
|
|
112 |
} |