Diff of /R/DESeq2Analyze.R [000000] .. [0f2269]

Switch to unified view

a b/R/DESeq2Analyze.R
1
#' Differential Gene Expression Analysis using 'DESeq2'
2
#'
3
#' 'DESeq2': Differential gene expression analysis based on the negative binomial distribution.
4
#' This function utilizes the 'DESeq2' package to conduct differential gene expression analysis.
5
#' It processes tumor and normal expression data, applies DESeq2 analysis,
6
#' and outputs the results along with information on gene expression changes.
7
#'
8
#' The DESeq2 methodology is based on modeling count data using a negative binomial distribution,
9
#' which allows for handling the variability observed in gene expression data, especially in
10
#' small sample sizes. This approach is well-suited for RNA-Seq data analysis.
11
#'
12
#' @importFrom DESeq2 DESeqDataSetFromMatrix DESeq results
13
#' @importFrom dplyr mutate
14
#' @importFrom tibble column_to_rownames
15
#' @importFrom stats na.omit
16
#' @param tumor_file Path to the tumor data file (RDS format).
17
#' @param normal_file Path to the normal data file (RDS format).
18
#' @param output_file Path to save the output DEG data (RDS format).
19
#' @param logFC Threshold for log fold change.
20
#' @param p_value Threshold for p-value.
21
#' @return A data frame of differential expression results.
22
#' @examples
23
#' # Define file paths for tumor and normal data from the data folder
24
#' tumor_file <- system.file("extdata",
25
#'                           "removebatch_SKCM_Skin_TCGA_exp_tumor_test.rds",
26
#'                           package = "TransProR")
27
#' normal_file <- system.file("extdata",
28
#'                            "removebatch_SKCM_Skin_Normal_TCGA_GTEX_count_test.rds",
29
#'                            package = "TransProR")
30
#' output_file <- file.path(tempdir(), "DEG_DESeq2.rds")
31
#'
32
#' DEG_DESeq2 <- DESeq2_analyze(
33
#'   tumor_file = tumor_file,
34
#'   normal_file = normal_file,
35
#'   output_file = output_file,
36
#'   2.5,
37
#'   0.01
38
#' )
39
#'
40
#' # View the top 5 rows of the result
41
#' head(DEG_DESeq2, 5)
42
#'
43
#' @references
44
#' DESeq2:Differential gene expression analysis based on the negative binomial distribution.
45
#' For more information, visit the page:
46
#' https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/
47
#' @export
48
DESeq2_analyze <- function(tumor_file, normal_file, output_file, logFC = 2.5, p_value = 0.01) {
49
  # Read and merge data
50
  tumor <- readRDS(tumor_file)
51
  normal <- readRDS(normal_file)
52
  all_count_exp <- merge(tumor, normal, by = "row.names") # Merge the datasets, ensuring both have genes as row names
53
  all_count_exp <- tibble::column_to_rownames(all_count_exp, var = "Row.names") # Set the row names
54
55
  # Create group factor
56
  group <- factor(c(rep('tumor', ncol(tumor)), rep('normal', ncol(normal))), levels = c("normal", "tumor"))
57
  group_table <- table(group)
58
59
  message("Group Table:")
60
  message(paste(names(group_table), group_table, sep = ": ", collapse = "\n"))
61
  # Add a space after the output for separation
62
  message(" ")
63
64
  # Prepare DESeq2 dataset
65
  colData <- data.frame(row.names = colnames(all_count_exp), group = group) # Create a dataframe to store the grouping information of samples, with the row names as sample names and the column names as group information.
66
  dds <- DESeq2::DESeqDataSetFromMatrix(countData = all_count_exp, # Expression matrix, with rows as genes and columns as samples, containing integers derived from the calculation of reads or fragments.
67
                                        colData = colData, # Sample information matrix (dataframe), showing the correspondence between the column names of the expression matrix and the grouping information, with row names as sample names. The first column indicates the treatment status of the sample (control or treatment, tumor or normal, etc.), referred to as the group.
68
                                        design = ~ group) # Differential comparison matrix, which informs the differential analysis function about the variables between which differences are to be analyzed. Simply put, it specifies which are the controls and which are the treatments. The group refers to the group in colData, which is the grouping information.
69
70
  # Perform differential expression analysis.
71
  dds <- DESeq2::DESeq(dds)
72
73
  # Perform DESeq2 analysis
74
  # Extract the results of differential expression and perform a comparison. Here, the 'contrast' parameter specifies the groups to be compared.
75
  # The 'contrast' parameter must be written in a vector format with three elements, and the order cannot be reversed.
76
  res <- DESeq2::results(dds, contrast = c("group", "tumor", "normal"))
77
78
  # Sort the differential results according to 'padj' (adjusted p-value). This step is necessary only for DESeq2, as limma and edgeR will automatically sort the results.
79
  resOrdered <- res[order(res$padj), ]
80
  DEG <- as.data.frame(stats::na.omit(resOrdered)) # Remove missing values. If this step is not taken, some genes with very low expression levels will result in NA values after calculation, causing difficulties in subsequent analysis and plotting.
81
82
  # Add a 'change' column to mark up- or down-regulated genes, with the threshold set according to requirements.
83
  DEG <- dplyr::mutate(DEG, change = ifelse(DEG$pvalue < p_value & DEG$log2FoldChange < -logFC, "down",
84
                                            ifelse(DEG$pvalue < p_value & DEG$log2FoldChange > logFC, "up", "stable")))
85
86
  # Output table of gene expression changes
87
  change_table <- table(DEG$change)
88
89
  message(" ")
90
  message("Change Table:")
91
  message(paste(names(change_table), change_table, sep = ": ", collapse = "\n"))
92
  # Add a space after the output for separation
93
  message(" ")
94
95
  # Save results
96
  #save(DEG, file = output_file)
97
  saveRDS(DEG, file = output_file)
98
  # Return results
99
  return(DEG)
100
}