a b/R/MergeGtexTcga.R
1
#' Merge gene expression data from GTEx and TCGA datasets
#'
#' Reads two expression tables stored as '.rds' files, one from GTEx
#' (Genotype-Tissue Expression) and one from TCGA (The Cancer Genome Atlas),
#' joins them on their row names (genes), saves the merged table as an RDS
#' file and returns it.
#'
#' @param gtex_data_path A string that specifies the file path to the GTEx
#'        data saved in RDS format. The stored object should have genes as
#'        row names and samples as columns.
#' @param tcga_exp_path A string that specifies the file path to the TCGA
#'        expression data saved in RDS format. This should be a data.frame
#'        with rows as genes and columns as samples.
#' @param output_path A string that specifies the path where the merged
#'        dataset should be saved. The file is saved in '.rds' format and the
#'        directory part of the path must already exist.
#'        The default path is "./merged_gtex_tcga_data.rds".
#'
#' @details Both inputs are assumed to be in '.rds' format and to carry the
#' gene identifiers as row names. The tables are combined with
#' \code{merge(..., by = "row.names")}, so only genes present in both inputs
#' are kept and the rows of the result are ordered by gene name. Sample
#' (column) names that occur in both inputs are disambiguated by
#' \code{merge()} with the suffixes '.x' (GTEx) and '.y' (TCGA).
#'
#' String paths are checked before any data is read: an error is raised if
#' an input file or the directory of 'output_path' does not exist. A warning
#' is issued when the two inputs share no row names, because the merged
#' table is then empty.
#'
#' @return A data frame where rows represent genes and columns represent
#'         samples. The data frame contains expression values from both the
#'         GTEx and TCGA datasets. As a side effect, the merged dataset is
#'         saved to the path specified by 'output_path'.
#'
#' @examples
#' tumor_file <- system.file("extdata",
#'                           "removebatch_SKCM_Skin_TCGA_exp_tumor_test.rds",
#'                           package = "TransProR")
#' Normal_file <- system.file("extdata",
#'                            "removebatch_SKCM_Skin_Normal_TCGA_GTEX_count_test.rds",
#'                            package = "TransProR")
#' output_file <- file.path(tempdir(), "all_data.rds")
#'
#' all_data <- merge_gtex_tcga(gtex_data_path = tumor_file,
#'                             tcga_exp_path = Normal_file,
#'                             output_path = output_file)
#'
#' @note The function does not inspect the extension of 'output_path'; the
#'       file is always written with \code{saveRDS()}, so the path should end
#'       with '.rds'. It is also highly recommended that the path includes
#'       specific identifiers related to the target samples, following this
#'       pattern: './your_directory/merged.your_sample_type.gtex.tcga.data.rds'.
#'
#' @importFrom tibble column_to_rownames
#' @export
merge_gtex_tcga <- function(gtex_data_path,
                            tcga_exp_path,
                            output_path = "./merged_gtex_tcga_data.rds") {

  # TRUE only for a single, non-missing string. readRDS()/saveRDS() also
  # accept connections, so anything else is passed through unchecked.
  is_path_string <- function(x) {
    is.character(x) && length(x) == 1L && !is.na(x)
  }

  # Fail fast on bad paths, before the potentially slow read and merge.
  input_paths <- list(gtex_data_path = gtex_data_path,
                      tcga_exp_path = tcga_exp_path)
  for (arg in names(input_paths)) {
    path <- input_paths[[arg]]
    if (is_path_string(path) && !file.exists(path)) {
      stop("File given by '", arg, "' does not exist: ", path, call. = FALSE)
    }
  }
  if (is_path_string(output_path) && !dir.exists(dirname(output_path))) {
    stop("Directory of 'output_path' does not exist: ", dirname(output_path),
         call. = FALSE)
  }

  # Load the GTEx data (message() already terminates the line itself)
  gtex_data <- readRDS(gtex_data_path)
  message("Number of GTEx samples: ", ncol(gtex_data))

  # Load the TCGA data
  tcga_data <- readRDS(tcga_exp_path)
  message("Number of TCGA samples: ", ncol(tcga_data))

  # Inner join on the gene identifiers held in the row names; merge() returns
  # them in a leading "Row.names" column.
  all_data <- merge(gtex_data, tcga_data, by = "row.names")

  # Move the gene identifiers back from the "Row.names" column to row names
  rownames(all_data) <- as.character(all_data[["Row.names"]])
  all_data[["Row.names"]] <- NULL

  if (nrow(all_data) == 0L) {
    warning("The GTEx and TCGA datasets share no row names; ",
            "the merged dataset is empty.", call. = FALSE)
  }

  message("Number of samples after merging: ", ncol(all_data))

  # Save the merged dataset
  saveRDS(all_data, file = output_path)

  all_data
}