# R/GetGtexExp.R
#' Get GTEx Expression Data for Specific Organ
#'
#' Reads a GTEx gene expression table, its probe map and the matching
#' phenotype table, keeps the samples whose primary site equals
#' `organ_specific`, collapses the result to one row per gene and saves it
#' as an RDS file.
#'
#' @param organ_specific A character string specifying the organ to filter
#'   the gene expression data by. It is compared for exact equality with the
#'   third column of the phenotype file (renamed to `primary_site`).
#' @param file_path A character string specifying the path to the
#'   tab-separated GTEx gene expression data file. It must contain a column
#'   named `sample` holding the probe identifiers; every other column is
#'   treated as one sample.
#' @param probe_map_path A character string specifying the path to the
#'   tab-separated gtex_probeMap_gencode data file. Only its first two
#'   columns are used, and they must be named `id` and `gene`.
#' @param pheno_path A character string specifying the path to the
#'   tab-separated GTEx phenotype data file. It must have exactly six
#'   columns, which are renamed by position to `Sample`,
#'   `body_site_detail (SMTSD)`, `primary_site`, `gender`, `patient` and
#'   `cohort`.
#' @param output_path A character string specifying the path where the
#'   output RDS file will be saved.
#'
#' @details The function first checks that all three input files exist and
#'   loads them with [utils::read.table()]. The probe map is merged with the
#'   expression table (`id` against `sample`) so that every probe carries a
#'   gene name. Phenotype rows are filtered by `organ_specific`, and only
#'   samples that are present in both the phenotype and the expression data
#'   are retained. When several probes map to the same gene, the first one
#'   (in the order produced by the merge) is kept. Finally, the processed
#'   data is saved with [saveRDS()].
#'
#' @return A data frame containing gene expression data for the specified
#'   organ. Rows represent genes (row names are gene names) and columns
#'   represent samples. The result stays a data frame even when only one
#'   sample matches. The same object is also written to `output_path` as an
#'   RDS file.
#'
#' @note The function stops with an error if an input file does not exist,
#'   if a required column is missing, if no samples are found for the
#'   specified organ, or if none of the organ's samples occur in the
#'   expression data.
#'
#' @note `output_path` is passed unchanged to [saveRDS()]; the function does
#'   not check the file extension itself. Use a path ending in '.rds', and
#'   preferably one that identifies the target samples, following this
#'   pattern: './your_directory/your_sample_type.gtex.rds'.
#'
#' @importFrom utils read.table
#' @export
get_gtex_exp <- function(organ_specific,
                         file_path,
                         probe_map_path,
                         pheno_path,
                         output_path) {

  # Check for the existence of the file paths, reporting which are missing
  input_paths <- c(file_path, probe_map_path, pheno_path)
  missing_paths <- input_paths[!file.exists(input_paths)]
  if (length(missing_paths) > 0) {
    stop("One or more of the input files do not exist. Missing: ",
         paste(missing_paths, collapse = ", "),
         call. = FALSE)
  }

  # All three inputs are tab-separated tables with a header row.
  # check.names = FALSE keeps sample IDs and headers exactly as written.
  read_tsv <- function(path) {
    utils::read.table(path,
                      header = TRUE,
                      sep = "\t",
                      stringsAsFactors = FALSE,
                      check.names = FALSE)
  }
  expr <- read_tsv(file_path)
  probe_map <- read_tsv(probe_map_path)
  pheno <- read_tsv(pheno_path)

  # Only the first two probe-map columns (probe id and gene name) are needed
  if (ncol(probe_map) < 2) {
    stop("The probe map file must have at least two columns ('id', 'gene').",
         call. = FALSE)
  }
  probe_map <- probe_map[, c(1L, 2L), drop = FALSE]
  if (!all(c("id", "gene") %in% colnames(probe_map))) {
    stop("The first two probe map columns must be named 'id' and 'gene'.",
         call. = FALSE)
  }
  if (!"sample" %in% colnames(expr)) {
    stop("The expression file must contain a 'sample' column.",
         call. = FALSE)
  }

  # Attach gene names to the expression rows (probe id <-> 'sample' column)
  expr_with_gene <- merge(probe_map, expr, by.x = "id", by.y = "sample")

  # Modify phenotype column names to be more intuitive (assigned by position)
  pheno_cols <- c("Sample", "body_site_detail (SMTSD)", "primary_site",
                  "gender", "patient", "cohort")
  if (ncol(pheno) != length(pheno_cols)) {
    stop("The phenotype file must have exactly ", length(pheno_cols),
         " columns, but has ", ncol(pheno), ".",
         call. = FALSE)
  }
  colnames(pheno) <- pheno_cols

  # Filter samples based on the specified organ; which() drops rows whose
  # primary site is NA instead of turning them into all-NA rows
  specific_samples <- pheno[which(pheno$primary_site == organ_specific), ,
                            drop = FALSE]

  # If no corresponding samples are found, halt with an error message
  if (nrow(specific_samples) == 0) {
    stop("No samples found for the specified organ.", call. = FALSE)
  }

  # Report the number of samples found for the specified organ
  message("Number of samples for ", organ_specific, ": ",
          nrow(specific_samples))

  # Keep only samples present in both phenotype and expression data. The
  # IDs come from the Sample column itself rather than from row names, so
  # the result does not depend on row names surviving the filtering step.
  valid_sample_names <- intersect(as.character(specific_samples$Sample),
                                  colnames(expr_with_gene))
  if (length(valid_sample_names) == 0) {
    stop("None of the samples for the specified organ are present in the ",
         "expression data.",
         call. = FALSE)
  }

  # drop = FALSE keeps a data frame even when a single sample matches
  gtex_data <- expr_with_gene[, c("gene", valid_sample_names), drop = FALSE]

  # Remove duplicate gene entries (first occurrence wins) and use the gene
  # names as row names, leaving only expression columns
  gtex_data <- gtex_data[!duplicated(gtex_data$gene), , drop = FALSE]
  rownames(gtex_data) <- gtex_data$gene
  gtex_data <- gtex_data[, valid_sample_names, drop = FALSE]

  # Save the results as an RDS file for future data analysis tasks
  saveRDS(gtex_data, output_path)

  gtex_data
}