[e25014]: / R / GetGtexExp.R

Download this file

103 lines (87 with data), 5.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
#' Get GTEx Expression Data for Specific Organ
#'
#' This function retrieves gene expression data from the GTEx project that is specific to a certain organ.
#' It performs various checks and processing steps to ensure that the data is consistent and relevant to the
#' specified organ. The filtered and cleaned data is saved as an RDS file for further analysis.
#'
#' @param organ_specific A character string specifying the organ to filter the gene expression data by.
#' @param file_path A character string specifying the path to the GTEx gene expression data file.
#' @param probe_map_path A character string specifying the path to the gtex_probeMap_gencode data file.
#' @param pheno_path A character string specifying the path to the GTEx phenotype data file.
#' @param output_path A character string specifying the path where the output RDS file will be saved.
#'
#' @details The function begins by checking if the gene expression and phenotype data files exist at
#' the specified paths. It then loads these data files and processes them by setting appropriate row names,
#' modifying column names for clarity, and filtering samples based on the specified organ. The function ensures
#' that only samples present in both datasets are retained for consistency. It also removes any duplicate gene
#' entries to prevent redundancy. Finally, the processed data is saved as an RDS file.
#'
#' @return A data frame containing gene expression data for the specified organ.
#' Rows represent genes, and columns represent samples. Note that this function also saves the
#' organ-specific GTEx data as an RDS file at the specified output path.
#'
#' @note The function will stop and throw an error if the input files do not exist, or if no samples are found
#' for the specified organ.
#'
#' @note CRITICAL: The 'output_path' parameter must end with '.rds' to be properly recognized by the function. It is also highly recommended
#' that the path includes specific identifiers related to the target samples. Please structure the 'output_path' following this pattern: './your_directory/your_sample_type.gtex.rds'.
#'
#' @importFrom utils read.table
#' @importFrom dplyr distinct filter
#' @importFrom rlang .data
#' @export
get_gtex_exp <- function(organ_specific,
file_path,
probe_map_path,
pheno_path,
output_path) {
# Check for the existence of the file paths
if (!file.exists(file_path) | !file.exists(pheno_path) | !file.exists(probe_map_path)) {
stop("One or more of the input files do not exist.")
}
# Load the gene expression, probe map, and phenotype data files from the provided paths
# gtex.exp <- data.table::fread(file_path, header = TRUE, sep = '\t', data.table = FALSE)
# gtex.pro <- data.table::fread(probe_map_path, header = TRUE, sep = '\t', data.table = FALSE)
# gtex.phe <- data.table::fread(pheno_path, header = TRUE, sep = '\t', data.table = FALSE)
# Load the gene expression, probe map, and phenotype data files
gtex.exp <- utils::read.table(file_path,
header = TRUE,
sep = '\t',
stringsAsFactors = FALSE,
check.names = FALSE)
gtex.pro <- utils::read.table(probe_map_path,
header = TRUE,
sep = '\t',
stringsAsFactors = FALSE,
check.names = FALSE)
gtex.phe <- utils::read.table(pheno_path,
header = TRUE,
sep = '\t',
stringsAsFactors = FALSE,
check.names = FALSE)
# Merge the probe map with the expression data
gtex.pro <- gtex.pro[, c(1,2)] # Assuming the columns of interest are the first two
gtex.count.pro <- merge(gtex.pro, gtex.exp, by.x = "id", by.y = "sample")
# Set the row names for the samples, facilitating subsequent operations
rownames(gtex.phe) <- gtex.phe$Sample
# Modify column names to be more intuitive
colnames(gtex.phe) <- c("Sample", "body_site_detail (SMTSD)", "primary_site", "gender", "patient", "cohort")
# Filter samples based on the specified organ
specific_samples <- dplyr::filter(gtex.phe, .data$primary_site == organ_specific)
# If no corresponding samples are found, halt the function with an error message
if (nrow(specific_samples) == 0) {
stop("No samples found for the specified organ.")
}
# Print the number of samples found for the specified organ
message("Number of samples for", organ_specific, ":", nrow(specific_samples), "\n")
# Ensure processing only for samples present in both expression and phenotype data through intersection
valid_sample_names <- intersect(rownames(specific_samples), colnames(gtex.count.pro)) # merge_phe_count_gtex
gtex_data <- gtex.count.pro[, c("gene", valid_sample_names)] # Extract data for relevant samples
# Remove duplicate gene entries and set row names as gene names
gtex_data <- dplyr::distinct(gtex_data, .data$gene, .keep_all = TRUE)
rownames(gtex_data) <- gtex_data$gene
gtex_data <- gtex_data[, -1] # Remove the 'gene' column, keeping only expression data
# Save the results as an RDS file for future data analysis tasks
saveRDS(gtex_data, output_path)
return(gtex_data)
}