|
a |
|
b/R/GetGtexExp.R |
|
|
1 |
#' Get GTEx Expression Data for Specific Organ
#'
#' Reads a GTEx expression table, its probe map and the GTEx phenotype table,
#' keeps the samples whose primary site equals `organ_specific`, collapses
#' duplicated gene names, saves the result as an RDS file and returns it.
#'
#' @param organ_specific A character string specifying the organ to filter the
#'   gene expression data by. It is compared for exact equality against the
#'   third column of the phenotype file (renamed to `primary_site`).
#' @param file_path A character string specifying the path to the GTEx gene
#'   expression data file (tab-separated, with a header). It must contain a
#'   column named `sample` holding the probe identifiers; the remaining
#'   columns are matched against sample identifiers by name.
#' @param probe_map_path A character string specifying the path to the
#'   gtex_probeMap_gencode data file (tab-separated, with a header). Only its
#'   first two columns are used, and they must be named `id` and `gene`.
#' @param pheno_path A character string specifying the path to the GTEx
#'   phenotype data file (tab-separated, with a header). Its columns are
#'   renamed by position to `Sample`, `body_site_detail (SMTSD)`,
#'   `primary_site`, `gender`, `patient` and `cohort`.
#' @param output_path A character string specifying the path where the output
#'   RDS file will be saved.
#'
#' @details The function first checks that the three input files exist and
#' then loads them with [utils::read.table()]. The probe map is merged with the
#' expression table (`id` against `sample`) so that every expression row
#' carries a gene name. Phenotype columns are renamed by position and the
#' samples of the requested organ are selected. Only samples that are present
#' both in the phenotype table and among the expression columns are retained.
#' When a gene name occurs more than once, only its first row (in the order
#' produced by the merge) is kept. Finally, gene names become the row names and
#' the processed data is saved as an RDS file.
#'
#' @return A data frame containing gene expression data for the specified
#'   organ. Rows represent genes, and columns represent samples. The result is
#'   always a data frame, even when a single sample matches. Note that this
#'   function also saves the organ-specific GTEx data as an RDS file at the
#'   specified output path.
#'
#' @note The function will stop and throw an error if the input files do not
#'   exist, if no samples are found for the specified organ, or if none of the
#'   organ's samples are present in the expression data.
#'
#' @note The function does not inspect or validate the extension of
#'   `output_path`; the file is always written with [saveRDS()]. By convention
#'   the path should end with '.rds', and it is highly recommended that it
#'   includes specific identifiers related to the target samples, following
#'   this pattern: './your_directory/your_sample_type.gtex.rds'.
#'
#' @importFrom utils read.table
#' @importFrom dplyr distinct filter
#' @importFrom rlang .data
#' @export
get_gtex_exp <- function(organ_specific,
                         file_path,
                         probe_map_path,
                         pheno_path,
                         output_path) {

  # Fail fast if any input is missing. `||` is the scalar, short-circuiting
  # operator intended for `if` conditions.
  if (!file.exists(file_path) ||
      !file.exists(pheno_path) ||
      !file.exists(probe_map_path)) {
    stop("One or more of the input files do not exist.")
  }

  # All three inputs are tab-separated tables with a header row.
  # `check.names = FALSE` keeps sample identifiers exactly as written in the
  # file so they can be matched against the phenotype table later.
  read_tsv <- function(path) {
    utils::read.table(path,
                      header = TRUE,
                      sep = "\t",
                      stringsAsFactors = FALSE,
                      check.names = FALSE)
  }

  expr_tbl <- read_tsv(file_path)
  probe_tbl <- read_tsv(probe_map_path)
  pheno_tbl <- read_tsv(pheno_path)

  # Attach gene names to the expression rows. Only the first two probe-map
  # columns (expected to be `id` and `gene`) are needed.
  probe_tbl <- probe_tbl[, c(1, 2)]
  expr_annot <- merge(probe_tbl, expr_tbl, by.x = "id", by.y = "sample")

  # Rename phenotype columns by position to more intuitive names.
  colnames(pheno_tbl) <- c("Sample", "body_site_detail (SMTSD)",
                           "primary_site", "gender", "patient", "cohort")

  # Keep only the samples taken from the requested organ.
  organ_samples <- dplyr::filter(pheno_tbl,
                                 .data$primary_site == organ_specific)

  if (nrow(organ_samples) == 0) {
    stop("No samples found for the specified organ.")
  }

  # message() concatenates its arguments without a separator, so the spaces
  # must be part of the strings; it also appends the newline itself.
  message("Number of samples for ", organ_specific, ": ", nrow(organ_samples))

  # Restrict to samples present in both the phenotype and expression data.
  # The sample identifiers are read from the `Sample` column directly instead
  # of relying on row names surviving the filtering step.
  valid_sample_names <- intersect(as.character(organ_samples[["Sample"]]),
                                  colnames(expr_annot))

  if (length(valid_sample_names) == 0) {
    stop("None of the samples for the specified organ are present in the ",
         "expression data.")
  }

  # `drop = FALSE` keeps a data frame even when only one sample matches.
  gtex_data <- expr_annot[, c("gene", valid_sample_names), drop = FALSE]

  # Keep the first row for every gene name, then use gene names as row names.
  gtex_data <- dplyr::distinct(gtex_data, .data$gene, .keep_all = TRUE)
  rownames(gtex_data) <- gtex_data[["gene"]]

  # Remove the 'gene' column, keeping only expression data.
  gtex_data <- gtex_data[, -1, drop = FALSE]

  # Save the results as an RDS file for future data analysis tasks.
  saveRDS(gtex_data, output_path)

  gtex_data
}