[e25014]: / R / GetTcgaExp.R

Download this file

143 lines (122 with data), 6.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#' TCGA Expression Data Processing
#'
#' This function processes expression data and phenotype information, separates tumor and normal samples,
#' and saves the results into different files. It's specifically designed for data obtained from TCGA.
#'
#' @param counts_file_path File path to the counts data (usually in the form of a large matrix with gene expression data).
#' @param gene_probes_file_path File path containing the gene probes data.
#' @param phenotype_file_path File path to the phenotype data, which includes various sample attributes.
#' @param output_file_path Path where the output files, distinguished between tumor and normal, will be saved.
#'
#' @return A list containing matrices for tumor and normal expression data.
#'
#' @note IMPORTANT: This function assumes that the input files follow a specific format and structure, typically found in TCGA data releases.
#' Users should verify their data's compatibility. Additionally, the function does not perform error checking on the data's content,
#' which users should handle through proper preprocessing.
#'
#' @note CRITICAL: The 'output_file_path' parameter must end with '.rds' to be properly recognized by the function. It is also highly recommended
#' that the path includes specific identifiers related to the target samples, as the function will create further subdivisions in the specified
#' path for tumor or normal tissues. Please structure the 'output_file_path' following this pattern: './your_directory/your_sample_type.exp.rds'.
#'
#' @importFrom dplyr distinct filter
#' @importFrom utils read.table
#' @importFrom rlang .data
#' @export
#' @author Dongyue Yu
#'
#' @examples
#' counts_file <- system.file("extdata", "TCGA-SKCM.htseq_counts_test.tsv", package = "TransProR")
#' gene_probes_file <- system.file("extdata",
#' "TCGA_gencode.v22.annotation.gene.probeMap_test",
#' package = "TransProR")
#' phenotype_file <- system.file("extdata", "TCGA-SKCM.GDC_phenotype_test.tsv", package = "TransProR")
#' ouput_file <- file.path(tempdir(), "SKCM_Skin_TCGA_exp_test.rds")
#'
#' SKCM_exp <- get_tcga_exp(
#' counts_file_path = counts_file,
#' gene_probes_file_path = gene_probes_file,
#' phenotype_file_path = phenotype_file,
#' output_file_path = ouput_file
#' )
#' head(SKCM_exp[["tumor_tcga_data"]])[1:5, 1:5]
#' head(SKCM_exp[["normal_tcga_data"]], n = 10) # Because there is only one column.
get_tcga_exp <- function(counts_file_path,
gene_probes_file_path,
phenotype_file_path,
output_file_path) {
# Load expression matrix
# count_data <- data.table::fread(counts_file_path, header = TRUE, sep = '\t', data.table = FALSE)
count_data <- utils::read.table(counts_file_path,
header = TRUE,
sep = '\t',
stringsAsFactors = FALSE,
check.names = FALSE)
# Load gene ID conversion information
# gene_probes <- data.table::fread(gene_probes_file_path, header = TRUE, sep = '\t', data.table = FALSE)
gene_probes <- utils::read.table(gene_probes_file_path,
header = TRUE,
sep = '\t',
stringsAsFactors = FALSE,
check.names = FALSE)
# Keep only necessary columns
gene_probes <- gene_probes[, c(1, 2)]
# Merge gene ID information with expression matrix
count_probes_merged <- merge(gene_probes, count_data, by.x = "id", by.y = "Ensembl_ID")
# Remove duplicates
count_data_unique <- dplyr::distinct(count_probes_merged, .data$gene, .keep_all = TRUE)
# Set gene names as row names
rownames(count_data_unique) <- count_data_unique$gene
count_data_final <- count_data_unique[, -c(1,2)] # Remove extra columns
# Load clinical information
# phenotype_data <- data.table::fread(phenotype_file_path, header = TRUE, sep = '\t', data.table = FALSE)
# Load clinical information with proper column types
phenotype_data <- utils::read.table(phenotype_file_path,
header = TRUE,
sep = '\t',
stringsAsFactors = FALSE,
check.names = FALSE,
colClasses = list(
withdrawn = "logical",
releasable.project = "logical",
is_ffpe.samples = "logical",
oct_embedded.samples = "logical"
))
rownames(phenotype_data) <- phenotype_data$submitter_id.samples # Set sample names as row names
# Check first if there are any "Metastatic", "Primary Tumor", or "normal" samples in your data
table(phenotype_data$sample_type.samples)
# Create a data frame for tumor samples
tumor_samples <- phenotype_data %>%
dplyr::filter(grepl("Metastatic|Primary Tumor", .data$sample_type.samples, ignore.case = TRUE))
# Check for 'Primary Tumor' or 'Metastatic' samples
if (nrow(tumor_samples) == 0) {
message("No 'Primary Tumor' or 'Metastatic' samples found.\n")
} else {
message("Number of 'Primary Tumor' or 'Metastatic' samples: ", nrow(tumor_samples), "\n")
}
# Create a data frame for normal samples
normal_samples <- phenotype_data %>%
dplyr::filter(grepl("normal", .data$sample_type.samples, ignore.case = TRUE))
if (nrow(normal_samples) == 0) {
message("No 'normal' samples found.\n")
} else {
message("Number of 'normal' samples:", nrow(normal_samples), "\n")
}
# Get the intersection of clinical information and expression matrix
tumor_common_samples <- intersect(rownames(tumor_samples), colnames(count_data_final))
# Extract corresponding expression matrix
tumor_expression_data <- count_data_final[, tumor_common_samples]
# Get the intersection of clinical information and expression matrix
normal_common_samples <- intersect(rownames(normal_samples), colnames(count_data_final))
# Extract corresponding expression matrix
normal_expression_data <- count_data_final[, normal_common_samples, drop = FALSE]
# Save results
tumor_output_path <- gsub("\\.rds$", "_tumor.rds", output_file_path)
normal_output_path <- gsub("\\.rds$", "_normal.rds", output_file_path)
saveRDS(tumor_expression_data, file = tumor_output_path)
saveRDS(normal_expression_data, file = normal_output_path)
result <- list(
tumor_tcga_data = tumor_expression_data,
normal_tcga_data = normal_expression_data
)
return(result)
}