Diff of /R/GetTcgaExp.R [000000] .. [0f2269]

Switch to side-by-side view

--- a
+++ b/R/GetTcgaExp.R
@@ -0,0 +1,142 @@
+#' TCGA Expression Data Processing
+#'
+#' This function processes expression data and phenotype information, separates tumor and normal samples,
+#' and saves the results into different files. It's specifically designed for data obtained from TCGA.
+#'
+#' @param counts_file_path File path to the counts data (usually in the form of a large matrix with gene expression data).
+#' @param gene_probes_file_path File path containing the gene probes data.
+#' @param phenotype_file_path File path to the phenotype data, which includes various sample attributes.
+#' @param output_file_path Path where the output files, distinguished between tumor and normal, will be saved.
+#'
+#' @return A list containing matrices for tumor and normal expression data.
+#'
+#' @note IMPORTANT: This function assumes that the input files follow a specific format and structure, typically found in TCGA data releases.
+#' Users should verify their data's compatibility. Additionally, the function does not perform error checking on the data's content,
+#' which users should handle through proper preprocessing.
+#'
+#' @note CRITICAL: The 'output_file_path' parameter must end with '.rds' to be properly recognized by the function. It is also highly recommended
+#' that the path includes specific identifiers related to the target samples, as the function will create further subdivisions in the specified
+#' path for tumor or normal tissues. Please structure the 'output_file_path' following this pattern: './your_directory/your_sample_type.exp.rds'.
+#'
+#' @importFrom dplyr distinct filter
+#' @importFrom utils read.table
+#' @importFrom rlang .data
+#' @export
+#' @author Dongyue Yu
+#'
+#' @examples
+#' counts_file <- system.file("extdata", "TCGA-SKCM.htseq_counts_test.tsv", package = "TransProR")
+#' gene_probes_file <- system.file("extdata",
+#'                                 "TCGA_gencode.v22.annotation.gene.probeMap_test",
+#'                                 package = "TransProR")
+#' phenotype_file <- system.file("extdata", "TCGA-SKCM.GDC_phenotype_test.tsv", package = "TransProR")
+#' ouput_file <- file.path(tempdir(), "SKCM_Skin_TCGA_exp_test.rds")
+#'
+#' SKCM_exp <- get_tcga_exp(
+#'   counts_file_path = counts_file,
+#'   gene_probes_file_path = gene_probes_file,
+#'   phenotype_file_path = phenotype_file,
+#'   output_file_path = ouput_file
+#' )
+#' head(SKCM_exp[["tumor_tcga_data"]])[1:5, 1:5]
+#' head(SKCM_exp[["normal_tcga_data"]], n = 10) # Because there is only one column.
+get_tcga_exp <- function(counts_file_path,
+                          gene_probes_file_path,
+                          phenotype_file_path,
+                          output_file_path) {
+  # Load expression matrix
+  # count_data <- data.table::fread(counts_file_path, header = TRUE, sep = '\t', data.table = FALSE)
+  count_data <- utils::read.table(counts_file_path,
+                          header = TRUE,
+                          sep = '\t',
+                          stringsAsFactors = FALSE,
+                          check.names = FALSE)
+
+  # Load gene ID conversion information
+  # gene_probes <- data.table::fread(gene_probes_file_path, header = TRUE, sep = '\t', data.table = FALSE)
+  gene_probes <- utils::read.table(gene_probes_file_path,
+                            header = TRUE,
+                            sep = '\t',
+                            stringsAsFactors = FALSE,
+                            check.names = FALSE)
+
+  # Keep only necessary columns
+  gene_probes <- gene_probes[, c(1, 2)]
+
+  # Merge gene ID information with expression matrix
+  count_probes_merged <- merge(gene_probes, count_data, by.x = "id", by.y = "Ensembl_ID")
+
+  # Remove duplicates
+  count_data_unique <- dplyr::distinct(count_probes_merged, .data$gene, .keep_all = TRUE)
+
+  # Set gene names as row names
+  rownames(count_data_unique) <- count_data_unique$gene
+  count_data_final <- count_data_unique[, -c(1,2)]  # Remove extra columns
+
+  # Load clinical information
+  # phenotype_data <- data.table::fread(phenotype_file_path, header = TRUE, sep = '\t', data.table = FALSE)
+  # Load clinical information with proper column types
+  phenotype_data <- utils::read.table(phenotype_file_path,
+                              header = TRUE,
+                              sep = '\t',
+                              stringsAsFactors = FALSE,
+                              check.names = FALSE,
+                              colClasses = list(
+                                withdrawn = "logical",
+                                releasable.project = "logical",
+                                is_ffpe.samples = "logical",
+                                oct_embedded.samples = "logical"
+                              ))
+  rownames(phenotype_data) <- phenotype_data$submitter_id.samples  # Set sample names as row names
+
+
+  # Check first if there are any "Metastatic", "Primary Tumor", or "normal" samples in your data
+  table(phenotype_data$sample_type.samples)
+
+  # Create a data frame for tumor samples
+  tumor_samples <- phenotype_data %>%
+    dplyr::filter(grepl("Metastatic|Primary Tumor", .data$sample_type.samples, ignore.case = TRUE))
+
+  # Check for 'Primary Tumor' or 'Metastatic' samples
+  if (nrow(tumor_samples) == 0) {
+    message("No 'Primary Tumor' or 'Metastatic' samples found.\n")
+  } else {
+    message("Number of 'Primary Tumor' or 'Metastatic' samples: ", nrow(tumor_samples), "\n")
+  }
+
+  # Create a data frame for normal samples
+  normal_samples <- phenotype_data %>%
+    dplyr::filter(grepl("normal", .data$sample_type.samples, ignore.case = TRUE))
+
+  if (nrow(normal_samples) == 0) {
+    message("No 'normal' samples found.\n")
+  } else {
+    message("Number of 'normal' samples:", nrow(normal_samples), "\n")
+  }
+
+  # Get the intersection of clinical information and expression matrix
+  tumor_common_samples <- intersect(rownames(tumor_samples), colnames(count_data_final))
+
+  # Extract corresponding expression matrix
+  tumor_expression_data <- count_data_final[, tumor_common_samples]
+
+  # Get the intersection of clinical information and expression matrix
+  normal_common_samples <- intersect(rownames(normal_samples), colnames(count_data_final))
+
+  # Extract corresponding expression matrix
+  normal_expression_data <- count_data_final[, normal_common_samples, drop = FALSE]
+
+  # Save results
+  tumor_output_path <- gsub("\\.rds$", "_tumor.rds", output_file_path)
+  normal_output_path <- gsub("\\.rds$", "_normal.rds", output_file_path)
+  saveRDS(tumor_expression_data, file = tumor_output_path)
+  saveRDS(normal_expression_data, file = normal_output_path)
+
+
+  result <- list(
+    tumor_tcga_data = tumor_expression_data,
+    normal_tcga_data = normal_expression_data
+  )
+
+  return(result)
+}