|
a |
|
b/R/GetTcgaExp.R |
|
|
1 |
#' TCGA Expression Data Processing |
|
|
2 |
#' |
|
|
3 |
#' This function processes expression data and phenotype information, separates tumor and normal samples, |
|
|
4 |
#' and saves the results into different files. It's specifically designed for data obtained from TCGA. |
|
|
5 |
#' |
|
|
6 |
#' @param counts_file_path File path to the counts data (usually in the form of a large matrix with gene expression data). |
|
|
7 |
#' @param gene_probes_file_path File path containing the gene probes data. |
|
|
8 |
#' @param phenotype_file_path File path to the phenotype data, which includes various sample attributes. |
|
|
9 |
#' @param output_file_path Path where the output files, distinguished between tumor and normal, will be saved. |
|
|
10 |
#' |
|
|
11 |
#' @return A list containing matrices for tumor and normal expression data. |
|
|
12 |
#' |
|
|
13 |
#' @note IMPORTANT: This function assumes that the input files follow a specific format and structure, typically found in TCGA data releases. |
|
|
14 |
#' Users should verify their data's compatibility. Additionally, the function does not perform error checking on the data's content, |
|
|
15 |
#' which users should handle through proper preprocessing. |
|
|
16 |
#' |
|
|
17 |
#' @note CRITICAL: The 'output_file_path' parameter must end with '.rds' to be properly recognized by the function. It is also highly recommended |
|
|
18 |
#' that the path includes specific identifiers related to the target samples, as the function will create further subdivisions in the specified |
|
|
19 |
#' path for tumor or normal tissues. Please structure the 'output_file_path' following this pattern: './your_directory/your_sample_type.exp.rds'. |
|
|
20 |
#' |
|
|
21 |
#' @importFrom dplyr distinct filter |
|
|
22 |
#' @importFrom utils read.table |
|
|
23 |
#' @importFrom rlang .data |
|
|
24 |
#' @export |
|
|
25 |
#' @author Dongyue Yu |
|
|
26 |
#' |
|
|
27 |
#' @examples |
|
|
28 |
#' counts_file <- system.file("extdata", "TCGA-SKCM.htseq_counts_test.tsv", package = "TransProR") |
|
|
29 |
#' gene_probes_file <- system.file("extdata", |
|
|
30 |
#' "TCGA_gencode.v22.annotation.gene.probeMap_test", |
|
|
31 |
#' package = "TransProR") |
|
|
32 |
#' phenotype_file <- system.file("extdata", "TCGA-SKCM.GDC_phenotype_test.tsv", package = "TransProR") |
|
|
33 |
#' ouput_file <- file.path(tempdir(), "SKCM_Skin_TCGA_exp_test.rds") |
|
|
34 |
#' |
|
|
35 |
#' SKCM_exp <- get_tcga_exp( |
|
|
36 |
#' counts_file_path = counts_file, |
|
|
37 |
#' gene_probes_file_path = gene_probes_file, |
|
|
38 |
#' phenotype_file_path = phenotype_file, |
|
|
39 |
#' output_file_path = ouput_file |
|
|
40 |
#' ) |
|
|
41 |
#' head(SKCM_exp[["tumor_tcga_data"]])[1:5, 1:5] |
|
|
42 |
#' head(SKCM_exp[["normal_tcga_data"]], n = 10) # Because there is only one column. |
|
|
43 |
get_tcga_exp <- function(counts_file_path, |
|
|
44 |
gene_probes_file_path, |
|
|
45 |
phenotype_file_path, |
|
|
46 |
output_file_path) { |
|
|
47 |
# Load expression matrix |
|
|
48 |
# count_data <- data.table::fread(counts_file_path, header = TRUE, sep = '\t', data.table = FALSE) |
|
|
49 |
count_data <- utils::read.table(counts_file_path, |
|
|
50 |
header = TRUE, |
|
|
51 |
sep = '\t', |
|
|
52 |
stringsAsFactors = FALSE, |
|
|
53 |
check.names = FALSE) |
|
|
54 |
|
|
|
55 |
# Load gene ID conversion information |
|
|
56 |
# gene_probes <- data.table::fread(gene_probes_file_path, header = TRUE, sep = '\t', data.table = FALSE) |
|
|
57 |
gene_probes <- utils::read.table(gene_probes_file_path, |
|
|
58 |
header = TRUE, |
|
|
59 |
sep = '\t', |
|
|
60 |
stringsAsFactors = FALSE, |
|
|
61 |
check.names = FALSE) |
|
|
62 |
|
|
|
63 |
# Keep only necessary columns |
|
|
64 |
gene_probes <- gene_probes[, c(1, 2)] |
|
|
65 |
|
|
|
66 |
# Merge gene ID information with expression matrix |
|
|
67 |
count_probes_merged <- merge(gene_probes, count_data, by.x = "id", by.y = "Ensembl_ID") |
|
|
68 |
|
|
|
69 |
# Remove duplicates |
|
|
70 |
count_data_unique <- dplyr::distinct(count_probes_merged, .data$gene, .keep_all = TRUE) |
|
|
71 |
|
|
|
72 |
# Set gene names as row names |
|
|
73 |
rownames(count_data_unique) <- count_data_unique$gene |
|
|
74 |
count_data_final <- count_data_unique[, -c(1,2)] # Remove extra columns |
|
|
75 |
|
|
|
76 |
# Load clinical information |
|
|
77 |
# phenotype_data <- data.table::fread(phenotype_file_path, header = TRUE, sep = '\t', data.table = FALSE) |
|
|
78 |
# Load clinical information with proper column types |
|
|
79 |
phenotype_data <- utils::read.table(phenotype_file_path, |
|
|
80 |
header = TRUE, |
|
|
81 |
sep = '\t', |
|
|
82 |
stringsAsFactors = FALSE, |
|
|
83 |
check.names = FALSE, |
|
|
84 |
colClasses = list( |
|
|
85 |
withdrawn = "logical", |
|
|
86 |
releasable.project = "logical", |
|
|
87 |
is_ffpe.samples = "logical", |
|
|
88 |
oct_embedded.samples = "logical" |
|
|
89 |
)) |
|
|
90 |
rownames(phenotype_data) <- phenotype_data$submitter_id.samples # Set sample names as row names |
|
|
91 |
|
|
|
92 |
|
|
|
93 |
# Check first if there are any "Metastatic", "Primary Tumor", or "normal" samples in your data |
|
|
94 |
table(phenotype_data$sample_type.samples) |
|
|
95 |
|
|
|
96 |
# Create a data frame for tumor samples |
|
|
97 |
tumor_samples <- phenotype_data %>% |
|
|
98 |
dplyr::filter(grepl("Metastatic|Primary Tumor", .data$sample_type.samples, ignore.case = TRUE)) |
|
|
99 |
|
|
|
100 |
# Check for 'Primary Tumor' or 'Metastatic' samples |
|
|
101 |
if (nrow(tumor_samples) == 0) { |
|
|
102 |
message("No 'Primary Tumor' or 'Metastatic' samples found.\n") |
|
|
103 |
} else { |
|
|
104 |
message("Number of 'Primary Tumor' or 'Metastatic' samples: ", nrow(tumor_samples), "\n") |
|
|
105 |
} |
|
|
106 |
|
|
|
107 |
# Create a data frame for normal samples |
|
|
108 |
normal_samples <- phenotype_data %>% |
|
|
109 |
dplyr::filter(grepl("normal", .data$sample_type.samples, ignore.case = TRUE)) |
|
|
110 |
|
|
|
111 |
if (nrow(normal_samples) == 0) { |
|
|
112 |
message("No 'normal' samples found.\n") |
|
|
113 |
} else { |
|
|
114 |
message("Number of 'normal' samples:", nrow(normal_samples), "\n") |
|
|
115 |
} |
|
|
116 |
|
|
|
117 |
# Get the intersection of clinical information and expression matrix |
|
|
118 |
tumor_common_samples <- intersect(rownames(tumor_samples), colnames(count_data_final)) |
|
|
119 |
|
|
|
120 |
# Extract corresponding expression matrix |
|
|
121 |
tumor_expression_data <- count_data_final[, tumor_common_samples] |
|
|
122 |
|
|
|
123 |
# Get the intersection of clinical information and expression matrix |
|
|
124 |
normal_common_samples <- intersect(rownames(normal_samples), colnames(count_data_final)) |
|
|
125 |
|
|
|
126 |
# Extract corresponding expression matrix |
|
|
127 |
normal_expression_data <- count_data_final[, normal_common_samples, drop = FALSE] |
|
|
128 |
|
|
|
129 |
# Save results |
|
|
130 |
tumor_output_path <- gsub("\\.rds$", "_tumor.rds", output_file_path) |
|
|
131 |
normal_output_path <- gsub("\\.rds$", "_normal.rds", output_file_path) |
|
|
132 |
saveRDS(tumor_expression_data, file = tumor_output_path) |
|
|
133 |
saveRDS(normal_expression_data, file = normal_output_path) |
|
|
134 |
|
|
|
135 |
|
|
|
136 |
result <- list( |
|
|
137 |
tumor_tcga_data = tumor_expression_data, |
|
|
138 |
normal_tcga_data = normal_expression_data |
|
|
139 |
) |
|
|
140 |
|
|
|
141 |
return(result) |
|
|
142 |
} |