|
a |
|
b/R/CombatNormal.R |
|
|
1 |
#' Process and Correct Batch Effects in TCGA's normal tissue and GTEX Data |
|
|
2 |
#' |
|
|
3 |
#' This function takes a TCGA's normal tissue data set and a pre-saved GTEX data set, asks the user |
|
|
4 |
#' for specific TCGA's normal tissue types to retain, then merges the two datasets. The merged dataset |
|
|
5 |
#' is then corrected for batch effects using the ComBat_seq function from the 'sva' package. |
|
|
6 |
#' |
|
|
7 |
#' @description |
|
|
8 |
#' The function first extracts histological types from the provided TCGA's normal tissue data set. |
|
|
9 |
#' After displaying these types, the user is prompted to input specific types to retain. |
|
|
10 |
#' The data is then filtered based on this input. |
|
|
11 |
#' The GTEX and TCGA's normal tissue datasets are then combined and batch corrected. |
|
|
12 |
#' |
|
|
13 |
#' Note: This function assumes that TCGA's normal samples and GTEX samples represent different batches. |
|
|
14 |
#' |
|
|
15 |
#' @param TCGA_normal_data_path The path to the tumor data stored in an RDS file. |
|
|
16 |
#' @param gtex_data_path The path to the GTEX data stored in an RDS file. |
|
|
17 |
#' @param CombatNormal_output_path A character string specifying the path where the output RDS file will be saved. |
|
|
18 |
#' @param auto_mode Logical. If set to TRUE, the function will not prompt the user for input and |
|
|
19 |
#' will instead use the values provided in default_input. Default is FALSE. |
|
|
20 |
#' @param default_input Character string. When auto_mode is TRUE, this parameter specifies the default |
|
|
21 |
#' TGCA's normal tissue types to be retained. It should be provided as a comma-separated string (e.g., "11,12"). |
|
|
22 |
#' |
|
|
23 |
#' @return A data.frame with corrected values after the ComBat_seq adjustment. Note that this function also saves the |
|
|
24 |
#' combat_count_df data as an RDS file at the specified output path. |
|
|
25 |
#' |
|
|
26 |
#' @details |
|
|
27 |
#' The ComBat_seq function from the 'sva' package is used to correct batch effects. |
|
|
28 |
#' The function requires the 'sva' package to be installed and loaded externally. |
|
|
29 |
#' |
|
|
30 |
#' The example code uses `tempfile()` to generate temporary paths dynamically during execution. |
|
|
31 |
#' These paths are valid during the `R CMD check` process, even if no actual files exist, |
|
|
32 |
#' because `tempfile()` generates a unique file path that does not depend on the user's file system. |
|
|
33 |
#' Using `tempfile()` ensures that the example code does not rely on specific external files and |
|
|
34 |
#' avoids errors during `R CMD check`. CRAN review checks for documentation correctness |
|
|
35 |
#' and syntax parsing, not the existence of actual files, as long as the example code is syntactically valid. |
|
|
36 |
#' |
|
|
37 |
#' @examples |
|
|
38 |
#' TCGA_normal_file <- system.file("extdata", |
|
|
39 |
#' "SKCM_Skin_TCGA_exp_normal_test.rds", |
|
|
40 |
#' package = "TransProR") |
|
|
41 |
#' gtex_file <- system.file("extdata", "Skin_SKCM_Gtex_test.rds", package = "TransProR") |
|
|
42 |
#' output_file <- file.path(tempdir(), "SKCM_Skin_Combat_Normal_TCGA_GTEX_count.rds") |
|
|
43 |
#' |
|
|
44 |
#' SKCM_Skin_Combat_Normal_TCGA_GTEX_count <- Combat_Normal( |
|
|
45 |
#' TCGA_normal_data_path = TCGA_normal_file, |
|
|
46 |
#' gtex_data_path = gtex_file, |
|
|
47 |
#' CombatNormal_output_path = output_file, |
|
|
48 |
#' auto_mode = TRUE, |
|
|
49 |
#' default_input = "skip" |
|
|
50 |
#' ) |
|
|
51 |
#' head(SKCM_Skin_Combat_Normal_TCGA_GTEX_count)[1:5, 1:5] |
|
|
52 |
#' |
|
|
53 |
#' @seealso \code{\link[sva]{ComBat_seq}} |
|
|
54 |
#' @importFrom sva ComBat_seq |
|
|
55 |
#' @importFrom tibble column_to_rownames |
|
|
56 |
#' @export |
|
|
57 |
Combat_Normal <- function(TCGA_normal_data_path, gtex_data_path, CombatNormal_output_path, auto_mode = FALSE, default_input = "11,12") { |
|
|
58 |
|
|
|
59 |
# Load the TGCA's normal tissue data and GTEX data |
|
|
60 |
TCGA_normal_data <- readRDS(TCGA_normal_data_path) |
|
|
61 |
gtex_data <- readRDS(gtex_data_path) |
|
|
62 |
|
|
|
63 |
# Extract histological types for the TGCA's normal tissue data's samples |
|
|
64 |
NormalHistologicalTypes <- substring(colnames(TCGA_normal_data), 14, 15) |
|
|
65 |
|
|
|
66 |
# Filter only the normal samples |
|
|
67 |
normal_data <- TCGA_normal_data[, as.numeric(NormalHistologicalTypes) > 10] |
|
|
68 |
|
|
|
69 |
# Display the table to the user |
|
|
70 |
normal_hist_table <- table(NormalHistologicalTypes) |
|
|
71 |
#print(normal_hist_table) |
|
|
72 |
|
|
|
73 |
message(" ") |
|
|
74 |
message("NormalHistologicalTypes:") |
|
|
75 |
message(paste(names(normal_hist_table), normal_hist_table, sep = ": ", collapse = "\n")) |
|
|
76 |
# Add a space after the output for separation |
|
|
77 |
message(" ") |
|
|
78 |
|
|
|
79 |
# Ask the user for input or use default input in auto_mode |
|
|
80 |
if(auto_mode) { |
|
|
81 |
selected_types <- strsplit(default_input, ",")[[1]] |
|
|
82 |
} else { |
|
|
83 |
message("Please input the normal tissue types you wish to retain or 'skip' to only use GTEX data: ") |
|
|
84 |
selected_types <- strsplit(readline(), ",")[[1]] |
|
|
85 |
} |
|
|
86 |
|
|
|
87 |
# If the user didn't select 'skip' |
|
|
88 |
if (!is.element("skip", selected_types)) { |
|
|
89 |
# Filter the tumor data based on user's input |
|
|
90 |
normal <- normal_data[, NormalHistologicalTypes %in% selected_types] |
|
|
91 |
|
|
|
92 |
# Combine GTEX and selected TCGA data |
|
|
93 |
# Merge the datasets, ensuring both have genes as row names |
|
|
94 |
combined_data <- merge(normal, gtex_data, by = "row.names") |
|
|
95 |
combined_data <- tibble::column_to_rownames(combined_data, var = "Row.names") # Set the row names |
|
|
96 |
|
|
|
97 |
# Create group vector (All samples as same group) |
|
|
98 |
combined_group <- rep("all_group", ncol(combined_data)) |
|
|
99 |
|
|
|
100 |
# Create batch vector for selected normal TCGA samples |
|
|
101 |
tcga_batches <- match(NormalHistologicalTypes[NormalHistologicalTypes %in% selected_types], selected_types) |
|
|
102 |
|
|
|
103 |
# Combine the batch vector for TCGA samples with GTEX batch |
|
|
104 |
combined_batch <- c(tcga_batches, rep("GTEX", ncol(gtex_data))) |
|
|
105 |
|
|
|
106 |
# Modify the tumor values |
|
|
107 |
combined_data_count <- 2^(combined_data) - 1 |
|
|
108 |
combined_data_count <- apply(combined_data_count, 2, as.integer) |
|
|
109 |
rownames(combined_data_count) <- rownames(combined_data) |
|
|
110 |
|
|
|
111 |
# Correct for batch effects using ComBat_seq |
|
|
112 |
combat_count <- sva::ComBat_seq(as.matrix(combined_data_count), |
|
|
113 |
batch = combined_batch, |
|
|
114 |
group = combined_group) |
|
|
115 |
|
|
|
116 |
# Convert matrix to data frame |
|
|
117 |
combat_count_df <- as.data.frame(combat_count) |
|
|
118 |
|
|
|
119 |
saveRDS(combat_count_df, file = CombatNormal_output_path) |
|
|
120 |
return(combat_count_df) |
|
|
121 |
} else { |
|
|
122 |
combat_count_matrix <- 2^(gtex_data) - 1 |
|
|
123 |
combat_count_matrix <- apply(combat_count_matrix, 2, as.integer) |
|
|
124 |
rownames(combat_count_matrix) <- rownames(gtex_data) |
|
|
125 |
combat_count_df <- as.data.frame(combat_count_matrix) |
|
|
126 |
saveRDS(combat_count_df, file = CombatNormal_output_path) |
|
|
127 |
return(combat_count_df) |
|
|
128 |
} |
|
|
129 |
} |