[0f2269]: / R / CombatNormal.R

Download this file

130 lines (114 with data), 6.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#' Process and Correct Batch Effects in TCGA's normal tissue and GTEX Data
#'
#' This function takes a TCGA's normal tissue data set and a pre-saved GTEX data set, asks the user
#' for specific TCGA's normal tissue types to retain, then merges the two datasets. The merged dataset
#' is then corrected for batch effects using the ComBat_seq function from the 'sva' package.
#'
#' @description
#' The function first extracts histological types from the provided TCGA's normal tissue data set.
#' After displaying these types, the user is prompted to input specific types to retain.
#' The data is then filtered based on this input.
#' The GTEX and TCGA's normal tissue datasets are then combined and batch corrected.
#'
#' Note: This function assumes that TCGA's normal samples and GTEX samples represent different batches.
#'
#' @param TCGA_normal_data_path The path to the tumor data stored in an RDS file.
#' @param gtex_data_path The path to the GTEX data stored in an RDS file.
#' @param CombatNormal_output_path A character string specifying the path where the output RDS file will be saved.
#' @param auto_mode Logical. If set to TRUE, the function will not prompt the user for input and
#' will instead use the values provided in default_input. Default is FALSE.
#' @param default_input Character string. When auto_mode is TRUE, this parameter specifies the default
#' TGCA's normal tissue types to be retained. It should be provided as a comma-separated string (e.g., "11,12").
#'
#' @return A data.frame with corrected values after the ComBat_seq adjustment. Note that this function also saves the
#' combat_count_df data as an RDS file at the specified output path.
#'
#' @details
#' The ComBat_seq function from the 'sva' package is used to correct batch effects.
#' The function requires the 'sva' package to be installed and loaded externally.
#'
#' The example code uses `tempfile()` to generate temporary paths dynamically during execution.
#' These paths are valid during the `R CMD check` process, even if no actual files exist,
#' because `tempfile()` generates a unique file path that does not depend on the user's file system.
#' Using `tempfile()` ensures that the example code does not rely on specific external files and
#' avoids errors during `R CMD check`. CRAN review checks for documentation correctness
#' and syntax parsing, not the existence of actual files, as long as the example code is syntactically valid.
#'
#' @examples
#' TCGA_normal_file <- system.file("extdata",
#' "SKCM_Skin_TCGA_exp_normal_test.rds",
#' package = "TransProR")
#' gtex_file <- system.file("extdata", "Skin_SKCM_Gtex_test.rds", package = "TransProR")
#' output_file <- file.path(tempdir(), "SKCM_Skin_Combat_Normal_TCGA_GTEX_count.rds")
#'
#' SKCM_Skin_Combat_Normal_TCGA_GTEX_count <- Combat_Normal(
#' TCGA_normal_data_path = TCGA_normal_file,
#' gtex_data_path = gtex_file,
#' CombatNormal_output_path = output_file,
#' auto_mode = TRUE,
#' default_input = "skip"
#' )
#' head(SKCM_Skin_Combat_Normal_TCGA_GTEX_count)[1:5, 1:5]
#'
#' @seealso \code{\link[sva]{ComBat_seq}}
#' @importFrom sva ComBat_seq
#' @importFrom tibble column_to_rownames
#' @export
Combat_Normal <- function(TCGA_normal_data_path, gtex_data_path, CombatNormal_output_path, auto_mode = FALSE, default_input = "11,12") {
# Load the TGCA's normal tissue data and GTEX data
TCGA_normal_data <- readRDS(TCGA_normal_data_path)
gtex_data <- readRDS(gtex_data_path)
# Extract histological types for the TGCA's normal tissue data's samples
NormalHistologicalTypes <- substring(colnames(TCGA_normal_data), 14, 15)
# Filter only the normal samples
normal_data <- TCGA_normal_data[, as.numeric(NormalHistologicalTypes) > 10]
# Display the table to the user
normal_hist_table <- table(NormalHistologicalTypes)
#print(normal_hist_table)
message(" ")
message("NormalHistologicalTypes:")
message(paste(names(normal_hist_table), normal_hist_table, sep = ": ", collapse = "\n"))
# Add a space after the output for separation
message(" ")
# Ask the user for input or use default input in auto_mode
if(auto_mode) {
selected_types <- strsplit(default_input, ",")[[1]]
} else {
message("Please input the normal tissue types you wish to retain or 'skip' to only use GTEX data: ")
selected_types <- strsplit(readline(), ",")[[1]]
}
# If the user didn't select 'skip'
if (!is.element("skip", selected_types)) {
# Filter the tumor data based on user's input
normal <- normal_data[, NormalHistologicalTypes %in% selected_types]
# Combine GTEX and selected TCGA data
# Merge the datasets, ensuring both have genes as row names
combined_data <- merge(normal, gtex_data, by = "row.names")
combined_data <- tibble::column_to_rownames(combined_data, var = "Row.names") # Set the row names
# Create group vector (All samples as same group)
combined_group <- rep("all_group", ncol(combined_data))
# Create batch vector for selected normal TCGA samples
tcga_batches <- match(NormalHistologicalTypes[NormalHistologicalTypes %in% selected_types], selected_types)
# Combine the batch vector for TCGA samples with GTEX batch
combined_batch <- c(tcga_batches, rep("GTEX", ncol(gtex_data)))
# Modify the tumor values
combined_data_count <- 2^(combined_data) - 1
combined_data_count <- apply(combined_data_count, 2, as.integer)
rownames(combined_data_count) <- rownames(combined_data)
# Correct for batch effects using ComBat_seq
combat_count <- sva::ComBat_seq(as.matrix(combined_data_count),
batch = combined_batch,
group = combined_group)
# Convert matrix to data frame
combat_count_df <- as.data.frame(combat_count)
saveRDS(combat_count_df, file = CombatNormal_output_path)
return(combat_count_df)
} else {
combat_count_matrix <- 2^(gtex_data) - 1
combat_count_matrix <- apply(combat_count_matrix, 2, as.integer)
rownames(combat_count_matrix) <- rownames(gtex_data)
combat_count_df <- as.data.frame(combat_count_matrix)
saveRDS(combat_count_df, file = CombatNormal_output_path)
return(combat_count_df)
}
}