Diff of /R/CombatNormal.R [000000] .. [0f2269]

Switch to unified view

a b/R/CombatNormal.R
1
#' Process and Correct Batch Effects in TCGA's normal tissue and GTEX Data
2
#'
3
#' This function takes a TCGA's normal tissue data set and a pre-saved GTEX data set, asks the user
4
#' for specific TCGA's normal tissue types to retain, then merges the two datasets. The merged dataset
5
#' is then corrected for batch effects using the ComBat_seq function from the 'sva' package.
6
#'
7
#' @description
8
#' The function first extracts histological types from the provided TCGA's normal tissue data set.
9
#' After displaying these types, the user is prompted to input specific types to retain.
10
#' The data is then filtered based on this input.
11
#' The GTEX and TCGA's normal tissue datasets are then combined and batch corrected.
12
#'
13
#' Note: This function assumes that TCGA's normal samples and GTEX samples represent different batches.
14
#'
15
#' @param TCGA_normal_data_path The path to the tumor data stored in an RDS file.
16
#' @param gtex_data_path The path to the GTEX data stored in an RDS file.
17
#' @param CombatNormal_output_path A character string specifying the path where the output RDS file will be saved.
18
#' @param auto_mode Logical. If set to TRUE, the function will not prompt the user for input and
19
#'                  will instead use the values provided in default_input. Default is FALSE.
20
#' @param default_input Character string. When auto_mode is TRUE, this parameter specifies the default
21
#'                      TGCA's normal tissue types to be retained. It should be provided as a comma-separated string (e.g., "11,12").
22
#'
23
#' @return A data.frame with corrected values after the ComBat_seq adjustment. Note that this function also saves the
24
#'        combat_count_df data as an RDS file at the specified output path.
25
#'
26
#' @details
27
#' The ComBat_seq function from the 'sva' package is used to correct batch effects.
28
#' The function requires the 'sva' package to be installed and loaded externally.
29
#'
30
#' The example code uses `tempfile()` to generate temporary paths dynamically during execution.
31
#' These paths are valid during the `R CMD check` process, even if no actual files exist,
32
#' because `tempfile()` generates a unique file path that does not depend on the user's file system.
33
#' Using `tempfile()` ensures that the example code does not rely on specific external files and
34
#' avoids errors during `R CMD check`. CRAN review checks for documentation correctness
35
#' and syntax parsing, not the existence of actual files, as long as the example code is syntactically valid.
36
#'
37
#' @examples
38
#' TCGA_normal_file <- system.file("extdata",
39
#'                                 "SKCM_Skin_TCGA_exp_normal_test.rds",
40
#'                                 package = "TransProR")
41
#' gtex_file <- system.file("extdata", "Skin_SKCM_Gtex_test.rds", package = "TransProR")
42
#' output_file <- file.path(tempdir(), "SKCM_Skin_Combat_Normal_TCGA_GTEX_count.rds")
43
#'
44
#' SKCM_Skin_Combat_Normal_TCGA_GTEX_count <- Combat_Normal(
45
#'   TCGA_normal_data_path = TCGA_normal_file,
46
#'   gtex_data_path = gtex_file,
47
#'   CombatNormal_output_path = output_file,
48
#'   auto_mode = TRUE,
49
#'   default_input = "skip"
50
#' )
51
#' head(SKCM_Skin_Combat_Normal_TCGA_GTEX_count)[1:5, 1:5]
52
#'
53
#' @seealso \code{\link[sva]{ComBat_seq}}
54
#' @importFrom sva ComBat_seq
55
#' @importFrom tibble column_to_rownames
56
#' @export
57
Combat_Normal <- function(TCGA_normal_data_path, gtex_data_path, CombatNormal_output_path, auto_mode = FALSE, default_input = "11,12") {
58
59
  # Load the TGCA's normal tissue data and GTEX data
60
  TCGA_normal_data <- readRDS(TCGA_normal_data_path)
61
  gtex_data <- readRDS(gtex_data_path)
62
63
  # Extract histological types for the TGCA's normal tissue data's samples
64
  NormalHistologicalTypes <- substring(colnames(TCGA_normal_data), 14, 15)
65
66
  # Filter only the normal samples
67
  normal_data <- TCGA_normal_data[, as.numeric(NormalHistologicalTypes) > 10]
68
69
  # Display the table to the user
70
  normal_hist_table <- table(NormalHistologicalTypes)
71
  #print(normal_hist_table)
72
73
  message(" ")
74
  message("NormalHistologicalTypes:")
75
  message(paste(names(normal_hist_table), normal_hist_table, sep = ": ", collapse = "\n"))
76
  # Add a space after the output for separation
77
  message(" ")
78
79
  # Ask the user for input or use default input in auto_mode
80
  if(auto_mode) {
81
    selected_types <- strsplit(default_input, ",")[[1]]
82
  } else {
83
    message("Please input the normal tissue types you wish to retain or 'skip' to only use GTEX data: ")
84
    selected_types <- strsplit(readline(), ",")[[1]]
85
  }
86
87
  # If the user didn't select 'skip'
88
  if (!is.element("skip", selected_types)) {
89
    # Filter the tumor data based on user's input
90
    normal <- normal_data[, NormalHistologicalTypes %in% selected_types]
91
92
    # Combine GTEX and selected TCGA data
93
    # Merge the datasets, ensuring both have genes as row names
94
    combined_data <- merge(normal, gtex_data, by = "row.names")
95
    combined_data <- tibble::column_to_rownames(combined_data, var = "Row.names")  # Set the row names
96
97
    # Create group vector (All samples as same group)
98
    combined_group <- rep("all_group", ncol(combined_data))
99
100
    # Create batch vector for selected normal TCGA samples
101
    tcga_batches <- match(NormalHistologicalTypes[NormalHistologicalTypes %in% selected_types], selected_types)
102
103
    # Combine the batch vector for TCGA samples with GTEX batch
104
    combined_batch <- c(tcga_batches, rep("GTEX", ncol(gtex_data)))
105
106
    # Modify the tumor values
107
    combined_data_count <- 2^(combined_data) - 1
108
    combined_data_count <- apply(combined_data_count, 2, as.integer)
109
    rownames(combined_data_count) <- rownames(combined_data)
110
111
    # Correct for batch effects using ComBat_seq
112
    combat_count <- sva::ComBat_seq(as.matrix(combined_data_count),
113
                                    batch = combined_batch,
114
                                    group = combined_group)
115
116
    # Convert matrix to data frame
117
    combat_count_df <- as.data.frame(combat_count)
118
119
    saveRDS(combat_count_df, file = CombatNormal_output_path)
120
    return(combat_count_df)
121
  } else {
122
    combat_count_matrix <- 2^(gtex_data) - 1
123
    combat_count_matrix <- apply(combat_count_matrix, 2, as.integer)
124
    rownames(combat_count_matrix) <- rownames(gtex_data)
125
    combat_count_df <- as.data.frame(combat_count_matrix)
126
    saveRDS(combat_count_df, file = CombatNormal_output_path)
127
    return(combat_count_df)
128
  }
129
}