--- a
+++ b/R/TCGA_Data_Preparation.R
@@ -0,0 +1,546 @@
+# TCGA_Data_Preparation.R
+
+getGDCprojects()
+
+# Download_GDC.R
+
+# ==== Preperation ====
+if (!require(SummarizedExperiment)) {
+  install.packages("SummarizedExperiment")
+  library(SummarizedExperiment)
+}
+if (!require(parallel)) {
+  install.packages("parallel")
+  library(parallel)
+}
+if (!require(data.table)) {
+  install.packages("data.table")
+  library(data.table)
+}
+# Note that on Linux, libmariadb-client-lgpl-dev must be installed
+# This allows for configuration of RMySQL
+if (!require(TCGAbiolinks)) {
+  if (!requireNamespace("BiocManager", quietly = TRUE))
+    install.packages("BiocManager")
+  BiocManager::install("TCGAbiolinks")
+  require(TCGAbiolinks)
+  ??TCGAbiolinks
+  
+  library(TCGAbiolinks)
+}
+
+getGDCInfo()
+getGDCprojects()
+getProjectSummary("TCGA-ACC")
+# Data Release 28, February 02, 2021
+# Move all data to a single directory later on
+dir.create(("Data/TCGA"))
+dir.create("Data/TCGA/SummarizedExperiments")
+dir.create("Data/TCGA/SummarizedExperiments/Mut")
+dir.create("Data/TCGA/SummarizedExperiments/CNV")
+dir.create("Data/TCGA/SummarizedExperiments/Exp")
+
+projects <- TCGAbiolinks:::getGDCprojects()$project_id
+projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
+
+
+# ==== Download Mut data ====
+plyr::alply(sort(projects), 1, function(cur_project) {
+  tryCatch(
+    query <- GDCquery(
+      project = cur_project,
+      data.category = "Simple Nucleotide Variation",
+      data.type = "Masked Somatic Mutation",
+      workflow.type = 'VarScan2 Variant Aggregation and Masking'
+    )
+  )
+  query <- GDCquery(
+    project = cur_project,
+    data.category = "Simple Nucleotide Variation",
+    data.type = "Masked Somatic Mutation",
+    workflow.type = 'VarScan2 Variant Aggregation and Masking'
+  )
+  
+  GDCdownload(query, files.per.chunk = 10, method = "client")
+  GDCprepare(query,
+    save = T,
+    save.filename = paste0("Data/TCGA/SummarizedExperiments/Mut/", cur_project,
+                           "_VarScan2_MAF.rdata")
+  )
+})
+
+# ==== Download CNV data ====
+plyr::alply(sort(projects)[-(1:14)], 1, function(cur_project) {
+  tryCatch(
+    query <- GDCquery(
+      project = cur_project,
+      data.category = "Copy Number Variation",
+      data.type = "Masked Copy Number Segment", 
+    )
+  )
+  query <- GDCquery(
+    project = cur_project,
+    data.category = "Copy Number Variation",
+    data.type = "Masked Copy Number Segment",
+  )
+  GDCdownload(query, files.per.chunk = 10)
+  GDCprepare(query,
+             save = T,
+             save.filename = paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project,
+                                    "_CopyNumber.rdata"))
+  # cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project, "_CopyNumber.rdata")))
+  
+})
+
+# ==== Download clinical CNV data ====
+
+dir.create("Data/TCGA/Clinical")
+dir.create("Data/TCGA/Clinical/CNV")
+plyr::alply(sort(projects)[-(1:14)], 1, function(cur_project) {
+  tryCatch(
+    query.biospecimen <- GDCquery(project = cur_project, 
+                                  data.category = "Biospecimen",
+                                  data.type = "Biospecimen Supplement", 
+                                  data.format = "BCR Biotab")
+  )
+  query.biospecimen <- GDCquery(project = cur_project, 
+                                data.category = "Biospecimen",
+                                data.type = "Biospecimen Supplement", 
+                                data.format = "BCR Biotab")
+  GDCdownload(query.biospecimen, files.per.chunk = 10)
+  GDCprepare(query.biospecimen,
+             save = T,
+             save.filename = paste0("Data/TCGA/Clinical/CNV/", cur_project,
+                                    "_CopyNumber_Clinical.rdata"))
+  
+})
+
+# ==== Download gene expression data ====
+plyr::alply(projects, 1, function(cur_project) {
+  tryCatch(
+      query <- GDCquery(
+          project = cur_project,
+          data.category = "Transcriptome Profiling",
+          data.type = "Gene Expression Quantification",
+          workflow.type = "HTSeq - Counts"
+      )
+  )
+  query <- GDCquery(
+    project = cur_project,
+    data.category = "Transcriptome Profiling",
+    data.type = "Gene Expression Quantification",
+    workflow.type = "HTSeq - Counts"
+  )
+  GDCdownload(query, files.per.chunk = 10, method = "client")
+  GDCprepare(
+    query,
+    save = T,
+    save.filename = paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata")
+  )
+  
+})
+
+
+# =====================================
+# For each data type, combine all projects into a single file
+require(data.table)
+require(TCGAbiolinks)
+require(SummarizedExperiment)
+require(GeoTcgaData)
+require(stringr)
+require(biomaRt)
+require(curl)
+
+# Get clinical info from Gene Expression data ====
+all_proj_info <- vector(mode = "list", length = length(projects))
+i <- 1
+for (cur_project in projects) {
+  cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata")))
+  cur_proj_assay <- assays(cur_proj_data)[[1]]
+  cur_proj_info <- colData(cur_proj_data)
+  all_proj_info[[i]] <- as.data.table(cur_proj_info[, c("barcode", "name")])
+  i <- i + 1
+}
+
+all_proj_info <- rbindlist(all_proj_info)
+# Save
+fwrite(all_proj_info, "Data/TCGA/All_Sample_Info.csv")
+
+
+# ==== Merge Exp Data ====
+require(data.table)
+require(biomaRt)
+dir.create("Data/TCGA/Exp")
+projects <- TCGAbiolinks:::getGDCprojects()$project_id
+projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
+# gene_dict <- fread("Data/gene_dictionary.txt")
+depmap_exp <- fread("Data/DepMap/20Q2/CCLE_expression.csv")
+depmap_ensg_numbers <- str_replace(colnames(depmap_exp), ".+\\((.+)\\)", "\\1")
+depmap_hgnc <- str_replace(colnames(depmap_exp), "(.+)\\s\\(.+\\)", "\\1")
+depmap_exp_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers)
+
+rm(depmap_exp)
+
+ensembl = useMart(
+  biomart = "ENSEMBL_MART_ENSEMBL",
+  # host = "feb2014.archive.ensembl.org",
+  path = "/biomart/martservice",
+  dataset = "hsapiens_gene_ensembl"
+)
+atts <- listAttributes(ensembl, what = "name")
+atts[grep(pattern = "gene", x = atts, ignore.case = T)]
+
+cur_project = projects[1]
+
+for (cur_project in projects) {
+  cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata")))
+  cur_proj_assay <- assays(cur_proj_data)[[1]]
+  cur_proj_info <- colData(cur_proj_data)
+  cur_proj_tpm <- GeoTcgaData::fpkmToTpm_matrix(cur_proj_assay)
+  
+  # log2 transform with a pseudo-count of 1
+  cur_proj_tpm <- cur_proj_tpm+1
+  cur_proj_tpm <- log2(cur_proj_tpm)
+  
+  # Convert ENSG ID's to numbers
+  cur_ensg_numbers <- str_replace(rownames(cur_proj_tpm), "ENSG0*", "")
+  cur_proj_tpm_dt <- as.data.table(cur_proj_tpm, keep.rownames = T)
+  cur_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id",
+                                            "external_gene_name"),
+                         filters = "ensembl_gene_id", values = cur_proj_tpm_dt$rn,
+                         mart = ensembl)
+  cur_dict <- as.data.table(cur_dict)
+  temp <- merge(cur_proj_tpm_dt, cur_dict, by.x = "rn", by.y = "ensembl_gene_id")
+  # Find overlap with HGNC symbols in DepMap 
+  cur_reduced_dt <- temp[external_gene_name %in% depmap_exp_dict$HGNC]
+  
+  dup_genes <- cur_reduced_dt[duplicated(cur_reduced_dt$external_gene_name)]$external_gene_name
+  # Duplicated genes may represent transcripts from the same gene, must consolidate by adding TPM values
+  cur_reduced_dt$rn <- NULL
+  cur_reduced_dt <- cur_reduced_dt[, lapply(.SD, sum), by = external_gene_name]
+  final_dt <- transpose(cur_reduced_dt, keep.names = "external_gene_name")
+  
+  colnames(final_dt) <- unname(unlist(final_dt[1,]))
+  final_dt <- final_dt[!(TSPAN6 == "TSPAN6")]
+  # Add cancer type
+  final_dt <- cbind(cancer_type = cur_proj_info$name, final_dt)
+  colnames(final_dt)[2] <- "tcga_sample_id"
+  # Sort all the other columns
+  new_order <- c("tcga_sample_id", "cancer_type", sort(colnames(final_dt)[-c(1:2)]))
+  setcolorder(final_dt, new_order)
+  
+  fwrite(final_dt, paste0("Data/TCGA/Exp/", cur_project, "_Exp.csv"))
+}
+
+# Now merge all the data.tables
+exp_file_paths <- list.files("Data/TCGA/Exp/", full.names = T)
+exp_dt_list <- vector(mode = "list", length = length(exp_file_paths))
+
+for (i in 1:length(exp_file_paths)) {
+  exp_dt_list[[i]] <- fread(exp_file_paths[i])
+}
+
+all_exp <- rbindlist(exp_dt_list)
+all_exp[1:5, 1:5]
+dim(all_exp)
+fwrite(all_exp, "Data/TCGA/TCGA_All_Exp_Data.csv")
+rm(exp_dt_list)
+rm(all_exp)
+
+# all_exp <- fread("Data/TCGA/TCGA_All_Exp_Data.csv")
+# ==== Merge CNV Data ====
+require(GenomicRanges)
+# Must find the genomic ranges of genes in DepMap CNV data (ensure reference is the same), then merge using the GRanges package (map onto)
+dir.create("Data/TCGA/CNV")
+projects <- TCGAbiolinks:::getGDCprojects()$project_id
+projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
+# gene_dict <- fread("Data/gene_dictionary.txt")
+depmap_cnv <- fread("Data/DepMap/20Q2/CCLE_gene_copy_number.csv")
+head(colnames(depmap_cnv))
+depmap_ensg_numbers <- str_replace(colnames(depmap_cnv), ".+\\((.+)\\)", "\\1")
+depmap_hgnc <- str_replace(colnames(depmap_cnv), "(.+)\\s\\(.+\\)", "\\1")
+depmap_cnv_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers)
+
+rm(depmap_cnv)
+
+ensembl = useMart(
+  biomart = "ENSEMBL_MART_ENSEMBL",
+  # host = "feb2014.archive.ensembl.org",
+  path = "/biomart/martservice",
+  dataset = "hsapiens_gene_ensembl"
+)
+
+cur_translation <- biomaRt::getBM(attributes = c("ensembl_gene_id", "wikigene_name",
+                                                 "external_gene_name", 'chromosome_name',
+                                                 'start_position', 'end_position', 'strand'),
+                           filters = "external_gene_name", values = depmap_cnv_dict$HGNC, uniqueRows = T,
+                           mart = ensembl)
+cur_translation <- as.data.table(cur_translation)
+cur_translation <- cur_translation[!duplicated(cur_translation[,"wikigene_name"]), ]
+cur_translation <- cur_translation[!duplicated(cur_translation[,"external_gene_name"]), ]
+
+depmap_cnv_dict_1 <- merge(depmap_cnv_dict, cur_translation, by.x = "HGNC", by.y = "external_gene_name")
+depmap_cnv_dict_1$wikigene_name <- NULL
+# unfound <- depmap_cnv_dict$HGNC[!(depmap_cnv_dict$HGNC %in% cur_translation$wikigene_name)]
+unfound <- depmap_cnv_dict$HGNC[!(depmap_cnv_dict$HGNC %in% cur_translation$external_gene_name)]
+# Search for the unfound in wikigene info
+unfound_translation <- biomaRt::getBM(attributes = c("ensembl_gene_id", "wikigene_name",
+                                                 "external_gene_name", 'chromosome_name',
+                                                 'start_position', 'end_position', 'strand'),
+                                  filters = "wikigene_name", values = unfound, uniqueRows = T,
+                                  mart = ensembl)
+unfound_translation <- as.data.table(unfound_translation)
+depmap_cnv_dict_2 <- merge(depmap_cnv_dict, unfound_translation, by.x = "HGNC", by.y = "wikigene_name")
+depmap_cnv_dict_2$external_gene_name <- NULL
+
+# Add together
+# all_translation <- rbindlist(list(cur_translation, unfound_translation))
+depmap_cnv_dict <- rbindlist(list(depmap_cnv_dict_1, depmap_cnv_dict_2))
+depmap_cnv_dict$strand[depmap_cnv_dict$strand == -1] <- "-"
+depmap_cnv_dict$strand[depmap_cnv_dict$strand == 1] <- "+"
+
+# Convert to GRanges
+depmap_cnv_granges <- makeGRangesFromDataFrame(df = depmap_cnv_dict, keep.extra.columns = T, seqnames.field = "chromosome_name",
+                         start.field = "start_position", end.field = "end_position", strand.field = "strand")
+
+atts <- listAttributes(ensembl, what = "name")
+atts[grep(pattern = "gene", x = atts, ignore.case = T)]
+atts[grep(pattern = "chromosome", x = atts, ignore.case = T)]
+
+cur_project = projects[1]
+
+rm(list = c("depmap_cnv_dict_1", "depmap_cnv_dict_2"))
+
+
+for (cur_project in projects) {
+  cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project, "_CopyNumber.rdata")))
+  # Convert to GRanges
+  cur_proj_granges <- makeGRangesFromDataFrame(df = cur_proj_data, keep.extra.columns = T, start.field = "Start",
+                                               end.field = "End", seqnames.field = "Chromosome")
+  # Merge with DepMap data
+  cur_proj_merge <- mergeByOverlaps(query = cur_proj_granges, subject = depmap_cnv_granges)
+  cur_proj_merge <- cur_proj_merge[, c("Sample", "HGNC", "Segment_Mean")]
+  cur_proj_merge <- as.data.table(cur_proj_merge)
+  cur_proj_merge <- unique(cur_proj_merge)
+  
+  cur_wide_data <- dcast(cur_proj_merge, Sample ~ HGNC, value.var = "Segment_Mean", fun.aggregate = sum)
+  
+  colnames(cur_wide_data)[1] <- "tcga_sample_id"
+  # Sort all the other columns
+  new_order <- c("tcga_sample_id", sort(colnames(cur_wide_data)[-1]))
+  setcolorder(cur_wide_data, new_order)
+
+  fwrite(cur_wide_data, paste0("Data/TCGA/CNV/", cur_project, "_CNV.csv"))
+}
+
+# Now merge all the data.tables
+cnv_file_paths <- list.files("Data/TCGA/CNV/", full.names = T)
+cnv_dt_list <- vector(mode = "list", length = length(cnv_file_paths))
+
+for (i in 1:length(cnv_file_paths)) {
+  cnv_dt_list[[i]] <- fread(cnv_file_paths[i])
+}
+
+all_cnv <- rbindlist(cnv_dt_list)
+all_cnv[1:5, 1:5]
+fwrite(all_cnv, "Data/TCGA/TCGA_All_CNV_Data.csv")
+
+# The first 3 ID's should be enough to indicate tumor type
+require(stringr)
+all_proj_info <- fread("Data/TCGA/All_Sample_Info.csv")
+temp <- str_split_fixed(all_proj_info$barcode, "-", n = 7)[, 1:3]
+all_proj_info$sub_id <- paste(temp[,1], temp[,2], temp[,3], sep = '-')
+
+all_cnv <- fread("Data/TCGA/TCGA_All_CNV_Data.csv")
+temp2 <- str_split_fixed(all_cnv$tcga_sample_id, "-", n = 7)[, 1:4]
+sample_type <- str_split_fixed(temp2[, 4], "", n=3)[, 1:2]
+sample_type <- as.integer(paste0(sample_type[,1], sample_type[,2]))
+all_cnv$sub_id <- paste(temp2[,1], temp2[,2], temp2[,3], sep = '-')
+
+all_cnv$sample_type <- ifelse(sample_type < 10, yes = "Tumor", no = "Normal")
+table(all_cnv$sample_type)
+
+# Subset to Tumor
+all_cnv <- all_cnv[sample_type == "Tumor"]
+# add cancer type
+all_cnv <- merge(all_cnv, all_proj_info[, c("name", "sub_id")], by = "sub_id")
+table(all_cnv$name)
+dim(all_cnv)
+# remove sub_id, sample_type
+all_cnv$sub_id <- NULL
+all_cnv$sample_type <- NULL
+# reorder
+setcolorder(all_cnv, "name")
+colnames(all_cnv)[1] <- "cancer_type"
+all_cnv[1:5, 1:5]
+
+# Save
+fwrite(all_cnv, "Data/TCGA/TCGA_All_CNV_Data.csv")
+
+dim(all_cnv)
+
+rm(cnv_dt_list)
+rm(all_cnv)
+
+
+# ==== Merge Mut Data ====
+# dir.create("Data/TCGA/Mut")
+# projects <- TCGAbiolinks:::getGDCprojects()$project_id
+# projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
+# # gene_dict <- fread("Data/gene_dictionary.txt")
+# depmap_mut <- fread("Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv")
+# depmap_ensg_numbers <- str_replace(colnames(depmap_mut), ".+\\((.+)\\)", "\\1")
+# depmap_hgnc <- str_replace(colnames(depmap_mut), "(.+)\\s\\(.+\\)", "\\1")
+# depmap_mut_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers)
+# 
+# rm(depmap_mut)
+# 
+# ensembl = useMart(
+#   biomart = "ENSEMBL_MART_ENSEMBL",
+#   # host = "feb2014.archive.ensembl.org",
+#   path = "/biomart/martservice",
+#   dataset = "hsapiens_gene_ensembl"
+# )
+# atts <- listAttributes(ensembl, what = "name")
+# 
+# # cur_project = projects[1]
+# 
+# for (cur_project in projects) {
+#   cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Mut/", cur_project, "_VarScan2_MAF.rdata")))
+#   cur_sub <- cur_proj_data[, c("Hugo_Symbol", "Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode")]
+#   unique(cur_sub$Hugo_Symbol)
+#   # Find overlap with DepMap
+#   sum(depmap_mut_dict$HGNC %in% unique(cur_sub$Hugo_Symbol))
+#   
+#   
+#   cur_proj_assay <- assays(cur_proj_data)[[1]]
+#   # cur_proj_assay[1:5, 1:5]
+#   # class(cur_proj_assay)
+#   cur_proj_tpm <- GeoTcgaData::fpkmToTpm_matrix(cur_proj_assay)
+#   # Convert ENSG ID's to numbers
+#   cur_ensg_numbers <- str_replace(rownames(cur_proj_tpm), "ENSG0*", "")
+#   cur_proj_tpm_dt <- as.data.table(cur_proj_tpm, keep.rownames = T)
+#   # cur_proj_tpm_dt[1:5, 1:5]
+#   # cur_proj_tpm_dt$rn <- str_replace(cur_proj_tpm_dt$rn, "ENSG0*", "")
+#   cur_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id",
+#                                             "external_gene_name"),
+#                              filters = "ensembl_gene_id", values = cur_proj_tpm_dt$rn,
+#                              mart = ensembl)
+#   cur_dict <- as.data.table(cur_dict)
+#   temp <- merge(cur_proj_tpm_dt, cur_dict, by.x = "rn", by.y = "ensembl_gene_id")
+#   # dim(temp)
+#   # Find overlap with HGNC symbols in DepMap 
+#   cur_reduced_dt <- temp[external_gene_name %in% depmap_exp_dict$HGNC]
+#   # dim(cur_reduced_dt)
+#   
+#   # sum(duplicated(cur_reduced_dt$external_gene_name))
+#   dup_genes <- cur_reduced_dt[duplicated(cur_reduced_dt$external_gene_name)]$external_gene_name
+#   # Duplicated genes may represent transcripts from the same gene, must consolidate by adding TPM values
+#   cur_reduced_dt$rn <- NULL
+#   cur_reduced_dt <- cur_reduced_dt[, lapply(.SD, sum), by = external_gene_name]
+#   final_dt <- transpose(cur_reduced_dt, keep.names = "external_gene_name")
+#   colnames(final_dt) <- unname(unlist(final_dt[1,]))
+#   # final_dt[1:5, 1:5]
+#   final_dt <- final_dt[!(TSPAN6 == "TSPAN6")]
+#   # final_dt[1:5, 1:5]
+#   # dim(final_dt)
+#   
+#   # temp1[external_gene_name %in% dup_genes][,c("external_gene_name", "TCGA-FI-A2D5-01A-11R-A17B-07")]
+#   fwrite(final_dt, paste0("Data/TCGA/Exp/", cur_project, "_Exp.csv"))
+# }
+# 
+# # Now merge all the data.tables
+# exp_file_paths <- list.files("Data/TCGA/Exp/", full.names = T)
+# exp_dt_list <- vector(mode = "list", length = length(exp_file_paths))
+# 
+# for (i in 1:length(exp_file_paths)) {
+#   exp_dt_list[[i]] <- fread(exp_file_paths[i])
+# }
+# 
+# all_exp <- rbindlist(exp_dt_list)
+# fwrite(all_exp, "Data/TCGA/TCGA_All_Exp_Data.csv")
+# 
+
+
+# ==== Sort and Clean Data ====
+
+# ==== Match TCGA and DepMap Exp Data ====
+require(data.table)
+exp_tcga <- fread("Data/TCGA/TCGA_All_Exp_Data.csv")
+exp_tcga[1:5, 1:5]
+exp_depmap <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
+exp_depmap[1:5, 1:5]
+dim(exp_tcga)
+dim(exp_depmap)
+
+# Find shared columns:
+shared_exp <- intersect(colnames(exp_depmap), colnames(exp_tcga))
+# Sort columns
+shared_exp <- sort(shared_exp)
+shared_exp <- c("tcga_sample_id", "cancer_type", shared_exp)
+exp_tcga <- exp_tcga[, ..shared_exp]
+colnames(exp_tcga)[2] <- "primary_disease"
+
+exp_tcga[1:5, 1:5]
+dim(exp_tcga)
+
+# Save TCGA pre-training data
+fwrite(exp_tcga, "Data/DRP_Training_Data/TCGA_PreTraining_Expression.csv")
+
+# Subset DepMap data to what's available in TCGA
+shared_exp <- unique(c("stripped_cell_line_name", "primary_disease", sort(intersect(colnames(exp_depmap), colnames(exp_tcga)))))
+exp_depmap <- exp_depmap[, ..shared_exp]
+exp_depmap[1:5, 1:5]
+dim(exp_depmap)
+# Save DepMap training data
+fwrite(exp_depmap, "Data/DRP_Training_Data/DepMap_21Q2_Training_Expression.csv")
+
+rm(exp_depmap)
+rm(exp_tcga)
+# exp_tcga <- fread("Data/DRP_Training_Data/TCGA_PreTraining_Expression.csv")
+# exp_depmap <- fread("Data/DRP_Training_Data/DepMap_20Q2_Training_Expression.csv")
+# exp_tcga[1:5, 1:5]
+# exp_depmap[1:5, 1:5]
+
+# ==== Match TCGA and DepMap CNV Data ====
+cnv_tcga <- fread("Data/TCGA/TCGA_All_CNV_Data.csv")
+cnv_tcga[1:5, 1:5]
+cnv_depmap <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv")
+cnv_depmap[1:5, 1:5]
+dim(cnv_depmap)
+dim(cnv_tcga)
+
+shared_cnv <- intersect(colnames(cnv_depmap), colnames(cnv_tcga))
+# Sort columns
+shared_cnv <- sort(shared_cnv)
+shared_cnv <- c("tcga_sample_id", "cancer_type", shared_cnv)
+cnv_tcga <- cnv_tcga[, ..shared_cnv]
+colnames(cnv_tcga)[2] <- "primary_disease"
+
+cnv_tcga[1:5, 1:5]
+
+# Must add pseudocounts to log2 ratios of TCGA data
+temp <- cnv_tcga[, -c(1:2)][, lapply(.SD, function(x) {log2(2**x + 1)})]
+temp[1:5, 1:5]
+temp$tcga_sample_id <- cnv_tcga$tcga_sample_id
+temp$primary_disease <- cnv_tcga$primary_disease
+setcolorder(temp, c("tcga_sample_id", "primary_disease"))
+temp[1:5, 1:5]
+dim(temp)
+
+fwrite(temp, "Data/DRP_Training_Data/TCGA_PreTraining_CopyNumber.csv")
+
+
+# Subset DepMap data to what's available in TCGA
+cnv_depmap[1:5, 1:5]
+shared_cnv <- unique(c("stripped_cell_line_name", "primary_disease", sort(intersect(colnames(cnv_depmap), colnames(cnv_tcga)))))
+cnv_depmap <- cnv_depmap[, ..shared_cnv]
+
+cnv_depmap[1:5, 1:5]
+dim(cnv_depmap)
+# Save DepMap training data
+fwrite(cnv_depmap, "Data/DRP_Training_Data/DepMap_21Q2_Training_CopyNumber.csv")
+
+rm(cnv_depmap)
+rm(cnv_tcga)
+gc()