--- a
+++ b/R/02_Omic_Data_Preparation.R
@@ -0,0 +1,1132 @@
+# Complete_Sample_Prep.R
+
+# This script is intended to pair genomics, transcriptomics, proteomics and drug response data
+# mainly from the DepMap resource.
+path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
+dir.create(paste0(path, "Data/DRP_Training_Data"))
+
+require(data.table)
+
+# ==== Cell line info cleanup ====
+depmap_samples <- fread(paste0(path, "Data/DepMap/21Q2/sample_info.csv"))
+# Subset relevant (machine learning) columns 
+depmap_samples <- depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")]
+
+fwrite(depmap_samples, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
+
+# depmap_samples <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
+# ==== Expression data cleanup ====
+ccle_exp <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_expression.csv"))
+max(ccle_exp[, -1])
+min(ccle_exp[, -1])
+dim(ccle_exp)
+ccle_exp[1:5, 1:20]
+# Change column names to only contain HGNC name: replace everything after first word with ""
+colnames(ccle_exp) <- gsub(" .+", "", colnames(ccle_exp))
+colnames(ccle_exp)[1] <- "DepMap_ID"
+# Merge with sample info to have cell line name in addition to DepMap ID
+ccle_exp <- merge(ccle_exp, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")], by = "DepMap_ID")
+ccle_exp[, DepMap_ID := NULL]
+ccle_exp[1:5, 1:20]
+
+# Move cell line name to the first column: just giving the column name to the function moves it to first place
+setcolorder(ccle_exp, neworder = sort(colnames(ccle_exp)))
+setcolorder(ccle_exp, neworder = c("stripped_cell_line_name", "primary_disease"))
+ccle_exp[1:5, 1:20]
+
+# Save
+fwrite(ccle_exp, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Expression.csv"), sep = ',')
+
+ccle_exp <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Expression.csv"))
+
+ccle_exp
+# DIMENSIONS OF EXPRESSION DATA: 1375 X 19178
+rm(ccle_exp)
+# ==== Copy number data cleanup ====
+ccle_cn <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_gene_copy_number.csv"))
+dim(ccle_cn)
+ccle_cn[1:5, 1:10]
+# Change column names to only contain HGNC name: replace everything after first word with ""
+colnames(ccle_cn) <- gsub(" .+", "", colnames(ccle_cn))
+colnames(ccle_cn)[1] <- "DepMap_ID"
+# Merge with sample info to have cell line name in addition to DepMap ID
+ccle_cn <- merge(ccle_cn, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")], by = "DepMap_ID")
+ccle_cn[, DepMap_ID := NULL]
+
+setcolorder(ccle_cn, neworder = sort(colnames(ccle_cn)))
+setcolorder(ccle_cn, neworder = c("stripped_cell_line_name", "primary_disease"))
+ccle_cn[1:5, 1:20]
+which(is.na(ccle_cn), arr.ind = T)
+ccle_cn[1407, 26569]
+
+# Replace NA with 0
+setnafill(ccle_cn, fill = 0, cols = unique(which(is.na(ccle_cn), arr.ind = T)[,2]))
+
+dim(ccle_cn)
+anyNA(ccle_cn)
+sum(is.na(ccle_cn))
+which(is.na(ccle_cn))
+# Save
+fwrite(ccle_cn, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv"), sep = ',')
+
+# DIMENSIONS OF COPY NUMBER DATA: 1740 X 27563
+rm(ccle_cn)
+gc()
+# ==== Proteomic data cleanup ====
+ccle_prot <- fread(paste0(path, "Data/DepMap/20Q2/CCLE_protein_quant_current_normalized.csv"))
+dim(ccle_prot)
+ccle_prot[1:5, 1:10]
+ccle_prot[1:5, 48:60]
+# Subset only the Uniprot accession (since its unique unlike HGNC) and the cell line experimental data
+ccle_prot <- ccle_prot[, c(6, 49:ncol(ccle_prot)), with = F]
+colnames(ccle_prot) <- gsub("\\_.+", "", colnames(ccle_prot))
+colnames(ccle_prot)[1] <- "Uniprot_Acc"
+# Transpose the data.table to match with other data type tables
+t <- transpose(ccle_prot, make.names = "Uniprot_Acc")
+
+# Check if transpose worked as intended
+as.numeric(unlist(t[1,])) == as.numeric(unlist(ccle_prot[,2]))
+as.numeric(unlist(t[2,])) == as.numeric(unlist(ccle_prot[,3]))
+
+# Add cell lines
+t$stripped_cell_line_name <- colnames(ccle_prot)[-1]
+# Merge with sample info to have cell line name in addition to DepMap ID
+t <- merge(t, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")], by = "stripped_cell_line_name")
+
+# Move to front
+setcolorder(t, neworder = c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
+t[1:5, 1:10]
+
+# Save
+fwrite(t, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_ProteinQuant.csv"), sep = ',')
+
+
+### Get proteins that are observed in all cell lines
+# Create the same transposed table as above
+# Remove all rows and columns that have any NA in them
+prot_nona <- na.omit(ccle_prot)
+which(is.na(prot_nona))
+# Transpose the data.table to match with other data type tables
+t <- transpose(prot_nona, make.names = "Uniprot_Acc")
+# Check if transpose worked as intended
+as.numeric(unlist(t[1,])) == as.numeric(unlist(prot_nona[,2]))
+as.numeric(unlist(t[2,])) == as.numeric(unlist(prot_nona[,3]))
+# Add cell lines
+t$stripped_cell_line_name <- colnames(prot_nona)[-1]
+# Merge with sample info to have cell line name in addition to DepMap ID
+t <- merge(t, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")], by = "stripped_cell_line_name")
+# Move to front
+setcolorder(t, neworder = c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
+t[1:5, 1:10]
+# Now we have ~5000 proteins that are available in all samples
+dim(t)
+
+# We have 3 duplicates
+sum(duplicated(t$stripped_cell_line_name))
+
+# Save
+fwrite(t, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"), sep = ',')
+# ccle_prot <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"))
+dim(ccle_prot)
+ccle_prot[1:5, 1:5]
+ccle_prot[, DepMap_ID := NULL]
+# fwrite(ccle_prot, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"), sep = ',')
+# DIMENSIONS OF PROTEIN QUANTITY DATA: 378 X 5155
+
+ccle_prot <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"))
+anyDuplicated(ccle_prot$stripped_cell_line_name)
+
+# ==== Mutation data cleanup ====
+rm(list = ls(pattern = "ccle"))
+require(data.table)
+path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
+
+ccle_mut <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_mutations.csv"))
+table(ccle_mut$isCOSMIChotspot)
+table(ccle_mut$isTCGAhotspot)
+table(ccle_mut$Variant_Type)
+length(unique(ccle_mut$DepMap_ID))
+
+dim(ccle_mut)
+ccle_mut[1,]
+colnames(ccle_mut)
+# Calculate number of mutations per cell line
+temp <- ccle_mut[, c("Variant_Type", "DepMap_ID")]
+temp[, nMut := .N, by = "DepMap_ID"]
+temp
+unique(temp$Variant_Type)
+# For simplicity, extract only SNP data for now: this discards ~90,000 mutations
+# ccle_mut <- ccle_mut[Variant_Type == "SNP"]
+dim(ccle_mut)
+t <- ccle_mut[, c("DepMap_ID", "Chromosome", "Strand", "Start_position", "End_position")]
+dim(unique(t))
+length(unique(ccle_mut$DepMap_ID))
+# Keep relevant columns/features
+# Aside: Should the sequence change be provided, or just whether the SNP is deleterious or not?
+ccle_mut <- ccle_mut[, c("DepMap_ID", "Hugo_Symbol", "Chromosome", "Start_position", "End_position", "Strand",
+             "Variant_Classification", "Variant_Type", "isDeleterious",
+             "isTCGAhotspot", "isCOSMIChotspot", "Genome_Change", "cDNA_Change")]
+dim(ccle_mut)
+length(unique(ccle_mut$DepMap_ID))
+table(ccle_mut$isDeleterious)
+table(ccle_mut$isTCGAhotspot)
+table(ccle_mut$isCOSMIChotspot)
+
+# ==== CCLE Mut Overlap with COSMIC CGC ====
+# Perhaps it's best to use the mutations in genes that COSMIC considers important, like another paper in
+# the field (~500 genes)
+# Or, we can use a binary vector for genes and whether they have a deleterious mutation: this will result in 
+# ~20,000 parameters
+length(unique(ccle_mut$Hugo_Symbol))
+
+length(unique(ccle_mut[isCOSMIChotspot == T]$Hugo_Symbol))
+length(unique(ccle_mut[isTCGAhotspot == T]$Hugo_Symbol))
+length(unique(ccle_mut[isDeleterious == T]$Hugo_Symbol))
+
+tcga_hotspot_genes <- unique(ccle_mut[isTCGAhotspot == T]$Hugo_Symbol)
+# Read COSMIC Cancer Gene Census data
+cgc <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/COSMIC/cancer_gene_census.csv")
+dim(cgc)
+cgc[1:5, 1:20]
+length(unique(cgc$`Gene Symbol`))
+length(unique(cgc$HGVSG))
+# Get Genes in this census
+cgc_genes <- unique(cgc$`Gene Symbol`)
+cgc[Tier == 1]
+length(unique(cgc$`Genome Location`))  # 922,732
+# rm(cgc)
+
+# Subset DepMap mutations based on the CGC genes
+sum(unique(ccle_mut$Hugo_Symbol) %in% unique(cgc_genes))
+ccle_mut <- ccle_mut[Hugo_Symbol %in% cgc_genes]
+length(unique(ccle_mut$DepMap_ID))
+
+sum(ccle_mut$isDeleterious)
+ccle_mut[Variant_Classification == "Missense_Mutation"]
+length(unique(ccle_mut[isDeleterious == T]$Hugo_Symbol))
+ccle_mut[isDeleterious == T]
+
+
+# TODO: Use CGC to check for overlap with CCLE cell lines, then collapse to whether each of the 700 genes for
+# that cell line has a mutation listed in the CGC
+length(unique(cgc$`Mutation genome position`))  # ~922,000 unique mutations
+unique(ccle_mut$NCBI_Build)  # CCLE is with GRCh 37
+unique(cgc$GRCh)  # CGC has GRCh 38
+# We must "lift over" the mutations from 37 to 38 before checking for overlap
+if (!require(liftOver)) {
+    BiocManager::install("liftOver")
+    require(liftOver)
+    require(rtracklayer)
+}
+# liftOver requires a chain file to convert 37 to 38: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/
+
+chain_path <- paste0(path, "Data/hg19ToHg38.over.chain")
+grch_37_38_chain <- import.chain(chain_path)
+
+# Must add "chr" to start of chromosome names
+ccle_mut$Chromosome <- paste0("chr", ccle_mut$Chromosome)
+# Must convert positions to GRanges
+ccle_mut_gr <- makeGRangesFromDataFrame(df = ccle_mut, keep.extra.columns = T,
+                         seqnames.field = "Chromosome", start.field = "Start_position",
+                         end.field = "End_position", strand.field = "Strand")
+length(unique(ccle_mut_gr$DepMap_ID))
+
+# Lift over
+lifted_ccle_mut <- liftOver(x = ccle_mut_gr, chain = grch_37_38_chain)
+# Convert GRangesList to GRanges
+lifted_ccle_mut <- unlist(lifted_ccle_mut)
+# Convert back to data.table
+lifted_ccle_mut <- as.data.table(lifted_ccle_mut)
+# Note: Genome_Change is now out of date!
+# Remove chr from seqnames
+lifted_ccle_mut$seqnames <- gsub("chr", "", lifted_ccle_mut$seqnames)
+# Can find the overlap of Mutation genome position in CGC with a newly created column based on CCLE positions
+lifted_ccle_mut[, Mutation_Position := paste0(seqnames, ':', start, '-', end)]
+
+ccle_mut$seqnames <- gsub("chr", "", ccle_mut$Chromosome)
+ccle_mut[, Mutation_Position := paste0(seqnames, ':', as.character(Start_position), '-', as.character(End_position))]
+
+
+length(unique(lifted_ccle_mut$DepMap_ID))
+
+sum(ccle_mut$Mutation_Position %in% unique(cgc$`Genome Location`))
+
+# Now find the overlap with CGC (which already has GRCh38)
+subset <- lifted_ccle_mut[Mutation_Position %in% unique(cgc$`Genome Location`)]
+table(subset$Variant_Type)
+length(unique(subset$DepMap_ID))
+# IMPORTANT! There is a loss of 8 cell lines (which do not have a mutation that is in
+# CGC) using the Tier 1 data only
+
+# Alternative (March 2021) ====
+# Take those mutations that are COSMIC or TCGA hotspots, ignoring CGC
+subset <- ccle_mut[isTCGAhotspot | isCOSMIChotspot]
+
+uniqueN(subset$Hugo_Symbol)
+table(ccle_mut[isTCGAhotspot == T]$Variant_Classification)
+uniqueN(ccle_mut[isTCGAhotspot == T]$Hugo_Symbol)
+uniqueN(ccle_mut[isCOSMIChotspot == T]$Hugo_Symbol)
+
+uniqueN(subset$isTCGAhotspot)
+
+### Create a vector of mutations for each cell line with the CGC genes
+length(unique(subset$Hugo_Symbol))
+sub_dcast <- dcast.data.table(data = subset[, c("DepMap_ID", "Hugo_Symbol")],
+                 formula = DepMap_ID ~ Hugo_Symbol, fun.aggregate = length, value.var = "DepMap_ID")
+dim(sub_dcast)
+sub_dcast[1:5, 1:50]
+sum(sub_dcast$A1BG)
+sum(sub_dcast$A1CF)
+
+depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
+sub_dcast <- merge(sub_dcast, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
+                  by = "DepMap_ID")
+setcolorder(sub_dcast, c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
+sub_dcast[1:5, 1:50]
+
+# Save
+fwrite(sub_dcast, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv"), sep = ',')
+dim(cgc_muts)
+cgc_muts[1:5, 1:5]
+typeof(cgc_muts[1,2])
+
+temp <- fread("Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv")
+dim(temp)
+temp[1:5, 1:50]
+
+# # Attach the cell line name and primary disease
+# # cgc_muts <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"))
+# depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
+# cgc_muts <- merge(cgc_muts, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
+#                   by = "DepMap_ID")
+# setcolorder(cgc_muts, neworder = c("stripped_cell_line_name", colnames(cgc_muts)[-ncol(cgc_muts)]))
+# 
+# # Save
+# fwrite(cgc_muts, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"), sep = ',')
+# cgc_muts <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"))
+# cgc_muts[1:5, 1:5]
+# cgc_muts[, DepMap_ID := NULL]
+# DIMENSIONS OF CGC MUTATIONAL DATA: 1733 X 697
+
+
+
+# ==== miRNA Data Cleanup ====
+path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
+require(data.table)
+depmap_samples <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
+
+ccle_mirna <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_miRNA_20181103.gct")
+dim(ccle_mirna)
+anyNA(ccle_mirna)
+ccle_mirna[1:5, 1:5]
+
+min(ccle_mirna[, -c(1:2)], na.rm = T)
+max(ccle_mirna[, -c(1:2)], na.rm = T)
+
+ccle_mirna <- transpose(ccle_mirna, keep.names = "Name")
+dim(ccle_mirna)
+ccle_mirna[1:5, 1:5]
+ccle_mirna$Name
+sum(duplicated(unlist(ccle_mirna[2, ])))
+ccle_mirna <- ccle_mirna[-1,]
+ccle_mirna[1:5, 1:5]
+colnames(ccle_mirna) <- unlist(ccle_mirna[1,])
+ccle_mirna <- ccle_mirna[-1,]
+
+# Clean cell line name
+ccle_mirna$Description <- gsub(pattern = "\\_.+", replacement = "", ccle_mirna$Description)
+colnames(ccle_mirna)[1] <- "stripped_cell_line_name"
+ccle_mirna <- merge(ccle_mirna, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
+                    by = "stripped_cell_line_name")
+dim(ccle_mirna)
+ccle_mirna[1:5, 1:5]
+setcolorder(ccle_mirna, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
+
+# fwrite(ccle_mirna, paste0(path, "Data/DRP_Training_Data/DepMap_2019_miRNA.csv"), sep = ',')
+
+rm(ccle_mirna)
+
+# ccle_mirna <- fread("Data/DRP_Training_Data/DepMap_2019_miRNA.csv")
+# dim(ccle_mirna)
+# ==== Metabolomics Data Cleanup ====
+depmap_samples <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
+ccle_metab <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_metabolomics_20190502.csv")
+
+dim(ccle_metab)
+ccle_metab[1:5, 1:5]
+
+min(ccle_metab[, -c(1:2)], na.rm = T)
+max(ccle_metab[, -c(1:2)], na.rm = T)
+
+anyNA(ccle_metab)
+sum(is.na(ccle_metab))
+which(is.na(ccle_metab), arr.ind = T)
+ccle_metab[which(is.na(ccle_metab), arr.ind = T)]
+ccle_metab[554, 2]  # DepMap_ID is NA
+
+min(ccle_metab[, -c(1:2)])
+ccle_metab <- merge(ccle_metab[, -1], depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
+                    by = "DepMap_ID")
+
+setcolorder(ccle_metab, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
+ccle_metab$DepMap_ID <- NULL
+dim(ccle_metab)
+
+fwrite(ccle_metab, paste0(path, "Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv"), sep = ',')
+rm(ccle_metab)
+
+temp <- fread("Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv")
+dim(temp)
+
+# ==== RPPA Data Cleanup ====
+ccle_rppa <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RPPA_20181003.csv")
+dim(ccle_rppa)
+ccle_rppa[1:5, 1:5]
+anyNA(ccle_rppa)
+
+min(ccle_rppa[, -c(1:2)], na.rm = T)  # has negative values
+max(ccle_rppa[, -c(1:2)], na.rm = T)
+
+ccle_rppa$V1 <- gsub(pattern = "\\_.+", replacement = "", ccle_rppa$V1)
+colnames(ccle_rppa)[1] <- "stripped_cell_line_name"
+ccle_rppa <- merge(ccle_rppa, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
+                    by = "stripped_cell_line_name")
+dim(ccle_rppa)
+ccle_rppa[1:5, 1:10]
+setcolorder(ccle_rppa, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
+
+fwrite(ccle_rppa, paste0(path, "Data/DRP_Training_Data/DepMap_2019_RPPA.csv"), sep = ',')
+
+rm(ccle_rppa)
+
+temp <- fread("Data/DRP_Training_Data/DepMap_2019_RPPA.csv")
+dim(temp)
+# ==== Chromatin Profiling Data Cleanup ====
+ccle_chrom <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_GlobalChromatinProfiling_20181130.csv")
+dim(ccle_chrom)
+ccle_chrom[1:5, 1:10]
+
+min(ccle_chrom[, -c(1:2)], na.rm = T)  # has negative values
+max(ccle_chrom[, -c(1:2)], na.rm = T)
+
+
+anyNA(ccle_chrom)
+sum(is.na(ccle_chrom))  # 842 NA values
+
+unique(which(is.na(ccle_chrom), arr.ind = T)[,2])
+length(unique(which(is.na(ccle_chrom), arr.ind = T)[,2]))  # 26 columns have NAs
+
+# Convert NA to 0
+setnafill(ccle_chrom, fill = 0, cols = unique(which(is.na(ccle_chrom), arr.ind = T)[,2]))
+anyNA(ccle_chrom)
+
+ccle_chrom$CellLineName <- gsub(pattern = "\\_.+", replacement = "", ccle_chrom$CellLineName)
+colnames(ccle_chrom)[1] <- "stripped_cell_line_name"
+dim(ccle_chrom)
+ccle_chrom <- merge(ccle_chrom, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
+                   by = "stripped_cell_line_name")
+ccle_chrom$BroadID <- NULL
+dim(ccle_chrom)
+ccle_chrom[1:5, 1:10]
+setcolorder(ccle_chrom, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
+
+# fwrite(ccle_chrom, paste0(path, "Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv"), sep = ',')
+
+rm(ccle_chrom)
+
+temp <- fread(paste0("Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv"))
+dim(temp)
+# ==== Fusion Data Cleanup ====
+ccle_fusion <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_fusions.csv")
+dim(ccle_fusion)
+ccle_fusion[1:5, 1:17]
+length(unique(ccle_fusion$FusionName))
+length(unique(ccle_fusion$DepMap_ID))
+unique(ccle_fusion$SpliceType)
+quantile(ccle_fusion$FFPM)
+
+ccle_fusion$CellLineName <- gsub(pattern = "\\_.+", replacement = "", ccle_fusion$CellLineName)
+colnames(ccle_fusion)[1] <- "stripped_cell_line_name"
+dim(ccle_fusion)
+ccle_fusion <- merge(ccle_fusion, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
+                    by = "stripped_cell_line_name")
+ccle_fusion$BroadID <- NULL
+dim(ccle_fusion)
+ccle_fusion[1:5, 1:10]
+setcolorder(ccle_fusion, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
+
+fwrite(ccle_fusion, paste0(path, "Data/DRP_Training_Data/DepMap_2019_GeneFusion.csv"), sep = ',')
+
+rm(ccle_fusion)
+
+# ==== Exon Usage Ratio Data Cleanup ====
+require(data.table)
+setDTthreads(8)
+ccle_exon <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RNAseq_ExonUsageRatio_20180929.gct")
+dim(ccle_exon)
+ccle_exon[1:10, 1:17]
+
+transpose(ccle_exon, keep.names = "exon")
+length(unique(ccle_exon$FusionName))
+length(unique(ccle_exon$DepMap_ID))
+unique(ccle_exon$SpliceType)
+quantile(ccle_exon$FFPM)
+
+ccle_exon$CellLineName <- gsub(pattern = "\\_.+", replacement = "", ccle_exon$CellLineName)
+colnames(ccle_exon)[1] <- "stripped_cell_line_name"
+dim(ccle_exon)
+ccle_exon <- merge(ccle_exon, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
+                     by = "stripped_cell_line_name")
+ccle_exon$BroadID <- NULL
+dim(ccle_exon)
+ccle_exon[1:5, 1:10]
+setcolorder(ccle_exon, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
+
+# fwrite(ccle_exon, paste0(path, "Data/DRP_Training_Data/DepMap_2019_ExonUsageRatio.csv"), sep = ',')
+
+rm(ccle_exon)
+
+# ==== RRBS Profiling Data Cleanup ====
+require(data.table)
+setDTthreads(8)
+
+# === TSS
+ccle_tss <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RRBS_TSS1kb_20181022.txt")
+dim(ccle_tss)
+ccle_tss[1:5, 1:5]
+length(unique(ccle_tss$cluster_id))
+
+ccle_tss <- transpose(ccle_tss[, -2], keep.names = "cluster_id")
+colnames(ccle_tss) <- unlist(ccle_tss[1,])
+ccle_tss <- ccle_tss[-1,]
+
+# === Promoter
+ccle_tss <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE")
+dim(ccle_tss)
+ccle_tss[1:5, 1:5]
+length(unique(ccle_tss$cluster_id))
+
+ccle_tss <- transpose(ccle_tss[, -2], keep.names = "cluster_id")
+colnames(ccle_tss) <- unlist(ccle_tss[1,])
+ccle_tss <- ccle_tss[-1,]
+# === Enhancers
+
+
+# ==== GDSC Cell Line Characterization Data Cleanup ====
+require(data.table)
+gdsc_line_info <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC_Line_Info.csv")
+gdsc_line_info <- gdsc_line_info[-c(1,2,3, 1005), c("V1", "V2", "V9")]
+colnames(gdsc_line_info) <- c("stripped_cell_line_name", "COSMIC_ID", "primary_disease")
+gdsc_line_info$stripped_cell_line_name <- gsub("[^[:alnum:] ]", "", gdsc_line_info$stripped_cell_line_name)
+gdsc_line_info$stripped_cell_line_name <- gsub(" ", "", gdsc_line_info$stripped_cell_line_name)
+gdsc_line_info$stripped_cell_line_name <- toupper(gdsc_line_info$stripped_cell_line_name)
+
+# Match with DepMap cell lines
+depmap_line_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
+sum(gdsc_line_info$stripped_cell_line_name %in% depmap_line_info$stripped_cell_line_name) / nrow(gdsc_line_info)  # 98%
+gdsc_line_info[!(gdsc_line_info$stripped_cell_line_name %in% depmap_line_info$stripped_cell_line_name)]
+gdsc_line_info[stripped_cell_line_name == "EOL1CELL"] <- "EOL1"
+gdsc_line_info[stripped_cell_line_name == "SR"] <- "SR786"
+gdsc_line_info[stripped_cell_line_name == "U266"] <- "U266B1"
+gdsc_line_info[stripped_cell_line_name == "G292"] <- "G292CLONEA141B1"
+gdsc_line_info[stripped_cell_line_name == "NCISNU1"] <- "SNU1"
+gdsc_line_info[stripped_cell_line_name == "NCISNU5"] <- "SNU5"
+gdsc_line_info[stripped_cell_line_name == "NCISNU16"] <- "SNU16"
+gdsc_line_info[stripped_cell_line_name == "7860"] <- "786O"
+gdsc_line_info[stripped_cell_line_name == "U031"] <- "UO31"
+gdsc_line_info[stripped_cell_line_name == "H3255"] <- "NCIH3255"
+gdsc_line_info[stripped_cell_line_name == "NCIH510A"] <- "NCIH510"
+gdsc_line_info[stripped_cell_line_name == "U251"] <- "U251MG"
+gdsc_line_info[stripped_cell_line_name == "WM793B"] <- "WM793"
+gdsc_line_info[stripped_cell_line_name == "OVCAR3"] <- "NIHOVCAR3"
+gdsc_line_info[stripped_cell_line_name == "SCC90"] <- "UPCISCC090"
+gdsc_line_info[COSMIC_ID == "1299064"] <- "TDOTT"
+gdsc_line_info[COSMIC_ID == "930299"] <- "TT"
+gdsc_line_info[COSMIC_ID == "909976"] <- "KMH2"
+gdsc_line_info[COSMIC_ID == "1298167"] <- "KMHDASH2"
+sum(gdsc_line_info$stripped_cell_line_name %in% depmap_line_info$stripped_cell_line_name) / nrow(gdsc_line_info)  # 100%
+
+
+# ==== GDSC Expression Data Cleanup ====
+gdsc_exp <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC_Expression.txt")
+gdsc_exp[1:5, 1:5]
+# Delete invalid gene symbol rows
+gdsc_exp <- gdsc_exp[GENE_SYMBOLS != ""]
+
+# Rename Expression columns
+colnames(gdsc_exp) <- gsub("DATA.", "", colnames(gdsc_exp))
+gdsc_exp <- gdsc_exp[, !"GENE_title", with = F]
+# Must transpose to have cell lines on each row
+gdsc_exp <- transpose(gdsc_exp, keep.names = "GENE_SYMBOLS")
+gdsc_exp[1:5, 1:5]
+colnames(gdsc_exp) <- unlist(gdsc_exp[1,])
+gdsc_exp <- gdsc_exp[-1,]
+colnames(gdsc_exp)[1] <- "COSMIC_ID"
+# gdsc_exp[COSMIC_ID == "1299064"][1:5, 1:5]
+# gdsc_exp[COSMIC_ID == "930299"][1:5, 1:5]
+# Merge with cell line names
+gdsc_exp <- merge(gdsc_exp, gdsc_line_info, by = "COSMIC_ID")
+gdsc_exp$COSMIC_ID <- NULL
+setcolorder(gdsc_exp, order(colnames(gdsc_exp)))
+setcolorder(gdsc_exp, c("stripped_cell_line_name", "primary_disease"))
+gdsc_exp[1:5, 1:5]
+# Match genes with those from DepMap Cell lines
+depmap_exp <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
+# depmap_exp <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DRP_Training_Data/DepMap_21Q2_Training_Expression.csv")
+depmap_exp[1:5, 1:5]
+
+dim(gdsc_exp)
+dim(depmap_exp)
+
+t1 <- colnames(gdsc_exp)
+t2 <- colnames(depmap_exp)
+sum(t1 %in% t2) / ncol(gdsc_exp)  # 91%
+colnames(gdsc_exp)[!colnames(gdsc_exp) %in% colnames(depmap_exp)]
+
+# Update gene names with biomaRt
+# require(biomaRt)
+# ensembl = useMart(
+#   biomart = "ENSEMBL_MART_ENSEMBL",
+#   # host = "feb2014.archive.ensembl.org",
+#   path = "/biomart/martservice",
+#   dataset = "hsapiens_gene_ensembl"
+# )
+# atts <- listAttributes(ensembl, what = "name")
+# atts[grep(pattern = "gene", x = atts, ignore.case = T)]
+# depmap_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id",
+#                                           "external_gene_name", "wikigene_name"),
+#                            filters = "external_gene_name", values = colnames(depmap_exp),
+#                            mart = ensembl)
+# depmap_dict <- as.data.table(depmap_dict)
+# depmap_dict <- unique(depmap_dict[, 2:3])
+# 
+# gdsc_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id",
+#                                              "external_gene_name", "wikigene_name"),
+#                               filters = "external_gene_name", values = colnames(gdsc_exp),
+#                               mart = ensembl)
+# gdsc_dict <- as.data.table(gdsc_dict)
+
+# Get updated gene names from: https://www.genenames.org/tools/multi-symbol-checker/
+
+all_gene_names <- data.table(unique(c(colnames(depmap_exp), colnames(gdsc_exp)))[-(1:2)])
+fwrite(all_gene_names, "GDSC_DepMap_All_Gene_Names.txt")
+hgnc_check <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/hgnc-symbol-check.csv", skip = 1)
+
+depmap_hgnc_check <- hgnc_check[Input %in% colnames(depmap_exp)[-(1:2)]]
+names(depmap_exp)[match(depmap_hgnc_check$Input, names(depmap_exp))] <- depmap_hgnc_check$`Approved symbol`
+gdsc_hgnc_check <- hgnc_check[Input %in% colnames(gdsc_exp)[-(1:2)]]
+names(gdsc_exp)[match(gdsc_hgnc_check$Input, names(gdsc_exp))] <- gdsc_hgnc_check$`Approved symbol`
+
+
+t1 <- colnames(gdsc_exp)
+t2 <- colnames(depmap_exp)
+sum(t1 %in% t2) / ncol(gdsc_exp)  # 97.3%
+sum(t2 %in% t1) / ncol(depmap_exp)  # 88.6%
+sum(t2 %in% t1)  # 16988 
+sum(t1 %in% t2)  # 16948 
+colnames(gdsc_exp)[!colnames(gdsc_exp) %in% colnames(depmap_exp)]
+
+fwrite(gdsc_exp, "Data/DRP_Training_Data/GDSC_Expression.csv")
+
+# Subset both datasets's genes based on data available to both
+shared <- intersect(colnames(gdsc_exp), colnames(depmap_exp))
+length(shared)
+gdsc_exp_sub <- gdsc_exp[, shared, with = F]
+gdsc_exp_sub[1:5, 1:5]
+
+depmap_exp_sub <- depmap_exp[, shared, with = F]
+depmap_exp_sub[1:5, 1:5]
+
+# Save
+fwrite(gdsc_exp_sub, "Data/DRP_Training_Data/GDSC_Training_Expression.csv")
+fwrite(depmap_exp_sub, "Data/DRP_Training_Data/DepMap_21Q2_Training_Expression.csv")
+
+# ==== Drug Sensitivity Data Cleanup ====
+path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
+require(data.table)
+require(webchem)
+# BiocManager::install("ChemmineR")
+require(ChemmineR)
+options(chemspider_key = "N98K4aOip0VpcSc8F9GilqIIktLt0hux")
+path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
+ctrp <- fread("Data/DRP_Training_Data/CTRP_AUC_SMILES.txt")
+gdsc1 <- fread("Data/DRP_Training_Data/GDSC1_AUC_SMILES.txt")
+gdsc2 <- fread("Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")
+
+
+# Clean up duplicate with missing pubchem
+cpd_info_1 <- fread(paste0(path, "Data/GDSC/GDSC1_Drug_Info.csv"))
+cpd_info_1[drug_name == unique(cpd_info_1[, c("drug_name", "pubchem")])[anyDuplicated(unique(cpd_info_1[, c("drug_name", "pubchem")])$drug_name),]$drug_name]
+cpd_info_1 <- cpd_info_1[drug_id != 476]
+cpd_info_1 <- cpd_info_1[drug_id != 1490]
+cpd_info_1 <- cpd_info_1[drug_id != 1496]
+cpd_info_1 <- cpd_info_1[drug_id != 1386]
+cpd_info_1 <- cpd_info_1[drug_id != 1402]
+cpd_info_1 <- cpd_info_1[drug_id != 1393]
+nrow(cpd_info_1[pubchem == "-"])
+sum(cpd_info_1$drug_name %in% unique(ctrp$cpd_name))
+# Subset for valid pubchem IDs
+cpd_info_1 <- cpd_info_1[pubchem != "-"]
+cpd_info_1 <- cpd_info_1[pubchem != "none"]
+cpd_info_1 <- cpd_info_1[pubchem != "several"]
+cpd_info_1$pubchem <- as.numeric(cpd_info_1$pubchem)
+
+cpd_1_smiles <- webchem::pc_prop(cid = cpd_info_1$pubchem, properties = "CanonicalSMILES")
+cpd_info_1 <- merge(cpd_info_1, cpd_1_smiles, by.x = "pubchem", by.y = "CID")
+# Save
+fwrite(cpd_info_1, "Data/GDSC/GDSC1_VALID_Drug_Info.csv")
+
+
+cpd_info_2 <- fread(paste0(path, "Data/GDSC/GDSC2_Drug_Info.csv"))
+cpd_info_2[drug_name == unique(cpd_info_2[, c("drug_name", "pubchem")])[anyDuplicated(unique(cpd_info_2[, c("drug_name", "pubchem")])$drug_name),]$drug_name]
+cpd_info_2 <- cpd_info_2[drug_id != 1811]
+cpd_info_2 <- cpd_info_2[drug_id != 1806]
+cpd_info_2 <- cpd_info_2[drug_id != 1819]
+cpd_info_2 <- cpd_info_2[drug_id != 1816]
+cpd_info_2[pubchem == "25227436, 42602260"]$pubchem <- "25227436"
+cpd_info_2[pubchem == "11719003, 58641927"]$pubchem <- "11719003"
+cpd_info_2[pubchem == "66577015, 16654980"]$pubchem <- "66577015"
+cpd_info_2[pubchem == "11719003, 58641927"]$pubchem <- "11719003"
+
+nrow(cpd_info_2[pubchem == "-"])
+sum(cpd_info_2$pubchem %in% cpd_info_1$pubchem) / nrow(cpd_info_2)
+
+cpd_info_2 <- cpd_info_2[pubchem != "-"]
+cpd_info_2 <- cpd_info_2[pubchem != "none"]
+cpd_info_2 <- cpd_info_2[pubchem != "several"]
+
+cpd_info_2$pubchem <- as.numeric(cpd_info_2$pubchem)
+
+cpd_2_smiles <- webchem::pc_prop(cid = cpd_info_2$pubchem, properties = "CanonicalSMILES")
+cpd_info_2 <- merge(cpd_info_2, cpd_2_smiles, by.x = "pubchem", by.y = "CID")
+# Save
+fwrite(cpd_info_2, "Data/GDSC/GDSC2_VALID_Drug_Info.csv")
+
+
+depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
+
+# ==== GDSC ====
+require(stringr)
+gdsc1 <- fread(paste0(path, "Data/GDSC/GDSC1_Fitted_Dose_Response.csv"))
+sum(unique(gdsc1$CELL_LINE_NAME) %in% depmap_samples$stripped_cell_line_name) / length(unique(gdsc1$CELL_LINE_NAME))  # 0.22
+sum(toupper(unique(gdsc1$CELL_LINE_NAME)) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc1$CELL_LINE_NAME))  # 0.24
+sum(str_remove_all(toupper(unique(gdsc1$CELL_LINE_NAME)), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc1$CELL_LINE_NAME))  # 0.9696049
+
+dim(gdsc1)  # 310K Combinations
+colnames(gdsc1)
+sum(gdsc1$AUC == 0)
+min(gdsc1$AUC)
+max(gdsc1$AUC)
+
+# Count unique combinations in GDSC1
+length(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")])$CELL_LINE_NAME))  # 987
+length(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")])$DRUG_NAME))  # 345
+nrow(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")]))) # 292,849
+
+
+gdsc1_final <- merge(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME", "AUC")]), unique(cpd_info_1[, c("drug_name", "CanonicalSMILES")]), by.x = "DRUG_NAME", by.y = "drug_name")
+colnames(gdsc1_final) <- c("cpd_name", "ccl_name", "area_under_curve", "cpd_smiles")
+# Save
+fwrite(gdsc1_final, "Data/DRP_Training_Data/GDSC1_AUC_SMILES.txt")
+
+unique(gdsc1_pubchem$DRUG_NAME)
+# gdsc1_cs_ids <- webchem::get_csid(query = unique(gdsc1$DRUG_NAME), from = "name", match = "all", verbose = T)
+gdsc1_cs_ids <- webchem::cir_query(identifier = unique(gdsc1$DRUG_NAME), representation = "smiles", verbose = T, )
+
+# Count unique combinations in GDSC2
+gdsc2 <- fread(paste0(path, "Data/GDSC/GDSC2_Fitted_Dose_Response.csv"))
+sum(unique(gdsc2$CELL_LINE_NAME) %in% depmap_samples$stripped_cell_line_name) / length(unique(gdsc2$CELL_LINE_NAME))  # 0.2311496
+sum(toupper(unique(gdsc2$CELL_LINE_NAME)) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc2$CELL_LINE_NAME))  # 0.2546354
+sum(str_remove_all(toupper(unique(gdsc2$CELL_LINE_NAME)), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc2$CELL_LINE_NAME))  # 0.9678616
+
+gdsc2_cpd_smiles <- webchem::cir_query(identifier = unique(gdsc2$DRUG_NAME), representation = "smiles", verbose = T)
+
+dim(gdsc2)  # 135K Combinations
+colnames(gdsc2)
+length(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")])$CELL_LINE_NAME))  # 809
+length(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")])$DRUG_NAME))  # 192
+nrow(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")]))) # 131,108
+
+gdsc2_final <- merge(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME", "AUC")]), unique(cpd_info_2[, c("drug_name", "CanonicalSMILES")]), by.x = "DRUG_NAME", by.y = "drug_name")
+colnames(gdsc2_final) <- c("cpd_name", "ccl_name", "area_under_curve", "cpd_smiles")
+# Save
+fwrite(gdsc2_final, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")
+
+
+# Count overlap of drugs and cell lines
+sum(unique(gdsc1$DRUG_NAME) %in% unique(gdsc2$DRUG_NAME)) # Drug Overlap: 88
+sum(unique(gdsc1$CELL_LINE_NAME) %in% unique(gdsc2$CELL_LINE_NAME))  # Cell Line Overlap: 808
+
+# ==== CTRP ====
+require(data.table)
+path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
+
+# NOTE: Newer and better AUC calculation in PharmacoGx.R file!
+ctrp_curves <- fread(paste0(path, "Data/CTRP/v20.data.curves_post_qc.txt"))
+exper_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_experiment.txt"))
+cell_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_cell_line.txt"))
+table(cell_data$ccl_availability)
+
+# Merge sensitivity, experimental and cell line data
+temp <- merge(unique(ctrp_curves[, c("experiment_id", "master_cpd_id")]),
+              unique(exper_data[, c("experiment_id", "master_ccl_id")]),
+              by = "experiment_id")
+ctrp <- merge(temp, cell_data[, c("master_ccl_id", "ccl_name")], by = "master_ccl_id")
+sum(unique(ctrp$ccl_name) %in% depmap_samples$stripped_cell_line_name) / length(unique(ctrp$ccl_name))  # 0.9492672
+sum(toupper(unique(ctrp$ccl_name)) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(ctrp$ccl_name))  # 0.9492672
+sum(str_remove_all(toupper(unique(ctrp$ccl_name)), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(ctrp$ccl_name))  # 0.9503946
+
+
+# Add compound information
+cpd_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_compound.txt"))
+ctrp <- merge(ctrp, cpd_data[, c("master_cpd_id", "cpd_name", "cpd_smiles")], by = "master_cpd_id")
+
+# Add AUC curve information
+ctrp_auc <- fread(paste0(path, "Data/CTRP/v20.data.curves_post_qc.txt"))
+
+ctrp <- merge(ctrp, ctrp_auc[, c("experiment_id", "master_cpd_id", "area_under_curve")], by = c("experiment_id", "master_cpd_id"))
+
+
+# Save
+fwrite(ctrp, paste0(path, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt"))
+
+# Add primary disease information. NOTE: This removes some DR data as 45 cell lines in CTRPv2 cannot be paired with DepMap!!!
+line_info <- fread("Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv")
+ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
+
+sum(unique(ctrp$ccl_name) %in% unique(line_info$stripped_cell_line_name))  # 150
+
+line_info$other_ccl_name <- str_replace(toupper(line_info$stripped_cell_line_name), "-", "")
+ctrp$other_ccl_name <- str_replace(toupper(ctrp$ccl_name), "-", "")
+
+ctrp <- merge(ctrp, line_info[, c("other_ccl_name", "primary_disease")], by = "other_ccl_name")
+ctrp$other_ccl_name <- NULL
+setcolorder(ctrp, neworder = c("cpd_name", "ccl_name", "primary_disease", "area_under_curve", "cpd_smiles"))
+
+fwrite(ctrp, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt")
+
+
+# Experiment ID 
+unique(ctrp[, c("master_ccl_id", "experiment_id")])
+length(unique(ctrp$master_ccl_id))
+length(unique(ctrp$experiment_id))
+length(unique(ctrp$ccl_name))
+length(unique(ctrp$master_cpd_id))
+
+# Check overlap with GDSC 1 and 2
+sum(unique(ctrp$ccl_name) %in% gdsc1$CELL_LINE_NAME)
+sum(unique(ctrp$ccl_name) %in% gdsc2$CELL_LINE_NAME)
+
+dim(ctrp)  # 395K Combinations
+
+
+
+# ==== Chemical Data Cleanup ====
+require(data.table)
+path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
+
+chembl <- fread(paste0(path, "Data/chembl_27_chemreps.txt"))
+dim(chembl)
+
+
+
+# ==== EDA ======
+require(data.table)
+require(stringr)
+require(ggplot2)
+line_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
+ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
+gdsc2 <- fread("Data/DRP_Training_Data/GDSC2_AAC_SMILES.txt")
+exp <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
+mut <- fread("Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv")
+cnv <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv")
+prot <- fread("Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv")
+mirna <- fread("Data/DRP_Training_Data/DepMap_2019_miRNA.csv")
+metab <- fread("Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv")
+hist <- fread("Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv")
+rppa <- fread("Data/DRP_Training_Data/DepMap_2019_RPPA.csv")
+
+# pdb_table <- fread("Data/cell_annotation_table_1.1.1.csv")
+# pdb_sub <- pdb_table[, c("CTRPv2.cellid", "CCLE.cellid")]
+# pdb_sub <- pdb_sub[!is.na(CTRPv2.cellid) & !is.na(CCLE.cellid)]
+
+
+exp[1:5., 1:5]
+
+length(unique(ctrp$ccl_name))
+length(unique(ctrp$cpd_name))
+
+sum(unique(ctrp$ccl_name) %in% line_info$stripped_cell_line_name) / length(unique(ctrp$ccl_name))
+ccl_names = toupper(ctrp$ccl_name)
+ccl_names = unique(str_replace(ccl_names, "-", ""))
+length(ccl_names)
+
+sum(ccl_names %in% line_info$stripped_cell_line_name) / length(ccl_names)
+
+line_info[!(stripped_cell_line_name %in% ccl_names)]
+ctrp[ccl_name %like% "NIHOVCAR3"]
+ctrp[ccl_name %like% "HEL"]
+
+# sum(exp$stripped_cell_line_name %in% pdb_sub$CCLE.cellid) 
+
+# Remove hyphens and convert all to upper case
+# pdb_ccl_names = pdb_sub$CCLE.cellid
+# pdb_ccl_names = str_replace(toupper(pdb_ccl_names), "-", "")
+
+ctrp$ccl_name = str_replace(toupper(ctrp$ccl_name), "-", "")
+     
+exp_ccl_names = exp$stripped_cell_line_name
+exp_ccl_names = str_replace(toupper(exp_ccl_names), "-", "")
+
+mut_ccl_names = mut$stripped_cell_line_name
+mut_ccl_names = str_replace(toupper(mut_ccl_names), "-", "")
+
+cnv_ccl_names = cnv$stripped_cell_line_name
+cnv_ccl_names = str_replace(toupper(cnv_ccl_names), "-", "")
+
+sum(exp_ccl_names %in% ccl_names) / length(unique(ccl_names))
+# sum(exp_ccl_names %in% pdb_ccl_names) / length(unique(pdb_ccl_names))
+
+
+sum(mut_ccl_names %in% ccl_names) / length(unique(ccl_names)) * length(unique(ccl_names))
+# sum(mut_ccl_names %in% pdb_ccl_names) / length(unique(pdb_ccl_names)) * length(unique(pdb_ccl_names))
+
+ctrp[ccl_name %in% mut_ccl_names[mut_ccl_names %in% ccl_names]]   ### 308K!!!!! Not 144K
+# ctrp[ccl_name %in% mut_ccl_names[mut_ccl_names %in% pdb_ccl_names]]   ### 302K!!!!! Not 144K
+
+sum(cnv_ccl_names %in% ccl_names) / length(unique(ccl_names))
+sum(exp_ccl_names %in% cnv_ccl_names) / length(unique(exp_ccl_names))
+sum(cnv_ccl_names %in% exp_ccl_names) / length(unique(cnv_ccl_names))
+
+dir.create(path = "Plots")
+dir.create(path = "Plots/DepMap")
+ggplot(data = line_info) +
+  geom_bar(mapping = aes(x = primary_disease), stat = "count") +
+  xlab("Primary Disease") +
+  ylab("# of cell lines") + 
+  ggtitle(label = "Proportion of Cancer Types in DepMap Data (overall)", subtitle = "20Q2 Version") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+ggsave(filename = "Plots/DepMap/DepMap_Cell_Lines_Proportion.pdf", device = "pdf")
+  
+prot[, 1:5]
+unique(prot$stripped_cell_line_name)
+
+mut$stripped_cell_line_name = str_replace(toupper(mut$stripped_cell_line_name), "-", "")
+cnv$stripped_cell_line_name = str_replace(toupper(cnv$stripped_cell_line_name), "-", "")
+exp$stripped_cell_line_name = str_replace(toupper(exp$stripped_cell_line_name), "-", "")
+prot$stripped_cell_line_name = str_replace(toupper(prot$stripped_cell_line_name), "-", "")
+
+mirna$stripped_cell_line_name = str_replace(toupper(mirna$stripped_cell_line_name), "-", "")
+metab$stripped_cell_line_name = str_replace(toupper(metab$stripped_cell_line_name), "-", "")
+hist$stripped_cell_line_name = str_replace(toupper(hist$stripped_cell_line_name), "-", "")
+rppa$stripped_cell_line_name = str_replace(toupper(rppa$stripped_cell_line_name), "-", "")
+
+ctrp$ccl_name = str_replace(toupper(ctrp$ccl_name), "-", "")
+
+mut_line_info <- line_info[stripped_cell_line_name %in% unique(mut$stripped_cell_line_name)]  
+cnv_line_info <- line_info[stripped_cell_line_name %in% unique(cnv$stripped_cell_line_name)]  
+exp_line_info <- line_info[stripped_cell_line_name %in% unique(exp$stripped_cell_line_name)]  
+prot_line_info <- line_info[stripped_cell_line_name %in% unique(prot$stripped_cell_line_name)]
+
+mirna_line_info <- line_info[stripped_cell_line_name %in% unique(mirna$stripped_cell_line_name)]
+metab_line_info <- line_info[stripped_cell_line_name %in% unique(metab$stripped_cell_line_name)]
+hist_line_info <- line_info[stripped_cell_line_name %in% unique(hist$stripped_cell_line_name)]
+rppa_line_info <- line_info[stripped_cell_line_name %in% unique(rppa$stripped_cell_line_name)]
+
+ctrp_line_info <- line_info[stripped_cell_line_name %in% unique(ctrp$ccl_name)]
+
+mut_line_info <- mut_line_info[, c("stripped_cell_line_name", "primary_disease")]
+mut_line_info$data_type <- "Mutational"
+
+cnv_line_info <- cnv_line_info[, c("stripped_cell_line_name", "primary_disease")]
+cnv_line_info$data_type <- "Copy Number"
+
+exp_line_info <- exp_line_info[, c("stripped_cell_line_name", "primary_disease")]
+exp_line_info$data_type <- "Gene Expression"
+
+prot_line_info <- prot_line_info[, c("stripped_cell_line_name", "primary_disease")]
+prot_line_info$data_type <- "Protein Quantification"
+
+mirna_line_info <- mirna_line_info[, c("stripped_cell_line_name", "primary_disease")]
+mirna_line_info$data_type <- "miRNA Expression"
+
+metab_line_info <- metab_line_info[, c("stripped_cell_line_name", "primary_disease")]
+metab_line_info$data_type <- "Metabolomics"
+
+hist_line_info <- hist_line_info[, c("stripped_cell_line_name", "primary_disease")]
+hist_line_info$data_type <- "Histone Profiling"
+
+rppa_line_info <- rppa_line_info[, c("stripped_cell_line_name", "primary_disease")]
+rppa_line_info$data_type <- "RPPA"
+
+ctrp_line_info <- ctrp_line_info[, c("stripped_cell_line_name", "primary_disease")]
+ctrp_line_info$data_type <- "Dose-Response"
+
+datatype_line_info <- rbindlist(list(mut_line_info, cnv_line_info, exp_line_info, prot_line_info,
+                                     mirna_line_info, metab_line_info, hist_line_info, rppa_line_info,
+                                     ctrp_line_info))
+datatype_line_info[, count_per_disease := .N, by = "primary_disease"]
+# .(count = .N, var = sum(VAR)), by = MNTH
+
+ggplot(data = datatype_line_info) +
+  geom_bar(mapping = aes(x = reorder(primary_disease, -count_per_disease),
+                         fill = data_type), stat = "count", position = "dodge") +
+  xlab("Primary Disease") +
+  ylab("Number of cell lines") +
+  labs(fill = "Data Type") +
+  # ggtitle(label = "Proportion of Cancer Types in DepMap Data", subtitle = "By data type, 21Q2 Version - Overlap with CTRPv2: 79%") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1),
+        text = element_text(face = "bold"),
+        legend.position = c(.9,.55))
+
+ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Proportion.pdf", device = "pdf")
+
+
+BiocManager::install("VennDiagram")
+require(VennDiagram)
+
+library(RColorBrewer)
+myCol <- brewer.pal(5, "Pastel2")
+
+# NOTE: The CTRPv2 here is from before ctrp was merged with cell line info to add primary disease!
+venn.diagram(x = list(mut_line_info$stripped_cell_line_name,
+                      cnv_line_info$stripped_cell_line_name,
+                      exp_line_info$stripped_cell_line_name,
+                      prot_line_info$stripped_cell_line_name,
+                      unique(ctrp$ccl_name)),
+             category.names = c("Mutational", "Copy Number", "Gene Expression", "Protein Quantification", "CTRPv2 Dose-Response"),
+             filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Venn.png",
+             imagetype = "png",
+             output = TRUE,
+             height = 3000 ,
+             width = 3000 ,
+             resolution = 600,
+             # Circles
+             lwd = 2,
+             # lty = 'blank',
+             fill = myCol,
+             # Numbers
+             cex = .6,
+             fontface = "bold",
+             fontfamily = "sans",
+             
+             # Set names
+             cat.cex = 0.6,
+             cat.fontface = "bold",
+             cat.default.pos = "outer",
+             cat.pos = c(0, 0, -130, 150, 0),
+             cat.dist = c(0.2, 0.2, 0.2, 0.2, 0.2),
+             cat.fontfamily = "sans",
+             # rotation = 1
+             
+)
+
+# ==== Bottleneck Cancer Cell Lines ====
+require(data.table)
+require(stringr)
+require(ggplot2)
+# line_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
+ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
+mut <- fread("Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv")
+cnv <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv")
+exp <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
+prot <- fread("Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv")
+mirna <- fread("Data/DRP_Training_Data/DepMap_2019_miRNA.csv")
+metab <- fread("Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv")
+hist <- fread("Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv")
+rppa <- fread("Data/DRP_Training_Data/DepMap_2019_RPPA.csv")
+
+bottleneck_keys <- Reduce(intersect, list(ctrp$ccl_name, mut$stripped_cell_line_name, cnv$stripped_cell_line_name,
+                                          exp$stripped_cell_line_name, prot$stripped_cell_line_name, mirna$stripped_cell_line_name,
+                                          metab$stripped_cell_line_name, hist$stripped_cell_line_name, rppa$stripped_cell_line_name)
+)
+
+bottleneck_keys <- data.table(keys = bottleneck_keys)
+fwrite(bottleneck_keys, "Data/DRP_Training_Data/bottleneck_keys.csv")
+
+bottleneck_keys <- Reduce(intersect, list(ctrp$ccl_name, mut$stripped_cell_line_name, cnv$stripped_cell_line_name,
+                                          exp$stripped_cell_line_name, prot$stripped_cell_line_name, mirna$stripped_cell_line_name,
+                                          metab$stripped_cell_line_name, hist$stripped_cell_line_name, rppa$stripped_cell_line_name)
+)
+
+bottleneck_datatype_line_info <- datatype_line_info[stripped_cell_line_name %in% bottleneck_keys]
+bottleneck_datatype_line_info[, count_per_disease := .N, by = "primary_disease"]
+
+ggplot(data = bottleneck_datatype_line_info) +
+  geom_bar(mapping = aes(x = reorder(primary_disease, -count_per_disease),
+                         fill = data_type), stat = "count", position = "dodge") +
+  xlab("Cancer Type") +
+  ylab("Number of Cell Lines") +
+  labs(fill = "Data Type") +
+  # ggtitle(label = "Proportion of Cancer Types in DepMap Data (Bottleneck Subset)", subtitle = "By data type, 21Q2 Version - Overlap with CTRPv2: 79%") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1),
+        legend.position = c(.9,.55))
+
+# ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Bottleneck_Proportion.pdf", device = "pdf")
+
+
+# Compare bottleneck to original data
+uniqueN(datatype_line_info$primary_disease)
+uniqueN(datatype_line_info$stripped_cell_line_name)
+uniqueN(bottleneck_datatype_line_info$primary_disease)
+uniqueN(bottleneck_datatype_line_info$stripped_cell_line_name)
+unique(bottleneck_datatype_line_info$stripped_cell_line_name)
+ctrp[ccl_name %in% unique(bottleneck_datatype_line_info$stripped_cell_line_name)]
+ctrp[ccl_name %in% unique(datatype_line_info$stripped_cell_line_name)] 
+nrow(ctrp[ccl_name %in% unique(bottleneck_datatype_line_info$stripped_cell_line_name)]) /
+nrow(ctrp[ccl_name %in% unique(datatype_line_info$stripped_cell_line_name)])   # 0.4038907
+
+# Show cell line counts in bottleneck and original data side by side
+bottleneck_datatype_line_info$`Data Subset` <- "Bottleneck"
+datatype_line_info$`Data Subset` <- "Original"
+both_data_type_line_info <- rbindlist(list(bottleneck_datatype_line_info, datatype_line_info))
+
+# Change bar order
+require(dplyr)
+bar_level_df <- data.frame(x1 = c("Original", "Bottleneck"))
+colnames(bar_level_df) <- "Data Subset"
+both_data_type_line_info <- left_join(bar_level_df,  
+                      both_data_type_line_info,
+                      by = "Data Subset")
+both_data_type_line_info <- as.data.table(both_data_type_line_info)
+
+ggplot(data = both_data_type_line_info) +
+  geom_bar(mapping = aes(x = reorder(primary_disease, -count_per_disease),
+                         y = count_per_disease,
+                         fill = `Data Subset`), stat = "identity", position = "dodge") +
+  xlab("Cancer Type") +
+  ylab("Number of Cell Line Profiles") +
+  labs(fill = "Data Subset Type") +
+  # scale_x_discrete(limits = c("Original", "Bottleneck")) +
+  
+  # ggtitle(label = "Proportion of Cancer Types in DepMap Data (Bottleneck Subset)", subtitle = "By data type, 21Q2 Version - Overlap with CTRPv2: 79%") +
+  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1, face = "bold"),
+        axis.text.y = element_text(size = 14, face = "bold"),
+        axis.title = element_text(size = 16, face = "bold"),
+        legend.title = element_text(size = 16),
+        legend.text = element_text(size=14),
+        legend.position = c(.9,.55))
+
+ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Bottleneck_Proportion_Comparison.pdf",
+       device = "pdf")
+
+# both_data_type_line_info <- merge(bottleneck_datatype_line_info, datatype_line_info, by)
+
+
+
+bottleneck_datatype_line_info
+# proteomics_datatype_line_info <- datatype_line_info[data_type == "Protein Quantification"]
+
+# ggplot(data = proteomics_datatype_line_info) +
+#   geom_bar(mapping = aes(x = reorder(primary_disease, -count_per_disease),
+#                          fill = data_type), stat = "count", position = "dodge") +
+#   xlab("Primary Disease") +
+#   ylab("# of cell lines") +
+#   labs(fill = "Data Type") +
+#   ggtitle(label = "Proportion of Cancer Types in DepMap Data (Bottleneck Subset)", subtitle = "By data type, 21Q2 Version - Overlap with CTRPv2: 79%") +
+#   theme(axis.text.x = element_text(angle = 45, hjust = 1),
+#         legend.position = c(.9,.55))
+
+# ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Bottleneck_Proportion.pdf", device = "pdf")
+