--- a
+++ b/R/Fig.1a_Upset_Plot.R
@@ -0,0 +1,281 @@
+# upset_plots.R
+
+# install.packages("UpSetR")
+# install.packages("ggupset")
+require(UpSetR)
+# require(ggupset)
+require(data.table)
+require(stringr)
+require(ggplot2)
+require(patchwork)
+
+line_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
+ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
+# gdsc1 <- fread("Data/DRP_Training_Data/GDSC1_AAC_SMILES.txt")
+gdsc2 <- fread("Data/DRP_Training_Data/GDSC2_AAC_SMILES.txt")
+exp <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
+mut <- fread("Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv")
+cnv <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv")
+prot <- fread("Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv")
+
+mirna <- fread("Data/DRP_Training_Data/DepMap_2019_miRNA.csv")
+hist <- fread("Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv")
+metab <- fread("Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv")
+rppa <- fread("Data/DRP_Training_Data/DepMap_2019_RPPA.csv")
+
+
+dim(rppa)
+dim(metab)
+dim(hist)
+dim(mirna)
+dim(prot)
+dim(exp)
+dim(cnv)
+dim(mut)
+uniqueN(gdsc2$ccl_name)
+uniqueN(ctrp$ccl_name)
+# ctrp$ccl_name = str_replace(toupper(ctrp$ccl_name), "-", "")
+# 
+# exp_ccl_names = exp$stripped_cell_line_name
+# exp_ccl_names = str_replace(toupper(exp_ccl_names), "-", "")
+# 
+# mut_ccl_names = mut$stripped_cell_line_name
+# mut_ccl_names = str_replace(toupper(mut_ccl_names), "-", "")
+# 
+# cnv_ccl_names = cnv$stripped_cell_line_name
+# cnv_ccl_names = str_replace(toupper(cnv_ccl_names), "-", "")
+mut$stripped_cell_line_name = str_replace(toupper(mut$stripped_cell_line_name), "-", "")
+cnv$stripped_cell_line_name = str_replace(toupper(cnv$stripped_cell_line_name), "-", "")
+exp$stripped_cell_line_name = str_replace(toupper(exp$stripped_cell_line_name), "-", "")
+prot$stripped_cell_line_name = str_replace(toupper(prot$stripped_cell_line_name), "-", "")
+
+mirna$stripped_cell_line_name = str_replace(toupper(mirna$stripped_cell_line_name), "-", "")
+hist$stripped_cell_line_name = str_replace(toupper(hist$stripped_cell_line_name), "-", "")
+metab$stripped_cell_line_name = str_replace(toupper(metab$stripped_cell_line_name), "-", "")
+rppa$stripped_cell_line_name = str_replace(toupper(rppa$stripped_cell_line_name), "-", "")
+
+ctrp$ccl_name = str_replace(toupper(ctrp$ccl_name), "-", "")
+gdsc2$ccl_name = str_replace(toupper(gdsc2$ccl_name), "-", "")
+
+mut_line_info <- line_info[stripped_cell_line_name %in% unique(mut$stripped_cell_line_name)]  
+cnv_line_info <- line_info[stripped_cell_line_name %in% unique(cnv$stripped_cell_line_name)]  
+exp_line_info <- line_info[stripped_cell_line_name %in% unique(exp$stripped_cell_line_name)]  
+prot_line_info <- line_info[stripped_cell_line_name %in% unique(prot$stripped_cell_line_name)]
+
+mirna_line_info <- line_info[stripped_cell_line_name %in% unique(mirna$stripped_cell_line_name)]  
+hist_line_info <- line_info[stripped_cell_line_name %in% unique(hist$stripped_cell_line_name)]  
+metab_line_info <- line_info[stripped_cell_line_name %in% unique(metab$stripped_cell_line_name)]  
+rppa_line_info <- line_info[stripped_cell_line_name %in% unique(rppa$stripped_cell_line_name)]
+
+ctrp_line_info <- line_info[stripped_cell_line_name %in% unique(ctrp$ccl_name)]
+gdsc2_line_info <- line_info[stripped_cell_line_name %in% unique(gdsc2$ccl_name)]
+
+mut_line_info <- mut_line_info[, c("stripped_cell_line_name", "primary_disease")]
+mut_line_info$data_type <- "Mutational"
+cnv_line_info <- cnv_line_info[, c("stripped_cell_line_name", "primary_disease")]
+cnv_line_info$data_type <- "Copy Number Variation"
+exp_line_info <- exp_line_info[, c("stripped_cell_line_name", "primary_disease")]
+exp_line_info$data_type <- "Gene Expression"
+prot_line_info <- prot_line_info[, c("stripped_cell_line_name", "primary_disease")]
+prot_line_info$data_type <- "Protein Quantification"
+
+mirna_line_info <- mirna_line_info[, c("stripped_cell_line_name", "primary_disease")]
+mirna_line_info$data_type <- "microRNA Expression"
+hist_line_info <- hist_line_info[, c("stripped_cell_line_name", "primary_disease")]
+hist_line_info$data_type <- "Histone Modification"
+metab_line_info <- metab_line_info[, c("stripped_cell_line_name", "primary_disease")]
+metab_line_info$data_type <- "Metabolomics"
+rppa_line_info <- rppa_line_info[, c("stripped_cell_line_name", "primary_disease")]
+rppa_line_info$data_type <- "Reverse-Phase Protein Array"
+
+ctrp_line_info <- ctrp_line_info[, c("stripped_cell_line_name", "primary_disease")]
+ctrp_line_info$data_type <- "Dose-Response"
+
+gdsc2_line_info <- gdsc2_line_info[, c("stripped_cell_line_name", "primary_disease")]
+gdsc2_line_info$data_type <- "Dose-Response"
+
+list_input <- list(
+  Mutational = mut_line_info$stripped_cell_line_name,
+  `Copy Number Variation` = cnv_line_info$stripped_cell_line_name,
+  `Gene Expression` = exp_line_info$stripped_cell_line_name,
+  `Protein Quantification` = prot_line_info$stripped_cell_line_name,
+  
+  `microRNA Expression` = mirna_line_info$stripped_cell_line_name,
+  `Histone Modification` = hist_line_info$stripped_cell_line_name,
+  `Metabolomics` = metab_line_info$stripped_cell_line_name,
+  `Reverse-Phase Protein Array` = rppa_line_info$stripped_cell_line_name,
+  
+  `CTRPv2 Dose-Response` = unique(ctrp$ccl_name),
+  `GDSC2 Dose-Response` = unique(gdsc2$ccl_name)
+)
+
+make_all_combinations <- function(set){
+  unlist(lapply(seq_along(set), function(size){
+    apply(combn(set, size), 2, paste0, collapse="-")
+  }))
+}
+
+
+?upset
+p <- upset(fromList(list_input),
+      sets = c("CTRPv2 Dose-Response", "GDSC2 Dose-Response",
+               "Mutational","Copy Number Variation", "Gene Expression", "Protein Quantification",
+               "microRNA Expression", "Histone Modification", "Metabolomics", "Reverse-Phase Protein Array"
+               ),
+      keep.order = T,
+      # sets = c("Dose-Response"),
+      mainbar.y.label = "Data Intersection Size",
+      # mainbar.y.max = 30,
+      sets.x.label = "Cell Lines per Data Type",
+      # group.by = "sets",
+      order.by = "freq",
+      scale.sets = "identity",
+      # set_size.angles = 45,
+      text.scale = c(1.3, 1.3, 1, 1, 1, 0.75))
+p
+
+pdf(file="Plots/Dataset_Exploration/UpSetR_Overlap_Plot_CTRPv2.pdf",
+    width = 10, height = 5
+    )
+p
+dev.off()
+
+# ggsave(filename = "Plots/Dataset_Exploration/UpSetR_Overlap_Plot_CTRPv2.pdf")
+
+# all_cells <- rbindlist(list(
+#   mut_line_info,
+#   cnv_line_info,
+#   exp_line_info,
+#   prot_line_info,
+#   mirna_line_info,
+#   hist_line_info,
+#   metab_line_info,
+#   rppa_line_info)
+# )
+# all_cells <- all_cells[, -2]
+# 
+# # install.packages("tidyverse")
+# require(tidyverse)
+# all_cells <- tidyr::as_tibble(all_cells[, 1:2])
+# simple_groups_df <- all_cells %>%
+#   group_by(stripped_cell_line_name) %>%
+#   summarize(groups = list(data_type))
+# 
+# extended_groups_df <- simple_groups_df %>%
+#   mutate(groups = lapply(groups, make_all_combinations)) %>%
+#   unnest()
+# 
+# unique(extended_groups_df)
+# ggplot(extended_groups_df, aes(x=groups)) +
+#   geom_bar() +
+#   axis_combmatrix(sep = "-", )
+# 
+# all_cells[, extended_groups := lapply(data_type, make_all_combinations)]
+# lapply(all_cells, make_all_combinations)
+
+# ==== Bimodal Intersections Counts ====
+require(ggplot2)
+require(data.table)
+require(flextable)
+require(magrittr)
+require(scales)
+require(officer)
+
+ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
+mut_line_info$data_type <- "MUT"
+cnv_line_info$data_type <- "CNV"
+exp_line_info$data_type <- "EXP"
+prot_line_info$data_type <- "PROT"
+
+mirna_line_info$data_type <- "MIRNA"
+hist_line_info$data_type <- "HIST"
+metab_line_info$data_type <- "METAB"
+rppa_line_info$data_type <- "RPPA"
+
+ctrp_line_info$data_type <- "CTRP"
+
+all_cells <- rbindlist(list(mut_line_info, cnv_line_info, exp_line_info, prot_line_info,
+                            mirna_line_info, metab_line_info, hist_line_info, rppa_line_info))
+all_cells <- unique(all_cells)
+
+ctrp_cells <- unique(ctrp_line_info$stripped_cell_line_name)
+all_omics <- data.table(
+  `Data Type(s)` = c("Mutational","Copy Number", "Gene Expression", "Protein Quantification",
+                     "microRNA Expression", "Metabolomics", "Histone Modification", "RPPA"),
+  `Abbreviation` = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"),
+                        `Number of Samples` = vector(mode = "integer", length = 8))
+
+for (i in 1:nrow(all_omics)) {
+  first_cells <- all_cells[data_type == all_omics[i, 2]]$stripped_cell_line_name
+  # second_cells <- all_cells[data_type == all_omics[i, 2]]$stripped_cell_line_name
+  # cell_overlap <- Reduce(intersect, list(first_cells, second_cells, ctrp_cells))
+  ctrp_overlap <- uniqueN(ctrp[ccl_name %in% first_cells])
+  all_omics[i, 3] <- ctrp_overlap
+}
+
+set_flextable_defaults(
+  font.size = 10, theme_fun = theme_vanilla,
+  padding = 6,
+  background.color = "#EFEFEF")
+
+colourer <- col_numeric(
+  palette = c("red", "white"),
+  domain = c(min(all_omics$`Number of Samples`), max(all_omics$`Number of Samples`)))
+
+
+# ==== bimodal 
+ft <- flextable(all_omics)
+final_ft <- ft %>%
+  merge_v(j = c("Data Type(s)", "Number of Samples")) %>%
+  border_inner(border = fp_border(color="gray", width = 1)) %>%
+  border_outer(part="all", border = fp_border(color="gray", width = 2)) %>%
+  align(align = "center", j = c(2, 3), part = "header") %>%
+
+  bg(
+    bg = colourer,
+    j = "Number of Samples", 
+    part = "body")
+  
+
+final_ft <- autofit(final_ft)
+read_docx() %>% 
+  body_add_flextable(value = final_ft) %>% 
+  print(target = "Plots/Dataset_Exploration/bimodal_samples_per_data_type_combo.docx")
+
+# ==== trimodal
+all_tri_omic_combos_el <- utils::combn(c("MUT", 'CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA'), 2, simplify = T)
+all_tri_omic_combos_el <- t(all_tri_omic_combos_el)
+all_tri_omic_combos_el <- as.data.table(all_tri_omic_combos_el)
+
+# all_sample_counts <- vector(mode = "numeric", length = nrow(temp))
+ctrp_cells <- unique(ctrp_line_info$stripped_cell_line_name)
+all_tri_omic_combos_el$sample_counts <- vector(mode = "integer")
+for (i in 1:nrow(all_tri_omic_combos_el)) {
+  first_cells <- all_cells[data_type == all_tri_omic_combos_el[i, 1]]$stripped_cell_line_name
+  second_cells <- all_cells[data_type == all_tri_omic_combos_el[i, 2]]$stripped_cell_line_name
+  cell_overlap <- Reduce(intersect, list(first_cells, second_cells, ctrp_cells))
+  ctrp_overlap <- uniqueN(ctrp[ccl_name %in% cell_overlap])
+  all_tri_omic_combos_el[i, 3] <- ctrp_overlap
+}
+colnames(all_tri_omic_combos_el) <- c("Data Type 1", "Data Type 2", "Number of Samples")
+
+colourer <- col_numeric(
+  palette = c("red", "white"),
+  domain = c(min(all_tri_omic_combos_el$`Number of Samples`), max(all_tri_omic_combos_el$`Number of Samples`)))
+
+ft <- flextable(all_tri_omic_combos_el)
+final_ft <- ft %>%
+  merge_v(j = c("Data Type 1", "Data Type 2", "Number of Samples")) %>%
+  border_inner(border = fp_border(color="gray", width = 1)) %>%
+  border_outer(part="all", border = fp_border(color="gray", width = 2)) %>%
+  align(align = "center", j = 1:3, part = "all") %>%
+  bg(
+    bg = colourer,
+    j = "Number of Samples", 
+    part = "body")
+
+
+final_ft <- autofit(final_ft)
+read_docx() %>% 
+  body_add_flextable(value = final_ft) %>% 
+  print(target = "Plots/Dataset_Exploration/trimodal_samples_per_data_type_combo.docx")