MMDRP / Git / [c3b4f8] /R/Samplewise_Omicwise_Performance

Models:
DanielG/
MMDRP
Downloads: 1
[c3b4f8]: / R / Samplewise_Omicwise_Performance_Comparison.R
History
Download this file
2199 lines (1786 with data), 108.0 kB

# Samplewise_Omicwise_Performance_Comparison.R
require(data.table)
setDTthreads(8)
require(ggplot2)
require(gt)
require(flextable)
require(magrittr)
require(scales)
require(officer)

# Squared Pearson correlation between two numeric vectors.
rsq <- function(x, y) {
  pearson_r <- cor(x, y, method = "pearson")
  pearson_r ^ 2
}

# Root mean squared error between two numeric vectors.
rmse <- function(x, y) {
  squared_error <- (x - y)^2
  sqrt(mean(squared_error))
}

# Mean absolute error between two numeric vectors.
mae <- function(x, y) {
  absolute_error <- abs(x - y)
  mean(absolute_error)
}

# Setup ====
# Goal: within a specific lineage (e.g. lung), compare predictions on samples treated
# with targeted therapies when different omic data types are used.

# Names of the targeted therapies considered throughout this script.
targeted_drugs <- c(
  "Idelalisib", "Olaparib", "Venetoclax", "Crizotinib", "Regorafenib",
  "Tretinoin", "Bortezomib", "Cabozantinib", "Dasatinib", "Erlotinib",
  "Sonidegib", "Vandetanib", "Axitinib", "Ibrutinib", "Gefitinib",
  "Nilotinib", "Tamoxifen", "Bosutinib", "Pazopanib", "Lapatinib",
  "Dabrafenib", "Bexarotene", "Temsirolimus", "Belinostat",
  "Sunitinib", "Vorinostat", "Trametinib", "Fulvestrant", "Sorafenib",
  "Vemurafenib", "Alpelisib"
)

# Named list mapping a disease label to the targeted drugs assigned to it.
# Every entry of targeted_drugs appears under exactly one disease.
drug_by_cancer <- list(
  "Leukemia" = c("Idelalisib", "Venetoclax", "Tretinoin", "Dasatinib",
                 "Ibrutinib", "Nilotinib", "Bosutinib"),
  "Lymphoma" = c("Bortezomib", "Bexarotene", "Belinostat", "Vorinostat"),
  "Sarcoma" = c("Pazopanib"),
  "Breast Cancer" = c("Olaparib", "Tamoxifen", "Lapatinib", "Fulvestrant", "Alpelisib"),
  "Lung Cancer" = c("Crizotinib", "Erlotinib", "Gefitinib", "Dabrafenib"),
  "Colon/Colorectal Cancer" = c("Regorafenib"),
  "Thyroid Cancer" = c("Cabozantinib", "Vandetanib"),
  "Skin Cancer" = c("Sonidegib", "Trametinib", "Vemurafenib"),
  "Kidney Cancer" = c("Axitinib", "Temsirolimus", "Sorafenib"),
  "Pancreatic Cancer" = c("Sunitinib")
)


require(plyr)
# Flatten drug_by_cancer into a long lookup table: ldply() row-binds the per-disease
# drug vectors into a padded wide frame (disease label in ".id"), melt() stacks the
# drug columns, and na.omit() drops the padding rows.
cancer_by_drug <- drug_by_cancer %>%
  plyr::ldply(rbind) %>%
  as.data.table() %>%
  melt(id.vars = ".id") %>%
  na.omit("value")
# Keep the disease label (column 1) and the drug name (column 3); column 2 is the
# melt "variable" index, which is not needed.
cancer_by_drug <- cancer_by_drug[, c(1, 3)]
colnames(cancer_by_drug) <- c("assigned_disease", "cpd_name")

# Input tables used by the sections below.
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
cell_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
drug_info <- fread("Data/DRP_Training_Data/CTRP_DRUG_INFO.csv")

# Read Data ====

# Extract the model configuration encoded in the path of a CV validation result file.
#
# cv_file: a single file path (character scalar).
# Returns a named list of character scalars: data_types, merge_method, loss_type,
# drug_type, split_method and fold. For "Baseline_ElasticNet" paths only data_types
# and fold are parsed; the remaining fields are fixed labels.
# Note: gsub() returns its input unchanged when a pattern does not match, so a path
# that lacks a token yields the full path in that field rather than NA.
parse_cv_file_metadata <- function(cv_file) {
  if (!grepl(".*Baseline_ElasticNet.*", cv_file)) {
    meta <- list(
      data_types = toupper(gsub(".+ResponseOnly_\\w*drug_(.+)_HyperOpt.+", "\\1", cv_file)),
      merge_method = gsub(".+MergeBy(\\w+)_.*RMSE.+", "\\1", cv_file),
      loss_type = gsub(".+_(.*)RMSE.+", "\\1RMSE", cv_file),
      drug_type = toupper(gsub(".+ResponseOnly_(\\w*)drug.+_HyperOpt.+", "\\1drug", cv_file)),
      split_method = gsub(".+Split_(\\w+)_NoBottleNeck.+", "\\1", cv_file)
    )
  } else {
    meta <- list(
      data_types = toupper(gsub(".+ResponseOnly_drug_(\\w+)_Baseline_.+", "\\1", cv_file)),
      merge_method = "Merge By Early Concat",
      loss_type = "UnBase Model + LDS",
      drug_type = "1024-bit ECFP",
      split_method = "Split By Both"
    )
  }
  # The fold index is a single digit following "CV_Index_".
  meta$fold <- gsub(".+CV_Index_(\\d)_.+", "\\1", cv_file)
  meta
}

# Select per fold validation files
all_cv_files <- list.files("Data/CV_Results/", recursive = TRUE,
                           pattern = ".*final_validation.*", full.names = TRUE)
# ".+drug_.{3,5}_HyperOpt.+"
bimodal_cv_files <- grep(pattern = ".ResponseOnly_.*drug_\\w{3,11}_HyperOpt.+", all_cv_files, value = TRUE)
# cur_cv_files <- grep(pattern = ".ResponseOnly_.*drug_\\w{3,5}_.+", cur_cv_files, value = T)
# cur_cv_files <- grep(pattern = ".ResponseOnly_+drug_exp_HyperOpt.+", cur_cv_files, value = T)
# cur_cv_files_2 <- grep(pattern = ".Baseline_ElasticNet.+", all_cv_files, value = T)
# final_cv_files <- c(bimodal_cv_files, cur_cv_files_2)
final_cv_files <- bimodal_cv_files
# cur_cv_files <- grep(pattern = ".+drug_.{6,11}_HyperOpt.+", cur_cv_files, value = T)
length(final_cv_files)
# sum(grepl(".*ElasticNet.*", final_cv_files))

# Read all data: one table per file, tagged with the configuration parsed from its path.
all_results <- vector(mode = "list", length = length(final_cv_files))
gc()
# seq_along() (not 1:length()) so that an empty file list skips the loop instead of
# iterating over c(1, 0) and calling fread(NA).
for (i in seq_along(final_cv_files)) {
  cur_res <- fread(final_cv_files[i])
  cur_meta <- parse_cv_file_metadata(final_cv_files[i])
  # Add the metadata as constant columns (length-1 values are recycled over all rows).
  cur_res[, (names(cur_meta)) := cur_meta]
  all_results[[i]] <- cur_res
}
# Combine the per-file tables; fill = TRUE pads columns that are missing in some files.
all_results <- rbindlist(all_results, fill = TRUE)

# Per-sample loss column. Despite its name, RMSELoss holds the absolute error
# |target - predicted|, except for rows that already carry an rmse_loss value
# (used only when early-concat baseline rows are present), which is kept as is.
# The expressions are vectorised, so no row-wise grouping is needed.
if (any(all_results$merge_method == "Merge By Early Concat")) {
  all_results[is.na(rmse_loss), RMSELoss := abs(target - predicted)]
  all_results[!is.na(rmse_loss), RMSELoss := rmse_loss]
  all_results$rmse_loss <- NULL
} else {
  all_results[, RMSELoss := abs(target - predicted)]
}

# all_results[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold")]
all_results$V1 <- NULL

# Recode the tokens parsed from file names into display labels.
# `:=` updates the matching rows by reference; the `DT[cond]$col <- value` form copies
# the whole table for every statement and is fragile when no row matches.
all_results[drug_type == "DRUG", drug_type := "1024-bit ECFP"]
all_results[drug_type == "GNNDRUG", drug_type := "Base Model + GNN"]

all_results[split_method == "BOTH", split_method := "Split By Both"]
all_results[split_method == "DRUG", split_method := "Split By Drug Scaffold"]
all_results[split_method == "CELL_LINE", split_method := "Split By Cell Line"]

all_results[merge_method == "Concat", merge_method := "Merge By Concat"]
# all_results[merge_method == "MergeByEarlyConcat"]$merge_method <- "Merge By Early Concat"
all_results[merge_method == "LMF", merge_method := "Base Model + LMF"]
all_results[merge_method == "Sum", merge_method := "Merge By Sum"]

all_results[loss_type == "RMSE", loss_type := "UnBase Model + LDS"]
all_results[loss_type == "WeightedRMSE", loss_type := "Base Model + LDS"]

# Flag samples by drug class and by whether the observed response is at least 0.7.
all_results[, Targeted := ifelse(cpd_name %in% targeted_drugs, "Targeted Drug", "Untargeted Drug")]

all_results[, TargetRange := ifelse(target >= 0.7, "Target Above 0.7", "Target Below 0.7")]

# Persist the combined table; later sections reload it from this path.
fwrite(all_results, "Data/all_results.csv")


# Compare samples predicted using different omic data types ====
# Builds a wide table with one row per drug x cell line x split method and one
# prediction / loss column per omic data-type combination, restricted to results
# labelled "Base Model + LMF" / "Base Model + LDS" / "Base Model + GNN".
# dir.create() emits a warning (not an error) if the directory already exists.
dir.create("Data/InferenceResults")
all_results <- fread("Data/all_results.csv")
trifecta_results <- unique(all_results[merge_method == "Base Model + LMF" &
              loss_type == "Base Model + LDS" &
              drug_type == "Base Model + GNN",
              c("cpd_name", "cell_name", "target", "predicted", "RMSELoss", "data_types", "split_method")])
# NOTE(review): this de-duplication ignores split_method, so a drug x cell line x data type
# that was evaluated under several split methods keeps only its first row, even though
# the dcast below still spreads by split_method. Confirm this is intended.
trifecta_results <- unique(trifecta_results, by = c("cpd_name", "cell_name", "data_types"))
# Columns 4 and 5 are "predicted" and "RMSELoss" in the selection above.
colnames(trifecta_results)[4] <- "pred"
colnames(trifecta_results)[5] <- "RMSE"

# One "pred_<data types>" and one "RMSE_<data types>" column per data-type combination;
# combinations without a result for a row are filled with NA.
trifecta_results_wide <-
  dcast(data = trifecta_results, formula = cpd_name + cell_name + split_method + target ~ data_types,
        value.var = c("pred", "RMSE"), fill = NA)
# Save
# fwrite(trifecta_results_wide, "Data/InferenceResults/Trifecta_Results_Wide_SplitByEach.csv")

# NOTE(review): the table computed above is replaced by a file on disk whose write is
# commented out, so this step depends on a previously saved copy existing.
trifecta_results_wide <- fread("Data/InferenceResults/Trifecta_Results_Wide_SplitByEach.csv")
# Interactive inspection: rows with an observed response above 0.7.
trifecta_results_wide[target > 0.7]

# Are there data types that are beneficial for specific cell lines, lineages or drug types?
# Best performing drug per cell line and per lineage

# Add cell line lineage data
# NOTE(review): cell_info columns are selected by position (2:5); the code below assumes they
# include stripped_cell_line_name, primary_disease, lineage and lineage_subtype.
trifecta_results_wide <- merge(trifecta_results_wide, cell_info[, 2:5], by.x = "cell_name", by.y = "stripped_cell_line_name")
setcolorder(trifecta_results_wide, c("cell_name", "cpd_name", "primary_disease", "lineage", "lineage_subtype", "split_method", "target"))

# Get minimum RMSE across all omic data types and combos for each drug x cell line x split method
# (.SDcols picks every column whose name matches "RMSE"; NA entries are ignored).
trifecta_results_wide[, lowest_rmse := min(.SD, na.rm = T),
                      by = c("cell_name", "cpd_name", "split_method"), .SDcols = patterns("RMSE")]

# Get highest AAC for each drug x cell line
trifecta_results_wide[, highest_aac := max(target),
                      by = c("cell_name", "cpd_name")]

# Top cell lines for each drug
# trifecta_results_wide[, c("top_cell_1", "top_cell_2", "top_cell_3") := tail(.SD, 3),
#                       .SDcols = c("cell_name", names(trifecta_results_wide) %like% "RMSE"), by = c("cpd_name", "split_method")]

# Interactive inspection: last three rows per targeted drug and split method among
# samples with target > 0.7. No sort by lowest_rmse is applied before tail(), so the
# rows shown follow the current row order of the table.
trifecta_results_wide[target > 0.7 & cpd_name %in% targeted_drugs, tail(.SD, 3),
                      .SDcols = c("cell_name", "lowest_rmse"), by = c("cpd_name", "split_method")]

# Top cell lines for each drug
# Produces a Word table listing, for every targeted drug, up to ten cell lines with
# AAC >= 0.5 plus up to ten cell lines whose primary disease equals the drug's assigned disease.
# Sort by descending AAC so that head() within a drug returns its highest-AAC rows.
# NOTE(review): this relies on setkey() and merge() keeping the within-drug row order — confirm.
setorder(ctrp, -area_above_curve)
setkey(ctrp, "cpd_name")
# Inner join: only drugs present in cancer_by_drug are kept, and assigned_disease is added.
ctrp <- merge(ctrp, cancer_by_drug, by = "cpd_name")
top_cells_per_drug_min_aac <- ctrp[area_above_curve >= 0.5 & cpd_name %in% targeted_drugs, head(.SD, 10), by = "cpd_name"][, c("cpd_name", "assigned_disease", "ccl_name", "primary_disease", "area_above_curve")]
top_cells_per_drug_matching_disease <- ctrp[(assigned_disease == primary_disease) & cpd_name %in% targeted_drugs, head(.SD, 10), by = "cpd_name"][, c("cpd_name", "assigned_disease", "ccl_name", "primary_disease", "area_above_curve")]
# max(ctrp[cpd_name == "Bosutinib"]$area_above_curve)
# Union of both selections with exact duplicate rows removed.
top_cells_per_drug <- unique(rbindlist(list(top_cells_per_drug_min_aac, top_cells_per_drug_matching_disease)))
# top_cells_per_drug <- merge(top_cells_per_drug, cancer_by_drug, by = "cpd_name")
setcolorder(top_cells_per_drug, c("cpd_name", "assigned_disease"))

# Global flextable styling applied to every table created afterwards.
set_flextable_defaults(
  font.size = 10, theme_fun = theme_vanilla,
  padding = 6,
  background.color = "#EFEFEF")

# Colour ramp from transparent to red over [0, 1]; not referenced again in the
# visible part of this script.
colourer <- col_numeric(
  palette = c("transparent", "red"),
  domain = c(0, 1))

top_cells_per_drug[, area_above_curve := round(area_above_curve, 3)]
# Display names are assigned by position and must follow the column order set above.
colnames(top_cells_per_drug) <- c("Prescribed Drug(s)", "Cancer", "Cell Line", "Cell Line Primary Disease", "AAC")
setcolorder(top_cells_per_drug, c("Cancer", "Prescribed Drug(s)"))
# Keying sorts rows by cancer and drug so that merge_v() can merge repeated cells.
setkey(top_cells_per_drug, Cancer, `Prescribed Drug(s)`)
ft <- flextable(top_cells_per_drug)
final_ft <- ft %>%
  merge_v(j = c("Cancer", "Prescribed Drug(s)", "Cell Line Primary Disease")) %>%
  border_inner(border = fp_border(color="gray", width = 1)) %>%
  border_outer(part="all", border = fp_border(color="gray", width = 2))

final_ft <- autofit(final_ft)
# Write the table into a new Word document.
read_docx() %>% 
  body_add_flextable(value = final_ft) %>% 
  print(target = "Plots/Dataset_Exploration/drug_by_cell_line_vs_target_disease_table.docx")


# Are there cancer types where the targeted drug is not prescribed for, but the model was able to predict
# a true high AAC?
# i.e. can the model give faithful drug repurposing recommendations?
# Can you show that previous repurposing recommendations were not possible since everyone was using EXP only?
#
# This section joins the top cell lines per drug with the per-data-type predictions and
# writes a Word table with a two-level header (split method over data type).


# Inner join on drug and cell line.
targeted_results <- merge(top_cells_per_drug, trifecta_results_wide,
      by.x = c("Prescribed Drug(s)", "Cell Line"), by.y = c("cpd_name", "cell_name"))
# Drop the per-data-type loss columns. %like% is case-sensitive, so "lowest_rmse" is kept.
targeted_results <- targeted_results[, !names(targeted_results) %like% "RMSE", with = F]
targeted_results$primary_disease <- NULL
# targeted_results$lowest_rmse <- NULL
targeted_results$highest_aac <- NULL
targeted_results$target <- NULL

pred_cols <- names(targeted_results)[names(targeted_results) %like% "pred"]
# Interactive check of which columns will be melted (9th column onwards).
colnames(targeted_results)[9:ncol(targeted_results)]

# NOTE(review): the identifier columns are taken by position (first 8). This assumes the merge
# output order is the two key columns, the remaining top_cells_per_drug columns, then
# lineage, lineage_subtype and split_method — confirm if upstream columns change.
melt_targeted_results <- melt(targeted_results, id.vars = colnames(targeted_results)[1:8])
# melt_targeted_results[, `Max AAC` := max(AAC), by = c("Prescribed Drug(s)", "Cell Line", "split_method")]
melt_targeted_results[, value := round(value, 3)]
# melt_targeted_results[, `Max AAC` := round(`Max AAC`, 3)]
# Spread back to wide: one "<split method>_<variable>" column per combination.
wide_targeted_results <- dcast(melt_targeted_results, ... ~ split_method + variable)

# df_header <- as.data.table(expand.grid(unique(targeted_results$split_method), pred_cols), stringsAsFactors = FALSE)
# df_header[, header_id := paste(Var1, Var2, sep = "_")]
# setcolorder(df_header, "header_id")
# df_header <- rbindlist(list(no_header, df_header), use.names = T)
# setcolorder(df_header, c("header_id", "Var1", "Var2"))
# df_header <- as.data.frame(df_header, stringsAsFactors = F)


# Collect the prediction columns of each split method, excluding its lowest_rmse column.
split_by_both_cols <- names(wide_targeted_results)[names(wide_targeted_results) %like% "Split By Both"]
split_by_both_cols <- split_by_both_cols[split_by_both_cols != "Split By Both_lowest_rmse"]
split_by_cell_cols <- names(wide_targeted_results)[names(wide_targeted_results) %like% "Split By Cell Line"]
split_by_cell_cols <- split_by_cell_cols[split_by_cell_cols != "Split By Cell Line_lowest_rmse"]
split_by_drug_cols <- names(wide_targeted_results)[names(wide_targeted_results) %like% "Split By Drug Scaffold"]
split_by_drug_cols <- split_by_drug_cols[split_by_drug_cols != "Split By Drug Scaffold_lowest_rmse"]

# wide_targeted_results$`Split By Both_highest_aac`
# wide_targeted_results$`Split By Drug Scaffold_highest_aac`
# wide_targeted_results$`Split By Cell Line_highest_aac`


# Identifier columns first, then the three lowest_rmse columns, then predictions grouped
# by split method. setcolorder() fails if any listed column is missing, so all three
# split methods must be present in the data.
setcolorder(wide_targeted_results, c("Prescribed Drug(s)", "Cancer", "Cell Line",
                                     "Cell Line Primary Disease", "lineage", "lineage_subtype", "AAC",
                                     "Split By Both_lowest_rmse", "Split By Drug Scaffold_lowest_rmse",
                                     "Split By Cell Line_lowest_rmse",
                                     split_by_both_cols, split_by_cell_cols, split_by_drug_cols))

# Columns 5 and 6 are "lineage" and "lineage_subtype" after the reorder above.
colnames(wide_targeted_results)[5] <- "Lineage"
colnames(wide_targeted_results)[6] <- "Lineage Subtype"
wide_targeted_results$Lineage <- tools::toTitleCase(gsub("_", " ", wide_targeted_results$Lineage))
wide_targeted_results$`Lineage Subtype` <- tools::toTitleCase(gsub("_", " ", wide_targeted_results$`Lineage Subtype`))

# Header mapping for flextable: the first 10 columns use the same text on both header
# rows; prediction columns get the split method on row 1 and the data type on row 2.
# stringsAsFactors = False is required for flextables to work normally 
no_header <- data.frame(header_id = colnames(wide_targeted_results)[1:10],
                        Var1 = colnames(wide_targeted_results)[1:10],
                        Var2 = colnames(wide_targeted_results)[1:10], stringsAsFactors = FALSE)
temp <- expand.grid(unique(targeted_results$split_method), pred_cols)
df_header <- data.frame('header_id' = paste(temp$Var1, temp$Var2, sep = "_"),
                        'Var1' = temp$Var1,
                        'Var2' = temp$Var2,
                        stringsAsFactors = F)
# target_order <- c("Split By Both", "Split By Cell Line", "Split By Drug Scaffold")
# df_header[match(target_order, df_header$Var1), ]
df_header <- df_header[with(df_header, order(Var1)), ]
df_header <- rbind(no_header, df_header)

# Sanity check, printed only (not enforced): header ids should match the column names in order.
all(df_header$header_id == colnames(wide_targeted_results))
# final_ft <- ft %>%
#   merge_v(j = c("Cancer", "Prescribed Drug(s)", "Cell Line Primary Disease")) %>%
#   border_inner(border = fp_border(color="gray", width = 1)) %>%
#   border_outer(part="all", border = fp_border(color="gray", width = 2))
# 
# final_ft <- autofit(final_ft)

# NOTE(review): this reorders top_cells_per_drug, which is already in this order; it may have
# been meant for wide_targeted_results — confirm.
setcolorder(top_cells_per_drug, c("Cancer", "Prescribed Drug(s)"))
setkey(wide_targeted_results, Cancer, `Prescribed Drug(s)`)


# temp <- as.data.frame(wide_targeted_results, stringsAsFactors = F)
ft <- flextable(wide_targeted_results, col_keys = df_header$header_id)
final_ft <- set_header_df(ft, mapping = df_header, key = "header_id") %>%
  merge_v(part = "header") %>%
  merge_v(part = "body") %>%
  merge_h(part = "header", i = 1) %>%
  # theme_booktabs(bold_header = TRUE) %>% 
  align(align = "center", part = "all") %>%
  border_inner(border = fp_border(color="gray", width = 1)) %>%
  border_outer(part="all", border = fp_border(color="gray", width = 2))

  
final_ft <- autofit(final_ft)
# Write the table into a new Word document.
read_docx() %>% 
  body_add_flextable(value = final_ft) %>% 
  print(target = "Plots/Dataset_Exploration/drug_by_cell_line_vs_target_disease_by_all_omic_combos_table.docx")


# Summarize Repurposable Drugs ====
# Self-contained section: reloads packages and inputs so it can be run on its own.
# It still uses cancer_by_drug and targeted_drugs from the Setup section.
require(data.table)
setDTthreads(8)
require(ggplot2)
require(gt)
require(flextable)
require(magrittr)
require(scales)
require(officer)
drug_info <- fread("Data/DRP_Training_Data/CTRP_DRUG_INFO.csv")
cell_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")

# Must summarize to e.g. top 3 omic data types with lowest RMSEs, and write their splitting method,
# predicted AACs and RMSEs in the same cell, preferably (per each cell line and drug combo)
all_results <- fread("Data/all_results.csv")
trifecta_results <- unique(all_results[merge_method == "Base Model + LMF" &
                                         loss_type == "Base Model + LDS" &
                                         drug_type == "Base Model + GNN",
                                       c("cpd_name", "cell_name", "target", "predicted", "RMSELoss",
                                         "data_types", "split_method")])
# Free the full results table; only the filtered subset is needed below.
rm(all_results)
gc()

# Only results from the cell-line split are analysed in this section.
trifecta_results <- trifecta_results[split_method == "Split By Cell Line"]
# trifecta_results <- unique(trifecta_results, by = c("cpd_name", "cell_name", "data_types"))
# Columns 4 and 5 are "predicted" and "RMSELoss" in the selection above.
colnames(trifecta_results)[4] <- "pred"
colnames(trifecta_results)[5] <- "RMSE"

trifecta_results[, target := round(target, 3)]
trifecta_results[, pred := round(pred, 3)]
trifecta_results[, RMSE := round(RMSE, 3)]

# Add cell line annotations (cell_info columns selected by position) and, for drugs
# listed in cancer_by_drug, the assigned disease (left join: other drugs get NA).
trifecta_results <- merge(trifecta_results, cell_info[, 2:5], by.x = "cell_name", by.y = "stripped_cell_line_name")
trifecta_results <- merge(trifecta_results, cancer_by_drug, by = "cpd_name", all.x = T)

setcolorder(trifecta_results, c("cpd_name", "assigned_disease", "cell_name", "primary_disease", "lineage", "lineage_subtype", "split_method", "target"))

targeted_results <- trifecta_results[cpd_name %in% targeted_drugs]
# Order by RMSE
# NOTE(review): setkey() re-sorts by (cpd_name, cell_name); taking the lowest-RMSE rows with
# head() below relies on that sort keeping the RMSE order within each group — confirm.
setorder(targeted_results, RMSE)
setkey(targeted_results, cpd_name, cell_name)

# Subset by lowest RMSEs
# (up to five rows per drug x cell line)
targeted_subset <- targeted_results[, head(.SD, 5), by = c("cpd_name", "cell_name")]
# targeted_subset <- targeted_subset[(target >= 0.5) | (assigned_disease == primary_disease)]
# It would be interesting to add inter-dataset concordance as a column to this data,
# although I'm not sure how relevant it would be


# Put top results in a single cell of a table 
# cell_content: one multi-line string per model; full_cell: all of a drug x cell line's
# strings joined with newlines.
targeted_subset[, cell_content := paste0(split_method, "\n", data_types, "\n", target, "\n", pred, "\n", RMSE)]
targeted_subset[, full_cell := paste(.SD, collapse = "\n"), by = c("cpd_name", "cell_name"), .SDcols = "cell_content"]

# One row per drug x cell line with the combined text; not referenced again in the
# visible part of this script.
targeted_sub_sub <- unique(targeted_subset[, c("cpd_name", "assigned_disease", "cell_name", "primary_disease",
                                               "lineage", "lineage_subtype", "full_cell")])

# Remove the helper text columns again.
targeted_subset$cell_content <- NULL
targeted_subset$full_cell <- NULL
# Are there cancer types where the targeted drug is not prescribed for, but the model was able to predict
# a true high AAC?
# i.e. can the model give faithful drug repurposing recommendations?
# Can you show that previous repurposing recommendations were not possible since everyone was using EXP only?

# Find each drug's highest AAC on its assigned disease, i.e. among rows whose cell line
# primary disease equals the drug's assigned disease.
# When a drug has no such row, max() over the empty selection would return -Inf (with a
# warning), which makes every later `target >= highest_drug_match_disease_aac` comparison
# TRUE and flags all of that drug's cell lines as better than the assigned disease.
# NA is stored instead, so those rows drop out of the comparisons below; counts recorded
# in nearby comments may therefore differ from earlier runs.
targeted_subset[, highest_drug_match_disease_aac := {
  # which() ignores NA comparisons (e.g. a missing primary_disease).
  matched_aac <- target[which(assigned_disease == primary_disease)]
  if (length(matched_aac) > 0) max(matched_aac) else NA_real_
}, by = "cpd_name"]

# targeted_subset[, avg_drug_match_disease_aac := mean(.SD[assigned_disease == primary_disease]$target),
#                 by = "cpd_name"]
# NOTE: Have the option of choosing AAC larger by a certain amount
# Exploratory counts of drug x cell line pairs (and drugs) whose observed AAC is at least
# the drug's best AAC on its assigned disease, with margins of 0, 0.1 and 0.2.
# The trailing numbers are results recorded by the author from a previous run.
better_than_assigned <- targeted_subset[target >= highest_drug_match_disease_aac]
uniqueN(better_than_assigned[, c("cpd_name", "cell_name")])  # 600 drug and cell line combinations
uniqueN(better_than_assigned[, c("cpd_name")])  # 31 repurposable drugs
better_than_assigned <- targeted_subset[target >= highest_drug_match_disease_aac + 0.1]
uniqueN(better_than_assigned[, c("cpd_name", "cell_name")])  # 182
uniqueN(better_than_assigned[, c("cpd_name")])  # 17
better_than_assigned <- targeted_subset[target >= highest_drug_match_disease_aac + 0.2]
uniqueN(better_than_assigned[, c("cpd_name", "cell_name")])  # 81
uniqueN(better_than_assigned[, c("cpd_name")])  # 14
unique(better_than_assigned[, c("cpd_name")])  # 14


# No minimum difference -> 2991 rows
# Minimum difference of 0.1 -> 910 rows
# Minimum difference of 0.2 -> 405 rows

# Reset to the no-margin selection used for the saved tables below.
better_than_assigned <- targeted_subset[target >= highest_drug_match_disease_aac]
# Repurposable drugs that have x amount higher AAC in unassigned cancers, and at least one
# of our models can predict with MAE loss less than 0.2 while seeing that cell line for the 
# first time
unique(better_than_assigned[target > highest_drug_match_disease_aac & RMSE <= 0.2]$cpd_name)

# Subset by THE lowest RMSE model to save space
# NOTE(review): head(.SD, 1) further below picks the lowest-RMSE row only if setkey() keeps
# the RMSE order within each (cpd_name, cell_name) group — confirm.
setorder(better_than_assigned, RMSE)
setkey(better_than_assigned, cpd_name, cell_name)

# Save different subsets
# First subset: no loss filter and no AAC margin; this re-applies the same condition
# that already defines better_than_assigned.
better_than_assigned_subset <- better_than_assigned[target >= highest_drug_match_disease_aac]
# Select top 5 models
# final_data <- better_than_assigned_subset[, head(.SD, 5), by = c("cpd_name", "cell_name")]
final_data <- better_than_assigned_subset
setorder(final_data, -target)
setcolorder(final_data, c("assigned_disease", "cpd_name", "highest_drug_match_disease_aac",
                          "cell_name", "primary_disease", "lineage", "lineage_subtype"))
# Save final data
fwrite(final_data, "Data/repurposable_drugs_table.csv")

# Subset by MAE loss less than 0.2 and AAC more than prescribed at least 0.2
better_than_assigned_subset <- better_than_assigned[target >= highest_drug_match_disease_aac + 0.2 &
                                                      RMSE <= 0.2]

# Keep a single model (first row) per drug x cell line.
final_data <- better_than_assigned_subset[, head(.SD, 1), by = c("cpd_name", "cell_name")]
setorder(final_data, -target)
setcolorder(final_data, c("assigned_disease", "cpd_name", "highest_drug_match_disease_aac",
                          "cell_name", "primary_disease", "lineage", "lineage_subtype"))
# Save final data
fwrite(final_data, "Data/high_aac_lowest_mae_repurposable_drugs_table.csv")

# Put top results in a single cell of a table 
# (data types shown as "A + B", followed by the prediction on a new line)
final_data[, data_types := gsub("_", " + ", data_types, fixed = T)]
final_data[, cell_content := paste0(data_types, "\n", pred)]

# flextable(final_data)

setcolorder(final_data, c("primary_disease", "lineage_subtype", "cell_name", "target",
                          "highest_drug_match_disease_aac", "assigned_disease", "cpd_name",
                          "cell_content"))

# Drop the columns that are not displayed; exactly the eight columns ordered above remain.
final_data$data_types <- NULL
# All the data is from split by cell line 
final_data$split_method <- NULL
final_data$pred <- NULL
final_data$RMSE <- NULL
final_data$lineage <- NULL

setorder(final_data, primary_disease, lineage_subtype, cell_name)
# Display names are assigned by position and must follow the setcolorder() call above.
colnames(final_data) <- c("Primary Disease", "Lineage Subtype", "Cell Line", "AAC",
                          "Highest AAC", "Cancer", "Drug",
                          "Top Model:\nData Type(s)\nPrediction")
final_data <- unique(final_data)
final_data$`Lineage Subtype` <- tools::toTitleCase(gsub("_", " ", final_data$`Lineage Subtype`))

# Encode a two-level header in the column names as "<group>_<label>". The last column
# repeats its own name on both levels so the two header rows carry identical text.
colnames(final_data)[1:4] <- paste0("Empirical_", colnames(final_data)[1:4])
colnames(final_data)[5:7] <- paste0("Prescribed_", colnames(final_data)[5:7])
colnames(final_data)[8] <- paste0(colnames(final_data)[8], "_", colnames(final_data)[8])

# Split the names on "_" to build the header mapping for flextable.
header_id_colnames <- colnames(final_data)
header_split <- stringr::str_split(header_id_colnames, "_", simplify = T)
df_header <- data.frame('header_id' = header_id_colnames,
                         'Var1' = header_split[,1],  # Empirical vs Prescribed
                         'Var2' = header_split[,2],  # column label within the group
                         stringsAsFactors = F)
# no_header <- data.table(header_id = "Top Model:\nData Type(s)\nPrediction",
#                         Var1 = "Top Model:\nData Type(s)\nPrediction")
# df_header <- rbind(no_header, cur_header)

# NOTE: Cancers of the brain, the eye, the esophagus, the thyroid gland, and the skin of the
# head and neck are not usually classified as head and neck cancers.


ft <- flextable(final_data, col_keys = df_header$header_id)
final_ft <- set_header_df(ft, mapping = df_header, key = "header_id") %>%
  merge_v(part = "header") %>%
  merge_h(part = "header") %>%
  merge_v(part = "body") %>%
  # theme_booktabs(bold_header = TRUE) %>%
  align(align = "center", part = "all") %>%
  border_inner(border = fp_border(color="gray", width = 1)) %>%
  border_outer(part="all", border = fp_border(color="gray", width = 2)) %>%
  bold(bold = T, part = "header")

# ft <- flextable(final_data)
# final_ft <- ft %>%
#   # merge_v(j = c("Cancer", "Prescribed Drug(s)", "Cell Line Primary Disease")) %>%
#   merge_v() %>%
#   border_inner(border = fp_border(color="gray", width = 1)) %>%
#   border_outer(part="all", border = fp_border(color="gray", width = 2)) %>%
#   align(align = "center", part = "all")

final_ft <- autofit(final_ft)

# Write the table into a new Word document.
read_docx() %>% 
  body_add_flextable(value = final_ft) %>% 
  print(target = "Plots/Dataset_Exploration/Top_repurposable_Drugs_table.docx")


# Important data types per cell line / lineage / drug ====
# Are there data types that are beneficial for specific cell lines, lineages or drug types?
# Best performing drug per cell line and per lineage
# Best splitting method per cell line, per drug, per lineage, and overall, especially to predict higher AAC targets
# Self-contained section: reloads packages and inputs so it can be run on its own.
require(data.table)
setDTthreads(8)
require(ggplot2)
require(gt)
require(flextable)
require(magrittr)
require(scales)
require(officer)
drug_info <- fread("Data/DRP_Training_Data/CTRP_DRUG_INFO.csv")
cell_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")

all_results <- fread("Data/all_results.csv")
trifecta_results <- unique(all_results[merge_method == "Base Model + LMF" &
                                         loss_type == "Base Model + LDS" &
                                         drug_type == "Base Model + GNN",
                                       c("cpd_name", "cell_name", "target", "predicted", "RMSELoss",
                                         "data_types", "split_method")])
# all_results is removed here; code further down that uses it must reload it first.
rm(all_results)
gc()

# NOTE(review): de-duplication ignores split_method, so only the first row per
# drug x cell line x data type is kept across split methods — confirm this is intended,
# since the analysis below groups by split_method.
trifecta_results <- unique(trifecta_results, by = c("cpd_name", "cell_name", "data_types"))
# Columns 4 and 5 are "predicted" and "RMSELoss" in the selection above.
colnames(trifecta_results)[4] <- "pred"
colnames(trifecta_results)[5] <- "RMSE"

trifecta_results[, target := round(target, 3)]
trifecta_results[, pred := round(pred, 3)]
trifecta_results[, RMSE := round(RMSE, 3)]

# Add cell line annotations (cell_info columns selected by position).
trifecta_results <- merge(trifecta_results, cell_info[, 2:5], by.x = "cell_name", by.y = "stripped_cell_line_name")
# trifecta_results <- merge(trifecta_results, cancer_by_drug, by = "cpd_name", all.x = T)

# setcolorder(trifecta_results, c("cpd_name", "assigned_disease", "cell_name", "primary_disease", "lineage", "lineage_subtype", "split_method", "target"))
setcolorder(trifecta_results, c("cpd_name", "cell_name", "primary_disease", "lineage", "lineage_subtype", "split_method", "target"))


# Results from the "Split By Both" setting are excluded from this analysis.
trifecta_results <- trifecta_results[split_method != "Split By Both"]

# What cell lines never have an AAC >= 0.5?
sensitive_cells <- (trifecta_results[target >= 0.5]$cell_name)
all_cells <- unique(trifecta_results$cell_name)
setdiff(all_cells, sensitive_cells)
# 12 cell lines never have AAC >= 0.5
# [1] "A498"     "CAPAN2"   "CCFSTTG1" "HPAFII"   "KMM1"     "KPL1"     "NCIH2172" "NCIH441"  "NMCG1"    "SNU410"  
# [11] "SW1783"   "TOLEDO"  

# Best data types for each cell line (top 3), will consider only data types that have some response, so AAC >= 0.5
# NOTE(review): the filter below is strict (target > 0.5) while the comments and the table
# header written later say ">= 0.5" — confirm which is intended.
# Groups with no sample above the threshold get NaN (mean of an empty vector).
trifecta_results[, top_mean_rmse_data_type_per_cell := mean(.SD[target > 0.5]$RMSE),
                 by = c("data_types", "cell_name", "split_method")]
# Interactive spot checks on a single cell line.
trifecta_results[split_method == "Split By Cell Line" & cell_name== "HUCCT1"]
table(trifecta_results[split_method == "Split By Cell Line" & cell_name== "HUCCT1"]$data_types)

# Sort by mean RMSE per cell line and split_method
# (rows of groups without any sample above the threshold are dropped first)
trifecta_results <- trifecta_results[!is.na(top_mean_rmse_data_type_per_cell)]
setorder(trifecta_results, split_method, primary_disease, cell_name, top_mean_rmse_data_type_per_cell)
# Interactive inspection: first three rows per cell line in the current sort order.
unique(trifecta_results[, head(.SD, 3), by = "cell_name",
                 .SDcols = c("primary_disease", "data_types", "top_mean_rmse_data_type_per_cell")])

# Subset for (per) cell line related data
data_types_by_cell <- unique(trifecta_results[, c("cell_name", "primary_disease", "split_method", "data_types",
                             "top_mean_rmse_data_type_per_cell")])
setorder(data_types_by_cell, split_method, primary_disease, cell_name, top_mean_rmse_data_type_per_cell)
# NOTE(review): grouping is by cell_name only while split_method is the leading sort key, so the
# three rows kept per cell line come from whichever split method sorts first — confirm.
top_data_types_by_cell <- data_types_by_cell[, head(.SD, 3), by = "cell_name"]

# Interactive inspection only; the result is not stored.
unique(trifecta_results[, head(.SD, 3), by = "cell_name"])


# Do the same for lineages (top 3, response above 0.5 AAC)
trifecta_results[, top_mean_rmse_data_type_per_lineage := mean(.SD[target > 0.5]$RMSE),
                 by = c("data_types", "primary_disease", "split_method")]

# Sort by mean RMSE per cell line and split_method
trifecta_results <- trifecta_results[!is.na(top_mean_rmse_data_type_per_lineage)]
setorder(trifecta_results, split_method, primary_disease, cell_name, top_mean_rmse_data_type_per_lineage)
# Interactive inspection: first three rows per cell line in the current sort order.
unique(trifecta_results[, head(.SD, 3), by = "cell_name",
                        .SDcols = c("primary_disease", "data_types", "top_mean_rmse_data_type_per_lineage")])

# Subset for (per) lineage related data
# data_types_by_lineage keeps one row per cell line x data type and is retained as
# before for any later use.
data_types_by_lineage <- unique(trifecta_results[, c("cell_name", "primary_disease", "split_method", "data_types",
                                                  "top_mean_rmse_data_type_per_lineage")])
setorder(data_types_by_lineage, primary_disease, split_method, cell_name, top_mean_rmse_data_type_per_lineage)

# The per-lineage mean is constant within (primary_disease, split_method, data_types), so
# collapse to one row per combination before ranking. Taking head(.SD, 3) from rows still
# keyed (and sorted) by cell_name would return the data types of the alphabetically first
# cell line, which is wrong whenever that cell line lacks some data types.
lineage_data_type_rmse <- unique(data_types_by_lineage[, c("primary_disease", "split_method", "data_types",
                                                           "top_mean_rmse_data_type_per_lineage")])
setorder(lineage_data_type_rmse, primary_disease, split_method, top_mean_rmse_data_type_per_lineage)
# Three data-type combinations with the lowest mean loss per lineage and split method.
top_data_types_by_lineage <- lineage_data_type_rmse[, head(.SD, 3), by = c("primary_disease", "split_method")]

top_data_types_by_lineage[, top_mean_rmse_data_type_per_lineage := round(top_mean_rmse_data_type_per_lineage, 3)]
# setorder(data_types_by_lineage, primary_disease, split_method, top_mean_rmse_data_type_per_lineage)
setorder(data_types_by_lineage, primary_disease, split_method, `data_types`)
# Display names are assigned by position: lineage, split method, data types, mean loss.
colnames(top_data_types_by_lineage) <- c("Primary Disease", "Split Method", "Data Type(s)", "Mean RMSE per data type per lineage (in samples with AAC >= 0.5)")

# Render the per-lineage summary as a bordered, centred flextable and save it to Word.
ft <- flextable(top_data_types_by_lineage)

# Border styles: thin gray rules inside the table, a thicker gray frame around it.
inner_rule <- fp_border(color = "gray", width = 1)
outer_rule <- fp_border(color = "gray", width = 2)

# Merge vertically repeated cells, draw the borders, then centre every part.
final_ft <- merge_v(ft)
final_ft <- border_inner(final_ft, border = inner_rule)
final_ft <- border_outer(final_ft, part = "all", border = outer_rule)
final_ft <- align(final_ft, align = "center", part = "all")
final_ft <- autofit(final_ft)

# Write the table into a new Word document.
lineage_doc <- body_add_flextable(read_docx(), value = final_ft)
print(lineage_doc, target = "Plots/Dataset_Exploration/Best_data_types_for_each_lineage_table.docx")


# Final Comparison Table ====
## Bold Table Function ====
# Apply the shared header layout and styling to a flextable and optionally emphasise
# the best entries.
#
# cur_table: a flextable object.
# header_df: header mapping passed to set_header_df(); must contain a "header_id" column.
#            NOTE(review): merge_h(i = 1:3) assumes three header rows — confirm with callers.
# bold_df:   optional data.table with columns `Omic Type(s)`, `variable` and `best_cv_mean`.
#            For each row, body cells of the column named by `variable` whose text
#            contains `best_cv_mean` are set in bold and highlighted in yellow. The match
#            is on the text only, so any row of that column containing the value is
#            emphasised. NULL (default) or a 0-row table skips this step.
# Returns the styled flextable.
setup_bold_table <- function(cur_table, header_df, bold_df=NULL) {
  final_ft <- set_header_df(cur_table, mapping = header_df, key = "header_id") %>%
    merge_v(part = "header") %>%
    # merge_v(part = "body") %>%
    merge_h(part = "header", i = 1:3) %>%
    # theme_booktabs(bold_header = TRUE) %>%
    align(align = "center", part = "all") %>%
    border_inner(border = fp_border(color="gray", width = 1)) %>%
    border_outer(part="all", border = fp_border(color="gray", width = 2)) %>%
    bold(bold = TRUE, part = "header")

  if (!is.null(bold_df)) {
    # seq_len() so that a 0-row bold_df performs no iterations (1:0 would run twice).
    for (i in seq_len(nrow(bold_df))) {
      cur_data_type <- bold_df[i,]$`Omic Type(s)`
      cur_variable <- as.character(bold_df[i,]$variable)
      cur_pattern <- bold_df[`Omic Type(s)` == cur_data_type &
                               variable == cur_variable]$best_cv_mean
      # cur_pattern <- paste0(cur_pattern, " ±")
      # Row selector: body rows whose cell in `cur_variable` contains the pattern verbatim.
      i_formula <- as.formula(paste0("~ grepl(x = `", cur_variable, "`, pattern = '", cur_pattern, "', fixed = TRUE)"))
      # Column selector: the column named by `cur_variable`.
      j_formula <- as.formula(paste0("~`", cur_variable, "`"))
      final_ft <- final_ft %>%
        bold(i = i_formula,
             j = j_formula) %>%
        highlight(i = i_formula,
                  j = j_formula,
                  color = "yellow",
                  part = "body")
    }
  }
  return(final_ft)
}


## Bimodal Cases ====
require(data.table)
setDTthreads(8)
require(ggplot2)
require(gt)
require(flextable)
require(magrittr)
require(scales)
require(officer)

set_flextable_defaults(
  font.size = 10, theme_fun = theme_vanilla,
  padding = 6,
  background.color = "#EFEFEF")

### Single Method ====
# `all_results` is expected to be in memory already; re-read it if needed:
# all_results <- fread("Data/all_results.csv")

# Note the change from MAE to RMSE: one RMSE per model configuration, computed
# over all rows of that configuration.
config_cols <- c("data_types", "merge_method", "loss_type",
                 "drug_type", "split_method", "TargetRange",
                 "Targeted", "bottleneck")
all_results[, loss_by_config := rmse(target, predicted), by = config_cols]
all_results <- unique(all_results[, c(config_cols, "loss_by_config"), with = FALSE])

# Keep configurations whose data_types label is shorter than 6 characters.
# The subset must be taken BEFORE all_results is removed (the removal used to
# come first, which made this line fail with "object not found").
bimodal_results <- all_results[nchar(data_types) < 6]
rm(all_results)
gc()

# Shorten the verbose labels; each mapping is old label -> new label.
relabels <- list(
  merge_method = c("Base Model" = "Concat",
                   "Base Model + Sum" = "Sum",
                   "Base Model + LMF" = "LMF"),
  loss_type = c("Base Model" = "non-LDS",
                "Base Model + LDS" = "LDS"),
  drug_type = c("Base Model" = "ECFP",
                "Base Model + GNN" = "GNN"),
  split_method = c("Split By Both Cell Line & Drug Scaffold" = "Group Both",
                   "Split By Drug Scaffold" = "Group Scaffold",
                   "Split By Cell Line" = "Group Cell"),
  TargetRange = c("Target Above 0.7" = ">= 0.7",
                  "Target Below 0.7" = "< 0.7"),
  Targeted = c("Targeted Drug" = "Targeted",
               "Untargeted Drug" = "Untargeted"),
  bottleneck = c("No Data Bottleneck" = "No Bottleneck",
                 "With Data Bottleneck" = "With Bottleneck")
)
for (cur_col in names(relabels)) {
  cur_map <- relabels[[cur_col]]
  for (old_label in names(cur_map)) {
    # which() drops NA comparisons; an empty match is a no-op for set().
    set(bimodal_results,
        i = which(bimodal_results[[cur_col]] == old_label),
        j = cur_col,
        value = cur_map[[old_label]])
  }
}

# Consider only single technique bimodal results for now: the baseline plus
# the models that change exactly one of loss (LDS), merge (LMF) or drug
# representation (GNN).
single_tech_models <- data.table(
  loss_type    = c("LDS",    "non-LDS", "non-LDS", "non-LDS"),
  merge_method = c("Concat", "LMF",     "Concat",  "Concat"),
  drug_type    = c("ECFP",   "ECFP",    "GNN",     "ECFP"),
  Method       = c("LDS",    "LMF",     "GNN",     "Baseline")
)

# Inner join keeps only the four configurations above and attaches the
# model name; the bottleneck variants are excluded.
bi_single_tech <- merge(
  bimodal_results[bottleneck == "No Bottleneck"],
  single_tech_models,
  by = c("loss_type", "merge_method", "drug_type")
)

# One text cell per configuration: the RMSE rounded to 3 decimals.
bi_single_tech <- unique(bi_single_tech[, .(
  data_types, split_method, TargetRange, Targeted, Method,
  Result = as.character(round(loss_by_config, 3))
)])

# Wide layout: one row per data type, one column per
# split method x model x drug class x AAC range.
bi_single_tech <- dcast(bi_single_tech,
                        ... ~ split_method + Method + Targeted + TargetRange,
                        value.var = "Result")

# Create Header for FlexTable
# The first column holds the data type; every other column name has the form
# "<split method>_<model>_<targeted?>_<AAC range>" (built by the dcast above),
# so splitting on "_" yields the four header rows.
setnames(bi_single_tech, 1L, "Data Type(s)")
header_id_colnames <- colnames(bi_single_tech)[-1]
header_split <- stringr::str_split(header_id_colnames, "_", simplify = TRUE)
cur_header <- data.frame(header_id = header_id_colnames,
                         Var1 = header_split[, 1],  # Grouping method
                         Var2 = header_split[, 2],  # Technique used
                         Var3 = header_split[, 3],  # Targeted or Untargeted
                         Var4 = header_split[, 4],  # AAC range
                         stringsAsFactors = FALSE)
# The key column repeats its own name on every header row so that the
# vertical merge collapses it into a single cell.
no_header <- data.table(header_id = "Data Type(s)",
                        Var1 = "Data Type(s)", Var2 = "Data Type(s)",
                        Var3 = "Data Type(s)", Var4 = "Data Type(s)")
df_header <- rbind(no_header, cur_header)

ft <- flextable(bi_single_tech, col_keys = df_header$header_id)
final_ft <- set_header_df(ft, mapping = df_header, key = "header_id") %>%
  merge_v(part = "header") %>%
  merge_v(part = "body") %>%
  merge_h(part = "header", i = 1:3) %>%
  align(align = "center", part = "all") %>%
  border_inner(border = fp_border(color = "gray", width = 1)) %>%
  border_outer(part = "all", border = fp_border(color = "gray", width = 2)) %>%
  bold(bold = TRUE, part = "header")

final_ft <- autofit(final_ft)

# Create the output folder (and its parent) if needed; stay quiet when it
# already exists.
dir.create("Plots/CV_Tables", recursive = TRUE, showWarnings = FALSE)
read_docx() %>%
  body_add_flextable(value = final_ft) %>%
  print(target = "Plots/CV_Tables/Bimodal_byGroup_byMethod_byTargeted_byRange_table.docx")


#### Per-Group Tables (Group Both / Group Cell / Group Scaffold) ====
# Only the AAC >= 0.7 columns are reported in the per-group tables.
df_header <- df_header[Var4 %like% ">= 0.7|Data Type"]

#' Write one per-split-method table with the best model per data type and
#' drug class highlighted.
#'
#' @param wide_results Wide results table whose first column is
#'   "Data Type(s)" and whose other columns are named
#'   "<split method>_<model>_<targeted?>_<AAC range>".
#' @param header_df Header mapping with columns header_id and Var1..Var4.
#' @param group_label Split method to keep, e.g. "Group Both".
#' @param target Path of the .docx file to write.
#' @return The formatted flextable, invisibly.
write_single_tech_group_table <- function(wide_results, header_df,
                                          group_label, target) {
  all_cols <- names(wide_results)
  # Keep the key column plus the columns of this split method at AAC >= 0.7.
  keep_cols <- all_cols[
    grepl(paste0("Data Type|", group_label), all_cols) &
      grepl("Data Type|>= 0.7", all_cols)
  ]
  group_subset <- wide_results[, keep_cols, with = FALSE]
  header_subset <- header_df[Var1 == group_label | Var1 == "Data Type(s)"]

  # Long format: one row per data type x column, to find the minimum.
  group_cv_means <- melt(group_subset,
                         measure.vars = colnames(group_subset)[-1])
  group_cv_means <- group_cv_means[variable %like% ">= 0.7"]
  # Cells may be "mean ± sd" or a plain number; keep the leading number.
  group_cv_means[, cv_mean := as.numeric(gsub(" ± .+", "", value))]
  group_cv_means[, c("Group", "Method", "Targeted", "Range") :=
                   tstrsplit(variable, "_")]

  # Everything except the method used, so we can compare method performance
  group_cv_means[, best_cv_mean := as.character(min(cv_mean)),
                 by = c("Data Type(s)", "Group", "Targeted")]
  group_cv_means <- group_cv_means[cv_mean == as.numeric(best_cv_mean)]
  bold_df <- unique(group_cv_means[, c("Data Type(s)", "variable", "best_cv_mean")])

  ft <- flextable(group_subset, col_keys = header_subset$header_id)
  final_ft <- autofit(setup_bold_table(ft, header_subset, bold_df))

  read_docx() %>%
    body_add_flextable(value = final_ft) %>%
    print(target = target)

  invisible(final_ft)
}

# Create the output folder (and its parent) once; quiet if it already exists.
dir.create("Plots/CV_Tables", recursive = TRUE, showWarnings = FALSE)

final_ft <- write_single_tech_group_table(
  bi_single_tech, df_header, "Group Both",
  "Plots/CV_Tables/Bimodal_GroupByBoth_table.docx")
final_ft <- write_single_tech_group_table(
  bi_single_tech, df_header, "Group Cell",
  "Plots/CV_Tables/Bimodal_GroupByCellLine_table.docx")
final_ft <- write_single_tech_group_table(
  bi_single_tech, df_header, "Group Scaffold",
  "Plots/CV_Tables/Bimodal_GroupByScaffold_table.docx")


#### Best Model for Each Omic Data ====
bimodal_results[merge_method == "Merge By Early Concat", merge_method := "Elastic Net"]
# Work on a real copy so the columns added below do not leak into
# bimodal_results (plain `<-` would only alias the same data.table).
bimodal_results_copy <- copy(bimodal_results)

# Assign model names from the (loss, merge, drug representation) combination.
# Combinations not listed here keep Method = NA.
method_names <- as.data.table(matrix(c(
  "LDS",     "Concat", "ECFP", "LDS",
  "non-LDS", "LMF",    "ECFP", "LMF",
  "non-LDS", "Concat", "GNN",  "GNN",
  "non-LDS", "Concat", "ECFP", "Baseline",
  "LDS",     "LMF",    "ECFP", "LDS+LMF",
  "non-LDS", "LMF",    "GNN",  "LMF+GNN",
  "LDS",     "Concat", "GNN",  "LDS+GNN",
  "LDS",     "LMF",    "GNN",  "LDS+LMF+GNN"
), ncol = 4, byrow = TRUE,
dimnames = list(NULL, c("loss_type", "merge_method", "drug_type", "Method"))))
bimodal_results_copy[method_names,
                     on = c("loss_type", "merge_method", "drug_type"),
                     Method := i.Method]
bimodal_results_copy[merge_method == "Elastic Net", Method := "Elastic Net"]
bimodal_results_copy[, c("loss_type", "merge_method", "drug_type") := NULL]

# Subset for upper ranges, remove bottleneck column
bimodal_results_copy <- bimodal_results_copy[TargetRange == ">= 0.7"]
bimodal_results_copy[, bottleneck := NULL]

# Find the lowest RMSE by data types and drug type. The per-configuration
# score lives in `loss_by_config`; the older cv_mean / cv_sd columns are no
# longer produced by the aggregation step.
bimodal_results_copy[, best_cv_mean := min(loss_by_config),
                     by = c("data_types", "split_method", "Targeted")]

# Subset models for those with the best score
bimodal_results_copy <- bimodal_results_copy[loss_by_config == best_cv_mean]
setnames(bimodal_results_copy, "data_types", "Data Type(s)")
bimodal_results_copy[, variable := paste(split_method, Targeted, TargetRange, sep = "_")]

# Find the best model overall for grouping method and drug type
bimodal_results_copy[, best_overall := min(loss_by_config),
                     by = c("split_method", "Targeted")]
bold_df <- unique(bimodal_results_copy[loss_by_config == best_overall,
                                       c("Data Type(s)", "variable", "best_overall")])
bold_df[, best_overall := round(best_overall, 3)]
setnames(bold_df, "best_overall", "best_cv_mean")
bimodal_results_copy[, c("variable", "best_overall") := NULL]

# Each cell shows the winning model name above its rounded RMSE.
bimodal_results_copy[, Result := paste(Method, round(loss_by_config, 3), sep = "\n")]
bimodal_results_copy[, c("loss_by_config", "best_cv_mean", "Method") := NULL]
bimodal_results_copy <- unique(bimodal_results_copy)

bimodal_results_copy <- dcast(bimodal_results_copy,
                              ... ~ split_method + Targeted + TargetRange,
                              value.var = "Result")

# Column names are "<split method>_<targeted?>_<AAC range>".
header_id_colnames <- colnames(bimodal_results_copy)[-1]
header_split <- stringr::str_split(header_id_colnames, "_", simplify = TRUE)
cur_header <- data.frame(header_id = header_id_colnames,
                         Var1 = header_split[, 1],  # Grouping method
                         Var2 = header_split[, 2],  # Targeted or Untargeted
                         Var3 = header_split[, 3],  # AAC range
                         stringsAsFactors = FALSE)
no_header <- data.table(header_id = "Data Type(s)",
                        Var1 = "Data Type(s)", Var2 = "Data Type(s)",
                        Var3 = "Data Type(s)")
df_header <- rbind(no_header, cur_header)

ft <- flextable(bimodal_results_copy, col_keys = df_header$header_id)

final_ft <- setup_bold_table(ft, df_header, bold_df)

final_ft <- autofit(final_ft)
# Create the output folder (and its parent) if needed; quiet if it exists.
dir.create("Plots/CV_Tables", recursive = TRUE, showWarnings = FALSE)
read_docx() %>%
  body_add_flextable(value = final_ft) %>%
  print(target = "Plots/CV_Tables/Bimodal_Best_By_DataType_table.docx")


### Two Methods ====
require(data.table)
setDTthreads(8)
require(ggplot2)
require(gt)
require(flextable)
require(magrittr)
require(scales)
require(officer)
# Squared Pearson correlation between x and y.
rsq <- function(x, y) {
  pearson_r <- cor(x, y, method = "pearson")
  pearson_r^2
}

# Root mean squared error between x and y.
rmse <- function(x, y) {
  squared_error <- (x - y)^2
  sqrt(mean(squared_error))
}

# Mean absolute error between x and y.
mae <- function(x, y) {
  absolute_error <- abs(x - y)
  mean(absolute_error)
}


set_flextable_defaults(
  font.size = 10,
  theme_fun = theme_vanilla,
  padding = 6,
  background.color = "#EFEFEF"
)

# Reload the raw results and keep rows whose data_types label has at most
# 5 characters.
all_results <- fread("Data/all_results.csv")
all_results <- all_results[nchar(data_types) <= 5]

# One RMSE per model configuration, computed over all rows of that
# configuration, then collapse to one row per configuration.
all_results[, loss_by_config := rmse(target, predicted),
            by = .(data_types, merge_method, loss_type, drug_type,
                   split_method, TargetRange, Targeted, bottleneck)]
all_results <- unique(all_results[, .(data_types, merge_method, loss_type,
                                      drug_type, split_method, TargetRange,
                                      Targeted, bottleneck, loss_by_config)])

gc()

# Subsetting creates a new table, so the in-place relabels below leave
# all_results untouched.
bimodal_results <- all_results[nchar(data_types) < 6]

# Shorten the verbose labels.
bimodal_results[merge_method == "Base Model", merge_method := "Concat"]
bimodal_results[merge_method == "Base Model + Sum", merge_method := "Sum"]
bimodal_results[merge_method == "Base Model + LMF", merge_method := "LMF"]

bimodal_results[loss_type == "Base Model", loss_type := "non-LDS"]
bimodal_results[loss_type == "Base Model + LDS", loss_type := "LDS"]

bimodal_results[drug_type == "Base Model", drug_type := "ECFP"]
bimodal_results[drug_type == "Base Model + GNN", drug_type := "GNN"]

bimodal_results[split_method == "Split By Both Cell Line & Drug Scaffold",
                split_method := "Group Both"]
bimodal_results[split_method == "Split By Drug Scaffold",
                split_method := "Group Scaffold"]
bimodal_results[split_method == "Split By Cell Line",
                split_method := "Group Cell"]

bimodal_results[TargetRange == "Target Above 0.7", TargetRange := ">= 0.7"]
bimodal_results[TargetRange == "Target Below 0.7", TargetRange := "< 0.7"]

bimodal_results[Targeted == "Targeted Drug", Targeted := "Targeted"]
bimodal_results[Targeted == "Untargeted Drug", Targeted := "Untargeted"]

bimodal_results[bottleneck == "No Data Bottleneck", bottleneck := "No Bottleneck"]
bimodal_results[bottleneck == "With Data Bottleneck", bottleneck := "With Bottleneck"]

#### Grouping Preparation ====
# Consider the baseline plus the models that combine exactly two techniques.
two_tech_models <- data.table(
  loss_type    = c("LDS",     "non-LDS", "LDS",     "non-LDS"),
  merge_method = c("LMF",     "LMF",     "Concat",  "Concat"),
  drug_type    = c("ECFP",    "GNN",     "GNN",     "ECFP"),
  Method       = c("LDS+LMF", "LMF+GNN", "LDS+GNN", "Baseline")
)

# Inner join keeps only the four configurations above and attaches the
# model name; the bottleneck variants are excluded.
bi_two_tech <- merge(
  bimodal_results[bottleneck == "No Bottleneck"],
  two_tech_models,
  by = c("loss_type", "merge_method", "drug_type")
)

# The aggregation step only produces `loss_by_config` (one RMSE per
# configuration); the cv_mean / cv_sd columns used here previously no longer
# exist, so each cell is the rounded RMSE.
bi_two_tech <- unique(bi_two_tech[, .(
  data_types, split_method, TargetRange, Targeted, Method,
  Result = as.character(round(loss_by_config, 3))
)])
bi_two_tech <- dcast(bi_two_tech,
                     ... ~ split_method + Method + Targeted + TargetRange,
                     value.var = "Result")

# Make Header Table for FlexTable
# Column names are "<split method>_<model>_<targeted?>_<AAC range>".
setnames(bi_two_tech, 1L, "Data Type(s)")
header_id_colnames <- colnames(bi_two_tech)[-1]
header_split <- stringr::str_split(header_id_colnames, "_", simplify = TRUE)
cur_header <- data.frame(header_id = header_id_colnames,
                         Var1 = header_split[, 1],  # Grouping method
                         Var2 = header_split[, 2],  # Technique used
                         Var3 = header_split[, 3],  # Targeted or Untargeted
                         Var4 = header_split[, 4],  # AAC range
                         stringsAsFactors = FALSE)
no_header <- data.table(header_id = "Data Type(s)",
                        Var1 = "Data Type(s)", Var2 = "Data Type(s)",
                        Var3 = "Data Type(s)", Var4 = "Data Type(s)")
df_header <- rbind(no_header, cur_header)


# setup_bold_table <- function(cur_table, header_df, bold_df) {
#   final_ft <- set_header_df(cur_table, mapping = header_df, key = "header_id") %>%
#     merge_v(part = "header") %>%
#     merge_v(part = "body") %>%
#     merge_h(part = "header", i = 1:3) %>%
#     align(align = "center", part = "all") %>%
#     border_inner(border = fp_border(color="gray", width = 1)) %>%
#     border_outer(part="all", border = fp_border(color="gray", width = 2)) %>%
#     bold(bold = T, part = "header")
#   
#   for (i in 1:nrow(bold_df)) {
#     cur_data_type <- bold_df[i,]$`Data Type(s)`
#     cur_variable <- as.character(bold_df[i,]$variable)
#     cur_pattern <- bold_df[`Data Type(s)` == cur_data_type &
#                              variable == cur_variable]$best_cv_mean
#     i_formula <- as.formula(paste0("~ grepl(x = `", cur_variable, "`, pattern = ", cur_pattern, ", fixed = T)"))
#     j_formula <- as.formula(paste0("~`", cur_variable, "`"))
#     final_ft <- final_ft %>%
#       bold(i = i_formula,
#            j = j_formula)
#   }
#   return(final_ft)
# }

#### Per-Group Tables (Group Both / Group Cell / Group Scaffold) ====
# Only the AAC >= 0.7 columns are reported in the per-group tables.
df_header <- df_header[Var4 %like% ">= 0.7|Data Type"]

#' Write one per-split-method table of the two-technique models with the
#' best model per data type and drug class highlighted.
#'
#' @param wide_results Wide results table whose first column is
#'   "Data Type(s)" and whose other columns are named
#'   "<split method>_<model>_<targeted?>_<AAC range>".
#' @param header_df Header mapping with columns header_id and Var1..Var4.
#' @param group_label Split method to keep, e.g. "Group Both".
#' @param target Path of the .docx file to write.
#' @return The formatted flextable, invisibly.
write_two_tech_group_table <- function(wide_results, header_df,
                                       group_label, target) {
  all_cols <- names(wide_results)
  # Keep the key column plus the columns of this split method at AAC >= 0.7.
  keep_cols <- all_cols[
    grepl(paste0("Data Type|", group_label), all_cols) &
      grepl("Data Type|>= 0.7", all_cols)
  ]
  group_subset <- wide_results[, keep_cols, with = FALSE]
  header_subset <- header_df[Var1 == group_label | Var1 == "Data Type(s)"]

  # Long format: one row per data type x column, to find the minimum.
  group_cv_means <- melt(group_subset,
                         measure.vars = colnames(group_subset)[-1])
  group_cv_means <- group_cv_means[variable %like% ">= 0.7"]
  # Cells may be "mean ± sd" or a plain number; keep the leading number.
  group_cv_means[, cv_mean := as.numeric(gsub(" ± .+", "", value))]
  group_cv_means[, c("Group", "Method", "Targeted", "Range") :=
                   tstrsplit(variable, "_")]

  # Everything except the method used, so we can compare method performance
  group_cv_means[, best_cv_mean := as.character(min(cv_mean)),
                 by = c("Data Type(s)", "Group", "Targeted")]
  group_cv_means <- group_cv_means[cv_mean == as.numeric(best_cv_mean)]
  bold_df <- unique(group_cv_means[, c("Data Type(s)", "variable", "best_cv_mean")])

  ft <- flextable(group_subset, col_keys = header_subset$header_id)
  final_ft <- autofit(setup_bold_table(ft, header_subset, bold_df))

  read_docx() %>%
    body_add_flextable(value = final_ft) %>%
    print(target = target)

  invisible(final_ft)
}

# Create the output folder (and its parent) once; quiet if it already exists.
dir.create("Plots/CV_Tables", recursive = TRUE, showWarnings = FALSE)

final_ft <- write_two_tech_group_table(
  bi_two_tech, df_header, "Group Both",
  "Plots/CV_Tables/Bimodal_TwoMethods_GroupByBoth_table.docx")
final_ft <- write_two_tech_group_table(
  bi_two_tech, df_header, "Group Cell",
  "Plots/CV_Tables/Bimodal_TwoMethods_GroupByCellLine_table.docx")
final_ft <- write_two_tech_group_table(
  bi_two_tech, df_header, "Group Scaffold",
  "Plots/CV_Tables/Bimodal_TwoMethods_GroupByScaffold_table.docx")

### Best Model for Each Omic Data ====
# bimodal_results[merge_method == "Merge By Early Concat"]$merge_method <- "Elastic Net"
all_results <- fread("Data/all_results.csv")
shared_combos <- fread("Data/shared_unique_combinations.csv")
shared_combos[, unique_samples := paste0(cpd_name, "_", cell_name)]

# Subset for bimodal results
bimodal_results <- all_results[nchar(data_types) <= 5]
# Subset by all shared samples all each data type
bimodal_results[, unique_samples := paste0(cpd_name, "_", cell_name)]
bimodal_results <- bimodal_results[unique_samples %in% shared_combos$unique_samples]

# Calculate total RMSE by configuration
bimodal_results[, loss_by_config := rmse(target, predicted), by = c("data_types", "merge_method", "loss_type",
                                                                "drug_type", "split_method", "TargetRange",
                                                                "Targeted", "bottleneck")]
bimodal_results <- unique(bimodal_results[, c("data_types", "merge_method", "loss_type",
                                      "drug_type", "split_method", "TargetRange",
                                      "Targeted", "bottleneck", "loss_by_config")])

bimodal_results_copy <- bimodal_results

# Assign model names
# Baseline and Elastic Net
bimodal_results_copy[(loss_type == "Base Model" & merge_method == "Merge By Early Concat" & drug_type == "Base Model"), Method := "ElasticNet"]
bimodal_results_copy[(loss_type == "Base Model" & merge_method == "Base Model" & drug_type == "Base Model"), Method := "Baseline"]
# Single Technique
bimodal_results_copy[(loss_type == "Base Model + LDS" & merge_method == "Base Model" & drug_type == "Base Model"), Method := "LDS"]
bimodal_results_copy[(loss_type == "Base Model" & merge_method == "Base Model + LMF" & drug_type == "Base Model"), Method := "LMF"]
bimodal_results_copy[(loss_type == "Base Model" & merge_method == "Base Model + Sum" & drug_type == "Base Model"), Method := "Sum"]
bimodal_results_copy[(loss_type == "Base Model" & merge_method == "Base Model" & drug_type == "Base Model + GNN"), Method := "GNN"]
# Two Techniques
bimodal_results_copy[(loss_type == "Base Model + LDS" & merge_method == "Base Model + LMF" & drug_type == "Base Model"), Method := "LDS+LMF"]
bimodal_results_copy[(loss_type == "Base Model" & merge_method == "Base Model + LMF" & drug_type == "Base Model + GNN"), Method := "LMF+GNN"]
bimodal_results_copy[(loss_type == "Base Model + LDS" & merge_method == "Base Model" & drug_type == "Base Model + GNN"), Method := "LDS+GNN"]
# Three Techniques
bimodal_results_copy[(loss_type == "Base Model + LDS" & merge_method == "Base Model + Sum" & drug_type == "Base Model + GNN"), Method := "LDS+Sum+GNN"]
bimodal_results_copy[(loss_type == "Base Model + LDS" & merge_method == "Base Model + LMF" & drug_type == "Base Model + GNN"), Method := "LDS+LMF+GNN"]

bimodal_results_copy$loss_type <- NULL
bimodal_results_copy$merge_method <- NULL
bimodal_results_copy$drug_type <- NULL

# Keep only the rows in the upper target range that come from models trained
# without the data bottleneck; the bottleneck column is constant afterwards
bimodal_results_copy <- bimodal_results_copy[TargetRange == "Target Above 0.7" &
                                               bottleneck == "No Data Bottleneck"]
table(bimodal_results_copy$model_name)

bimodal_results_copy[, bottleneck := NULL]

# Lowest RMSE within each (data type, split method, drug class) group
bimodal_results_copy[, best_cv_mean := min(loss_by_config),
                     by = .(data_types, split_method, Targeted)]

# Retain only the configurations that reach that per-group minimum
bimodal_results_copy <- bimodal_results_copy[loss_by_config == best_cv_mean]
setnames(bimodal_results_copy, 1L, "Omic Type(s)")
bimodal_results_copy[, variable := paste(split_method, Targeted, TargetRange, sep = "_")]

# Lowest RMSE across all rows of each (split method, drug class) group.
# bold_df records which cells hold that minimum; it is handed to
# setup_bold_table() when the table is rendered
bimodal_results_copy[, best_overall := min(loss_by_config),
                     by = .(split_method, Targeted)]
bolf_df <- bimodal_results_copy[loss_by_config == best_overall]
bold_df <- unique(bolf_df[, c("Omic Type(s)", "variable", "best_overall")])
bold_df[, best_overall := round(best_overall, 3)]
setnames(bold_df, 3L, "best_cv_mean")
bimodal_results_copy[, c("variable", "best_overall") := NULL]

# Turn the rounded RMSE into text, drop the numeric helper columns,
# de-duplicate, then prefix each cell with the method name on its own line
bimodal_results_copy[, Result := as.character(round(loss_by_config, 3))]
bimodal_results_copy[, c("loss_by_config", "best_cv_mean") := NULL]
bimodal_results_copy <- unique(bimodal_results_copy)
bimodal_results_copy[, Result := paste(Method, Result, sep = "\n")]
bimodal_results_copy[, Method := NULL]

# Spread to wide format: one column per split method / drug class / range
bimodal_results_copy <- dcast(bimodal_results_copy,
                              ... ~ split_method + Targeted + TargetRange,
                              value.var = "Result")

# Build the three header rows from the wide column names, which dcast composed
# as "<split method>_<Targeted>_<TargetRange>"; the first column is skipped
header_id_colnames <- colnames(bimodal_results_copy)[-1]
header_split <- stringr::str_split(header_id_colnames, "_", simplify = TRUE)
cur_header <- data.frame('header_id' = header_id_colnames,
                         'Var1' = header_split[, 1],  # Grouping method
                         'Var2' = header_split[, 2],  # Targeted or Untargeted
                         'Var3' = header_split[, 3],  # AAC range
                         stringsAsFactors = FALSE)
# Header entry for the row-label column (same text on all three header rows)
no_header <- data.table(header_id = "Omic Type(s)",
                        Var1 = "Omic Type(s)", Var2 = "Omic Type(s)",
                        Var3 = "Omic Type(s)")
df_header <- rbind(no_header, cur_header)

# Fixed display order of the omic types. Left-joining onto this list yields one
# row per omic type, in this order, with NA cells where no result exists
omic_levels <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
bimodal_results_copy[, "Omic Type(s)" := factor(`Omic Type(s)`, levels = omic_levels)]
bar_level_df <- data.frame(temp = omic_levels)
colnames(bar_level_df) <- "Omic Type(s)"
# Namespace-qualified so the join does not depend on dplyr being attached
bimodal_results_copy <- dplyr::left_join(bar_level_df,
                                         bimodal_results_copy,
                                         by = "Omic Type(s)")
bimodal_results_copy <- as.data.table(bimodal_results_copy)

ft <- flextable(bimodal_results_copy, col_keys = df_header$header_id)

# setup_bold_table() is defined elsewhere in this file
final_ft <- setup_bold_table(cur_table = ft, header_df = df_header, bold_df = bold_df)

final_ft <- autofit(final_ft)
# recursive: also create "Plots" if missing; showWarnings: silent on re-runs
dir.create("Plots/CV_Tables", recursive = TRUE, showWarnings = FALSE)
read_docx() %>% 
  body_add_flextable(value = final_ft) %>% 
  print(target = "Plots/CV_Tables/Bimodal_Best_By_DataType_table.docx")

## Trimodal Tables ====
require(data.table)
setDTthreads(8)
require(ggplot2)
require(gt)
require(flextable)
require(magrittr)
require(scales)
require(officer)
# Root-mean-square error between two equal-length numeric vectors
rmse <- function(x, y) sqrt(mean((x - y)^2))

set_flextable_defaults(
  font.size = 10, theme_fun = theme_vanilla,
  padding = 6,
  background.color = "#EFEFEF")

all_results <- fread("Data/all_results.csv")
# Keep rows whose data_types contains exactly one "_" separator, i.e. exactly
# two omic data types. stringr is namespace-qualified (as with the
# stringr::str_split() calls in this file) so it does not need to be attached
all_results <- all_results[stringr::str_count(data_types, "_") == 1]

shared_combos <- fread("Data/shared_unique_combinations.csv")
shared_combos[, unique_samples := paste0(cpd_name, "_", cell_name)]

# Restrict to the drug/cell-line pairs listed in shared_combos
all_results[, unique_samples := paste0(cpd_name, "_", cell_name)]
all_results <- all_results[unique_samples %in% shared_combos$unique_samples]

# Columns that together identify one model configuration / evaluation subset
config_cols <- c("data_types", "merge_method", "loss_type",
                 "drug_type", "split_method", "TargetRange",
                 "Targeted", "bottleneck")

# RMSE over all predictions of each configuration, then collapse to one row
# per configuration
all_results[, loss_by_config := rmse(target, predicted), by = config_cols]
all_results <- unique(all_results[, c(config_cols, "loss_by_config"), with = FALSE])
gc()

uniqueN(all_results$data_types)
# all_results[merge_method == "Merge By Concat"]$merge_method <- "Concat"
# all_results[merge_method == "Merge By Sum"]$merge_method <- "Sum"
# all_results[merge_method == "Base Model + LMF"]$merge_method <- "LMF"
# all_results[loss_type == "UnBase Model + LDS"]$loss_type <- "non-LDS"
# all_results[loss_type == "Base Model + LDS"]$loss_type <- "LDS"
# all_results[drug_type == "1024-bit ECFP"]$drug_type <- "ECFP"
# all_results[drug_type == "Base Model + GNN"]$drug_type <- "GNN"
# all_results[split_method == "Split By Both Cell Line & Drug Scaffold"]$split_method <- "Group Both"
# all_results[split_method == "Split By Drug Scaffold"]$split_method <- "Group Scaffold"
# all_results[split_method == "Split By Cell Line"]$split_method <- "Group Cell"
# all_results[split_method == "Split By Cancer Type"]$split_method <- "Group Cancer Type"
# all_results[TargetRange == "Target Above 0.7"]$TargetRange <- ">= 0.7"
# all_results[TargetRange == "Target Below 0.7"]$TargetRange <- "< 0.7"
# all_results[Targeted == "Targeted Drug"]$Targeted <- "Targeted"
# all_results[Targeted == "Untargeted Drug"]$Targeted <- "Untargeted"
# all_results[bottleneck == "No Data Bottleneck"]$bottleneck <- "No Bottleneck"
# all_results[bottleneck == "With Data Bottleneck"]$bottleneck <- "With Bottleneck"

### Best Model for Each Omic Data ====
# Inspect the bottleneck settings present, then keep the no-bottleneck rows
unique(all_results$bottleneck)
trimodal_results_copy <- all_results[bottleneck == "No Data Bottleneck"]
trimodal_results_copy[, bottleneck := NULL]

unique(trimodal_results_copy[, c("merge_method", "loss_type", "drug_type")])

# Label the two configurations of interest; every other row keeps Method = NA
trimodal_results_copy[merge_method == "Base Model + LMF" &
                        loss_type == "Base Model + LDS" &
                        drug_type == "Base Model + GNN",
                      Method := "Trifecta"]
trimodal_results_copy[merge_method == "Base Model" &
                        loss_type == "Base Model" &
                        drug_type == "Base Model",
                      Method := "Baseline"]
# Drop everything that is neither Baseline nor Trifecta
trimodal_results_copy <- trimodal_results_copy[!is.na(Method)]
# Inspect the split methods present (no split method is filtered out here)
unique(trimodal_results_copy$split_method)
# Keep the upper target range only
trimodal_results_copy <- trimodal_results_copy[TargetRange == "Target Above 0.7"]

trimodal_results_copy[, c("loss_type", "merge_method", "drug_type") := NULL]

# Lowest RMSE within each (data type, split method, drug class) group
trimodal_results_copy[, best_cv_mean := min(loss_by_config),
                      by = .(data_types, split_method, Targeted)]

# Retain only the rows that reach that per-group minimum
trimodal_results_copy <- trimodal_results_copy[loss_by_config == best_cv_mean]
setnames(trimodal_results_copy, 1L, "Omic Type(s)")
trimodal_results_copy[, variable := paste(split_method, Targeted, TargetRange, sep = "_")]

# Lowest RMSE across all rows of each (split method, drug class) group;
# bold_df records the cells holding it and is passed to setup_bold_table()
trimodal_results_copy[, best_overall := min(loss_by_config),
                      by = .(split_method, Targeted)]
bolf_df <- trimodal_results_copy[loss_by_config == best_overall]

bold_df <- unique(bolf_df[, c("Omic Type(s)", "variable", "best_overall")])
bold_df[, best_overall := round(best_overall, 3)]
setnames(bold_df, 3L, "best_cv_mean")
trimodal_results_copy[, c("variable", "best_overall") := NULL]

# Round the RMSE, render it as text and de-duplicate before the numeric
# helper columns are removed
trimodal_results_copy[, loss_by_config := round(loss_by_config, 3)]
trimodal_results_copy[, Result := as.character(loss_by_config)]
trimodal_results_copy <- unique(trimodal_results_copy)

trimodal_results_copy[, c("best_cv_mean", "loss_by_config") := NULL]
# Prefix each cell with the method name on its own line
trimodal_results_copy[, Result := paste(Method, Result, sep = "\n")]
trimodal_results_copy[, Method := NULL]

# Spread to wide format: one column per split method / drug class / range
trimodal_results_copy_wide <- dcast(trimodal_results_copy,
                                    ... ~ split_method + Targeted + TargetRange,
                                    value.var = "Result")

# Build the three header rows from the wide column names, which dcast composed
# as "<split method>_<Targeted>_<TargetRange>"; the first column is skipped
header_id_colnames <- colnames(trimodal_results_copy_wide)[-1]
header_split <- stringr::str_split(header_id_colnames, "_", simplify = TRUE)
cur_header <- data.frame('header_id' = header_id_colnames,
                         'Var1' = header_split[, 1],  # Grouping method
                         'Var2' = header_split[, 2],  # Targeted or Untargeted
                         'Var3' = header_split[, 3],  # AAC range
                         stringsAsFactors = FALSE)
# Header entry for the row-label column (same text on all three header rows)
no_header <- data.table(header_id = "Omic Type(s)",
                        Var1 = "Omic Type(s)", Var2 = "Omic Type(s)",
                        Var3 = "Omic Type(s)")
df_header <- rbind(no_header, cur_header)

# All unordered pairs of omic types, one pair per row (columns V1, V2)
all_tri_omic_combos_el <- utils::combn(c("MUT", 'CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA'), 2, simplify = TRUE)
all_tri_omic_combos_el <- t(all_tri_omic_combos_el)
all_tri_omic_combos_el <- as.data.table(all_tri_omic_combos_el)

# Row labels "<omic1>_<omic2>" in combn order. Left-joining onto this list
# yields one row per pair, with NA cells where no result exists
bar_level_df <- data.frame(temp = paste0(all_tri_omic_combos_el$V1, "_", all_tri_omic_combos_el$V2))

colnames(bar_level_df) <- "Omic Type(s)"
# Namespace-qualified so the join does not depend on dplyr being attached
trimodal_results_copy_wide <- dplyr::left_join(bar_level_df,
                                               trimodal_results_copy_wide,
                                               by = "Omic Type(s)")
trimodal_results_copy_wide <- as.data.table(trimodal_results_copy_wide)

ft <- flextable(trimodal_results_copy_wide, col_keys = df_header$header_id)

# setup_bold_table() is defined elsewhere in this file
final_ft <- setup_bold_table(ft, df_header, bold_df)

final_ft <- autofit(final_ft)
# recursive: also create "Plots" if missing; showWarnings: silent on re-runs
dir.create("Plots/CV_Tables", recursive = TRUE, showWarnings = FALSE)
read_docx() %>% 
  body_add_flextable(value = final_ft) %>% 
  print(target = "Plots/CV_Tables/Trimodal_Best_By_DataType_table.docx")


## Multimodal Tables ====
require(data.table)
setDTthreads(8)
require(ggplot2)
require(gt)
require(flextable)
require(magrittr)
require(scales)
require(officer)
# Root-mean-square error between two equal-length numeric vectors
rmse <- function(x, y) sqrt(mean((x - y)^2))

set_flextable_defaults(
  font.size = 10, theme_fun = theme_vanilla,
  padding = 6,
  background.color = "#EFEFEF")

all_results <- fread("Data/all_results.csv")
# Keep rows whose data_types contains more than one "_" separator, i.e. three
# or more omic data types. stringr is namespace-qualified (as with the
# stringr::str_split() calls in this file) so it does not need to be attached
all_results <- all_results[stringr::str_count(data_types, "_") > 1]

shared_combos <- fread("Data/shared_unique_combinations.csv")
shared_combos[, unique_samples := paste0(cpd_name, "_", cell_name)]

# Restrict to the drug/cell-line pairs listed in shared_combos
all_results[, unique_samples := paste0(cpd_name, "_", cell_name)]
all_results <- all_results[unique_samples %in% shared_combos$unique_samples]

# Columns that together identify one model configuration / evaluation subset
config_cols <- c("data_types", "merge_method", "loss_type",
                 "drug_type", "split_method", "TargetRange",
                 "Targeted", "bottleneck")

# RMSE over all predictions of each configuration, then collapse to one row
# per configuration
all_results[, loss_by_config := rmse(target, predicted), by = config_cols]
all_results <- unique(all_results[, c(config_cols, "loss_by_config"), with = FALSE])
gc()

# all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange", "Targeted", "bottleneck")]
# all_results_long_copy[, cv_sd := sd(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange", "Targeted", "bottleneck")]

# all_results_long_copy$value <- NULL
# all_results_long_copy$variable <- NULL
# all_results_long_copy$fold <- NULL
# all_results_long_copy <- unique(all_results_long_copy)

# quadmodal_results <- all_results_long_copy[nchar(data_types) > 11]

# quadmodal_results[data_types == "CNV_EXP_METAB"]
# quadmodal_results[data_types == "CNV_EXP_PROT"]
# quadmodal_results[data_types == "CNV_EXP_PROT_METAB"]
# quadmodal_results[data_types == "CNV_EXP_PROT_MIRNA_METAB_HIST_RPPA"]
# quadmodal_results[data_types == "MUT_CNV_EXP_PROT_MIRNA_METAB_HIST_RPPA"]

# quadmodal_results[merge_method == "Merge By Concat"]$merge_method <- "Concat"
# quadmodal_results[merge_method == "Merge By Sum"]$merge_method <- "Sum"
# quadmodal_results[merge_method == "Base Model + LMF"]$merge_method <- "LMF"
# quadmodal_results[loss_type == "UnBase Model + LDS"]$loss_type <- "non-LDS"
# quadmodal_results[loss_type == "Base Model + LDS"]$loss_type <- "LDS"
# quadmodal_results[drug_type == "1024-bit ECFP"]$drug_type <- "ECFP"
# quadmodal_results[drug_type == "Base Model + GNN"]$drug_type <- "GNN"
# quadmodal_results[split_method == "Split By Both"]$split_method <- "Group Both"
# quadmodal_results[split_method == "Split By Drug Scaffold"]$split_method <- "Group Scaffold"
# quadmodal_results[split_method == "Split By Cell Line"]$split_method <- "Group Cell"
# quadmodal_results[TargetRange == "Target Above 0.7"]$TargetRange <- ">= 0.7"
# quadmodal_results[TargetRange == "Target Below 0.7"]$TargetRange <- "< 0.7"
# quadmodal_results[Targeted == "Targeted Drug"]$Targeted <- "Targeted"
# quadmodal_results[Targeted == "Untargeted Drug"]$Targeted <- "Untargeted"
# quadmodal_results[bottleneck == "No Data Bottleneck"]$bottleneck <- "No Bottleneck"
# quadmodal_results[bottleneck == "With Data Bottleneck"]$bottleneck <- "With Bottleneck"


### Best Model for Each Omic Data ====
# Subset for no bottleneck first: the subset is a new table, so the by-reference
# relabelling below cannot leak into all_results (a plain `<-` would only alias it)
quadmodal_results_copy <- all_results[bottleneck == "No Data Bottleneck"]
# Display form of the omic combination, e.g. "CNV_EXP_PROT" -> "CNV + EXP + PROT"
quadmodal_results_copy[, data_types := gsub("_", " + ", data_types)]
quadmodal_results_copy$bottleneck <- NULL


unique(quadmodal_results_copy[, c("merge_method", "loss_type", "drug_type")])

# Assign model names; rows matching neither configuration keep Method = NA
quadmodal_results_copy[(loss_type == "Base Model + LDS" & merge_method == "Base Model + LMF" & drug_type == "Base Model + GNN"), Method := "Trifecta"]
quadmodal_results_copy[(loss_type == "Base Model" & merge_method == "Base Model" & drug_type == "Base Model"), Method := "Baseline"]
# Subset for baseline and trifecta models
quadmodal_results_copy <- quadmodal_results_copy[!is.na(Method)]

table(quadmodal_results_copy$data_types)
table(quadmodal_results_copy$split_method)

# Remove splitting by lineage (it was only done for trimodal trifecta models)
quadmodal_results_copy <- quadmodal_results_copy[split_method != "Split By Cancer Type"]
# Subset TargetRange
quadmodal_results_copy <- quadmodal_results_copy[TargetRange == "Target Above 0.7"]

table(quadmodal_results_copy$data_types)
table(quadmodal_results_copy$split_method)

quadmodal_results_copy$loss_type <- NULL
quadmodal_results_copy$merge_method <- NULL
quadmodal_results_copy$drug_type <- NULL

# Lowest RMSE within each (data type, split method, drug class) group
quadmodal_results_copy[, best_cv_mean := min(loss_by_config), by = c("data_types", "split_method", "Targeted")]

# Keep only the rows that reach that per-group minimum
quadmodal_results_copy <- quadmodal_results_copy[loss_by_config == best_cv_mean]
colnames(quadmodal_results_copy)[1] <- "Omic Type(s)"
quadmodal_results_copy[, variable := paste(split_method, Targeted, TargetRange, sep = "_")]

# Lowest RMSE across all rows of each (split method, drug class) group;
# bold_df records the cells holding it and is passed to setup_bold_table()
quadmodal_results_copy[, best_overall := min(loss_by_config), by = c("split_method", "Targeted")]
bolf_df <- quadmodal_results_copy[loss_by_config == best_overall]

bold_df <- unique(bolf_df[, c("Omic Type(s)", "variable", "best_overall")])
bold_df$best_overall <- round(bold_df$best_overall, 3)
colnames(bold_df)[3] <- "best_cv_mean"
quadmodal_results_copy$variable <- NULL
quadmodal_results_copy$best_overall <- NULL

# Round the RMSE, render it as text and de-duplicate
quadmodal_results_copy[, loss_by_config := round(loss_by_config, 3)]
quadmodal_results_copy[, Result := as.character(loss_by_config)]
quadmodal_results_copy$loss_by_config <- NULL
quadmodal_results_copy <- unique(quadmodal_results_copy)

quadmodal_results_copy$best_cv_mean <- NULL
# Prefix each cell with the method name on its own line
quadmodal_results_copy$Result <- paste(quadmodal_results_copy$Method, quadmodal_results_copy$Result, sep = "\n")
quadmodal_results_copy$Method <- NULL

# Spread to wide format: one column per split method / drug class / range
quadmodal_results_copy_wide <- dcast(quadmodal_results_copy, ... ~ split_method + Targeted + TargetRange,
                               value.var = "Result")

# Build the three header rows from the wide column names, which dcast composed
# as "<split method>_<Targeted>_<TargetRange>"; the first column is skipped
header_id_colnames <- colnames(quadmodal_results_copy_wide)[-1]
header_split <- stringr::str_split(header_id_colnames, "_", simplify = TRUE)
cur_header <- data.frame('header_id' = header_id_colnames,
                         'Var1' = header_split[, 1],  # Grouping method
                         'Var2' = header_split[, 2],  # Targeted or Untargeted
                         'Var3' = header_split[, 3],  # AAC range
                         stringsAsFactors = FALSE)
# Header entry for the row-label column (same text on all three header rows)
no_header <- data.table(header_id = "Omic Type(s)",
                        Var1 = "Omic Type(s)", Var2 = "Omic Type(s)",
                        Var3 = "Omic Type(s)")
df_header <- rbind(no_header, cur_header)

ft <- flextable(quadmodal_results_copy_wide, col_keys = df_header$header_id)

# setup_bold_table() is defined elsewhere in this file
final_ft <- setup_bold_table(ft, df_header, bold_df)

final_ft <- autofit(final_ft)
# recursive: also create "Plots" if missing; showWarnings: silent on re-runs
dir.create("Plots/CV_Tables", recursive = TRUE, showWarnings = FALSE)
read_docx() %>% 
  body_add_flextable(value = final_ft) %>% 
  print(target = "Plots/CV_Tables/Multimodal_Best_By_DataType_table.docx")


# Comparison of Best Models from Each Omic Combination ====
require(data.table)
setDTthreads(8)
require(ggplot2)
require(tidytext)
# Root-mean-square error between two equal-length numeric vectors
rmse <- function(x, y) {
  residual <- x - y
  sqrt(mean(residual^2))
}

# Goal of this section: identify the best performing configuration overall
# for each splitting method

set_flextable_defaults(
  font.size = 10,
  padding = 6,
  theme_fun = theme_vanilla,
  background.color = "#EFEFEF"
)

all_results <- fread("Data/all_results.csv")

shared_combos <- fread("Data/shared_unique_combinations.csv")
shared_combos[, unique_samples := paste(cpd_name, cell_name, sep = "_")]

# Restrict to the drug/cell-line pairs listed in shared_combos
all_results[, unique_samples := paste(cpd_name, cell_name, sep = "_")]
all_results <- all_results[unique_samples %in% shared_combos$unique_samples]

# Upper target range only, and no models trained with the data bottleneck
all_results <- all_results[TargetRange == "Target Above 0.7" &
                             bottleneck != "With Data Bottleneck"]
all_results[, bottleneck := NULL]

# RMSE over all predictions of each configuration ...
all_results[, loss_by_config := rmse(target, predicted),
            by = .(data_types, merge_method, loss_type,
                   drug_type, split_method, TargetRange, Targeted)]
# ... then collapse to one row per configuration
all_results <- unique(all_results[, .(data_types, merge_method, loss_type,
                                      drug_type, split_method, TargetRange,
                                      Targeted, loss_by_config)])
gc()

# Assign Model Names
# Each (merge_method, loss_type, drug_type) combination below maps to one
# display name; rows matching none of them keep model_name = NA
# Baseline and Elastic Net
all_results[merge_method == "Merge By Early Concat" &
              loss_type == "Base Model" &
              drug_type == "Base Model",
            model_name := "ElasticNet"]
all_results[merge_method == "Base Model" &
              loss_type == "Base Model" &
              drug_type == "Base Model",
            model_name := "Baseline"]
# Single Technique
all_results[merge_method == "Base Model" &
              loss_type == "Base Model + LDS" &
              drug_type == "Base Model",
            model_name := "LDS"]
all_results[merge_method == "Base Model + LMF" &
              loss_type == "Base Model" &
              drug_type == "Base Model",
            model_name := "LMF"]
all_results[merge_method == "Base Model + Sum" &
              loss_type == "Base Model" &
              drug_type == "Base Model",
            model_name := "Sum"]
all_results[merge_method == "Base Model" &
              loss_type == "Base Model" &
              drug_type == "Base Model + GNN",
            model_name := "GNN"]
# Two Techniques
all_results[merge_method == "Base Model + LMF" &
              loss_type == "Base Model + LDS" &
              drug_type == "Base Model",
            model_name := "LDS+LMF"]
all_results[merge_method == "Base Model + LMF" &
              loss_type == "Base Model" &
              drug_type == "Base Model + GNN",
            model_name := "LMF+GNN"]
all_results[merge_method == "Base Model" &
              loss_type == "Base Model + LDS" &
              drug_type == "Base Model + GNN",
            model_name := "LDS+GNN"]
# Three Techniques
all_results[merge_method == "Base Model + Sum" &
              loss_type == "Base Model + LDS" &
              drug_type == "Base Model + GNN",
            model_name := "LDS+Sum+GNN"]
all_results[merge_method == "Base Model + LMF" &
              loss_type == "Base Model + LDS" &
              drug_type == "Base Model + GNN",
            model_name := "Trifecta"]

# Sanity checks: number of unnamed rows, the unnamed rows themselves, and the
# number of rows per model name
sum(is.na(all_results$model_name))
all_results[is.na(model_name)]
table(all_results$model_name)

# Lowest RMSE within each (data type, split method, drug class) group
all_results[, best_cv_mean := min(loss_by_config),
            by = .(data_types, split_method, Targeted)]

# Keep only the configurations that reach that per-group minimum
all_results <- all_results[loss_by_config == best_cv_mean]
setnames(all_results, 1L, "Omic Type(s)")
all_results[, variable := paste(split_method, Targeted, TargetRange, sep = "_")]

# Of those, keep only the rows holding the lowest RMSE of each
# (split method, drug class) group
all_results[, best_overall := min(loss_by_config),
            by = .(split_method, Targeted)]
all_results <- all_results[loss_by_config == best_overall]

# Cell text: omic combination, model name and rounded RMSE on separate lines
all_results[, loss_by_config := round(loss_by_config, 3)]
all_results[, Result := as.character(loss_by_config)]
all_results[, "Omic Type(s)" := gsub("_", "+", `Omic Type(s)`)]
all_results[, Result := paste(`Omic Type(s)`, model_name, Result, sep = "\n")]

# Everything except split_method, TargetRange, Targeted and Result is dropped
all_results[, c("loss_by_config", "variable", "best_overall", "best_cv_mean",
                "model_name", "Omic Type(s)", "merge_method", "loss_type",
                "drug_type") := NULL]

all_results <- all_results[split_method != "Split By Cancer Type"]
# Spread to wide format: one column per split method / drug class / range
all_results_wide <- dcast(all_results,
                          ... ~ split_method + Targeted + TargetRange,
                          value.var = "Result")
# colnames(all_results_wide)[4] <- "Result"
# Build the three header rows from the wide column names, which dcast composed
# as "<split method>_<Targeted>_<TargetRange>"; the first column is skipped
header_id_colnames <- colnames(all_results_wide)[-1]
header_split <- stringr::str_split(header_id_colnames, "_", simplify = TRUE)
cur_header <- data.frame('header_id' = header_id_colnames,
                         'Var1' = header_split[, 1],  # Grouping method
                         'Var2' = header_split[, 2],  # Targeted or Untargeted
                         'Var3' = header_split[, 3],  # AAC range
                         stringsAsFactors = FALSE)
# Header entry for the leading "Omic Type(s)" column key.
# NOTE(review): the "Omic Type(s)" column was removed from all_results before
# the dcast above, so this key presumably has no matching data column - confirm
# the leading column of the rendered table is intended
no_header <- data.table(header_id = "Omic Type(s)",
                        Var1 = "Omic Type(s)", Var2 = "Omic Type(s)",
                        Var3 = "Omic Type(s)")
df_header <- rbind(no_header, cur_header)

ft <- flextable(all_results_wide, col_keys = df_header$header_id)

# setup_bold_table() is defined elsewhere in this file; unlike the other table
# sections, no bold_df is passed here
final_ft <- setup_bold_table(ft, df_header)

final_ft <- autofit(final_ft)
# recursive: also create "Plots" if missing; showWarnings: silent on re-runs
dir.create("Plots/CV_Tables", recursive = TRUE, showWarnings = FALSE)
read_docx() %>% 
  body_add_flextable(value = final_ft) %>% 
  print(target = "Plots/CV_Tables/Best_Model_By_SplitMethod_table.docx")


# Baseline  ElasticNet         GNN         LDS     LDS+GNN     LDS+LMF LDS+LMF+GNN LDS+Sum+GNN         LMF     LMF+GNN 
# 25397897      419855     6113536     6679193     4872726     4882025    31924617     2475755     5054873     4690297 
# Sum 
# 4882025 

# all_results$loss_type <- NULL
# all_results$merge_method <- NULL
# all_results$drug_type <- NULL
gc()


# Per-sample comparison: for every drug / cell line / split method, find the
# lowest RMSELoss among all rows and keep the rows that achieve it.
# NOTE(review): when this file is run top to bottom, the preceding section has
# already collapsed all_results to per-configuration summaries and then reduced
# it to split_method, TargetRange, Targeted and Result, so RMSELoss, cpd_name,
# cell_name, target, model_name and data_types are not present at this point.
# This section presumably expects the per-sample table with model_name
# assigned - confirm and reload all_results before running it.
all_results[, best_loss := min(RMSELoss), by = c("cpd_name", "cell_name", "split_method")]

# Distribution of the target values (printed for inspection)
quantile(all_results$target)
# Keep the rows whose loss equals the per-sample minimum
all_the_best <- all_results[RMSELoss == best_loss]

# Number of distinct drug / cell line pairs (value noted from a previous run)
uniqueN(all_the_best[, c("cpd_name", "cell_name")])  # 309,594

# Empirical CDF of the target over distinct drug / cell line / target rows
sub <- unique(all_the_best[, c("cpd_name", "cell_name", "target")])
cur_quantile_func <- ecdf(unlist(sub[, "target", with = F]))

# Fraction of targets at or below 0.7 (value noted from a previous run)
cur_quantile_func(0.7)  # 98.6th percentile...
quantile(sub$target)

# Output noted from a previous run:
# 0%        25%        50%        75%       100% 
# 0.00000000 0.02496015 0.09060940 0.21159975 0.99529000 

table(all_the_best$split_method)
table(all_the_best$model_name)

## Entire range results ====
# Within each split method, the share of best rows attributable to each
# model/data combination, each model and each data type
all_the_best[, model_and_data := paste0(model_name, "-", data_types)]
all_the_best[, group_n := .N, by = "split_method"]
all_the_best[, model_and_data_freq := .N / group_n, by = c("split_method", "model_and_data")]
all_the_best[, model_freq := .N / group_n, by = c("split_method", "model_name")]
all_the_best[, data_freq := .N / group_n, by = c("split_method", "data_types")]

unique_sub <- unique(all_the_best[, c("split_method", "model_and_data_freq",
                        "model_name", "data_types", "model_and_data",
                        "model_freq", "data_freq")])

# Exclude the cancer-type split, then reduce to split_method, model_name and
# model_freq for the model-frequency plot
unique_sub <- unique_sub[split_method != "Split By Cancer Type"]
unique_sub$model_and_data <- NULL
unique_sub$model_and_data_freq <- NULL

unique_sub$data_types <- NULL
unique_sub$data_freq <- NULL


unique_sub <- unique(unique_sub)
best_molten <- melt(unique_sub,
     id.vars = c("split_method", "model_name"), 
     measure.vars = "model_freq")

table(best_molten$variable)

# tidytext provides reorder_within(), used to order bars within each facet
require(tidytext)

# Plot top model types: one facet per split method, bars ordered by
# decreasing frequency, model names drawn as rotated labels above the bars
ggplot(data = best_molten) +
  geom_bar(mapping = aes(x = reorder_within(x = model_name, by = -value, within = split_method), y = value, fill = model_name),
           stat = "identity", position = position_dodge2(width = 0.9, preserve = "single"),
           show.legend = F) +
  facet_wrap(~split_method, scales = "free_x") +
  geom_text(position = position_dodge2(width = 0.9, preserve = "single"),
            aes(x = reorder_within(x = model_name, by = -value, within = split_method), y = value+0.01, label=model_name, hjust=0), angle=90) +
  scale_y_continuous(limits = c(0, 0.65), breaks = seq(0, .65, by = 0.05)) +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank()) +
  ylab("Frequency") +
  xlab("Model Name")

# Saves the plot drawn last (ggsave's default)
ggsave("Plots/CV_Results/Best_Model_Frequency_by_Split_Method_BarPlot.pdf")

# Frequency with which each omic data type combination appears among the best
# rows, per split method. Only split_method, data_types and data_freq are
# needed, so they are selected directly; the cancer-type split is excluded
unique_sub <- unique(all_the_best[split_method != "Split By Cancer Type",
                                  c("split_method", "data_types", "data_freq")])

best_molten <- melt(unique_sub,
                    id.vars = c("split_method", "data_types"), 
                    measure.vars = "data_freq")

# Plot top data types: one facet per split method, bars ordered by decreasing
# frequency, data type names drawn as rotated labels above the bars
ggplot(data = best_molten) +
  geom_bar(mapping = aes(x = reorder_within(x = data_types, by = -value, within = split_method), y = value, fill = data_types),
           stat = "identity", position = position_dodge2(width = 0.9, preserve = "single"),
           show.legend = FALSE) +
  facet_wrap(~split_method, scales = "free_x") +
  geom_text(position = position_dodge2(width = 0.9, preserve = "single"),
            aes(x = reorder_within(x = data_types, by = -value, within = split_method), y = value+0.01, label=data_types, hjust=0), angle=90) +
  scale_y_continuous(limits = c(0, 0.65), breaks = seq(0, .65, by = 0.05)) +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank()) +
  ylab("Frequency") +
  # The x axis shows data type combinations, not model names
  xlab("Data Type(s)")

# Saves the plot drawn last (ggsave's default)
ggsave("Plots/CV_Results/Best_Data_Type_Frequency_by_Split_Method_BarPlot.pdf",
       width = 20)


## Upper Range Results ====
### Targeted Results ====
# Samples with target >= 0.7 that were treated with a targeted drug.
# NOTE(review): when this file is run top to bottom, all_results has been
# reduced to split_method, TargetRange, Targeted and Result by an earlier
# section, so target, RMSELoss, cpd_name, cell_name, model_name and data_types
# are not present; this presumably expects the per-sample table - confirm.
upper_aac_results <- all_results[target >= 0.7 & Targeted == "Targeted Drug"]
# Lowest RMSELoss for each drug / cell line / split method
upper_aac_results[, best_loss := min(RMSELoss),
                  by = c("cpd_name", "cell_name", "split_method", "Targeted")]

# Keep the rows whose loss equals the per-sample minimum
upper_all_the_best <- upper_aac_results[RMSELoss == best_loss]

table(upper_all_the_best$split_method)
table(upper_all_the_best$model_name)

# Within each split method, the share of best rows attributable to each model
# and to each data type combination
upper_all_the_best[, model_and_data := paste0(model_name, "-", data_types)]
upper_all_the_best[, group_n := .N, by = "split_method"]
upper_all_the_best[, model_freq := .N / group_n, by = c("split_method", "model_name")]
upper_all_the_best[, data_freq := .N / group_n, by = c("split_method", "data_types")]

# Only split_method, model_name and model_freq are needed for this plot, so
# they are selected directly (the previous version also tried to delete a
# model_and_data column that was never selected, which triggers a data.table
# warning); the cancer-type split is excluded
unique_sub <- unique(upper_all_the_best[split_method != "Split By Cancer Type",
                                        c("split_method", "model_name", "model_freq")])

best_molten <- melt(unique_sub,
                    id.vars = c("split_method", "model_name"), 
                    measure.vars = "model_freq")

table(best_molten$variable)

# Plot top model types: one facet per split method, bars ordered by
# decreasing frequency, model names drawn as rotated labels above the bars
ggplot(data = best_molten) +
  geom_bar(mapping = aes(x = reorder_within(x = model_name, by = -value, within = split_method), y = value, fill = model_name),
           stat = "identity", position = position_dodge2(width = 0.9, preserve = "single"),
           show.legend = FALSE) +
  facet_wrap(~split_method, scales = "free_x") +
  geom_text(position = position_dodge2(width = 0.9, preserve = "single"),
            aes(x = reorder_within(x = model_name, by = -value, within = split_method), y = value+0.01, label=model_name, hjust=0), angle=90) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.05)) +
  theme(text = element_text(size = 14, face = "bold"),
        axis.text.x = element_blank(),
        axis.ticks = element_blank()) +
  ylab("Frequency") +
  xlab("Model Name")

# Saves the plot drawn last (ggsave's default)
ggsave("Plots/CV_Results/Best_Model_Frequency_by_Split_Method_UpperAAC_0.7_Targeted_BarPlot.pdf")

# Frequency with which each omic data type combination appears among the best
# rows, per split method. Only split_method, data_types and data_freq are
# needed, so they are selected directly; the cancer-type split is excluded
unique_sub <- unique(upper_all_the_best[split_method != "Split By Cancer Type",
                                        c("split_method", "data_types", "data_freq")])

best_molten <- melt(unique_sub,
                    id.vars = c("split_method", "data_types"), 
                    measure.vars = "data_freq")
# Keep at most the 1000 most frequent data type combinations per split method
best_molten <- best_molten[order(-value), head(.SD, 1000), by = "split_method"]
# Fix the facet order
best_molten[, split_method := factor(split_method,
                                     levels = c("Split By Cell Line",
                                                "Split By Drug Scaffold",
                                                "Split By Both Cell Line & Drug Scaffold"))]

# Plot top data types: one facet row per split method, bars ordered by
# decreasing frequency, data type names drawn as rotated labels above the bars
ggplot(data = best_molten) +
  geom_bar(mapping = aes(x = reorder_within(x = data_types, by = -value, within = split_method), y = value, fill = data_types),
           stat = "identity", position = position_dodge2(width = 0.9, preserve = "single"),
           show.legend = FALSE) +
  facet_wrap(~split_method, scales = "free_x", ncol = 1) +
  geom_text(position = position_dodge2(width = 0.9, preserve = "single"),
            aes(x = reorder_within(x = data_types, by = -value, within = split_method), y = value+0.01, label=data_types, hjust=0), angle=90) +
  scale_y_continuous(limits = c(0, 0.3), breaks = seq(0, .3, by = 0.05)) +
  theme(text = element_text(size = 20, face = "bold"),
        axis.text.x = element_blank(),
        axis.ticks = element_blank()) +
  ylab("Frequency") +
  # The x axis shows data type combinations, not model names
  xlab("Data Type(s)")

# Saves the plot drawn last (ggsave's default)
ggsave("Plots/CV_Results/Best_Data_Type_Frequency_by_Split_Method_UpperAAC_0.7_Targeted_BarPlot.pdf",
       height = 20, width = 16)

### Untargeted Drugs ====
# Restrict to untargeted drugs with a target value of at least 0.7
upper_aac_results <- all_results[target >= 0.7 & Targeted == "Untargeted Drug"]

# Lowest RMSE loss for every drug / cell line / split method combination
upper_aac_results[
  ,
  best_loss := min(RMSELoss),
  by = .(cpd_name, cell_name, split_method, Targeted)
]

# Keep only the rows that achieve that lowest loss (ties are all retained)
upper_all_the_best <- upper_aac_results[RMSELoss == best_loss]
# colnames(bimodal_results_copy)[1] <- "Data Type(s)"
# bimodal_results_copy[, variable := paste(split_method, Targeted, TargetRange, sep = "_")]

table(upper_all_the_best$split_method)
table(upper_all_the_best$model_name)

# Combined "<model>-<data types>" identifier
upper_all_the_best[, model_and_data := paste(model_name, data_types, sep = "-")]

# Number of winning rows per split method; denominator for the frequencies
upper_all_the_best[, group_n := .N, by = .(split_method)]
# upper_all_the_best[, model_and_data_freq := .N / group_n, by = c("split_method", "model_and_data")]

# Share of winning rows per model and per data-type combination,
# relative to the split method's total
upper_all_the_best[, model_freq := .N / group_n, by = .(split_method, model_name)]
upper_all_the_best[, data_freq := .N / group_n, by = .(split_method, data_types)]

# Model frequency among the best models, one facet per split method.
# Select exactly the columns this plot needs. The previous version selected
# five columns and then tried to delete `model_and_data`, which was never part
# of the selection, so data.table warned about removing a non-existent column.
unique_sub <- unique(
  upper_all_the_best[split_method != "Split By Cancer Type",
                     c("split_method", "model_name", "model_freq")]
)

# Long format: `variable` is always "model_freq", `value` holds the frequency
best_molten <- melt(unique_sub,
                    id.vars = c("split_method", "model_name"),
                    measure.vars = "model_freq")

table(best_molten$variable)

# Plot top model types
ggplot(data = best_molten) +
  geom_bar(mapping = aes(x = reorder_within(x = model_name, by = -value, within = split_method),
                         y = value, fill = model_name),
           stat = "identity",
           position = position_dodge2(width = 0.9, preserve = "single"),
           show.legend = FALSE) +
  facet_wrap(~split_method, scales = "free_x") +
  # x axis text is blanked below, so each bar is labelled with its model name
  geom_text(position = position_dodge2(width = 0.9, preserve = "single"),
            aes(x = reorder_within(x = model_name, by = -value, within = split_method),
                y = value + 0.01, label = model_name, hjust = 0),
            angle = 90) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.05)) +
  theme(text = element_text(size = 14, face = "bold"),
        axis.text.x = element_blank(),
        axis.ticks = element_blank()) +
  ylab("Frequency") +
  xlab("Model Name")

ggsave("Plots/CV_Results/Best_Model_Frequency_by_Split_Method_UpperAAC_0.7_Untargeted_BarPlot.pdf")

# Data-type frequency among the best models, one facet per split method.
# Only the three columns needed for this plot are selected, so no columns have
# to be removed afterwards; "Split By Cancer Type" is excluded from the plot.
unique_sub <- unique(
  upper_all_the_best[split_method != "Split By Cancer Type",
                     c("split_method", "data_types", "data_freq")]
)

# Long format: `variable` is always "data_freq", `value` holds the frequency
best_molten <- melt(unique_sub,
                    id.vars = c("split_method", "data_types"),
                    measure.vars = "data_freq")

# Sort by decreasing frequency and keep at most 1000 rows per split method
best_molten <- best_molten[order(-value), head(.SD, 1000), by = "split_method"]

# Plot top data types
# Fix the facet order; any split method not listed here would become NA.
best_molten[, split_method := factor(split_method,
                                     levels = c("Split By Cell Line",
                                                "Split By Drug Scaffold",
                                                "Split By Both Cell Line & Drug Scaffold"))]

ggplot(data = best_molten) +
  geom_bar(mapping = aes(x = reorder_within(x = data_types, by = -value, within = split_method),
                         y = value, fill = data_types),
           stat = "identity",
           position = position_dodge2(width = 0.9, preserve = "single"),
           show.legend = FALSE) +
  facet_wrap(~split_method, scales = "free_x", ncol = 1) +
  # x axis text is blanked below, so each bar is labelled with its data type(s)
  geom_text(position = position_dodge2(width = 0.9, preserve = "single"),
            aes(x = reorder_within(x = data_types, by = -value, within = split_method),
                y = value + 0.01, label = data_types, hjust = 0),
            angle = 90) +
  scale_y_continuous(limits = c(0, 0.3), breaks = seq(0, .3, by = 0.05)) +
  theme(text = element_text(size = 20, face = "bold"),
        axis.text.x = element_blank(),
        axis.ticks = element_blank()) +
  ylab("Frequency") +
  # The x axis shows data types, not models (label was copied from the model plot)
  xlab("Data Type(s)")

ggsave("Plots/CV_Results/Best_Data_Type_Frequency_by_Split_Method_UpperAAC_0.7_Untargeted_BarPlot.pdf",
       height = 20, width = 16)


# Winning-model counts per split method
table(all_the_best[split_method == "Split By Cell Line"]$model_name)
table(all_the_best[split_method == "Split By Both Cell Line & Drug Scaffold"]$model_name)
table(all_the_best[split_method == "Split By Cancer Type"]$model_name)
table(all_the_best[split_method == "Split By Drug Scaffold"]$model_name)

# Draw a pie chart of model frequencies for one split method.
#
# results:     data.table with columns split_method, model_name, model_freq
# split_label: value of split_method to plot; also used as the panel title
#
# Each slice is labelled "<model_name> (<percent>%)", where percent is the
# slice's share of the summed frequencies, rounded to one decimal.
# Returns the de-duplicated frequency table invisibly. If no rows match, a
# warning is raised and nothing is drawn (pie() itself would error on empty
# input).
draw_model_freq_pie <- function(results, split_label) {
  freqs <- unique(results[split_method == split_label,
                          c("model_name", "model_freq")])
  if (nrow(freqs) == 0L) {
    warning("No rows for split method '", split_label, "'; pie chart skipped",
            call. = FALSE)
    return(invisible(freqs))
  }
  percent <- round(100 * freqs$model_freq / sum(freqs$model_freq), digits = 1)
  slice_labels <- paste0(freqs$model_name, " (", percent, "%)")
  pie(freqs$model_freq, labels = slice_labels, col = rainbow(nrow(freqs)),
      main = split_label)
  invisible(freqs)
}

# Kept so cowplot stays attached for any later code that relies on it.
require(cowplot)

# graphics::pie() draws straight to the active device and returns NULL, so its
# result cannot be stored and handed to cowplot::plot_grid() (that produced an
# empty grid). The three pies are laid out side by side with base graphics
# instead, and the previous layout is restored afterwards.
old_par <- par(mfrow = c(1, 3))
draw_model_freq_pie(all_the_best, "Split By Both Cell Line & Drug Scaffold")
draw_model_freq_pie(all_the_best, "Split By Cell Line")
draw_model_freq_pie(all_the_best, "Split By Drug Scaffold")
par(old_par)

# Exploratory pies for the "Split By Cell Line" results ----
table(all_the_best$model_and_data)

# Frequency of each model + data-type combination
cur_freqs <- unique(all_the_best[split_method == "Split By Cell Line",
                                 c("model_and_data", "model_and_data_freq")])
cols <- rainbow(nrow(cur_freqs))
pie(cur_freqs$model_and_data_freq, labels = cur_freqs$model_and_data, col = cols)

# Frequency of each model
table(all_the_best[split_method == "Split By Cell Line"]$model_name)
cur_freqs <- unique(all_the_best[split_method == "Split By Cell Line",
                                 c("model_name", "model_freq")])
cols <- rainbow(nrow(cur_freqs))
pie(cur_freqs$model_freq, labels = cur_freqs$model_name, col = cols)

# Frequency of each data-type combination
cur_freqs <- unique(all_the_best[split_method == "Split By Cell Line",
                                 c("data_types", "data_freq")])
cols <- rainbow(nrow(cur_freqs))
pie(cur_freqs$data_freq, labels = cur_freqs$data_types, col = cols)
# MUT is, surprisingly, a good data type?
all_the_best[data_types == "MUT"]


# Same combination frequencies for the "both" split. The frequency column is
# `model_and_data_freq`, matching the cell-line snippet above; the previous
# name `freq` is not used by any other statement here.
# NOTE(review): all_the_best is built elsewhere - confirm no separate `freq`
# column was intended. No pie is drawn for this subset.
cur_freqs <- unique(all_the_best[split_method == "Split By Both Cell Line & Drug Scaffold",
                                 c("model_and_data", "model_and_data_freq")])
cols <- rainbow(nrow(cur_freqs))


# Inspect the winning rows per split method
all_the_best[split_method == "Split By Drug Scaffold"]
all_the_best[split_method == "Split By Cell Line"]
all_the_best[split_method == "Split By Cancer Type"]

# Find the best model overall for grouping method and drug type
# `best_overall` = lowest cv_mean within each split method / Targeted group
bimodal_results_copy[, best_overall := min(cv_mean), by = .(split_method, Targeted)]

# Rows that reach that minimum (ties are all retained)
bolf_df <- bimodal_results_copy[cv_mean == best_overall]

# One row per data-type / variable combination, with the rounded best value
# stored under the name `best_cv_mean`
bold_df <- unique(bolf_df[, c("Data Type(s)", "variable", "best_overall")])
bold_df[, best_overall := round(best_overall, 3)]
setnames(bold_df, old = "best_overall", new = "best_cv_mean")

# Remove the helper columns from the source table again
bimodal_results_copy$variable <- NULL
bimodal_results_copy$best_overall <- NULL

# NOTE(review): `loss_by_config` must already exist in all_results; the
# statement that would create it is commented out here - confirm it is
# computed earlier in the script.
# all_results[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange", "Targeted", "bottleneck")]

# For each cell line and drug combination, identify the best performing model overall per Splitting Method

# Columns that identify one model configuration (everything except the fold)
config_cols <- c("data_types", "merge_method", "loss_type", "drug_type",
                 "split_method", "TargetRange", "Targeted", "bottleneck")
# Same columns with "fold" inserted after "split_method"
fold_cols <- append(config_cols, "fold", after = 5L)

# One row per configuration and fold, melted so the per-fold loss is in `value`
all_results_long_copy <- melt(
  unique(all_results[, c(fold_cols, "loss_by_config"), with = FALSE]),
  id.vars = fold_cols
)

# rm(all_results)
# gc()

# Mean and standard deviation of the per-fold loss for every configuration
all_results_long_copy[
  ,
  `:=`(cv_mean = mean(value), cv_sd = sd(value)),
  by = config_cols
]

# Drop the per-fold columns and collapse to one row per configuration
all_results_long_copy[, c("value", "variable", "fold") := NULL]
all_results_long_copy <- unique(all_results_long_copy)

# Configurations whose data_types string is longer than 11 characters.
# The subset is a new table, so the by-reference updates below do not touch
# all_results_long_copy.
quadmodal_results <- all_results_long_copy[nchar(data_types) > 11]
table(quadmodal_results$data_types)
quadmodal_results[data_types == "CNV_EXP_METAB"]
quadmodal_results[data_types == "CNV_EXP_PROT"]
quadmodal_results[data_types == "CNV_EXP_PROT_METAB"]
quadmodal_results[data_types == "CNV_EXP_PROT_MIRNA_METAB_HIST_RPPA"]
quadmodal_results[data_types == "MUT_CNV_EXP_PROT_MIRNA_METAB_HIST_RPPA"]

# Shorten the configuration labels.
# `DT[i, col := value]` updates matching rows in place; the previous
# `DT[i]$col <- value` form copied the whole table for every statement.
# A statement whose label does not occur in the column changes nothing.
# NOTE(review): some source labels ("Base Model + LMF", "UnBase Model + LDS",
# "Base Model + GNN", "Split By Both") look unlike the labels used elsewhere in
# this script (e.g. "Split By Both Cell Line & Drug Scaffold") - confirm they
# match the actual column values, otherwise those rows keep their long labels.
quadmodal_results[merge_method == "Merge By Concat", merge_method := "Concat"]
quadmodal_results[merge_method == "Merge By Sum", merge_method := "Sum"]
quadmodal_results[merge_method == "Base Model + LMF", merge_method := "LMF"]
quadmodal_results[loss_type == "UnBase Model + LDS", loss_type := "non-LDS"]
quadmodal_results[loss_type == "Base Model + LDS", loss_type := "LDS"]
quadmodal_results[drug_type == "1024-bit ECFP", drug_type := "ECFP"]
quadmodal_results[drug_type == "Base Model + GNN", drug_type := "GNN"]
quadmodal_results[split_method == "Split By Both", split_method := "Group Both"]
quadmodal_results[split_method == "Split By Drug Scaffold", split_method := "Group Scaffold"]
quadmodal_results[split_method == "Split By Cell Line", split_method := "Group Cell"]
quadmodal_results[TargetRange == "Target Above 0.7", TargetRange := ">= 0.7"]
quadmodal_results[TargetRange == "Target Below 0.7", TargetRange := "< 0.7"]
quadmodal_results[Targeted == "Targeted Drug", Targeted := "Targeted"]
quadmodal_results[Targeted == "Untargeted Drug", Targeted := "Untargeted"]
quadmodal_results[bottleneck == "No Data Bottleneck", bottleneck := "No Bottleneck"]
quadmodal_results[bottleneck == "With Data Bottleneck", bottleneck := "With Bottleneck"]