Switch to unified view

a b/R/05_All_Comparison_Plots.R
1
# Targeted_vs_Broad_Drugs.R
2
require(data.table)
3
setDTthreads(8)
4
require(ggplot2)
5
library(dplyr)
6
targeted_drugs <- c("Idelalisib", "Olaparib", "Venetoclax", "Crizotinib", "Regorafenib", 
7
                    "Tretinoin", "Bortezomib", "Cabozantinib", "Dasatinib", "Erlotinib", 
8
                    "Sonidegib", "Vandetanib", "Axitinib", "Ibrutinib", "Gefitinib", 
9
                    "Nilotinib", "Tamoxifen", "Bosutinib", "Pazopanib", "Lapatinib", 
10
                    "Dabrafenib", "Bexarotene", "Temsirolimus", "Belinostat", 
11
                    "Sunitinib", "Vorinostat", "Trametinib", "Fulvestrant", "Sorafenib", 
12
                    "Vemurafenib", "Alpelisib")
13
14
# mysubset <- function(df, ...) {
15
#   ssubset <- deparse(substitute(...))
16
#   subset(df, eval(parse(text = ssubset)))
17
# }
18
19
dodge2 <- position_dodge2(width = 0.9, padding = 0)
20
rsq <- function (x, y) cor(x, y, method = "pearson") ^ 2
21
rmse <- function(x, y) sqrt(mean((x - y)^2))
22
mae <- function(x, y) mean(abs(x - y))
23
# Moving average
24
ma <- function(x, n = 5) filter(x, rep(1 / n, n), sides = 2)
25
26
# install.packages("ggrepel")
27
# require(ggrepel)
28
my_plot_function <- function(avg_loss_by, sub_results_by, fill_by, data_order, bar_level_order,
29
                             facet_by, facet_level_order, facet_nrow = 2,
30
                             legend_title, y_lim = 0.1, y_lab = "Average MAE Loss",
31
                             plot_type = "bar_plot", target_sub_by = "Target Above 0.7",
32
                             cur_comparisons = NULL, test = "wilcox.test", paired = F,
33
                             calculate_avg_mae = T,
34
                             hide_outliers = F, step_increase = 0.1,
35
                             add_mean = F, min_diff = 0.05) {
36
  
37
  if (plot_type == "bar_plot") {
38
    if (calculate_avg_mae == F) {
39
      y_lab <- "Total RMSE Loss"
40
    }
41
    
42
    # all_results_long_copy <- data.table::melt(unique(all_results_copy[, c(avg_loss_by, "loss_by_config"), with = F]),
43
    #                                           id.vars = avg_loss_by)
44
    
45
    # all_results_long_copy[, cv_mean := mean(value), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
46
    # all_results_long_copy[, cv_sd := sd(value), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
47
    all_results_copy[, unique_sample := paste0(cpd_name, "_", cell_name)]
48
    shared_unique_samples <- Reduce(intersect, split(all_results_copy$unique_sample, all_results_copy$data_types))
49
    # uniqueN(shared_unique_samples)
50
    all_results_copy <- all_results_copy[unique_sample %in% shared_unique_samples]
51
    
52
    if (calculate_avg_mae == T) {
53
      all_results_copy[, cv_mean := mean(RMSELoss), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
54
      all_results_copy[, cv_sd := sd(RMSELoss), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
55
      cur_data <- unique(all_results_copy[, c(eval(avg_loss_by[!avg_loss_by %in% c("fold")]), "cv_mean", "cv_sd"), with = F])
56
    } else {
57
      # Calculate RMSE instead
58
      all_results_copy[, cv_mean := rmse(target, predicted), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
59
      cur_data <- unique(all_results_copy[, c(eval(avg_loss_by[!avg_loss_by %in% c("fold")]), "cv_mean"), with = F])
60
      # all_results_copy[, cv_sd := sd(RMSELoss), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
61
    }
62
    
63
    # ssubset <- deparse(substitute(sub_results_by))
64
    # baseline <- subset.data.table(all_results_long_copy, eval(parse(text = ssubset)))
65
    cur_data <- subset(cur_data, eval(sub_results_by))
66
    
67
    # baseline <- mysubset(all_results_long_copy, eval(sub_results_by))
68
    
69
    # Order bars the same as the error bars by changing data frame order via left join
70
    bar_level_df <- data.frame(x1 = bar_level_order)
71
    colnames(bar_level_df) <- as.character(fill_by)
72
    cur_data <- left_join(bar_level_df,  
73
                             cur_data,
74
                             by = as.character(fill_by))
75
    
76
    cur_data <- as.data.table(cur_data)
77
    if (y_lim == "full") {
78
      cur_ylim <- ylim(0, 1)
79
    } else {
80
      if (add_mean == T) {
81
        cur_ylim <- ylim(0, max(cur_data$cv_mean) + y_lim)
82
      } else {
83
        if (calculate_avg_mae == T) {
84
          cur_ylim <- ylim(0, max(cur_data$cv_mean) + max(cur_data$cv_sd) + y_lim)
85
        } else {
86
          cur_ylim <- ylim(0, max(cur_data$cv_mean) + y_lim)
87
        }
88
      }
89
    }
90
    
91
    # cur_data[, diff := abs(cv_mean - shift(cv_mean)), by = c("data_types")]
92
    p <- ggplot(cur_data)
93
    
94
    if (add_mean == T) {
95
      if (!is.null(facet_by)) {
96
        cur_data[, diff := abs(diff(cv_mean)), by = c("data_types", facet_by)]
97
        cur_data[, max_y := max(cv_mean), by = c("data_types", facet_by)]
98
        # "first higher" depends on the bar order given to the function (left to right)
99
        cur_data[, first_higher := ifelse(diff(cv_mean) < 0, T, F), by = c("data_types", facet_by)]
100
        
101
      } else {
102
        cur_data[, diff := abs(diff(cv_mean)), by = c("data_types")]
103
        cur_data[, max_y := max(cv_mean), by = "data_types"]
104
        cur_data[, first_higher := ifelse(diff(cv_mean) < 0, T, F), by = c("data_types")]
105
      }
106
      cur_data[, diff_too_small := ifelse(diff < min_diff, T, F)]
107
      
108
      p <- p + geom_text(aes(x=data_types,
109
                             label = round(cv_mean, 3), y = cv_mean),
110
                vjust = 1, hjust = -0.25, angle = 90, position = position_dodge2(width = .9)) +
111
      geom_bar(aes(x = data_types, y = max_y),
112
               stat = "identity", fill = "grey80", width = 0.4, position = "dodge") +
113
        # geom_text(data = cur_data[first_higher == T],
114
        geom_text(data = unique(cur_data[diff_too_small == F,
115
                                         c("data_types", "diff", "max_y", facet_by), with = F]),
116
                  aes(x = data_types, label = round(diff, 3), y = max_y),
117
                  vjust = 0.5, hjust = -0.25, angle = 45, color = "red")
118
        
119
    } else {
120
      if (calculate_avg_mae == T) {
121
        p <- p + geom_text(aes(x=factor(data_types, levels = bar_level_order),
122
                               label = round(cv_mean, 3), y = cv_mean + cv_sd),
123
                      vjust = 0.5, hjust = -0.25, angle = 90, position = position_dodge2(width = .9)) +
124
          geom_errorbar(aes(x=data_types,
125
                            y=cv_mean,
126
                            ymax=cv_mean + cv_sd, 
127
                            ymin=cv_mean - 0.01, col='black'),
128
                        linetype=1, show.legend = FALSE, position = dodge2, width = 0.9)
129
      } else {
130
        p <- p + geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean),
131
                           vjust = 0.5, hjust = -0.1, angle = 90, position = position_dodge2(width = .9))
132
          
133
      }
134
    }
135
      
136
      # Set bar order
137
    p <- p + geom_bar(mapping = aes(x = data_types, y = cv_mean,
138
                               fill = factor(eval(fill_by),
139
                                             levels = bar_level_order)),
140
                 stat = "identity", position="dodge", width = .9) +
141
    scale_x_discrete(limits = data_order) +
142
      scale_fill_discrete(name = legend_title) +
143
      scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
144
                                   "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
145
      theme(text = element_text(size = 14),
146
            # axis.text.x = element_text(angle = 45, hjust = 1),
147
            axis.title.x = element_blank(),
148
            # legend.position = c(.85,.85),
149
            # legend.position=c(1,1),
150
            legend.direction="horizontal",
151
            legend.position="top",
152
            legend.justification="right",
153
            # legend.justification=c(1, 0),
154
            # plot.margin = unit(c(5, 1, 0.5, 0.5), "lines")
155
            ) +
156
      # theme_gray(base_size = 14) +
157
      ylab(y_lab) +
158
      # ylim(0, max(cur_data$cv_mean) + max(cur_data$cv_sd) + 0.05) +
159
      # ylim(0, max(cur_data$cv_mean) + max(cur_data$cv_sd) + y_lim) +
160
      # ylim(0, 1) +
161
      cur_ylim
162
    
163
    if (!is.null(facet_by)) {
164
      if (length(facet_by) > 1) {
165
        for (i in 1:length(facet_by)) {
166
          # If the length is more than 1, it is assumed that facet_level_order is a list
167
          set(cur_data, j = eval(facet_by)[i], value = factor(unlist(cur_data[, as.character(facet_by)[i], with = F]),
168
                                                              levels = facet_level_order[[i]]))
169
        }
170
      } else {
171
        set(cur_data, j = as.character(facet_by), value = factor(unlist(cur_data[, as.character(facet_by), with = F]),
172
                                                                 levels = facet_level_order))
173
        
174
      }
175
      p <- p + facet_wrap(facet_by,
176
                          ncol = length(facet_level_order),
177
                          nrow = facet_nrow)
178
    }
179
    
180
    return(p)
181
    
182
  } else if (plot_type == "box_plot" | plot_type == "violin_plot") {
183
    # Subset all results
184
    require(ggpubr)
185
    all_results_subset <- subset(all_results_copy, eval(sub_results_by))
186
    
187
    # Find unique samples shared between all given models that use different data types
188
    all_results_subset[, unique_sample := paste0(cpd_name, "_", cell_name)]
189
    
190
    if (uniqueN(all_results_subset$split_method) > 1) {
191
      all_results_subset[, unique_group := paste0(data_types, "_", split_method)]
192
      shared_unique_samples <- Reduce(intersect, split(all_results_subset$unique_sample, all_results_subset$unique_group))
193
    } else {
194
      shared_unique_samples <- Reduce(intersect, split(all_results_subset$unique_sample, all_results_subset$data_types))
195
    }
196
    # shared_unique_samples <- intersect(shared_unique_samples_by_data_types, shared_unique_samples_by_split_method)
197
    all_results_subset <- all_results_subset[unique_sample %in% shared_unique_samples]
198
    # uniqueN(all_results_subset)  # 2003392 for cell line and drug scaffold, 2191444 for all 3
199
    # all_results_subset[data_types == "PROT" & split_method == "Split By Cell Line"]
200
    # all_results_subset[data_types == "PROT" & split_method == "Split By Drug Scaffold"]
201
    # all_results_subset[data_types == "PROT" & split_method == "Split By Both Cell Line & Drug Scaffold"]
202
    if (length(target_sub_by) == 1) {
203
      all_results_sub_sub <- all_results_subset[TargetRange == target_sub_by]
204
    } else {
205
      all_results_sub_sub <- all_results_subset[TargetRange %in% target_sub_by]
206
    }
207
    # Order data for the facet
208
    all_results_sub_sub[, data_types := factor(data_types, levels = data_order)]
209
    all_results_sub_sub[, as.character(fill_by) := factor(unlist(all_results_sub_sub[, as.character(fill_by), with = F]),
210
                                                          levels = bar_level_order)]
211
    # all_results_sub_sub[, cv_mean := mean(RMSELoss), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
212
    # all_results_sub_sub[, cv_sd := sd(RMSELoss), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
213
    
214
    if (paired == T) {
215
      # Set order within each group by the unique ID, so that each group has the same order (for pairing?)
216
      setorder(all_results_sub_sub, data_types, unique_sample)
217
      # uniqueN(all_results_sub_sub) / 8
218
      
219
      # table(all_results_sub_sub[split_method == "Split By Drug Scaffold"]$data_types)
220
      # table(all_results_sub_sub$data_types)
221
      # # Confirm:
222
      # all_results_sub_sub[, head(unique_sample,2),by=data_types]
223
      
224
    }
225
    if (plot_type == "box_plot") {
226
      p <- ggboxplot(data = all_results_sub_sub, x = as.character(fill_by),
227
                     y = "RMSELoss", color = as.character(fill_by),
228
                     outlier.shape = ifelse(hide_outliers, NA, 19))
229
    } else {
230
      p <- ggviolin(data = all_results_sub_sub, x = as.character(fill_by),
231
                    y = "RMSELoss", color = as.character(fill_by),
232
                    draw_quantiles = 0.5,
233
                    # add = "mean_range"
234
                    # add = "boxplot"
235
                    )
236
    }
237
    p <- set_palette(p, "jco")
238
    p <- facet(p = p, facet.by = facet_by, nrow = 1, strip.position = "bottom") +
239
      theme(
240
        # axis.text.x = element_text(angle = 45, hjust = 1),
241
        axis.text.x = element_blank(),
242
        axis.ticks.x =  element_blank(),
243
        axis.title.x = element_blank(),
244
        text = element_text(size = 14)
245
            ) +
246
      labs(color = legend_title) +
247
      ylab(y_lab) +
248
      scale_y_continuous(breaks = seq(0, 1, 0.2))
249
      
250
    # if (plot_difference == T) {
251
    #   p + geom_text(data = unique(all_results_sub_sub[, c("data_types", "cv_mean", "cv_sd")]),
252
    #                      aes(x = data_types,
253
    #                            label = round(cv_mean, 3),
254
    #                            y = cv_mean + cv_sd),
255
    #             vjust = 0.5, hjust = -0.25, angle = 90, position = position_dodge2(width = .9))
256
      # p + annotate("text", x=0.1, y=0.1, label= "boat")
257
      
258
    # }
259
    
260
    if (!is.null(cur_comparisons)) {
261
      if (test == "ks.test") {
262
        # facet_by
263
        # all_results_sub_sub[eval(fill_by) == cur_comparisons[[i]][1]]$RMSELoss
264
        # bar_level_order
265
        all_stats <- vector("list", length = length(cur_comparisons))
266
        for (i in 1:length(cur_comparisons)) {
267
          all_results_sub_sub[eval(fill_by) %in% cur_comparisons[[i]], c("ks_D", "ks_p") := ks.test(x = .SD[eval(fill_by) == cur_comparisons[[i]][1]]$RMSELoss,
268
                                                             y = .SD[eval(fill_by) == cur_comparisons[[i]][2]]$RMSELoss,
269
                                                             alternative = "two.sided")[1:2], by = facet_by]
270
          cur_stat <- unique(all_results_sub_sub[!is.na(ks_D), c(facet_by, as.character(fill_by), "ks_D", "ks_p"), with = F])
271
          all_results_sub_sub$ks_D <- NULL
272
          all_results_sub_sub$ks_p <- NULL
273
          
274
          cur_stat[, ks_D := round(ks_D, 3)]
275
          cur_stat[, ks_p := round(ks_p, 3)]
276
          temp <-  melt(cur_stat, id.vars = c(facet_by, "ks_D", "ks_p"))
277
          
278
          dcast_formula <- as.formula(paste0(paste(facet_by, collapse=" + "), " + ks_D + ks_p ~ value"))
279
          final_stat <- dcast(temp, formula = dcast_formula)
280
          
281
          col_pos <- (length(facet_by) + 2 + 1)
282
          colnames(final_stat)[col_pos:(col_pos+1)] <- c("group1", "group2")
283
          all_stats[[i]] <- final_stat
284
        }
285
        all_stats <- rbindlist(all_stats)
286
        p <- p + stat_pvalue_manual(
287
          # data = all_stats, label = "KS D: {ks_D}", y.position = 1, step.increase = step_increase
288
          data = all_stats, label = "D = {ks_D}\np: {ks_p}", y.position = 1,
289
          step.group.by = facet_by[length(facet_by)], step.increase = step_increase,
290
        )
291
      } else {
292
        # Add pairwise comparisons p-value
293
        p <- p + stat_compare_means(comparisons = cur_comparisons,
294
                                method = test,
295
                                method.args = list(alternative = "two.sided"),
296
                                # label.y.npc = "top",
297
                                paired = paired)
298
        # compare_means(RMSELoss ~ data_types, data = all_results_sub_sub, group.by = c("data_types", "Targeted"))
299
      }
300
    }
301
  return(p)
302
  }
303
}
304
305
306
# Generate shared unique cell line and drug combinations between data specific models
307
# all_results <- fread("Data/all_results.csv")
308
309
temp <- all_results[merge_method == "Base Model" &
310
                      loss_type == "Base Model" &
311
                      drug_type == "Base Model" &
312
                      bottleneck != "With Data Bottleneck" &
313
                      nchar(data_types) <= 5]
314
table(temp$split_method)
315
all_results_subset <- subset(all_results,
316
                             (split_method == "Split By Cell Line" &
317
                                             merge_method == "Base Model" &
318
                                             loss_type == "Base Model" &
319
                                             drug_type == "Base Model" &
320
                                             bottleneck != "With Data Bottleneck" &
321
                                             nchar(data_types) <= 5))
322
all_results_subset$fold <- NULL
323
all_results_subset <- unique(all_results_subset)
324
# Find samples that are shared between all data types
325
all_results_subset[, unique_sample := paste0(cpd_name, "_", cell_name)]
326
shared_unique_samples <- Reduce(intersect, split(all_results_subset$unique_sample, all_results_subset$data_types))
327
all_results_subset <- all_results_subset[unique_sample %in% shared_unique_samples]
328
# all_results_shared_subset$unique_sample <- NULL
329
uniqueN(all_results_subset) / 8  # 125,212 samples in each model that are paired
330
table(all_results_subset$data_types)
331
332
333
# Save unique samples
334
fwrite(unique(all_results_subset[, c("cpd_name", "cell_name")]), "Data/shared_unique_combinations.csv")
335
336
337
# all_results <- fread("Data/all_results.csv")
338
# CTRPv2 Targeted vs Untargeted Therapeutics Distributions ====
339
drug_info <- fread("Data/DRP_Training_Data/CTRP_DRUG_INFO.csv")
340
341
# drug_info$gene_symbol_of_protein_target
342
# drug_info[target_or_activity_of_compound == "inhibitor of p53-MDM2 interaction"]
343
# table(targeted_drugs <- drug_info[gene_symbol_of_protein_target != "" & (cpd_status == "clinical" | cpd_status == "FDA")]$target_or_activity_of_compound)
344
# 
345
# # TODO: Get the list of targeted therapies from NCI-MATCH
346
# # Drugs with shared targets or activities
347
# drug_info[target_or_activity_of_compound == "inhibitor of BCL2, BCL-xL, and BCL-W"]
348
# drug_info[target_or_activity_of_compound == "inhibitor of BRAF"]
349
# drug_info[target_or_activity_of_compound == "inhibitor of cyclin-dependent kinases"]
350
# drug_info[target_or_activity_of_compound == "inhibitor of DNA methyltransferase"]
351
# drug_info[target_or_activity_of_compound == "inhibitor of EGFR and HER2"]
352
# drug_info[target_or_activity_of_compound == "inhibitor of gamma-secretase"]
353
# drug_info[target_or_activity_of_compound == "inhibitor of HDAC1, HDAC2, HDAC3, HDAC6, and HDAC8"]
354
# drug_info[target_or_activity_of_compound == "inhibitor of HMG-CoA reductase"]
355
# drug_info[target_or_activity_of_compound == "inhibitor of HSP90"]
356
# drug_info[target_or_activity_of_compound == "inhibitor of Janus kinases 1 and 2"]
357
# drug_info[target_or_activity_of_compound == "inhibitor of Janus kinase 2"]
358
# drug_info[target_or_activity_of_compound == "inhibitor of MEK1 and MEK2"]
359
# drug_info[target_or_activity_of_compound == "inhibitor of mTOR"]
360
# drug_info[target_or_activity_of_compound == "inhibitor of nicotinamide phosphoribosyltransferase"]
361
# drug_info[target_or_activity_of_compound == "inhibitor of PI3K and mTOR kinase activity"]
362
# drug_info[target_or_activity_of_compound == "inhibitor of polo-like kinase 1 (PLK1)"]
363
# drug_info[target_or_activity_of_compound == "inhibitor of VEGFRs"]
364
# drug_info[target_or_activity_of_compound == "inhibitor of VEGFRs, c-KIT, and PDGFR alpha and beta"]
365
366
367
table(drug_info$target_or_activity_of_compound)
368
# targeted_drugs <- drug_info[gene_symbol_of_protein_target != ""]$rn
369
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
370
# ctrp[ , mean_by_drug := mean(area_above_curve), by = "cpd_name"]
371
# ctrp[ , mean_by_cell := mean(area_above_curve), by = "ccl_name"]
372
# ctrp[, Dataset := "CTRPv2"]
373
374
375
# mean(ctrp[Targeted == T]$area_above_curve)
376
# mean(ctrp[Targeted == F]$area_above_curve)
377
ctrp[, Targeted := ifelse(cpd_name %in% targeted_drugs, "TargetedDrug", "UntargetedDrug")]
378
379
unique(ctrp[, c("cpd_name", "Targeted")])
380
unique(ctrp[Targeted == "TargetedDrug"]$cpd_name)
381
unique(ctrp[Targeted == "UntargetedDrug"]$cpd_name)
382
table(ctrp$Targeted)
383
384
ctrp[Targeted == "UntargetedDrug", Targeted := "Untargeted Drug"]
385
ctrp[Targeted == "TargetedDrug", Targeted := "Targeted Drug"]
386
colnames(ctrp)[8] <- "Drug Type"
387
388
ggplot(ctrp, aes(x = area_above_curve, colour = Targeted)) +
389
  # geom_density(bins=100) +
390
  geom_freqpoly(bins=100) +
391
  geom_vline(aes(xintercept = mean(area_above_curve)), color="blue", linetype="dashed", size=1) +
392
  geom_vline(aes(xintercept = median(area_above_curve)), color="blue", linetype="dashed", size=1) +
393
  scale_x_continuous(breaks=c(0, round(median(ctrp$area_above_curve), 3), round(mean(ctrp$area_above_curve), 3), 0.25, 0.5, 0.75, 1)) +
394
  annotate(x=mean(ctrp$area_above_curve), y=20000,label="CTRPv2 Mean",vjust=1.5,geom="text", angle = 90) + 
395
  annotate(x=median(ctrp$area_above_curve), y=20000,label="CTRPv2 Median",vjust=1.5,geom="text", angle = 90) + 
396
  ggtitle(label = "AAC Frequency Polygon for CTRPv2: Targeted vs Untargeted Drugs") +
397
  xlab("Area Above Curve") + ylab("Count")
398
399
ggsave(filename = "Plots/Dataset_Exploration/CTRP_AAC_Distribution_Targeted_vs_Untargeted.pdf")
400
401
ggplot(ctrp, aes(x = `Drug Type`, y = area_above_curve)) +
402
  geom_boxplot() +
403
  ylab("Area Above Curve")
404
  # theme(legend.position = c(.9,.85)) +
405
  # geom_vline(aes(xintercept = mean(area_above_curve)), color="blue", linetype="dashed", size=1) +
406
  # geom_vline(aes(xintercept = median(area_above_curve)), color="blue", linetype="dashed", size=1) +
407
  # scale_x_continuous(breaks=c(0, round(median(ctrp$area_above_curve), 3),
408
  #                             round(mean(ctrp$area_above_curve), 3),
409
  #                             0.25, 0.5, 0.75, 1)) +
410
  # scale_fill_discrete(name = "Drug Type:") +
411
  # annotate(x=mean(ctrp$area_above_curve), y=20000,label="CTRPv2 Mean",vjust=1.5,geom="text", angle = 90) + 
412
  # annotate(x=median(ctrp$area_above_curve), y=20000,label="CTRPv2 Median",vjust=1.5,geom="text", angle = 90) + 
413
  # ggtitle(label = "AAC Frequency Polygon for CTRPv2: Targeted vs Untargeted Drugs") +
414
415
ggsave(filename = "Plots/Dataset_Exploration/CTRP_AAC_Distribution_Targeted_vs_Untargeted_BoxPlot.pdf")
416
417
418
# ggplot(ctrp, aes(x = `Drug Type`, y = area_above_curve)) +
419
#   geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
420
#   # geom_boxplot() +
421
#   ylab("Area Above Curve")
422
423
require(ggpubr)
424
p <- ggviolin(data = ctrp, x = "Drug Type", y = "area_above_curve",
425
         add = "boxplot") +
426
  stat_compare_means(comparisons = list(c("Targeted Drug", "Untargeted Drug")),
427
                     method = "wilcox.test",
428
                     method.args = list(alternative = "two.sided")) +
429
    ylab("Area Above Curve") +
430
  xlab("") +
431
  scale_y_continuous(breaks = c(seq(0, 1, 0.2),
432
                                round(median(ctrp[`Drug Type` == "Targeted Drug"]$area_above_curve), 3),
433
                                round(median(ctrp[`Drug Type` == "Untargeted Drug"]$area_above_curve), 3))) +
434
  geom_hline(yintercept = median(ctrp[`Drug Type` == "Targeted Drug"]$area_above_curve), linetype = "dotted") +
435
  geom_hline(yintercept = median(ctrp[`Drug Type` == "Untargeted Drug"]$area_above_curve), linetype = "dotted") +
436
  theme(text = element_text(size = 18))
437
  
438
# p <- set_palette(p, "jco")
439
ggsave(plot = p, filename = "Plots/Dataset_Exploration/CTRP_AAC_Distribution_Targeted_vs_Untargeted_ViolinPlot.pdf")
440
441
## Validation Subset ====
442
require(data.table)
443
require(ggpubr)
444
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
445
drug_info <- fread("Data/DRP_Training_Data/CTRP_DRUG_INFO.csv")
446
shared_valid <- fread("Data/shared_unique_combinations.csv")
447
shared_valid[, unique_sample := paste0(cpd_name, "_", cell_name)]
448
449
ctrp[, Targeted := ifelse(cpd_name %in% targeted_drugs, "TargetedDrug", "UntargetedDrug")]
450
ctrp[Targeted == "UntargetedDrug", Targeted := "Untargeted Drug"]
451
ctrp[Targeted == "TargetedDrug", Targeted := "Targeted Drug"]
452
colnames(ctrp)[8] <- "Drug Type"
453
454
ctrp[, unique_sample := paste0(cpd_name, "_", ccl_name)]
455
456
ctrp_sub <- ctrp[unique_sample %in% shared_valid$unique_sample]
457
458
table(ctrp_sub$`Drug Type`)
459
table(ctrp$`Drug Type`)
460
# Subset CTRPv2 by shared validation samples
461
p <- ggviolin(data = ctrp_sub, x = "Drug Type", y = "area_above_curve",
462
              add = "boxplot") +
463
  stat_compare_means(comparisons = list(c("Targeted Drug", "Untargeted Drug")),
464
                     method = "wilcox.test",
465
                     method.args = list(alternative = "two.sided")) +
466
  ylab("Area Above Curve") +
467
  xlab("") +
468
  scale_y_continuous(breaks = c(seq(0, 1, 0.2),
469
                                round(median(ctrp[`Drug Type` == "Targeted Drug"]$area_above_curve), 3),
470
                                round(median(ctrp[`Drug Type` == "Untargeted Drug"]$area_above_curve), 3))) +
471
  geom_hline(yintercept = median(ctrp[`Drug Type` == "Targeted Drug"]$area_above_curve), linetype = "dotted") +
472
  geom_hline(yintercept = median(ctrp[`Drug Type` == "Untargeted Drug"]$area_above_curve), linetype = "dotted") +
473
  theme(text = element_text(size = 18))
474
475
ggsave(plot = p, filename = "Plots/Dataset_Exploration/CTRP_AAC_Distribution_Targeted_vs_Untargeted_Validation_Subset_ViolinPlot.pdf")
476
477
# Combine and compare both
478
ctrp_sub[, Type := "Validation Subset"]
479
ctrp[, Type := "All Training Data"]
480
481
both_combined <- rbindlist(list(ctrp, ctrp_sub))
482
483
require(rstatix)
484
485
ks_results_targeted <- ks.test(both_combined[DrugType == "Targeted Drug" & Type == "All Training Data"]$area_above_curve,
486
        both_combined[DrugType == "Targeted Drug" & Type == "Validation Subset"]$area_above_curve,
487
        alternative = "two.sided")
488
ks_results_untargeted <- ks.test(both_combined[DrugType == "Untargeted Drug" & Type == "All Training Data"]$area_above_curve,
489
        both_combined[DrugType == "Untargeted Drug" & Type == "Validation Subset"]$area_above_curve,
490
        alternative = "two.sided")
491
492
stat_test <- both_combined %>%
493
  group_by(DrugType) %>%
494
  wilcox_test(area_above_curve ~ Type,
495
              p.adjust.method = "fdr", alternative = "two.sided")
496
497
stat_test %>% adjust_pvalue(method = "fdr")
498
499
stat_test <- tibble::tribble(
500
  ~DrugType, ~group1, ~group2, ~`D`,
501
  "Targeted Drug", "All Training Data", "Validation Subset", round(ks_results_targeted$statistic, 5),
502
  "Untargeted Drug", "All Training Data", "Validation Subset", round(ks_results_untargeted$statistic, 5),
503
)
504
505
506
colnames(both_combined)[8] <- "DrugType"
507
p <- ggviolin(data = both_combined, x = "Type", y = "area_above_curve",
508
              add = "boxplot", facet.by = "DrugType") +
509
  stat_pvalue_manual(data = stat_test,
510
                     # label = "D Statistic",
511
                     label = "KS-test, D = {D}",
512
                     y.position = 1.1, ) +
513
  # stat_compare_means(comparisons = list(c("Validation Subset", "All Training Data")),
514
  #                    method = "wilcox.test",
515
  #                    method.args = list(alternative = "two.sided")) +
516
  ylab("Area Above Curve") +
517
  xlab("") +
518
  scale_y_continuous(breaks = c(seq(0, 1, 0.2),
519
                                round(median(ctrp[`Drug Type` == "Targeted Drug"]$area_above_curve), 3),
520
                                round(median(ctrp[`Drug Type` == "Untargeted Drug"]$area_above_curve), 3))) +
521
  geom_hline(yintercept = median(ctrp[`Drug Type` == "Targeted Drug"]$area_above_curve), linetype = "dotted", color = "red") +
522
  geom_hline(yintercept = median(ctrp[`Drug Type` == "Untargeted Drug"]$area_above_curve), linetype = "dotted", color = "red") +
523
  theme(text = element_text(size = 18))
524
525
ggsave(plot = p, filename = "Plots/Dataset_Exploration/CTRP_AAC_Distribution_Targeted_vs_Untargeted_Validation_Subset_Comparison_ViolinPlot.pdf")
526
527
# Load CV Fold Results ====
528
# Select per fold validation files
529
all_cv_files <- list.files("Data/CV_Results/", recursive = T,
530
                           pattern = ".*final_validation.*", full.names = T)
531
# ".+drug_.{3,5}_HyperOpt.+"
532
# bimodal_cv_files <- grep(pattern = ".+_.*drug_\\w{3,5}_HyperOpt.+", all_cv_files, value = T)
533
# bimodal_baseline_cv_files <- grep(pattern = ".+_.*drug_\\w{3,5}_HyperOpt.+MergeByConcat_RMSELoss_MorganDrugs.+", all_cv_files, value = T)
534
# trimodal_baseline_cv_files <- grep(pattern = ".+_.*drug_\\w{6,11}_HyperOpt.+MergeByConcat_RMSELoss_MorganDrugs.+", all_cv_files, value = T)
535
536
# cur_cv_files <- grep(pattern = ".ResponseOnly_.*drug_\\w{3,5}_.+", cur_cv_files, value = T)
537
# cur_cv_files <- grep(pattern = ".ResponseOnly_+drug_exp_HyperOpt.+", cur_cv_files, value = T)
538
# cur_cv_files_2 <- grep(pattern = ".Baseline_ElasticNet.+", all_cv_files, value = T)
539
# lineage_cv_files <- grep(pattern = ".LINEAGE.+", all_cv_files, value = T)
540
# bottleneck_cv_files <- grep(pattern = ".WithBottleNeck.+", all_cv_files, value = T)
541
# final_cv_files <- c(bimodal_cv_files, cur_cv_files_2)
542
# final_cv_files <- bimodal_cv_files
543
# trimodal_cv_files <- grep(pattern = ".ResponseOnly_.*gnndrug_.{6,11}_HyperOpt.+", all_cv_files, value = T)
544
# multimodal_cv_files <- grep(pattern = ".ResponseOnly_.*gnndrug_.{12,}_HyperOpt.+", all_cv_files, value = T)
545
# final_cv_files <- lineage_cv_files
546
# final_cv_files <- bottleneck_cv_files
547
# final_cv_files <- bimodal_cv_files
548
# final_cv_files <- trimodal_baseline_cv_files
549
# final_cv_files <- trimodal_cv_files
550
final_cv_files <- all_cv_files
551
length(final_cv_files)
552
sum(grepl(".*ElasticNet.*", final_cv_files))
553
sum(grepl(".*WithBottleNeck.*", final_cv_files))
554
sum(grepl(".*NoBottleNeck.*", final_cv_files))
555
556
# Read all data
557
all_results <- vector(mode = "list", length = length(final_cv_files))
558
rm(list = c("all_results_copy", "all_results_long_copy", "all_results_sub", "cur_res", "cur_p", "unique_combos"))
559
gc()
560
for (i in 1:length(final_cv_files)) {
561
  cur_res <- fread(final_cv_files[i])
562
  if (!grepl(".*Baseline_ElasticNet.*", final_cv_files[i])) {
563
    data_types <- gsub(".+_\\w*drug_(.+)_HyperOpt.+", "\\1", final_cv_files[i])
564
    data_types <- toupper(data_types)
565
    merge_method <- gsub(".+MergeBy(\\w+)_.*RMSE.+", "\\1", final_cv_files[i])
566
    loss_method <- gsub(".+_(.*)RMSE.+", "\\1RMSE", final_cv_files[i])
567
    drug_type <- gsub(".+_(\\w*)drug.+_HyperOpt.+", "\\1drug", final_cv_files[i])
568
    drug_type <- toupper(drug_type)
569
    split_method <- gsub(".+Split_(\\w+)_\\w+BottleNeck.+", "\\1", final_cv_files[i])
570
    bottleneck <- gsub(".+Split_\\w+_(\\w+BottleNeck).+", "\\1", final_cv_files[i])
571
    # data_types <- strsplit(data_types, "_")[[1]]
572
    # cur_res$epoch <- as.integer(epoch)
573
    cur_res$data_types <- data_types
574
    cur_res$merge_method <- merge_method
575
    cur_res$loss_type <- loss_method
576
    cur_res$drug_type <- drug_type
577
    cur_res$split_method <- split_method
578
    cur_res$bottleneck <- bottleneck
579
    
580
  } else {
581
    split_method <- gsub(".+Baseline_ElasticNet_Split_(\\w+)_drug_.+", "\\1", final_cv_files[i])
582
    data_types <- gsub(".+Baseline_ElasticNet_Split_\\w+_drug_(\\w+).+", "\\1", final_cv_files[i])
583
    data_types <- toupper(data_types)
584
    cur_res$data_types <- data_types
585
    cur_res$split_method <- split_method
586
    cur_res$merge_method <- "Merge By Early Concat"
587
    cur_res$loss_type <- "Base Model"
588
    cur_res$drug_type <- "Base Model"
589
    cur_res$bottleneck <- "No Data Bottleneck"
590
  }
591
  
592
  cur_fold <- gsub(".+CV_Index_(\\d)_.+", "\\1", final_cv_files[i])
593
  cur_res$fold <- cur_fold
594
  
595
  all_results[[i]] <- cur_res
596
}
597
rm(cur_res)
598
gc()
599
600
all_results <- rbindlist(all_results, fill = T)
601
if (any(all_results$merge_method == "Merge By Early Concat")) {
602
  all_results[is.na(rmse_loss), RMSELoss := abs(target - predicted), by = .I]
603
  all_results[!is.na(rmse_loss), RMSELoss := rmse_loss, by = .I]
604
  all_results$rmse_loss <- NULL
605
} else {
606
  all_results[, RMSELoss := abs(target - predicted), by = .I]
607
}
608
609
# all_results[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold")]
610
all_results$V1 <- NULL
611
612
# Update CV splitting method names
613
all_results[split_method == "BOTH", split_method := "Split By Both Cell Line & Drug Scaffold"]
614
all_results[split_method == "DRUG", split_method := "Split By Drug Scaffold"]
615
all_results[split_method == "CELL_LINE", split_method := "Split By Cell Line"]
616
all_results[split_method == "LINEAGE", split_method := "Split By Cancer Type"]
617
618
# all_results[merge_method == "MergeByEarlyConcat"]$merge_method <- "Merge By Early Concat"
619
# Update model names based on used techniques
620
all_results[loss_type == "RMSE", loss_type := "Base Model"]
621
all_results[loss_type == "WeightedRMSE", loss_type := "Base Model + LDS"]
622
all_results[merge_method == "Concat", merge_method := "Base Model"]
623
all_results[merge_method == "LMF", merge_method := "Base Model + LMF"]
624
all_results[merge_method == "Sum", merge_method := "Base Model + Sum"]
625
all_results[drug_type == "DRUG", drug_type := "Base Model"]
626
all_results[drug_type == "GNNDRUG", drug_type := "Base Model + GNN"]
627
628
# Update data bottleneck names
629
all_results[bottleneck == "NoBottleNeck", bottleneck := "No Data Bottleneck"]
630
all_results[bottleneck == "WithBottleNeck", bottleneck := "With Data Bottleneck"]
631
632
all_results[, Targeted := fifelse(cpd_name %in% targeted_drugs, "Targeted Drug", "Untargeted Drug")]
633
634
all_results[, TargetRange := fifelse(target >= 0.7, "Target Above 0.7", "Target Below 0.7")]
635
636
# table(all_results$Targeted)
637
# table(all_results$TargetRange)
638
# 
639
# all_results[RMSELoss > 1]
640
# table(all_results[RMSELoss > 1]$data_types)
641
# table(all_results$data_types)
642
643
644
# Save 
645
fwrite(all_results, "Data/all_results.csv")
646
# fwrite(all_results, "Data/all_bimodal_results.csv")
647
648
# Identify duplicated folds
649
unique_combos <- fread("Data/shared_unique_combinations.csv")
650
unique_combos[, unique_samples := paste0(cpd_name, "_", cell_name)]
651
all_results[, unique_samples := paste0(cpd_name, "_", cell_name)]
652
all_results_sub <- all_results[unique_samples %in% unique_combos$unique_samples]
653
654
all_results_sub[, num_samples := .N, by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "bottleneck")]
655
unique(all_results_sub[num_samples > 125212][, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "bottleneck", "num_samples")])
656
657
# Check for missing folds per config
658
all_results[, num_folds := uniqueN(fold), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "bottleneck")]
659
660
unique(all_results[num_folds != 5][, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "bottleneck", "num_folds")])
661
# data_types     merge_method        loss_type        drug_type                            split_method           bottleneck
662
# 1:  CNV_METAB       Base Model       Base Model       Base Model                      Split By Cell Line With Data Bottleneck
663
# 2:        CNV Base Model + Sum Base Model + LDS Base Model + GNN                      Split By Cell Line   No Data Bottleneck
664
# 3:        CNV Base Model + LMF       Base Model Base Model + GNN                  Split By Drug Scaffold   No Data Bottleneck
665
# 4:  HIST_RPPA Base Model + LMF Base Model + LDS Base Model + GNN                      Split By Cell Line   No Data Bottleneck
666
# 5:  HIST_RPPA Base Model + LMF Base Model + LDS Base Model + GNN                  Split By Drug Scaffold   No Data Bottleneck
667
# 6: MIRNA_HIST Base Model + LMF Base Model + LDS Base Model + GNN                      Split By Cell Line   No Data Bottleneck
668
# 7:      MIRNA Base Model + LMF       Base Model Base Model + GNN                  Split By Drug Scaffold   No Data Bottleneck
669
# 8: MIRNA_RPPA Base Model + LMF Base Model + LDS Base Model + GNN                      Split By Cell Line   No Data Bottleneck
670
# 9: MIRNA_RPPA Base Model + LMF Base Model + LDS Base Model + GNN                  Split By Drug Scaffold   No Data Bottleneck
671
# 10:    MUT_CNV Base Model + LMF Base Model + LDS Base Model + GNN Split By Both Cell Line & Drug Scaffold   No Data Bottleneck
672
# 11:        MUT       Base Model Base Model + LDS Base Model + GNN Split By Both Cell Line & Drug Scaffold   No Data Bottleneck
673
# 12:       PROT Base Model + LMF       Base Model Base Model + GNN                  Split By Drug Scaffold   No Data Bottleneck
674
675
# Targeted vs Untargeted in Baseline ====
676
# targeted_drugs <- fread("Data/DRP_Training_Data/CANCER_GOV_TARGETED_DRUGS.csv", fill = T)
677
# targeted_drugs <- targeted_drugs$Targeted_Drugs
678
all_results_copy <- fread("Data/all_results.csv")
679
all_results_copy <- all_results[nchar(data_types) <= 5]
680
# all_results_copy[, cv_mean := mean(RMSELoss), by = c("cpd_name", "cell_name", "data_types", "merge_method", "loss_type", "drug_type", "split_method")]
681
682
# baseline_with_gnn <- all_results_long_copy[(merge_method == "Concat" & loss_type == "RMSE" & split_method == "DRUG")]
683
baseline <- all_results_copy[merge_method == "MergeByConcat" & loss_type == "UnweightedLoss" & data_types %in% c("EXP", "PROT") &
684
                               drug_type == "Morgan" & split_method == "SplitByBoth" & nchar(data_types) <= 5]
685
686
p <- ggplot(baseline, mapping = aes(x = Targeted, y = cv_mean)) +
687
  geom_boxplot() +
688
  facet_wrap(~data_types+TargetRange, ncol = 2) +
689
  ggtitle(label = tools::toTitleCase("Comparison of GNN drug representation on targeted and untargeted drugs"),
690
          subtitle = "5-fold validation RMSE loss using strict splitting, True Target >= 0.7") +
691
692
693
scale_fill_discrete(name = "CV Fold:") +
694
scale_x_discrete() +
695
scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
696
                             "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
697
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
698
geom_errorbar(aes(x=data_types,
699
                  y=cv_mean,
700
                  ymax=cv_mean, 
701
                  ymin=cv_mean, col='red'), linetype=2, show.legend = FALSE) +
702
geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean), vjust = -0.5)
703
# targeted_drug_results <- all_results[cpd_name %in% targeted_drugs]
704
705
# all_results_copy[, Targeted := ifelse(cpd_name %ilike% paste0(targeted_drugs, collapse = "|"), T, F)]
706
707
708
# unique(all_results_copy[Targeted == T]$cpd_name)
709
# dput(unique(all_results_copy[Targeted == T]$cpd_name))
710
# all_results_copy <- all_results_copy[Targeted == T]
711
# all_results_copy <- all_results_copy[target >= 0.9]
712
713
# all_results_copy_sub <- all_results_copy[target >= 0.7]
714
# all_results_copy_sub <- all_results_copy[target >= 0.7]
715
716
# temp <- all_results_copy_sub[data_types == "EXP" & merge_method == "Concat"]
717
# temp[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "Targeted")]
718
719
720
721
# Bi-modal Baseline Bottleneck Comparison (split by cell line) ====
722
all_results_copy <- fread("Data/all_results.csv")
723
all_results_copy <- all_results[nchar(data_types) <= 5]
724
# all_results_copy <- all_results
725
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange", "bottleneck")
726
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
727
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
728
729
# Bar plot
730
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
731
                          sub_results_by = quote((split_method == "Split By Cell Line" &
732
                                                    merge_method == "Base Model" &
733
                                                    loss_type == "Base Model" &
734
                                                    drug_type == "Base Model" &
735
                                                    nchar(data_types) <= 5)),
736
                          fill_by = quote(bottleneck),
737
                          bar_level_order = c("With Data Bottleneck", "No Data Bottleneck"),
738
                          facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
739
                          data_order = data_order,
740
                          facet_by = quote(TargetRange),
741
                          legend_title = "Data Type:",
742
                          calculate_avg_mae = F,
743
)
744
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold")) 
745
746
ggsave(plot = cur_p,
747
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_Bottleneck_Comparison_BarPlot.pdf")
748
749
# Violin plot
750
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
751
                          sub_results_by = quote((split_method == "Split By Cell Line" &
752
                                                    merge_method == "Base Model" &
753
                                                    loss_type == "Base Model" &
754
                                                    drug_type == "Base Model" &
755
                                                    nchar(data_types) <= 5)),
756
                          fill_by = quote(bottleneck),
757
                          bar_level_order = c("With Data Bottleneck", "No Data Bottleneck"),
758
                          facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
759
                          data_order = data_order,
760
                          facet_by = c("TargetRange", "data_types"),
761
                          legend_title = "Data Type:",
762
                          plot_type = "violin_plot", 
763
                          target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
764
                          # target_sub_by = "Target Above 0.7",
765
                          cur_comparisons = list(c("With Data Bottleneck", "No Data Bottleneck")),
766
                          test = "ks.test",
767
                          paired = T
768
)
769
770
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold")) + expand_limits(y = c(0, 1.5))
771
ggsave(plot = cur_p,
772
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_Bottleneck_Comparison_ViolinPlot.pdf",
773
       height = 8)
774
775
## Concordance between different models ====
776
all_results_copy <- all_results[bottleneck == "With Data Bottleneck"]
777
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange", "bottleneck")
778
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
779
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
780
all_comparisons <- utils::combn(data_order, 2, simplify = T)
781
all_comparisons <- list(c("MUT", "CNV"), c("CNV", "EXP"), c("HIST", "RPPA"))
782
783
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
784
                          sub_results_by = quote((split_method == "Split By Cell Line" &
785
                                                    merge_method == "Base Model" &
786
                                                    loss_type == "Base Model" &
787
                                                    drug_type == "Base Model" &
788
                                                    nchar(data_types) <= 5)),
789
                          fill_by = quote(data_types),
790
                          # bar_level_order = c("With Data Bottleneck", "No Data Bottleneck"),
791
                          bar_level_order = data_order,
792
                          facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
793
                          data_order = data_order,
794
                          # facet_by = c("TargetRange"),
795
                          facet_by = NULL,
796
                          legend_title = "Model:",
797
                          plot_type = "box_plot", 
798
                          # target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
799
                          target_sub_by = "Target Above 0.7",
800
                          cur_comparisons = NULL,
801
                          test = "ks.test",
802
                          paired = T,
803
)
804
805
all_results_subset <- subset(all_results_copy, (split_method == "Split By Cell Line" &
806
                                                             merge_method == "Base Model" &
807
                                                             loss_type == "Base Model" &
808
                                                             drug_type == "Base Model" &
809
                                                             nchar(data_types) <= 5))
810
# all_results_sub_sub <- all_results_subset[TargetRange %in% c("Target Above 0.7", "Target Below 0.7")]
811
# Order data for the facet
812
all_results_subset[, data_types := factor(data_types, levels = data_order)]
813
all_results_subset[, TargetRange := factor(unlist(all_results_subset[, "TargetRange", with = F]),
814
                                                      levels = c("Target Above 0.7", "Target Below 0.7"))]
815
all_results_subset[, cv_mean := mean(RMSELoss), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
816
817
# all_results_sub_sub[, cv_sd := sd(RMSELoss), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
818
819
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
820
all_comparisons <- utils::combn(data_order, 2, simplify = F)
821
all_stat_tests <- vector(mode = "list", length = length(all_comparisons))
822
for (i in 1:length(all_stat_tests)) {
823
  all_stat_tests[[i]] <- ks.test(all_results_subset[data_types == all_comparisons[[i]][1]]$RMSELoss,
824
                                 all_results_subset[data_types == all_comparisons[[i]][2]]$RMSELoss,)
825
}
826
827
828
all_stat_tests <- vector(mode = "list", length = 8)
829
for (i in 1:length(data_order)) {
830
  all_stat_tests[[i]] <- compare_means(RMSELoss ~ data_types, all_results_subset,
831
                                 ref.group = data_order[i], 
832
                                 method = "wilcox.test", alternative = "two.sided",
833
                                 p.adjust.method = "fdr", paired = F)
834
}
835
836
837
cur_palette <- get_palette(palette = "jco", 8)
838
839
final_p <- cur_p + theme(axis.text.x = element_text(), legend.position = "none")
840
for (i in 1:length(data_order)) {
841
  final_p <- final_p +
842
    # theme(axis.text.x = element_text()) +
843
    geom_bracket(
844
    aes(xmin = group1,
845
        xmax = group2,
846
        # label = p.adj),
847
        label = signif(p.adj, 2)), position = "identity",
848
    data = all_stat_tests[[i]], y.position = 0.3 + (0.3 * i),
849
    step.increase = 0.015,
850
    label.size = 3,
851
    tip.length = 0.01, color = cur_palette[i])
852
}
853
final_p 
854
855
ggsave(plot = final_p,
856
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_Bottleneck_Concordance_Comparison_BoxPlot.pdf",
857
       height = 12)
858
859
## R-squared Plot ====
860
all_results_subset <- subset(all_results_copy, (split_method == "Split By Cell Line" &
861
                                                  merge_method == "Base Model" &
862
                                                  loss_type == "Base Model" &
863
                                                  drug_type == "Base Model" &
864
                                                  nchar(data_types) <= 5))
865
# all_results_sub_sub <- all_results_subset[TargetRange %in% c("Target Above 0.7", "Target Below 0.7")]
866
# Order data for the facet
867
# all_results_subset[, data_types := factor(data_types, levels = data_order)]
868
# all_results_subset[, TargetRange := factor(unlist(all_results_subset[, "TargetRange", with = F]),
869
#                                            levels = c("Target Above 0.7", "Target Below 0.7"))]
870
# all_results_subset[, cv_mean := mean(RMSELoss), by = eval(avg_loss_by[!avg_loss_by %in% c("fold")])]
871
872
# Find samples that are shared between all data types
873
all_results_subset[, unique_sample := paste0(cpd_name, "_", cell_name)]
874
shared_unique_samples <- Reduce(intersect, split(all_results_subset$unique_sample, all_results_subset$data_types))
875
all_results_copy <- all_results_subset[unique_sample %in% shared_unique_samples]
876
# all_results_shared_subset$unique_sample <- NULL
877
uniqueN(all_results_copy) / 8  # 125,212 samples in each model that are paired
878
879
# Set order within each group by the unique ID, so that each group has the same order (for pairing?)
880
setorder(all_results_copy, data_types, unique_sample)
881
# Confirm:
882
all_results_copy[, head(unique_sample,2),by=data_types]
883
884
885
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
886
                          sub_results_by = quote((split_method == "Split By Cell Line" &
887
                                                    merge_method == "Base Model" &
888
                                                    loss_type == "Base Model" &
889
                                                    drug_type == "Base Model" &
890
                                                    nchar(data_types) <= 5)),
891
                          fill_by = quote(data_types),
892
                          # bar_level_order = c("With Data Bottleneck", "No Data Bottleneck"),
893
                          bar_level_order = data_order,
894
                          facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
895
                          data_order = data_order,
896
                          # facet_by = c("TargetRange"),
897
                          facet_by = NULL,
898
                          legend_title = "Model:",
899
                          plot_type = "box_plot", 
900
                          # target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
901
                          target_sub_by = "Target Above 0.7",
902
                          cur_comparisons = NULL,
903
                          test = "wilcox.test",
904
                          paired = F, hide_outliers = T
905
)
906
907
all_stat_tests <- vector(mode = "list", length = 8)
908
for (i in 1:length(data_order)) {
909
  all_stat_tests[[i]] <- compare_means(RMSELoss ~ data_types, all_results_copy,
910
                                       ref.group = data_order[i],
911
                                       method = "wilcox.test", alternative = "two.sided",
912
                                       p.adjust.method = "fdr", paired = T)
913
}
914
915
916
cur_palette <- get_palette(palette = "jco", 8)
917
918
final_p <- cur_p + theme(axis.text.x = element_text(), legend.position = "none")
919
for (i in 1:length(data_order)) {
920
  final_p <- final_p +
921
    # theme(axis.text.x = element_text()) +
922
    geom_bracket(
923
      aes(xmin = group1,
924
          xmax = group2,
925
          # label = p.adj),
926
          label = signif(p.adj, 2)), position = "identity",
927
      data = all_stat_tests[[i]], y.position = 0.3 + (0.3 * i),
928
      step.increase = 0.015,
929
      label.size = 2, vjust = 1,
930
      tip.length = 0.01, color = cur_palette[i])
931
}
932
final_p 
933
934
rsq <- function (x, y) cor(x, y, method = "pearson") ^ 2
935
rmse <- function(x, y) sqrt(mean((x - y)^2))
936
mae <- function(x, y) mean(abs(x - y))
937
938
all_results_copy[, r2_by_range := rsq(target, predicted), by = c("data_types", "TargetRange", "Targeted")]
939
all_results_copy[, rmse_by_range := rmse(target, predicted), by = c("data_types", "TargetRange", "Targeted")]
940
all_results_copy[, avg_rmseloss_by_range := mean(RMSELoss), by = c("data_types", "TargetRange", "Targeted")]
941
all_results_copy[, mae_by_range := mae(target, predicted), by = c("data_types", "TargetRange", "Targeted")]
942
# all_results_copy[, avg_rmseloss_by_range := mean(RMSELoss), by = c("data_types", "TargetRange")]
943
unique(all_results_copy[, c("data_types", "mae_by_range", "avg_rmseloss_by_range", "rmse_by_range", "r2_by_range", "TargetRange", "Targeted")])
944
945
# Upper AAC range correlation, targeted
946
all_upper_targeted_results_copy <- all_results_copy[TargetRange == "Target Above 0.7" & Targeted == "Targeted Drug"]
947
# upper_targeted_cors <- all_upper_targeted_results_copy[all_upper_targeted_results_copy, allow.cartesian=T, on = "unique_sample"][, cor(predicted, i.predicted), by=list(data_types, i.data_types)]
948
upper_targeted_r2 <- all_upper_targeted_results_copy[all_upper_targeted_results_copy, allow.cartesian=T, on = "unique_sample"][, rsq(predicted, i.predicted), by=list(data_types, i.data_types)]
949
upper_targeted_r2_dt <- dcast(upper_targeted_r2, data_types~i.data_types, value.var = "V1")
950
upper_targeted_r2_mat <- as.matrix(upper_targeted_r2_dt[, 2:9])
951
rownames(upper_targeted_r2_mat) <- upper_targeted_r2_dt$data_types
952
953
# Upper AAC range correlation, untargeted
954
all_upper_untargeted_results_copy <- all_results_copy[TargetRange == "Target Above 0.7" & Targeted == "Untargeted Drug"]
955
# upper_untargeted_cors <- all_upper_untargeted_results_copy[all_upper_untargeted_results_copy, allow.cartesian=T, on = "unique_sample"][, cor(predicted, i.predicted), by=list(data_types, i.data_types)]
956
upper_untargeted_r2 <- all_upper_untargeted_results_copy[all_upper_untargeted_results_copy, allow.cartesian=T, on = "unique_sample"][, rsq(predicted, i.predicted), by=list(data_types, i.data_types)]
957
upper_untargeted_r2_dt <- dcast(upper_untargeted_r2, data_types~i.data_types, value.var = "V1")
958
upper_untargeted_r2_mat <- as.matrix(upper_untargeted_r2_dt[, 2:9])
959
rownames(upper_untargeted_r2_mat) <- upper_untargeted_r2_dt$data_types
960
961
# Lower AAC range correlation, targeted
962
all_lower_targeted_results_copy <- all_results_copy[TargetRange == "Target Below 0.7" & Targeted == "Targeted Drug"]
963
# lower_targeted_cors <- all_lower_targeted_results_copy[all_lower_targeted_results_copy, allow.cartesian=T, on = "unique_sample"][, cor(predicted, i.predicted), by=list(data_types, i.data_types)]
964
lower_targeted_r2 <- all_lower_targeted_results_copy[all_lower_targeted_results_copy, allow.cartesian=T, on = "unique_sample"][, rsq(predicted, i.predicted), by=list(data_types, i.data_types)]
965
966
lower_targeted_r2_dt <- dcast(lower_targeted_r2, data_types~i.data_types, value.var = "V1")
967
lower_targeted_r2_mat <- as.matrix(lower_targeted_r2_dt[, 2:9])
968
rownames(lower_targeted_r2_mat) <- lower_targeted_r2_dt$data_types
969
970
# Lower AAC range correlation, untargeted
971
all_lower_untargeted_results_copy <- all_results_copy[TargetRange == "Target Below 0.7" & Targeted == "Untargeted Drug"]
972
# lower_untargeted_cors <- all_lower_untargeted_results_copy[all_lower_untargeted_results_copy, allow.cartesian=T, on = "unique_sample"][, cor(predicted, i.predicted), by=list(data_types, i.data_types)]
973
lower_untargeted_r2 <- all_lower_untargeted_results_copy[all_lower_untargeted_results_copy, allow.cartesian=T, on = "unique_sample"][, rsq(predicted, i.predicted), by=list(data_types, i.data_types)]
974
975
lower_untargeted_r2_dt <- dcast(lower_untargeted_r2, data_types~i.data_types, value.var = "V1")
976
lower_untargeted_r2_mat <- as.matrix(lower_untargeted_r2_dt[, 2:9])
977
rownames(lower_untargeted_r2_mat) <- lower_untargeted_r2_dt$data_types
978
979
# install.packages("corrplot")
980
# require(corrplot)
981
# install.packages("ggcorrplot")
982
# install.packages("patchwork")
983
require(ggcorrplot)
984
require(patchwork)
985
require(ggplot2)
986
987
g_upper_targeted <- ggcorrplot(upper_targeted_r2_mat, hc.order = TRUE, outline.color = "white",
988
           type = "lower", 
989
           ggtheme = ggplot2::theme_gray,
990
           colors = c("#E46726", "white", "#6D9EC1"),
991
           lab = TRUE) + ggtitle("AAC >= 0.7, Targeted") + 
992
  theme(text = element_text(size = 12, face = "bold"), 
993
        legend.position = 'none')
994
g_upper_untargeted <- ggcorrplot(upper_untargeted_r2_mat, hc.order = TRUE, outline.color = "white",
995
           type = "lower", 
996
           ggtheme = ggplot2::theme_gray,
997
           colors = c("#E46726", "white", "#6D9EC1"),
998
           lab = TRUE) + ggtitle("AAC >= 0.7, Untargeted") + 
999
  theme(text = element_text(size = 12, face = "bold"), 
1000
        legend.position = 'none')
1001
1002
# g_upper <- ggplot(upper_r2, aes(data_types, i.data_types, fill = V1)) +
1003
#   geom_tile() +
1004
#   ggtitle("AAC >= 0.7") + 
1005
#   theme(text = element_text(size = 14, face = "bold"), 
1006
#         legend.position = 'none')
1007
1008
g_lower_targeted <- ggcorrplot(lower_targeted_r2_mat, hc.order = TRUE, outline.color = "white",
1009
           type = "lower",
1010
           ggtheme = ggplot2::theme_gray,
1011
           colors = c("#E46726", "white", "#6D9EC1"),
1012
           lab = TRUE) + ggtitle("AAC < 0.7, Targeted") +
1013
  theme(text = element_text(size = 12, face = "bold"),
1014
        legend.position = 'none')
1015
g_lower_untargeted <- ggcorrplot(lower_untargeted_r2_mat, hc.order = TRUE, outline.color = "white",
1016
           type = "lower",
1017
           ggtheme = ggplot2::theme_gray,
1018
           colors = c("#E46726", "white", "#6D9EC1"),
1019
           lab = TRUE) + ggtitle("AAC < 0.7, Untargeted") +
1020
  theme(text = element_text(size = 12, face = "bold"),
1021
        legend.position = 'none')
1022
1023
1024
full <- (g_upper_targeted | g_upper_untargeted) / (g_lower_targeted | g_lower_untargeted)
1025
1026
1027
ggsave("Plots/CV_Results/Baseline_R2_Matrix_ByDataType.pdf",
1028
       height = 8, width = 8, units = "in",
1029
       full)
1030
# corrplot(final_cor_mat, method = 'square', order = 'AOE', type = "lower",
1031
#          addCoef.col = 'white')
1032
1033
# pdf(file = "Plots/CV_Results/Baseline_Correlation_Matrix_ByDataType.pdf")
1034
1035
corrplot(final_cor_mat, method = 'square', order = 'AOE', type = "lower",
1036
         addCoef.col = 'white')
1037
1038
dev.off()
1039
1040
1041
1042
1043
rsq(all_results_copy[data_types == "EXP"]$target, all_results_copy[data_types == "EXP"]$predicted)
1044
rsq(all_results_copy[data_types == "CNV"]$target, all_results_copy[data_types == "CNV"]$predicted)
1045
rsq(all_results_copy[data_types == "PROT"]$target, all_results_copy[data_types == "PROT"]$predicted)
1046
rsq(all_results_copy[data_types == "MUT"]$target, all_results_copy[data_types == "MUT"]$predicted)
1047
rsq(all_results_copy[data_types == "MUT"]$target, all_results_copy[data_types == "MUT"]$predicted)
1048
1049
ggsave(plot = final_p,
1050
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_Bottleneck_Paired_Concordance_Comparison_BoxPlot.pdf",
1051
       height = 12)
1052
1053
1054
# Bi-Modal Baseline Upper vs Lower AAC Range Comparison ====
1055
all_results_copy <- all_results
1056
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange", "bottleneck")
1057
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
1058
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
1059
1060
# Violin plot
1061
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1062
                          sub_results_by = quote((split_method == "Split By Cell Line" &
1063
                                                    merge_method == "Base Model" &
1064
                                                    loss_type == "Base Model" &
1065
                                                    drug_type == "Base Model" &
1066
                                                    bottleneck == "No Data Bottleneck" &
1067
                                                    nchar(data_types) <= 5)),
1068
                          fill_by = quote(TargetRange),
1069
                          bar_level_order = c("Target Above 0.7", "Target Below 0.7"),
1070
                          # facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
1071
                          data_order = data_order,
1072
                          facet_by = "data_types",
1073
                          legend_title = "AAC Range:",
1074
                          plot_type = "violin_plot", 
1075
                          target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
1076
                          # target_sub_by = "Target Above 0.7",
1077
                          cur_comparisons = list(c("Target Above 0.7", "Target Below 0.7")),
1078
                          test = "ks.test",
1079
                          paired = T
1080
)
1081
1082
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold"))
1083
# +
1084
#   geom_text(data = all_results_copy, aes(x=data_types, label = round(cv_mean, 3), y = cv_mean + cv_sd),
1085
#             vjust = 0.5, hjust = -0.25, angle = 90, position = position_dodge2(width = .9))
1086
1087
1088
ggsave(plot = cur_p,
1089
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_UpperVsLower_Comparison_ViolinPlot.pdf",
1090
       height = 8)
1091
1092
# Bar plot
1093
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1094
                          sub_results_by = quote((split_method == "Split By Cell Line" &
1095
                                                    merge_method == "Base Model" &
1096
                                                    loss_type == "Base Model" &
1097
                                                    drug_type == "Base Model" &
1098
                                                    bottleneck == "No Data Bottleneck" &
1099
                                                    nchar(data_types) <= 5)),
1100
                          fill_by = quote(TargetRange),
1101
                          bar_level_order = c("Target Above 0.7", "Target Below 0.7"),
1102
                          # facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
1103
                          data_order = data_order,
1104
                          facet_by = NULL,
1105
                          legend_title = "AAC Range:",
1106
                          plot_type = "bar_plot",
1107
                          add_mean = T,
1108
                          calculate_avg_mae = F,
1109
                          # target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
1110
                          # # target_sub_by = "Target Above 0.7",
1111
                          # cur_comparisons = list(c("Target Above 0.7", "Target Below 0.7")),
1112
                          # test = "wilcox.test",
1113
                          # paired = F
1114
)
1115
1116
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold"))
1117
ggsave(plot = cur_p,
1118
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_UpperVsLower_Diff_Comparison_BarPlot.pdf")
1119
1120
# Bi-Modal Baseline Targeted vs Untargeted Drug Comparison ====
1121
all_results_copy <- fread("Data/all_results.csv")
1122
all_results_copy <- all_results[nchar(data_types) <= 5]
1123
1124
all_results_copy <- all_results[bottleneck == "No Data Bottleneck"]
1125
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
1126
                 "split_method", "fold", "TargetRange", "bottleneck", "Targeted")
1127
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
1128
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
1129
# merge_method %in% c("Base Model") &
1130
#   loss_type == "Base Model" & drug_type == "Base Model" &
1131
#   split_method == "Split By Both Cell Line & Drug Scaffold" &
1132
#   nchar(data_types) <= 5 & data_types != "MUT"
1133
1134
# Box plot
1135
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1136
                          sub_results_by = quote((split_method == "Split By Cell Line" &
1137
                                                    merge_method == "Base Model" &
1138
                                                    loss_type == "Base Model" &
1139
                                                    drug_type == "Base Model" &
1140
                                                    bottleneck == "No Data Bottleneck" &
1141
                                                    nchar(data_types) <= 5)),
1142
                          fill_by = quote(Targeted),
1143
                          bar_level_order = c("Targeted Drug", "Untargeted Drug"),
1144
                          # facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
1145
                          data_order = data_order,
1146
                          facet_by = c("TargetRange", "data_types"),
1147
                          legend_title = "AAC Range:",
1148
                          plot_type = "box_plot", 
1149
                          target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
1150
                          # target_sub_by = "Target Above 0.7",
1151
                          cur_comparisons = list(c("Targeted Drug", "Untargeted Drug")),
1152
                          test = "ks.test",
1153
                          paired = T,
1154
                          hide_outliers = T
1155
)
1156
1157
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold")) + expand_limits(y = c(0, 1.5))
1158
1159
ggsave(plot = cur_p,
1160
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_UpperVsLower_Comparison_BoxPlot.pdf",
1161
       height = 8)
1162
1163
## Difference between models ====
1164
all_results_copy <- all_results_copy[TargetRange == "Target Above 0.7"]
1165
# Box plot
1166
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1167
                          sub_results_by = quote((split_method == "Split By Cell Line" &
1168
                                                    merge_method == "Base Model" &
1169
                                                    loss_type == "Base Model" &
1170
                                                    drug_type == "Base Model" &
1171
                                                    bottleneck == "No Data Bottleneck" &
1172
                                                    nchar(data_types) <= 5)),
1173
                          fill_by = quote(Targeted),
1174
                          bar_level_order = c("Targeted Drug", "Untargeted Drug"),
1175
                          facet_level_order = c("Target Above 0.7"),
1176
                          data_order = data_order,
1177
                          facet_by = "TargetRange",
1178
                          legend_title = "AAC Range:",
1179
                          plot_type = "bar_plot", 
1180
                          add_mean = T,
1181
                          calculate_avg_mae = F
1182
                          # target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
1183
                          # target_sub_by = "Target Above 0.7",
1184
                          # cur_comparisons = list(c("Targeted Drug", "Untargeted Drug")),
1185
                          # test = "wilcox.test",
1186
                          # paired = F,
1187
                          # hide_outliers = T,
1188
                          
1189
)
1190
1191
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold"))
1192
1193
ggsave(plot = cur_p,
1194
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_TargetedVsUntargeted_Upper0.7_Comparison_BarPlot.pdf")
1195
1196
# Bi-modal Baseline Split Comparison ====
1197
all_results_copy <- all_results
1198
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange", "Targeted", "bottleneck")
1199
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
1200
1201
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
1202
1203
# TODO Must ensure different splitting methods also are compared on the same validation data
1204
## Wilcox box plot (cell line and drug scaffold) ====
1205
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1206
                          sub_results_by = quote((merge_method == "Base Model" &
1207
                                                    loss_type == "Base Model" &
1208
                                                    drug_type == "Base Model" &
1209
                                                    nchar(data_types) <= 5 &
1210
                                                    bottleneck == "No Data Bottleneck" &
1211
                                                    # split_method %in% c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line & Drug Scaffold"))),
1212
                                                    split_method %in% c("Split By Cell Line", "Split By Drug Scaffold"))),
1213
                          facet_by = c("Targeted", "data_types"),
1214
                          fill_by = quote(split_method),
1215
                          data_order = data_order,
1216
                          # bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line & Drug Scaffold"),
1217
                          bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold"),
1218
                          facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
1219
                          plot_type = "box_plot",
1220
                          legend_title = "Splitting Method:",
1221
                          hide_outliers = T,
1222
                          # target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
1223
                          # cur_comparisons = c("Targeted Drug", "Untargeted Drug"),
1224
                          # cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold"),
1225
                          #                        c("Split By Cell Line", "Split By Both Cell Line & Drug Scaffold"),
1226
                          #                        c("Split By Both Cell Line & Drug Scaffold", "Split By Drug Scaffold")),
1227
                          cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold")),
1228
                          test = "wilcox.test",
1229
                          paired = T, step_increase = 0.01,
1230
                          y_lim = 0.05)
1231
1232
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
1233
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_Split_CellLineDrugScaffold_Wilcox_Comparison_BoxPlot.pdf",
1234
       height = 8)
1235
1236
## KS boxplot (cell line and drug scaffold) ====
1237
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1238
                          sub_results_by = quote((merge_method == "Base Model" &
1239
                                                    loss_type == "Base Model" &
1240
                                                    drug_type == "Base Model" &
1241
                                                    nchar(data_types) <= 5 &
1242
                                                    bottleneck == "No Data Bottleneck" &
1243
                                                    # split_method %in% c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line & Drug Scaffold"))),
1244
                                                    split_method %in% c("Split By Cell Line", "Split By Drug Scaffold"))),
1245
                          facet_by = c("Targeted", "data_types"),
1246
                          fill_by = quote(split_method),
1247
                          data_order = data_order,
1248
                          # bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line & Drug Scaffold"),
1249
                          bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold"),
1250
                          facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
1251
                          plot_type = "box_plot",
1252
                          legend_title = "Splitting Method:",
1253
                          hide_outliers = T,
1254
                          # target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
1255
                          # cur_comparisons = c("Targeted Drug", "Untargeted Drug"),
1256
                          # cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold"),
1257
                          #                        c("Split By Cell Line", "Split By Both Cell Line & Drug Scaffold"),
1258
                          #                        c("Split By Both Cell Line & Drug Scaffold", "Split By Drug Scaffold")),
1259
                          cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold")),
1260
                          test = "ks.test",
1261
                          paired = T, step_increase = 0.01,
1262
                          y_lim = 0.05)
1263
1264
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
1265
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_Split_CellLineDrugScaffold_KS_Comparison_BoxPlot.pdf",
1266
       height = 8)
1267
1268
## KS violin plot (cell line and drug scaffold) ====
1269
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1270
                          sub_results_by = quote((merge_method == "Base Model" &
1271
                                                    loss_type == "Base Model" &
1272
                                                    drug_type == "Base Model" &
1273
                                                    nchar(data_types) <= 5 &
1274
                                                    bottleneck == "No Data Bottleneck" &
1275
                                                    # split_method %in% c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line & Drug Scaffold"))),
1276
                                                    split_method %in% c("Split By Cell Line", "Split By Drug Scaffold"))),
1277
                          facet_by = c("Targeted", "data_types"),
1278
                          fill_by = quote(split_method),
1279
                          data_order = data_order,
1280
                          # bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line & Drug Scaffold"),
1281
                          bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold"),
1282
                          facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
1283
                          plot_type = "violin_plot",
1284
                          legend_title = "Splitting Method:",
1285
                          hide_outliers = T,
1286
                          # target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
1287
                          # cur_comparisons = c("Targeted Drug", "Untargeted Drug"),
1288
                          # cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold"),
1289
                          #                        c("Split By Cell Line", "Split By Both Cell Line & Drug Scaffold"),
1290
                          #                        c("Split By Both Cell Line & Drug Scaffold", "Split By Drug Scaffold")),
1291
                          cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold")),
1292
                          test = "ks.test",
1293
                          paired = T, step_increase = 0.00,
1294
                          y_lim = 0.05)
1295
1296
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold")) + expand_limits(y = c(0, 1.5))
1297
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_Split_CellLineDrugScaffold_KS_Comparison_ViolinPlot.pdf",
1298
       height = 8)
1299
1300
## Bar plot RMSE (cell line and drug scaffold) ====
1301
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1302
                          sub_results_by = quote((merge_method == "Base Model" &
1303
                                                    loss_type == "Base Model" &
1304
                                                    drug_type == "Base Model" &
1305
                                                    nchar(data_types) <= 5 &
1306
                                                    bottleneck == "No Data Bottleneck" &
1307
                                                    # split_method %in% c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line & Drug Scaffold"))),
1308
                                                    split_method %in% c("Split By Cell Line", "Split By Drug Scaffold"))),
1309
                          facet_by = c("Targeted", "TargetRange"),
1310
                          fill_by = quote(split_method),
1311
                          data_order = data_order,
1312
                          # bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line & Drug Scaffold"),
1313
                          bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold"),
1314
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"), 
1315
                                                   c("Target Above 0.7", "Target Below 0.7")),
1316
                          plot_type = "bar_plot",
1317
                          legend_title = "Splitting Method:",
1318
                          hide_outliers = T,
1319
                          calculate_avg_mae = F,
1320
                          y_lab = "Total RMSE Loss",
1321
                          # target_sub_by = c("Target Above 0.7", "Target Below 0.7"),
1322
                          # cur_comparisons = c("Targeted Drug", "Untargeted Drug"),
1323
                          # cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold"),
1324
                          #                        c("Split By Cell Line", "Split By Both Cell Line & Drug Scaffold"),
1325
                          #                        c("Split By Both Cell Line & Drug Scaffold", "Split By Drug Scaffold")),
1326
                          # cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold")),
1327
                          # test = "ks.test",
1328
                          # paired = T, step_increase = 0.01,
1329
                          y_lim = 0.1)
1330
1331
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
1332
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_Split_CellLineDrugScaffold_RMSE_Comparison_BarPlot.pdf",
1333
       height = 8)
1334
1335
1336
# Bi-modal Baseline vs ElasticNet Baseline (Split By Cell Line) ====
1337
1338
## Without separating target ranges ====
1339
all_results_copy <- fread("Data/all_results.csv")
1340
all_results_copy <- all_results_copy[nchar(data_types) <= 5]
1341
1342
# Don't average loss by TargetRange
1343
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "bottleneck")
1344
all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
1345
all_results_copy[merge_method == "Base Model", merge_method := "Baseline Neural Network"]
1346
all_results_copy[merge_method == "Merge By Early Concat", merge_method := "Elastic Net"]
1347
all_results_copy[merge_method == "Elastic Net", bottleneck := "No Data Bottleneck"]
1348
all_results_copy <- all_results_copy[data_types != "MUT"]
1349
# Order data types by mut, cnv, exp, prot, mirna, metab, hist, rppa
1350
data_order <- c('CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA')
1351
1352
# Bar plot
1353
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1354
                          sub_results_by = quote((merge_method %in% c("Baseline Neural Network", "Elastic Net") &
1355
                                                    loss_type == "Base Model" &
1356
                                                    drug_type == "Base Model" &
1357
                                                    nchar(data_types) <= 5 &
1358
                                                    split_method == "Split By Cell Line" &
1359
                                                    bottleneck == "No Data Bottleneck")),
1360
                          fill_by = quote(merge_method),
1361
                          bar_level_order = c("Elastic Net", "Baseline Neural Network"),
1362
                          data_order = data_order,
1363
                          facet_by = NULL,
1364
                          facet_level_order = NULL,
1365
                          legend_title = "Model Type:",
1366
                          plot_type = "bar_plot",
1367
                          calculate_avg_mae = F,
1368
                          add_mean = T,
1369
                          y_lim = 0.05)
1370
1371
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
1372
1373
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_ANN_Baseline_vs_ElasticNet_No_TargetRange_Separation_SplitByCellLine_Comparison.pdf")
1374
1375
# my_comparisons <- list( c("Base Model", "Base Model + LMF"), c("Base Model + Sum", "Base Model + LMF"), c("Base Model", "Base Model + Sum"))
1376
# my_comparisons <- list( c("Elastic Net", "Baseline Neural Network"))
1377
1378
# Box plot
1379
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1380
                          sub_results_by = quote((merge_method %in% c("Baseline Neural Network", "Elastic Net") &
1381
                                                    loss_type == "Base Model" &
1382
                                                    drug_type == "Base Model" &
1383
                                                    nchar(data_types) <= 5 &
1384
                                                    split_method == "Split By Cell Line" &
1385
                                                    bottleneck == "No Data Bottleneck")),
1386
                          fill_by = quote(merge_method),
1387
                          bar_level_order = c("Baseline Neural Network", "Elastic Net"),
1388
                          data_order = data_order,
1389
                          facet_by = "data_types",
1390
                          facet_level_order = NULL,
1391
                          legend_title = "Model Type:",
1392
                          y_lim = 0.05,
1393
                          plot_type = "box_plot",
1394
                          cur_comparisons = list(c("Elastic Net", "Baseline Neural Network")))
1395
1396
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
1397
ggsave(plot = cur_p,
1398
       filename = "Plots/CV_Results/Bimodal_CV_ANN_Baseline_vs_ElasticNet_No_TargetRange_Separation_SplitByBoth_Comparison_BoxPlot.pdf")
1399
1400
1401
## with separating target ranges ====
1402
all_results_copy <- fread("Data/all_results.csv")
1403
all_results_copy <- all_results_copy[nchar(data_types) <= 5]
1404
1405
# Average loss by TargetRange
1406
all_results_copy <- all_results[merge_method %in% c("Base Model", "Merge By Early Concat") &
1407
                                  loss_type == "Base Model" & drug_type == "Base Model" &
1408
                                  split_method == "Split By Both Cell Line & Drug Scaffold" &
1409
                                  nchar(data_types) <= 5 & data_types != "MUT"]
1410
all_results_copy[merge_method == "Base Model", merge_method := "Baseline Neural Network"]
1411
all_results_copy[merge_method == "Merge By Early Concat", merge_method := "Elastic Net"]
1412
all_results_copy[merge_method == "Elastic Net", bottleneck := "No Data Bottleneck"]
1413
1414
# all_results_copy_sub <- all_results_copy[TargetRange == "TargetAbove 0.7"]
1415
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "bottleneck", "TargetRange")
1416
all_results_copy <- all_results_copy[data_types != "MUT"]
1417
data_order <- c('CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA')
1418
1419
# Bar plot
1420
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1421
                          sub_results_by = quote((merge_method %in% c("Baseline Neural Network", "Elastic Net") &
1422
                                                    loss_type == "Base Model" &
1423
                                                    drug_type == "Base Model" &
1424
                                                    nchar(data_types) <= 5 &
1425
                                                    split_method == "Split By Cell Line" &
1426
                                                    bottleneck == "No Data Bottleneck")),
1427
                          fill_by = quote(merge_method),
1428
                          bar_level_order = c("Elastic Net", "Baseline Neural Network"),
1429
                          data_order = data_order,
1430
                          facet_by = c("TargetRange"),
1431
                          facet_level_order = c("Target Above 0.7", "Target Below 0.7"),
1432
                          legend_title = "Model Type:",
1433
                          plot_type = "bar_plot",
1434
                          calculate_avg_mae = F,
1435
                          add_mean = T, 
1436
                          # facet_nrow = 1,
1437
                          y_lim = 0.05,
1438
                          min_diff = 0.03)
1439
1440
1441
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold"))
1442
1443
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_ANN_Baseline_vs_ElasticNet_SplitByCellLine_Comparison.pdf")
1444
1445
# Violin plot
1446
all_results_copy <- all_results_copy[TargetRange == "Target Above 0.7"]
1447
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1448
                          sub_results_by = quote((merge_method %in% c("Baseline Neural Network", "Elastic Net") &
1449
                                                    loss_type == "Base Model" &
1450
                                                    drug_type == "Base Model" &
1451
                                                    nchar(data_types) <= 5 &
1452
                                                    split_method == "Split By Cell Line" &
1453
                                                    bottleneck == "No Data Bottleneck")),
1454
                          fill_by = quote(merge_method),
1455
                          bar_level_order = c("Baseline Neural Network", "Elastic Net"),
1456
                          data_order = data_order,
1457
                          facet_by = "data_types",
1458
                          facet_level_order = NULL,
1459
                          legend_title = "Model Type:",
1460
                          y_lim = 0.05,
1461
                          plot_type = "violin_plot",
1462
                          cur_comparisons = list(c("Elastic Net", "Baseline Neural Network")),
1463
                          test = "ks.test",
1464
                          paired = T)
1465
1466
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold"))
1467
1468
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_ANN_Baseline_vs_ElasticNet_SplitByCellLine_Upper_0.7_Comparison_ViolinPlot.pdf")
1469
1470
## Separating Targeted vs Untargeted drugs in upper AAC ====
1471
all_results_copy <- fread("Data/all_results.csv")
1472
all_results_copy <- all_results_copy[nchar(data_types) <= 5]
1473
1474
all_results_copy[merge_method == "Base Model", merge_method := "Baseline Neural Network"]
1475
all_results_copy[merge_method == "Merge By Early Concat", merge_method := "Elastic Net"]
1476
all_results_copy[merge_method == "Elastic Net", bottleneck := "No Data Bottleneck"]
1477
1478
# all_results_copy_sub <- all_results_copy[TargetRange == "TargetAbove 0.7"]
1479
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
1480
                 "split_method", "fold", "bottleneck", "Targeted", "TargetRange")
1481
all_results_copy <- all_results_copy[data_types != "MUT"]
1482
data_order <- c('CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA')
1483
1484
1485
# Bar plot
1486
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1487
                          sub_results_by = quote((merge_method %in% c("Baseline Neural Network", "Elastic Net") &
1488
                                                    loss_type == "Base Model" &
1489
                                                    drug_type == "Base Model" &
1490
                                                    nchar(data_types) <= 5 &
1491
                                                    TargetRange == "Target Above 0.7" &
1492
                                                    split_method == "Split By Cell Line" &
1493
                                                    bottleneck == "No Data Bottleneck")),
1494
                          fill_by = quote(merge_method),
1495
                          bar_level_order = c("Elastic Net", "Baseline Neural Network"),
1496
                          data_order = data_order,
1497
                          facet_by = "Targeted",
1498
                          facet_level_order = c("Targeted Drug", "Untargeted Drug"),
1499
                          legend_title = "Model Type:",
1500
                          plot_type = "bar_plot",
1501
                          add_mean = T,
1502
                          calculate_avg_mae = F, 
1503
                          facet_nrow = 1,
1504
                          min_diff = 0.03,
1505
                          y_lim = 0.05)
1506
1507
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold")) 
1508
  
1509
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_ANN_Baseline_vs_ElasticNet_Targeted_vs_Untargeted_Upper_SplitByCellLine_Comparison_BarPlot.pdf")
1510
1511
# violin plot
1512
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1513
                          sub_results_by = quote((merge_method %in% c("Baseline Neural Network", "Elastic Net") &
1514
                                                    loss_type == "Base Model" &
1515
                                                    drug_type == "Base Model" &
1516
                                                    nchar(data_types) <= 5 &
1517
                                                    TargetRange == "Target Above 0.7" &
1518
                                                    split_method == "Split By Cell Line" &
1519
                                                    bottleneck == "No Data Bottleneck")),
1520
                          fill_by = quote(merge_method),
1521
                          bar_level_order = c("Elastic Net", "Baseline Neural Network"),
1522
                          data_order = data_order,
1523
                          facet_by = c("Targeted", "data_types"),
1524
                          facet_level_order = NULL,
1525
                          legend_title = "Model Type:",
1526
                          y_lim = 0.05,
1527
                          plot_type = "violin_plot",
1528
                          cur_comparisons = list(c("Elastic Net", "Baseline Neural Network")),
1529
                          test = "ks.test", 
1530
                          paired = T)
1531
1532
cur_p <- cur_p + theme(text = element_text(size = 18, face = "bold")) + 
1533
  expand_limits(y = c(0, 1.5))
1534
1535
ggsave(plot = cur_p,
1536
       filename = "Plots/CV_Results/Bimodal_CV_ANN_Baseline_Targeted_vs_Untargeted_Upper_SplitByCellLine_Comparison_ViolinPlot.pdf",
1537
       height = 8)
1538
1539
# Bi-Modal Baseline vs LDS ====
1540
all_results <- fread("Data/all_results.csv")
1541
all_results <- all_results[nchar(data_types) <= 5]
1542
1543
all_results_copy <- all_results
1544
all_results_copy[target > 0.7 & target < 0.9, TargetRange := "Target Between 0.7 & 0.9"]
1545
all_results_copy[target >= 0.9, TargetRange := "Target Above 0.9"]
1546
1547
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
1548
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
1549
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
1550
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
1551
1552
## Split By Both Cell Line & Drug Scaffold ====
1553
# Bar plot
1554
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1555
                          sub_results_by = quote((merge_method == "Base Model" &
1556
                                                    drug_type == "Base Model" &
1557
                                                    nchar(data_types) <= 5 &
1558
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
1559
                                                    bottleneck == "No Data Bottleneck")),
1560
                          fill_by = quote(loss_type),
1561
                          bar_level_order = c("Base Model", "Base Model + LDS"),
1562
                          data_order = data_order,
1563
                          facet_by = quote(TargetRange),
1564
                          facet_level_order = c("Target Above 0.9",
1565
                                                "Target Between 0.7 & 0.9",
1566
                                                "Target Below 0.7"),
1567
                          legend_title = "Model Type:",
1568
                          y_lim = 0.05)
1569
1570
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_vs_LDS_SplitByBoth_Comparison.pdf")
1571
1572
# Box plot
1573
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1574
                          sub_results_by = quote((merge_method == "Base Model" &
1575
                                                    drug_type == "Base Model" &
1576
                                                    nchar(data_types) <= 5 &
1577
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
1578
                                                    bottleneck == "No Data Bottleneck")),
1579
                          fill_by = quote(loss_type),
1580
                          bar_level_order = c("Base Model", "Base Model + LDS"),
1581
                          data_order = data_order,
1582
                          facet_by = c("TargetRange", "data_types"),
1583
                          facet_level_order = NULL,
1584
                          legend_title = "Model Type:",
1585
                          y_lim = 0.05,
1586
                          plot_type = "box_plot",
1587
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
1588
                          cur_comparisons = list(c("Base Model", "Base Model + LDS")),
1589
                          test = "wilcox.test",
1590
                          paired = F
1591
                          )
1592
1593
ggsave(plot = cur_p,
1594
       filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_vs_LDS_SplitByBoth_Comparison_BoxPlot.pdf",
1595
       height = 8)
1596
1597
# Violin plot
1598
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1599
                          sub_results_by = quote((merge_method == "Base Model" &
1600
                                                    drug_type == "Base Model" &
1601
                                                    nchar(data_types) <= 5 &
1602
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
1603
                                                    bottleneck == "No Data Bottleneck")),
1604
                          fill_by = quote(loss_type),
1605
                          bar_level_order = c("Base Model", "Base Model + LDS"),
1606
                          data_order = data_order,
1607
                          facet_by = c("TargetRange", "data_types"),
1608
                          facet_level_order = NULL,
1609
                          legend_title = "Model Type:",
1610
                          y_lim = 0.05,
1611
                          plot_type = "violin_plot",
1612
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
1613
                          cur_comparisons = list(c("Base Model", "Base Model + LDS")),
1614
                          test = "wilcox.test",
1615
                          paired = F
1616
)
1617
1618
ggsave(plot = cur_p,
1619
       filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_vs_LDS_SplitByBoth_Comparison_ViolinPlot.pdf",
1620
       height = 8)
1621
1622
## Split By Drug Scaffold ====
1623
# Bar plot
1624
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1625
                          sub_results_by = quote((merge_method == "Base Model" &
1626
                                                    drug_type == "Base Model" &
1627
                                                    nchar(data_types) <= 5 &
1628
                                                    split_method == "Split By Drug Scaffold" &
1629
                                                    bottleneck == "No Data Bottleneck")),
1630
                          fill_by = quote(loss_type),
1631
                          bar_level_order = c("Base Model", "Base Model + LDS"),
1632
                          data_order = data_order,
1633
                          facet_by = quote(TargetRange),
1634
                          facet_level_order = c("Target Above 0.9",
1635
                                                "Target Between 0.7 & 0.9",
1636
                                                "Target Below 0.7"),
1637
                          legend_title = "Model Type:",
1638
                          y_lim = 0.05)
1639
1640
ggsave(plot = p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_vs_LDS_SplitByDrug_Comparison.pdf")
1641
1642
# Box plot
1643
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1644
                          sub_results_by = quote((merge_method == "Base Model" &
1645
                                                    drug_type == "Base Model" &
1646
                                                    nchar(data_types) <= 5 &
1647
                                                    split_method == "Split By Drug Scaffold" &
1648
                                                    bottleneck == "No Data Bottleneck")),
1649
                          fill_by = quote(loss_type),
1650
                          bar_level_order = c("Base Model", "Base Model + LDS"),
1651
                          data_order = data_order,
1652
                          facet_by = c("TargetRange", "data_types"),
1653
                          facet_level_order = NULL,
1654
                          legend_title = "Model Type:",
1655
                          y_lim = 0.05,
1656
                          plot_type = "box_plot",
1657
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
1658
                          cur_comparisons = list(c("Base Model", "Base Model + LDS")),
1659
                          test = "wilcox.test",
1660
                          paired = F
1661
)
1662
1663
ggsave(plot = cur_p,
1664
       filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_vs_LDS_SplitByDrug_Comparison_BoxPlot.pdf",
1665
       height = 8)
1666
## Split By Cell Line ====
1667
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
1668
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
1669
1670
# Bar plot
1671
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1672
                          sub_results_by = quote((merge_method == "Base Model" &
1673
                                                    drug_type == "Base Model" &
1674
                                                    nchar(data_types) <= 5 &
1675
                                                    split_method == "Split By Cell Line" &
1676
                                                    bottleneck == "No Data Bottleneck")),
1677
                          fill_by = quote(loss_type),
1678
                          bar_level_order = c("Base Model", "Base Model + LDS"),
1679
                          data_order = data_order,
1680
                          facet_by = c("TargetRange", "Targeted"),
1681
                          facet_level_order = list(c("Target Above 0.9", "Target Between 0.7 & 0.9","Target Below 0.7"),
1682
                                                   c("Targeted Drug", "Untargeted Drug")),
1683
                          facet_nrow = 3,
1684
                          legend_title = "Model Type:",
1685
                          plot_type = "bar_plot",
1686
                          calculate_avg_mae = F, y_lab = "Total RMSE Loss",
1687
                          y_lim = 0.1)
1688
1689
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
1690
1691
ggsave(plot = cur_p,
1692
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_LDS_Upper_SplitByCellLine_Comparison_BarPlot.pdf",
1693
       height = 12)
1694
1695
# Box plot
1696
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1697
                          sub_results_by = quote((merge_method == "Base Model" &
1698
                                                    drug_type == "Base Model" &
1699
                                                    nchar(data_types) <= 5 &
1700
                                                    split_method == "Split By Cell Line" &
1701
                                                    bottleneck == "No Data Bottleneck")),
1702
                          fill_by = quote(loss_type),
1703
                          bar_level_order = c("Base Model", "Base Model + LDS"),
1704
                          data_order = data_order,
1705
                          facet_by = c("TargetRange", "data_types"),
1706
                          facet_level_order = NULL,
1707
                          legend_title = "Model Type:",
1708
                          y_lim = 0.05,
1709
                          plot_type = "box_plot",
1710
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
1711
                          cur_comparisons = list(c("Base Model", "Base Model + LDS")),
1712
                          test = "wilcox.test",
1713
                          paired = T
1714
)
1715
1716
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
1717
ggsave(plot = cur_p,
1718
       filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_vs_LDS_SplitByCellLine_Comparison_BoxPlot.pdf",
1719
       height = 8)
1720
1721
# Violin plot
1722
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1723
                          sub_results_by = quote((merge_method == "Base Model" &
1724
                                                    drug_type == "Base Model" &
1725
                                                    nchar(data_types) <= 5 &
1726
                                                    split_method == "Split By Cell Line" &
1727
                                                    bottleneck == "No Data Bottleneck")),
1728
                          fill_by = quote(loss_type),
1729
                          bar_level_order = c("Base Model", "Base Model + LDS"),
1730
                          data_order = data_order,
1731
                          facet_by = c("TargetRange", "data_types"),
1732
                          facet_level_order = NULL,
1733
                          legend_title = "Model Type:",
1734
                          y_lim = 0.05,
1735
                          plot_type = "violin_plot",
1736
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
1737
                          cur_comparisons = list(c("Base Model", "Base Model + LDS")),
1738
                          test = "ks.test",
1739
                          paired = T
1740
)
1741
1742
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold")) +
1743
  expand_limits(y = c(0, 1.5))
1744
ggsave(plot = cur_p,
1745
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_LDS_SplitByCellLine_Comparison_ViolinPlot.pdf",
1746
       height = 8)
1747
1748
## Split Comparison ====
1749
# Bar plot
1750
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1751
                          sub_results_by = quote((merge_method == "Base Model" &
1752
                                                    drug_type == "Base Model" &
1753
                                                    loss_type == "Base Model + LDS" &
1754
                                                    nchar(data_types) <= 5 &
1755
                                                    bottleneck == "No Data Bottleneck")),
1756
                          fill_by = quote(split_method),
1757
                          bar_level_order = c("Split By Both Cell Line & Drug Scaffold", "Split By Cell Line", "Split By Drug Scaffold"),
1758
                          data_order = data_order,
1759
                          facet_by = quote(TargetRange),
1760
                          facet_level_order = c("Target Above 0.9",
1761
                                                "Target Between 0.7 & 0.9",
1762
                                                "Target Below 0.7"),
1763
                          legend_title = "Split Method:",
1764
                          y_lim = 0.05)
1765
1766
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LDS_Split_Comparison.pdf")
1767
1768
# Box plot
1769
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1770
                          sub_results_by = quote((merge_method == "Base Model" &
1771
                                                    drug_type == "Base Model" &
1772
                                                    nchar(data_types) <= 5 &
1773
                                                    bottleneck == "No Data Bottleneck")),
1774
                          fill_by = quote(split_method),
1775
                          bar_level_order = c("Split By Cell Line", "Split By Drug Scaffold", "Split By Both Cell Line and Drug Scaffold"),
1776
                          data_order = data_order,
1777
                          facet_by = c("TargetRange", "data_types"),
1778
                          facet_level_order = NULL,
1779
                          legend_title = "Split Type:",
1780
                          y_lim = 0.05,
1781
                          plot_type = "box_plot",
1782
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
1783
                          cur_comparisons = list(c("Split By Cell Line", "Split By Drug Scaffold"),
1784
                                                 c("Split By Cell Line", "Split By Both Cell Line and Drug Scaffold"),
1785
                                                 c("Split By Drug Scaffold", "Split By Both Cell Line and Drug Scaffold")),
1786
                          test = "t.test",
1787
                          paired = F
1788
)
1789
1790
ggsave(plot = cur_p,
1791
       filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LDS_Split_Comparison_BoxPlot.pdf",
1792
       height = 8)
1793
1794
# Bi-modal Baseline vs LMF ====
1795
all_results <- fread("Data/all_results.csv")
1796
all_results <- all_results[nchar(data_types) <= 5]
1797
all_results_copy <- all_results
1798
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "bottleneck", "TargetRange")
1799
all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
1800
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
1801
1802
## Split By Both Cell Line & Drug Scaffold ====
1803
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1804
                          sub_results_by = quote((merge_method != "Merge By Early Concat" &
1805
                                                    drug_type == "Base Model" &
1806
                                                    loss_type == "Base Model" &
1807
                                                    nchar(data_types) <= 5 &
1808
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
1809
                                                    bottleneck == "No Data Bottleneck")),
1810
                          fill_by = quote(merge_method),
1811
                          bar_level_order = c("Base Model", "Base Model + LMF", "Base Model + Sum"),
1812
                          data_order = data_order,
1813
                          facet_by = quote(TargetRange),
1814
                          facet_level_order = c("Target Above 0.7",
1815
                                                "Target Below 0.7"),
1816
                          legend_title = "Merge Method:",
1817
                          y_lim = 0.05)
1818
1819
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_vs_LMF_SplitByBoth_Comparison.pdf")
1820
1821
# Box plot
1822
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1823
                          sub_results_by = quote((merge_method != "Merge By Early Concat" &
1824
                                                    drug_type == "Base Model" &
1825
                                                    loss_type == "Base Model" &
1826
                                                    nchar(data_types) <= 5 &
1827
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
1828
                                                    bottleneck == "No Data Bottleneck")),
1829
                          fill_by = quote(merge_method),
1830
                          bar_level_order = c("Base Model", "Base Model + Sum", "Base Model + LMF"),
1831
                          data_order = data_order,
1832
                          facet_by = "data_types",
1833
                          facet_level_order = NULL,
1834
                          legend_title = "Model Type:",
1835
                          y_lim = 0.05,
1836
                          plot_type = "box_plot",
1837
                          target_sub_by = "Target Above 0.7",
1838
                          cur_comparisons = list(c("Base Model", "Base Model + Sum"),
1839
                                                 c("Base Model + Sum", "Base Model + LMF"),
1840
                                                 c("Base Model", "Base Model + LMF")),
1841
                          test = "wilcox.test",
1842
                          paired = F
1843
)
1844
1845
ggsave(plot = cur_p,
1846
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_LMF_SplitByBoth_Comparison_BoxPlot.pdf",
1847
       height = 8)
1848
1849
## Split By Drug Scaffold ====
1850
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1851
                          sub_results_by = quote((merge_method != "Merge By Early Concat" &
1852
                                                    drug_type == "Base Model" &
1853
                                                    loss_type == "Base Model" &
1854
                                                    nchar(data_types) <= 5 &
1855
                                                    split_method == "Split By Drug Scaffold" &
1856
                                                    bottleneck == "No Data Bottleneck")),
1857
                          fill_by = quote(merge_method),
1858
                          bar_level_order = c("Base Model", "Base Model + LMF", "Base Model + Sum"),
1859
                          data_order = data_order,
1860
                          facet_by = quote(TargetRange),
1861
                          facet_level_order = c("Target Above 0.7",
1862
                                                "Target Below 0.7"),
1863
                          legend_title = "Merge Method:",
1864
                          y_lim = 0.05)
1865
1866
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_LMF_SplitByDrugScaffold_Comparison.pdf")
1867
1868
# Box plot
1869
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1870
                          sub_results_by = quote((merge_method != "Merge By Early Concat" &
1871
                                                    drug_type == "Base Model" &
1872
                                                    loss_type == "Base Model" &
1873
                                                    nchar(data_types) <= 5 &
1874
                                                    split_method == "Split By Drug Scaffold" &
1875
                                                    bottleneck == "No Data Bottleneck")),
1876
                          fill_by = quote(merge_method),
1877
                          bar_level_order = c("Base Model", "Base Model + Sum", "Base Model + LMF"),
1878
                          data_order = data_order,
1879
                          facet_by = "data_types",
1880
                          facet_level_order = NULL,
1881
                          legend_title = "Model Type:",
1882
                          y_lim = 0.05,
1883
                          plot_type = "box_plot",
1884
                          target_sub_by = "Target Above 0.7",
1885
                          cur_comparisons = list(c("Base Model", "Base Model + Sum"),
1886
                                                 c("Base Model + Sum", "Base Model + LMF"),
1887
                                                 c("Base Model", "Base Model + LMF")),
1888
                          test = "wilcox.test",
1889
                          paired = F
1890
)
1891
1892
ggsave(plot = cur_p,
1893
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_LMF_SplitByDrugScaffold_Comparison_BoxPlot.pdf",
1894
       height = 8)
1895
1896
## Split By Cell Line ====
1897
# Bar plot
1898
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
1899
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
1900
1901
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1902
                          sub_results_by = quote((merge_method != "Merge By Early Concat" &
1903
                                                    drug_type == "Base Model" &
1904
                                                    loss_type == "Base Model" &
1905
                                                    nchar(data_types) <= 5 &
1906
                                                    split_method == "Split By Cell Line" &
1907
                                                    bottleneck == "No Data Bottleneck")),
1908
                          fill_by = quote(merge_method),
1909
                          bar_level_order = c("Base Model", "Base Model + LMF", "Base Model + Sum"),
1910
                          data_order = data_order,
1911
                          facet_by = c("Targeted", "TargetRange"),
1912
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
1913
                                                   c("Target Above 0.7","Target Below 0.7")),
1914
                          legend_title = "Model Type:",
1915
                          plot_type = "bar_plot",
1916
                          calculate_avg_mae = F, y_lab = "Total RMSE Loss",
1917
                          y_lim = 0.1)
1918
1919
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
1920
1921
ggsave(plot = cur_p,
1922
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_LMF_SplitByCellLine_Comparison_BarPlot.pdf",
1923
       height = 8)
1924
1925
# Box plot
1926
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1927
                          sub_results_by = quote((merge_method != "Merge By Early Concat" &
1928
                                                    drug_type == "Base Model" &
1929
                                                    loss_type == "Base Model" &
1930
                                                    nchar(data_types) <= 5 &
1931
                                                    split_method == "Split By Cell Line" &
1932
                                                    bottleneck == "No Data Bottleneck")),
1933
                          fill_by = quote(merge_method),
1934
                          bar_level_order = c("Base Model", "Base Model + Sum", "Base Model + LMF"),
1935
                          data_order = data_order,
1936
                          facet_by = "data_types",
1937
                          facet_level_order = NULL,
1938
                          legend_title = "Model Type:",
1939
                          y_lim = 0.05,
1940
                          plot_type = "box_plot",
1941
                          target_sub_by = "Target Above 0.7",
1942
                          cur_comparisons = list(c("Base Model", "Base Model + Sum"),
1943
                                                 c("Base Model + Sum", "Base Model + LMF"),
1944
                                                 c("Base Model", "Base Model + LMF")),
1945
                          test = "wilcox.test",
1946
                          paired = F
1947
)
1948
1949
ggsave(plot = cur_p,
1950
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_LMF_SplitByCellLine_Comparison_BoxPlot.pdf",
1951
       width = 15)
1952
1953
# Violin plot
1954
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1955
                          sub_results_by = quote((merge_method != "Merge By Early Concat" &
1956
                                                    drug_type == "Base Model" &
1957
                                                    loss_type == "Base Model" &
1958
                                                    nchar(data_types) <= 5 &
1959
                                                    split_method == "Split By Cell Line" &
1960
                                                    bottleneck == "No Data Bottleneck")),
1961
                          fill_by = quote(merge_method),
1962
                          bar_level_order = c("Base Model", "Base Model + Sum", "Base Model + LMF"),
1963
                          data_order = data_order,
1964
                          facet_by = "data_types",
1965
                          facet_level_order = NULL,
1966
                          legend_title = "Model Type:",
1967
                          y_lim = 0.05,
1968
                          plot_type = "violin_plot",
1969
                          target_sub_by = "Target Above 0.7",
1970
                          cur_comparisons = list(c("Base Model", "Base Model + Sum"),
1971
                                                 c("Base Model + Sum", "Base Model + LMF"),
1972
                                                 c("Base Model", "Base Model + LMF")),
1973
                          test = "ks.test",
1974
                          paired = T
1975
)
1976
1977
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold")) + expand_limits(y = c(0, 1.5))
1978
1979
ggsave(plot = cur_p,
1980
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_LMF_SplitByCellLine_Comparison_ViolinPlot.pdf",
1981
       height = 10)
1982
1983
## Split Comparison ====
1984
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
1985
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
1986
                                                    drug_type == "Base Model" &
1987
                                                    loss_type == "Base Model" &
1988
                                                    nchar(data_types) <= 5 &
1989
                                                    bottleneck == "No Data Bottleneck")),
1990
                          fill_by = quote(split_method),
1991
                          bar_level_order = c("Split By Both Cell Line & Drug Scaffold", "Split By Cell Line", "Split By Drug Scaffold"),
1992
                          data_order = data_order,
1993
                          facet_by = quote(TargetRange),
1994
                          facet_level_order = c("Target Above 0.7",
1995
                                                "Target Below 0.7"),
1996
                          legend_title = "Split Method:",
1997
                          y_lim = 0.05)
1998
1999
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LMF_Split_Comparison.pdf")
2000
2001
# Bi-Modal Baseline vs GNN ====
2002
all_results <- fread("Data/all_results.csv")
2003
all_results <- all_results[nchar(data_types) <= 5]
2004
2005
all_results_copy <- all_results
2006
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "bottleneck", "TargetRange")
2007
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
2008
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
2009
2010
## Split By Both Cell Line & Drug Scaffold ====
2011
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2012
                          sub_results_by = quote((merge_method == "Base Model" &
2013
                                                    loss_type == "Base Model" &
2014
                                                    nchar(data_types) <= 5 &
2015
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2016
                                                    bottleneck == "No Data Bottleneck")),
2017
                          fill_by = quote(drug_type),
2018
                          bar_level_order = c("Base Model", "Base Model + GNN"),
2019
                          data_order = data_order,
2020
                          facet_by = quote(TargetRange),
2021
                          facet_level_order = c("Target Above 0.7",
2022
                                                "Target Below 0.7"),
2023
                          legend_title = "Drug Model:",
2024
                          y_lim = 0.05)
2025
2026
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_GNN_SplitByBoth_Comparison.pdf")
2027
2028
# Box plot
2029
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2030
                          sub_results_by = quote((merge_method == "Base Model" &
2031
                                                    loss_type == "Base Model" &
2032
                                                    nchar(data_types) <= 5 &
2033
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2034
                                                    bottleneck == "No Data Bottleneck")),
2035
                          fill_by = quote(drug_type),
2036
                          bar_level_order = c("Base Model", "Base Model + GNN"),
2037
                          data_order = data_order,
2038
                          facet_by = "data_types",
2039
                          facet_level_order = NULL,
2040
                          legend_title = "Model Type:",
2041
                          y_lim = 0.05,
2042
                          plot_type = "box_plot",
2043
                          target_sub_by = "Target Above 0.7",
2044
                          cur_comparisons = list(c("Base Model", "Base Model + GNN")),
2045
                          test = "wilcox.test",
2046
                          paired = F
2047
)
2048
2049
ggsave(plot = cur_p,
2050
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_GNN_SplitByBoth_Comparison_BoxPlot.pdf",
2051
       height = 8)
2052
## Split By Drug Scaffold ====
2053
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
2054
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
2055
2056
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2057
                          sub_results_by = quote((merge_method == "Base Model" &
2058
                                                    loss_type == "Base Model" &
2059
                                                    nchar(data_types) <= 5 &
2060
                                                    split_method == "Split By Drug Scaffold" &
2061
                                                    bottleneck == "No Data Bottleneck")),
2062
                          fill_by = quote(drug_type),
2063
                          bar_level_order = c("Base Model", "Base Model + GNN"),
2064
                          data_order = data_order,
2065
                          facet_by = c("Targeted", "TargetRange"),
2066
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
2067
                                                   c("Target Above 0.7", "Target Below 0.7")),
2068
                          legend_title = "Model Type:",
2069
                          plot_type = "bar_plot",
2070
                          calculate_avg_mae = F,
2071
                          y_lab = "Total RMSE Loss",
2072
                          y_lim = 0.1)
2073
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2074
2075
ggsave(plot = cur_p,
2076
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_GNN_SplitByDrugScaffold_Comparison_BarPlot.pdf",
2077
       height = 8)
2078
2079
# Box plot
2080
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2081
                          sub_results_by = quote((merge_method == "Base Model" &
2082
                                                    loss_type == "Base Model" &
2083
                                                    nchar(data_types) <= 5 &
2084
                                                    split_method == "Split By Drug Scaffold" &
2085
                                                    bottleneck == "No Data Bottleneck")),
2086
                          fill_by = quote(drug_type),
2087
                          bar_level_order = c("Base Model", "Base Model + GNN"),
2088
                          data_order = data_order,
2089
                          facet_by = "data_types",
2090
                          facet_level_order = NULL,
2091
                          legend_title = "Model Type:",
2092
                          y_lim = 0.05,
2093
                          plot_type = "box_plot",
2094
                          target_sub_by = "Target Above 0.7",
2095
                          cur_comparisons = list(c("Base Model", "Base Model + GNN")),
2096
                          test = "wilcox.test",
2097
                          paired = F
2098
)
2099
2100
ggsave(plot = cur_p,
2101
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_GNN_SplitByDrugScaffold_Comparison_BoxPlot.pdf",
2102
       height = 8)
2103
2104
## Split By Cell Line ====
2105
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
2106
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
2107
2108
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2109
                          sub_results_by = quote((merge_method == "Base Model" &
2110
                                                    loss_type == "Base Model" &
2111
                                                    nchar(data_types) <= 5 &
2112
                                                    split_method == "Split By Cell Line" &
2113
                                                    bottleneck == "No Data Bottleneck")),
2114
                          fill_by = quote(drug_type),
2115
                          bar_level_order = c("Base Model", "Base Model + GNN"),
2116
                          data_order = data_order,
2117
                          facet_by = c("Targeted", "TargetRange"),
2118
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
2119
                                                   c("Target Above 0.7","Target Below 0.7")),
2120
                          legend_title = "Model Type:",
2121
                          plot_type = "bar_plot",
2122
                          calculate_avg_mae = F, y_lab = "Total RMSE Loss",
2123
                          y_lim = 0.1)
2124
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2125
2126
ggsave(plot = cur_p,
2127
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_GNN_SplitByCellLine_Comparison_BarPlot.pdf",
2128
       height = 8)
2129
2130
# Box plot
2131
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2132
                          sub_results_by = quote((merge_method == "Base Model" &
2133
                                                    loss_type == "Base Model" &
2134
                                                    nchar(data_types) <= 5 &
2135
                                                    split_method == "Split By Cell Line" &
2136
                                                    bottleneck == "No Data Bottleneck")),
2137
                          fill_by = quote(drug_type),
2138
                          bar_level_order = c("Base Model", "Base Model + GNN"),
2139
                          data_order = data_order,
2140
                          facet_by = "data_types",
2141
                          facet_level_order = NULL,
2142
                          legend_title = "Model Type:",
2143
                          y_lim = 0.05,
2144
                          plot_type = "box_plot",
2145
                          target_sub_by = "Target Above 0.7",
2146
                          cur_comparisons = list(c("Base Model", "Base Model + GNN")),
2147
                          test = "wilcox.test",
2148
                          paired = F
2149
)
2150
2151
ggsave(plot = cur_p,
2152
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_GNN_SplitByCellLine_Comparison_BoxPlot.pdf",
2153
       height = 8)
2154
2155
# Violin plot
2156
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2157
                          sub_results_by = quote((merge_method == "Base Model" &
2158
                                                    loss_type == "Base Model" &
2159
                                                    nchar(data_types) <= 5 &
2160
                                                    split_method == "Split By Cell Line" &
2161
                                                    bottleneck == "No Data Bottleneck")),
2162
                          fill_by = quote(drug_type),
2163
                          bar_level_order = c("Base Model", "Base Model + GNN"),
2164
                          data_order = data_order,
2165
                          facet_by = c("Targeted","data_types"),
2166
                          facet_level_order = NULL,
2167
                          legend_title = "Model Type:",
2168
                          y_lim = 0.05,
2169
                          plot_type = "violin_plot",
2170
                          target_sub_by = "Target Above 0.7",
2171
                          cur_comparisons = list(c("Base Model", "Base Model + GNN")),
2172
                          test = "ks.test", 
2173
                          paired = T
2174
)
2175
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2176
2177
ggsave(plot = cur_p,
2178
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_GNN_SplitByCellLine_Comparison_ViolinPlot.pdf",
2179
       height = 8)
2180
## Split Comparison ====
2181
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2182
                          sub_results_by = quote((merge_method == "Base Model" &
2183
                                                    loss_type == "Base Model" &
2184
                                                    drug_type == "Base Model + GNN" &
2185
                                                    nchar(data_types) <= 5 &
2186
                                                    bottleneck == "No Data Bottleneck")),
2187
                          fill_by = quote(split_method),
2188
                          bar_level_order = c("Split By Both Cell Line & Drug Scaffold", "Split By Cell Line", "Split By Drug Scaffold"),
2189
                          data_order = data_order,
2190
                          facet_by = quote(TargetRange),
2191
                          facet_level_order = c("Target Above 0.7",
2192
                                                "Target Below 0.7"),
2193
                          legend_title = "Split Method:",
2194
                          y_lim = 0.05)
2195
2196
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_GNN_Split_Comparison.pdf")
2197
2198
## Targeted and Untargeted drugs in upper AAC range ====
2199
all_results_copy <- all_results[TargetRange == "Target Above 0.7"]
2200
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "bottleneck", "Targeted")
2201
all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
2202
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
2203
2204
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2205
                          sub_results_by = quote((merge_method == "Base Model" &
2206
                                                    loss_type == "Base Model" &
2207
                                                    nchar(data_types) <= 5 &
2208
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2209
                                                    bottleneck == "No Data Bottleneck")),
2210
                          fill_by = quote(drug_type),
2211
                          bar_level_order = c("Base Model", "Base Model + GNN"),
2212
                          data_order = data_order,
2213
                          facet_by = quote(Targeted),
2214
                          facet_level_order = c("Targeted Drug", "Untargeted Drug"),
2215
                          legend_title = "Drug Model:",
2216
                          y_lim = 0.05)
2217
2218
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_vs_GNN_Targeted_vs_Untargeted_SplitByBoth_Comparison.pdf")
2219
2220
# Bi-modal LMF + GNN without LDS (Split By Both Cell Line & Drug Scaffold) ====
2221
all_results <- fread("Data/all_results.csv")
2222
all_results <- all_results[nchar(data_types) <= 5]
2223
2224
all_results_copy <- all_results
2225
# all_results_copy[target > 0.7 & target < 0.9]$TargetRange <- "Target Between 0.7 & 0.9"
2226
# all_results_copy[target >= 0.9]$TargetRange <- "Target Above 0.9"
2227
2228
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
2229
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
2230
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
2231
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
2232
2233
# Must rename some columns to better distinguish differences on the plot
2234
all_results_copy[loss_type == "Base Model", loss_type := "LMF + GNN"]
2235
all_results_copy[loss_type == "Base Model + LDS", loss_type := "LDS + LMF + GNN"]
2236
2237
## Split By Both Cell Line & Drug Scaffold ====
2238
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2239
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2240
                                                    drug_type == "Base Model + GNN" &
2241
                                                    nchar(data_types) <= 5 &
2242
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2243
                                                    bottleneck == "No Data Bottleneck")),
2244
                          fill_by = quote(loss_type),
2245
                          bar_level_order = c("LMF + GNN", "LDS + LMF + GNN"),
2246
                          data_order = data_order,
2247
                          facet_by = quote(TargetRange),
2248
                          facet_level_order = c("Target Above 0.9",
2249
                                                "Target Between 0.7 & 0.9",
2250
                                                "Target Below 0.7"),
2251
                          legend_title = "Loss Type:",
2252
                          y_lim = 0.05)
2253
2254
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LDS_SplitByBoth_Comparison.pdf")
2255
2256
# Box plot
2257
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2258
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2259
                                                    drug_type == "Base Model + GNN" &
2260
                                                    nchar(data_types) <= 5 &
2261
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2262
                                                    bottleneck == "No Data Bottleneck")),
2263
                          fill_by = quote(loss_type),
2264
                          bar_level_order = c("LMF + GNN", "LDS + LMF + GNN"),
2265
                          data_order = data_order,
2266
                          facet_by = c("TargetRange", "data_types"),
2267
                          facet_level_order = NULL,
2268
                          legend_title = "Model Type:",
2269
                          y_lim = 0.05,
2270
                          plot_type = "box_plot",
2271
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
2272
                          cur_comparisons = list(c("LMF + GNN", "LDS + LMF + GNN")),
2273
                          test = "wilcox.test",
2274
                          paired = F
2275
)
2276
2277
ggsave(plot = cur_p,
2278
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LDS_SplitByBoth_Comparison_BoxPlot.pdf",
2279
       height = 8)
2280
2281
## Split By Drug Scaffold ====
2282
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2283
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2284
                                                    drug_type == "Base Model + GNN" &
2285
                                                    nchar(data_types) <= 5 &
2286
                                                    split_method == "Split By Drug Scaffold" &
2287
                                                    bottleneck == "No Data Bottleneck")),
2288
                          fill_by = quote(loss_type),
2289
                          bar_level_order = c("LMF + GNN", "LDS + LMF + GNN"),
2290
                          data_order = data_order,
2291
                          facet_by = quote(TargetRange),
2292
                          facet_level_order = c("Target Above 0.9",
2293
                                                "Target Between 0.7 & 0.9",
2294
                                                "Target Below 0.7"),
2295
                          legend_title = "Model Type:",
2296
                          y_lim = 0.05)
2297
2298
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LDS_SplitByDrugScaffold_Comparison.pdf")
2299
2300
# Box plot
2301
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2302
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2303
                                                    drug_type == "Base Model + GNN" &
2304
                                                    nchar(data_types) <= 5 &
2305
                                                    split_method == "Split By Drug Scaffold" &
2306
                                                    bottleneck == "No Data Bottleneck")),
2307
                          fill_by = quote(loss_type),
2308
                          bar_level_order = c("LMF + GNN", "LDS + LMF + GNN"),
2309
                          data_order = data_order,
2310
                          facet_by = c("TargetRange", "data_types"),
2311
                          facet_level_order = NULL,
2312
                          legend_title = "Model Type:",
2313
                          y_lim = 0.05,
2314
                          plot_type = "box_plot",
2315
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
2316
                          cur_comparisons = list(c("LMF + GNN", "LDS + LMF + GNN")),
2317
                          test = "wilcox.test",
2318
                          paired = F
2319
)
2320
2321
ggsave(plot = cur_p,
2322
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LDS_SplitByDrugScaffold_Comparison_BoxPlot.pdf",
2323
       height = 8)
2324
2325
## Split By Cell Line ====
2326
2327
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2328
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2329
                                                    drug_type == "Base Model + GNN" &
2330
                                                    nchar(data_types) <= 5 &
2331
                                                    split_method == "Split By Cell Line" &
2332
                                                    bottleneck == "No Data Bottleneck")),
2333
                          fill_by = quote(loss_type),
2334
                          bar_level_order = c("LMF + GNN", "LDS + LMF + GNN"),
2335
                          data_order = data_order,
2336
                          facet_by = c("Targeted", "TargetRange"), 
2337
                          # facet_level_order = c("Target Above 0.9",
2338
                          #                       "Target Between 0.7 & 0.9",
2339
                          #                       "Target Below 0.7"),
2340
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
2341
                                                   c("Target Above 0.7", "Target Below 0.7")),
2342
                          # target_sub_by = c("Target Above 0.9", "Target Between 0.7 & 0.9"),
2343
                          legend_title = "Model Type:",
2344
                          calculate_avg_mae = F, y_lab = "Total RMSE Loss",
2345
                          y_lim = 0.1)
2346
2347
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2348
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LDS_SplitByCellLine_Comparison_BarPlot.pdf",
2349
       height = 8)
2350
2351
# Box plot
2352
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2353
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2354
                                                    drug_type == "Base Model + GNN" &
2355
                                                    nchar(data_types) <= 5 &
2356
                                                    split_method == "Split By Cell Line" &
2357
                                                    bottleneck == "No Data Bottleneck")),
2358
                          fill_by = quote(loss_type),
2359
                          bar_level_order = c("LMF + GNN", "LDS + LMF + GNN"),
2360
                          data_order = data_order,
2361
                          facet_by = c("TargetRange", "data_types"),
2362
                          facet_level_order = NULL,
2363
                          legend_title = "Model Type:",
2364
                          y_lim = 0.05,
2365
                          plot_type = "box_plot",
2366
                          target_sub_by = c("Target Between 0.7 & 0.9", "Target Above 0.9"),
2367
                          cur_comparisons = list(c("LMF + GNN", "LDS + LMF + GNN")),
2368
                          test = "wilcox.test",
2369
                          paired = F
2370
)
2371
2372
ggsave(plot = cur_p,
2373
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LDS_SplitByCellLine_Comparison_BoxPlot.pdf",
2374
       height = 8)
2375
2376
# Violin plot
2377
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2378
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2379
                                                    drug_type == "Base Model + GNN" &
2380
                                                    nchar(data_types) <= 5 &
2381
                                                    split_method == "Split By Cell Line" &
2382
                                                    bottleneck == "No Data Bottleneck")),
2383
                          fill_by = quote(loss_type),
2384
                          bar_level_order = c("LMF + GNN", "LDS + LMF + GNN"),
2385
                          data_order = data_order,
2386
                          facet_by = c("Targeted", "data_types"), 
2387
                          plot_type = "violin_plot",
2388
                          # facet_level_order = c("Target Above 0.9",
2389
                          #                       "Target Between 0.7 & 0.9",
2390
                          #                       "Target Below 0.7"),
2391
                          # facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
2392
                          #                          c("Target Above 0.7", "Target Below 0.7")),
2393
                          cur_comparisons = list(c("LMF + GNN", "LDS + LMF + GNN")),
2394
                          target_sub_by = c("Target Above 0.7"),
2395
                          legend_title = "Model Type:",
2396
                          calculate_avg_mae = F, y_lab = "Total RMSE Loss",
2397
                          test = "ks.test", paired = T,
2398
                          y_lim = 0.1)
2399
2400
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2401
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LDS_SplitByCellLine_Comparison_ViolinPlot.pdf",
2402
       height = 8)
2403
2404
2405
## Split Comparison ====
2406
# GNN + LMF - LDS
2407
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2408
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2409
                                                    drug_type == "Base Model + GNN" &
2410
                                                    loss_type == "Base Model" &
2411
                                                    nchar(data_types) <= 5 &
2412
                                                    bottleneck == "No Data Bottleneck")),
2413
                          fill_by = quote(split_method),
2414
                          bar_level_order = c("Split By Both Cell Line & Drug Scaffold", "Split By Cell Line", "Split By Drug Scaffold"),
2415
                          data_order = data_order,
2416
                          facet_by = quote(TargetRange),
2417
                          facet_level_order = c("Target Above 0.9",
2418
                                                "Target Between 0.7 & 0.9",
2419
                                                "Target Below 0.7"),
2420
                          legend_title = "Split Method:",
2421
                          y_lim = 0.05)
2422
2423
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Trifecta_without_LDS_Split_Comparison.pdf")
2424
2425
# Bi-modal LMF + LDS without GNN ====
2426
all_results <- fread("Data/all_results.csv")
2427
all_results <- all_results[nchar(data_types) <= 5]
2428
2429
## Upper vs Lower AAC Range ====
2430
all_results_copy <- all_results
2431
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
2432
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
2433
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
2434
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
2435
2436
# Must rename some columns to better distinguish differences on the plot
2437
all_results_copy[drug_type == "Base Model", drug_type := "LDS + LMF"]
2438
all_results_copy[drug_type == "Base Model + GNN", drug_type := "LDS + LMF + GNN"]
2439
2440
table(all_results_copy[merge_method == "Base Model + LMF" &
2441
                         loss_type == "Base Model + LDS" & nchar(data_types) <= 5 &
2442
                         split_method == "Split By Both Cell Line & Drug Scaffold" &
2443
                         bottleneck == "No Data Bottleneck"]$drug_type)
2444
### Split By Both Cell Line & Drug Scaffold ====
2445
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2446
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2447
                                                    loss_type == "Base Model + LDS" &
2448
                                                    nchar(data_types) <= 5 &
2449
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2450
                                                    bottleneck == "No Data Bottleneck")),
2451
                          fill_by = quote(drug_type),
2452
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2453
                          data_order = data_order,
2454
                          facet_by = quote(TargetRange),
2455
                          facet_level_order = c("Target Above 0.7",
2456
                                                "Target Below 0.7"),
2457
                          legend_title = "Model Type:",
2458
                          y_lim = 0.05)
2459
2460
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_GNN_Upper_vs_Lower_SplitByBoth_Comparison.pdf")
2461
2462
# Box plot
2463
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2464
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2465
                                                    loss_type == "Base Model + LDS" &
2466
                                                    nchar(data_types) <= 5 &
2467
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2468
                                                    bottleneck == "No Data Bottleneck")),
2469
                          fill_by = quote(drug_type),
2470
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2471
                          data_order = data_order,
2472
                          facet_by = "data_types",
2473
                          facet_level_order = NULL,
2474
                          legend_title = "Model Type:",
2475
                          y_lim = 0.05,
2476
                          plot_type = "box_plot",
2477
                          target_sub_by = "Target Above 0.7",
2478
                          cur_comparisons = list(c("LDS + LMF", "LDS + LMF + GNN")),
2479
                          test = "wilcox.test",
2480
                          paired = F
2481
)
2482
2483
ggsave(plot = cur_p,
2484
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_GNN_SplitByBoth_Comparison_BoxPlot.pdf")
2485
2486
### Split By Drug Scaffold ====
2487
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2488
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2489
                                                    loss_type == "Base Model + LDS" &
2490
                                                    nchar(data_types) <= 5 &
2491
                                                    split_method == "Split By Drug Scaffold" &
2492
                                                    TargetRange == "Target Above 0.7" &
2493
                                                    bottleneck == "No Data Bottleneck")),
2494
                          fill_by = quote(drug_type),
2495
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2496
                          data_order = data_order,
2497
                          facet_by = c("Targeted", "TargetRange"),
2498
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
2499
                                                   c("Target Above 0.7", "Target Below 0.7")),
2500
                          legend_title = "Model Type:",
2501
                          calculate_avg_mae = F,
2502
                          y_lab = "Total RMSE Loss",
2503
                          add_mean = F,
2504
                          y_lim = 0.05)
2505
2506
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2507
ggsave(plot = cur_p,
2508
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_GNN_Upper_vs_Lower_SplitByDrugScaffold_Comparison_BarPlot.pdf")
2509
2510
# Box plot
2511
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2512
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2513
                                                    loss_type == "Base Model + LDS" &
2514
                                                    nchar(data_types) <= 5 &
2515
                                                    split_method == "Split By Drug Scaffold" &
2516
                                                    bottleneck == "No Data Bottleneck")),
2517
                          fill_by = quote(drug_type),
2518
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2519
                          data_order = data_order,
2520
                          facet_by = "data_types",
2521
                          facet_level_order = NULL,
2522
                          legend_title = "Model Type:",
2523
                          y_lim = 0.05,
2524
                          plot_type = "box_plot",
2525
                          target_sub_by = "Target Above 0.7",
2526
                          cur_comparisons = list(c("LDS + LMF", "LDS + LMF + GNN")),
2527
                          test = "wilcox.test",
2528
                          paired = F
2529
)
2530
2531
ggsave(plot = cur_p,
2532
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_GNN_SplitByDrugScaffold_Comparison_BoxPlot.pdf")
2533
2534
# Violin plot
2535
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2536
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2537
                                                    loss_type == "Base Model + LDS" &
2538
                                                    nchar(data_types) <= 5 &
2539
                                                    split_method == "Split By Drug Scaffold" &
2540
                                                    TargetRange == "Target Above 0.7" &
2541
                                                    bottleneck == "No Data Bottleneck")),
2542
                          fill_by = quote(drug_type),
2543
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2544
                          data_order = data_order,
2545
                          facet_by = c("Targeted", "data_types"),
2546
                          facet_level_order = NULL,
2547
                          # facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
2548
                          #                          c("Target Above 0.7", "Target Below 0.7")),
2549
                          legend_title = "Model Type:",
2550
                          y_lab = "Total RMSE Loss",
2551
                          add_mean = F,
2552
                          plot_type = "violin_plot",
2553
                          cur_comparisons = list(c("LDS + LMF", "LDS + LMF + GNN")),
2554
                          test = "ks.test",
2555
                          paired = T,
2556
                          y_lim = 0.05)
2557
2558
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2559
ggsave(plot = cur_p,
2560
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_GNN_Upper_vs_Lower_SplitByDrugScaffold_Comparison_ViolinPlot.pdf")
2561
2562
### Split By Cell Line ====
2563
table(all_results_copy[merge_method == "Base Model + LMF" &
2564
                         loss_type == "Base Model + LDS" & nchar(data_types) <= 5 &
2565
                         split_method == "Split By Cell Line" &
2566
                         bottleneck == "No Data Bottleneck"]$drug_type)
2567
2568
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2569
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2570
                                                    loss_type == "Base Model + LDS" &
2571
                                                    nchar(data_types) <= 5 &
2572
                                                    split_method == "Split By Cell Line" &
2573
                                                    TargetRange == "Target Above 0.7" &
2574
                                                    bottleneck == "No Data Bottleneck")),
2575
                          fill_by = quote(drug_type),
2576
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2577
                          data_order = data_order,
2578
                          facet_by = c("Targeted", "TargetRange"),
2579
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
2580
                                                   c("Target Above 0.7", "Target Below 0.7")),
2581
                          legend_title = "Model Type:",
2582
                          calculate_avg_mae = F, y_lab = "Total RMSE Loss",
2583
                          add_mean = T,
2584
                          y_lim = 0.05)
2585
2586
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2587
ggsave(plot = cur_p,
2588
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_GNN_Upper_vs_Lower_SplitByCellLine_Comparison_BarPlot.pdf",
2589
       height = 10)
2590
2591
# Box plot
2592
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2593
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2594
                                                    loss_type == "Base Model + LDS" &
2595
                                                    nchar(data_types) <= 5 &
2596
                                                    split_method == "Split By Cell Line" &
2597
                                                    bottleneck == "No Data Bottleneck")),
2598
                          fill_by = quote(drug_type),
2599
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2600
                          data_order = data_order,
2601
                          facet_by = "data_types",
2602
                          facet_level_order = NULL,
2603
                          legend_title = "Model Type:",
2604
                          y_lim = 0.05,
2605
                          plot_type = "box_plot",
2606
                          target_sub_by = "Target Above 0.7",
2607
                          cur_comparisons = list(c("LDS + LMF", "LDS + LMF + GNN")),
2608
                          test = "wilcox.test",
2609
                          paired = F
2610
)
2611
2612
ggsave(plot = cur_p,
2613
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_GNN_SplitByCellLine_Comparison_BoxPlot.pdf")
2614
2615
# Violin plot
2616
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2617
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2618
                                                    loss_type == "Base Model + LDS" &
2619
                                                    nchar(data_types) <= 5 &
2620
                                                    split_method == "Split By Cell Line" &
2621
                                                    bottleneck == "No Data Bottleneck")),
2622
                          fill_by = quote(drug_type),
2623
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2624
                          data_order = data_order,
2625
                          facet_by = c("Targeted", "data_types"),
2626
                          facet_level_order = NULL,
2627
                          legend_title = "Model Type:",
2628
                          y_lim = 0.05,
2629
                          plot_type = "violin_plot",
2630
                          target_sub_by = "Target Above 0.7",
2631
                          cur_comparisons = list(c("LDS + LMF", "LDS + LMF + GNN")),
2632
                          test = "ks.test",
2633
                          paired = T
2634
)
2635
2636
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2637
ggsave(plot = cur_p,
2638
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_GNN_SplitByCellLine_Comparison_ViolinPlot.pdf",
2639
       height = 8)
2640
2641
### Split Comparison ====
2642
# LDS + LMF - GNN
2643
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2644
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2645
                                                    loss_type == "Base Model + LDS" &
2646
                                                    drug_type == "Base Model" &
2647
                                                    nchar(data_types) <= 5 &
2648
                                                    bottleneck == "No Data Bottleneck")),
2649
                          fill_by = quote(split_method),
2650
                          bar_level_order = c("Split By Both Cell Line & Drug Scaffold", "Split By Cell Line", "Split By Drug Scaffold"),
2651
                          data_order = data_order,
2652
                          facet_by = quote(TargetRange),
2653
                          facet_level_order = c("Target Above 0.7",
2654
                                                "Target Below 0.7"),
2655
                          legend_title = "Split Method:",
2656
                          y_lim = 0.05)
2657
2658
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Trifecta_without_GNN_Upper_vs_Lower_Split_Comparison.pdf")
2659
2660
## Targeted vs Untargeted Drugs ==== 
2661
all_results_copy <- all_results[TargetRange == "Target Above 0.7"]
2662
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "bottleneck", "Targeted")
2663
all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
2664
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
2665
2666
### Split By Both Cell Line & Drug Scaffold ====
2667
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2668
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2669
                                                    loss_type == "Base Model + LDS" &
2670
                                                    nchar(data_types) <= 5 &
2671
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2672
                                                    bottleneck == "No Data Bottleneck")),
2673
                          fill_by = quote(drug_type),
2674
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2675
                          data_order = data_order,
2676
                          facet_by = quote(Targeted),
2677
                          facet_level_order = c("Targeted Drug", "Untargeted Drug"),
2678
                          legend_title = "Model Type:",
2679
                          y_lim = 0.05)
2680
2681
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_without_GNN_Targeted_vs_Untargeted_Upper_0.7_SplitByBoth_Comparison.pdf")
2682
2683
# Box plot
2684
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2685
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2686
                                                    loss_type == "Base Model + LDS" &
2687
                                                    nchar(data_types) <= 5 &
2688
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2689
                                                    bottleneck == "No Data Bottleneck")),
2690
                          fill_by = quote(drug_type),
2691
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2692
                          data_order = data_order,
2693
                          facet_by = c("Targeted", "data_types"),
2694
                          facet_level_order = NULL,
2695
                          legend_title = "Model Type:",
2696
                          y_lim = 0.05,
2697
                          plot_type = "box_plot",
2698
                          target_sub_by = "Target Above 0.7",
2699
                          cur_comparisons = list(c("LDS + LMF", "LDS + LMF + GNN")),
2700
                          test = "wilcox.test",
2701
                          paired = F
2702
)
2703
2704
ggsave(plot = cur_p,
2705
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_without_GNN_Targeted_vs_Untargeted_Upper_0.7_SplitByBoth_Comparison_BoxPlot.pdf",
2706
       height = 8)
2707
2708
### Split By Drug Scaffold ====
2709
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2710
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2711
                                                    loss_type == "Base Model + LDS" &
2712
                                                    nchar(data_types) <= 5 &
2713
                                                    split_method == "Split By Drug Scaffold" &
2714
                                                    bottleneck == "No Data Bottleneck")),
2715
                          fill_by = quote(drug_type),
2716
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2717
                          data_order = data_order,
2718
                          facet_by = quote(Targeted),
2719
                          facet_level_order = c("Targeted Drug", "Untargeted Drug"),
2720
                          legend_title = "Model Type:",
2721
                          y_lim = 0.05)
2722
2723
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_without_GNN_Targeted_vs_Untargeted_Upper_0.7_SplitByDrugScaffold_Comparison.pdf")
2724
2725
# Box plot
2726
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2727
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2728
                                                    loss_type == "Base Model + LDS" &
2729
                                                    nchar(data_types) <= 5 &
2730
                                                    split_method == "Split By Drug Scaffold" &
2731
                                                    bottleneck == "No Data Bottleneck")),
2732
                          fill_by = quote(drug_type),
2733
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2734
                          data_order = data_order,
2735
                          facet_by = c("Targeted", "data_types"),
2736
                          facet_level_order = NULL,
2737
                          legend_title = "Model Type:",
2738
                          y_lim = 0.05,
2739
                          plot_type = "box_plot",
2740
                          target_sub_by = "Target Above 0.7",
2741
                          cur_comparisons = list(c("LDS + LMF", "LDS + LMF + GNN")),
2742
                          test = "wilcox.test",
2743
                          paired = F
2744
)
2745
2746
ggsave(plot = cur_p,
2747
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_without_GNN_Targeted_vs_Untargeted_Upper_0.7_SplitByDrugScaffold_Comparison_BoxPlot.pdf",
2748
       height = 8)
2749
2750
### Split By Cell Line ====
2751
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2752
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2753
                                                    loss_type == "Base Model + LDS" &
2754
                                                    nchar(data_types) <= 5 &
2755
                                                    split_method == "Split By Cell Line" &
2756
                                                    bottleneck == "No Data Bottleneck")),
2757
                          fill_by = quote(drug_type),
2758
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2759
                          data_order = data_order,
2760
                          facet_by = quote(Targeted),
2761
                          facet_level_order = c("Targeted Drug", "Untargeted Drug"),
2762
                          legend_title = "Model Type:",
2763
                          y_lim = 0.05)
2764
2765
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_without_GNN_Targeted_vs_Untargeted_Upper_0.7_SplitByCellLine_Comparison.pdf")
2766
2767
# Box plot
2768
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2769
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2770
                                                    loss_type == "Base Model + LDS" &
2771
                                                    nchar(data_types) <= 5 &
2772
                                                    split_method == "Split By Cell Line" &
2773
                                                    bottleneck == "No Data Bottleneck")),
2774
                          fill_by = quote(drug_type),
2775
                          bar_level_order = c("LDS + LMF", "LDS + LMF + GNN"),
2776
                          data_order = data_order,
2777
                          facet_by = c("Targeted", "data_types"),
2778
                          facet_level_order = NULL,
2779
                          legend_title = "Model Type:",
2780
                          y_lim = 0.05,
2781
                          plot_type = "box_plot",
2782
                          target_sub_by = "Target Above 0.7",
2783
                          cur_comparisons = list(c("LDS + LMF", "LDS + LMF + GNN")),
2784
                          test = "wilcox.test",
2785
                          paired = F
2786
)
2787
2788
ggsave(plot = cur_p,
2789
       filename = "Plots/CV_Results/Bimodal_CV_Trifecta_without_GNN_Targeted_vs_Untargeted_Upper_0.7_SplitByCellLine_Comparison_BoxPlot.pdf",
2790
       height = 8)
2791
2792
### Split Comparison ====
2793
# LDS + LMF - GNN, Upper Range, Targeted vs Untargeted
2794
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2795
                          sub_results_by = quote((merge_method == "Base Model + LMF" &
2796
                                                    loss_type == "Base Model + LDS" &
2797
                                                    drug_type == "Base Model" &
2798
                                                    nchar(data_types) <= 5 &
2799
                                                    bottleneck == "No Data Bottleneck")),
2800
                          fill_by = quote(split_method),
2801
                          bar_level_order = c("Split By Both Cell Line & Drug Scaffold", "Split By Cell Line", "Split By Drug Scaffold"),
2802
                          data_order = data_order,
2803
                          facet_by = quote(Targeted),
2804
                          facet_level_order = c("Targeted Drug", "Untargeted Drug"),
2805
                          legend_title = "Drug Model:",
2806
                          y_lim = 0.05)
2807
2808
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Trifecta_without_GNN_Targeted_vs_Untargeted_Upper_0.7_Split_Comparison.pdf")
2809
2810
2811
# Bi-modal LDS + GNN without LMF ====
2812
all_results <- fread("Data/all_results.csv")
2813
all_results <- all_results[nchar(data_types) <= 5]
2814
2815
all_results_copy <- all_results
2816
2817
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
2818
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
2819
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
2820
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
2821
2822
# Must rename some columns to better distinguish differences on the plot
2823
all_results_copy[merge_method == "Base Model", merge_method := "LDS + GNN"]
2824
all_results_copy[merge_method == "Base Model + LMF", merge_method := "LDS + LMF + GNN"]
2825
all_results_copy[merge_method == "Base Model + Sum", merge_method := "LDS + Sum + GNN"]
2826
2827
table(all_results_copy[(loss_type == "Base Model + LDS" &
2828
                          drug_type == "Base Model + GNN" &
2829
                          nchar(data_types) <= 5 &
2830
                          split_method == "Split By Drug Scaffold" &
2831
                          bottleneck == "No Data Bottleneck")]$merge_method)
2832
2833
## Split By Both Cell Line & Drug Scaffold ====
2834
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2835
                          sub_results_by = quote((loss_type == "Base Model + LDS" &
2836
                                                    drug_type == "Base Model + GNN" &
2837
                                                    nchar(data_types) <= 5 &
2838
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2839
                                                    bottleneck == "No Data Bottleneck")),
2840
                          fill_by = quote(merge_method),
2841
                          bar_level_order = c("LDS + GNN", "LDS + Sum + GNN", "LDS + LMF + GNN"),
2842
                          data_order = data_order,
2843
                          facet_by = quote(TargetRange),
2844
                          facet_level_order = c("Target Above 0.7",
2845
                                                "Target Below 0.7"),
2846
                          legend_title = "Model Type:",
2847
                          y_lim = 0.05)
2848
2849
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LMF_SplitByBoth_Comparison.pdf")
2850
2851
# Box plot
2852
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2853
                          sub_results_by = quote((loss_type == "Base Model + LDS" &
2854
                                                    drug_type == "Base Model + GNN" &
2855
                                                    nchar(data_types) <= 5 &
2856
                                                    split_method == "Split By Both Cell Line & Drug Scaffold" &
2857
                                                    bottleneck == "No Data Bottleneck")),
2858
                          fill_by = quote(merge_method),
2859
                          bar_level_order = c("LDS + GNN", "LDS + Sum + GNN", "LDS + LMF + GNN"),
2860
                          data_order = data_order,
2861
                          facet_by = "data_types",
2862
                          facet_level_order = NULL,
2863
                          legend_title = "Model Type:",
2864
                          y_lim = 0.05,
2865
                          plot_type = "box_plot",
2866
                          target_sub_by = "Target Above 0.7",
2867
                          cur_comparisons = list(c("LDS + GNN", "LDS + Sum + GNN"),
2868
                                                 c("LDS + Sum + GNN", "LDS + LMF + GNN"),
2869
                                                 c("LDS + GNN", "LDS + LMF + GNN")),
2870
                          test = "wilcox.test",
2871
                          paired = F
2872
)
2873
2874
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LMF_SplitByBoth_Comparison_BoxPlot.pdf")
2875
2876
## Split By Drug Scaffold ====
2877
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2878
                          sub_results_by = quote((loss_type == "Base Model + LDS" &
2879
                                                    drug_type == "Base Model + GNN" &
2880
                                                    nchar(data_types) <= 5 &
2881
                                                    split_method == "Split By Drug Scaffold" &
2882
                                                    bottleneck == "No Data Bottleneck")),
2883
                          fill_by = quote(merge_method),
2884
                          bar_level_order = c("LDS + GNN", "LDS + LMF + GNN"),
2885
                          data_order = data_order,
2886
                          facet_by = quote(TargetRange),
2887
                          facet_level_order = c("Target Above 0.7",
2888
                                                "Target Below 0.7"),
2889
                          legend_title = "Model Type:",
2890
                          y_lim = 0.05)
2891
2892
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LMF_SplitByDrugScaffold_Comparison.pdf")
2893
2894
# Box plot
2895
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2896
                          sub_results_by = quote((loss_type == "Base Model + LDS" &
2897
                                                    drug_type == "Base Model + GNN" &
2898
                                                    nchar(data_types) <= 5 &
2899
                                                    split_method == "Split By Drug Scaffold" &
2900
                                                    bottleneck == "No Data Bottleneck")),
2901
                          fill_by = quote(merge_method),
2902
                          bar_level_order = c("LDS + GNN", "LDS + LMF + GNN"),
2903
                          data_order = data_order,
2904
                          facet_by = "data_types",
2905
                          facet_level_order = NULL,
2906
                          legend_title = "Model Type:",
2907
                          y_lim = 0.05,
2908
                          plot_type = "box_plot",
2909
                          target_sub_by = "Target Above 0.7",
2910
                          cur_comparisons = list(c("LDS + GNN", "LDS + LMF + GNN")),
2911
                          test = "wilcox.test",
2912
                          paired = F
2913
)
2914
2915
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LMF_SplitByDrugScaffold_Comparison_BoxPlot.pdf")
2916
2917
## Split By Cell Line ====
2918
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
2919
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
2920
2921
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2922
                          sub_results_by = quote((loss_type == "Base Model + LDS" &
2923
                                                    drug_type == "Base Model + GNN" &
2924
                                                    nchar(data_types) <= 5 &
2925
                                                    split_method == "Split By Cell Line" &
2926
                                                    bottleneck == "No Data Bottleneck")),
2927
                          fill_by = quote(merge_method),
2928
                          bar_level_order = c("LDS + GNN", "LDS + Sum + GNN", "LDS + LMF + GNN"),
2929
                          data_order = data_order,
2930
                          plot_type = "bar_plot",
2931
                          facet_by = c("Targeted", "TargetRange"),
2932
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
2933
                                                   c("Target Above 0.7", "Target Below 0.7")),
2934
                          legend_title = "Model Type:",
2935
                          calculate_avg_mae = F, y_lab = "Total RMSE Loss",
2936
                          y_lim = 0.1)
2937
2938
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
2939
2940
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LMF_SplitByCellLine_Comparison_BarPlot.pdf",
2941
       height = 10)
2942
2943
# Box plot
2944
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2945
                          sub_results_by = quote((loss_type == "Base Model + LDS" &
2946
                                                    drug_type == "Base Model + GNN" &
2947
                                                    nchar(data_types) <= 5 &
2948
                                                    split_method == "Split By Cell Line" &
2949
                                                    bottleneck == "No Data Bottleneck")),
2950
                          fill_by = quote(merge_method),
2951
                          bar_level_order = c("LDS + GNN", "LDS + Sum + GNN", "LDS + LMF + GNN"),
2952
                          data_order = data_order,
2953
                          facet_by = "data_types",
2954
                          facet_level_order = NULL,
2955
                          legend_title = "Model Type:",
2956
                          y_lim = 0.05,
2957
                          plot_type = "box_plot",
2958
                          target_sub_by = "Target Above 0.7",
2959
                          cur_comparisons = list(c("LDS + GNN", "LDS + Sum + GNN"),
2960
                                                 c("LDS + Sum + GNN", "LDS + LMF + GNN"),
2961
                                                 c("LDS + GNN", "LDS + LMF + GNN")),
2962
                          test = "wilcox.test",
2963
                          paired = F
2964
)
2965
2966
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LMF_SplitByCellLine_Comparison_BoxPlot.pdf")
2967
2968
# Violin plot
2969
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2970
                          sub_results_by = quote((loss_type == "Base Model + LDS" &
2971
                                                    drug_type == "Base Model + GNN" &
2972
                                                    nchar(data_types) <= 5 &
2973
                                                    split_method == "Split By Cell Line" &
2974
                                                    bottleneck == "No Data Bottleneck")),
2975
                          fill_by = quote(merge_method),
2976
                          bar_level_order = c("LDS + GNN", "LDS + Sum + GNN", "LDS + LMF + GNN"),
2977
                          data_order = data_order,
2978
                          facet_by = c("Targeted", "data_types"),
2979
                          facet_level_order = NULL,
2980
                          legend_title = "Model Type:",
2981
                          y_lim = 0.05,
2982
                          plot_type = "violin_plot",
2983
                          target_sub_by = "Target Above 0.7",
2984
                          cur_comparisons = list(c("LDS + GNN", "LDS + Sum + GNN"),
2985
                                                 c("LDS + Sum + GNN", "LDS + LMF + GNN"),
2986
                                                 c("LDS + GNN", "LDS + LMF + GNN")),
2987
                          test = "ks.test", step_increase = 0.075,
2988
                          paired = T
2989
)
2990
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold")) +
2991
  expand_limits(y = c(0, 1.7))
2992
2993
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_minus_LMF_SplitByCellLine_Comparison_ViolinPlot.pdf",
2994
       height = 12)
2995
2996
## Split Comparison ====
2997
# LDS + GNN - LMF
2998
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
2999
                          sub_results_by = quote((loss_type == "Base Model + LDS" &
3000
                                                    drug_type == "Base Model + GNN" &
3001
                                                    merge_method == "Base Model" &
3002
                                                    nchar(data_types) <= 5 &
3003
                                                    bottleneck == "No Data Bottleneck")),
3004
                          fill_by = quote(split_method),
3005
                          bar_level_order = c("Split By Both Cell Line & Drug Scaffold", "Split By Cell Line", "Split By Drug Scaffold"),
3006
                          data_order = data_order,
3007
                          facet_by = quote(TargetRange),
3008
                          facet_level_order = c("Target Above 0.7",
3009
                                                "Target Below 0.7"),
3010
                          legend_title = "Split Method:",
3011
                          y_lim = 0.05)
3012
3013
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_per_fold_Trifecta_without_LMF_Split_Comparison.pdf")
3014
3015
# Bi-modal Baseline vs Trifecta ====
3016
# all_results <- fread("Data/all_results.csv")
3017
all_results_copy <- all_results
3018
3019
avg_loss_by <- c("data_types", "merge_method", "loss_type", "drug_type",
3020
                 "split_method", "fold", "bottleneck", "TargetRange", "Targeted")
3021
# all_results_copy[, loss_by_config := mean(RMSELoss), by = avg_loss_by]
3022
data_order <- c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA")
3023
3024
all_results_copy[(merge_method == "Base Model + LMF" & drug_type == "Base Model + GNN" & loss_type == "Base Model + LDS"), config_type := "Trifecta"]
3025
all_results_copy[(merge_method == "Base Model" & drug_type == "Base Model" & loss_type == "Base Model"), config_type := "Baseline"]
3026
all_results_copy <- all_results_copy[config_type == "Trifecta" | config_type == "Baseline"]
3027
3028
avg_loss_by <- c(avg_loss_by, "config_type")
3029
3030
table(all_results_copy[split_method == "Split By Both Cell Line & Drug Scaffold" &
3031
                   nchar(data_types) <= 5 &
3032
                   bottleneck == "No Data Bottleneck"]$config_type)
3033
3034
## Split By Both Cell Line & Drug Scaffold ====
3035
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
3036
                          sub_results_by = quote(split_method == "Split By Both Cell Line & Drug Scaffold" &
3037
                                                   nchar(data_types) <= 5 &
3038
                                                   bottleneck == "No Data Bottleneck"),
3039
                          fill_by = quote(config_type),
3040
                          bar_level_order = c("Baseline", "Trifecta"),
3041
                          data_order = data_order,
3042
                          facet_by = quote(TargetRange),
3043
                          facet_level_order = c("Target Above 0.7",
3044
                                                "Target Below 0.7"),
3045
                          legend_title = "Model Type:",
3046
                          y_lim = 0.05)
3047
3048
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_Trifecta_SplitByBoth_Comparison.pdf")
3049
3050
# Box plot
3051
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
3052
                          sub_results_by = quote(split_method == "Split By Both Cell Line & Drug Scaffold" &
3053
                                                   nchar(data_types) <= 5 &
3054
                                                   bottleneck == "No Data Bottleneck"),
3055
                          fill_by = quote(config_type),
3056
                          bar_level_order = c("Baseline", "Trifecta"),
3057
                          data_order = data_order,
3058
                          facet_by = "data_types",
3059
                          facet_level_order = NULL,
3060
                          legend_title = "Model Type:",
3061
                          y_lim = 0.05,
3062
                          plot_type = "box_plot",
3063
                          target_sub_by = "Target Above 0.7",
3064
                          cur_comparisons = list(c("Baseline", "Trifecta")),
3065
                          test = "wilcox.test",
3066
                          paired = F
3067
)
3068
3069
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_Trifecta_SplitByBoth_Comparison_BoxPlot.pdf")
3070
3071
## Split By Drug Scaffold ====
3072
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
3073
                          sub_results_by = quote(split_method == "Split By Drug Scaffold" &
3074
                                                   nchar(data_types) <= 5 &
3075
                                                   bottleneck == "No Data Bottleneck"),
3076
                          fill_by = quote(config_type),
3077
                          bar_level_order = c("Baseline", "Trifecta"),
3078
                          data_order = data_order,
3079
                          facet_by = quote(TargetRange),
3080
                          facet_level_order = c("Target Above 0.7",
3081
                                                "Target Below 0.7"),
3082
                          legend_title = "Model Type:",
3083
                          y_lim = 0.05)
3084
3085
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_Trifecta_SplitByDrugScaffold_Comparison.pdf")
3086
3087
# Box plot
3088
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
3089
                          sub_results_by = quote(split_method == "Split By Drug Scaffold" &
3090
                                                   nchar(data_types) <= 5 &
3091
                                                   bottleneck == "No Data Bottleneck"),
3092
                          fill_by = quote(config_type),
3093
                          bar_level_order = c("Baseline", "Trifecta"),
3094
                          data_order = data_order,
3095
                          facet_by = "data_types",
3096
                          facet_level_order = NULL,
3097
                          legend_title = "Model Type:",
3098
                          y_lim = 0.05,
3099
                          plot_type = "box_plot",
3100
                          target_sub_by = "Target Above 0.7",
3101
                          cur_comparisons = list(c("Baseline", "Trifecta")),
3102
                          test = "wilcox.test",
3103
                          paired = F
3104
)
3105
3106
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_Trifecta_SplitByDrugScaffold_Comparison_BoxPlot.pdf")
3107
3108
## Split By Cell Line ====
3109
table(all_results_copy[split_method == "Split By Cell Line" &
3110
                         nchar(data_types) <= 5 &
3111
                         bottleneck == "No Data Bottleneck"]$config_type)
3112
3113
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
3114
                          sub_results_by = quote(split_method == "Split By Cell Line" &
3115
                                                   nchar(data_types) <= 5 &
3116
                                                   bottleneck == "No Data Bottleneck"),
3117
                          fill_by = quote(config_type),
3118
                          bar_level_order = c("Baseline", "Trifecta"),
3119
                          data_order = data_order,
3120
                          facet_by = c("Targeted", "TargetRange"),
3121
                          facet_level_order = list(c("Targeted Drug", "Untargeted Drug"),
3122
                                                   c("Target Above 0.7", "Target Below 0.7")),
3123
                          legend_title = "Model Type:",
3124
                          plot_type = "bar_plot",
3125
                          y_lab = "Total RMSE Loss", calculate_avg_mae = F,
3126
                          y_lim = 0.05)
3127
3128
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold"))
3129
3130
ggsave(plot = cur_p,
3131
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_Trifecta_SplitByCellLine_Comparison_BarPlot.pdf",
3132
       height = 10)
3133
3134
# Box plot
3135
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
3136
                          sub_results_by = quote(split_method == "Split By Cell Line" &
3137
                                                   nchar(data_types) <= 5 &
3138
                                                   bottleneck == "No Data Bottleneck"),
3139
                          fill_by = quote(config_type),
3140
                          bar_level_order = c("Baseline", "Trifecta"),
3141
                          data_order = data_order,
3142
                          facet_by = "data_types",
3143
                          facet_level_order = NULL,
3144
                          legend_title = "Model Type:",
3145
                          y_lim = 0.05,
3146
                          plot_type = "box_plot",
3147
                          target_sub_by = "Target Above 0.7",
3148
                          cur_comparisons = list(c("Baseline", "Trifecta")),
3149
                          test = "wilcox.test",
3150
                          paired = F
3151
)
3152
3153
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_Trifecta_SplitByCellLine_Comparison_BoxPlot.pdf")
3154
3155
# Violin plot
3156
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
3157
                          sub_results_by = quote(split_method == "Split By Cell Line" &
3158
                                                   nchar(data_types) <= 5 &
3159
                                                   bottleneck == "No Data Bottleneck"),
3160
                          fill_by = quote(config_type),
3161
                          bar_level_order = c("Baseline", "Trifecta"),
3162
                          data_order = data_order,
3163
                          facet_by = c("Targeted", "data_types"),
3164
                          facet_level_order = NULL,
3165
                          legend_title = "Model Type:",
3166
                          y_lim = 0.1,
3167
                          plot_type = "violin_plot",
3168
                          target_sub_by = "Target Above 0.7",
3169
                          cur_comparisons = list(c("Baseline", "Trifecta")),
3170
                          test = "ks.test",
3171
                          paired = T
3172
)
3173
cur_p <- cur_p + theme(text = element_text(size = 14, face = "bold")) + expand_limits(y = c(0, 1.3))
3174
3175
ggsave(plot = cur_p,
3176
       filename = "Plots/CV_Results/Bimodal_CV_Baseline_vs_Trifecta_SplitByCellLine_Comparison_ViolinPlot.pdf",
3177
       height = 8)
3178
3179
## Split Comparison ====
3180
# Trifecta by splitting method
3181
cur_p <- my_plot_function(avg_loss_by = avg_loss_by,
3182
                          sub_results_by = quote(config_type == "Trio" &
3183
                                                   nchar(data_types) <= 5 &
3184
                                                   bottleneck == "No Data Bottleneck"),
3185
                          fill_by = quote(split_method),
3186
                          bar_level_order = c("Split By Both Cell Line & Drug Scaffold", "Split By Cell Line", "Split By Drug Scaffold"),
3187
                          data_order = data_order,
3188
                          facet_by = quote(TargetRange),
3189
                          facet_level_order = c("Target Above 0.7",
3190
                                                "Target Below 0.7"),
3191
                          legend_title = "Model Type:",
3192
                          y_lim = 0.05)
3193
3194
ggsave(plot = cur_p, filename = "Plots/CV_Results/Bimodal_CV_Trifecta_Split_Comparison.pdf")
3195
3196
# Trimodal Baseline vs Trifecta (Split By Both Cell Line & Drug Scaffold) ====
3197
# install.packages("gt")
3198
require(gt)
3199
library(stringr)
3200
all_results_copy <- all_results[str_count(data_types, "_") == 1]
3201
all_results_copy[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange")]
3202
# all_results_copy[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
3203
3204
all_results_long_copy <- melt(unique(all_results_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "loss_by_config", "TargetRange")]),
3205
                              id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange"))
3206
3207
all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange")]
3208
all_results_long_copy[, cv_sd := sd(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange")]
3209
length(unique(all_results_long_copy$data_types))  # 28 unique trimodal combinations
3210
3211
baseline_vs_trifecta <- all_results_long_copy[split_method == "Split By Both Cell Line & Drug Scaffold" & ((drug_type == "Base Model + GNN" &
3212
                                                                                 merge_method == "Base Model + LMF" &
3213
                                                                                 loss_type == "Base Model + LDS") | 
3214
                                                                                (drug_type == "Base Model" &
3215
                                                                                merge_method == "Base Model" &
3216
                                                                                loss_type == "Base Model"))]
3217
3218
baseline_vs_trifecta[split_method == "Split By Both Cell Line & Drug Scaffold" & ((drug_type == "Base Model + GNN" &
3219
                                                        merge_method == "Base Model + LMF" &
3220
                                                        loss_type == "Base Model + LDS")), config_type := "Trio "]
3221
baseline_vs_trifecta[split_method == "Split By Both Cell Line & Drug Scaffold" & ((drug_type == "Base Model" &
3222
                                                        merge_method == "Base Model" &
3223
                                                        loss_type == "Base Model")), config_type := "Baseline"]
3224
# baseline_with_lmf <- all_results_long_copy[(nchar(data_types) > 5)]
3225
dodge2 <- position_dodge2(width = 0.9, padding = 0)
3226
cur_data <- unique(baseline_vs_trifecta[,-c("fold", "value")])
3227
# Split data types column (cool function!)
3228
cur_data[, c("data_1", "data_2") := tstrsplit(data_types, "_", fixed = T)]
3229
3230
gt(cur_data, rowname_col = "data_1") %>%
3231
  tab_header(title = "Comparison of Baseline ANN and Trio of techniques in the tri-modal case",
3232
                            subtitle = "5-fold validation RMSE loss using strict splitting")
3233
  
3234
  
3235
p <- ggplot(cur_data) +
3236
  geom_bar(mapping = aes(x = data_types, y = cv_mean, fill = config_type), stat = "identity", position='dodge') +
3237
  facet_wrap(~TargetRange, ncol = 2) + 
3238
  scale_fill_discrete(name = "CV Fold:") +
3239
  scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
3240
                               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
3241
  geom_errorbar(aes(x=data_types,
3242
                    y=cv_mean,
3243
                    ymax=cv_mean + cv_sd, 
3244
                    ymin=cv_mean - cv_sd, col='red'),
3245
                linetype=1, show.legend = FALSE, position = dodge2, width = 0.9) +
3246
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
3247
        axis.title.x = element_blank()) +
3248
  ylab("RMSE Loss") +
3249
  ylim(0, max(cur_data$cv_mean) + max(cur_data$cv_sd) + 0.05) +
3250
  ggtitle(label = tools::toTitleCase("Comparison of Baseline ANN and Trio of techniques in the tri-modal case"),
3251
          subtitle = "5-fold validation RMSE loss using strict splitting") +
3252
  geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean), vjust = -0.5)
3253
3254
ggsave(plot = p, filename = "Plots/CV_Results/Trimodal_CV_per_fold_Baseline_vs_Trifecta_SplitByBoth_Comparison.pdf",
3255
       width = 24, height = 16, units = "in")
3256
3257
3258
3259
# Tri-modal Baseline Bottleneck Comparison (split by cell line) ====
3260
all_results_copy <- all_results
3261
# all_results_copy_sub <- all_results_copy[TargetRange == "TargetAbove 0.7"]
3262
all_results_copy[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange", "bottleneck")]
3263
# all_results_copy[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
3264
3265
all_results_long_copy <- melt(unique(all_results_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "loss_by_config", "TargetRange", "bottleneck")]),
3266
                              id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange", "bottleneck"))
3267
3268
all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange", "bottleneck")]
3269
all_results_long_copy[, cv_sd := sd(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange", "bottleneck")]
3270
3271
baseline <- all_results_long_copy[(split_method == "Split By Cell Line" & merge_method == "Base Model" & loss_type == "Base Model" &
3272
                                     drug_type == "Base Model" & nchar(data_types) > 6)]
3273
dodge2 <- position_dodge2(width = 0.9, padding = 0)
3274
cur_data <- unique(baseline[,-c("fold", "value")])
3275
3276
p <- ggplot(cur_data) +
3277
  geom_bar(mapping = aes(x = data_types, y = cv_mean,
3278
                         fill = factor(bottleneck,
3279
                                       levels = c("With Data Bottleneck",
3280
                                                  "No Data Bottleneck"))),
3281
           stat = "identity", position='dodge') +
3282
  facet_wrap(~factor(TargetRange,
3283
                     levels = c("Target Above 0.7",
3284
                                "Target Below 0.7")), ncol = 2) + 
3285
  geom_errorbar(aes(x=data_types,
3286
                    y=cv_mean,
3287
                    ymax=cv_mean + cv_sd, 
3288
                    ymin=cv_mean - cv_sd, col='red'),
3289
                linetype=1, show.legend = FALSE, position = dodge2, width = 0.9) +
3290
  scale_fill_discrete(name = "Loss Type:") +
3291
  scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
3292
                               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
3293
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
3294
        axis.title.x = element_blank(),
3295
        legend.position = c(.9,.85)) +
3296
  ylab("RMSE Loss") +
3297
  ylim(0, max(cur_data$cv_mean) + max(cur_data$cv_sd) + 0.05)
3298
  # ggtitle(label = tools::toTitleCase("Comparison of LDS Loss Weighting across three true AAC range groups"),
3299
  #         subtitle = "5-fold validation RMSE loss using strict splitting by cell lines") +
3300
  # geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean + cv_sd),
3301
  #           vjust = 0.5, hjust = -0.25, angle = 90, position = position_dodge2(width = .9))
3302
3303
ggsave(plot = p, filename = "Plots/CV_Results/Trimodal_CV_Baseline_Bottleneck_Comparison.pdf")
3304
# width = 24, height = 16, units = "in")
3305
3306
# Tri-modal Trifecta (Splitting Comparison) ====
3307
all_results_copy <- all_results
3308
# all_results_copy[target > 0.7 & target < 0.9]$TargetRange <- "Target Between 0.7 & 0.9"
3309
# all_results_copy[target >= 0.9]$TargetRange <- "Target Above 0.9"
3310
all_results_copy[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange")]
3311
# all_results_copy[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
3312
3313
all_results_long_copy <- melt(unique(all_results_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "loss_by_config", "TargetRange")]),
3314
                              id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange"))
3315
3316
all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange")]
3317
all_results_long_copy[, cv_sd := sd(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange")]
3318
3319
3320
# trifecta_vs_baseline <- all_results_long_copy[((merge_method == "Base Model + LMF" & drug_type == "Base Model + GNN" & loss_type == "Base Model + LDS") |
3321
#                                                  (merge_method == "Base Model" & drug_type == "Base Model" & loss_type == "Base Model")) &
3322
#                                                 split_method == "Split By Both Cell Line & Drug Scaffold" & nchar(data_types) <= 5]
3323
trifecta <- all_results_long_copy[(merge_method == "Base Model + LMF" & drug_type == "Base Model + GNN" &
3324
                                     loss_type == "Base Model + LDS") & nchar(data_types) > 6]
3325
3326
dodge2 <- position_dodge2(width = 0.9, padding = 0)
3327
# cur_data <- unique(trifecta_vs_baseline[,-c("fold", "value")])
3328
cur_data <- unique(trifecta[,-c("fold", "value")])
3329
# cur_data[(merge_method == "Base Model + LMF" & drug_type == "Base Model + GNN" & loss_type == "Base Model + LDS"), config_type := "Trio"]
3330
# cur_data[(merge_method == "Base Model" & drug_type == "Base Model" & loss_type == "Base Model"), config_type := "Baseline"]
3331
3332
p <- ggplot(cur_data) +
3333
  geom_bar(mapping = aes(x = data_types, y = cv_mean, fill = split_method),
3334
           stat = "identity", position='dodge') +
3335
  facet_wrap(~TargetRange, ncol = 2) + 
3336
  scale_fill_discrete(name = "Configuration:") +
3337
  scale_x_discrete() +
3338
  scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
3339
                               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
3340
  geom_errorbar(aes(x=data_types,
3341
                    y=cv_mean,
3342
                    ymax=cv_mean + cv_sd, 
3343
                    ymin=cv_mean - cv_sd, col='red'),
3344
                linetype=1, show.legend = FALSE, position = dodge2, width = 0.9) +
3345
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
3346
        axis.title.x = element_blank(),
3347
        legend.position = c(.9,.85)) +
3348
  ylab("RMSE Loss") +
3349
  ylim(0, max(cur_data$cv_mean) + max(cur_data$cv_sd) + 0.05)
3350
  # ggtitle(label = tools::toTitleCase("Comparison of Baseline with LDS + LMF + GNN across two true AAC range groups"),
3351
  #         subtitle = "5-fold validation RMSE loss using strict splitting by both drugs and cell lines") +
3352
  # geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean + cv_sd),
3353
  #           vjust = 0.5, hjust = -0.25, angle = 90, position = position_dodge2(width = .9))
3354
3355
ggsave(plot = p, filename = "Plots/CV_Results/Trimodal_CV_Trifecta_Split_Comparison.pdf")
3356
# width = 24, height = 16, units = "in")
3357
3358
3359
3360
3361
# Trimodal Heatmap for Best Combinations ====
3362
library(stringr)
3363
all_results_copy <- all_results[str_count(data_types, "_") == 1]
3364
all_results_copy[, loss_by_config := rmse(target, predicted), by = c("data_types", "merge_method", "loss_type",
3365
                                                                     "drug_type", "split_method", "bottleneck",
3366
                                                                     "TargetRange")]
3367
3368
# No drug targetedness separation
3369
all_results_copy <- unique(all_results_copy[, c("data_types", "merge_method", "loss_type",
3370
                                                "drug_type", "split_method", "bottleneck",
3371
                                                "TargetRange", "loss_by_config")])
3372
3373
all_results_copy <- all_results_copy[bottleneck == "No Data Bottleneck"]
3374
3375
# all_results_copy[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
3376
3377
# all_results_long_copy <- melt(unique(all_results_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "bottleneck", "loss_by_config", "TargetRange")]),
3378
#                               id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "bottleneck", "TargetRange"))
3379
3380
# all_results_long_copy[, loss_by_config := rmse(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "bottleneck", "TargetRange")]
3381
# all_results_long_copy[, cv_sd := sd(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "bottleneck", "TargetRange")]
3382
length(unique(all_results_long_copy$data_types))  # 28 unique trimodal combinations
3383
3384
save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
3385
  stopifnot(!missing(x))
3386
  stopifnot(!missing(filename))
3387
  pdf(filename, width=width, height=height)
3388
  grid::grid.newpage()
3389
  grid::grid.draw(x$gtable)
3390
  dev.off()
3391
}
3392
3393
require(pheatmap)
3394
require(igraph)
3395
## Split By Cell Line ====
3396
baseline_trimodal <- all_results_copy[split_method == "Split By Cell Line" & (drug_type == "Base Model" &
3397
                                                                                     merge_method == "Base Model" &
3398
                                                                                     loss_type == "Base Model")]
3399
trifectra_trimodal <- all_results_copy[split_method == "Split By Cell Line" & (drug_type == "Base Model + GNN" &
3400
                                                                                      merge_method == "Base Model + LMF" &
3401
                                                                                      loss_type == "Base Model + LDS")]
3402
baseline_trimodal_cv <- unique(baseline_trimodal[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method",
3403
                                                     "TargetRange", "loss_by_config")])
3404
trifecta_trimodal_cv <- unique(trifectra_trimodal[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method",
3405
                                                      "TargetRange", "loss_by_config")])
3406
3407
all_tri_omic_combos_el <- utils::combn(c("MUT", 'CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA'), 2, simplify = T)
3408
all_tri_omic_combos_el <- t(all_tri_omic_combos_el)
3409
3410
all_tri_omic_combos_el <- cbind(all_tri_omic_combos_el, rep(0.5, 28))
3411
baseline_trimodal_cv[TargetRange == "Target Below 0.7"]
3412
temp <- baseline_trimodal_cv[TargetRange == "Target Below 0.7"]
3413
all_cv_means <- vector(mode = "numeric", length = nrow(temp))
3414
for (i in 1:nrow(temp)) {
3415
  cur_combo <- paste(all_tri_omic_combos_el[i, 1:2], collapse = "_")
3416
  cur_cv_mean <- temp[data_types == cur_combo]$loss_by_config
3417
  all_cv_means[i] <- cur_cv_mean
3418
}
3419
3420
all_tri_omic_combos_el[,3] <- all_cv_means
3421
colnames(all_tri_omic_combos_el) <-  c("first", "second", "Weight")
3422
g=graph.data.frame(all_tri_omic_combos_el)
3423
m <- get.adjacency(g,sparse=FALSE, attr = 'Weight')
3424
storage.mode(m) <- "numeric"
3425
m <- round(m, 4)
3426
m2 <- m
3427
m2[is.na(m)] <- ""
3428
3429
p <- pheatmap(t(m), cluster_rows = FALSE, cluster_cols = FALSE, display_numbers = t(m2), angle_col = "0", legend = F, 
3430
              na_col = "white", border_color = NA, fontsize_number = 12)
3431
save_pheatmap_pdf(p, "Plots/CV_Results/Trimodal_RMSE_Baseline_LowerAAC_SplitByCellLine_Heatmap.pdf", 8, 8)
3432
3433
3434
all_tri_omic_combos_el <- utils::combn(c("MUT", 'CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA'), 2, simplify = T)
3435
all_tri_omic_combos_el <- t(all_tri_omic_combos_el)
3436
3437
all_tri_omic_combos_el <- cbind(all_tri_omic_combos_el, rep(0.5, 28))
3438
baseline_trimodal_cv[TargetRange == "Target Above 0.7"]
3439
temp <- baseline_trimodal_cv[TargetRange == "Target Above 0.7"]
3440
all_cv_means <- vector(mode = "numeric", length = nrow(temp))
3441
for (i in 1:nrow(temp)) {
3442
  cur_combo <- paste(all_tri_omic_combos_el[i, 1:2], collapse = "_")
3443
  cur_cv_mean <- temp[data_types == cur_combo]$loss_by_config
3444
  all_cv_means[i] <- cur_cv_mean
3445
}
3446
3447
all_tri_omic_combos_el[,3] <- all_cv_means
3448
colnames(all_tri_omic_combos_el) <-  c("first", "second", "Weight")
3449
g=graph.data.frame(all_tri_omic_combos_el)
3450
m <- get.adjacency(g,sparse=FALSE, attr = 'Weight')
3451
storage.mode(m) <- "numeric"
3452
m <- round(m, 4)
3453
m2 <- m
3454
m2[is.na(m)] <- ""
3455
3456
p <- pheatmap(t(m), cluster_rows = FALSE, cluster_cols = FALSE, display_numbers = t(m2), angle_col = "0", legend = F, 
3457
              na_col = "white", border_color = NA, fontsize_number = 12)
3458
save_pheatmap_pdf(p, "Plots/CV_Results/Trimodal_RMSE_Baseline_UpperAAC_SplitByCellLine_Heatmap.pdf", 8, 8)
3459
3460
3461
## Split By Both Cell Line & Drug Scaffold ====
3462
baseline_trimodal <- all_results_long_copy[split_method == "Split By Both Cell Line & Drug Scaffold" & (drug_type == "Base Model" &
3463
                                                                                   merge_method == "Base Model" &
3464
                                                                                   loss_type == "Base Model")]
3465
trifectra_trimodal <- all_results_long_copy[split_method == "Split By Both Cell Line & Drug Scaffold" & (drug_type == "Base Model + GNN" &
3466
                                                                                 merge_method == "Base Model + LMF" &
3467
                                                                                 loss_type == "Base Model + LDS")]
3468
baseline_trimodal_cv <- unique(baseline_trimodal[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method",
3469
                                              "TargetRange", "cv_mean")])
3470
trifecta_trimodal_cv <- unique(trifectra_trimodal[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method",
3471
                                              "TargetRange", "cv_mean")])
3472
3473
all_tri_omic_combos_el <- utils::combn(c("MUT", 'CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA'), 2, simplify = T)
3474
all_tri_omic_combos_el <- t(all_tri_omic_combos_el)
3475
# 
3476
# MUT   CNV  
3477
# MUT   EXP  
3478
# MUT   PROT 
3479
# MUT   MIRNA
3480
# MUT   METAB
3481
# MUT   HIST 
3482
# MUT   RPPA 
3483
# CNV   EXP  
3484
# CNV   PROT 
3485
# CNV   MIRNA
3486
# CNV   METAB
3487
# CNV   HIST 
3488
# CNV   RPPA 
3489
# EXP   PROT 
3490
# EXP   MIRNA
3491
# EXP   METAB
3492
# EXP   HIST 
3493
# EXP   RPPA 
3494
# PROT  MIRNA
3495
# PROT  METAB
3496
# PROT  HIST 
3497
# PROT  RPPA 
3498
# MIRNA METAB
3499
# MIRNA HIST 
3500
# MIRNA RPPA 
3501
# METAB HIST 
3502
# METAB RPPA 
3503
# HIST  RPPA
3504
3505
all_tri_omic_combos_el <- cbind(all_tri_omic_combos_el, rep(0.5, 28))
3506
baseline_trimodal_cv[TargetRange == "Target Above 0.7"]
3507
temp <- baseline_trimodal_cv[TargetRange == "Target Above 0.7"]
3508
all_cv_means <- vector(mode = "numeric", length = nrow(temp))
3509
for (i in 1:nrow(temp)) {
3510
  cur_combo <- paste(all_tri_omic_combos_el[i, 1:2], collapse = "_")
3511
  cur_cv_mean <- temp[data_types == cur_combo]$cv_mean
3512
  all_cv_means[i] <- cur_cv_mean
3513
}
3514
3515
all_tri_omic_combos_el[,3] <- all_cv_means
3516
colnames(all_tri_omic_combos_el) <-  c("first", "second", "Weight")
3517
3518
g=graph.data.frame(all_tri_omic_combos_el)
3519
m <- get.adjacency(g,sparse=FALSE, attr = 'Weight')
3520
storage.mode(m) <- "numeric"
3521
m <- round(m, 4)
3522
m2 <- m
3523
m2[is.na(m)] <- ""
3524
3525
# install.packages("pheatmap")
3526
require(pheatmap)
3527
p <- pheatmap(t(m), cluster_rows = FALSE, cluster_cols = FALSE, display_numbers = t(m2), angle_col = "0", legend = F, 
3528
         na_col = "white", border_color = NA, fontsize_number = 12)
3529
3530
save_pheatmap_pdf(p, "Plots/CV_Results/Trimodal_CV_Mean_Baseline_SplitByBoth_Heatmap.pdf", 8, 8)
3531
3532
## Split By Drug Scaffold ====
3533
baseline_trimodal <- all_results_long_copy[split_method == "Split By Drug Scaffold" & (drug_type == "Base Model" &
3534
                                                                                merge_method == "Base Model" &
3535
                                                                                loss_type == "Base Model")]
3536
trifectra_trimodal <- all_results_long_copy[split_method == "Split By Drug Scaffold" & (drug_type == "Base Model + GNN" &
3537
                                                                                 merge_method == "Base Model + LMF" &
3538
                                                                                 loss_type == "Base Model + LDS")]
3539
baseline_trimodal_cv <- unique(baseline_trimodal[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method",
3540
                                                     "TargetRange", "cv_mean")])
3541
trifecta_trimodal_cv <- unique(trifectra_trimodal[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method",
3542
                                                      "TargetRange", "cv_mean")])
3543
3544
all_tri_omic_combos_el <- utils::combn(c("MUT", 'CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA'), 2, simplify = T)
3545
all_tri_omic_combos_el <- t(all_tri_omic_combos_el)
3546
3547
all_tri_omic_combos_el <- cbind(all_tri_omic_combos_el, rep(0.5, 28))
3548
baseline_trimodal_cv[TargetRange == "Target Above 0.7"]
3549
temp <- baseline_trimodal_cv[TargetRange == "Target Above 0.7"]
3550
all_cv_means <- vector(mode = "numeric", length = nrow(temp))
3551
for (i in 1:nrow(temp)) {
3552
  cur_combo <- paste(all_tri_omic_combos_el[i, 1:2], collapse = "_")
3553
  cur_cv_mean <- temp[data_types == cur_combo]$cv_mean
3554
  all_cv_means[i] <- cur_cv_mean
3555
}
3556
3557
all_tri_omic_combos_el[,3] <- all_cv_means
3558
colnames(all_tri_omic_combos_el) <-  c("first", "second", "Weight")
3559
g=graph.data.frame(all_tri_omic_combos_el)
3560
m <- get.adjacency(g,sparse=FALSE, attr = 'Weight')
3561
storage.mode(m) <- "numeric"
3562
m <- round(m, 4)
3563
m2 <- m
3564
m2[is.na(m)] <- ""
3565
3566
p <- pheatmap(t(m), cluster_rows = FALSE, cluster_cols = FALSE, display_numbers = t(m2), angle_col = "0", legend = F, 
3567
              na_col = "white", border_color = NA, fontsize_number = 12)
3568
3569
save_pheatmap_pdf(p, "Plots/CV_Results/Trimodal_CV_Mean_Baseline_SplitByDrugScaffold_Heatmap.pdf", 8, 8)
3570
3571
# Trimodal Baseline vs Trifecta Bar Plot ====
3572
require(ggplot2)
3573
require(grid)
3574
library(stringr)
3575
require(data.table)
3576
dodge2 <- position_dodge2(width = 0.9, padding = 0)
3577
rmse <- function(x, y) sqrt(mean((x - y)^2))
3578
3579
3580
all_results_copy <- fread("Data/all_results.csv")
3581
3582
# all_results_copy <- all_results_copy[str_count(data_types, "_") == 1]
3583
3584
unique_combos <- fread("Data/shared_unique_combinations.csv")
3585
unique_combos[, unique_samples := paste0(cpd_name, "_", cell_name)]
3586
all_results_copy[, unique_samples := paste0(cpd_name, "_", cell_name)]
3587
all_results_copy <- all_results_copy[unique_samples %in% unique_combos$unique_samples]
3588
3589
all_results_copy[, loss_by_config := rmse(target, predicted),
3590
                 by = c("data_types", "merge_method", "loss_type", "drug_type",
3591
                        "split_method", "bottleneck", "TargetRange")]
3592
# all_results_copy[, loss_by_config := rmse(target, predicted),
3593
#                  by = c("data_types", "merge_method", "loss_type", "drug_type",
3594
#                         "split_method", "bottleneck", "TargetRange", "Targeted")]
3595
all_results_copy <- unique(all_results_copy[, c("data_types", "merge_method", "loss_type",
3596
                                                "drug_type", "split_method", "bottleneck",
3597
                                                "TargetRange", "loss_by_config")])
3598
# all_results_copy <- unique(all_results_copy[, c("data_types", "merge_method", "loss_type",
3599
#                                                 "drug_type", "split_method", "bottleneck",
3600
#                                                 "TargetRange", "Targeted", "loss_by_config")])
3601
length(unique(all_results_copy$data_types))  # 28 unique trimodal combinations
3602
3603
all_results_copy <- all_results_copy[bottleneck == "No Data Bottleneck"]
3604
3605
## Split By Both Cell Line ====
3606
# Subset by splitting method and AAC range
3607
all_results_long_copy <-
3608
  all_results_copy[split_method == "Split By Cell Line" &
3609
                     bottleneck == "No Data Bottleneck" &
3610
                     TargetRange == "Target Above 0.7" &
3611
                          ((
3612
                            drug_type == "Base Model" &
3613
                              merge_method == "Base Model" &
3614
                              loss_type == "Base Model"
3615
                          ) | (
3616
                            drug_type == "Base Model + GNN" &
3617
                              merge_method == "Base Model + LMF" &
3618
                              loss_type == "Base Model + LDS"
3619
                          ))]
3620
# Assign model name
3621
all_results_long_copy[(
3622
  drug_type == "Base Model" &
3623
    merge_method == "Base Model" &
3624
    loss_type == "Base Model"
3625
), model_type := "Baseline"]
3626
all_results_long_copy[(
3627
  drug_type == "Base Model + GNN" &
3628
    merge_method == "Base Model + LMF" &
3629
    loss_type == "Base Model + LDS"
3630
), model_type := "Trifecta"]
3631
3632
3633
all_results_long_copy <- unique(all_results_long_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "model_type",
3634
                                                          "TargetRange", "loss_by_config")])
3635
# all_results_long_copy <- unique(all_results_long_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "model_type",
3636
#                                                           "TargetRange", "Targeted", "loss_by_config")])
3637
3638
all_results_long_copy[, first_data := strsplit(data_types, "_", fixed = T)[[1]][1], by = "data_types"]
3639
all_results_long_copy[, second_data := strsplit(data_types, "_", fixed = T)[[1]][2], by = "data_types"]
3640
all_results_long_copy$first_data <- factor(all_results_long_copy$first_data,
3641
                                                levels = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"))
3642
all_results_long_copy$second_data <- factor(all_results_long_copy$second_data,
3643
                                                 levels = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"))
3644
3645
# all_results_long_copy[, max_config_cv_mean := max(loss_by_config), by = c("data_types")]
3646
3647
# all_top_trimodal[, data_types := factor(data_types, levels = data_order)]
3648
all_results_long_copy[, model_type := factor(unlist(all_results_long_copy[, "model_type", with = F]),
3649
                                                      levels = c("Baseline", "Trifecta"))]
3650
3651
p <- ggplot(all_results_long_copy) +
3652
  geom_bar(mapping = aes(x = model_type,
3653
                         y = loss_by_config,
3654
                         # fill = factor(model_type,
3655
                         #               levels = c("Baseline",
3656
                         #                          "Trifecta"))),
3657
                         fill = factor(Targeted,
3658
                                       levels = c("Untargeted Drug",
3659
                                                  "Targeted Drug"))),
3660
                         # fill = c("Targeted", "model_type")),
3661
           stat = "identity", position='dodge', width = 0.9) +
3662
  scale_color_manual(values = c(NA, 'red'), guide='none') +
3663
  # facet_geo(~ data_types, grid = mygrid,  scales = "free_x",
3664
  #           strip.position = "left",
3665
  #           drop = T
3666
  #           # switch = "x"
3667
  #           ) +
3668
  facet_grid(rows = vars(second_data), cols = vars(first_data),
3669
             scales = "free_x", switch = "both") +
3670
  # scale_x_reordered() +
3671
  # facet_wrap(~second_data + first_data,
3672
  #            scales = "free_x", strip.position = "bottom") +
3673
  scale_fill_discrete(name = "Drug Type:") +
3674
  # scale_x_discrete(name = "Model Type") +
3675
  # scale_x_discrete() +
3676
  # scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
3677
  #                              "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
3678
  # geom_errorbar(aes(x = model_type,
3679
  #                   y=cv_mean,
3680
  #                   ymax=cv_mean + cv_sd,
3681
  #                   ymin=cv_mean - cv_sd, col='red'),
3682
  #               linetype=1, show.legend = FALSE, position = dodge2, width = 0.9, colour = "black") +
3683
  theme(
3684
    text = element_text(size = 20, face = "bold"),
3685
    axis.text.x = element_text(angle = 45, hjust = 1),
3686
    # axis.text.x = element_blank(),
3687
    # axis.ticks = element_blank(),
3688
    axis.title.x = element_blank(),
3689
    legend.direction="horizontal",
3690
    legend.position="top",
3691
    legend.justification="right"
3692
    # strip.background = element_blank(),
3693
    # strip.text.x = element_blank(),
3694
    # legend.position = c(.8,.75)
3695
  ) +
3696
  # legend.position = c(.9,.85)) +
3697
  # ylab("Total RMSE Loss") +
3698
  # ylim(0, max(all_results_long_copy$cv_mean) + max(all_results_long_copy$cv_sd) + 0.05) +
3699
  # ylim(0, 1.2) +
3700
  scale_y_continuous(name = "Total RMSE Loss", limits = c(0, 1.25), breaks = c(0, 0.25, 0.5, 0.75, 1)) +
3701
  geom_text(aes(x=model_type, label = round(loss_by_config, 3), angle = 90,
3702
                group = factor(Targeted,
3703
                              levels = c("Untargeted Drug",
3704
                                         "Targeted Drug")),
3705
                y = loss_by_config), vjust = 0.5, hjust = -0.1, position = position_dodge(width = 0.9))
3706
3707
# p <- p + coord_flip()
3708
# all_results_long_copy[data_types %like% "MUT"]
3709
3710
# Get ggplot grob
3711
g = ggplotGrob(p)
3712
3713
# Get the layout dataframe. 
3714
# Note the names.
3715
# g$layout
3716
3717
# gtable::gtable_show_layout(g) # Might also be useful
3718
3719
# Replace the grobs with the nullGrob
3720
cur_patterns <- c("panel-6-7", "panel-5-7", "panel-4-7", "panel-3-7", "panel-2-7", "panel-1-7",
3721
                  "panel-5-6", "panel-4-6", "panel-3-6", "panel-2-6", "panel-1-6",
3722
                  "panel-4-5", "panel-3-5", "panel-2-5", "panel-1-5",
3723
                  "panel-3-4", "panel-2-4", "panel-1-4",
3724
                  "panel-2-3", "panel-1-3",
3725
                  "panel-1-2")
3726
g = ggplotGrob(p)
3727
for (pattern in cur_patterns) {
3728
  pos <- grep(pattern = pattern, g$layout$name)
3729
  g$grobs[[pos]] <- nullGrob()
3730
}
3731
3732
# If you want, move the axis
3733
# g$layout[g$layout$name == "axis-b-2", c("t", "b")] = c(8, 8)
3734
3735
# Draw the plot
3736
grid.newpage()
3737
grid.draw(g)
3738
  
3739
ggsave(filename = "Plots/CV_Results/Trimodal_CV_Baseline_vs_Trifecta_BarPlot_Comparison_Grid.pdf",
3740
       plot = g,
3741
       height = 12, units = "in")  
3742
3743
3744
cur_func <- function(data_name) {
3745
  if (!is.na(data_name)) {
3746
    return(all_results_long_copy[first_data == data_name &
3747
                                   is.na(second_data)]$loss_by_config)
3748
  } else {
3749
    return(NA)
3750
  }
3751
}
3752
3753
all_results_long_copy <- all_results_long_copy[str_count(data_types, "_") < 2]
3754
all_results_long_copy <- all_results_long_copy[model_type == "Baseline"]
3755
all_results_long_copy$first_loss <- sapply(all_results_long_copy$first_data, cur_func)
3756
all_results_long_copy$second_loss <- sapply(all_results_long_copy$second_data, cur_func)
3757
3758
all_results_long_copy <- all_results_long_copy[!is.na(second_data)]
3759
3760
molten_results <- melt(all_results_long_copy[, c("first_data", "second_data",
3761
                               "first_loss", "second_loss",
3762
                               "loss_by_config")],
3763
     id.vars = c("first_data", "second_data"),
3764
     measure.vars = c("first_loss", "second_loss", "loss_by_config"))
3765
3766
molten_results[variable == "first_loss", variable := "Bimodal 1"]
3767
molten_results[variable == "second_loss", variable := "Bimodal 2"]
3768
molten_results[variable == "loss_by_config", variable := "Trimodal"]
3769
# Compare BiModal and TriModal Performances
3770
p <- ggplot(molten_results) +
3771
  geom_bar(mapping = aes(x = variable,
3772
                         y = value,
3773
                         fill = factor(variable,
3774
                                       levels = c("Bimodal 1",
3775
                                                  "Bimodal 2",
3776
                                                  "Trimodal"))),
3777
           stat = "identity", position='dodge', width = 0.9) +
3778
  scale_color_manual(values = c(NA, 'red'), guide='none') +
3779
  facet_grid(rows = vars(second_data), cols = vars(first_data),
3780
             scales = "free_x", switch = "both") +
3781
  # scale_x_reordered() +
3782
  # facet_wrap(~second_data + first_data,
3783
  #            scales = "free_x", strip.position = "bottom") +
3784
  scale_fill_discrete(name = "Drug Type:") +
3785
  # scale_x_discrete(name = "Model Type") +
3786
  # scale_x_discrete() +
3787
  # scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
3788
  #                              "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
3789
  # geom_errorbar(aes(x = model_type,
3790
  #                   y=cv_mean,
3791
  #                   ymax=cv_mean + cv_sd,
3792
  #                   ymin=cv_mean - cv_sd, col='red'),
3793
  #               linetype=1, show.legend = FALSE, position = dodge2, width = 0.9, colour = "black") +
3794
  theme(
3795
    text = element_text(size = 20, face = "bold"),
3796
    axis.text.x = element_text(angle = 45, hjust = 1),
3797
    # axis.text.x = element_blank(),
3798
    # axis.ticks = element_blank(),
3799
    axis.title.x = element_blank(),
3800
    # legend.direction="horizontal",
3801
    # legend.position="top",
3802
    # legend.justification="right"
3803
    # strip.background = element_blank(),
3804
    # strip.text.x = element_blank(),
3805
    # legend.position = c(.8,.75)
3806
    legend.position = "none"
3807
  ) +
3808
  # legend.position = c(.9,.85)) +
3809
  # ylab("Total RMSE Loss") +
3810
  # ylim(0, max(all_results_long_copy$cv_mean) + max(all_results_long_copy$cv_sd) + 0.05) +
3811
  # ylim(0, 1.2) +
3812
  # scale_y_continuous(name = "Total RMSE Loss", limits = c(0, .5), breaks = c(0, 0.15, 0.2, 0.25, 0.35, 0.45)) +
3813
  scale_y_continuous(name = "Total RMSE Loss", limits = c(0, 1), breaks = c(0, 0.25, 0.5, 0.75, 1)) +
3814
  geom_text(aes(x=variable, label = round(value, 3), angle = 90,
3815
                group = factor(variable,
3816
                               levels = c("Bimodal 1",
3817
                                          "Bimodal 2",
3818
                                          "Trimodal")),
3819
                y = value), vjust = 0.5, hjust = -0.1, position = position_dodge(width = 0.9))
3820
3821
g = ggplotGrob(p)
3822
3823
# Get the layout dataframe. 
3824
# Note the names.
3825
# g$layout
3826
3827
# gtable::gtable_show_layout(g) # Might also be useful
3828
3829
# Replace the grobs with the nullGrob
3830
cur_patterns <- c("panel-6-7", "panel-5-7", "panel-4-7", "panel-3-7", "panel-2-7", "panel-1-7",
3831
                  "panel-5-6", "panel-4-6", "panel-3-6", "panel-2-6", "panel-1-6",
3832
                  "panel-4-5", "panel-3-5", "panel-2-5", "panel-1-5",
3833
                  "panel-3-4", "panel-2-4", "panel-1-4",
3834
                  "panel-2-3", "panel-1-3",
3835
                  "panel-1-2")
3836
g = ggplotGrob(p)
3837
for (pattern in cur_patterns) {
3838
  pos <- grep(pattern = pattern, g$layout$name)
3839
  g$grobs[[pos]] <- nullGrob()
3840
}
3841
3842
# If you want, move the axis
3843
# g$layout[g$layout$name == "axis-b-2", c("t", "b")] = c(8, 8)
3844
3845
# Draw the plot
3846
grid.newpage()
3847
grid.draw(g)
3848
3849
ggsave(filename = "Plots/CV_Results/Trimodal_vs_Bimodal_Baseline_BarPlot_Comparison_Grid.pdf",
3850
       plot = g,
3851
       height = 12, units = "in")  
3852
3853
# Repeat for Trifecta Models
3854
cur_func <- function(data_name) {
3855
  if (!is.na(data_name)) {
3856
    return(all_results_long_copy[first_data == data_name &
3857
                                   is.na(second_data)]$loss_by_config)
3858
  } else {
3859
    return(NA)
3860
  }
3861
}
3862
3863
all_results_long_copy <- all_results_long_copy[str_count(data_types, "_") < 2]
3864
all_results_long_copy <- all_results_long_copy[model_type == "Trifecta"]
3865
all_results_long_copy$first_loss <- sapply(all_results_long_copy$first_data, cur_func)
3866
all_results_long_copy$second_loss <- sapply(all_results_long_copy$second_data, cur_func)
3867
cur_func("RPPA")
3868
cur_func(NA)
3869
3870
all_results_long_copy <- all_results_long_copy[!is.na(second_data)]
3871
3872
molten_results <- melt(all_results_long_copy[, c("first_data", "second_data",
3873
                                                 "first_loss", "second_loss",
3874
                                                 "loss_by_config")],
3875
                       id.vars = c("first_data", "second_data"),
3876
                       measure.vars = c("first_loss", "second_loss", "loss_by_config"))
3877
3878
molten_results[variable == "first_loss", variable := "Bimodal 1"]
3879
molten_results[variable == "second_loss", variable := "Bimodal 2"]
3880
molten_results[variable == "loss_by_config", variable := "Trimodal"]
3881
# Compare BiModal and TriModal Performances
3882
p <- ggplot(molten_results) +
3883
  geom_bar(mapping = aes(x = variable,
3884
                         y = value,
3885
                         fill = factor(variable,
3886
                                       levels = c("Bimodal 1",
3887
                                                  "Bimodal 2",
3888
                                                  "Trimodal"))),
3889
           # fill = factor(model_type,
3890
           #               levels = c("Baseline",
3891
           #                          "Trifecta"))),
3892
           # fill = factor(Targeted,
3893
           #               levels = c("Untargeted Drug",
3894
           #                          "Targeted Drug"))),
3895
           # fill = c("Targeted", "model_type")),
3896
           stat = "identity", position='dodge', width = 0.9) +
3897
  scale_color_manual(values = c(NA, 'red'), guide='none') +
3898
  # facet_geo(~ data_types, grid = mygrid,  scales = "free_x",
3899
  #           strip.position = "left",
3900
  #           drop = T
3901
  #           # switch = "x"
3902
  #           ) +
3903
  facet_grid(rows = vars(second_data), cols = vars(first_data),
3904
             scales = "free_x", switch = "both") +
3905
  # scale_x_reordered() +
3906
  # facet_wrap(~second_data + first_data,
3907
  #            scales = "free_x", strip.position = "bottom") +
3908
  scale_fill_discrete(name = "Drug Type:") +
3909
  # scale_x_discrete(name = "Model Type") +
3910
  # scale_x_discrete() +
3911
  # scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
3912
  #                              "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
3913
  # geom_errorbar(aes(x = model_type,
3914
  #                   y=cv_mean,
3915
  #                   ymax=cv_mean + cv_sd,
3916
  #                   ymin=cv_mean - cv_sd, col='red'),
3917
  #               linetype=1, show.legend = FALSE, position = dodge2, width = 0.9, colour = "black") +
3918
  theme(
3919
    text = element_text(size = 20, face = "bold"),
3920
    axis.text.x = element_text(angle = 45, hjust = 1),
3921
    # axis.text.x = element_blank(),
3922
    # axis.ticks = element_blank(),
3923
    axis.title.x = element_blank(),
3924
    # legend.direction="horizontal",
3925
    # legend.position="top",
3926
    # legend.justification="right"
3927
    # strip.background = element_blank(),
3928
    # strip.text.x = element_blank(),
3929
    # legend.position = c(.8,.75)
3930
    legend.position = "none"
3931
  ) +
3932
  # legend.position = c(.9,.85)) +
3933
  # ylab("Total RMSE Loss") +
3934
  # ylim(0, max(all_results_long_copy$cv_mean) + max(all_results_long_copy$cv_sd) + 0.05) +
3935
  # ylim(0, 1.2) +
3936
  # scale_y_continuous(name = "Total RMSE Loss", limits = c(0, .5), breaks = c(0, 0.15, 0.2, 0.25, 0.35, 0.45)) + 
3937
  scale_y_continuous(name = "Total RMSE Loss", limits = c(0, 1), breaks = c(0, 0.25, 0.5, 0.75, 1)) +
3938
  geom_text(aes(x=variable, label = round(value, 3), angle = 90,
3939
                group = factor(variable,
3940
                               levels = c("Bimodal 1",
3941
                                          "Bimodal 2",
3942
                                          "Trimodal")),
3943
                y = value), vjust = 0.5, hjust = -0.1, position = position_dodge(width = 0.9))
3944
3945
g = ggplotGrob(p)
3946
3947
# Get the layout dataframe. 
3948
# Note the names.
3949
# g$layout
3950
3951
# gtable::gtable_show_layout(g) # Might also be useful
3952
3953
# Replace the grobs with the nullGrob
3954
cur_patterns <- c("panel-6-7", "panel-5-7", "panel-4-7", "panel-3-7", "panel-2-7", "panel-1-7",
3955
                  "panel-5-6", "panel-4-6", "panel-3-6", "panel-2-6", "panel-1-6",
3956
                  "panel-4-5", "panel-3-5", "panel-2-5", "panel-1-5",
3957
                  "panel-3-4", "panel-2-4", "panel-1-4",
3958
                  "panel-2-3", "panel-1-3",
3959
                  "panel-1-2")
3960
g = ggplotGrob(p)
3961
for (pattern in cur_patterns) {
3962
  pos <- grep(pattern = pattern, g$layout$name)
3963
  g$grobs[[pos]] <- nullGrob()
3964
}
3965
3966
# If you want, move the axis
3967
# g$layout[g$layout$name == "axis-b-2", c("t", "b")] = c(8, 8)
3968
3969
# Draw the plot
3970
grid.newpage()
3971
grid.draw(g)
3972
3973
ggsave(filename = "Plots/CV_Results/Trimodal_vs_Bimodal_Trifecta_BarPlot_Comparison_Grid.pdf",
3974
       plot = g,
3975
       height = 12, units = "in")  
3976
3977
# Trimodal Trifecta Splitting Comparison ====
3978
# install.packages("geofacet")
3979
# require(geofacet)
3980
# require(ggforce)
3981
# require(tidytext)
3982
require(ggplot2)
3983
require(grid)
3984
library(stringr)
3985
require(data.table)
3986
dodge2 <- position_dodge2(width = 0.9, padding = 0)
3987
rmse <- function(x, y) sqrt(mean((x - y)^2))
3988
3989
all_results_copy <- fread("Data/all_results.csv")
3990
all_results_copy <- all_results_copy[str_count(data_types, "_") == 1]
3991
3992
unique_combos <- fread("Data/shared_unique_combinations.csv")
3993
unique_combos[, unique_samples := paste0(cpd_name, "_", cell_name)]
3994
all_results_copy[, unique_samples := paste0(cpd_name, "_", cell_name)]
3995
all_results_copy <- all_results_copy[unique_samples %in% unique_combos$unique_samples]
3996
3997
all_results_copy <- all_results_copy[bottleneck == "No Data Bottleneck"]
3998
3999
# grid_design()
4000
4001
# mygrid <- data.frame(
4002
#   code = c("MUT_CNV", "MUT_EXP", "CNV_EXP", "CNV_PROT", "MUT_PROT", "EXP_PROT", "EXP_MIRNA", "CNV_MIRNA", "MUT_MIRNA", "PROT_MIRNA", "MIRNA_METAB", "PROT_METAB", "CNV_METAB", "EXP_METAB", "MUT_METAB", "MIRNA_HIST", "CNV_HIST", "EXP_HIST", "PROT_HIST", "MUT_HIST", "EXP_RPPA", "CNV_RPPA", "PROT_RPPA", "MIRNA_RPPA", "MUT_RPPA", "METAB_HIST", "METAB_RPPA", "HIST_RPPA"),
4003
#   name = c("", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""),
4004
#   row = c(1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 6, 7, 7),
4005
#   col = c(1, 1, 2, 2, 1, 3, 3, 2, 1, 4, 5, 4, 2, 3, 1, 5, 2, 3, 4, 1, 3, 2, 4, 5, 1, 6, 6, 7),
4006
#   stringsAsFactors = FALSE
4007
# )
4008
# geofacet::grid_preview(mygrid)
4009
4010
4011
all_results_copy[, loss_by_config := rmse(target, predicted),
4012
                 by = c("data_types", "merge_method", "loss_type", "drug_type",
4013
                        "split_method", "bottleneck", "TargetRange", "Targeted")]
4014
4015
all_results_copy <- unique(all_results_copy[, c("data_types", "merge_method", "loss_type",
4016
                                                "drug_type", "split_method", "bottleneck",
4017
                                                "TargetRange", "Targeted", "loss_by_config")])
4018
length(unique(all_results_copy$data_types))  # 28 unique trimodal combinations
4019
4020
# all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "bottleneck", "TargetRange")]
4021
# all_results_long_copy[, cv_sd := sd(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "bottleneck", "TargetRange")]
4022
length(unique(all_results_copy$data_types))  # 28 unique trimodal combinations
4023
4024
4025
# Show only trifecta results
4026
all_results_long_copy <-
4027
  all_results_copy[bottleneck == "No Data Bottleneck" &
4028
                     TargetRange == "Target Above 0.7" &
4029
                     (
4030
                       drug_type == "Base Model + GNN" &
4031
                         merge_method == "Base Model + LMF" &
4032
                         loss_type == "Base Model + LDS"
4033
                     )]
4034
# Assign model name
4035
# all_results_long_copy[(
4036
#   drug_type == "Base Model" &
4037
#     merge_method == "Base Model" &
4038
#     loss_type == "Base Model"
4039
# ), model_type := "Baseline"]
4040
all_results_long_copy[(
4041
  drug_type == "Base Model + GNN" &
4042
    merge_method == "Base Model + LMF" &
4043
    loss_type == "Base Model + LDS"
4044
), model_type := "Trifecta"]
4045
4046
4047
4048
all_results_long_copy <- unique(all_results_long_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "model_type",
4049
                                                          "TargetRange", "Targeted", "loss_by_config")])
4050
4051
all_results_long_copy[, first_data := strsplit(data_types, "_", fixed = T)[[1]][1], by = "data_types"]
4052
all_results_long_copy[, second_data := strsplit(data_types, "_", fixed = T)[[1]][2], by = "data_types"]
4053
all_results_long_copy$first_data <- factor(all_results_long_copy$first_data,
4054
                                           levels = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"))
4055
all_results_long_copy$second_data <- factor(all_results_long_copy$second_data,
4056
                                            levels = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"))
4057
4058
all_results_long_copy[, max_config_cv_mean := max(loss_by_config), by = c("data_types")]
4059
4060
# all_top_trimodal[, data_types := factor(data_types, levels = data_order)]
4061
# all_results_long_copy[, model_type := factor(unlist(all_results_long_copy[, "model_type", with = F]),
4062
#                                              levels = c("Baseline", "Trifecta"))]
4063
4064
table(all_results_long_copy[model_type == "Trifecta"]$data_types)
4065
4066
# baseline_trimodal <-
4067
#   all_results_copy[(
4068
#     drug_type == "Base Model" &
4069
#       merge_method == "Base Model" &
4070
#       loss_type == "Base Model"
4071
#   )]
4072
# trifectra_trimodal <-
4073
#   all_results_copy[(
4074
#     drug_type == "Base Model + GNN" &
4075
#       merge_method == "Base Model + LMF" &
4076
#       loss_type == "Base Model + LDS"
4077
#   )]
4078
# baseline_trimodal_cv <- unique(baseline_trimodal[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method",
4079
#                                                      "TargetRange", "cv_mean")])
4080
# trifecta_trimodal_cv <- unique(trifectra_trimodal[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method",
4081
#                                                       "TargetRange", "cv_mean", "cv_sd")])
4082
# 
4083
# upper_trifecta_trimodal_cv <- trifecta_trimodal_cv[TargetRange == "Target Above 0.7"]
4084
# 
4085
# upper_trifecta_trimodal_cv[, first_data := strsplit(data_types, "_", fixed = T)[[1]][1], by = "data_types"]
4086
# upper_trifecta_trimodal_cv[, second_data := strsplit(data_types, "_", fixed = T)[[1]][2], by = "data_types"]
4087
# upper_trifecta_trimodal_cv$first_data <- factor(upper_trifecta_trimodal_cv$first_data,
4088
#                                                 levels = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"))
4089
# upper_trifecta_trimodal_cv$second_data <- factor(upper_trifecta_trimodal_cv$second_data,
4090
#                                                 levels = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"))
4091
# 
4092
# upper_trifecta_trimodal_cv[, max_config_cv_mean := max(cv_mean), by = c("data_types")]
4093
4094
all_results_long_copy[split_method == "Split By Both Cell Line & Drug Scaffold",
4095
                      split_method := "Cell Line & Drug Scaffold"]
4096
all_results_long_copy[split_method == "Split By Cell Line",
4097
                      split_method := "Cell Line"]
4098
all_results_long_copy[split_method == "Split By Drug Scaffold",
4099
                      split_method := "Drug Scaffold"]
4100
all_results_long_copy[split_method == "Split By Cancer Type",
4101
                      split_method := "Cancer Type"]
4102
p <- ggplot(all_results_long_copy) +
4103
  geom_bar(mapping = aes(x = split_method,
4104
                         y = loss_by_config,
4105
                         # fill = factor(split_method,
4106
                         #               levels = c("Split By Cell Line",
4107
                         #                          "Split By Drug Scaffold",
4108
                         #                          "Split By Both Cell Line & Drug Scaffold",
4109
                         #                          "Split By Cancer Type")),
4110
                         fill = factor(Targeted,
4111
                                       levels = c("Untargeted Drug",
4112
                                                  "Targeted Drug")),
4113
                         color = loss_by_config == max_config_cv_mean),
4114
           stat = "identity", position='dodge', width = 0.9) +
4115
  scale_color_manual(values = c(NA, 'red'), guide='none') +
4116
  # facet_geo(~ data_types, grid = mygrid,  scales = "free_x",
4117
  #           strip.position = "left",
4118
  #           drop = T
4119
  #           # switch = "x"
4120
  #           ) +
4121
  facet_grid(rows = vars(second_data), cols = vars(first_data),
4122
             scales = "free_x", switch = "both") +
4123
  # scale_x_reordered() +
4124
  # facet_wrap(~second_data + first_data,
4125
  #            scales = "free_x", strip.position = "bottom") +
4126
  scale_fill_discrete(name = "Splitting Method:") +
4127
  # scale_x_discrete() +
4128
  # scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
4129
  #                              "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
4130
  # geom_errorbar(aes(x = split_method,
4131
  #                   y=cv_mean,
4132
  #                   ymax=cv_mean + cv_sd,
4133
  #                   ymin=cv_mean - cv_sd, col='red'),
4134
  #               linetype=1, show.legend = FALSE, position = dodge2, width = 0.9, colour = "black") +
4135
  theme(
4136
    text = element_text(size = 20, face = "bold"),
4137
    axis.text.x = element_text(angle = 45, hjust = 1),
4138
    # axis.text.x = element_blank(),
4139
    axis.title.x = element_blank(),
4140
    # axis.ticks = element_blank(),
4141
    legend.direction="horizontal",
4142
    legend.position="top",
4143
    legend.justification="right"
4144
    # strip.background = element_blank(),
4145
    # strip.text.x = element_blank(),
4146
    # legend.position = c(.8,.75)
4147
  ) +
4148
        # legend.position = c(.9,.85)) +
4149
  # ylab("RMSE Loss") +
4150
  # ylim(0, max(all_results_long_copy$loss_by_config) + 0.1)
4151
  # ylim(0, 1) +
4152
  scale_y_continuous(name = "Total RMSE Loss", limits = c(0, 1.25), breaks = c(0, 0.25, 0.5, 0.75, 1)) +
4153
  geom_text(aes(x=split_method, label = round(loss_by_config, 3), angle = 90,
4154
                group = factor(Targeted,
4155
                               levels = c("Untargeted Drug",
4156
                                          "Targeted Drug")),
4157
                y = loss_by_config), vjust = 0.5, hjust = -0.1, position = position_dodge(width = 0.9))
4158
4159
4160
p
4161
# Get ggplot grob
4162
g = ggplotGrob(p)
4163
4164
# Get the layout dataframe. 
4165
# Note the names.
4166
# g$layout
4167
4168
# gtable::gtable_show_layout(g) # Might also be useful
4169
4170
# Replace the grobs with the nullGrob
4171
cur_patterns <- c("panel-6-7", "panel-5-7", "panel-4-7", "panel-3-7", "panel-2-7", "panel-1-7",
4172
                  "panel-5-6", "panel-4-6", "panel-3-6", "panel-2-6", "panel-1-6",
4173
                  "panel-4-5", "panel-3-5", "panel-2-5", "panel-1-5",
4174
                  "panel-3-4", "panel-2-4", "panel-1-4",
4175
                  "panel-2-3", "panel-1-3",
4176
                  "panel-1-2")
4177
g = ggplotGrob(p)
4178
for (pattern in cur_patterns) {
4179
  pos <- grep(pattern = pattern, g$layout$name)
4180
  g$grobs[[pos]] <- nullGrob()
4181
}
4182
4183
# If you want, move the axis
4184
# g$layout[g$layout$name == "axis-b-2", c("t", "b")] = c(8, 8)
4185
4186
# Draw the plot
4187
grid.newpage()
4188
grid.draw(g)
4189
4190
4191
ggsave(filename = "Plots/CV_Results/Trimodal_CV_Trifecta_Split_Comparison_Grid.pdf",
4192
       plot = g,
4193
       height = 12, units = "in")
4194
4195
4196
# ==== Show sample counts for each trimodal combination (DepMap + CTRPv2 overlap)
4197
require(stringr)
4198
line_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
4199
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
4200
4201
exp <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
4202
mut <- fread("Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv")
4203
cnv <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv")
4204
prot <- fread("Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv")
4205
4206
mirna <- fread("Data/DRP_Training_Data/DepMap_2019_miRNA.csv")
4207
metab <- fread("Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv")
4208
hist <- fread("Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv")
4209
rppa <- fread("Data/DRP_Training_Data/DepMap_2019_RPPA.csv")
4210
4211
mut$stripped_cell_line_name = str_replace(toupper(mut$stripped_cell_line_name), "-", "")
4212
cnv$stripped_cell_line_name = str_replace(toupper(cnv$stripped_cell_line_name), "-", "")
4213
exp$stripped_cell_line_name = str_replace(toupper(exp$stripped_cell_line_name), "-", "")
4214
prot$stripped_cell_line_name = str_replace(toupper(prot$stripped_cell_line_name), "-", "")
4215
4216
mirna$stripped_cell_line_name = str_replace(toupper(mirna$stripped_cell_line_name), "-", "")
4217
hist$stripped_cell_line_name = str_replace(toupper(hist$stripped_cell_line_name), "-", "")
4218
metab$stripped_cell_line_name = str_replace(toupper(metab$stripped_cell_line_name), "-", "")
4219
rppa$stripped_cell_line_name = str_replace(toupper(rppa$stripped_cell_line_name), "-", "")
4220
4221
ctrp$ccl_name = str_replace(toupper(ctrp$ccl_name), "-", "")
4222
4223
mut_line_info <- line_info[stripped_cell_line_name %in% unique(mut$stripped_cell_line_name)]  
4224
cnv_line_info <- line_info[stripped_cell_line_name %in% unique(cnv$stripped_cell_line_name)]  
4225
exp_line_info <- line_info[stripped_cell_line_name %in% unique(exp$stripped_cell_line_name)]  
4226
prot_line_info <- line_info[stripped_cell_line_name %in% unique(prot$stripped_cell_line_name)]
4227
4228
mirna_line_info <- line_info[stripped_cell_line_name %in% unique(mirna$stripped_cell_line_name)]  
4229
hist_line_info <- line_info[stripped_cell_line_name %in% unique(hist$stripped_cell_line_name)]  
4230
metab_line_info <- line_info[stripped_cell_line_name %in% unique(metab$stripped_cell_line_name)]  
4231
rppa_line_info <- line_info[stripped_cell_line_name %in% unique(rppa$stripped_cell_line_name)]
4232
4233
ctrp_line_info <- line_info[stripped_cell_line_name %in% unique(ctrp$ccl_name)]
4234
4235
mut_line_info <- mut_line_info[, c("stripped_cell_line_name", "primary_disease")]
4236
mut_line_info$data_type <- "MUT"
4237
cnv_line_info <- cnv_line_info[, c("stripped_cell_line_name", "primary_disease")]
4238
cnv_line_info$data_type <- "CNV"
4239
exp_line_info <- exp_line_info[, c("stripped_cell_line_name", "primary_disease")]
4240
exp_line_info$data_type <- "EXP"
4241
prot_line_info <- prot_line_info[, c("stripped_cell_line_name", "primary_disease")]
4242
prot_line_info$data_type <- "PROT"
4243
4244
mirna_line_info <- mirna_line_info[, c("stripped_cell_line_name", "primary_disease")]
4245
mirna_line_info$data_type <- "MIRNA"
4246
hist_line_info <- hist_line_info[, c("stripped_cell_line_name", "primary_disease")]
4247
hist_line_info$data_type <- "HIST"
4248
metab_line_info <- metab_line_info[, c("stripped_cell_line_name", "primary_disease")]
4249
metab_line_info$data_type <- "METAB"
4250
rppa_line_info <- rppa_line_info[, c("stripped_cell_line_name", "primary_disease")]
4251
rppa_line_info$data_type <- "RPPA"
4252
4253
ctrp_line_info <- ctrp_line_info[, c("stripped_cell_line_name", "primary_disease")]
4254
ctrp_line_info$data_type <- "CTRP"
4255
4256
all_cells <- rbindlist(list(mut_line_info, cnv_line_info, exp_line_info, prot_line_info,
4257
               mirna_line_info, metab_line_info, hist_line_info, rppa_line_info))
4258
all_cells <- unique(all_cells)
4259
4260
rm(list = c("mut", "cnv", "exp", "prot", "mirna", "metab", "hist", "rppa"))
4261
gc()
4262
4263
all_tri_omic_combos_el <- utils::combn(c("MUT", 'CNV', 'EXP', 'PROT', 'MIRNA', 'METAB', 'HIST', 'RPPA'), 2, simplify = T)
4264
all_tri_omic_combos_el <- t(all_tri_omic_combos_el)
4265
all_tri_omic_combos_el <- as.data.table(all_tri_omic_combos_el)
4266
4267
# all_sample_counts <- vector(mode = "numeric", length = nrow(temp))
4268
ctrp_cells <- unique(ctrp_line_info$stripped_cell_line_name)
4269
all_tri_omic_combos_el$sample_counts <- vector(mode = "integer")
4270
for (i in 1:nrow(all_tri_omic_combos_el)) {
4271
  first_cells <- all_cells[data_type == all_tri_omic_combos_el[i, 1]]$stripped_cell_line_name
4272
  second_cells <- all_cells[data_type == all_tri_omic_combos_el[i, 2]]$stripped_cell_line_name
4273
  cell_overlap <- Reduce(intersect, list(first_cells, second_cells, ctrp_cells))
4274
  ctrp_overlap <- uniqueN(ctrp[ccl_name %in% cell_overlap])
4275
  all_tri_omic_combos_el[i, 3] <- ctrp_overlap
4276
}
4277
4278
temp <- trifecta_trimodal_cv[TargetRange == "Target Above 0.7"]
4279
4280
# ==== Trimodal Trifecta minus LMF (Split By Both Cell Line & Drug Scaffold) ====
4281
library(stringr)
4282
all_results_copy <- all_results[str_count(data_types, "_") == 1]
4283
all_results_copy[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange")]
4284
# all_results_copy[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
4285
4286
all_results_long_copy <- melt(unique(all_results_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "loss_by_config", "TargetRange")]),
4287
                              id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange"))
4288
4289
all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange")]
4290
length(unique(all_results_long_copy$data_types))  # 28 unique trimodal combinations
4291
4292
baseline_with_lmf <- all_results_long_copy[split_method == "SplitByDrugScaffold"]
4293
# baseline_with_lmf <- all_results_long_copy[(nchar(data_types) > 5)]
4294
p <- ggplot(baseline_with_lmf) +
4295
  geom_bar(mapping = aes(x = data_types, y = value, fill = fold), stat = "identity", position='dodge') +
4296
  facet_wrap(~drug_type+merge_method+loss_type+split_method+TargetRange, ncol = 2) + 
4297
  scale_fill_discrete(name = "CV Fold:") +
4298
  scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
4299
                               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
4300
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
4301
  ggtitle(label = tools::toTitleCase("Comparison of LMF Fusion across two true AAC range groups"),
4302
          subtitle = "5-fold validation RMSE loss using strict splitting") +
4303
  geom_errorbar(aes(x=data_types,
4304
                    y=cv_mean,
4305
                    ymax=cv_mean, 
4306
                    ymin=cv_mean, col='red'), linetype=2, show.legend = FALSE) +
4307
  geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean), vjust = -0.5)
4308
4309
ggsave(plot = p, filename = "Plots/CV_Results/Trimodal_CV_per_fold_Baseline_vs_Trifecta_SplitByBoth_Comparison.pdf",
4310
       width = 24, height = 16, units = "in")
4311
4312
4313
# Multi-modal Baseline vs Trifecta Bar Plot ====
4314
require(ggplot2)
4315
require(grid)
4316
library(stringr)
4317
require(data.table)
4318
dodge2 <- position_dodge2(width = 0.9, padding = 0)
4319
rmse <- function(x, y) sqrt(mean((x - y)^2))
4320
4321
4322
# all_results_copy <- fread("Data/all_results.csv")
4323
4324
all_results_copy <- all_results_copy[str_count(data_types, "_") > 1]
4325
4326
unique_combos <- fread("Data/shared_unique_combinations.csv")
4327
unique_combos[, unique_samples := paste0(cpd_name, "_", cell_name)]
4328
all_results_copy[, unique_samples := paste0(cpd_name, "_", cell_name)]
4329
all_results_copy <- all_results_copy[unique_samples %in% unique_combos$unique_samples]
4330
4331
all_results_copy[, loss_by_config := rmse(target, predicted),
4332
                 by = c("data_types", "merge_method", "loss_type", "drug_type",
4333
                        "split_method", "bottleneck", "TargetRange", "Targeted")]
4334
all_results_copy <- unique(all_results_copy[, c("data_types", "merge_method", "loss_type",
4335
                                                "drug_type", "split_method", "bottleneck",
4336
                                                "TargetRange", "Targeted", "loss_by_config")])
4337
length(unique(all_results_copy$data_types))  # 9 unique multimodal combinations
4338
4339
all_results_copy <- all_results_copy[bottleneck == "No Data Bottleneck"]
4340
4341
## Split By Both Cell Line ====
4342
# Subset by splitting method and AAC range
4343
all_results_long_copy <-
4344
  all_results_copy[split_method == "Split By Cell Line" &
4345
                     bottleneck == "No Data Bottleneck" &
4346
                     TargetRange == "Target Above 0.7" &
4347
                     ((
4348
                       drug_type == "Base Model" &
4349
                         merge_method == "Base Model" &
4350
                         loss_type == "Base Model"
4351
                     ) | (
4352
                       drug_type == "Base Model + GNN" &
4353
                         merge_method == "Base Model + LMF" &
4354
                         loss_type == "Base Model + LDS"
4355
                     ))]
4356
# Assign model name
4357
all_results_long_copy[(
4358
  drug_type == "Base Model" &
4359
    merge_method == "Base Model" &
4360
    loss_type == "Base Model"
4361
), model_type := "Baseline"]
4362
all_results_long_copy[(
4363
  drug_type == "Base Model + GNN" &
4364
    merge_method == "Base Model + LMF" &
4365
    loss_type == "Base Model + LDS"
4366
), model_type := "Trifecta"]
4367
4368
# all_results_long_copy <- unique(all_results_long_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "model_type",
4369
#                                                           "TargetRange", "Targeted", "loss_by_config")])
4370
4371
# all_results_long_copy[, first_data := strsplit(data_types, "_", fixed = T)[[1]][1], by = "data_types"]
4372
# all_results_long_copy[, second_data := strsplit(data_types, "_", fixed = T)[[1]][2], by = "data_types"]
4373
# all_results_long_copy$first_data <- factor(all_results_long_copy$first_data,
4374
#                                            levels = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"))
4375
# all_results_long_copy$second_data <- factor(all_results_long_copy$second_data,
4376
#                                             levels = c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"))
4377
4378
all_results_long_copy[, max_config_cv_mean := max(loss_by_config), by = c("data_types")]
4379
4380
# all_top_trimodal[, data_types := factor(data_types, levels = data_order)]
4381
all_results_long_copy[, model_type := factor(unlist(all_results_long_copy[, "model_type", with = F]),
4382
                                             levels = c("Baseline", "Trifecta"))]
4383
4384
all_results_long_copy[, data_types := gsub("_", "+", data_types, fixed = T)]
4385
p <- ggplot(all_results_long_copy) +
4386
  geom_bar(mapping = aes(x = model_type,
4387
                         y = loss_by_config,
4388
                         # fill = factor(model_type,
4389
                         #               levels = c("Baseline",
4390
                         #                          "Trifecta"))),
4391
                         fill = factor(Targeted,
4392
                                       levels = c("Untargeted Drug",
4393
                                                  "Targeted Drug"))),
4394
           # fill = c("Targeted", "model_type")),
4395
           stat = "identity", position='dodge', width = 0.9) +
4396
  scale_color_manual(values = c(NA, 'red'), guide='none') +
4397
  # facet_geo(~ data_types, grid = mygrid,  scales = "free_x",
4398
  #           strip.position = "left",
4399
  #           drop = T
4400
  #           # switch = "x"
4401
  #           ) +
4402
  # facet_grid(rows = vars(second_data), cols = vars(first_data),
4403
  #            scales = "free_x", switch = "both") +
4404
  # scale_x_reordered() +
4405
  facet_wrap(~data_types,
4406
             scales = "free_x", strip.position = "bottom") +
4407
  scale_fill_discrete(name = "Drug Type:") +
4408
  # scale_x_discrete(name = "Model Type") +
4409
  # scale_x_discrete() +
4410
  # scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
4411
  #                              "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
4412
  # geom_errorbar(aes(x = model_type,
4413
  #                   y=cv_mean,
4414
  #                   ymax=cv_mean + cv_sd,
4415
  #                   ymin=cv_mean - cv_sd, col='red'),
4416
  #               linetype=1, show.legend = FALSE, position = dodge2, width = 0.9, colour = "black") +
4417
  theme(
4418
    text = element_text(size = 20, face = "bold"),
4419
    # axis.text.x = element_text(angle = 0),
4420
    # axis.text.x = element_blank(),
4421
    # axis.ticks = element_blank(),
4422
    axis.title.x = element_blank(),
4423
    legend.direction="horizontal",
4424
    legend.position="top",
4425
    legend.justification="right"
4426
    # strip.background = element_blank(),
4427
    # strip.text.x = element_blank(),
4428
    # legend.position = c(.8,.75)
4429
  ) +
4430
  # legend.position = c(.9,.85)) +
4431
  # ylab("Total RMSE Loss") +
4432
  # ylim(0, max(all_results_long_copy$cv_mean) + max(all_results_long_copy$cv_sd) + 0.05) +
4433
  # ylim(0, 1.2) +
4434
  scale_y_continuous(name = "Total RMSE Loss", limits = c(0, 1.25), breaks = c(0, 0.25, 0.5, 0.75, 1)) +
4435
  geom_text(aes(x=model_type, label = round(loss_by_config, 3), angle = 90,
4436
                group = factor(Targeted,
4437
                               levels = c("Untargeted Drug",
4438
                                          "Targeted Drug")),
4439
                y = loss_by_config), vjust = 0.5, hjust = -0.1, position = position_dodge(width = 0.9))
4440
4441
ggsave(filename = "Plots/CV_Results/Multimodal_CV_Baseline_vs_Trifecta_BarPlot_Comparison_Grid.pdf",
4442
       plot = p,
4443
       height = 12, width = 14, units = "in")  
4444
4445
# p <- p + coord_flip()
4446
# all_results_long_copy[data_types %like% "MUT"]
4447
4448
# Get ggplot grob
4449
g = ggplotGrob(p)
4450
4451
# Get the layout dataframe. 
4452
# Note the names.
4453
# g$layout
4454
4455
# gtable::gtable_show_layout(g) # Might also be useful
4456
4457
# Replace the grobs with the nullGrob
4458
cur_patterns <- c("panel-6-7", "panel-5-7", "panel-4-7", "panel-3-7", "panel-2-7", "panel-1-7",
4459
                  "panel-5-6", "panel-4-6", "panel-3-6", "panel-2-6", "panel-1-6",
4460
                  "panel-4-5", "panel-3-5", "panel-2-5", "panel-1-5",
4461
                  "panel-3-4", "panel-2-4", "panel-1-4",
4462
                  "panel-2-3", "panel-1-3",
4463
                  "panel-1-2")
4464
g = ggplotGrob(p)
4465
for (pattern in cur_patterns) {
4466
  pos <- grep(pattern = pattern, g$layout$name)
4467
  g$grobs[[pos]] <- nullGrob()
4468
}
4469
4470
# If you want, move the axis
4471
# g$layout[g$layout$name == "axis-b-2", c("t", "b")] = c(8, 8)
4472
4473
# Draw the plot
4474
grid.newpage()
4475
grid.draw(g)
4476
4477
ggsave(filename = "Plots/CV_Results/Trimodal_CV_Baseline_vs_Trifecta_BarPlot_Comparison_Grid.pdf",
4478
       plot = g,
4479
       height = 12, units = "in")  
4480
4481
4482
4483
4484
# ==== Multimodal Baseline vs LMF (Split By Both Cell Line & Drug Scaffold) ====
4485
all_results_copy <- all_results
4486
all_results_copy[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange")]
4487
# all_results_copy[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
4488
4489
all_results_long_copy <- melt(unique(all_results_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "loss_by_config", "TargetRange")]),
4490
                              id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange"))
4491
4492
all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange")]
4493
4494
baseline_with_lmf <- all_results_long_copy[(drug_type == "Morgan" &
4495
                                              split_method == "SplitByBoth" & nchar(data_types) > 5)]
4496
baseline_with_lmf <- all_results_long_copy[(nchar(data_types) > 5)]
4497
p <- ggplot(baseline_with_lmf) +
4498
  geom_bar(mapping = aes(x = data_types, y = value, fill = fold), stat = "identity", position='dodge') +
4499
  facet_wrap(~merge_method+loss_type+split_method+TargetRange, ncol = 2) + 
4500
  scale_fill_discrete(name = "CV Fold:") +
4501
  scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
4502
                               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
4503
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
4504
  ggtitle(label = tools::toTitleCase("Comparison of LMF Fusion across two true AAC range groups"),
4505
          subtitle = "5-fold validation RMSE loss using strict splitting") +
4506
  geom_errorbar(aes(x=data_types,
4507
                    y=cv_mean,
4508
                    ymax=cv_mean, 
4509
                    ymin=cv_mean, col='red'), linetype=2, show.legend = FALSE) +
4510
  geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean), vjust = -0.5)
4511
4512
ggsave(plot = p, filename = "Plots/CV_Results/Multimodal_CV_per_fold_Baseline_vs_LMF_SplitByBoth_Comparison.pdf",
4513
       width = 24, height = 16, units = "in")
4514
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_GNN_Upper_0.7_Comparison_long.pdf",
4515
#        width = 24, height = 48, units = "in")
4516
4517
# ==== Multimodal Baseline vs LDS (Split By Both Cell Line & Drug Scaffold) ====
4518
all_results_copy <- all_results
4519
all_results_copy[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange")]
4520
# all_results_copy[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
4521
4522
all_results_long_copy <- melt(unique(all_results_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "loss_by_config", "TargetRange")]),
4523
                              id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "TargetRange"))
4524
4525
all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "TargetRange")]
4526
4527
baseline_with_lmf <- all_results_long_copy[(drug_type == "Morgan" & merge_method == "MergeByConcat" &
4528
                                              split_method == "SplitByBoth" & nchar(data_types) > 5)]
4529
# baseline_with_lmf <- all_results_long_copy[(nchar(data_types) > 5)]
4530
p <- ggplot(baseline_with_lmf) +
4531
  geom_bar(mapping = aes(x = data_types, y = value, fill = fold), stat = "identity", position='dodge') +
4532
  facet_wrap(~merge_method+loss_type+split_method+TargetRange, ncol = 2) + 
4533
  scale_fill_discrete(name = "CV Fold:") +
4534
  scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
4535
                               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
4536
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
4537
  ggtitle(label = tools::toTitleCase("Comparison of LMF Fusion across two true AAC range groups"),
4538
          subtitle = "5-fold validation RMSE loss using strict splitting") +
4539
  geom_errorbar(aes(x=data_types,
4540
                    y=cv_mean,
4541
                    ymax=cv_mean, 
4542
                    ymin=cv_mean, col='red'), linetype=2, show.legend = FALSE) +
4543
  geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean), vjust = -0.5)
4544
4545
ggsave(plot = p, filename = "Plots/CV_Results/Multimodal_CV_per_fold_Baseline_vs_LMF_SplitByBoth_Comparison.pdf",
4546
       width = 24, height = 16, units = "in")
4547
# ==== Upper Range AAC Comparison ====
4548
# targeted_drug_results <- all_results[cpd_name %in% targeted_drugs]
4549
all_results_copy <- all_results
4550
all_results_copy <- all_results_copy[target >= 0.7]
4551
all_results_copy[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold")]
4552
all_results_copy[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
4553
4554
all_results_long_copy <- melt(unique(all_results_copy[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "loss_by_config", "Targeted")]),
4555
                              id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "Targeted"))
4556
all_results_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "split_method", "Targeted")]
4557
4558
baseline_with_lds <- all_results_long_copy[(merge_method == "Concat" & drug_type == "DRUG" & split_method == "DRUG")]
4559
4560
ggplot(baseline_with_lds) +
4561
  geom_bar(mapping = aes(x = data_types, y = value, fill = fold), stat = "identity", position='dodge') +
4562
  facet_wrap(~merge_method+loss_type+drug_type+split_method+Targeted, nrow = 2) + 
4563
  scale_fill_discrete(name = "CV Fold:") +
4564
  scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
4565
                               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
4566
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
4567
  ggtitle(label = tools::toTitleCase("Comparison of Loss-weighting, fusion method and drug representation in the bi-modal case"),
4568
          subtitle = "Validation RMSE loss using strict splitting") +
4569
  geom_errorbar(aes(x=data_types,
4570
                    y=cv_mean,
4571
                    ymax=cv_mean, 
4572
                    ymin=cv_mean, col='red'), linetype=2, show.legend = FALSE) +
4573
  geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean), vjust = -0.5)
4574
4575
4576
# ==== 4 targeted drugs ("Gefitinib", "Tamoxifen", "MK-2206", "PLX-4720") ====
4577
temp <- all_results[cpd_name %in% c("Gefitinib", "Tamoxifen", "MK-2206", "PLX-4720", "Imatinib")]
4578
temp[, loss_by_config := mean(RMSELoss), by = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold")]
4579
# temp[, Targeted := ifelse(cpd_name %in% targeted_drugs, T, F)]
4580
4581
# temp_long_copy <- melt(unique(temp[, c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "loss_by_config", "Targeted")]),
4582
#                               id.vars = c("data_types", "merge_method", "loss_type", "drug_type", "split_method", "fold", "Targeted"))
4583
# temp_long_copy[, cv_mean := mean(value), by = c("data_types", "merge_method", "loss_type", "split_method", "Targeted")]
4584
# 
4585
# baseline_with_lds <- temp_long_copy[(merge_method == "Concat" & drug_type == "DRUG" & split_method == "DRUG")]
4586
# se <- function(y) sd(y)/length(y)
4587
temp_baseline_with_lds <- temp[(merge_method == "Concat" & drug_type == "DRUG" & split_method == "DRUG")]
4588
ggplot(data = temp_baseline_with_lds, mapping = aes(x = cpd_name, y = RMSELoss)) +
4589
  # geom_bar(stat = "identity", position='dodge') +
4590
  facet_wrap(~loss_type+split_method+data_types, nrow = 2) + 
4591
  scale_fill_discrete(name = "CV Fold:") +
4592
  # stat_summary_bin(geom = "errorbar", fun.data=function(RMSELoss)c(ymin=mean(RMSELoss)-se(RMSELoss),ymax=mean(RMSELoss)+se(RMSELoss)), position = "dodge") +
4593
  # stat_summary_bin(geom = "errorbar", fun.data='mean', position = "dodge") +
4594
  stat_summary(fun = mean, geom = "bar") +
4595
  stat_summary(fun.data = mean_se, geom = "errorbar") +
4596
4597
  
4598
  # scale_colour_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73",
4599
  #                              "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
4600
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
4601
  ggtitle(label = tools::toTitleCase("Comparison of Loss-weighting, fusion method and drug representation in the bi-modal case"),
4602
          subtitle = "Validation RMSE loss using strict splitting")
4603
  # geom_errorbar(aes(x=data_types,
4604
  #                   y=cv_mean,
4605
  #                   ymax=cv_mean, 
4606
  #                   ymin=cv_mean, col='red'), linetype=2, show.legend = FALSE) +
4607
  # geom_text(aes(x=data_types, label = round(cv_mean, 3), y = cv_mean), vjust = -0.5)