# cv_per_fold_results.R
# Summarises per-fold cross-validation losses and plots them per configuration.

# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later with a less obvious error.
library(data.table)
library(ggplot2)
# Bias printing towards fixed notation over scientific notation.
options(scipen = 3)

# Drug annotation table; used to select the targeted, clinical compounds.
drug_info <- fread("Data/DRP_Training_Data/CTRP_DRUG_INFO.csv")
# `rn` values of compounds that have an annotated protein target and clinical
# status. These are matched against `cpd_name` further down the script.
targeted_drugs <- drug_info[
  gene_symbol_of_protein_target != "" & cpd_status == "clinical"
]$rn

# Every file below Data/CV_Results/ whose name contains "final_validation".
cur_cv_files <- list.files("Data/CV_Results/", recursive = TRUE,
                           pattern = ".*final_validation.*", full.names = TRUE)

# Alternative result sets that were used previously:
# all_csv_results <- list.files("Data/CV_Results/", "CV_results.csv", recursive = T, full.names = T)
# all_csv_results <- list.files("Data/CV_Results/", "CTRP_AAC_SMILES_inference_results.csv", recursive = T, full.names = T)
# cur_cv_files <- grep(pattern = ".+drug_.{3,5}_HyperOpt.+", x = all_csv_results, value = T)

# Stop with a clear message when nothing was found; otherwise the loop below
# would try to read a non-existent first file.
if (length(cur_cv_files) == 0L) {
  stop("No final_validation files found under Data/CV_Results/", call. = FALSE)
}

# Read each per-fold validation file and tag its rows with the experiment
# configuration encoded in the file path.
# NOTE: gsub() returns the whole path unchanged when a pattern does not match,
# so an unexpected file name shows up as a path-valued configuration field.
all_results <- vector(mode = "list", length = length(cur_cv_files))
for (i in seq_along(cur_cv_files)) {
  cur_file <- cur_cv_files[i]
  cur_res <- fread(cur_file)
  data_types <- gsub(".+ResponseOnly_\\w*drug_(.+)_HyperOpt.+", "\\1", cur_file)
  data_types <- toupper(data_types)
  merge_method <- gsub(".+MergeBy(\\w+)_.*RMSE.+", "\\1", cur_file)
  loss_method <- gsub(".+_(.*)RMSE.+", "\\1RMSE", cur_file)
  drug_type <- gsub(".+ResponseOnly_(\\w*)drug.+_HyperOpt.+", "\\1drug", cur_file)
  drug_type <- toupper(drug_type)
  split_method <- gsub(".+Split_(\\w+)_NoBottleNeck.+", "\\1", cur_file)
  # \\d+ rather than \\d so fold indices of 10 and above are parsed as well.
  cur_fold <- gsub(".+CV_Index_(\\d+)_.+", "\\1", cur_file)
  # data_types <- strsplit(data_types, "_")[[1]]
  # cur_res$epoch <- as.integer(epoch)
  cur_res$data_types <- data_types
  cur_res$merge_method <- merge_method
  cur_res$loss_type <- loss_method
  cur_res$drug_type <- drug_type
  cur_res$split_method <- split_method
  cur_res$fold <- cur_fold

  all_results[[i]] <- cur_res
}
all_results <- rbindlist(all_results)

# Per-sample absolute error; abs() is vectorised so no row-wise grouping is
# needed.
# NOTE(review): despite the column name this is |target - predicted|, so the
# per-configuration means below are mean absolute errors rather than RMSE -
# confirm that this is the intended metric for the "RMSE" plot titles.
all_results[, RMSELoss := abs(target - predicted)]

# mean(all_results$RMSELoss)
# Mean loss of every (configuration, fold) combination, repeated on each row.
all_results[, loss_by_config := mean(RMSELoss),
            by = c("data_types", "merge_method", "loss_type", "drug_type",
                   "split_method", "fold")]
# Drop the V1 column if the input files carried one; done in place and only
# when the column exists, to avoid a full copy and a warning.
if ("V1" %in% names(all_results)) {
  all_results[, V1 := NULL]
}

# One row per (configuration, fold): collapse the per-sample table to its
# distinct loss_by_config values and reshape to long format, which yields the
# `variable` / `value` columns used below.
long_results <- melt(
  unique(all_results[, c("data_types", "merge_method", "loss_type",
                         "drug_type", "split_method", "fold",
                         "loss_by_config")]),
  id.vars = c("data_types", "merge_method", "loss_type", "drug_type",
              "split_method", "fold")
)

# Mean over folds for each configuration. drug_type is part of the grouping
# because the mean is computed before the drug_type filter below; without it,
# folds from other drug representations would be averaged into the same value.
long_results[, cv_mean := mean(value),
             by = c("data_types", "merge_method", "loss_type", "drug_type",
                    "split_method")]
# split_both_results <- long_results[split_method == "BOTH"]
# split_drug_results <- long_results[split_method == "DRUG"]
# Keep only the concatenation-fusion runs with the "DRUG" drug type.
baseline_with_lds_results <- long_results[merge_method == "Concat" & drug_type == "DRUG"]

# Restrict to samples whose compound is in targeted_drugs. Subsetting returns
# a new table, so the := below does not modify all_results.
targeted_drug_results <- all_results[cpd_name %in% targeted_drugs]
# Recompute the per-(configuration, fold) mean loss on the subset only.
targeted_drug_results[, loss_by_config := mean(RMSELoss),
                      by = c("data_types", "merge_method", "loss_type",
                             "drug_type", "split_method", "fold")]
# One row per (configuration, fold) in long format (`variable` / `value`).
long_targeted_drug_results <- melt(
  unique(targeted_drug_results[, c("data_types", "merge_method", "loss_type",
                                   "drug_type", "split_method", "fold",
                                   "loss_by_config")]),
  id.vars = c("data_types", "merge_method", "loss_type", "drug_type",
              "split_method", "fold")
)
# Mean over folds for each configuration. drug_type is part of the grouping
# because the mean is computed before the drug_type filter below; without it,
# folds from other drug representations would be averaged into the same value.
long_targeted_drug_results[, cv_mean := mean(value),
                           by = c("data_types", "merge_method", "loss_type",
                                  "drug_type", "split_method")]

# Keep only the concatenation-fusion runs with the "DRUG" drug type.
baseline_with_lds_targeted <- long_targeted_drug_results[merge_method == "Concat" & drug_type == "DRUG"]

# Per-fold validation loss as dodged bars, one facet per configuration, with
# the cross-fold mean drawn as a dashed red line and printed as a label.
# NOTE(review): this plots the targeted-drug subset (baseline_with_lds_targeted)
# but is saved under a "Full_Comparison" file name; confirm whether the
# all-drug table baseline_with_lds_results was meant here instead.
baseline_targeted_plot <- ggplot(baseline_with_lds_targeted) +
  geom_bar(mapping = aes(x = data_types, y = value, fill = fold),
           stat = "identity", position = "dodge") +
  facet_wrap(~merge_method+loss_type+drug_type+split_method, nrow = 2) +
  scale_fill_discrete(name = "CV Fold:") +
  # Has no effect while no layer maps colour; kept for future colour mappings.
  scale_colour_manual(values = c("#000000", "#E69F00", "#56B4E9", "#009E73",
                                 "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle(label = tools::toTitleCase("Comparison of Loss-weighting, fusion method and drug representation in the bi-modal case"),
          subtitle = "Validation RMSE loss using strict splitting") +
  # ymin == ymax collapses the error bar into a horizontal line at cv_mean.
  # The colour is set as a constant outside aes(): inside aes() the string
  # "red" was treated as data and drawn with the first manual colour (black).
  geom_errorbar(aes(x = data_types, ymax = cv_mean, ymin = cv_mean),
                colour = "red", linetype = 2, show.legend = FALSE) +
  geom_text(aes(x = data_types, label = round(cv_mean, 3), y = cv_mean),
            vjust = -0.5)
# Display the plot, as the bare top-level ggplot() expression did before.
baseline_targeted_plot

# scale_y_continuous(breaks = c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.5)) + ylim(c(0, 0.7))

# Create missing parent directories too and stay silent when the directory
# already exists, so the script can be re-run.
dir.create("Plots/CV_Results/", recursive = TRUE, showWarnings = FALSE)
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_split_BOTH_per_fold_Full_Comparison.pdf")
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_split_DRUG_per_fold_Full_Comparison.pdf")
# Pass the plot explicitly instead of relying on last_plot().
ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LDS_Full_Comparison.pdf",
       plot = baseline_targeted_plot)

# = Upper AAC Comparison ====
# Samples with target above 0.7 from the concatenation-fusion runs with the
# "DRUG" drug type. Subsetting returns a new table, so overwriting
# loss_by_config below leaves all_results untouched.
temp_results <- all_results[target > 0.7 &
                              merge_method == "Concat" & drug_type == "DRUG"]

# Recompute the per-(configuration, fold) mean loss on the subset only.
temp_results[, loss_by_config := mean(RMSELoss),
             by = c("data_types", "merge_method", "loss_type", "drug_type",
                    "split_method", "fold")]

# One row per (configuration, fold) in long format (`variable` / `value`).
long_temp_results <- melt(
  unique(temp_results[, c("data_types", "merge_method", "loss_type",
                          "drug_type", "split_method", "fold",
                          "loss_by_config")]),
  id.vars = c("data_types", "merge_method", "loss_type", "drug_type",
              "split_method", "fold")
)

# Mean over folds for each configuration (only one drug_type remains here).
long_temp_results[, cv_mean := mean(value),
                  by = c("data_types", "merge_method", "loss_type",
                         "split_method")]
# split_both_results <- long_temp_results[split_method == "BOTH"]
# split_drug_results <- long_temp_results[split_method == "DRUG"]

upper_aac_plot <- ggplot(long_temp_results) +
  geom_bar(mapping = aes(x = data_types, y = value, fill = fold),
           stat = "identity", position = "dodge") +
  facet_wrap(~merge_method+loss_type+drug_type+split_method, nrow = 2) +
  scale_fill_discrete(name = "CV Fold:") +
  # Has no effect while no layer maps colour; kept for future colour mappings.
  scale_colour_manual(values = c("#000000", "#E69F00", "#56B4E9", "#009E73",
                                 "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # The subtitle says "> 0.7" to match the strict filter applied above.
  ggtitle(label = tools::toTitleCase("Comparison of Loss-weighting, fusion method and drug representation in the bi-modal case"),
          subtitle = "Subset of AAC > 0.7 Validation RMSE loss using strict splitting") +
  # ymin == ymax collapses the error bar into a horizontal line at cv_mean.
  # The colour is set as a constant outside aes(): inside aes() the string
  # "red" was treated as data and drawn with the first manual colour (black).
  geom_errorbar(aes(x = data_types, ymax = cv_mean, ymin = cv_mean),
                colour = "red", linetype = 2, show.legend = FALSE) +
  geom_text(aes(x = data_types, label = round(cv_mean, 3), y = cv_mean),
            vjust = -0.5)
# Display the plot, as the bare top-level ggplot() expression did before.
upper_aac_plot

# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_split_BOTH_Upper_0.7_Comparison.pdf")
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_split_DRUG_Upper_0.7_Comparison.pdf")
# Pass the plot explicitly instead of relying on last_plot().
ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LDS_Upper_0.7_Comparison.pdf",
       plot = upper_aac_plot)

# = Upper AAC (0.9) Comparison ====
# Samples with target above 0.9 from the concatenation-fusion runs with the
# "DRUG" drug type. Subsetting returns a new table, so overwriting
# loss_by_config below leaves all_results untouched.
temp_results <- all_results[target > 0.9 &
                              merge_method == "Concat" & drug_type == "DRUG"]

# Recompute the per-(configuration, fold) mean loss on the subset only.
temp_results[, loss_by_config := mean(RMSELoss),
             by = c("data_types", "merge_method", "loss_type", "drug_type",
                    "split_method", "fold")]

# One row per (configuration, fold) in long format (`variable` / `value`).
long_temp_results <- melt(
  unique(temp_results[, c("data_types", "merge_method", "loss_type",
                          "drug_type", "split_method", "fold",
                          "loss_by_config")]),
  id.vars = c("data_types", "merge_method", "loss_type", "drug_type",
              "split_method", "fold")
)

# Mean over folds for each configuration (only one drug_type remains here).
long_temp_results[, cv_mean := mean(value),
                  by = c("data_types", "merge_method", "loss_type",
                         "split_method")]
# split_both_results <- long_temp_results[split_method == "BOTH"]
# split_drug_results <- long_temp_results[split_method == "DRUG"]

upper_aac_plot <- ggplot(long_temp_results) +
  geom_bar(mapping = aes(x = data_types, y = value, fill = fold),
           stat = "identity", position = "dodge") +
  facet_wrap(~merge_method+loss_type+drug_type+split_method, nrow = 2) +
  scale_fill_discrete(name = "CV Fold:") +
  # Has no effect while no layer maps colour; kept for future colour mappings.
  scale_colour_manual(values = c("#000000", "#E69F00", "#56B4E9", "#009E73",
                                 "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # The subtitle says "> 0.9" to match the strict filter applied above.
  ggtitle(label = tools::toTitleCase("Comparison of Loss-weighting, fusion method and drug representation in the bi-modal case"),
          subtitle = "Subset of AAC > 0.9 Validation RMSE loss using strict splitting") +
  # ymin == ymax collapses the error bar into a horizontal line at cv_mean.
  # The colour is set as a constant outside aes(): inside aes() the string
  # "red" was treated as data and drawn with the first manual colour (black).
  geom_errorbar(aes(x = data_types, ymax = cv_mean, ymin = cv_mean),
                colour = "red", linetype = 2, show.legend = FALSE) +
  geom_text(aes(x = data_types, label = round(cv_mean, 3), y = cv_mean),
            vjust = -0.5)
# Display the plot, as the bare top-level ggplot() expression did before.
upper_aac_plot
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_split_BOTH_Upper_0.9_Comparison.pdf")
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_split_DRUG_Upper_0.9_Comparison.pdf")
# Pass the plot explicitly instead of relying on last_plot().
ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LDS_Upper_0.9_Comparison.pdf",
       plot = upper_aac_plot)