|
a |
|
b/R/cv_per_fold_results.R |
|
|
1 |
# cv_per_fold_results.R |
|
|
2 |
|
|
|
3 |
# Dependencies. library() is used instead of require() so that a missing
# package stops the script here rather than failing later at first use.
library(data.table)
library(ggplot2)
options(scipen = 3)

# Drug annotation table; the "targeted" drugs are the `rn` values of rows that
# have a non-empty protein target and a cpd_status of "clinical".
drug_info <- fread("Data/DRP_Training_Data/CTRP_DRUG_INFO.csv")
targeted_drugs <- drug_info[
  gene_symbol_of_protein_target != "" & cpd_status == "clinical"
]$rn

# Every file below Data/CV_Results/ whose name contains "final_validation".
cur_cv_files <- list.files(
  "Data/CV_Results/",
  recursive = TRUE,
  pattern = ".*final_validation.*",
  full.names = TRUE
)
|
|
12 |
|
|
|
13 |
# all_csv_results <- list.files("Data/CV_Results/", "CV_results.csv", recursive = T, full.names = T) |
|
|
14 |
# all_csv_results <- list.files("Data/CV_Results/", "CTRP_AAC_SMILES_inference_results.csv", recursive = T, full.names = T) |
|
|
15 |
# cur_cv_files <- grep(pattern = ".+drug_.{3,5}_HyperOpt.+", x = all_csv_results, value = T) |
|
|
16 |
|
|
|
17 |
|
|
|
18 |
# Read every validation file in cur_cv_files and tag each row with the run
# configuration parsed from the file path, then stack everything into one
# data.table.
# NOTE: gsub() returns its input unchanged when the pattern does not match, so
# a path that does not follow the expected naming scheme ends up with the full
# path stored in the corresponding column.
if (length(cur_cv_files) == 0L) {
  # 1:length(x) would iterate over c(1, 0) here and fail inside fread().
  stop("cur_cv_files is empty: no CV result files to load", call. = FALSE)
}

all_results <- vector(mode = "list", length = length(cur_cv_files))
for (i in seq_along(cur_cv_files)) {
  cur_file <- cur_cv_files[i]
  cur_res <- fread(cur_file)

  data_types <- gsub(".+ResponseOnly_\\w*drug_(.+)_HyperOpt.+", "\\1", cur_file)
  data_types <- toupper(data_types)
  merge_method <- gsub(".+MergeBy(\\w+)_.*RMSE.+", "\\1", cur_file)
  loss_method <- gsub(".+_(.*)RMSE.+", "\\1RMSE", cur_file)
  drug_type <- gsub(".+ResponseOnly_(\\w*)drug.+_HyperOpt.+", "\\1drug", cur_file)
  drug_type <- toupper(drug_type)
  split_method <- gsub(".+Split_(\\w+)_NoBottleNeck.+", "\\1", cur_file)
  # Captures a single digit only, so fold indices are expected to be 0-9.
  cur_fold <- gsub(".+CV_Index_(\\d)_.+", "\\1", cur_file)
  # data_types <- strsplit(data_types, "_")[[1]]
  # cur_res$epoch <- as.integer(epoch)

  cur_res$data_types <- data_types
  cur_res$merge_method <- merge_method
  cur_res$loss_type <- loss_method
  cur_res$drug_type <- drug_type
  cur_res$split_method <- split_method
  cur_res$fold <- cur_fold

  all_results[[i]] <- cur_res
}
all_results <- rbindlist(all_results)
|
|
41 |
# Per-row absolute prediction error. abs() is vectorised, so no row-wise
# grouping (by = .I) is needed.
# NOTE(review): the column is called RMSELoss, but the mean of absolute errors
# computed below is a mean absolute error rather than an RMSE -- confirm which
# metric the plots are meant to report.
all_results[, RMSELoss := abs(target - predicted)]

# Columns identifying one run (including the fold), and the same set without
# the fold, used to average across folds.
config_cols <- c("data_types", "merge_method", "loss_type", "drug_type",
                 "split_method", "fold")
cv_group_cols <- setdiff(config_cols, "fold")

# mean(all_results$RMSELoss)
all_results[, loss_by_config := mean(RMSELoss), by = config_cols]
# Drop the V1 column if the input files carried one.
if ("V1" %in% names(all_results)) {
  all_results$V1 <- NULL
}

long_results <- melt(
  unique(all_results[, c(config_cols, "loss_by_config"), with = FALSE]),
  id.vars = config_cols
)

# Mean over folds for each configuration. drug_type is part of the grouping so
# that the mean is not pooled across drug representations before the
# drug_type filter below is applied.
long_results[, cv_mean := mean(value), by = cv_group_cols]
# split_both_results <- long_results[split_method == "BOTH"]
# split_drug_results <- long_results[split_method == "DRUG"]
baseline_with_lds_results <- long_results[
  (merge_method == "Concat" & drug_type == "DRUG")
]

# Same summary restricted to the targeted drugs. The subset is a new table, so
# the := below does not modify all_results.
targeted_drug_results <- all_results[cpd_name %in% targeted_drugs]
targeted_drug_results[, loss_by_config := mean(RMSELoss), by = config_cols]
long_targeted_drug_results <- melt(
  unique(targeted_drug_results[, c(config_cols, "loss_by_config"), with = FALSE]),
  id.vars = config_cols
)
long_targeted_drug_results[, cv_mean := mean(value), by = cv_group_cols]

baseline_with_lds_targeted <- long_targeted_drug_results[
  (merge_method == "Concat" & drug_type == "DRUG")
]
|
|
62 |
|
|
|
63 |
# Per-fold loss bars for the Concat / DRUG baseline on the targeted drugs, with
# the cross-fold mean drawn as a dashed line and printed as a label.
# NOTE(review): this plots baseline_with_lds_targeted although the output file
# name says "Full_Comparison" -- confirm which table is intended.
# NOTE(review): col = 'red' sits inside aes(), so it is a constant mapped
# through scale_colour_manual(); presumably the line is drawn with the first
# manual palette value rather than in red -- confirm this is intended.
baseline_with_lds_plot <- ggplot(baseline_with_lds_targeted) +
  geom_bar(
    mapping = aes(x = data_types, y = value, fill = fold),
    stat = "identity", position = "dodge"
  ) +
  facet_wrap(~merge_method+loss_type+drug_type+split_method, nrow = 2) +
  scale_fill_discrete(name = "CV Fold:") +
  scale_colour_manual(values = c("#000000", "#E69F00", "#56B4E9", "#009E73",
                                 "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle(
    label = tools::toTitleCase("Comparison of Loss-weighting, fusion method and drug representation in the bi-modal case"),
    subtitle = "Validation RMSE loss using strict splitting"
  ) +
  geom_errorbar(
    aes(x = data_types, y = cv_mean, ymax = cv_mean, ymin = cv_mean, col = 'red'),
    linetype = 2, show.legend = FALSE
  ) +
  geom_text(
    aes(x = data_types, label = round(cv_mean, 3), y = cv_mean),
    vjust = -0.5
  )
# Bare name so the plot still auto-prints at top level, as the original
# unassigned expression did.
baseline_with_lds_plot

# scale_y_continuous(breaks = c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.5)) + ylim(c(0, 0.7))

# recursive = TRUE creates the parent "Plots" directory when needed;
# showWarnings = FALSE keeps re-runs quiet when the directory already exists.
dir.create("Plots/CV_Results/", recursive = TRUE, showWarnings = FALSE)
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_split_BOTH_per_fold_Full_Comparison.pdf")
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_split_DRUG_per_fold_Full_Comparison.pdf")
# Pass the plot explicitly instead of relying on last_plot().
ggsave(
  filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LDS_Full_Comparison.pdf",
  plot = baseline_with_lds_plot
)
|
|
85 |
|
|
|
86 |
# = Upper AAC (0.7) Comparison ====
# Repeat the per-fold comparison on the rows with target > 0.7 of the
# Concat / DRUG baseline. Subsetting returns a new table, so the := calls
# below do not modify all_results.
temp_results <- all_results[target > 0.7]
temp_results <- temp_results[(merge_method == "Concat" & drug_type == "DRUG")]
# Drop the loss computed on the full data before recomputing it on the subset.
temp_results[, loss_by_config := NULL]

config_cols <- c("data_types", "merge_method", "loss_type", "drug_type",
                 "split_method", "fold")

temp_results[, loss_by_config := mean(RMSELoss), by = config_cols]

long_temp_results <- melt(
  unique(temp_results[, c(config_cols, "loss_by_config"), with = FALSE]),
  id.vars = config_cols
)

# Mean over folds per configuration. drug_type is constant after the filter
# above, so including it in the grouping does not change the result here.
long_temp_results[, cv_mean := mean(value), by = setdiff(config_cols, "fold")]
# split_both_results <- long_temp_results[split_method == "BOTH"]
# split_drug_results <- long_temp_results[split_method == "DRUG"]

# The subtitle states "> 0.7" to match the strict filter applied above.
upper_aac_0.7_plot <- ggplot(long_temp_results) +
  geom_bar(
    mapping = aes(x = data_types, y = value, fill = fold),
    stat = "identity", position = "dodge"
  ) +
  facet_wrap(~merge_method+loss_type+drug_type+split_method, nrow = 2) +
  scale_fill_discrete(name = "CV Fold:") +
  scale_colour_manual(values = c("#000000", "#E69F00", "#56B4E9", "#009E73",
                                 "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle(
    label = tools::toTitleCase("Comparison of Loss-weighting, fusion method and drug representation in the bi-modal case"),
    subtitle = "Subset of AAC > 0.7 Validation RMSE loss using strict splitting"
  ) +
  geom_errorbar(
    aes(x = data_types, y = cv_mean, ymax = cv_mean, ymin = cv_mean, col = 'red'),
    linetype = 2, show.legend = FALSE
  ) +
  geom_text(
    aes(x = data_types, label = round(cv_mean, 3), y = cv_mean),
    vjust = -0.5
  )
# Bare name so the plot still auto-prints at top level.
upper_aac_0.7_plot

# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_split_BOTH_Upper_0.7_Comparison.pdf")
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_split_DRUG_Upper_0.7_Comparison.pdf")
# Pass the plot explicitly instead of relying on last_plot().
ggsave(
  filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LDS_Upper_0.7_Comparison.pdf",
  plot = upper_aac_0.7_plot
)
|
|
121 |
|
|
|
122 |
# = Upper AAC (0.9) Comparison ====
# Repeat the per-fold comparison on the rows with target > 0.9 of the
# Concat / DRUG baseline. Subsetting returns a new table, so the := calls
# below do not modify all_results.
temp_results <- all_results[target > 0.9]
temp_results <- temp_results[(merge_method == "Concat" & drug_type == "DRUG")]
# Drop the loss computed on the full data before recomputing it on the subset.
temp_results[, loss_by_config := NULL]

config_cols <- c("data_types", "merge_method", "loss_type", "drug_type",
                 "split_method", "fold")

temp_results[, loss_by_config := mean(RMSELoss), by = config_cols]

long_temp_results <- melt(
  unique(temp_results[, c(config_cols, "loss_by_config"), with = FALSE]),
  id.vars = config_cols
)

# Mean over folds per configuration. drug_type is constant after the filter
# above, so including it in the grouping does not change the result here.
long_temp_results[, cv_mean := mean(value), by = setdiff(config_cols, "fold")]
# split_both_results <- long_temp_results[split_method == "BOTH"]
# split_drug_results <- long_temp_results[split_method == "DRUG"]

# The subtitle states "> 0.9" to match the strict filter applied above.
upper_aac_0.9_plot <- ggplot(long_temp_results) +
  geom_bar(
    mapping = aes(x = data_types, y = value, fill = fold),
    stat = "identity", position = "dodge"
  ) +
  facet_wrap(~merge_method+loss_type+drug_type+split_method, nrow = 2) +
  scale_fill_discrete(name = "CV Fold:") +
  scale_colour_manual(values = c("#000000", "#E69F00", "#56B4E9", "#009E73",
                                 "#F0E442", "#0072B2", "#D55E00", "#CC79A7")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle(
    label = tools::toTitleCase("Comparison of Loss-weighting, fusion method and drug representation in the bi-modal case"),
    subtitle = "Subset of AAC > 0.9 Validation RMSE loss using strict splitting"
  ) +
  geom_errorbar(
    aes(x = data_types, y = cv_mean, ymax = cv_mean, ymin = cv_mean, col = 'red'),
    linetype = 2, show.legend = FALSE
  ) +
  geom_text(
    aes(x = data_types, label = round(cv_mean, 3), y = cv_mean),
    vjust = -0.5
  )
# Bare name so the plot still auto-prints at top level.
upper_aac_0.9_plot
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_split_BOTH_Upper_0.9_Comparison.pdf")
# ggsave(filename = "Plots/CV_Results/Bimodal_CV_per_fold_split_DRUG_Upper_0.9_Comparison.pdf")
# Pass the plot explicitly instead of relying on last_plot().
ggsave(
  filename = "Plots/CV_Results/Bimodal_CV_per_fold_Baseline_with_LDS_Upper_0.9_Comparison.pdf",
  plot = upper_aac_0.9_plot
)