|
a |
|
b/R/performance_analysis.R |
|
|
1 |
# performance_analysis.R |
|
|
2 |
|
|
|
3 |
require(data.table) |
|
|
4 |
require(ggplot2) |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
#' Plot mean absolute loss per cell-line lineage for CTRPv2, GDSC1 and GDSC2
#'
#' Reads the three `*_AAC_MORGAN_512_inference_results.csv` files, attaches the
#' lineage of every cell line, averages `MAE_loss` per lineage and writes a
#' dodged bar chart (one bar per dataset and lineage) to a PDF. Dashed
#' horizontal lines mark the overall mean `MAE_loss` of each dataset.
#'
#' NOTE(review): the body reads `path` (prefix of the inference result files)
#' and `title` (plot title), neither of which is a parameter. Both are resolved
#' from the enclosing environment at call time, exactly as before. The five
#' `*_path` arguments and `inference_results_paths` are accepted but not used
#' yet -- confirm the intended wiring before relying on this function.
#'
#' @param with_bottleneck_path,without_bottleneck_path Currently unused.
#' @param split_by_cell_path,split_by_drug_path,split_by_both_path Currently
#'   unused.
#' @param plot_path Prefix (including trailing separator) of the output file.
#' @param cell_line_data Table with `stripped_cell_line_name` and `lineage`
#'   columns.
#' @param inference_results_paths Currently unused.
#' @param subtitle Plot subtitle.
#' @param plot_name File name of the PDF, appended to `plot_path`.
plot_loss_by_bottleneck_and_split <- function(with_bottleneck_path, without_bottleneck_path, split_by_cell_path,
                                              split_by_drug_path, split_by_both_path,
                                              plot_path, cell_line_data, inference_results_paths, subtitle, plot_name) {
  # Read one inference result file and attach the lineage of each cell line.
  load_results <- function(file_name) {
    results <- fread(paste0(path, file_name))
    merge(results, cell_line_data[, c("stripped_cell_line_name", "lineage")],
          by.x = "cell_name", by.y = "stripped_cell_line_name")
  }

  # Add per-lineage mean/sd/count columns to `results` (by reference) and
  # return one row per lineage, tagged with the dataset label.
  summarise_by_lineage <- function(results, dataset_label) {
    # results[, abs_loss := sqrt(MSE_loss)]
    results[, lineage_loss_avg := mean(MAE_loss), by = "lineage"]
    results[, lineage_loss_sd := sd(MAE_loss), by = "lineage"]
    results[, sample_by_lineage_count := .N, by = "lineage"]
    per_lineage <- unique(results[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
    per_lineage$Dataset <- dataset_label
    per_lineage
  }

  ctrp_data <- load_results("CTRP_AAC_MORGAN_512_inference_results.csv")
  gdsc1_data <- load_results("GDSC1_AAC_MORGAN_512_inference_results.csv")
  gdsc2_data <- load_results("GDSC2_AAC_MORGAN_512_inference_results.csv")

  ctrp_avg_abs_by_lineage <- summarise_by_lineage(ctrp_data, "CTRPv2")
  gdsc1_avg_abs_by_lineage <- summarise_by_lineage(gdsc1_data, "GDSC1")
  gdsc2_avg_abs_by_lineage <- summarise_by_lineage(gdsc2_data, "GDSC2")

  all_avg_abs_by_lineage <- rbindlist(list(ctrp_avg_abs_by_lineage, gdsc1_avg_abs_by_lineage, gdsc2_avg_abs_by_lineage))
  # The "n = ..." part of each label is the CTRPv2 row count of that lineage;
  # the merge keeps only lineages that occur in the CTRPv2 results.
  all_avg_abs_by_lineage <- merge(all_avg_abs_by_lineage, unique(ctrp_data[, c("lineage", "sample_by_lineage_count")]))
  all_avg_abs_by_lineage$lineage <- paste0(all_avg_abs_by_lineage$lineage, ", n = ", all_avg_abs_by_lineage$sample_by_lineage_count)

  # Overall mean loss per dataset. These previously read an `abs_loss` column
  # that is never created here (its derivation is commented out), which made
  # every reference line and extra axis break NA. MAE_loss is the column the
  # bars themselves are computed from.
  ctrp_mean <- mean(ctrp_data$MAE_loss)
  gdsc1_mean <- mean(gdsc1_data$MAE_loss)
  gdsc2_mean <- mean(gdsc2_data$MAE_loss)

  g <- ggplot(data = all_avg_abs_by_lineage, mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg, fill = Dataset)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    # geom_errorbar(aes(ymin = lineage_loss_avg - lineage_loss_sd, ymax = lineage_loss_avg + lineage_loss_sd), width = 0.2, position = position_dodge(0.9)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    geom_hline(yintercept = ctrp_mean, linetype = "dashed", color = "red") +
    geom_hline(yintercept = gdsc1_mean, linetype = "dashed", color = "green") +
    geom_hline(yintercept = gdsc2_mean, linetype = "dashed", color = "blue") +
    xlab("Cell Line Lineage + # training datapoints") + ylab("Mean Absolute Loss") +
    # Regular grid of breaks plus one labelled break at each dataset mean.
    scale_y_continuous(breaks = sort(c(seq(0, 0.25, length.out = 10), ctrp_mean, gdsc1_mean, gdsc2_mean))) +
    ggtitle(label = title, subtitle = subtitle)

  # Pass the plot explicitly instead of relying on last_plot().
  ggsave(plot = g, filename = paste0(plot_path, plot_name), device = "pdf")
}
|
|
65 |
|
|
|
66 |
|
|
|
67 |
#' Bar chart of CTRPv2 RMSE loss per cell-line lineage
#'
#' Reads `CV_results.csv` and `CTRP_AAC_MORGAN_1024_inference_results.csv`
#' from `path`, averages `RMSE_loss` per lineage and saves a bar chart as PDF.
#' The average cross-validation loss (row `avg_cv_valid_loss`, second column
#' of `CV_results.csv`) is appended to the subtitle. GDSC1/GDSC2 handling is
#' currently disabled, so only CTRPv2 is plotted.
#'
#' @param path Prefix (including trailing separator) of the result files.
#' @param plot_path Prefix (including trailing separator) of the output file.
#' @param cell_line_data Table with `stripped_cell_line_name` and `lineage`
#'   columns.
#' @param title Plot title.
#' @param subtitle First line of the plot subtitle.
#' @param plot_filename File name of the PDF, appended to `plot_path`.
#' @param display_plot If `TRUE`, the plot is also printed before saving.
plot_loss_by_lineage <- function(path,
                                 plot_path, cell_line_data, title, subtitle, plot_filename, display_plot = FALSE) {
  # Average cross-validation loss, formatted with four decimals.
  cv_table <- fread(paste0(path, "CV_results.csv"))
  cv_valid_loss <- format(round(cv_table[V1 == "avg_cv_valid_loss"][, 2], 4), nsmall = 4)

  # CTRPv2 inference results with the lineage of every cell line attached.
  ctrp_raw <- fread(paste0(path, "CTRP_AAC_MORGAN_1024_inference_results.csv"))
  ctrp_data <- merge(
    ctrp_raw,
    cell_line_data[, c("stripped_cell_line_name", "lineage")],
    by.x = "cell_name",
    by.y = "stripped_cell_line_name"
  )

  # Per-lineage mean, standard deviation and row count, added as columns.
  ctrp_data[, `:=`(
    lineage_loss_avg = mean(RMSE_loss),
    lineage_loss_sd = sd(RMSE_loss),
    sample_by_lineage_count = .N
  ), by = "lineage"]

  # One row per lineage; the axis label carries the row count ("<lineage>, n = <count>").
  lineage_summary <- unique(ctrp_data[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
  lineage_summary$Dataset <- "CTRPv2"
  lineage_counts <- unique(ctrp_data[, c("lineage", "sample_by_lineage_count")])
  lineage_summary <- merge(lineage_summary, lineage_counts)
  lineage_summary$lineage <- paste0(lineage_summary$lineage, ", n = ", lineage_summary$sample_by_lineage_count)

  # Dashed reference line and the extra labelled axis break share this value.
  reference_loss <- mean(ctrp_data$lineage_loss_avg)
  y_breaks <- sort(c(seq(0, 0.25, length.out = 10), c(reference_loss)))
  full_subtitle <- paste0(subtitle, "\nAverage Cross-Validation RMSE Loss:", as.character(cv_valid_loss))

  g <- ggplot(
    data = lineage_summary,
    mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg, fill = Dataset)
  ) +
    geom_bar(stat = "identity", position = position_dodge()) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    geom_hline(yintercept = reference_loss, linetype = "dashed", color = "red") +
    labs(x = "Cell Line Lineage + # testing datapoints", y = "RMSE Loss") +
    scale_y_continuous(breaks = y_breaks) +
    ggtitle(label = title, subtitle = full_subtitle)

  if (display_plot == TRUE) {
    print(g)
  }

  ggsave(plot = g, filename = paste0(plot_path, plot_filename), device = "pdf")
}
|
|
129 |
# plot_path <- "Plots/DRP/Lineage_Results/"
# Cell line annotation table; the code below uses its
# `stripped_cell_line_name` and `lineage` columns to attach a lineage to
# every inference result row.
cell_line_data <- fread(input = "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
|
|
131 |
|
|
|
132 |
|
|
|
133 |
#' Plot the per-lineage loss for one hyper-parameter search configuration
#'
#' Builds the results directory, output directory, file name, title and
#' subtitle for the given configuration and hands them to
#' `plot_loss_by_lineage()`. Reads the global `cell_line_data` table.
#'
#' @param model_type Model name used in the results path, e.g. "FullModel".
#' @param data_type Omic data suffix used in the results path, e.g. "_mut",
#'   or "" for none.
#' @param split "CELL_LINE", "DRUG", or any other value (treated as the
#'   combined cell line & drug split).
#' @param bottleneck "WithBottleNeck"; any other value is labelled
#'   "No Bottleneck".
#' @param drug_type Drug representation used in the results path, e.g.
#'   "OneHotDrugs".
plot_grid_mono <- function(model_type, data_type, split, bottleneck, drug_type) {
  # Example directory name:
  # HyperOpt_DRP_ResponseOnly_drug_rppa_HyperOpt_DRP_CTRP_1024_ResponseOnly_EncoderTrain_Split_DRUG_NoBottleNeck_NoTCGAPretrain_MergeBySum_RMSELoss_OneHotDrugs_drug_rppa
  path <- paste0(
    "Data/CV_Results/HyperOpt_DRP_", model_type, "_drug", data_type,
    "_HyperOpt_DRP_CTRP_1024_", model_type, "_EncoderTrain_Split_", split, "_", bottleneck,
    "_NoTCGAPretrain_MergeBySum_RMSELoss_", drug_type, "_drug", data_type, "/"
  )

  if (split == "CELL_LINE") {
    plot_path <- "Plots/DRP/Split_by_Cell/"
    plot_split_name <- "SplitByCell"
    title_split_name <- "Cell Line"
  } else if (split == "DRUG") {
    plot_path <- "Plots/DRP/Split_by_Drug/"
    plot_split_name <- "SplitByDrug"
    title_split_name <- "Drug"
  } else {
    plot_path <- "Plots/DRP/Split_by_Both/"
    plot_split_name <- "SplitByBoth"
    title_split_name <- "Cell Line & Drug"
  }

  if (bottleneck == "WithBottleNeck") {
    subtitle_bottleneck_name <- "With Bottleneck"
  } else {
    subtitle_bottleneck_name <- "No Bottleneck"
  }

  # Create missing parent directories too, and stay quiet when the directory
  # already exists (this function is called once per grid row).
  dir.create(plot_path, recursive = TRUE, showWarnings = FALSE)

  plot_filename <- paste0(model_type, "_drug", data_type, "_train_CTRPv2_test_All_RMSE_", plot_split_name, "_", bottleneck, "_", drug_type, ".pdf")
  title <- paste0("DRP RMSE (Validation by Strict ", title_split_name, " Splitting)")
  subtitle <- paste0("Model Type: ", model_type, " | Data: Drug + ", gsub("_", "", toupper(data_type)), " | Drug Type: ", drug_type, " | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: ", subtitle_bottleneck_name)

  plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)
}
|
|
166 |
|
|
|
167 |
# Configurations to plot: every combination of the vectors below.
model_types <- c("FullModel", "ResponseOnly")
# "" means drug data only; the other entries are "_<omic type>" path suffixes.
data_types <- c("", paste0("_", c("mut", "exp", "prot", "mirna", "metab", "rppa", "hist")))
# splits <- c("CELL_LINE", "DRUG", "BOTH")
splits <- c("DRUG")
# bottlenecking <- c("WithBottleNeck", "NoBottleNeck")
bottlenecking <- c("NoBottleNeck")
drug_types <- c("OneHotDrugs")
grid <- expand.grid(model_types, data_types, splits, bottlenecking, drug_types)

# seq_len() yields an empty sequence for an empty grid, unlike 1:nrow(grid).
for (i in seq_len(nrow(grid))) {
  plot_grid_mono(model_type = grid[i, 1], data_type = grid[i, 2], split = grid[i, 3], bottleneck = grid[i, 4], drug_type = grid[i, 5])
}
|
|
181 |
|
|
|
182 |
# ==== Drug + Mut ====
# One plot_loss_by_lineage() run per split strategy and bottleneck setting.
# NOTE(review): the output file names start with "drug_prot" and the titles
# say "Mean Absolute Loss", while the inputs are drug_mut results and
# plot_loss_by_lineage() plots RMSE_loss -- confirm whether that is intended.

# Split by Cell, No Bottleneck
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_mut_HyperOpt_DRP_CTRP_FullModel_EncoderTrain_Split_CELL_LINE_NoBottleNeck_WithTCGAPretrain_drug_mut/"
plot_path <- "Plots/DRP/Split_by_Cell/"
# recursive: create missing parents; showWarnings: quiet if it already exists.
dir.create(plot_path, recursive = TRUE, showWarnings = FALSE)
plot_filename <- "drug_prot_train_CTRPv2_test_All_MAE_SplitByCell_NoBottleneck.pdf"
subtitle <- "Data: Drug + Mutational | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: No Bottleneck"
title <- "Full DRP Mean Absolute Loss (Validation by Strict Cell Line Splitting)"
plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)

# Split by Cell, With Bottleneck
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_mut_HyperOpt_DRP_CTRP_FullModel_EncoderTrain_Split_CELL_LINE_WithBottleNeck_WithTCGAPretrain_drug_mut/"
plot_path <- "Plots/DRP/Split_by_Cell/"
dir.create(plot_path, recursive = TRUE, showWarnings = FALSE)
plot_filename <- "drug_prot_train_CTRPv2_test_All_MAE_SplitByCell_WithBottleneck.pdf"
subtitle <- "Data: Drug + Mutational | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: With Bottleneck"
title <- "Full DRP Mean Absolute Loss (Validation by Strict Cell Line Splitting)"
plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)
# ============
# Split by Drug, No Bottleneck
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_mut_HyperOpt_DRP_CTRP_FullModel_EncoderTrain_Split_DRUG_NoBottleNeck_WithTCGAPretrain_drug_mut/"
plot_path <- "Plots/DRP/Split_by_Drug/"
dir.create(plot_path, recursive = TRUE, showWarnings = FALSE)
plot_filename <- "drug_prot_train_CTRPv2_test_All_MAE_SplitByDrug_NoBottleneck.pdf"
subtitle <- "Data: Drug + Mutational | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: No Bottleneck"
title <- "Full DRP Mean Absolute Loss (Validation by Strict Drug Splitting)"
plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)

# Split by Drug, With Bottleneck
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_mut_HyperOpt_DRP_CTRP_FullModel_EncoderTrain_Split_DRUG_WithBottleNeck_WithTCGAPretrain_drug_mut/"
plot_path <- "Plots/DRP/Split_by_Drug/"
dir.create(plot_path, recursive = TRUE, showWarnings = FALSE)
plot_filename <- "drug_prot_train_CTRPv2_test_All_MAE_SplitByDrug_WithBottleneck.pdf"
subtitle <- "Data: Drug + Mutational | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: With Bottleneck"
title <- "Full DRP Mean Absolute Loss (Validation by Strict Drug Splitting)"
plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)
# ============
# Split by Both (Cell Line & Drug), No Bottleneck
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_mut_HyperOpt_DRP_CTRP_FullModel_EncoderTrain_Split_BOTH_NoBottleNeck_WithTCGAPretrain_drug_mut/"
plot_path <- "Plots/DRP/Split_by_Both/"
dir.create(plot_path, recursive = TRUE, showWarnings = FALSE)
plot_filename <- "drug_prot_train_CTRPv2_test_All_MAE_SplitByBoth_NoBottleneck.pdf"
subtitle <- "Data: Drug + Mutational | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: No Bottleneck"
title <- "Full DRP Mean Absolute Loss (Validation by Strict Cell Line & Drug Splitting)"
plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)

# Split by Both (Cell Line & Drug), With Bottleneck
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_mut_HyperOpt_DRP_CTRP_FullModel_EncoderTrain_Split_BOTH_WithBottleNeck_WithTCGAPretrain_drug_mut/"
plot_path <- "Plots/DRP/Split_by_Both/"
dir.create(plot_path, recursive = TRUE, showWarnings = FALSE)
# This run previously reused the "..._NoBottleneck.pdf" name and overwrote
# the plot written by the No Bottleneck run just above.
plot_filename <- "drug_prot_train_CTRPv2_test_All_MAE_SplitByBoth_WithBottleneck.pdf"
subtitle <- "Data: Drug + Mutational | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: With Bottleneck"
title <- "Full DRP Mean Absolute Loss (Validation by Strict Cell Line & Drug Splitting)"
plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)
|
|
237 |
|
|
|
238 |
|
|
|
239 |
# |
|
|
240 |
# Output directory prefix used by the ggsave() calls below.
plot_path <- "Plots/DRP/Split_by_Both/"
plot_name <- ""
# Plot average MSE by lineage Full (drug + prot) ================================
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_prot_CTRP_Full/"

### GDSC1 ====
# Inference results with the lineage of each cell line attached.
cur_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])
# Bars: mean MSE per lineage, highest first; dashed line: overall mean MSE.
ggplot(data = avg_mse_by_lineage) +
  geom_col(mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  labs(
    x = "Cell Line Lineage",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Proteomics | Trained on CTRPv2 | Tested on GDSC1"
  )
ggsave(paste0(plot_path, "drug_prot_full_train_CTRPv2_test_GDSC1_avg_MSE_by_lineage.pdf"), device = "pdf")


### GDSC2 ====
cur_data <- merge(
  fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])
ggplot(data = avg_mse_by_lineage) +
  geom_col(mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  labs(
    x = "Cell Line Lineage",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Proteomics | Trained on CTRPv2 | Tested on GDSC2"
  )
ggsave(paste0(plot_path, "drug_prot_full_train_CTRPv2_test_GDSC2_avg_MSE_by_lineage.pdf"), device = "pdf")


### CTRPv2 ====
cur_data <- merge(
  fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])
ggplot(data = avg_mse_by_lineage) +
  geom_col(mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  labs(
    x = "Cell Line Lineage",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Proteomics | Trained on CTRPv2 | Tested on CTRPv2"
  )
ggsave(paste0(plot_path, "drug_prot_train_CTRPv2_test_CTRP_avg_MSE_by_lineage.pdf"), device = "pdf")
|
|
289 |
|
|
|
290 |
|
|
|
291 |
### All side by side (lineage bar plot) ====
# Load the three inference result files and attach the lineage of each cell line.
ctrp_data <- merge(
  fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)
gdsc1_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)
gdsc2_data <- merge(
  fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Per dataset: square root of MSE_loss per row, then its mean, standard
# deviation and row count per lineage.
ctrp_data[, abs_loss := sqrt(MSE_loss)]
ctrp_data[, `:=`(
  lineage_loss_avg = mean(abs_loss),
  lineage_loss_sd = sd(abs_loss),
  sample_by_lineage_count = .N
), by = "lineage"]
ctrp_avg_abs_by_lineage <- unique(ctrp_data[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
ctrp_avg_abs_by_lineage[, Dataset := "CTRPv2"]

gdsc1_data[, abs_loss := sqrt(MSE_loss)]
gdsc1_data[, `:=`(
  lineage_loss_avg = mean(abs_loss),
  lineage_loss_sd = sd(abs_loss),
  sample_by_lineage_count = .N
), by = "lineage"]
gdsc1_avg_abs_by_lineage <- unique(gdsc1_data[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
gdsc1_avg_abs_by_lineage[, Dataset := "GDSC1"]

gdsc2_data[, abs_loss := sqrt(MSE_loss)]
gdsc2_data[, `:=`(
  lineage_loss_avg = mean(abs_loss),
  lineage_loss_sd = sd(abs_loss),
  sample_by_lineage_count = .N
), by = "lineage"]
gdsc2_avg_abs_by_lineage <- unique(gdsc2_data[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
gdsc2_avg_abs_by_lineage[, Dataset := "GDSC2"]

# Stack the three summaries; the "n = ..." label uses the CTRPv2 row counts.
all_avg_abs_by_lineage <- rbindlist(list(ctrp_avg_abs_by_lineage, gdsc1_avg_abs_by_lineage, gdsc2_avg_abs_by_lineage))
all_avg_abs_by_lineage <- merge(
  all_avg_abs_by_lineage,
  unique(ctrp_data[, c("lineage", "sample_by_lineage_count")])
)
all_avg_abs_by_lineage$lineage <- paste0(all_avg_abs_by_lineage$lineage, ", n = ", all_avg_abs_by_lineage$sample_by_lineage_count)

# Dodged bars per dataset; dashed lines and extra axis breaks mark each
# dataset's overall mean.
ggplot(
  data = all_avg_abs_by_lineage,
  mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg, fill = Dataset)
) +
  geom_col(position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = mean(ctrp_data$abs_loss), linetype = "dashed", color = "red") +
  geom_hline(yintercept = mean(gdsc1_data$abs_loss), linetype = "dashed", color = "green") +
  geom_hline(yintercept = mean(gdsc2_data$abs_loss), linetype = "dashed", color = "blue") +
  labs(x = "Cell Line Lineage + # training datapoints", y = "Average Absolute Loss") +
  scale_y_continuous(
    breaks = sort(c(
      seq(0, 0.12, length.out = 5),
      c(mean(ctrp_data$abs_loss), mean(gdsc1_data$abs_loss), mean(gdsc2_data$abs_loss))
    ))
  ) +
  labs(
    title = "Full DRP Mean Absolute Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Proteomics | Trained on CTRPv2 | Tested on All 3"
  )
ggsave(paste0(plot_path, "drug_prot_train_CTRPv2_test_All_avg_Abs_by_lineage.pdf"), device = "pdf")
|
|
341 |
|
|
|
342 |
|
|
|
343 |
### All side by side (cell line dot plot) ====
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_prot_CTRP_Full/"

# Load the three inference result files and attach the lineage of each cell line.
ctrp_data <- merge(
  fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)
gdsc1_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)
gdsc2_data <- merge(
  fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Per dataset: standard deviation of MSE_loss per lineage and mean MSE_loss
# per cell line, reduced to one row per cell line.
ctrp_data[, lineage_loss_sd := sd(MSE_loss), by = "lineage"]
ctrp_data[, cell_line_loss_avg := mean(MSE_loss), by = "cell_name"]
ctrp_avg_mse_by_cell_line <- unique(ctrp_data[, c("cell_name", "lineage", "cell_line_loss_avg", "lineage_loss_sd")])
ctrp_avg_mse_by_cell_line[, Dataset := "CTRPv2"]

gdsc1_data[, lineage_loss_sd := sd(MSE_loss), by = "lineage"]
gdsc1_data[, cell_line_loss_avg := mean(MSE_loss), by = "cell_name"]
gdsc1_avg_mse_by_cell_line <- unique(gdsc1_data[, c("cell_name", "lineage", "cell_line_loss_avg", "lineage_loss_sd")])
gdsc1_avg_mse_by_cell_line[, Dataset := "GDSC1"]

gdsc2_data[, lineage_loss_sd := sd(MSE_loss), by = "lineage"]
gdsc2_data[, cell_line_loss_avg := mean(MSE_loss), by = "cell_name"]
gdsc2_avg_mse_by_cell_line <- unique(gdsc2_data[, c("cell_name", "lineage", "cell_line_loss_avg", "lineage_loss_sd")])
gdsc2_avg_mse_by_cell_line[, Dataset := "GDSC2"]

all_avg_mse_by_cell_line <- rbindlist(list(ctrp_avg_mse_by_cell_line, gdsc1_avg_mse_by_cell_line, gdsc2_avg_mse_by_cell_line))

# One facet per lineage with free scales; dashed lines and extra axis breaks
# mark each dataset's overall mean MSE.
ggplot(
  data = all_avg_mse_by_cell_line,
  mapping = aes(x = cell_name, y = cell_line_loss_avg, group = Dataset)
) +
  facet_wrap(vars(lineage), scales = "free") +
  geom_boxplot() +
  geom_errorbar(
    aes(ymin = cell_line_loss_avg - lineage_loss_sd, ymax = cell_line_loss_avg + lineage_loss_sd),
    width = 0.2,
    position = position_dodge(0.9)
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = mean(ctrp_data$MSE_loss), linetype = "dashed", color = "red") +
  geom_hline(yintercept = mean(gdsc1_data$MSE_loss), linetype = "dashed", color = "green") +
  geom_hline(yintercept = mean(gdsc2_data$MSE_loss), linetype = "dashed", color = "blue") +
  labs(x = "Cell Line Lineage + # training datapoints", y = "Average MSE Loss") +
  scale_y_continuous(
    breaks = sort(c(
      seq(0, 0.12, length.out = 5),
      c(mean(ctrp_data$MSE_loss), mean(gdsc1_data$MSE_loss), mean(gdsc2_data$MSE_loss))
    ))
  ) +
  labs(
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Proteomics | Trained on CTRPv2 | Tested on All 3"
  )
ggsave(paste0(plot_path, "drug_prot_train_CTRPv2_test_All_avg_MSE_by_cell_line.pdf"), device = "pdf")
|
|
389 |
|
|
|
390 |
# Plot average MSE by lineage Full Response Only + EncoderTrain + PreTrain (drug + exp) ================================
# The file names, titles and subtitles in this section previously repeated
# those of the FullModel drug + prot plots, so the PDFs written here replaced
# them in `plot_path` and were labelled as "Full DRP" / "Drug + Proteomics".
# They now identify the ResponseOnly drug + exp results that are read.
path <- "Data/CV_Results/HyperOpt_DRP_ResponseOnly_drug_exp_CTRP_EncoderTrain_PreTrain/"

### GDSC1
cur_data <- fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv"))
cur_data <- merge(cur_data, cell_line_data[, c("stripped_cell_line_name", "lineage")], by.x = "cell_name", by.y = "stripped_cell_line_name")

# Mean MSE per lineage (bars, highest first) against the overall mean (dashed line).
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])
ggplot(data = avg_mse_by_lineage) +
  geom_bar(mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg), stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  xlab("Cell Line Lineage") + ylab("Average MSE Loss") +
  ggtitle(label = "ResponseOnly DRP Mean MSE Loss by Cell Line Lineage", subtitle = "Data: Drug + Gene Expression | Trained on CTRPv2 | Tested on GDSC1")
ggsave(filename = paste0(plot_path, "drug_exp_response_only_train_CTRPv2_test_GDSC1_avg_MSE_by_lineage.pdf"), device = "pdf")


### GDSC2
cur_data <- fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv"))
cur_data <- merge(cur_data, cell_line_data[, c("stripped_cell_line_name", "lineage")], by.x = "cell_name", by.y = "stripped_cell_line_name")

cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])
ggplot(data = avg_mse_by_lineage) +
  geom_bar(mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg), stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  xlab("Cell Line Lineage") + ylab("Average MSE Loss") +
  ggtitle(label = "ResponseOnly DRP Mean MSE Loss by Cell Line Lineage", subtitle = "Data: Drug + Gene Expression | Trained on CTRPv2 | Tested on GDSC2")
ggsave(filename = paste0(plot_path, "drug_exp_response_only_train_CTRPv2_test_GDSC2_avg_MSE_by_lineage.pdf"), device = "pdf")


### CTRPv2
cur_data <- fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv"))
cur_data <- merge(cur_data, cell_line_data[, c("stripped_cell_line_name", "lineage")], by.x = "cell_name", by.y = "stripped_cell_line_name")

cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])
ggplot(data = avg_mse_by_lineage) +
  geom_bar(mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg), stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  xlab("Cell Line Lineage") + ylab("Average MSE Loss") +
  ggtitle(label = "ResponseOnly DRP Mean MSE Loss by Cell Line Lineage", subtitle = "Data: Drug + Gene Expression | Trained on CTRPv2 | Tested on CTRPv2")
ggsave(filename = paste0(plot_path, "drug_exp_response_only_train_CTRPv2_test_CTRP_avg_MSE_by_lineage.pdf"), device = "pdf")
|
|
436 |
|
|
|
437 |
|
|
|
438 |
# Plot average MSE by lineage (drug + exp) ================================
# Directory holding the inference results of the drug + gene-expression model.
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_exp_CTRP_Full/"

# GDSC1
# Bar chart of the mean MSE loss per cell-line lineage for the GDSC1 inference
# results. Lineage labels come from `cell_line_data`, matched on cell-line
# name; rows without a match are dropped by the join.
cur_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Attach the per-lineage mean to every row, then collapse to one row per lineage.
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

# Bars are ordered from highest to lowest mean loss; the dashed red line marks
# the mean loss over all merged rows.
ggplot(data = avg_mse_by_lineage) +
  geom_col(
    mapping = aes(
      x = reorder(lineage, -lineage_loss_avg),
      y = lineage_loss_avg
    )
  ) +
  geom_hline(
    yintercept = mean(cur_data[["MSE_loss"]]),
    linetype = "dashed",
    color = "red"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text()
  ) +
  labs(
    x = "Cell Line Lineage",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Gene Expression | Trained on CTRPv2 | Tested on GDSC1"
  )
# Writes the plot built just above (ggsave's default `plot` is last_plot()).
ggsave(
  filename = paste0(plot_path, "drug_exp_train_CTRPv2_test_GDSC1_avg_MSE_by_lineage.pdf"),
  plot = last_plot(),
  device = "pdf"
)
|
|
453 |
|
|
|
454 |
# Plot average MSE by lineage Full (drug + exp + prot) ================================
# Directory holding the inference results of the drug + gene-expression +
# proteomics model; the plots that follow in this section read from it.
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_exp_prot_CTRP_Full/"

### GDSC1 ====
# Bar chart of the mean MSE loss per cell-line lineage for the GDSC1 inference
# results. Lineage labels come from `cell_line_data`, matched on cell-line
# name; rows without a match are dropped by the join.
cur_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Attach the per-lineage mean to every row, then collapse to one row per lineage.
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

# Bars are ordered from highest to lowest mean loss; the dashed red line marks
# the mean loss over all merged rows.
ggplot(data = avg_mse_by_lineage) +
  geom_col(
    mapping = aes(
      x = reorder(lineage, -lineage_loss_avg),
      y = lineage_loss_avg
    )
  ) +
  geom_hline(
    yintercept = mean(cur_data[["MSE_loss"]]),
    linetype = "dashed",
    color = "red"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text()
  ) +
  labs(
    x = "Cell Line Lineage",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Gene Expression + Proteomics | Trained on CTRPv2 | Tested on GDSC1"
  )
# Writes the plot built just above (ggsave's default `plot` is last_plot()).
ggsave(
  filename = paste0(plot_path, "drug_exp_prot_full_train_CTRPv2_test_GDSC1_avg_MSE_by_lineage.pdf"),
  plot = last_plot(),
  device = "pdf"
)
|
|
469 |
|
|
|
470 |
|
|
|
471 |
# GDSC2
# Bar chart of the mean MSE loss per cell-line lineage for the GDSC2 inference
# results stored under `path`. Lineage labels come from `cell_line_data`,
# matched on cell-line name; rows without a match are dropped by the join.
cur_data <- merge(
  fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Attach the per-lineage mean to every row, then collapse to one row per lineage.
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

# Bars are ordered from highest to lowest mean loss; the dashed red line marks
# the mean loss over all merged rows.
ggplot(data = avg_mse_by_lineage) +
  geom_col(
    mapping = aes(
      x = reorder(lineage, -lineage_loss_avg),
      y = lineage_loss_avg
    )
  ) +
  geom_hline(
    yintercept = mean(cur_data[["MSE_loss"]]),
    linetype = "dashed",
    color = "red"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text()
  ) +
  labs(
    x = "Cell Line Lineage",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Gene Expression + Proteomics | Trained on CTRPv2 | Tested on GDSC2"
  )
# Writes the plot built just above (ggsave's default `plot` is last_plot()).
ggsave(
  filename = paste0(plot_path, "drug_exp_prot_full_train_CTRPv2_test_GDSC2_avg_MSE_by_lineage.pdf"),
  plot = last_plot(),
  device = "pdf"
)
|
|
484 |
|
|
|
485 |
# CTRPv2
# Bar chart of the mean MSE loss per cell-line lineage for the CTRPv2 inference
# results stored under `path`. Lineage labels come from `cell_line_data`,
# matched on cell-line name; rows without a match are dropped by the join.
cur_data <- merge(
  fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Attach the per-lineage mean to every row, then collapse to one row per lineage.
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

# Bars are ordered from highest to lowest mean loss; the dashed red line marks
# the mean loss over all merged rows.
ggplot(data = avg_mse_by_lineage) +
  geom_col(
    mapping = aes(
      x = reorder(lineage, -lineage_loss_avg),
      y = lineage_loss_avg
    )
  ) +
  geom_hline(
    yintercept = mean(cur_data[["MSE_loss"]]),
    linetype = "dashed",
    color = "red"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text()
  ) +
  labs(
    x = "Cell Line Lineage",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Gene Expression + Proteomics | Trained on CTRPv2 | Tested on CTRPv2"
  )
# Writes the plot built just above (ggsave's default `plot` is last_plot()).
ggsave(
  filename = paste0(plot_path, "drug_exp_prot_full_train_CTRPv2_test_CTRP_avg_MSE_by_lineage.pdf"),
  plot = last_plot(),
  device = "pdf"
)
|
|
498 |
|
|
|
499 |
|
|
|
500 |
### All side by side ====
# Dodged bar chart comparing the per-lineage mean MSE loss on CTRPv2, GDSC1 and
# GDSC2, using the inference results stored under `path`. Lineage labels come
# from `cell_line_data`, matched on cell-line name; rows without a match are
# dropped by each join.
ctrp_data <- merge(
  fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)
gdsc1_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)
gdsc2_data <- merge(
  fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# One row per lineage and dataset, holding that lineage's mean loss.
ctrp_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
# Number of CTRPv2 result rows per lineage; shown as "n = ..." on the x axis.
ctrp_data[, sample_by_lineage_count := .N, by = "lineage"]
ctrp_avg_mse_by_lineage <- unique(ctrp_data[, c("lineage", "lineage_loss_avg")])
ctrp_avg_mse_by_lineage$Dataset <- "CTRPv2"

gdsc1_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
gdsc1_avg_mse_by_lineage <- unique(gdsc1_data[, c("lineage", "lineage_loss_avg")])
gdsc1_avg_mse_by_lineage$Dataset <- "GDSC1"

gdsc2_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
gdsc2_avg_mse_by_lineage <- unique(gdsc2_data[, c("lineage", "lineage_loss_avg")])
gdsc2_avg_mse_by_lineage$Dataset <- "GDSC2"

all_avg_mse_by_lineage <- rbindlist(list(
  ctrp_avg_mse_by_lineage,
  gdsc1_avg_mse_by_lineage,
  gdsc2_avg_mse_by_lineage
))
# Join key is stated explicitly so that a column later added to both tables
# cannot silently become part of the join. This is an inner join: a lineage
# that is absent from the CTRPv2 results is dropped from the plot.
all_avg_mse_by_lineage <- merge(
  all_avg_mse_by_lineage,
  unique(ctrp_data[, c("lineage", "sample_by_lineage_count")]),
  by = "lineage"
)
all_avg_mse_by_lineage$lineage <- paste0(
  all_avg_mse_by_lineage$lineage,
  ", n = ",
  all_avg_mse_by_lineage$sample_by_lineage_count
)

# Overall mean loss per dataset (CTRPv2, GDSC1, GDSC2 in that order), computed
# once and reused for the reference lines and the extra y-axis breaks. Kept
# unnamed so the values are passed to the scale as plain numeric breaks.
dataset_mean_mse <- c(
  mean(ctrp_data$MSE_loss),
  mean(gdsc1_data$MSE_loss),
  mean(gdsc2_data$MSE_loss)
)

ggplot(data = all_avg_mse_by_lineage) +
  geom_bar(
    mapping = aes(
      x = reorder(lineage, -lineage_loss_avg),
      y = lineage_loss_avg,
      fill = Dataset
    ),
    stat = "identity",
    position = "dodge"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = dataset_mean_mse[1], linetype = "dashed", color = "red") +
  # geom_text(aes(10, mean(ctrp_data$MSE_loss),label = mean(ctrp_data$MSE_loss), vjust = -1)) +
  geom_hline(yintercept = dataset_mean_mse[2], linetype = "dashed", color = "green") +
  geom_hline(yintercept = dataset_mean_mse[3], linetype = "dashed", color = "blue") +
  xlab("Cell Line Lineage + # training datapoints") + ylab("Average MSE Loss") +
  # scale_y_discrete(limits = c("0.001", "0.002")) +
  # Five evenly spaced breaks on [0, 0.12] plus one break at each dataset mean.
  scale_y_continuous(
    breaks = sort(c(seq(0, 0.12, length.out = 5), dataset_mean_mse))
  ) +
  ggtitle(
    label = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Expression + Proteomics | Trained on CTRPv2 | Tested on All 3"
  )
# Writes the plot built just above (ggsave's default `plot` is last_plot()).
ggsave(
  filename = paste0(plot_path, "drug_exp_prot_train_CTRPv2_test_All_avg_MSE_by_lineage.pdf"),
  device = "pdf"
)
|
|
541 |
|