# R/performance_analysis.R
2
3
require(data.table)
4
require(ggplot2)
5
6
7
# Plot the mean absolute loss per cell line lineage for the CTRPv2, GDSC1 and
# GDSC2 inference results side by side, and save the figure as a PDF.
#
# Arguments:
#   with_bottleneck_path, without_bottleneck_path, split_by_cell_path,
#   split_by_drug_path, split_by_both_path, inference_results_paths:
#                   accepted but not used by the body.
#   plot_path:      output directory prefix; `plot_name` is appended with
#                   paste0(), so it must end with a path separator.
#   cell_line_data: table with `stripped_cell_line_name` and `lineage` columns.
#   subtitle:       plot subtitle.
#   plot_name:      output file name.
#   path:           directory prefix (ending with a path separator) holding the
#                   `*_AAC_MORGAN_512_inference_results.csv` files.
#   title:          plot title.
#
# `path` and `title` used to be free variables silently resolved outside the
# function. They are now optional trailing arguments; when omitted, the old
# lookup in the enclosing environment is reproduced so existing callers keep
# working.
#
# Returns the value of ggsave().
plot_loss_by_bottleneck_and_split <- function(with_bottleneck_path, without_bottleneck_path, split_by_cell_path,
                                 split_by_drug_path, split_by_both_path,
                                 plot_path, cell_line_data, inference_results_paths, subtitle, plot_name,
                                 path = NULL, title = NULL) {
  enclosing_env <- parent.env(environment())
  if (is.null(path)) {
    path <- get0("path", envir = enclosing_env, mode = "character")
  }
  if (is.null(path)) {
    stop("`path` was not supplied and no character `path` exists in the enclosing environment",
         call. = FALSE)
  }
  if (is.null(title)) {
    title <- get0("title", envir = enclosing_env, mode = "character")
  }

  lineage_lookup <- cell_line_data[, c("stripped_cell_line_name", "lineage")]

  # Read one inference-results file, attach the lineage of every cell line and
  # add the per-lineage mean / sd of MAE_loss plus the per-lineage row count.
  load_with_lineage_stats <- function(file_name) {
    results <- fread(paste0(path, file_name))
    results <- merge(results, lineage_lookup, by.x = "cell_name", by.y = "stripped_cell_line_name")
    results[, lineage_loss_avg := mean(MAE_loss), by = "lineage"]
    results[, lineage_loss_sd := sd(MAE_loss), by = "lineage"]
    results[, sample_by_lineage_count := .N, by = "lineage"]
    results
  }

  # Collapse a per-sample table to one row per lineage, tagged with its dataset.
  summarise_lineages <- function(results, dataset_label) {
    per_lineage <- unique(results[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
    per_lineage$Dataset <- dataset_label
    per_lineage
  }

  ctrp_data <- load_with_lineage_stats("CTRP_AAC_MORGAN_512_inference_results.csv")
  gdsc1_data <- load_with_lineage_stats("GDSC1_AAC_MORGAN_512_inference_results.csv")
  gdsc2_data <- load_with_lineage_stats("GDSC2_AAC_MORGAN_512_inference_results.csv")

  all_avg_abs_by_lineage <- rbindlist(list(
    summarise_lineages(ctrp_data, "CTRPv2"),
    summarise_lineages(gdsc1_data, "GDSC1"),
    summarise_lineages(gdsc2_data, "GDSC2")
  ))
  # The "n = ..." axis label always uses the CTRPv2 row counts; lineages that
  # are absent from the CTRPv2 table are dropped by this inner join.
  all_avg_abs_by_lineage <- merge(all_avg_abs_by_lineage,
                                  unique(ctrp_data[, c("lineage", "sample_by_lineage_count")]),
                                  by = "lineage")
  all_avg_abs_by_lineage$lineage <- paste0(all_avg_abs_by_lineage$lineage, ", n = ", all_avg_abs_by_lineage$sample_by_lineage_count)

  # Overall mean loss per dataset. The previous code read an `abs_loss` column
  # that was never created (its `:=` line is commented out), so every reference
  # line was NA; MAE_loss is the column the lineage averages are built from.
  overall_means <- c(CTRPv2 = mean(ctrp_data$MAE_loss),
                     GDSC1 = mean(gdsc1_data$MAE_loss),
                     GDSC2 = mean(gdsc2_data$MAE_loss))

  g <- ggplot(data = all_avg_abs_by_lineage, mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg, fill = Dataset)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    geom_hline(yintercept = overall_means[["CTRPv2"]], linetype = "dashed", color = "red") +
    geom_hline(yintercept = overall_means[["GDSC1"]], linetype = "dashed", color = "green") +
    geom_hline(yintercept = overall_means[["GDSC2"]], linetype = "dashed", color = "blue") +
    xlab("Cell Line Lineage + # training datapoints") + ylab("Mean Absolute Loss") +
    # Regular grid of breaks plus one break at each dataset's overall mean.
    scale_y_continuous(breaks = sort(c(seq(0, 0.25, length.out = 10), unname(overall_means)))) +
    ggtitle(label = title, subtitle = subtitle)

  ggsave(plot = g, filename = paste0(plot_path, plot_name), device = "pdf")
}
65
66
67
# Plot the average RMSE loss per cell line lineage for the CTRPv2 inference
# results of one model run and save the figure as a PDF.
#
# Arguments:
#   path:           directory prefix of the run; must end with a path separator
#                   because file names are appended with paste0(). Reads
#                   "CV_results.csv" and
#                   "CTRP_AAC_MORGAN_1024_inference_results.csv" from it.
#   plot_path:      output directory prefix (same paste0() convention).
#   cell_line_data: table with `stripped_cell_line_name` and `lineage` columns.
#   title:          plot title.
#   subtitle:       plot subtitle; the average cross-validation loss is
#                   appended on a second line.
#   plot_filename:  output file name.
#   display_plot:   if TRUE, also print the plot to the active device.
#
# Returns the value of ggsave().
#
# Only CTRPv2 is handled; the GDSC1/GDSC2 branches were already disabled
# (commented out) in the earlier version of this function.
plot_loss_by_lineage <- function(path,
                                 plot_path, cell_line_data, title, subtitle, plot_filename, display_plot = FALSE) {

  cv_results <- fread(paste0(path, "CV_results.csv"))
  # Pull the loss out as a plain scalar. Keeping it as a 1x1 data.table (as
  # before) only formats correctly when exactly one row matches.
  cv_valid_loss <- cv_results[V1 == "avg_cv_valid_loss"][[2]]
  if (length(cv_valid_loss) == 0) {
    warning("No `avg_cv_valid_loss` row found in ", path, "CV_results.csv", call. = FALSE)
    cv_valid_loss <- NA_real_
  }
  cv_valid_loss <- format(round(as.numeric(cv_valid_loss[1]), 4), nsmall = 4)

  ctrp_data <- fread(paste0(path, "CTRP_AAC_MORGAN_1024_inference_results.csv"))
  ctrp_data <- merge(ctrp_data, cell_line_data[, c("stripped_cell_line_name", "lineage")], by.x = "cell_name", by.y = "stripped_cell_line_name")

  # Per-lineage mean / sd of the RMSE loss and number of rows per lineage.
  ctrp_data[, lineage_loss_avg := mean(RMSE_loss), by = "lineage"]
  ctrp_data[, lineage_loss_sd := sd(RMSE_loss), by = "lineage"]
  ctrp_data[, sample_by_lineage_count := .N, by = "lineage"]
  ctrp_avg_abs_by_lineage <- unique(ctrp_data[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
  ctrp_avg_abs_by_lineage$Dataset <- "CTRPv2"

  all_avg_abs_by_lineage <- ctrp_avg_abs_by_lineage
  all_avg_abs_by_lineage <- merge(all_avg_abs_by_lineage,
                                  unique(ctrp_data[, c("lineage", "sample_by_lineage_count")]),
                                  by = "lineage")
  all_avg_abs_by_lineage$lineage <- paste0(all_avg_abs_by_lineage$lineage, ", n = ", all_avg_abs_by_lineage$sample_by_lineage_count)

  # `lineage_loss_avg` is repeated on every row of its lineage, so this is the
  # row-weighted mean of the per-lineage averages.
  overall_mean <- mean(ctrp_data$lineage_loss_avg)

  g <- ggplot(data = all_avg_abs_by_lineage, mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg, fill = Dataset)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    geom_hline(yintercept = overall_mean, linetype = "dashed", color = "red") +
    xlab("Cell Line Lineage + # testing datapoints") + ylab("RMSE Loss") +
    # Regular grid of breaks plus one break at the overall mean.
    scale_y_continuous(breaks = sort(c(seq(0, 0.25, length.out = 10), overall_mean))) +
    ggtitle(label = title, subtitle = paste0(subtitle, "\nAverage Cross-Validation RMSE Loss:", as.character(cv_valid_loss)))

  if (isTRUE(as.logical(display_plot))) {
    print(g)
  }
  ggsave(plot = g, filename = paste0(plot_path, plot_filename), device = "pdf")
}
129
# plot_path <- "Plots/DRP/Lineage_Results/"
130
# Cell line annotation table (DepMap 21Q2 line info file). The code in this
# script joins on its `stripped_cell_line_name` column to attach each cell
# line's `lineage`.
cell_line_data <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
131
132
133
# Build the result path, output location, title and subtitle for one
# hyper-parameter search configuration and draw its lineage plot via
# plot_loss_by_lineage().
#
# Arguments (each a single string; factors are accepted and converted):
#   model_type: model name inserted into the path, file name and subtitle.
#   data_type:  data-type suffix such as "_mut", or "" for no suffix.
#   split:      "CELL_LINE" or "DRUG"; any other value is treated as the
#               combined cell line and drug split.
#   bottleneck: "WithBottleNeck"; any other value is labelled "No Bottleneck".
#   drug_type:  drug representation name inserted into the path and labels.
#
# Reads the global `cell_line_data`. Returns the value of
# plot_loss_by_lineage().
plot_grid_mono <- function(model_type, data_type, split, bottleneck, drug_type) {
  # expand.grid() hands over factors by default; switch() needs plain strings.
  model_type <- as.character(model_type)
  data_type <- as.character(data_type)
  split <- as.character(split)
  bottleneck <- as.character(bottleneck)
  drug_type <- as.character(drug_type)

  path <- paste0("Data/CV_Results/HyperOpt_DRP_", model_type, "_drug", data_type,
                 "_HyperOpt_DRP_CTRP_1024_", model_type, "_EncoderTrain_Split_", split, "_", bottleneck,
                 "_NoTCGAPretrain_MergeBySum_RMSELoss_", drug_type, "_drug", data_type, "/")

  split_info <- switch(split,
    CELL_LINE = list(plot_path = "Plots/DRP/Split_by_Cell/", file_tag = "SplitByCell", title_tag = "Cell Line"),
    DRUG = list(plot_path = "Plots/DRP/Split_by_Drug/", file_tag = "SplitByDrug", title_tag = "Drug"),
    list(plot_path = "Plots/DRP/Split_by_Both/", file_tag = "SplitByBoth", title_tag = "Cell Line & Drug")
  )
  plot_path <- split_info$plot_path

  subtitle_bottleneck_name <- if (bottleneck == "WithBottleNeck") "With Bottleneck" else "No Bottleneck"

  # Create missing parent directories too, and stay quiet when the directory
  # already exists (this function is called once per grid row).
  dir.create(plot_path, showWarnings = FALSE, recursive = TRUE)

  plot_filename <- paste0(model_type, "_drug", data_type, "_train_CTRPv2_test_All_RMSE_", split_info$file_tag, "_", bottleneck, "_", drug_type, ".pdf")
  title <- paste0("DRP RMSE (Validation by Strict ", split_info$title_tag, " Splitting)")
  subtitle <- paste0("Model Type: ", model_type, " | Data: Drug + ", gsub("_", "", toupper(data_type)), " | Drug Type: ", drug_type, " | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: ", subtitle_bottleneck_name)

  plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)
}
166
167
# Grid of configurations to plot: every combination of the vectors below.
model_types <- c("FullModel", "ResponseOnly")
# Data-type suffixes; the leading "" produces paths without a data-type suffix.
data_types <- c("", paste0("_", c("mut", "exp", "prot", "mirna", "metab", "rppa", "hist")))
# splits <- c("CELL_LINE", "DRUG", "BOTH")
splits <- c("DRUG")
# bottlenecking <- c("WithBottleNeck", "NoBottleNeck")
bottlenecking <- c("NoBottleNeck")
drug_types <- c("OneHotDrugs")
# Keep the values as plain strings instead of factors.
grid <- expand.grid(model_types, data_types, splits, bottlenecking, drug_types,
                    stringsAsFactors = FALSE)

# seq_len() yields an empty sequence for an empty grid (1:0 would not).
for (i in seq_len(nrow(grid))) {
  plot_grid_mono(model_type = grid[i, 1], data_type = grid[i, 2], split = grid[i, 3], bottleneck = grid[i, 4], drug_type = grid[i, 5])
}
181
182
# ==== Drug + Mut ====
# One lineage plot per (split strategy, bottleneck setting) combination, in the
# order: cell / drug / both, each without and then with bottleneck.
#
# Fixes over the copy-pasted stanzas this replaces:
#   * the BOTH + WithBottleNeck run reused the "..._SplitByBoth_NoBottleneck.pdf"
#     file name and overwrote the NoBottleNeck plot; it now gets its own file;
#   * output directories are created recursively and without a warning when
#     they already exist.
# NOTE(review): the file names say "drug_prot" and "MAE" and the titles say
# "Mean Absolute Loss", while the paths are drug_mut runs and
# plot_loss_by_lineage() plots RMSE_loss -- confirm the intended labels.
mut_split_configs <- list(
  list(split = "CELL_LINE", plot_path = "Plots/DRP/Split_by_Cell/", file_tag = "SplitByCell", title_tag = "Cell Line"),
  list(split = "DRUG", plot_path = "Plots/DRP/Split_by_Drug/", file_tag = "SplitByDrug", title_tag = "Drug"),
  list(split = "BOTH", plot_path = "Plots/DRP/Split_by_Both/", file_tag = "SplitByBoth", title_tag = "Cell Line & Drug")
)
mut_bottleneck_configs <- list(
  list(dir_tag = "NoBottleNeck", file_tag = "NoBottleneck", label = "No Bottleneck"),
  list(dir_tag = "WithBottleNeck", file_tag = "WithBottleneck", label = "With Bottleneck")
)

for (split_cfg in mut_split_configs) {
  for (bottleneck_cfg in mut_bottleneck_configs) {
    path <- paste0("Data/CV_Results/HyperOpt_DRP_FullModel_drug_mut_HyperOpt_DRP_CTRP_FullModel_EncoderTrain_Split_",
                   split_cfg$split, "_", bottleneck_cfg$dir_tag, "_WithTCGAPretrain_drug_mut/")
    plot_path <- split_cfg$plot_path
    dir.create(plot_path, showWarnings = FALSE, recursive = TRUE)
    plot_filename <- paste0("drug_prot_train_CTRPv2_test_All_MAE_", split_cfg$file_tag, "_", bottleneck_cfg$file_tag, ".pdf")
    subtitle <- paste0("Data: Drug + Mutational | Trained on CTRPv2 | Tested on All 3 | Hyper-Param Search: ", bottleneck_cfg$label)
    title <- paste0("Full DRP Mean Absolute Loss (Validation by Strict ", split_cfg$title_tag, " Splitting)")
    plot_loss_by_lineage(path = path, plot_path = plot_path, cell_line_data = cell_line_data, title = title, subtitle = subtitle, plot_filename = plot_filename)
  }
}
237
238
239
# 
240
# Output directory for the plots below. The alternative is kept as a comment
# for manual switching (it used to be a dead assignment that was immediately
# overwritten):
# plot_path <- "Plots/DRP/Split_by_Drug/"
plot_path <- "Plots/DRP/Split_by_Both/"
plot_name <- ""
# Plot average MSE by lineage Full (drug + prot) ================================
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_prot_CTRP_Full/"

# One bar plot of the per-lineage mean MSE loss per test dataset. The three
# datasets were previously handled by three copy-pasted stanzas; input and
# output file names are kept exactly as they were.
drug_prot_runs <- list(
  list(input = "GDSC1_AAC_MORGAN_512_inference_results.csv", tested_on = "GDSC1",
       output = "drug_prot_full_train_CTRPv2_test_GDSC1_avg_MSE_by_lineage.pdf"),
  list(input = "GDSC2_AAC_MORGAN_512_inference_results.csv", tested_on = "GDSC2",
       output = "drug_prot_full_train_CTRPv2_test_GDSC2_avg_MSE_by_lineage.pdf"),
  list(input = "CTRP_AAC_MORGAN_512_inference_results.csv", tested_on = "CTRPv2",
       output = "drug_prot_train_CTRPv2_test_CTRP_avg_MSE_by_lineage.pdf")
)

for (run in drug_prot_runs) {
  cur_data <- fread(paste0(path, run$input))
  cur_data <- merge(cur_data, cell_line_data[, c("stripped_cell_line_name", "lineage")], by.x = "cell_name", by.y = "stripped_cell_line_name")

  cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
  avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

  lineage_plot <- ggplot(data = avg_mse_by_lineage) +
    geom_bar(mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg), stat = "identity") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
    # Dashed reference line at the overall mean MSE of this dataset.
    geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
    xlab("Cell Line Lineage") + ylab("Average MSE Loss") +
    ggtitle(label = "Full DRP Mean MSE Loss by Cell Line Lineage",
            subtitle = paste0("Data: Drug + Proteomics | Trained on CTRPv2 | Tested on ", run$tested_on))

  # Pass the plot explicitly instead of relying on ggsave()'s last_plot() default.
  ggsave(plot = lineage_plot, filename = paste0(plot_path, run$output), device = "pdf")
}
289
290
291
### All side by side (lineage bar plot) ====
# Load the three inference-result tables and attach each cell line's lineage.
ctrp_data <- merge(
  fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name", by.y = "stripped_cell_line_name"
)
gdsc1_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name", by.y = "stripped_cell_line_name"
)
gdsc2_data <- merge(
  fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name", by.y = "stripped_cell_line_name"
)

# Absolute loss per row, then per-lineage mean, sd and row count (CTRPv2).
ctrp_data[, abs_loss := sqrt(MSE_loss)]
ctrp_data[, `:=`(lineage_loss_avg = mean(abs_loss),
                 lineage_loss_sd = sd(abs_loss),
                 sample_by_lineage_count = .N), by = "lineage"]
ctrp_avg_abs_by_lineage <- unique(ctrp_data[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
ctrp_avg_abs_by_lineage[, Dataset := "CTRPv2"]

# Same summary for GDSC1.
gdsc1_data[, abs_loss := sqrt(MSE_loss)]
gdsc1_data[, `:=`(lineage_loss_avg = mean(abs_loss),
                  lineage_loss_sd = sd(abs_loss),
                  sample_by_lineage_count = .N), by = "lineage"]
gdsc1_avg_abs_by_lineage <- unique(gdsc1_data[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
gdsc1_avg_abs_by_lineage[, Dataset := "GDSC1"]

# Same summary for GDSC2.
gdsc2_data[, abs_loss := sqrt(MSE_loss)]
gdsc2_data[, `:=`(lineage_loss_avg = mean(abs_loss),
                  lineage_loss_sd = sd(abs_loss),
                  sample_by_lineage_count = .N), by = "lineage"]
gdsc2_avg_abs_by_lineage <- unique(gdsc2_data[, c("lineage", "lineage_loss_avg", "lineage_loss_sd")])
gdsc2_avg_abs_by_lineage[, Dataset := "GDSC2"]

# Stack the three summaries and label every lineage with its CTRPv2 row count.
all_avg_abs_by_lineage <- merge(
  rbindlist(list(ctrp_avg_abs_by_lineage, gdsc1_avg_abs_by_lineage, gdsc2_avg_abs_by_lineage)),
  unique(ctrp_data[, c("lineage", "sample_by_lineage_count")])
)
all_avg_abs_by_lineage[, lineage := paste0(lineage, ", n = ", sample_by_lineage_count)]

# Dodged bars per lineage and dataset, with a dashed line at each dataset's
# overall mean absolute loss and a matching y-axis break.
ggplot(
  data = all_avg_abs_by_lineage,
  mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg, fill = Dataset)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = mean(ctrp_data$abs_loss), linetype = "dashed", color = "red") +
  geom_hline(yintercept = mean(gdsc1_data$abs_loss), linetype = "dashed", color = "green") +
  geom_hline(yintercept = mean(gdsc2_data$abs_loss), linetype = "dashed", color = "blue") +
  scale_y_continuous(
    breaks = sort(c(
      seq(0, 0.12, length.out = 5),
      mean(ctrp_data$abs_loss),
      mean(gdsc1_data$abs_loss),
      mean(gdsc2_data$abs_loss)
    ))
  ) +
  labs(
    x = "Cell Line Lineage + # training datapoints",
    y = "Average Absolute Loss",
    title = "Full DRP Mean Absolute Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Proteomics | Trained on CTRPv2 | Tested on All 3"
  )
ggsave(filename = paste0(plot_path, "drug_prot_train_CTRPv2_test_All_avg_Abs_by_lineage.pdf"), device = "pdf")
341
342
343
### All side by side (cell line dot plot) ====
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_prot_CTRP_Full/"

# Load the three inference-result tables and attach each cell line's lineage.
ctrp_data <- merge(
  fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name", by.y = "stripped_cell_line_name"
)
gdsc1_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name", by.y = "stripped_cell_line_name"
)
gdsc2_data <- merge(
  fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name", by.y = "stripped_cell_line_name"
)

# Per dataset: sd of the MSE loss within each lineage and mean MSE loss per
# cell line, reduced to one row per cell line.
ctrp_data[, lineage_loss_sd := sd(MSE_loss), by = "lineage"]
ctrp_data[, cell_line_loss_avg := mean(MSE_loss), by = "cell_name"]
ctrp_avg_mse_by_cell_line <- unique(ctrp_data[, c("cell_name", "lineage", "cell_line_loss_avg", "lineage_loss_sd")])
ctrp_avg_mse_by_cell_line[, Dataset := "CTRPv2"]

gdsc1_data[, lineage_loss_sd := sd(MSE_loss), by = "lineage"]
gdsc1_data[, cell_line_loss_avg := mean(MSE_loss), by = "cell_name"]
gdsc1_avg_mse_by_cell_line <- unique(gdsc1_data[, c("cell_name", "lineage", "cell_line_loss_avg", "lineage_loss_sd")])
gdsc1_avg_mse_by_cell_line[, Dataset := "GDSC1"]

gdsc2_data[, lineage_loss_sd := sd(MSE_loss), by = "lineage"]
gdsc2_data[, cell_line_loss_avg := mean(MSE_loss), by = "cell_name"]
gdsc2_avg_mse_by_cell_line <- unique(gdsc2_data[, c("cell_name", "lineage", "cell_line_loss_avg", "lineage_loss_sd")])
gdsc2_avg_mse_by_cell_line[, Dataset := "GDSC2"]

all_avg_mse_by_cell_line <- rbindlist(list(
  ctrp_avg_mse_by_cell_line,
  gdsc1_avg_mse_by_cell_line,
  gdsc2_avg_mse_by_cell_line
))

# One facet per lineage; dashed lines mark each dataset's overall mean MSE and
# the same values are added as y-axis breaks.
ggplot(
  data = all_avg_mse_by_cell_line,
  mapping = aes(x = cell_name, y = cell_line_loss_avg, group = Dataset)
) +
  facet_wrap(vars(lineage), scales = "free") +
  geom_boxplot() +
  geom_errorbar(
    aes(ymin = cell_line_loss_avg - lineage_loss_sd, ymax = cell_line_loss_avg + lineage_loss_sd),
    width = 0.2, position = position_dodge(0.9)
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = mean(ctrp_data$MSE_loss), linetype = "dashed", color = "red") +
  geom_hline(yintercept = mean(gdsc1_data$MSE_loss), linetype = "dashed", color = "green") +
  geom_hline(yintercept = mean(gdsc2_data$MSE_loss), linetype = "dashed", color = "blue") +
  scale_y_continuous(
    breaks = sort(c(
      seq(0, 0.12, length.out = 5),
      mean(ctrp_data$MSE_loss),
      mean(gdsc1_data$MSE_loss),
      mean(gdsc2_data$MSE_loss)
    ))
  ) +
  labs(
    x = "Cell Line Lineage + # training datapoints",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Proteomics | Trained on CTRPv2 | Tested on All 3"
  )
ggsave(filename = paste0(plot_path, "drug_prot_train_CTRPv2_test_All_avg_MSE_by_cell_line.pdf"), device = "pdf")
389
390
# Plot average MSE by lineage Full Response Only + EncoderTrain + PreTrain (drug + exp) ================================
path <- "Data/CV_Results/HyperOpt_DRP_ResponseOnly_drug_exp_CTRP_EncoderTrain_PreTrain/"

# One bar plot of the per-lineage mean MSE loss per test dataset.
#
# Fixes over the copy-pasted stanzas this replaces:
#   * the output file names were identical to those of the drug + prot
#     section, so these plots overwrote the drug + prot PDFs saved to the same
#     `plot_path`; they now carry a "drug_exp_response_only" prefix;
#   * subtitles said "Drug + Proteomics" although the results come from the
#     drug_exp run directory;
#   * the GDSC2 and CTRPv2 titles said "Full DRP" instead of "ResponseOnly DRP".
response_only_runs <- list(
  list(input = "GDSC1_AAC_MORGAN_512_inference_results.csv", tested_on = "GDSC1",
       output = "drug_exp_response_only_train_CTRPv2_test_GDSC1_avg_MSE_by_lineage.pdf"),
  list(input = "GDSC2_AAC_MORGAN_512_inference_results.csv", tested_on = "GDSC2",
       output = "drug_exp_response_only_train_CTRPv2_test_GDSC2_avg_MSE_by_lineage.pdf"),
  list(input = "CTRP_AAC_MORGAN_512_inference_results.csv", tested_on = "CTRPv2",
       output = "drug_exp_response_only_train_CTRPv2_test_CTRP_avg_MSE_by_lineage.pdf")
)

for (run in response_only_runs) {
  cur_data <- fread(paste0(path, run$input))
  cur_data <- merge(cur_data, cell_line_data[, c("stripped_cell_line_name", "lineage")], by.x = "cell_name", by.y = "stripped_cell_line_name")

  cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
  avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

  lineage_plot <- ggplot(data = avg_mse_by_lineage) +
    geom_bar(mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg), stat = "identity") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
    # Dashed reference line at the overall mean MSE of this dataset.
    geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
    xlab("Cell Line Lineage") + ylab("Average MSE Loss") +
    ggtitle(label = "ResponseOnly DRP Mean MSE Loss by Cell Line Lineage",
            subtitle = paste0("Data: Drug + Gene Expression | Trained on CTRPv2 | Tested on ", run$tested_on))

  # Pass the plot explicitly instead of relying on ggsave()'s last_plot() default.
  ggsave(plot = lineage_plot, filename = paste0(plot_path, run$output), device = "pdf")
}
436
437
438
# Plot average MSE by lineage (drug + exp) ================================
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_exp_CTRP_Full/"
# GDSC1
# Load the inference results and attach each cell line's lineage.
cur_data <- merge(
  fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv")),
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name", by.y = "stripped_cell_line_name"
)

# Mean MSE loss per lineage, one row per lineage.
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

# Bars ordered by decreasing mean loss; dashed line at the overall mean MSE.
ggplot(
  data = avg_mse_by_lineage,
  mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  labs(
    x = "Cell Line Lineage",
    y = "Average MSE Loss",
    title = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Gene Expression | Trained on CTRPv2 | Tested on GDSC1"
  )
ggsave(filename = paste0(plot_path, "drug_exp_train_CTRPv2_test_GDSC1_avg_MSE_by_lineage.pdf"), device = "pdf")
453
454
# Plot average MSE by lineage Full (drug + exp + prot) ================================
# Results directory for the drug + gene expression + proteomics model. The
# statements that follow in this file keep reading from `path`, so it is set
# once here. `cell_line_data` and `plot_path` are expected to be defined
# earlier in the script.
path <- "Data/CV_Results/HyperOpt_DRP_FullModel_drug_exp_prot_CTRP_Full/"

### GDSC1 ====
# Load the inference results and attach each cell line's lineage
# (inner join on cell name, so unmatched cell lines are dropped).
cur_data <- fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv"))
cur_data <- merge(
  cur_data,
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Per-lineage mean MSE is added by reference, then collapsed to one row per
# lineage for plotting.
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

# Bars are ordered by decreasing average loss; the dashed red line marks the
# mean MSE over all merged rows.
lineage_mse_plot <- ggplot(data = avg_mse_by_lineage) +
  geom_bar(
    mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg),
    stat = "identity"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  xlab("Cell Line Lineage") + ylab("Average MSE Loss") +
  ggtitle(
    label = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Gene Expression + Proteomics | Trained on CTRPv2 | Tested on GDSC1"
  )
# Bare name keeps the top-level auto-print behaviour of the original expression.
lineage_mse_plot
# Pass the plot explicitly so the saved file never depends on last_plot().
ggsave(
  filename = paste0(plot_path, "drug_exp_prot_full_train_CTRPv2_test_GDSC1_avg_MSE_by_lineage.pdf"),
  plot = lineage_mse_plot,
  device = "pdf"
)

# GDSC2
# Reuses `path` as set earlier in the script; `cell_line_data` and `plot_path`
# are likewise expected to exist already.
# Load the inference results and attach each cell line's lineage
# (inner join on cell name, so unmatched cell lines are dropped).
cur_data <- fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv"))
cur_data <- merge(
  cur_data,
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Per-lineage mean MSE is added by reference, then collapsed to one row per
# lineage for plotting.
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

# Bars are ordered by decreasing average loss; the dashed red line marks the
# mean MSE over all merged rows.
lineage_mse_plot <- ggplot(data = avg_mse_by_lineage) +
  geom_bar(
    mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg),
    stat = "identity"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  xlab("Cell Line Lineage") + ylab("Average MSE Loss") +
  ggtitle(
    label = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Gene Expression + Proteomics | Trained on CTRPv2 | Tested on GDSC2"
  )
# Bare name keeps the top-level auto-print behaviour of the original expression.
lineage_mse_plot
# Pass the plot explicitly so the saved file never depends on last_plot().
ggsave(
  filename = paste0(plot_path, "drug_exp_prot_full_train_CTRPv2_test_GDSC2_avg_MSE_by_lineage.pdf"),
  plot = lineage_mse_plot,
  device = "pdf"
)

# CTRPv2
# Reuses `path` as set earlier in the script; `cell_line_data` and `plot_path`
# are likewise expected to exist already.
# Load the inference results and attach each cell line's lineage
# (inner join on cell name, so unmatched cell lines are dropped).
cur_data <- fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv"))
cur_data <- merge(
  cur_data,
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Per-lineage mean MSE is added by reference, then collapsed to one row per
# lineage for plotting.
cur_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
avg_mse_by_lineage <- unique(cur_data[, c("lineage", "lineage_loss_avg")])

# Bars are ordered by decreasing average loss; the dashed red line marks the
# mean MSE over all merged rows.
lineage_mse_plot <- ggplot(data = avg_mse_by_lineage) +
  geom_bar(
    mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg),
    stat = "identity"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text()) +
  geom_hline(yintercept = mean(cur_data$MSE_loss), linetype = "dashed", color = "red") +
  xlab("Cell Line Lineage") + ylab("Average MSE Loss") +
  ggtitle(
    label = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Gene Expression + Proteomics | Trained on CTRPv2 | Tested on CTRPv2"
  )
# Bare name keeps the top-level auto-print behaviour of the original expression.
lineage_mse_plot
# Pass the plot explicitly so the saved file never depends on last_plot().
ggsave(
  filename = paste0(plot_path, "drug_exp_prot_full_train_CTRPv2_test_CTRP_avg_MSE_by_lineage.pdf"),
  plot = lineage_mse_plot,
  device = "pdf"
)

### All side by side ====
# Draws the per-lineage mean MSE of the three test datasets as dodged bars in
# a single figure. Reuses `path` as set earlier in the script;
# `cell_line_data` and `plot_path` are likewise expected to exist already.

# Load each dataset's inference results and attach the cell-line lineage
# (inner join on cell name, so unmatched cell lines are dropped).
ctrp_data <- fread(paste0(path, "CTRP_AAC_MORGAN_512_inference_results.csv"))
ctrp_data <- merge(
  ctrp_data,
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)
gdsc1_data <- fread(paste0(path, "GDSC1_AAC_MORGAN_512_inference_results.csv"))
gdsc1_data <- merge(
  gdsc1_data,
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)
gdsc2_data <- fread(paste0(path, "GDSC2_AAC_MORGAN_512_inference_results.csv"))
gdsc2_data <- merge(
  gdsc2_data,
  cell_line_data[, c("stripped_cell_line_name", "lineage")],
  by.x = "cell_name",
  by.y = "stripped_cell_line_name"
)

# Per-lineage mean MSE for each dataset, one row per lineage, tagged with the
# dataset name. CTRPv2 additionally gets a per-lineage row count, which is
# used below for the axis labels.
ctrp_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
ctrp_avg_mse_by_lineage <- unique(ctrp_data[, c("lineage", "lineage_loss_avg")])
ctrp_data[, sample_by_lineage_count := .N, by = "lineage"]
ctrp_avg_mse_by_lineage$Dataset <- "CTRPv2"

gdsc1_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
gdsc1_avg_mse_by_lineage <- unique(gdsc1_data[, c("lineage", "lineage_loss_avg")])
gdsc1_avg_mse_by_lineage$Dataset <- "GDSC1"

gdsc2_data[, lineage_loss_avg := mean(MSE_loss), by = "lineage"]
gdsc2_avg_mse_by_lineage <- unique(gdsc2_data[, c("lineage", "lineage_loss_avg")])
gdsc2_avg_mse_by_lineage$Dataset <- "GDSC2"

all_avg_mse_by_lineage <- rbindlist(list(
  ctrp_avg_mse_by_lineage,
  gdsc1_avg_mse_by_lineage,
  gdsc2_avg_mse_by_lineage
))
# NOTE(review): this is an inner join against the CTRPv2 counts, so lineages
# that occur only in GDSC1/GDSC2 are dropped, and every dataset's bar is
# labelled with the CTRPv2 row count - confirm this is intended.
all_avg_mse_by_lineage <- merge(
  all_avg_mse_by_lineage,
  unique(ctrp_data[, c("lineage", "sample_by_lineage_count")]),
  by = "lineage"
)
all_avg_mse_by_lineage$lineage <- paste0(
  all_avg_mse_by_lineage$lineage,
  ", n = ",
  all_avg_mse_by_lineage$sample_by_lineage_count
)

# Overall mean MSE per dataset; used both for the dashed reference lines and
# as extra y-axis breaks.
ctrp_mean_mse <- mean(ctrp_data$MSE_loss)
gdsc1_mean_mse <- mean(gdsc1_data$MSE_loss)
gdsc2_mean_mse <- mean(gdsc2_data$MSE_loss)

all_lineage_mse_plot <- ggplot(data = all_avg_mse_by_lineage) +
  geom_bar(
    mapping = aes(x = reorder(lineage, -lineage_loss_avg), y = lineage_loss_avg, fill = Dataset),
    stat = "identity",
    position = "dodge"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = ctrp_mean_mse, linetype = "dashed", color = "red") +
  # geom_text(aes(10, mean(ctrp_data$MSE_loss),label = mean(ctrp_data$MSE_loss), vjust = -1)) +
  geom_hline(yintercept = gdsc1_mean_mse, linetype = "dashed", color = "green") +
  geom_hline(yintercept = gdsc2_mean_mse, linetype = "dashed", color = "blue") +
  xlab("Cell Line Lineage  + # training datapoints") + ylab("Average MSE Loss") +
  # scale_y_discrete(limits = c("0.001", "0.002")) +
  scale_y_continuous(
    breaks = sort(c(
      seq(0, 0.12, length.out = 5),
      ctrp_mean_mse,
      gdsc1_mean_mse,
      gdsc2_mean_mse
    ))
  ) +
  ggtitle(
    label = "Full DRP Mean MSE Loss by Cell Line Lineage",
    subtitle = "Data: Drug + Expression + Proteomics | Trained on CTRPv2 | Tested on All 3"
  )
# Bare name keeps the top-level auto-print behaviour of the original expression.
all_lineage_mse_plot
# Pass the plot explicitly so the saved file never depends on last_plot().
ggsave(
  filename = paste0(plot_path, "drug_exp_prot_train_CTRPv2_test_All_avg_MSE_by_lineage.pdf"),
  plot = all_lineage_mse_plot,
  device = "pdf"
)
541