a b/R/Data_Source_Comparison.R
1
# Find the fraction of cell lines with proteomics data that also have drug
# response data in DepMap.

# library() errors if data.table is missing; require() would only return FALSE.
library(data.table)

# Drug response table (replicate-collapsed log fold-change).
d <- fread("/Users/ftaj/Downloads/primary-screen-replicate-collapsed-logfold-change.csv")
# Proteomics table and DepMap cell line annotations (20Q2).
protein <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/20Q2/CCLE_protein_quant_current_normalized.csv")
cells <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/20Q2/DepMap_Cell_Line_info-3.csv")

dim(d)

# Keep only the identifier columns needed to map CCLE names to DepMap IDs.
# The original subset the base function `c` here instead of `cells`, which
# fails with "object of type 'builtin' is not subsettable".
cell_ids <- cells[, c("DepMap_ID", "CCLE_Name")]

# NOTE(review): the original referenced an undefined object `p` below.
# `protein` is the only proteomics table loaded in this script, so it is
# assumed to be the intended one -- confirm that it really carries a
# "CCLE Code" column (it may instead live in a separate sample-info table).
if (!"CCLE Code" %in% names(protein)) {
  stop(
    "`protein` has no \"CCLE Code\" column; load the table that maps ",
    "proteomics samples to CCLE names before merging.",
    call. = FALSE
  )
}
cp <- merge(cell_ids, protein, by.x = "CCLE_Name", by.y = "CCLE Code")
protein

# NOTE(review): 375 is presumably the number of cell lines in the proteomics
# data set -- confirm, or derive it from the data instead of hard-coding it.
n_proteomics_lines <- 375
# `V1` is presumably the unnamed first column of `d` holding DepMap IDs --
# TODO confirm against colnames(d) below.
sum(cp$DepMap_ID %in% d$V1) / n_proteomics_lines
colnames(d)
16
17
18
# ==== Compare to GDSC ====
# library() errors if data.table is missing; require() would only return FALSE.
library(data.table)

gdsc1 <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC1_Fitted_Dose_Response.csv")
length(unique(gdsc1$CELL_LINE_NAME))  # 987 cell lines in GDSC1
gdsc2 <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC2_Fitted_Dose_Response.csv")
length(unique(gdsc2$CELL_LINE_NAME))  # 809 cell lines in GDSC2

gdsc_cells <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC_Cell_Lines_Details.csv")

# Find overlap with DepMap cell lines (`cells` is the DepMap annotation table
# loaded earlier in this script). Both fractions are over all GDSC rows.
n_gdsc_cells <- length(gdsc_cells$`Sample Name`)

# Case-insensitive comparison. `%in%` treats NA as matching NA, so missing
# values are excluded explicitly to avoid counting them as shared cell lines.
gdsc_names <- toupper(gdsc_cells$`Sample Name`)
depmap_names <- toupper(cells$stripped_cell_line_name)
sum(!is.na(gdsc_names) & gdsc_names %in% depmap_names) / n_gdsc_cells

gdsc_cosmic <- toupper(gdsc_cells$`COSMIC identifier`)
depmap_cosmic <- toupper(cells$COSMICID)
sum(!is.na(gdsc_cosmic) & gdsc_cosmic %in% depmap_cosmic) / n_gdsc_cells
# COSMIC IDs have ~98% overlap
# This implies that different names are being used