[c3b4f8]: / R / Data_Source_Comparison.R

Download this file

31 lines (23 with data), 1.5 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Find the percentage of cell lines with proteomics data that also have drug response data in DepMap
require(data.table)
d <- fread("/Users/ftaj/Downloads/primary-screen-replicate-collapsed-logfold-change.csv")
protein <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/20Q2/CCLE_protein_quant_current_normalized.csv")
cells <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/20Q2/DepMap_Cell_Line_info-3.csv")
dim(d)
c <- c[, c("DepMap_ID", "CCLE_Name")]
cp <- merge(c, p, by.x = "CCLE_Name", by.y = "CCLE Code")
p
sum(cp$DepMap_ID %in% d$V1)/375
colnames(d)
# ==== Compare to GDSC ====
require(data.table)
gdsc1 <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC1_Fitted_Dose_Response.csv")
length(unique(gdsc1$CELL_LINE_NAME)) # 987 cell lines in GDSC2
gdsc2 <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC2_Fitted_Dose_Response.csv")
length(unique(gdsc2$CELL_LINE_NAME)) # 809 cell lines in GDSC2
gdsc_cells <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC_Cell_Lines_Details.csv")
# Find overlap with DepMap cell lines
sum(toupper(gdsc_cells$`Sample Name`) %in% toupper(cells$stripped_cell_line_name)) / length(gdsc_cells$`Sample Name`)
sum(toupper(gdsc_cells$`COSMIC identifier`) %in% toupper(cells$COSMICID)) / length(gdsc_cells$`Sample Name`)
# COSMIC IDs have ~98% overlap
# This implies that different names are being used