Switch to unified view

a b/R/02_Omic_Data_Preparation.R
1
# Complete_Sample_Prep.R
2
3
# This script is intended to pair genomics, transcriptomics, proteomics and drug response data
4
# mainly from the DepMap resource.
5
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
6
dir.create(paste0(path, "Data/DRP_Training_Data"))
7
8
require(data.table)
9
10
# ==== Cell line info cleanup ====
11
depmap_samples <- fread(paste0(path, "Data/DepMap/21Q2/sample_info.csv"))
12
# Subset relevant (machine learning) columns 
13
depmap_samples <- depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")]
14
15
fwrite(depmap_samples, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
16
17
# depmap_samples <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
18
# ==== Expression data cleanup ====
19
ccle_exp <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_expression.csv"))
20
max(ccle_exp[, -1])
21
min(ccle_exp[, -1])
22
dim(ccle_exp)
23
ccle_exp[1:5, 1:20]
24
# Change column names to only contain HGNC name: replace everything after first word with ""
25
colnames(ccle_exp) <- gsub(" .+", "", colnames(ccle_exp))
26
colnames(ccle_exp)[1] <- "DepMap_ID"
27
# Merge with sample info to have cell line name in addition to DepMap ID
28
ccle_exp <- merge(ccle_exp, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")], by = "DepMap_ID")
29
ccle_exp[, DepMap_ID := NULL]
30
ccle_exp[1:5, 1:20]
31
32
# Move cell line name to the first column: just giving the column name to the function moves it to first place
33
setcolorder(ccle_exp, neworder = sort(colnames(ccle_exp)))
34
setcolorder(ccle_exp, neworder = c("stripped_cell_line_name", "primary_disease"))
35
ccle_exp[1:5, 1:20]
36
37
# Save
38
fwrite(ccle_exp, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Expression.csv"), sep = ',')
39
40
ccle_exp <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Expression.csv"))
41
42
ccle_exp
43
# DIMENSIONS OF EXPRESSION DATA: 1375 X 19178
44
rm(ccle_exp)
45
# ==== Copy number data cleanup ====
46
ccle_cn <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_gene_copy_number.csv"))
47
dim(ccle_cn)
48
ccle_cn[1:5, 1:10]
49
# Change column names to only contain HGNC name: replace everything after first word with ""
50
colnames(ccle_cn) <- gsub(" .+", "", colnames(ccle_cn))
51
colnames(ccle_cn)[1] <- "DepMap_ID"
52
# Merge with sample info to have cell line name in addition to DepMap ID
53
ccle_cn <- merge(ccle_cn, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")], by = "DepMap_ID")
54
ccle_cn[, DepMap_ID := NULL]
55
56
setcolorder(ccle_cn, neworder = sort(colnames(ccle_cn)))
57
setcolorder(ccle_cn, neworder = c("stripped_cell_line_name", "primary_disease"))
58
ccle_cn[1:5, 1:20]
59
which(is.na(ccle_cn), arr.ind = T)
60
ccle_cn[1407, 26569]
61
62
# Replace NA with 0
63
setnafill(ccle_cn, fill = 0, cols = unique(which(is.na(ccle_cn), arr.ind = T)[,2]))
64
65
dim(ccle_cn)
66
anyNA(ccle_cn)
67
sum(is.na(ccle_cn))
68
which(is.na(ccle_cn))
69
# Save
70
fwrite(ccle_cn, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv"), sep = ',')
71
72
# DIMENSIONS OF COPY NUMBER DATA: 1740 X 27563
73
rm(ccle_cn)
74
gc()
75
# ==== Proteomic data cleanup ====
76
ccle_prot <- fread(paste0(path, "Data/DepMap/20Q2/CCLE_protein_quant_current_normalized.csv"))
77
dim(ccle_prot)
78
ccle_prot[1:5, 1:10]
79
ccle_prot[1:5, 48:60]
80
# Subset only the Uniprot accession (since its unique unlike HGNC) and the cell line experimental data
81
ccle_prot <- ccle_prot[, c(6, 49:ncol(ccle_prot)), with = F]
82
colnames(ccle_prot) <- gsub("\\_.+", "", colnames(ccle_prot))
83
colnames(ccle_prot)[1] <- "Uniprot_Acc"
84
# Transpose the data.table to match with other data type tables
85
t <- transpose(ccle_prot, make.names = "Uniprot_Acc")
86
87
# Check if transpose worked as intended
88
as.numeric(unlist(t[1,])) == as.numeric(unlist(ccle_prot[,2]))
89
as.numeric(unlist(t[2,])) == as.numeric(unlist(ccle_prot[,3]))
90
91
# Add cell lines
92
t$stripped_cell_line_name <- colnames(ccle_prot)[-1]
93
# Merge with sample info to have cell line name in addition to DepMap ID
94
t <- merge(t, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")], by = "stripped_cell_line_name")
95
96
# Move to front
97
setcolorder(t, neworder = c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
98
t[1:5, 1:10]
99
100
# Save
101
fwrite(t, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_ProteinQuant.csv"), sep = ',')
102
103
104
### Get proteins that are observed in all cell lines
105
# Create the same transposed table as above
106
# Remove all rows and columns that have any NA in them
107
prot_nona <- na.omit(ccle_prot)
108
which(is.na(prot_nona))
109
# Transpose the data.table to match with other data type tables
110
t <- transpose(prot_nona, make.names = "Uniprot_Acc")
111
# Check if transpose worked as intended
112
as.numeric(unlist(t[1,])) == as.numeric(unlist(prot_nona[,2]))
113
as.numeric(unlist(t[2,])) == as.numeric(unlist(prot_nona[,3]))
114
# Add cell lines
115
t$stripped_cell_line_name <- colnames(prot_nona)[-1]
116
# Merge with sample info to have cell line name in addition to DepMap ID
117
t <- merge(t, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")], by = "stripped_cell_line_name")
118
# Move to front
119
setcolorder(t, neworder = c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
120
t[1:5, 1:10]
121
# Now we have ~5000 proteins that are available in all samples
122
dim(t)
123
124
# We have 3 duplicates
125
sum(duplicated(t$stripped_cell_line_name))
126
127
# Save
128
fwrite(t, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"), sep = ',')
129
# ccle_prot <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"))
130
dim(ccle_prot)
131
ccle_prot[1:5, 1:5]
132
ccle_prot[, DepMap_ID := NULL]
133
# fwrite(ccle_prot, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"), sep = ',')
134
# DIMENSIONS OF PROTEIN QUANTITY DATA: 378 X 5155
135
136
ccle_prot <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"))
137
anyDuplicated(ccle_prot$stripped_cell_line_name)
138
139
# ==== Mutation data cleanup ====
140
rm(list = ls(pattern = "ccle"))
141
require(data.table)
142
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
143
144
ccle_mut <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_mutations.csv"))
145
table(ccle_mut$isCOSMIChotspot)
146
table(ccle_mut$isTCGAhotspot)
147
table(ccle_mut$Variant_Type)
148
length(unique(ccle_mut$DepMap_ID))
149
150
dim(ccle_mut)
151
ccle_mut[1,]
152
colnames(ccle_mut)
153
# Calculate number of mutations per cell line
154
temp <- ccle_mut[, c("Variant_Type", "DepMap_ID")]
155
temp[, nMut := .N, by = "DepMap_ID"]
156
temp
157
unique(temp$Variant_Type)
158
# For simplicity, extract only SNP data for now: this discards ~90,000 mutations
159
# ccle_mut <- ccle_mut[Variant_Type == "SNP"]
160
dim(ccle_mut)
161
t <- ccle_mut[, c("DepMap_ID", "Chromosome", "Strand", "Start_position", "End_position")]
162
dim(unique(t))
163
length(unique(ccle_mut$DepMap_ID))
164
# Keep relevant columns/features
165
# Aside: Should the sequence change be provided, or just whether the SNP is deleterious or not?
166
ccle_mut <- ccle_mut[, c("DepMap_ID", "Hugo_Symbol", "Chromosome", "Start_position", "End_position", "Strand",
167
             "Variant_Classification", "Variant_Type", "isDeleterious",
168
             "isTCGAhotspot", "isCOSMIChotspot", "Genome_Change", "cDNA_Change")]
169
dim(ccle_mut)
170
length(unique(ccle_mut$DepMap_ID))
171
table(ccle_mut$isDeleterious)
172
table(ccle_mut$isTCGAhotspot)
173
table(ccle_mut$isCOSMIChotspot)
174
175
# ==== CCLE Mut Overlap with COSMIC CGC ====
176
# Perhaps it's best to use the mutations in genes that COSMIC considers important, like another paper in
177
# the field (~500 genes)
178
# Or, we can use a binary vector for genes and whether they have a deleterious mutation: this will result in 
179
# ~20,000 parameters
180
length(unique(ccle_mut$Hugo_Symbol))
181
182
length(unique(ccle_mut[isCOSMIChotspot == T]$Hugo_Symbol))
183
length(unique(ccle_mut[isTCGAhotspot == T]$Hugo_Symbol))
184
length(unique(ccle_mut[isDeleterious == T]$Hugo_Symbol))
185
186
tcga_hotspot_genes <- unique(ccle_mut[isTCGAhotspot == T]$Hugo_Symbol)
187
# Read COSMIC Cancer Gene Census data
188
cgc <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/COSMIC/cancer_gene_census.csv")
189
dim(cgc)
190
cgc[1:5, 1:20]
191
length(unique(cgc$`Gene Symbol`))
192
length(unique(cgc$HGVSG))
193
# Get Genes in this census
194
cgc_genes <- unique(cgc$`Gene Symbol`)
195
cgc[Tier == 1]
196
length(unique(cgc$`Genome Location`))  # 922,732
197
# rm(cgc)
198
199
# Subset DepMap mutations based on the CGC genes
200
sum(unique(ccle_mut$Hugo_Symbol) %in% unique(cgc_genes))
201
ccle_mut <- ccle_mut[Hugo_Symbol %in% cgc_genes]
202
length(unique(ccle_mut$DepMap_ID))
203
204
sum(ccle_mut$isDeleterious)
205
ccle_mut[Variant_Classification == "Missense_Mutation"]
206
length(unique(ccle_mut[isDeleterious == T]$Hugo_Symbol))
207
ccle_mut[isDeleterious == T]
208
209
210
# TODO: Use CGC to check for overlap with CCLE cell lines, then collapse to whether each of the 700 genes for
211
# that cell line has a mutation listed in the CGC
212
length(unique(cgc$`Mutation genome position`))  # ~922,000 unique mutations
213
unique(ccle_mut$NCBI_Build)  # CCLE is with GRCh 37
214
unique(cgc$GRCh)  # CGC has GRCh 38
215
# We must "lift over" the mutations from 37 to 38 before checking for overlap
216
if (!require(liftOver)) {
217
    BiocManager::install("liftOver")
218
    require(liftOver)
219
    require(rtracklayer)
220
}
221
# liftOver requires a chain file to convert 37 to 38: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/
222
223
chain_path <- paste0(path, "Data/hg19ToHg38.over.chain")
224
grch_37_38_chain <- import.chain(chain_path)
225
226
# Must add "chr" to start of chromosome names
227
ccle_mut$Chromosome <- paste0("chr", ccle_mut$Chromosome)
228
# Must convert positions to GRanges
229
ccle_mut_gr <- makeGRangesFromDataFrame(df = ccle_mut, keep.extra.columns = T,
230
                         seqnames.field = "Chromosome", start.field = "Start_position",
231
                         end.field = "End_position", strand.field = "Strand")
232
length(unique(ccle_mut_gr$DepMap_ID))
233
234
# Lift over
235
lifted_ccle_mut <- liftOver(x = ccle_mut_gr, chain = grch_37_38_chain)
236
# Convert GRangesList to GRanges
237
lifted_ccle_mut <- unlist(lifted_ccle_mut)
238
# Convert back to data.table
239
lifted_ccle_mut <- as.data.table(lifted_ccle_mut)
240
# Note: Genome_Change is now out of date!
241
# Remove chr from seqnames
242
lifted_ccle_mut$seqnames <- gsub("chr", "", lifted_ccle_mut$seqnames)
243
# Can find the overlap of Mutation genome position in CGC with a newly created column based on CCLE positions
244
lifted_ccle_mut[, Mutation_Position := paste0(seqnames, ':', start, '-', end)]
245
246
ccle_mut$seqnames <- gsub("chr", "", ccle_mut$Chromosome)
247
ccle_mut[, Mutation_Position := paste0(seqnames, ':', as.character(Start_position), '-', as.character(End_position))]
248
249
250
length(unique(lifted_ccle_mut$DepMap_ID))
251
252
sum(ccle_mut$Mutation_Position %in% unique(cgc$`Genome Location`))
253
254
# Now find the overlap with CGC (which already has GRCh38)
255
subset <- lifted_ccle_mut[Mutation_Position %in% unique(cgc$`Genome Location`)]
256
table(subset$Variant_Type)
257
length(unique(subset$DepMap_ID))
258
# IMPORTANT! There is a loss of 8 cell lines (which do not have a mutation that is in
259
# CGC) using the Tier 1 data only
260
261
# Alternative (March 2021) ====
262
# Take those mutations that are COSMIC or TCGA hotspots, ignoring CGC
263
subset <- ccle_mut[isTCGAhotspot | isCOSMIChotspot]
264
265
uniqueN(subset$Hugo_Symbol)
266
table(ccle_mut[isTCGAhotspot == T]$Variant_Classification)
267
uniqueN(ccle_mut[isTCGAhotspot == T]$Hugo_Symbol)
268
uniqueN(ccle_mut[isCOSMIChotspot == T]$Hugo_Symbol)
269
270
uniqueN(subset$isTCGAhotspot)
271
272
### Create a vector of mutations for each cell line with the CGC genes
273
length(unique(subset$Hugo_Symbol))
274
sub_dcast <- dcast.data.table(data = subset[, c("DepMap_ID", "Hugo_Symbol")],
275
                 formula = DepMap_ID ~ Hugo_Symbol, fun.aggregate = length, value.var = "DepMap_ID")
276
dim(sub_dcast)
277
sub_dcast[1:5, 1:50]
278
sum(sub_dcast$A1BG)
279
sum(sub_dcast$A1CF)
280
281
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
282
sub_dcast <- merge(sub_dcast, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
283
                  by = "DepMap_ID")
284
setcolorder(sub_dcast, c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
285
sub_dcast[1:5, 1:50]
286
287
# Save
288
fwrite(sub_dcast, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv"), sep = ',')
289
dim(cgc_muts)
290
cgc_muts[1:5, 1:5]
291
typeof(cgc_muts[1,2])
292
293
temp <- fread("Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv")
294
dim(temp)
295
temp[1:5, 1:50]
296
297
# # Attach the cell line name and primary disease
298
# # cgc_muts <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"))
299
# depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
300
# cgc_muts <- merge(cgc_muts, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
301
#                   by = "DepMap_ID")
302
# setcolorder(cgc_muts, neworder = c("stripped_cell_line_name", colnames(cgc_muts)[-ncol(cgc_muts)]))
303
# 
304
# # Save
305
# fwrite(cgc_muts, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"), sep = ',')
306
# cgc_muts <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"))
307
# cgc_muts[1:5, 1:5]
308
# cgc_muts[, DepMap_ID := NULL]
309
# DIMENSIONS OF CGC MUTATIONAL DATA: 1733 X 697
310
311
312
313
# ==== miRNA Data Cleanup ====
314
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
315
require(data.table)
316
depmap_samples <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
317
318
ccle_mirna <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_miRNA_20181103.gct")
319
dim(ccle_mirna)
320
anyNA(ccle_mirna)
321
ccle_mirna[1:5, 1:5]
322
323
min(ccle_mirna[, -c(1:2)], na.rm = T)
324
max(ccle_mirna[, -c(1:2)], na.rm = T)
325
326
ccle_mirna <- transpose(ccle_mirna, keep.names = "Name")
327
dim(ccle_mirna)
328
ccle_mirna[1:5, 1:5]
329
ccle_mirna$Name
330
sum(duplicated(unlist(ccle_mirna[2, ])))
331
ccle_mirna <- ccle_mirna[-1,]
332
ccle_mirna[1:5, 1:5]
333
colnames(ccle_mirna) <- unlist(ccle_mirna[1,])
334
ccle_mirna <- ccle_mirna[-1,]
335
336
# Clean cell line name
337
ccle_mirna$Description <- gsub(pattern = "\\_.+", replacement = "", ccle_mirna$Description)
338
colnames(ccle_mirna)[1] <- "stripped_cell_line_name"
339
ccle_mirna <- merge(ccle_mirna, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
340
                    by = "stripped_cell_line_name")
341
dim(ccle_mirna)
342
ccle_mirna[1:5, 1:5]
343
setcolorder(ccle_mirna, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
344
345
# fwrite(ccle_mirna, paste0(path, "Data/DRP_Training_Data/DepMap_2019_miRNA.csv"), sep = ',')
346
347
rm(ccle_mirna)
348
349
# ccle_mirna <- fread("Data/DRP_Training_Data/DepMap_2019_miRNA.csv")
350
# dim(ccle_mirna)
351
# ==== Metabolomics Data Cleanup ====
352
depmap_samples <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
353
ccle_metab <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_metabolomics_20190502.csv")
354
355
dim(ccle_metab)
356
ccle_metab[1:5, 1:5]
357
358
min(ccle_metab[, -c(1:2)], na.rm = T)
359
max(ccle_metab[, -c(1:2)], na.rm = T)
360
361
anyNA(ccle_metab)
362
sum(is.na(ccle_metab))
363
which(is.na(ccle_metab), arr.ind = T)
364
ccle_metab[which(is.na(ccle_metab), arr.ind = T)]
365
ccle_metab[554, 2]  # DepMap_ID is NA
366
367
min(ccle_metab[, -c(1:2)])
368
ccle_metab <- merge(ccle_metab[, -1], depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
369
                    by = "DepMap_ID")
370
371
setcolorder(ccle_metab, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
372
ccle_metab$DepMap_ID <- NULL
373
dim(ccle_metab)
374
375
fwrite(ccle_metab, paste0(path, "Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv"), sep = ',')
376
rm(ccle_metab)
377
378
temp <- fread("Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv")
379
dim(temp)
380
381
# ==== RPPA Data Cleanup ====
382
ccle_rppa <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RPPA_20181003.csv")
383
dim(ccle_rppa)
384
ccle_rppa[1:5, 1:5]
385
anyNA(ccle_rppa)
386
387
min(ccle_rppa[, -c(1:2)], na.rm = T)  # has negative values
388
max(ccle_rppa[, -c(1:2)], na.rm = T)
389
390
ccle_rppa$V1 <- gsub(pattern = "\\_.+", replacement = "", ccle_rppa$V1)
391
colnames(ccle_rppa)[1] <- "stripped_cell_line_name"
392
ccle_rppa <- merge(ccle_rppa, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
393
                    by = "stripped_cell_line_name")
394
dim(ccle_rppa)
395
ccle_rppa[1:5, 1:10]
396
setcolorder(ccle_rppa, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
397
398
fwrite(ccle_rppa, paste0(path, "Data/DRP_Training_Data/DepMap_2019_RPPA.csv"), sep = ',')
399
400
rm(ccle_rppa)
401
402
temp <- fread("Data/DRP_Training_Data/DepMap_2019_RPPA.csv")
403
dim(temp)
404
# ==== Chromatin Profiling Data Cleanup ====
405
ccle_chrom <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_GlobalChromatinProfiling_20181130.csv")
406
dim(ccle_chrom)
407
ccle_chrom[1:5, 1:10]
408
409
min(ccle_chrom[, -c(1:2)], na.rm = T)  # has negative values
410
max(ccle_chrom[, -c(1:2)], na.rm = T)
411
412
413
anyNA(ccle_chrom)
414
sum(is.na(ccle_chrom))  # 842 NA values
415
416
unique(which(is.na(ccle_chrom), arr.ind = T)[,2])
417
length(unique(which(is.na(ccle_chrom), arr.ind = T)[,2]))  # 26 columns have NAs
418
419
# Convert NA to 0
420
setnafill(ccle_chrom, fill = 0, cols = unique(which(is.na(ccle_chrom), arr.ind = T)[,2]))
421
anyNA(ccle_chrom)
422
423
ccle_chrom$CellLineName <- gsub(pattern = "\\_.+", replacement = "", ccle_chrom$CellLineName)
424
colnames(ccle_chrom)[1] <- "stripped_cell_line_name"
425
dim(ccle_chrom)
426
ccle_chrom <- merge(ccle_chrom, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
427
                   by = "stripped_cell_line_name")
428
ccle_chrom$BroadID <- NULL
429
dim(ccle_chrom)
430
ccle_chrom[1:5, 1:10]
431
setcolorder(ccle_chrom, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
432
433
# fwrite(ccle_chrom, paste0(path, "Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv"), sep = ',')
434
435
rm(ccle_chrom)
436
437
temp <- fread(paste0("Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv"))
438
dim(temp)
439
# ==== Fusion Data Cleanup ====
440
ccle_fusion <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_fusions.csv")
441
dim(ccle_fusion)
442
ccle_fusion[1:5, 1:17]
443
length(unique(ccle_fusion$FusionName))
444
length(unique(ccle_fusion$DepMap_ID))
445
unique(ccle_fusion$SpliceType)
446
quantile(ccle_fusion$FFPM)
447
448
ccle_fusion$CellLineName <- gsub(pattern = "\\_.+", replacement = "", ccle_fusion$CellLineName)
449
colnames(ccle_fusion)[1] <- "stripped_cell_line_name"
450
dim(ccle_fusion)
451
ccle_fusion <- merge(ccle_fusion, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
452
                    by = "stripped_cell_line_name")
453
ccle_fusion$BroadID <- NULL
454
dim(ccle_fusion)
455
ccle_fusion[1:5, 1:10]
456
setcolorder(ccle_fusion, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
457
458
fwrite(ccle_fusion, paste0(path, "Data/DRP_Training_Data/DepMap_2019_GeneFusion.csv"), sep = ',')
459
460
rm(ccle_fusion)
461
462
# ==== Exon Usage Ratio Data Cleanup ====
463
require(data.table)
464
setDTthreads(8)
465
ccle_exon <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RNAseq_ExonUsageRatio_20180929.gct")
466
dim(ccle_exon)
467
ccle_exon[1:10, 1:17]
468
469
transpose(ccle_exon, keep.names = "exon")
470
length(unique(ccle_exon$FusionName))
471
length(unique(ccle_exon$DepMap_ID))
472
unique(ccle_exon$SpliceType)
473
quantile(ccle_exon$FFPM)
474
475
ccle_exon$CellLineName <- gsub(pattern = "\\_.+", replacement = "", ccle_exon$CellLineName)
476
colnames(ccle_exon)[1] <- "stripped_cell_line_name"
477
dim(ccle_exon)
478
ccle_exon <- merge(ccle_exon, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
479
                     by = "stripped_cell_line_name")
480
ccle_exon$BroadID <- NULL
481
dim(ccle_exon)
482
ccle_exon[1:5, 1:10]
483
setcolorder(ccle_exon, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
484
485
# fwrite(ccle_exon, paste0(path, "Data/DRP_Training_Data/DepMap_2019_ExonUsageRatio.csv"), sep = ',')
486
487
rm(ccle_exon)
488
489
# ==== RRBS Profiling Data Cleanup ====
490
require(data.table)
491
setDTthreads(8)
492
493
# === TSS
494
ccle_tss <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RRBS_TSS1kb_20181022.txt")
495
dim(ccle_tss)
496
ccle_tss[1:5, 1:5]
497
length(unique(ccle_tss$cluster_id))
498
499
ccle_tss <- transpose(ccle_tss[, -2], keep.names = "cluster_id")
500
colnames(ccle_tss) <- unlist(ccle_tss[1,])
501
ccle_tss <- ccle_tss[-1,]
502
503
# === Promoter
504
ccle_tss <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE")
505
dim(ccle_tss)
506
ccle_tss[1:5, 1:5]
507
length(unique(ccle_tss$cluster_id))
508
509
ccle_tss <- transpose(ccle_tss[, -2], keep.names = "cluster_id")
510
colnames(ccle_tss) <- unlist(ccle_tss[1,])
511
ccle_tss <- ccle_tss[-1,]
512
# === Enhancers
513
514
515
# ==== GDSC Cell Line Characterization Data Cleanup ====
516
require(data.table)
517
gdsc_line_info <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC_Line_Info.csv")
518
gdsc_line_info <- gdsc_line_info[-c(1,2,3, 1005), c("V1", "V2", "V9")]
519
colnames(gdsc_line_info) <- c("stripped_cell_line_name", "COSMIC_ID", "primary_disease")
520
gdsc_line_info$stripped_cell_line_name <- gsub("[^[:alnum:] ]", "", gdsc_line_info$stripped_cell_line_name)
521
gdsc_line_info$stripped_cell_line_name <- gsub(" ", "", gdsc_line_info$stripped_cell_line_name)
522
gdsc_line_info$stripped_cell_line_name <- toupper(gdsc_line_info$stripped_cell_line_name)
523
524
# Match with DepMap cell lines
525
depmap_line_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
526
sum(gdsc_line_info$stripped_cell_line_name %in% depmap_line_info$stripped_cell_line_name) / nrow(gdsc_line_info)  # 98%
527
gdsc_line_info[!(gdsc_line_info$stripped_cell_line_name %in% depmap_line_info$stripped_cell_line_name)]
528
gdsc_line_info[stripped_cell_line_name == "EOL1CELL"] <- "EOL1"
529
gdsc_line_info[stripped_cell_line_name == "SR"] <- "SR786"
530
gdsc_line_info[stripped_cell_line_name == "U266"] <- "U266B1"
531
gdsc_line_info[stripped_cell_line_name == "G292"] <- "G292CLONEA141B1"
532
gdsc_line_info[stripped_cell_line_name == "NCISNU1"] <- "SNU1"
533
gdsc_line_info[stripped_cell_line_name == "NCISNU5"] <- "SNU5"
534
gdsc_line_info[stripped_cell_line_name == "NCISNU16"] <- "SNU16"
535
gdsc_line_info[stripped_cell_line_name == "7860"] <- "786O"
536
gdsc_line_info[stripped_cell_line_name == "U031"] <- "UO31"
537
gdsc_line_info[stripped_cell_line_name == "H3255"] <- "NCIH3255"
538
gdsc_line_info[stripped_cell_line_name == "NCIH510A"] <- "NCIH510"
539
gdsc_line_info[stripped_cell_line_name == "U251"] <- "U251MG"
540
gdsc_line_info[stripped_cell_line_name == "WM793B"] <- "WM793"
541
gdsc_line_info[stripped_cell_line_name == "OVCAR3"] <- "NIHOVCAR3"
542
gdsc_line_info[stripped_cell_line_name == "SCC90"] <- "UPCISCC090"
543
gdsc_line_info[COSMIC_ID == "1299064"] <- "TDOTT"
544
gdsc_line_info[COSMIC_ID == "930299"] <- "TT"
545
gdsc_line_info[COSMIC_ID == "909976"] <- "KMH2"
546
gdsc_line_info[COSMIC_ID == "1298167"] <- "KMHDASH2"
547
sum(gdsc_line_info$stripped_cell_line_name %in% depmap_line_info$stripped_cell_line_name) / nrow(gdsc_line_info)  # 100%
548
549
550
# ==== GDSC Expression Data Cleanup ====
551
gdsc_exp <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/GDSC/GDSC_Expression.txt")
552
gdsc_exp[1:5, 1:5]
553
# Delete invalid gene symbol rows
554
gdsc_exp <- gdsc_exp[GENE_SYMBOLS != ""]
555
556
# Rename Expression columns
557
colnames(gdsc_exp) <- gsub("DATA.", "", colnames(gdsc_exp))
558
gdsc_exp <- gdsc_exp[, !"GENE_title", with = F]
559
# Must transpose to have cell lines on each row
560
gdsc_exp <- transpose(gdsc_exp, keep.names = "GENE_SYMBOLS")
561
gdsc_exp[1:5, 1:5]
562
colnames(gdsc_exp) <- unlist(gdsc_exp[1,])
563
gdsc_exp <- gdsc_exp[-1,]
564
colnames(gdsc_exp)[1] <- "COSMIC_ID"
565
# gdsc_exp[COSMIC_ID == "1299064"][1:5, 1:5]
566
# gdsc_exp[COSMIC_ID == "930299"][1:5, 1:5]
567
# Merge with cell line names
568
gdsc_exp <- merge(gdsc_exp, gdsc_line_info, by = "COSMIC_ID")
569
gdsc_exp$COSMIC_ID <- NULL
570
setcolorder(gdsc_exp, order(colnames(gdsc_exp)))
571
setcolorder(gdsc_exp, c("stripped_cell_line_name", "primary_disease"))
572
gdsc_exp[1:5, 1:5]
573
# Match genes with those from DepMap Cell lines
574
depmap_exp <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
575
# depmap_exp <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DRP_Training_Data/DepMap_21Q2_Training_Expression.csv")
576
depmap_exp[1:5, 1:5]
577
578
dim(gdsc_exp)
579
dim(depmap_exp)
580
581
t1 <- colnames(gdsc_exp)
582
t2 <- colnames(depmap_exp)
583
sum(t1 %in% t2) / ncol(gdsc_exp)  # 91%
584
colnames(gdsc_exp)[!colnames(gdsc_exp) %in% colnames(depmap_exp)]
585
586
# Update gene names with biomaRt
587
# require(biomaRt)
588
# ensembl = useMart(
589
#   biomart = "ENSEMBL_MART_ENSEMBL",
590
#   # host = "feb2014.archive.ensembl.org",
591
#   path = "/biomart/martservice",
592
#   dataset = "hsapiens_gene_ensembl"
593
# )
594
# atts <- listAttributes(ensembl, what = "name")
595
# atts[grep(pattern = "gene", x = atts, ignore.case = T)]
596
# depmap_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id",
597
#                                           "external_gene_name", "wikigene_name"),
598
#                            filters = "external_gene_name", values = colnames(depmap_exp),
599
#                            mart = ensembl)
600
# depmap_dict <- as.data.table(depmap_dict)
601
# depmap_dict <- unique(depmap_dict[, 2:3])
602
# 
603
# gdsc_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id",
604
#                                              "external_gene_name", "wikigene_name"),
605
#                               filters = "external_gene_name", values = colnames(gdsc_exp),
606
#                               mart = ensembl)
607
# gdsc_dict <- as.data.table(gdsc_dict)
608
609
# Get updated gene names from: https://www.genenames.org/tools/multi-symbol-checker/
610
611
all_gene_names <- data.table(unique(c(colnames(depmap_exp), colnames(gdsc_exp)))[-(1:2)])
612
fwrite(all_gene_names, "GDSC_DepMap_All_Gene_Names.txt")
613
hgnc_check <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/hgnc-symbol-check.csv", skip = 1)
614
615
depmap_hgnc_check <- hgnc_check[Input %in% colnames(depmap_exp)[-(1:2)]]
616
names(depmap_exp)[match(depmap_hgnc_check$Input, names(depmap_exp))] <- depmap_hgnc_check$`Approved symbol`
617
gdsc_hgnc_check <- hgnc_check[Input %in% colnames(gdsc_exp)[-(1:2)]]
618
names(gdsc_exp)[match(gdsc_hgnc_check$Input, names(gdsc_exp))] <- gdsc_hgnc_check$`Approved symbol`
619
620
621
t1 <- colnames(gdsc_exp)
622
t2 <- colnames(depmap_exp)
623
sum(t1 %in% t2) / ncol(gdsc_exp)  # 97.3%
624
sum(t2 %in% t1) / ncol(depmap_exp)  # 88.6%
625
sum(t2 %in% t1)  # 16988 
626
sum(t1 %in% t2)  # 16948 
627
colnames(gdsc_exp)[!colnames(gdsc_exp) %in% colnames(depmap_exp)]
628
629
fwrite(gdsc_exp, "Data/DRP_Training_Data/GDSC_Expression.csv")
630
631
# Subset both datasets's genes based on data available to both
632
shared <- intersect(colnames(gdsc_exp), colnames(depmap_exp))
633
length(shared)
634
gdsc_exp_sub <- gdsc_exp[, shared, with = F]
635
gdsc_exp_sub[1:5, 1:5]
636
637
depmap_exp_sub <- depmap_exp[, shared, with = F]
638
depmap_exp_sub[1:5, 1:5]
639
640
# Save
641
fwrite(gdsc_exp_sub, "Data/DRP_Training_Data/GDSC_Training_Expression.csv")
642
fwrite(depmap_exp_sub, "Data/DRP_Training_Data/DepMap_21Q2_Training_Expression.csv")
643
644
# ==== Drug Sensitivity Data Cleanup ====
645
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
646
require(data.table)
647
require(webchem)
648
# BiocManager::install("ChemmineR")
649
require(ChemmineR)
650
options(chemspider_key = "N98K4aOip0VpcSc8F9GilqIIktLt0hux")
651
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
652
ctrp <- fread("Data/DRP_Training_Data/CTRP_AUC_SMILES.txt")
653
gdsc1 <- fread("Data/DRP_Training_Data/GDSC1_AUC_SMILES.txt")
654
gdsc2 <- fread("Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")
655
656
657
# Clean up duplicate with missing pubchem
658
cpd_info_1 <- fread(paste0(path, "Data/GDSC/GDSC1_Drug_Info.csv"))
659
cpd_info_1[drug_name == unique(cpd_info_1[, c("drug_name", "pubchem")])[anyDuplicated(unique(cpd_info_1[, c("drug_name", "pubchem")])$drug_name),]$drug_name]
660
cpd_info_1 <- cpd_info_1[drug_id != 476]
661
cpd_info_1 <- cpd_info_1[drug_id != 1490]
662
cpd_info_1 <- cpd_info_1[drug_id != 1496]
663
cpd_info_1 <- cpd_info_1[drug_id != 1386]
664
cpd_info_1 <- cpd_info_1[drug_id != 1402]
665
cpd_info_1 <- cpd_info_1[drug_id != 1393]
666
nrow(cpd_info_1[pubchem == "-"])
667
sum(cpd_info_1$drug_name %in% unique(ctrp$cpd_name))
668
# Subset for valid pubchem IDs
669
cpd_info_1 <- cpd_info_1[pubchem != "-"]
670
cpd_info_1 <- cpd_info_1[pubchem != "none"]
671
cpd_info_1 <- cpd_info_1[pubchem != "several"]
672
cpd_info_1$pubchem <- as.numeric(cpd_info_1$pubchem)
673
674
cpd_1_smiles <- webchem::pc_prop(cid = cpd_info_1$pubchem, properties = "CanonicalSMILES")
675
cpd_info_1 <- merge(cpd_info_1, cpd_1_smiles, by.x = "pubchem", by.y = "CID")
676
# Save
677
fwrite(cpd_info_1, "Data/GDSC/GDSC1_VALID_Drug_Info.csv")
678
679
680
cpd_info_2 <- fread(paste0(path, "Data/GDSC/GDSC2_Drug_Info.csv"))
681
cpd_info_2[drug_name == unique(cpd_info_2[, c("drug_name", "pubchem")])[anyDuplicated(unique(cpd_info_2[, c("drug_name", "pubchem")])$drug_name),]$drug_name]
682
cpd_info_2 <- cpd_info_2[drug_id != 1811]
683
cpd_info_2 <- cpd_info_2[drug_id != 1806]
684
cpd_info_2 <- cpd_info_2[drug_id != 1819]
685
cpd_info_2 <- cpd_info_2[drug_id != 1816]
686
cpd_info_2[pubchem == "25227436, 42602260"]$pubchem <- "25227436"
687
cpd_info_2[pubchem == "11719003, 58641927"]$pubchem <- "11719003"
688
cpd_info_2[pubchem == "66577015, 16654980"]$pubchem <- "66577015"
689
cpd_info_2[pubchem == "11719003, 58641927"]$pubchem <- "11719003"
690
691
nrow(cpd_info_2[pubchem == "-"])
692
sum(cpd_info_2$pubchem %in% cpd_info_1$pubchem) / nrow(cpd_info_2)
693
694
cpd_info_2 <- cpd_info_2[pubchem != "-"]
695
cpd_info_2 <- cpd_info_2[pubchem != "none"]
696
cpd_info_2 <- cpd_info_2[pubchem != "several"]
697
698
cpd_info_2$pubchem <- as.numeric(cpd_info_2$pubchem)
699
700
cpd_2_smiles <- webchem::pc_prop(cid = cpd_info_2$pubchem, properties = "CanonicalSMILES")
701
cpd_info_2 <- merge(cpd_info_2, cpd_2_smiles, by.x = "pubchem", by.y = "CID")
702
# Save
703
fwrite(cpd_info_2, "Data/GDSC/GDSC2_VALID_Drug_Info.csv")
704
705
706
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
707
708
# ==== GDSC ====
709
require(stringr)
710
gdsc1 <- fread(paste0(path, "Data/GDSC/GDSC1_Fitted_Dose_Response.csv"))
711
sum(unique(gdsc1$CELL_LINE_NAME) %in% depmap_samples$stripped_cell_line_name) / length(unique(gdsc1$CELL_LINE_NAME))  # 0.22
712
sum(toupper(unique(gdsc1$CELL_LINE_NAME)) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc1$CELL_LINE_NAME))  # 0.24
713
sum(str_remove_all(toupper(unique(gdsc1$CELL_LINE_NAME)), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc1$CELL_LINE_NAME))  # 0.9696049
714
715
dim(gdsc1)  # 310K Combinations
716
colnames(gdsc1)
717
sum(gdsc1$AUC == 0)
718
min(gdsc1$AUC)
719
max(gdsc1$AUC)
720
721
# Count unique combinations in GDSC1
722
length(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")])$CELL_LINE_NAME))  # 987
723
length(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")])$DRUG_NAME))  # 345
724
nrow(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")]))) # 292,849
725
726
727
gdsc1_final <- merge(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME", "AUC")]), unique(cpd_info_1[, c("drug_name", "CanonicalSMILES")]), by.x = "DRUG_NAME", by.y = "drug_name")
728
colnames(gdsc1_final) <- c("cpd_name", "ccl_name", "area_under_curve", "cpd_smiles")
729
# Save
730
fwrite(gdsc1_final, "Data/DRP_Training_Data/GDSC1_AUC_SMILES.txt")
731
732
unique(gdsc1_pubchem$DRUG_NAME)
733
# gdsc1_cs_ids <- webchem::get_csid(query = unique(gdsc1$DRUG_NAME), from = "name", match = "all", verbose = T)
734
gdsc1_cs_ids <- webchem::cir_query(identifier = unique(gdsc1$DRUG_NAME), representation = "smiles", verbose = T, )
735
736
# Count unique combinations in GDSC2
737
gdsc2 <- fread(paste0(path, "Data/GDSC/GDSC2_Fitted_Dose_Response.csv"))
738
sum(unique(gdsc2$CELL_LINE_NAME) %in% depmap_samples$stripped_cell_line_name) / length(unique(gdsc2$CELL_LINE_NAME))  # 0.2311496
739
sum(toupper(unique(gdsc2$CELL_LINE_NAME)) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc2$CELL_LINE_NAME))  # 0.2546354
740
sum(str_remove_all(toupper(unique(gdsc2$CELL_LINE_NAME)), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc2$CELL_LINE_NAME))  # 0.9678616
741
742
gdsc2_cpd_smiles <- webchem::cir_query(identifier = unique(gdsc2$DRUG_NAME), representation = "smiles", verbose = T)
743
744
dim(gdsc2)  # 135K Combinations
745
colnames(gdsc2)
746
length(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")])$CELL_LINE_NAME))  # 809
747
length(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")])$DRUG_NAME))  # 192
748
nrow(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")]))) # 131,108
749
750
gdsc2_final <- merge(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME", "AUC")]), unique(cpd_info_2[, c("drug_name", "CanonicalSMILES")]), by.x = "DRUG_NAME", by.y = "drug_name")
751
colnames(gdsc2_final) <- c("cpd_name", "ccl_name", "area_under_curve", "cpd_smiles")
752
# Save
753
fwrite(gdsc2_final, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")
754
755
756
# Count overlap of drugs and cell lines
757
sum(unique(gdsc1$DRUG_NAME) %in% unique(gdsc2$DRUG_NAME)) # Drug Overlap: 88
758
sum(unique(gdsc1$CELL_LINE_NAME) %in% unique(gdsc2$CELL_LINE_NAME))  # Cell Line Overlap: 808
759
760
# ==== CTRP ====
761
require(data.table)
762
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
763
764
# NOTE: Newer and better AUC calculation in PharmacoGx.R file!
765
ctrp_curves <- fread(paste0(path, "Data/CTRP/v20.data.curves_post_qc.txt"))
766
exper_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_experiment.txt"))
767
cell_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_cell_line.txt"))
768
table(cell_data$ccl_availability)
769
770
# Merge sensitivity, experimental and cell line data
771
temp <- merge(unique(ctrp_curves[, c("experiment_id", "master_cpd_id")]),
772
              unique(exper_data[, c("experiment_id", "master_ccl_id")]),
773
              by = "experiment_id")
774
ctrp <- merge(temp, cell_data[, c("master_ccl_id", "ccl_name")], by = "master_ccl_id")
775
sum(unique(ctrp$ccl_name) %in% depmap_samples$stripped_cell_line_name) / length(unique(ctrp$ccl_name))  # 0.9492672
776
sum(toupper(unique(ctrp$ccl_name)) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(ctrp$ccl_name))  # 0.9492672
777
sum(str_remove_all(toupper(unique(ctrp$ccl_name)), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(ctrp$ccl_name))  # 0.9503946
778
779
780
# Add compound information
781
cpd_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_compound.txt"))
782
ctrp <- merge(ctrp, cpd_data[, c("master_cpd_id", "cpd_name", "cpd_smiles")], by = "master_cpd_id")
783
784
# Add AUC curve information
785
ctrp_auc <- fread(paste0(path, "Data/CTRP/v20.data.curves_post_qc.txt"))
786
787
ctrp <- merge(ctrp, ctrp_auc[, c("experiment_id", "master_cpd_id", "area_under_curve")], by = c("experiment_id", "master_cpd_id"))
788
789
790
# Save
791
fwrite(ctrp, paste0(path, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt"))
792
793
# Add primary disease information. NOTE: This removes some DR data as 45 cell lines in CTRPv2 cannot be paired with DepMap!!!
794
line_info <- fread("Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv")
795
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
796
797
sum(unique(ctrp$ccl_name) %in% unique(line_info$stripped_cell_line_name))  # 150
798
799
line_info$other_ccl_name <- str_replace(toupper(line_info$stripped_cell_line_name), "-", "")
800
ctrp$other_ccl_name <- str_replace(toupper(ctrp$ccl_name), "-", "")
801
802
ctrp <- merge(ctrp, line_info[, c("other_ccl_name", "primary_disease")], by = "other_ccl_name")
803
ctrp$other_ccl_name <- NULL
804
setcolorder(ctrp, neworder = c("cpd_name", "ccl_name", "primary_disease", "area_under_curve", "cpd_smiles"))
805
806
fwrite(ctrp, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt")
807
808
809
# Experiment ID 
810
unique(ctrp[, c("master_ccl_id", "experiment_id")])
811
length(unique(ctrp$master_ccl_id))
812
length(unique(ctrp$experiment_id))
813
length(unique(ctrp$ccl_name))
814
length(unique(ctrp$master_cpd_id))
815
816
# Check overlap with GDSC 1 and 2
817
sum(unique(ctrp$ccl_name) %in% gdsc1$CELL_LINE_NAME)
818
sum(unique(ctrp$ccl_name) %in% gdsc2$CELL_LINE_NAME)
819
820
dim(ctrp)  # 395K Combinations
821
822
823
824
# ==== Chemical Data Cleanup ====
825
require(data.table)
826
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
827
828
chembl <- fread(paste0(path, "Data/chembl_27_chemreps.txt"))
829
dim(chembl)
830
831
832
833
# ==== EDA ======
834
require(data.table)
835
require(stringr)
836
require(ggplot2)
837
line_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
838
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
839
gdsc2 <- fread("Data/DRP_Training_Data/GDSC2_AAC_SMILES.txt")
840
exp <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
841
mut <- fread("Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv")
842
cnv <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv")
843
prot <- fread("Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv")
844
mirna <- fread("Data/DRP_Training_Data/DepMap_2019_miRNA.csv")
845
metab <- fread("Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv")
846
hist <- fread("Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv")
847
rppa <- fread("Data/DRP_Training_Data/DepMap_2019_RPPA.csv")
848
849
# pdb_table <- fread("Data/cell_annotation_table_1.1.1.csv")
850
# pdb_sub <- pdb_table[, c("CTRPv2.cellid", "CCLE.cellid")]
851
# pdb_sub <- pdb_sub[!is.na(CTRPv2.cellid) & !is.na(CCLE.cellid)]
852
853
854
exp[1:5., 1:5]
855
856
length(unique(ctrp$ccl_name))
857
length(unique(ctrp$cpd_name))
858
859
sum(unique(ctrp$ccl_name) %in% line_info$stripped_cell_line_name) / length(unique(ctrp$ccl_name))
860
ccl_names = toupper(ctrp$ccl_name)
861
ccl_names = unique(str_replace(ccl_names, "-", ""))
862
length(ccl_names)
863
864
sum(ccl_names %in% line_info$stripped_cell_line_name) / length(ccl_names)
865
866
line_info[!(stripped_cell_line_name %in% ccl_names)]
867
ctrp[ccl_name %like% "NIHOVCAR3"]
868
ctrp[ccl_name %like% "HEL"]
869
870
# sum(exp$stripped_cell_line_name %in% pdb_sub$CCLE.cellid) 
871
872
# Remove hyphens and convert all to upper case
873
# pdb_ccl_names = pdb_sub$CCLE.cellid
874
# pdb_ccl_names = str_replace(toupper(pdb_ccl_names), "-", "")
875
876
ctrp$ccl_name = str_replace(toupper(ctrp$ccl_name), "-", "")
877
     
878
exp_ccl_names = exp$stripped_cell_line_name
879
exp_ccl_names = str_replace(toupper(exp_ccl_names), "-", "")
880
881
mut_ccl_names = mut$stripped_cell_line_name
882
mut_ccl_names = str_replace(toupper(mut_ccl_names), "-", "")
883
884
cnv_ccl_names = cnv$stripped_cell_line_name
885
cnv_ccl_names = str_replace(toupper(cnv_ccl_names), "-", "")
886
887
sum(exp_ccl_names %in% ccl_names) / length(unique(ccl_names))
888
# sum(exp_ccl_names %in% pdb_ccl_names) / length(unique(pdb_ccl_names))
889
890
891
sum(mut_ccl_names %in% ccl_names) / length(unique(ccl_names)) * length(unique(ccl_names))
892
# sum(mut_ccl_names %in% pdb_ccl_names) / length(unique(pdb_ccl_names)) * length(unique(pdb_ccl_names))
893
894
ctrp[ccl_name %in% mut_ccl_names[mut_ccl_names %in% ccl_names]]   ### 308K!!!!! Not 144K
895
# ctrp[ccl_name %in% mut_ccl_names[mut_ccl_names %in% pdb_ccl_names]]   ### 302K!!!!! Not 144K
896
897
sum(cnv_ccl_names %in% ccl_names) / length(unique(ccl_names))
898
sum(exp_ccl_names %in% cnv_ccl_names) / length(unique(exp_ccl_names))
899
sum(cnv_ccl_names %in% exp_ccl_names) / length(unique(cnv_ccl_names))
900
901
dir.create(path = "Plots")
902
dir.create(path = "Plots/DepMap")
903
ggplot(data = line_info) +
904
  geom_bar(mapping = aes(x = primary_disease), stat = "count") +
905
  xlab("Primary Disease") +
906
  ylab("# of cell lines") + 
907
  ggtitle(label = "Proportion of Cancer Types in DepMap Data (overall)", subtitle = "20Q2 Version") +
908
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
909
910
ggsave(filename = "Plots/DepMap/DepMap_Cell_Lines_Proportion.pdf", device = "pdf")
911
  
912
prot[, 1:5]
913
unique(prot$stripped_cell_line_name)
914
915
mut$stripped_cell_line_name = str_replace(toupper(mut$stripped_cell_line_name), "-", "")
916
cnv$stripped_cell_line_name = str_replace(toupper(cnv$stripped_cell_line_name), "-", "")
917
exp$stripped_cell_line_name = str_replace(toupper(exp$stripped_cell_line_name), "-", "")
918
prot$stripped_cell_line_name = str_replace(toupper(prot$stripped_cell_line_name), "-", "")
919
920
mirna$stripped_cell_line_name = str_replace(toupper(mirna$stripped_cell_line_name), "-", "")
921
metab$stripped_cell_line_name = str_replace(toupper(metab$stripped_cell_line_name), "-", "")
922
hist$stripped_cell_line_name = str_replace(toupper(hist$stripped_cell_line_name), "-", "")
923
rppa$stripped_cell_line_name = str_replace(toupper(rppa$stripped_cell_line_name), "-", "")
924
925
ctrp$ccl_name = str_replace(toupper(ctrp$ccl_name), "-", "")
926
927
mut_line_info <- line_info[stripped_cell_line_name %in% unique(mut$stripped_cell_line_name)]  
928
cnv_line_info <- line_info[stripped_cell_line_name %in% unique(cnv$stripped_cell_line_name)]  
929
exp_line_info <- line_info[stripped_cell_line_name %in% unique(exp$stripped_cell_line_name)]  
930
prot_line_info <- line_info[stripped_cell_line_name %in% unique(prot$stripped_cell_line_name)]
931
932
mirna_line_info <- line_info[stripped_cell_line_name %in% unique(mirna$stripped_cell_line_name)]
933
metab_line_info <- line_info[stripped_cell_line_name %in% unique(metab$stripped_cell_line_name)]
934
hist_line_info <- line_info[stripped_cell_line_name %in% unique(hist$stripped_cell_line_name)]
935
rppa_line_info <- line_info[stripped_cell_line_name %in% unique(rppa$stripped_cell_line_name)]
936
937
ctrp_line_info <- line_info[stripped_cell_line_name %in% unique(ctrp$ccl_name)]
938
939
mut_line_info <- mut_line_info[, c("stripped_cell_line_name", "primary_disease")]
940
mut_line_info$data_type <- "Mutational"
941
942
cnv_line_info <- cnv_line_info[, c("stripped_cell_line_name", "primary_disease")]
943
cnv_line_info$data_type <- "Copy Number"
944
945
exp_line_info <- exp_line_info[, c("stripped_cell_line_name", "primary_disease")]
946
exp_line_info$data_type <- "Gene Expression"
947
948
prot_line_info <- prot_line_info[, c("stripped_cell_line_name", "primary_disease")]
949
prot_line_info$data_type <- "Protein Quantification"
950
951
mirna_line_info <- mirna_line_info[, c("stripped_cell_line_name", "primary_disease")]
952
mirna_line_info$data_type <- "miRNA Expression"
953
954
metab_line_info <- metab_line_info[, c("stripped_cell_line_name", "primary_disease")]
955
metab_line_info$data_type <- "Metabolomics"
956
957
hist_line_info <- hist_line_info[, c("stripped_cell_line_name", "primary_disease")]
958
hist_line_info$data_type <- "Histone Profiling"
959
960
rppa_line_info <- rppa_line_info[, c("stripped_cell_line_name", "primary_disease")]
961
rppa_line_info$data_type <- "RPPA"
962
963
ctrp_line_info <- ctrp_line_info[, c("stripped_cell_line_name", "primary_disease")]
964
ctrp_line_info$data_type <- "Dose-Response"
965
966
datatype_line_info <- rbindlist(list(mut_line_info, cnv_line_info, exp_line_info, prot_line_info,
967
                                     mirna_line_info, metab_line_info, hist_line_info, rppa_line_info,
968
                                     ctrp_line_info))
969
datatype_line_info[, count_per_disease := .N, by = "primary_disease"]
970
# .(count = .N, var = sum(VAR)), by = MNTH
971
972
ggplot(data = datatype_line_info) +
973
  geom_bar(mapping = aes(x = reorder(primary_disease, -count_per_disease),
974
                         fill = data_type), stat = "count", position = "dodge") +
975
  xlab("Primary Disease") +
976
  ylab("Number of cell lines") +
977
  labs(fill = "Data Type") +
978
  # ggtitle(label = "Proportion of Cancer Types in DepMap Data", subtitle = "By data type, 21Q2 Version - Overlap with CTRPv2: 79%") +
979
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
980
        text = element_text(face = "bold"),
981
        legend.position = c(.9,.55))
982
983
ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Proportion.pdf", device = "pdf")
984
985
986
BiocManager::install("VennDiagram")
987
require(VennDiagram)
988
989
library(RColorBrewer)
990
myCol <- brewer.pal(5, "Pastel2")
991
992
# NOTE: The CTRPv2 here is from before ctrp was merged with cell line info to add primary disease!
993
venn.diagram(x = list(mut_line_info$stripped_cell_line_name,
994
                      cnv_line_info$stripped_cell_line_name,
995
                      exp_line_info$stripped_cell_line_name,
996
                      prot_line_info$stripped_cell_line_name,
997
                      unique(ctrp$ccl_name)),
998
             category.names = c("Mutational", "Copy Number", "Gene Expression", "Protein Quantification", "CTRPv2 Dose-Response"),
999
             filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Venn.png",
1000
             imagetype = "png",
1001
             output = TRUE,
1002
             height = 3000 ,
1003
             width = 3000 ,
1004
             resolution = 600,
1005
             # Circles
1006
             lwd = 2,
1007
             # lty = 'blank',
1008
             fill = myCol,
1009
             # Numbers
1010
             cex = .6,
1011
             fontface = "bold",
1012
             fontfamily = "sans",
1013
             
1014
             # Set names
1015
             cat.cex = 0.6,
1016
             cat.fontface = "bold",
1017
             cat.default.pos = "outer",
1018
             cat.pos = c(0, 0, -130, 150, 0),
1019
             cat.dist = c(0.2, 0.2, 0.2, 0.2, 0.2),
1020
             cat.fontfamily = "sans",
1021
             # rotation = 1
1022
             
1023
)
1024
1025
# ==== Bottleneck Cancer Cell Lines ====
1026
require(data.table)
1027
require(stringr)
1028
require(ggplot2)
1029
# line_info <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
1030
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
1031
mut <- fread("Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv")
1032
cnv <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv")
1033
exp <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
1034
prot <- fread("Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv")
1035
mirna <- fread("Data/DRP_Training_Data/DepMap_2019_miRNA.csv")
1036
metab <- fread("Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv")
1037
hist <- fread("Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv")
1038
rppa <- fread("Data/DRP_Training_Data/DepMap_2019_RPPA.csv")
1039
1040
bottleneck_keys <- Reduce(intersect, list(ctrp$ccl_name, mut$stripped_cell_line_name, cnv$stripped_cell_line_name,
1041
                                          exp$stripped_cell_line_name, prot$stripped_cell_line_name, mirna$stripped_cell_line_name,
1042
                                          metab$stripped_cell_line_name, hist$stripped_cell_line_name, rppa$stripped_cell_line_name)
1043
)
1044
1045
bottleneck_keys <- data.table(keys = bottleneck_keys)
1046
fwrite(bottleneck_keys, "Data/DRP_Training_Data/bottleneck_keys.csv")
1047
1048
bottleneck_keys <- Reduce(intersect, list(ctrp$ccl_name, mut$stripped_cell_line_name, cnv$stripped_cell_line_name,
1049
                                          exp$stripped_cell_line_name, prot$stripped_cell_line_name, mirna$stripped_cell_line_name,
1050
                                          metab$stripped_cell_line_name, hist$stripped_cell_line_name, rppa$stripped_cell_line_name)
1051
)
1052
1053
bottleneck_datatype_line_info <- datatype_line_info[stripped_cell_line_name %in% bottleneck_keys]
1054
bottleneck_datatype_line_info[, count_per_disease := .N, by = "primary_disease"]
1055
1056
ggplot(data = bottleneck_datatype_line_info) +
1057
  geom_bar(mapping = aes(x = reorder(primary_disease, -count_per_disease),
1058
                         fill = data_type), stat = "count", position = "dodge") +
1059
  xlab("Cancer Type") +
1060
  ylab("Number of Cell Lines") +
1061
  labs(fill = "Data Type") +
1062
  # ggtitle(label = "Proportion of Cancer Types in DepMap Data (Bottleneck Subset)", subtitle = "By data type, 21Q2 Version - Overlap with CTRPv2: 79%") +
1063
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
1064
        legend.position = c(.9,.55))
1065
1066
# ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Bottleneck_Proportion.pdf", device = "pdf")
1067
1068
1069
# Compare bottleneck to original data
1070
uniqueN(datatype_line_info$primary_disease)
1071
uniqueN(datatype_line_info$stripped_cell_line_name)
1072
uniqueN(bottleneck_datatype_line_info$primary_disease)
1073
uniqueN(bottleneck_datatype_line_info$stripped_cell_line_name)
1074
unique(bottleneck_datatype_line_info$stripped_cell_line_name)
1075
ctrp[ccl_name %in% unique(bottleneck_datatype_line_info$stripped_cell_line_name)]
1076
ctrp[ccl_name %in% unique(datatype_line_info$stripped_cell_line_name)] 
1077
nrow(ctrp[ccl_name %in% unique(bottleneck_datatype_line_info$stripped_cell_line_name)]) /
1078
nrow(ctrp[ccl_name %in% unique(datatype_line_info$stripped_cell_line_name)])   # 0.4038907
1079
1080
# Show cell line counts in bottleneck and original data side by side
1081
bottleneck_datatype_line_info$`Data Subset` <- "Bottleneck"
1082
datatype_line_info$`Data Subset` <- "Original"
1083
both_data_type_line_info <- rbindlist(list(bottleneck_datatype_line_info, datatype_line_info))
1084
1085
# Change bar order
1086
require(dplyr)
1087
bar_level_df <- data.frame(x1 = c("Original", "Bottleneck"))
1088
colnames(bar_level_df) <- "Data Subset"
1089
both_data_type_line_info <- left_join(bar_level_df,  
1090
                      both_data_type_line_info,
1091
                      by = "Data Subset")
1092
both_data_type_line_info <- as.data.table(both_data_type_line_info)
1093
1094
ggplot(data = both_data_type_line_info) +
1095
  geom_bar(mapping = aes(x = reorder(primary_disease, -count_per_disease),
1096
                         y = count_per_disease,
1097
                         fill = `Data Subset`), stat = "identity", position = "dodge") +
1098
  xlab("Cancer Type") +
1099
  ylab("Number of Cell Line Profiles") +
1100
  labs(fill = "Data Subset Type") +
1101
  # scale_x_discrete(limits = c("Original", "Bottleneck")) +
1102
  
1103
  # ggtitle(label = "Proportion of Cancer Types in DepMap Data (Bottleneck Subset)", subtitle = "By data type, 21Q2 Version - Overlap with CTRPv2: 79%") +
1104
  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1, face = "bold"),
1105
        axis.text.y = element_text(size = 14, face = "bold"),
1106
        axis.title = element_text(size = 16, face = "bold"),
1107
        legend.title = element_text(size = 16),
1108
        legend.text = element_text(size=14),
1109
        legend.position = c(.9,.55))
1110
1111
ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Bottleneck_Proportion_Comparison.pdf",
1112
       device = "pdf")
1113
1114
# both_data_type_line_info <- merge(bottleneck_datatype_line_info, datatype_line_info, by)
1115
1116
1117
1118
bottleneck_datatype_line_info
1119
# proteomics_datatype_line_info <- datatype_line_info[data_type == "Protein Quantification"]
1120
1121
# ggplot(data = proteomics_datatype_line_info) +
1122
#   geom_bar(mapping = aes(x = reorder(primary_disease, -count_per_disease),
1123
#                          fill = data_type), stat = "count", position = "dodge") +
1124
#   xlab("Primary Disease") +
1125
#   ylab("# of cell lines") +
1126
#   labs(fill = "Data Type") +
1127
#   ggtitle(label = "Proportion of Cancer Types in DepMap Data (Bottleneck Subset)", subtitle = "By data type, 21Q2 Version - Overlap with CTRPv2: 79%") +
1128
#   theme(axis.text.x = element_text(angle = 45, hjust = 1),
1129
#         legend.position = c(.9,.55))
1130
1131
# ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Bottleneck_Proportion.pdf", device = "pdf")
1132