Switch to unified view

a b/R/Complete_Sample_Prep-ML3867-FTAJ.R
1
# Complete_Sample_Prep.R
2
3
# This script is intended to pair genomics, transcriptomics, proteomics and drug response data
4
# mainly from the DepMap resource.
5
# Root of the project data tree; input and output paths below are built from it.
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
# Output directory for the cleaned training tables. `recursive = TRUE` creates
# missing parent directories and `showWarnings = FALSE` keeps re-runs quiet
# when the directory already exists.
dir.create(paste0(path, "Data/DRP_Training_Data"), recursive = TRUE, showWarnings = FALSE)

# library() stops right here if data.table is not installed; require() would
# only return FALSE and let the script fail later at the first fread() call.
library(data.table)
9
10
# ==== Cell line info cleanup ====
# Read the DepMap 21Q2 sample sheet and keep only the identifier and
# annotation columns used for machine learning.
line_info_cols <- c("DepMap_ID", "stripped_cell_line_name", "primary_disease",
                    "lineage", "lineage_subtype")
depmap_samples <- fread(paste0(path, "Data/DepMap/21Q2/sample_info.csv"))
depmap_samples <- depmap_samples[, line_info_cols, with = FALSE]

# Store the reduced sheet next to the other training tables.
fwrite(depmap_samples, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
16
17
# depmap_samples <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
18
# ==== Expression data cleanup ====
# Turn the DepMap 21Q2 expression matrix into a table keyed by cell line name.
ccle_exp <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_expression.csv"))
dim(ccle_exp)
ccle_exp[1:5, 1:20]

# Reduce every header to its first word (the HGNC symbol) by deleting
# everything from the first space onward; the first column is the DepMap ID.
names(ccle_exp) <- gsub(" .+", "", names(ccle_exp))
setnames(ccle_exp, 1L, "DepMap_ID")

# Attach cell line name and primary disease, then drop the DepMap ID.
# merge() defaults to an inner join, so lines absent from the sample sheet
# are discarded.
exp_annot <- depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")]
ccle_exp <- merge(ccle_exp, exp_annot, by = "DepMap_ID")
set(ccle_exp, j = "DepMap_ID", value = NULL)
ccle_exp[1:5, 1:20]

# Column layout: the two annotation columns first, then the remaining columns
# in alphabetical order.
exp_id_cols <- c("stripped_cell_line_name", "primary_disease")
exp_gene_cols <- sort(names(ccle_exp)[!names(ccle_exp) %in% exp_id_cols])
setcolorder(ccle_exp, neworder = c(exp_id_cols, exp_gene_cols))
ccle_exp[1:5, 1:20]

# Save
fwrite(ccle_exp, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Expression.csv"), sep = ',')

# ccle_exp <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Expression.csv"))

ccle_exp
# DIMENSIONS OF EXPRESSION DATA: 1375 X 19178
rm(ccle_exp)
43
# ==== Copy number data cleanup ====
# Same treatment as the expression table, applied to gene-level copy number.
ccle_cn <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_gene_copy_number.csv"))
dim(ccle_cn)
ccle_cn[1:5, 1:10]

# Keep only the first word of each header (the HGNC symbol); the first column
# is the DepMap ID.
names(ccle_cn) <- gsub(" .+", "", names(ccle_cn))
setnames(ccle_cn, 1L, "DepMap_ID")

# Bring in cell line name and primary disease (inner join on DepMap_ID), then
# remove the DepMap ID itself.
cn_annot <- depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")]
ccle_cn <- merge(ccle_cn, cn_annot, by = "DepMap_ID")
set(ccle_cn, j = "DepMap_ID", value = NULL)

# Annotation columns first, remaining columns alphabetically.
cn_id_cols <- c("stripped_cell_line_name", "primary_disease")
cn_gene_cols <- sort(names(ccle_cn)[!names(ccle_cn) %in% cn_id_cols])
setcolorder(ccle_cn, neworder = c(cn_id_cols, cn_gene_cols))
ccle_cn[1:5, 1:20]
dim(ccle_cn)

# Save
fwrite(ccle_cn, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv"), sep = ',')

# DIMENSIONS OF COPY NUMBER DATA: 1740 X 27563
rm(ccle_cn)
gc()
64
# ==== Proteomic data cleanup ====
# Build cell-line-by-protein tables from the DepMap 20Q2 normalized protein
# quantification file: one with every protein, and one restricted to proteins
# that have no missing value in any sample.

# Transpose a protein-by-sample table and attach DepMap annotations.
#   prot    : data.table whose first column is "Uniprot_Acc" and whose other
#             columns are named by cell line.
#   samples : sample sheet with DepMap_ID, stripped_cell_line_name and
#             primary_disease columns.
# Returns a data.table with one row per sample that matched the sample sheet
# (inner join), annotation columns first, followed by one column per protein.
prot_by_cell_line <- function(prot, samples) {
    wide <- transpose(prot, make.names = "Uniprot_Acc")
    # Row i of the transposed table must hold the values of input column i + 1.
    # all.equal() treats NAs in matching positions as equal, whereas the
    # elementwise `==` used previously returned NA there and was never checked.
    stopifnot(
        isTRUE(all.equal(as.numeric(unlist(wide[1, ])), as.numeric(prot[[2]]))),
        isTRUE(all.equal(as.numeric(unlist(wide[2, ])), as.numeric(prot[[3]])))
    )
    # Rows of the transposed table are in the order of the input sample columns
    wide[, stripped_cell_line_name := colnames(prot)[-1]]
    wide <- merge(wide,
                  samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
                  by = "stripped_cell_line_name")
    setcolorder(wide, neworder = c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
    wide
}

ccle_prot <- fread(paste0(path, "Data/DepMap/20Q2/CCLE_protein_quant_current_normalized.csv"))
dim(ccle_prot)
ccle_prot[1:5, 1:10]
ccle_prot[1:5, 48:60]

# Subset only the Uniprot accession (since it is unique, unlike HGNC) and the
# cell line experimental data.
# NOTE(review): column positions 6 and 49+ are hard-coded for this file layout.
ccle_prot <- ccle_prot[, c(6, 49:ncol(ccle_prot)), with = FALSE]
# Drop everything after the first underscore so sample columns carry only the
# cell line name; this can produce duplicated names (counted further below).
colnames(ccle_prot) <- gsub("\\_.+", "", colnames(ccle_prot))
colnames(ccle_prot)[1] <- "Uniprot_Acc"

# All proteins, transposed to match the other data type tables
prot_all <- prot_by_cell_line(ccle_prot, depmap_samples)
prot_all[1:5, 1:10]

# Save
fwrite(prot_all, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_ProteinQuant.csv"), sep = ',')

### Get proteins that are observed in all cell lines
# Remove every protein (row) that has an NA in any sample
prot_nona <- na.omit(ccle_prot)
stopifnot(!anyNA(prot_nona))
prot_complete <- prot_by_cell_line(prot_nona, depmap_samples)
prot_complete[1:5, 1:10]
# Now we have ~5000 proteins that are available in all samples
dim(prot_complete)

# We have 3 duplicates
sum(duplicated(prot_complete$stripped_cell_line_name))

# Save
fwrite(prot_complete, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"), sep = ',')
# DIMENSIONS OF PROTEIN QUANTITY DATA: 378 X 5155

# Read the saved table back and inspect it. The earlier
# `ccle_prot[, DepMap_ID := NULL]` was removed: it ran against the raw
# protein-by-sample table, which has no DepMap_ID column.
ccle_prot <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"))
dim(ccle_prot)
ccle_prot[1:5, 1:5]
anyDuplicated(ccle_prot$stripped_cell_line_name)
127
128
# ==== Mutation data cleanup ====
# Remove every object whose name contains "ccle" before loading the mutation
# calls, then re-establish the package and the data root.
rm(list = ls(pattern = "ccle"))
require(data.table)
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"

ccle_mut <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_mutations.csv"))
# Frequency of the hotspot flags and variant types, and number of cell lines
table(ccle_mut[["isCOSMIChotspot"]])
table(ccle_mut[["isTCGAhotspot"]])
table(ccle_mut[["Variant_Type"]])
uniqueN(ccle_mut[["DepMap_ID"]])

dim(ccle_mut)
ccle_mut[1L]
names(ccle_mut)
# Number of mutation records per cell line, kept next to the variant type
temp <- ccle_mut[, .(Variant_Type, DepMap_ID)]
temp[, nMut := .N, by = DepMap_ID]
temp
unique(temp[["Variant_Type"]])
# For simplicity, extract only SNP data for now: this discards ~90,000 mutations
# ccle_mut <- ccle_mut[Variant_Type == "SNP"]
dim(ccle_mut)
# How many distinct (cell line, locus) combinations are there?
t <- ccle_mut[, .(DepMap_ID, Chromosome, Strand, Start_position, End_position)]
dim(unique(t))
uniqueN(ccle_mut[["DepMap_ID"]])
# Keep relevant columns/features
# Aside: Should the sequence change be provided, or just whether the SNP is deleterious or not?
mut_feature_cols <- c(
    "DepMap_ID", "Hugo_Symbol", "Chromosome", "Start_position", "End_position", "Strand",
    "Variant_Classification", "Variant_Type", "isDeleterious",
    "isTCGAhotspot", "isCOSMIChotspot", "Genome_Change", "cDNA_Change"
)
ccle_mut <- ccle_mut[, mut_feature_cols, with = FALSE]
dim(ccle_mut)
uniqueN(ccle_mut[["DepMap_ID"]])
table(ccle_mut[["isDeleterious"]])
table(ccle_mut[["isTCGAhotspot"]])
table(ccle_mut[["isCOSMIChotspot"]])
163
164
# ==== CCLE Mut Overlap with COSMIC CGC ====
# Option 1: restrict to mutations in genes that COSMIC considers important,
# like another paper in the field (~500 genes).
# Option 2: a binary vector over genes marking a deleterious mutation, which
# would result in ~20,000 parameters.
uniqueN(ccle_mut[["Hugo_Symbol"]])

# Number of genes carrying at least one flagged mutation, per flag
uniqueN(ccle_mut[isCOSMIChotspot == TRUE, Hugo_Symbol])
uniqueN(ccle_mut[isTCGAhotspot == TRUE, Hugo_Symbol])
uniqueN(ccle_mut[isDeleterious == TRUE, Hugo_Symbol])

tcga_hotspot_genes <- ccle_mut[isTCGAhotspot == TRUE, unique(Hugo_Symbol)]
# Read COSMIC Cancer Gene Census data
cgc <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/COSMIC/cancer_gene_census.csv")
dim(cgc)
cgc[1:5, 1:20]
length(unique(cgc[["Gene Symbol"]]))
length(unique(cgc[["HGVSG"]]))
# Get Genes in this census
cgc_genes <- unique(cgc[["Gene Symbol"]])
cgc[Tier == 1]
length(unique(cgc[["Genome Location"]]))  # 922,732
# rm(cgc)

# Subset DepMap mutations based on the CGC genes
ccle_genes <- unique(ccle_mut[["Hugo_Symbol"]])
sum(ccle_genes %in% cgc_genes)
ccle_mut <- ccle_mut[Hugo_Symbol %in% cgc_genes]
uniqueN(ccle_mut[["DepMap_ID"]])

# Deleterious and missense calls that remain after the gene filter
sum(ccle_mut[["isDeleterious"]])
ccle_mut[Variant_Classification == "Missense_Mutation"]
uniqueN(ccle_mut[isDeleterious == TRUE, Hugo_Symbol])
ccle_mut[isDeleterious == TRUE]
197
198
199
# TODO: Use CGC to check for overlap with CCLE cell lines, then collapse to whether each of the 700 genes for
# that cell line has a mutation listed in the CGC
length(unique(cgc$`Mutation genome position`))  # ~922,000 unique mutations
# NCBI_Build is not among the feature columns kept when ccle_mut was reduced
# above, so `ccle_mut$NCBI_Build` would silently be NULL; only inspect the
# build when the column is still available.
if ("NCBI_Build" %in% names(ccle_mut)) unique(ccle_mut$NCBI_Build)  # CCLE is with GRCh 37
unique(cgc$GRCh)  # CGC has GRCh 38
# We must "lift over" the mutations from 37 to 38 before checking for overlap
if (!requireNamespace("liftOver", quietly = TRUE)) {
    BiocManager::install("liftOver")
}
# Attach both packages unconditionally: import.chain() comes from rtracklayer,
# which was previously only attached inside the "liftOver missing" branch.
library(liftOver)
library(rtracklayer)
# liftOver requires a chain file to convert 37 to 38: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/

chain_path <- paste0(path, "Data/hg19ToHg38.over.chain")
grch_37_38_chain <- import.chain(chain_path)

# Must add "chr" to start of chromosome names
ccle_mut$Chromosome <- paste0("chr", ccle_mut$Chromosome)
# Must convert positions to GRanges
ccle_mut_gr <- makeGRangesFromDataFrame(df = ccle_mut, keep.extra.columns = TRUE,
                                        seqnames.field = "Chromosome", start.field = "Start_position",
                                        end.field = "End_position", strand.field = "Strand")
length(unique(ccle_mut_gr$DepMap_ID))

# Lift over
lifted_ccle_mut <- liftOver(x = ccle_mut_gr, chain = grch_37_38_chain)
# Convert GRangesList to GRanges
lifted_ccle_mut <- unlist(lifted_ccle_mut)
# Convert back to data.table
lifted_ccle_mut <- as.data.table(lifted_ccle_mut)
# Note: Genome_Change is now out of date!
# Remove chr from seqnames
lifted_ccle_mut$seqnames <- gsub("chr", "", lifted_ccle_mut$seqnames)
# Can find the overlap of Mutation genome position in CGC with a newly created column based on CCLE positions
lifted_ccle_mut[, Mutation_Position := paste0(seqnames, ':', start, '-', end)]

# Same "chrom:start-end" key for the original (not lifted) coordinates
ccle_mut$seqnames <- gsub("chr", "", ccle_mut$Chromosome)
ccle_mut[, Mutation_Position := paste0(seqnames, ':', as.character(Start_position), '-', as.character(End_position))]

length(unique(lifted_ccle_mut$DepMap_ID))

cgc_locations <- unique(cgc$`Genome Location`)
sum(ccle_mut$Mutation_Position %in% cgc_locations)

# Now find the overlap with CGC (which already has GRCh38).
# Named `cgc_overlap` rather than `subset` to avoid masking base::subset().
cgc_overlap <- lifted_ccle_mut[Mutation_Position %in% cgc_locations]
table(cgc_overlap$Variant_Type)
length(unique(cgc_overlap$DepMap_ID))
# IMPORTANT! There is a loss of 8 cell lines (which do not have a mutation that is in
# CGC) using the Tier 1 data only
249
250
# Alternative (March 2021) ====
# Take those mutations that are COSMIC or TCGA hotspots, ignoring CGC positions.
# Named `hotspot_muts` rather than `subset` to avoid masking base::subset().
hotspot_muts <- ccle_mut[isTCGAhotspot | isCOSMIChotspot]

### Create a vector of mutation counts per gene for each cell line
length(unique(hotspot_muts$Hugo_Symbol))
# One row per cell line, one column per gene; each cell is the number of
# hotspot mutation records for that (cell line, gene) pair.
sub_dcast <- dcast(data = hotspot_muts[, c("DepMap_ID", "Hugo_Symbol")],
                   formula = DepMap_ID ~ Hugo_Symbol, fun.aggregate = length, value.var = "DepMap_ID")
dim(sub_dcast)
sub_dcast[1:5, 1:50]
sum(sub_dcast$A1BG)
sum(sub_dcast$A1CF)

# Attach cell line name and primary disease (inner join on DepMap_ID)
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
sub_dcast <- merge(sub_dcast, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
                   by = "DepMap_ID")
setcolorder(sub_dcast, c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
sub_dcast[1:5, 1:50]

# Save
fwrite(sub_dcast, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv"), sep = ',')
# The checks below previously referenced `cgc_muts`, which is never created in
# this script (only in commented-out code); they now inspect the saved table.
dim(sub_dcast)
sub_dcast[1:5, 1:5]
typeof(sub_dcast[[4L]])  # first gene column, after the three annotation columns

# Read the file back through the same root it was written to
temp <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv"))
dim(temp)
temp[1:5, 1:50]
278
279
# # Attach the cell line name and primary disease
280
# # cgc_muts <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"))
281
# depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
282
# cgc_muts <- merge(cgc_muts, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
283
#                   by = "DepMap_ID")
284
# setcolorder(cgc_muts, neworder = c("stripped_cell_line_name", colnames(cgc_muts)[-ncol(cgc_muts)]))
285
# 
286
# # Save
287
# fwrite(cgc_muts, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"), sep = ',')
288
# cgc_muts <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"))
289
# cgc_muts[1:5, 1:5]
290
# cgc_muts[, DepMap_ID := NULL]
291
# DIMENSIONS OF CGC MUTATIONAL DATA: 1733 X 697
292
293
294
295
# ==== miRNA Data Cleanup ====
# Reshape the CCLE miRNA .gct table (one row per miRNA, one column per sample)
# into one row per cell line and attach the DepMap annotations.
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
library(data.table)
# Read the sample sheet through `path`, like the file that wrote it
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))

ccle_mirna <- fread(paste0(path, "Data/DepMap/Extra/CCLE_miRNA_20181103.gct"))
dim(ccle_mirna)
anyNA(ccle_mirna)
ccle_mirna[1:5, 1:5]

# Value range of the measurement columns (everything after the first two)
min(ccle_mirna[, -c(1:2)], na.rm = TRUE)
max(ccle_mirna[, -c(1:2)], na.rm = TRUE)

# Sample identifiers (become the rows) and duplicated feature labels
# (Description becomes the header after transposition)
names(ccle_mirna)
sum(duplicated(ccle_mirna$Description))

# Transpose in one step: drop the first column, use `Description` as the new
# header and keep the old column names (sample identifiers) as the first
# column. The previous approach transposed the identifier columns together
# with the measurements and then promoted a data row to the header, which
# coerced every measurement to character.
ccle_mirna <- transpose(ccle_mirna[, -1], keep.names = "stripped_cell_line_name",
                        make.names = "Description")
dim(ccle_mirna)
ccle_mirna[1:5, 1:5]

# Clean cell line name: keep only the part before the first underscore
ccle_mirna[, stripped_cell_line_name := gsub(pattern = "\\_.+", replacement = "", stripped_cell_line_name)]
ccle_mirna <- merge(ccle_mirna, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
                    by = "stripped_cell_line_name")
dim(ccle_mirna)
ccle_mirna[1:5, 1:5]
setcolorder(ccle_mirna, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

fwrite(ccle_mirna, paste0(path, "Data/DRP_Training_Data/DepMap_2019_miRNA.csv"), sep = ',')

rm(ccle_mirna)
330
331
332
# ==== Metabolomics Data Cleanup ====
# Attach DepMap annotations to the CCLE metabolomics table and save it keyed
# by cell line name.
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
ccle_metab <- fread(paste0(path, "Data/DepMap/Extra/CCLE_metabolomics_20190502.csv"))

dim(ccle_metab)
ccle_metab[1:5, 1:5]

# Value range of the measurement columns (everything after the first two)
min(ccle_metab[, -c(1:2)], na.rm = TRUE)
max(ccle_metab[, -c(1:2)], na.rm = TRUE)

anyNA(ccle_metab)
sum(is.na(ccle_metab))
# (row, col) position of every NA. A data.table cannot be indexed with this
# matrix directly (`DT[matrix]` is an error), so show the affected rows.
na_pos <- which(is.na(ccle_metab), arr.ind = TRUE)
na_pos
ccle_metab[unique(na_pos[, "row"])]
ccle_metab[554, 2]  # DepMap_ID is NA

min(ccle_metab[, -c(1:2)])
# Drop the first column and join the annotations on DepMap_ID (inner join)
ccle_metab <- merge(ccle_metab[, -1], depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
                    by = "DepMap_ID")

setcolorder(ccle_metab, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
ccle_metab$DepMap_ID <- NULL
dim(ccle_metab)

fwrite(ccle_metab, paste0(path, "Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv"), sep = ',')
rm(ccle_metab)
358
# ==== RPPA Data Cleanup ====
# Load the CCLE RPPA table and take a first look at it.
ccle_rppa <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RPPA_20181003.csv")
dim(ccle_rppa)
ccle_rppa[1:5, 1:5]
anyNA(ccle_rppa)

# Value range of everything after the first two columns
min(ccle_rppa[, -(1:2)], na.rm = TRUE)  # has negative values
max(ccle_rppa[, -(1:2)], na.rm = TRUE)

# The first column (V1) holds the sample identifier: keep only the part before
# the first underscore and use it as the cell line name.
ccle_rppa[, V1 := gsub(pattern = "\\_.+", replacement = "", x = V1)]
setnames(ccle_rppa, 1L, "stripped_cell_line_name")
rppa_annot <- depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")]
ccle_rppa <- merge(ccle_rppa, rppa_annot, by = "stripped_cell_line_name")
dim(ccle_rppa)
ccle_rppa[1:5, 1:10]
# Annotation columns go first
setcolorder(ccle_rppa, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

fwrite(ccle_rppa, paste0(path, "Data/DRP_Training_Data/DepMap_2019_RPPA.csv"), sep = ',')

rm(ccle_rppa)
378
379
# ==== Chromatin Profiling Data Cleanup ====
# Load the CCLE global chromatin profiling table and inspect it.
ccle_chrom <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_GlobalChromatinProfiling_20181130.csv")
dim(ccle_chrom)
ccle_chrom[1:5, 1:10]

# Value range of everything after the first two columns
min(ccle_chrom[, -(1:2)], na.rm = TRUE)  # has negative values
max(ccle_chrom[, -(1:2)], na.rm = TRUE)

anyNA(ccle_chrom)
sum(is.na(ccle_chrom))  # 842 NA values

# Indices of the columns that contain at least one NA
unique(which(is.na(ccle_chrom), arr.ind = TRUE)[, 2])
length(unique(which(is.na(ccle_chrom), arr.ind = TRUE)[, 2]))  # 26 columns have NAs

# Convert NA to 0, in place, in exactly those columns
chrom_na_cols <- unique(which(is.na(ccle_chrom), arr.ind = TRUE)[, 2])
setnafill(ccle_chrom, fill = 0, cols = chrom_na_cols)
anyNA(ccle_chrom)

# Keep only the part of CellLineName before the first underscore, then rename
# the first column to the join key.
# NOTE(review): assumes CellLineName is the first column - confirm.
ccle_chrom$CellLineName <- gsub(pattern = "\\_.+", replacement = "", x = ccle_chrom$CellLineName)
setnames(ccle_chrom, 1L, "stripped_cell_line_name")
dim(ccle_chrom)
chrom_annot <- depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")]
ccle_chrom <- merge(ccle_chrom, chrom_annot, by = "stripped_cell_line_name")
ccle_chrom$BroadID <- NULL
dim(ccle_chrom)
ccle_chrom[1:5, 1:10]
# Annotation columns go first
setcolorder(ccle_chrom, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

fwrite(ccle_chrom, paste0(path, "Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv"), sep = ',')

rm(ccle_chrom)
411
412
# ==== Fusion Data Cleanup ====
# Load the CCLE gene fusion calls and summarise them.
ccle_fusion <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_fusions.csv")
dim(ccle_fusion)
ccle_fusion[1:5, 1:17]
length(unique(ccle_fusion[["FusionName"]]))
length(unique(ccle_fusion[["DepMap_ID"]]))
unique(ccle_fusion[["SpliceType"]])
quantile(ccle_fusion[["FFPM"]])

# NOTE(review): these steps mirror the chromatin section (CellLineName /
# BroadID, first column taken as the cell line name). This table is also read
# via DepMap_ID above - confirm it really has a CellLineName first column
# before trusting the join below.
ccle_fusion$CellLineName <- gsub(pattern = "\\_.+", replacement = "", x = ccle_fusion$CellLineName)
names(ccle_fusion)[1] <- "stripped_cell_line_name"
dim(ccle_fusion)
fusion_annot <- depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")]
ccle_fusion <- merge(ccle_fusion, fusion_annot, by = "stripped_cell_line_name")
ccle_fusion$BroadID <- NULL
dim(ccle_fusion)
ccle_fusion[1:5, 1:10]
# Annotation columns go first
setcolorder(ccle_fusion, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

fwrite(ccle_fusion, paste0(path, "Data/DRP_Training_Data/DepMap_2019_GeneFusion.csv"), sep = ',')

rm(ccle_fusion)
434
435
# ==== Exon Usage Ratio Data Cleanup ====
# Work in progress: the result is not written to disk (fwrite is commented out).
require(data.table)
setDTthreads(8)
ccle_exon <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RNAseq_ExonUsageRatio_20180929.gct")
dim(ccle_exon)
ccle_exon[1:10, 1:17]

# NOTE(review): the transposed table is only displayed, never assigned.
transpose(ccle_exon, keep.names = "exon")
# NOTE(review): FusionName / DepMap_ID / SpliceType / FFPM / CellLineName /
# BroadID are the column names used in the fusion and chromatin sections -
# confirm that they exist in this file.
length(unique(ccle_exon[["FusionName"]]))
length(unique(ccle_exon[["DepMap_ID"]]))
unique(ccle_exon[["SpliceType"]])
quantile(ccle_exon[["FFPM"]])

# Strip everything after the first underscore and use the first column as the
# cell line name for the join with the sample sheet.
ccle_exon$CellLineName <- gsub(pattern = "\\_.+", replacement = "", x = ccle_exon$CellLineName)
names(ccle_exon)[1] <- "stripped_cell_line_name"
dim(ccle_exon)
exon_annot <- depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")]
ccle_exon <- merge(ccle_exon, exon_annot, by = "stripped_cell_line_name")
ccle_exon$BroadID <- NULL
dim(ccle_exon)
ccle_exon[1:5, 1:10]
# Annotation columns go first
setcolorder(ccle_exon, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

# fwrite(ccle_exon, paste0(path, "Data/DRP_Training_Data/DepMap_2019_ExonUsageRatio.csv"), sep = ',')

rm(ccle_exon)
461
462
# ==== RRBS Profiling Data Cleanup ====
require(data.table)
setDTthreads(8)

# === TSS
# Read the TSS (1kb) methylation table, drop its second column, transpose it
# and promote the first transposed row to the header.
ccle_tss <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RRBS_TSS1kb_20181022.txt")
dim(ccle_tss)
ccle_tss[1:5, 1:5]
length(unique(ccle_tss[["cluster_id"]]))

ccle_tss <- transpose(ccle_tss[, -2], keep.names = "cluster_id")
names(ccle_tss) <- unlist(ccle_tss[1L])
ccle_tss <- ccle_tss[-1L]

# === Promoter
# NOTE(review): the file name below stops at "CCLE" and the result overwrites
# `ccle_tss` - this part looks unfinished.
ccle_tss <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE")
dim(ccle_tss)
ccle_tss[1:5, 1:5]
length(unique(ccle_tss[["cluster_id"]]))

ccle_tss <- transpose(ccle_tss[, -2], keep.names = "cluster_id")
names(ccle_tss) <- unlist(ccle_tss[1L])
ccle_tss <- ccle_tss[-1L]
485
# === Enhancers
486
487
# ==== Drug Sensitivity Data Cleanup ====
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
library(data.table)
library(webchem)
# BiocManager::install("ChemmineR")
# ChemmineR is not used in the visible part of this script, so it is still
# loaded on a best-effort basis.
require(ChemmineR)
# SECURITY: the ChemSpider API key used to be hard-coded here. Read it from
# the environment instead (e.g. CHEMSPIDER_KEY=... in ~/.Renviron) and revoke
# the key that was committed.
options(chemspider_key = Sys.getenv("CHEMSPIDER_KEY"))
# Previously prepared drug response tables, read through `path` like the rest
# of the inputs. NOTE(review): the same files are (re)written further below,
# so they must already exist from an earlier run.
ctrp <- fread(paste0(path, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt"))
gdsc1 <- fread(paste0(path, "Data/DRP_Training_Data/GDSC1_AUC_SMILES.txt"))
gdsc2 <- fread(paste0(path, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt"))
498
499
500
# Clean up duplicate with missing pubchem
cpd_info_1 <- fread(paste0(path, "Data/GDSC/GDSC1_Drug_Info.csv"))
# Show every drug whose name maps to more than one PubChem entry. The previous
# lookup indexed with anyDuplicated(), which is 0 when there is no duplicate
# and then compared drug_name against an empty vector.
name_cid_1 <- unique(cpd_info_1[, c("drug_name", "pubchem")])
cpd_info_1[drug_name %in% name_cid_1[duplicated(drug_name), drug_name]]
# Drop the drug IDs judged redundant. `!is.na()` keeps the behaviour of the
# original `drug_id != x` filters, which also dropped rows with a missing ID.
dup_drug_ids_1 <- c(476, 1490, 1496, 1386, 1402, 1393)
cpd_info_1 <- cpd_info_1[!is.na(drug_id) & !drug_id %in% dup_drug_ids_1]
nrow(cpd_info_1[pubchem == "-"])
sum(cpd_info_1$drug_name %in% unique(ctrp$cpd_name))
# Subset for valid pubchem IDs (rows with a missing value are dropped too, as before)
cpd_info_1 <- cpd_info_1[!is.na(pubchem) & !pubchem %in% c("-", "none", "several")]
# NOTE(review): any remaining non-numeric entry becomes NA here (with a warning)
cpd_info_1$pubchem <- as.numeric(cpd_info_1$pubchem)

# Fetch canonical SMILES from PubChem and attach them by compound ID
cpd_1_smiles <- webchem::pc_prop(cid = cpd_info_1$pubchem, properties = "CanonicalSMILES")
cpd_info_1 <- merge(cpd_info_1, cpd_1_smiles, by.x = "pubchem", by.y = "CID")
# Save (through `path`, like the input file)
fwrite(cpd_info_1, paste0(path, "Data/GDSC/GDSC1_VALID_Drug_Info.csv"))
521
522
523
cpd_info_2 <- fread(paste0(path, "Data/GDSC/GDSC2_Drug_Info.csv"))
# Show every drug whose name maps to more than one PubChem entry (robust to
# the case where there is none).
name_cid_2 <- unique(cpd_info_2[, c("drug_name", "pubchem")])
cpd_info_2[drug_name %in% name_cid_2[duplicated(drug_name), drug_name]]
# Drop the drug IDs judged redundant; `!is.na()` keeps the behaviour of the
# original `drug_id != x` filters, which also dropped rows with a missing ID.
dup_drug_ids_2 <- c(1811, 1806, 1819, 1816)
cpd_info_2 <- cpd_info_2[!is.na(drug_id) & !drug_id %in% dup_drug_ids_2]
# Entries that list two PubChem IDs: keep the first one. `:=` updates the
# matching rows in place; the previous `DT[i]$pubchem <- value` form rebuilt
# the table through a temporary copy, and one of the three replacements was
# listed twice.
cpd_info_2[pubchem == "25227436, 42602260", pubchem := "25227436"]
cpd_info_2[pubchem == "11719003, 58641927", pubchem := "11719003"]
cpd_info_2[pubchem == "66577015, 16654980", pubchem := "66577015"]

nrow(cpd_info_2[pubchem == "-"])
# Share of GDSC2 compounds whose PubChem ID also appears in GDSC1
sum(cpd_info_2$pubchem %in% cpd_info_1$pubchem) / nrow(cpd_info_2)

# Subset for valid pubchem IDs (rows with a missing value are dropped too, as before)
cpd_info_2 <- cpd_info_2[!is.na(pubchem) & !pubchem %in% c("-", "none", "several")]

cpd_info_2$pubchem <- as.numeric(cpd_info_2$pubchem)

# Fetch canonical SMILES from PubChem and attach them by compound ID
cpd_2_smiles <- webchem::pc_prop(cid = cpd_info_2$pubchem, properties = "CanonicalSMILES")
cpd_info_2 <- merge(cpd_info_2, cpd_2_smiles, by.x = "pubchem", by.y = "CID")
# Save (through `path`, like the input file)
fwrite(cpd_info_2, paste0(path, "Data/GDSC/GDSC2_VALID_Drug_Info.csv"))
547
548
549
# NOTE(review): this reads the 20Q2 line info, whereas the top of this script
# writes DepMap_21Q2_Line_Info.csv - confirm which release is intended.
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))

# ==== GDSC ====
library(stringr)
gdsc1 <- fread(paste0(path, "Data/GDSC/GDSC1_Fitted_Dose_Response.csv"))
# Share of GDSC1 cell line names found in DepMap: as-is, case-insensitive, and
# case-insensitive with hyphens removed
gdsc1_lines <- unique(gdsc1$CELL_LINE_NAME)
sum(gdsc1_lines %in% depmap_samples$stripped_cell_line_name) / length(gdsc1_lines)  # 0.22
sum(toupper(gdsc1_lines) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(gdsc1_lines)  # 0.24
sum(str_remove_all(toupper(gdsc1_lines), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(gdsc1_lines)  # 0.9696049

dim(gdsc1)  # 310K Combinations
colnames(gdsc1)
sum(gdsc1$AUC == 0)
min(gdsc1$AUC)
max(gdsc1$AUC)

# Count unique combinations in GDSC1
gdsc1_pairs <- unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")])
length(unique(gdsc1_pairs$CELL_LINE_NAME))  # 987
length(unique(gdsc1_pairs$DRUG_NAME))  # 345
nrow(gdsc1_pairs) # 292,849

# Pair every (drug, cell line, AUC) record with the SMILES of the drug; drugs
# without a valid PubChem entry drop out of this inner join.
gdsc1_final <- merge(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME", "AUC")]), unique(cpd_info_1[, c("drug_name", "CanonicalSMILES")]), by.x = "DRUG_NAME", by.y = "drug_name")
colnames(gdsc1_final) <- c("cpd_name", "ccl_name", "area_under_curve", "cpd_smiles")
# Save (through `path`, like the inputs)
fwrite(gdsc1_final, paste0(path, "Data/DRP_Training_Data/GDSC1_AUC_SMILES.txt"))

# This line previously referenced `gdsc1_pubchem`, which is never created.
unique(gdsc1$DRUG_NAME)
# gdsc1_cs_ids <- webchem::get_csid(query = unique(gdsc1$DRUG_NAME), from = "name", match = "all", verbose = T)
# Resolve drug names to SMILES (stray trailing comma in the call removed)
gdsc1_cs_ids <- webchem::cir_query(identifier = unique(gdsc1$DRUG_NAME), representation = "smiles", verbose = TRUE)
578
579
# Count unique combinations in GDSC2
gdsc2 <- fread(paste0(path, "Data/GDSC/GDSC2_Fitted_Dose_Response.csv"))
# Share of GDSC2 cell line names found in DepMap: as-is, case-insensitive, and
# case-insensitive with hyphens removed
gdsc2_lines <- unique(gdsc2$CELL_LINE_NAME)
depmap_lines_upper <- toupper(depmap_samples$stripped_cell_line_name)
sum(gdsc2_lines %in% depmap_samples$stripped_cell_line_name) / length(gdsc2_lines)  # 0.2311496
sum(toupper(gdsc2_lines) %in% depmap_lines_upper) / length(gdsc2_lines)  # 0.2546354
sum(str_remove_all(toupper(gdsc2_lines), "-") %in% depmap_lines_upper) / length(gdsc2_lines)  # 0.9678616

# Resolve drug names to SMILES
gdsc2_cpd_smiles <- webchem::cir_query(identifier = unique(gdsc2$DRUG_NAME), representation = "smiles", verbose = TRUE)

dim(gdsc2)  # 135K Combinations
names(gdsc2)
gdsc2_pairs <- unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")])
length(unique(gdsc2_pairs$CELL_LINE_NAME))  # 809
length(unique(gdsc2_pairs$DRUG_NAME))  # 192
nrow(gdsc2_pairs) # 131,108

# Pair every (drug, cell line, AUC) record with the SMILES of the drug
gdsc2_auc <- unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME", "AUC")])
gdsc2_smiles <- unique(cpd_info_2[, c("drug_name", "CanonicalSMILES")])
gdsc2_final <- merge(gdsc2_auc, gdsc2_smiles, by.x = "DRUG_NAME", by.y = "drug_name")
names(gdsc2_final) <- c("cpd_name", "ccl_name", "area_under_curve", "cpd_smiles")
# Save
fwrite(gdsc2_final, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")

# Count overlap of drugs and cell lines
sum(unique(gdsc1$DRUG_NAME) %in% gdsc2$DRUG_NAME) # Drug Overlap: 88
sum(unique(gdsc1$CELL_LINE_NAME) %in% gdsc2$CELL_LINE_NAME)  # Cell Line Overlap: 808
602
603
# ==== CTRP ====
library(data.table)
library(stringr)  # str_remove_all() is used below
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"

# NOTE: Newer and better AUC calculation in PharmacoGx.R file!
ctrp_curves <- fread(paste0(path, "Data/CTRP/v20.data.curves_post_qc.txt"))
exper_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_experiment.txt"))
cell_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_cell_line.txt"))
table(cell_data$ccl_availability)

# Merge sensitivity, experimental and cell line data
temp <- merge(unique(ctrp_curves[, c("experiment_id", "master_cpd_id")]),
              unique(exper_data[, c("experiment_id", "master_ccl_id")]),
              by = "experiment_id")
ctrp <- merge(temp, cell_data[, c("master_ccl_id", "ccl_name")], by = "master_ccl_id")
# Share of CTRP cell line names found in DepMap: as-is, case-insensitive, and
# case-insensitive with hyphens removed
ctrp_lines <- unique(ctrp$ccl_name)
sum(ctrp_lines %in% depmap_samples$stripped_cell_line_name) / length(ctrp_lines)  # 0.9492672
sum(toupper(ctrp_lines) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(ctrp_lines)  # 0.9492672
sum(str_remove_all(toupper(ctrp_lines), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(ctrp_lines)  # 0.9503946

# Add compound information
cpd_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_compound.txt"))
ctrp <- merge(ctrp, cpd_data[, c("master_cpd_id", "cpd_name", "cpd_smiles")], by = "master_cpd_id")

# Add AUC curve information. This is the curves file already loaded above, so
# reuse it instead of reading it from disk a second time.
ctrp_auc <- ctrp_curves

ctrp <- merge(ctrp, ctrp_auc[, c("experiment_id", "master_cpd_id", "area_under_curve")], by = c("experiment_id", "master_cpd_id"))

# Save
fwrite(ctrp, paste0(path, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt"))

# Add primary disease information. NOTE: This removes some DR data as 45 cell lines in CTRPv2 cannot be paired with DepMap!!!
# NOTE(review): the table read here is CTRP_AAC_SMILES.txt, not the
# CTRP_AUC_SMILES.txt written just above - presumably it comes from
# PharmacoGx.R; confirm.
line_info <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
ctrp <- fread(paste0(path, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt"))

sum(unique(ctrp$ccl_name) %in% unique(line_info$stripped_cell_line_name))  # 150

# Normalised join key: upper case with ALL hyphens removed. str_replace() only
# removed the first hyphen, so names containing several hyphens could not match
# (the overlap checks above already use str_remove_all()).
line_info$other_ccl_name <- str_remove_all(toupper(line_info$stripped_cell_line_name), "-")
ctrp$other_ccl_name <- str_remove_all(toupper(ctrp$ccl_name), "-")

# NOTE(review): if two DepMap lines share a normalised name, this inner join
# duplicates the matching CTRP rows - verify uniqueness of other_ccl_name.
ctrp <- merge(ctrp, line_info[, c("other_ccl_name", "primary_disease")], by = "other_ccl_name")
ctrp$other_ccl_name <- NULL
setcolorder(ctrp, neworder = c("cpd_name", "ccl_name", "primary_disease", "area_under_curve", "cpd_smiles"))

fwrite(ctrp, paste0(path, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt"))

# Experiment ID
unique(ctrp[, c("master_ccl_id", "experiment_id")])
length(unique(ctrp$master_ccl_id))
length(unique(ctrp$experiment_id))
length(unique(ctrp$ccl_name))
length(unique(ctrp$master_cpd_id))

# Check overlap with GDSC 1 and 2
sum(unique(ctrp$ccl_name) %in% gdsc1$CELL_LINE_NAME)
sum(unique(ctrp$ccl_name) %in% gdsc2$CELL_LINE_NAME)

dim(ctrp)  # 395K Combinations
664
665
666
667
# ==== Chemical Data Cleanup ====
require(data.table)
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"

# ChEMBL 27 chemical representations table
chembl_file <- paste0(path, "Data/chembl_27_chemreps.txt")
chembl <- fread(chembl_file)
672
673
674
675
# ==== EDA ======
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(data.table)
library(stringr)
library(ggplot2)

# Load the prepared training tables. All paths are relative to the working
# directory.
# NOTE(review): these are 20Q2 files, while the top of this script writes
# DepMap_21Q2_* files -- confirm which release the EDA is meant to use.
line_info <- fread("Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv")
ctrp <- fread("Data/DRP_Training_Data/CTRP_AUC_SMILES.txt")
gdsc2 <- fread("Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")
# `exp` shares its name with base::exp(); calls to exp() still resolve to the
# function, but the name is kept only because later code refers to it.
exp <- fread("Data/DRP_Training_Data/DepMap_20Q2_Expression.csv")
mut <- fread("Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv")
cnv <- fread("Data/DRP_Training_Data/DepMap_20Q2_CopyNumber.csv")
prot <- fread("Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv")

# Cell line annotation table: keep the CTRPv2 and CCLE identifier columns and
# only the rows where both identifiers are present.
pdb_table <- fread("Data/cell_annotation_table_1.1.1.csv")
pdb_sub <- pdb_table[, c("CTRPv2.cellid", "CCLE.cellid")]
pdb_sub <- pdb_sub[!is.na(CTRPv2.cellid) & !is.na(CCLE.cellid)]
# Peek at the top-left corner of the expression table.
exp[1:5, 1:5]

# Number of distinct CTRP cell lines and compounds.
length(unique(ctrp$ccl_name))
length(unique(ctrp$cpd_name))

# Fraction of distinct CTRP cell line names found verbatim in the line info.
sum(unique(ctrp$ccl_name) %in% line_info$stripped_cell_line_name) / length(unique(ctrp$ccl_name))

# Same fraction after normalising the CTRP names: upper case, with every
# hyphen removed (str_replace() would only strip the first hyphen).
ccl_names <- toupper(ctrp$ccl_name)
ccl_names <- unique(str_replace_all(ccl_names, "-", ""))
length(ccl_names)

sum(ccl_names %in% line_info$stripped_cell_line_name) / length(ccl_names)

# Lines in the line info without a normalised CTRP name, then two spot checks
# on CTRP rows whose name contains the given pattern.
line_info[!(stripped_cell_line_name %in% ccl_names)]
ctrp[ccl_name %like% "NIHOVCAR3"]
ctrp[ccl_name %like% "HEL"]

# Number of expression rows whose cell line name appears among the CCLE ids
# of the annotation table.
sum(exp$stripped_cell_line_name %in% pdb_sub$CCLE.cellid)
# Remove hyphens and convert all to upper case
# str_replace_all() is required here: str_replace() only removes the first
# hyphen of each name, leaving names with several hyphens unmatched.
pdb_ccl_names <- str_replace_all(toupper(pdb_sub$CCLE.cellid), "-", "")

# The CTRP table itself is updated in place so later filters on ccl_name use
# the normalised names.
ctrp$ccl_name <- str_replace_all(toupper(ctrp$ccl_name), "-", "")

exp_ccl_names <- str_replace_all(toupper(exp$stripped_cell_line_name), "-", "")
mut_ccl_names <- str_replace_all(toupper(mut$stripped_cell_line_name), "-", "")
cnv_ccl_names <- str_replace_all(toupper(cnv$stripped_cell_line_name), "-", "")

# Expression rows matching a CTRP / annotation-table name, relative to the
# number of distinct names in that reference set.
sum(exp_ccl_names %in% ccl_names) / length(unique(ccl_names))
sum(exp_ccl_names %in% pdb_ccl_names) / length(unique(pdb_ccl_names))

# Number of mutation rows matching a CTRP / annotation-table name. The
# original divided by the reference-set size and multiplied by it again,
# which is just this count.
sum(mut_ccl_names %in% ccl_names)
sum(mut_ccl_names %in% pdb_ccl_names)

# CTRP rows whose cell line has mutation data, using either reference set.
ctrp[ccl_name %in% mut_ccl_names[mut_ccl_names %in% ccl_names]]   ### 302K!!!!! Not 144K
ctrp[ccl_name %in% mut_ccl_names[mut_ccl_names %in% pdb_ccl_names]]   ### 302K!!!!! Not 144K

# Copy-number vs CTRP overlap, then expression vs copy-number overlap in both
# directions.
sum(cnv_ccl_names %in% ccl_names) / length(unique(ccl_names))
sum(exp_ccl_names %in% cnv_ccl_names) / length(unique(exp_ccl_names))
sum(cnv_ccl_names %in% exp_ccl_names) / length(unique(cnv_ccl_names))
# Output folders for the EDA figures (relative to the working directory).
# showWarnings = FALSE keeps re-runs quiet when the folders already exist.
dir.create(path = "Plots", showWarnings = FALSE)
dir.create(path = "Plots/DepMap", showWarnings = FALSE)

# Bar chart: number of cell lines per primary disease across the whole
# line info table.
ggplot(data = line_info) +
  geom_bar(mapping = aes(x = primary_disease), stat = "count") +
  xlab("Primary Disease") +
  ylab("# of cell lines") +
  ggtitle(label = "Proportion of Cancer Types in DepMap Data (overall)", subtitle = "20Q2 Version") +
  # Tilt the disease labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# No `plot` argument: ggsave() writes the most recently created plot.
ggsave(filename = "Plots/DepMap/DepMap_Cell_Lines_Proportion.pdf", device = "pdf")
# Peek at the protein table and list the cell lines it covers.
prot[, 1:5]
unique(prot$stripped_cell_line_name)

# Normalise the cell line names of every data table in place: upper case,
# with every hyphen removed (str_replace() would only strip the first one).
mut$stripped_cell_line_name <- str_replace_all(toupper(mut$stripped_cell_line_name), "-", "")
cnv$stripped_cell_line_name <- str_replace_all(toupper(cnv$stripped_cell_line_name), "-", "")
exp$stripped_cell_line_name <- str_replace_all(toupper(exp$stripped_cell_line_name), "-", "")
prot$stripped_cell_line_name <- str_replace_all(toupper(prot$stripped_cell_line_name), "-", "")
ctrp$ccl_name <- str_replace_all(toupper(ctrp$ccl_name), "-", "")

# Rows of `line_info` (name and primary disease only) for the cell lines
# listed in `cell_names`, tagged with `label` in a new `data_type` column.
lines_with_data <- function(cell_names, label) {
  keep <- unique(cell_names)
  out <- line_info[stripped_cell_line_name %in% keep,
                   c("stripped_cell_line_name", "primary_disease")]
  out$data_type <- label
  out
}

# One annotated subset per data type; the individual tables are kept because
# the Venn diagram further down reads their cell line names.
mut_line_info <- lines_with_data(mut$stripped_cell_line_name, "Mutational")
cnv_line_info <- lines_with_data(cnv$stripped_cell_line_name, "Copy Number")
exp_line_info <- lines_with_data(exp$stripped_cell_line_name, "Gene Expression")
prot_line_info <- lines_with_data(prot$stripped_cell_line_name, "Protein Quantification")
ctrp_line_info <- lines_with_data(ctrp$ccl_name, "Dose-Response")

# Long table with one row per (cell line, data type), used for the grouped
# bar chart.
datatype_line_info <- rbindlist(list(mut_line_info, cnv_line_info, exp_line_info, prot_line_info, ctrp_line_info))
# Grouped bar chart: number of cell lines per primary disease, with one bar
# per data type placed side by side.
# NOTE(review): the "79%" overlap in the subtitle is hard-coded, not computed
# here -- confirm it still matches the data.
ggplot(data = datatype_line_info) +
  geom_bar(mapping = aes(x = primary_disease, fill = data_type), stat = "count", position = "dodge") +
  xlab("Primary Disease") +
  ylab("# of cell lines") +
  labs(fill = "Data Type") +
  ggtitle(label = "Proportion of Cancer Types in DepMap Data", subtitle = "By data type, 20Q2 Version - Overlap with CTRPv2: 79%") +
  # Tilt the disease labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# No `plot` argument: ggsave() writes the most recently created plot.
ggsave(filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Proportion.pdf", device = "pdf")
# Install VennDiagram only when it is missing, instead of reinstalling it on
# every run of the script.
if (!requireNamespace("VennDiagram", quietly = TRUE)) {
  BiocManager::install("VennDiagram")
}
# library() stops with an error when the package is missing, whereas
# require() only returns FALSE.
library(VennDiagram)
library(RColorBrewer)

# One pastel fill colour per set (five sets).
myCol <- brewer.pal(5, "Pastel2")

# NOTE: The CTRPv2 here is from before ctrp was merged with cell line info to add primary disease!
# Five-set Venn diagram of the cell lines covered by each data type, written
# straight to a PNG file.
venn.diagram(
  x = list(
    mut_line_info$stripped_cell_line_name,
    cnv_line_info$stripped_cell_line_name,
    exp_line_info$stripped_cell_line_name,
    prot_line_info$stripped_cell_line_name,
    unique(ctrp$ccl_name)
  ),
  category.names = c("Mutational", "Copy Number", "Gene Expression", "Protein Quantification", "CTRPv2 Dose-Response"),
  filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Venn.png",
  imagetype = "png",
  output = TRUE,
  height = 3000,
  width = 3000,
  resolution = 600,
  # Circles
  lwd = 2,
  # lty = 'blank',
  fill = myCol,
  # Numbers
  cex = .6,
  fontface = "bold",
  fontfamily = "sans",
  # Set names
  cat.cex = 0.6,
  cat.fontface = "bold",
  cat.default.pos = "outer",
  cat.pos = c(0, 0, -130, 150, 0),
  cat.dist = c(0.2, 0.2, 0.2, 0.2, 0.2),
  # Last argument: no trailing comma, which would pass an empty argument.
  cat.fontfamily = "sans"
  # rotation = 1
)