|
a |
|
b/R/Complete_Sample_Prep-ML3867-FTAJ.R |
|
|
1 |
# Complete_Sample_Prep.R |
|
|
2 |
|
|
|
3 |
# This script is intended to pair genomics, transcriptomics, proteomics and drug response data |
|
|
4 |
# mainly from the DepMap resource. |
|
|
5 |
# Root of the project's data tree. Every input/output location below is built
# with paste0(path, ...), so the trailing slash must be kept.
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
# Folder that receives all cleaned training tables. recursive = TRUE also
# creates a missing parent folder; showWarnings = FALSE keeps re-runs quiet
# when the folder already exists.
dir.create(paste0(path, "Data/DRP_Training_Data"),
           showWarnings = FALSE, recursive = TRUE)

# library() stops right here if data.table is not installed; require() would
# only return FALSE and let the script fail later at the first fread().
library(data.table)
|
|
9 |
|
|
|
10 |
# ==== Cell line info cleanup ====
# Read the DepMap 21Q2 sample sheet
depmap_samples <- fread(paste0(path, "Data/DepMap/21Q2/sample_info.csv"))
# Keep only the identifier and annotation columns relevant for machine learning
depmap_samples <- depmap_samples[
  ,
  .SD,
  .SDcols = c("DepMap_ID", "stripped_cell_line_name", "primary_disease",
              "lineage", "lineage_subtype")
]

# Store the reduced sample sheet; later sections read it back from this file
fwrite(depmap_samples,
       file = paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))

# depmap_samples <- fread("Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv")
|
|
18 |
# ==== Expression data cleanup ====
# Load the DepMap 21Q2 expression table and take a first look at it
ccle_exp <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_expression.csv"))
dim(ccle_exp)
ccle_exp[1:5, 1:20]
# Reduce every column name to its first word (the HGNC name) by removing
# everything from the first space onwards, then name the ID column explicitly
setnames(ccle_exp, sub(" .+", "", names(ccle_exp)))
setnames(ccle_exp, 1L, "DepMap_ID")
# Attach cell line name and primary disease via the DepMap ID, which is not
# needed afterwards and is removed
ccle_exp <- merge(
  x = ccle_exp,
  y = depmap_samples[, .(DepMap_ID, stripped_cell_line_name, primary_disease)],
  by = "DepMap_ID"
)
set(ccle_exp, j = "DepMap_ID", value = NULL)
ccle_exp[1:5, 1:20]

# Sort all columns alphabetically first; setcolorder() then moves the columns
# it is given to the front and leaves the remaining ones in their current order
setcolorder(ccle_exp, sort(names(ccle_exp)))
setcolorder(ccle_exp, c("stripped_cell_line_name", "primary_disease"))
ccle_exp[1:5, 1:20]

# Save
fwrite(ccle_exp,
       file = paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Expression.csv"),
       sep = ",")

# ccle_exp <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Expression.csv"))

ccle_exp
# DIMENSIONS OF EXPRESSION DATA: 1375 X 19178
rm(ccle_exp)
|
|
43 |
# ==== Copy number data cleanup ====
# Load the DepMap 21Q2 gene-level copy number table and take a first look
ccle_cn <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_gene_copy_number.csv"))
dim(ccle_cn)
ccle_cn[1:5, 1:10]
# Reduce every column name to its first word (the HGNC name) by removing
# everything from the first space onwards, then name the ID column explicitly
setnames(ccle_cn, sub(" .+", "", names(ccle_cn)))
setnames(ccle_cn, 1L, "DepMap_ID")
# Attach cell line name and primary disease via the DepMap ID, which is not
# needed afterwards and is removed
ccle_cn <- merge(
  x = ccle_cn,
  y = depmap_samples[, .(DepMap_ID, stripped_cell_line_name, primary_disease)],
  by = "DepMap_ID"
)
set(ccle_cn, j = "DepMap_ID", value = NULL)

# Alphabetical column order, then the two annotation columns moved to the front
setcolorder(ccle_cn, sort(names(ccle_cn)))
setcolorder(ccle_cn, c("stripped_cell_line_name", "primary_disease"))
ccle_cn[1:5, 1:20]
dim(ccle_cn)
# Save
fwrite(ccle_cn,
       file = paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv"),
       sep = ",")

# DIMENSIONS OF COPY NUMBER DATA: 1740 X 27563
rm(ccle_cn)
gc()
|
|
64 |
# ==== Proteomic data cleanup ====
ccle_prot <- fread(paste0(path, "Data/DepMap/20Q2/CCLE_protein_quant_current_normalized.csv"))
dim(ccle_prot)
ccle_prot[1:5, 1:10]
ccle_prot[1:5, 48:60]
# Subset only the Uniprot accession (since it is unique, unlike HGNC) and the
# cell line experimental data (column 6 and columns 49 onwards)
ccle_prot <- ccle_prot[, c(6, 49:ncol(ccle_prot)), with = FALSE]
# Strip everything from the first underscore so only the cell line name remains
colnames(ccle_prot) <- gsub("\\_.+", "", colnames(ccle_prot))
colnames(ccle_prot)[1] <- "Uniprot_Acc"

# Turn a protein-by-cell-line table (first column "Uniprot_Acc", one column per
# cell line) into a cell-line-by-protein table annotated with DepMap sample
# information.
#   prot:    data.table as described above
#   samples: data.table with DepMap_ID, stripped_cell_line_name, primary_disease
# Returns the transposed table with the three annotation columns in front.
# Cell lines without a match in `samples` are dropped by the merge.
pair_protein_quant <- function(prot, samples) {
  by_line <- transpose(prot, make.names = "Uniprot_Acc")
  # Row i of the transposed table must hold the values of sample column i + 1
  # of the input; abort instead of printing thousands of TRUE/NA values.
  stopifnot(
    isTRUE(all.equal(as.numeric(unlist(by_line[1, ])), as.numeric(prot[[2]]))),
    isTRUE(all.equal(as.numeric(unlist(by_line[2, ])), as.numeric(prot[[3]])))
  )
  # Rows of the transposed table follow the order of the input sample columns
  by_line[, stripped_cell_line_name := colnames(prot)[-1]]
  by_line <- merge(
    by_line,
    samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
    by = "stripped_cell_line_name"
  )
  setcolorder(by_line, neworder = c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
  by_line
}

# Transpose to match the other data type tables and add the sample annotations
# (named prot_by_line rather than `t`, which would shadow base::t)
prot_by_line <- pair_protein_quant(ccle_prot, depmap_samples)
prot_by_line[1:5, 1:10]

# Save
fwrite(prot_by_line, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_ProteinQuant.csv"), sep = ",")


### Get proteins that are observed in all cell lines
# Remove every protein (row) that has an NA in any cell line
prot_nona <- na.omit(ccle_prot)
stopifnot(!anyNA(prot_nona))
# Same transposed, annotated table as above, restricted to complete proteins
prot_by_line <- pair_protein_quant(prot_nona, depmap_samples)
prot_by_line[1:5, 1:10]
# Now we have ~5000 proteins that are available in all samples
dim(prot_by_line)

# We have 3 duplicates
sum(duplicated(prot_by_line$stripped_cell_line_name))

# Save
fwrite(prot_by_line, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"), sep = ",")
# ccle_prot <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"))
dim(ccle_prot)
ccle_prot[1:5, 1:5]
# With the fread() above commented out, ccle_prot is still the raw
# protein-by-cell-line table and has no DepMap_ID column; only drop the column
# when the saved table was actually loaded.
if ("DepMap_ID" %in% colnames(ccle_prot)) {
  ccle_prot[, DepMap_ID := NULL]
}
# fwrite(ccle_prot, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"), sep = ',')
# DIMENSIONS OF PROTEIN QUANTITY DATA: 378 X 5155

# Read the saved table back and report the position of the first duplicated
# cell line name (0 if there is none)
ccle_prot <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv"))
anyDuplicated(ccle_prot$stripped_cell_line_name)
|
|
127 |
|
|
|
128 |
# ==== Mutation data cleanup ====
# Remove every object whose name contains "ccle" before loading the mutations
rm(list = ls(pattern = "ccle"))
# library() fails immediately when the package is missing (require() would not)
library(data.table)
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"

ccle_mut <- fread(paste0(path, "Data/DepMap/21Q2/CCLE_mutations.csv"))
table(ccle_mut$isCOSMIChotspot)
table(ccle_mut$isTCGAhotspot)
table(ccle_mut$Variant_Type)
length(unique(ccle_mut$DepMap_ID))

dim(ccle_mut)
ccle_mut[1, ]
colnames(ccle_mut)
# Look at the genome build while the column is still present: NCBI_Build is
# not among the columns kept below (returns NULL if the file has no such column)
unique(ccle_mut$NCBI_Build)
# Calculate number of mutations per cell line
temp <- ccle_mut[, c("Variant_Type", "DepMap_ID")]
temp[, nMut := .N, by = "DepMap_ID"]
temp
unique(temp$Variant_Type)
# For simplicity, extract only SNP data for now: this discards ~90,000 mutations
# ccle_mut <- ccle_mut[Variant_Type == "SNP"]
dim(ccle_mut)
# Number of distinct (cell line, genomic position) combinations
# (named mut_positions rather than `t`, which would shadow base::t)
mut_positions <- ccle_mut[, c("DepMap_ID", "Chromosome", "Strand", "Start_position", "End_position")]
dim(unique(mut_positions))
length(unique(ccle_mut$DepMap_ID))
# Keep relevant columns/features
# Aside: Should the sequence change be provided, or just whether the SNP is deleterious or not?
ccle_mut <- ccle_mut[, c("DepMap_ID", "Hugo_Symbol", "Chromosome", "Start_position", "End_position", "Strand",
                         "Variant_Classification", "Variant_Type", "isDeleterious",
                         "isTCGAhotspot", "isCOSMIChotspot", "Genome_Change", "cDNA_Change")]
dim(ccle_mut)
length(unique(ccle_mut$DepMap_ID))
table(ccle_mut$isDeleterious)
table(ccle_mut$isTCGAhotspot)
table(ccle_mut$isCOSMIChotspot)
|
|
163 |
|
|
|
164 |
# ==== CCLE Mut Overlap with COSMIC CGC ====
# Option 1: restrict to mutations in genes that COSMIC considers important, as
# another paper in the field did (~500 genes)
# Option 2: a binary vector over genes marking a deleterious mutation, which
# results in ~20,000 parameters
uniqueN(ccle_mut$Hugo_Symbol)

# Number of genes carrying at least one hotspot / deleterious mutation
ccle_mut[isCOSMIChotspot == TRUE, uniqueN(Hugo_Symbol)]
ccle_mut[isTCGAhotspot == TRUE, uniqueN(Hugo_Symbol)]
ccle_mut[isDeleterious == TRUE, uniqueN(Hugo_Symbol)]

tcga_hotspot_genes <- ccle_mut[isTCGAhotspot == TRUE, unique(Hugo_Symbol)]
# Read COSMIC Cancer Gene Census data
cgc <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/COSMIC/cancer_gene_census.csv")
dim(cgc)
cgc[1:5, 1:20]
uniqueN(cgc$`Gene Symbol`)
uniqueN(cgc$HGVSG)
# Get Genes in this census
cgc_genes <- unique(cgc$`Gene Symbol`)
cgc[Tier == 1]
uniqueN(cgc$`Genome Location`) # 922,732
# rm(cgc)

# Subset DepMap mutations based on the CGC genes: first count the mutated
# genes that are in the census, then keep only mutations in those genes
length(intersect(ccle_mut$Hugo_Symbol, cgc_genes))
ccle_mut <- ccle_mut[Hugo_Symbol %in% cgc_genes]
uniqueN(ccle_mut$DepMap_ID)

sum(ccle_mut$isDeleterious)
ccle_mut[Variant_Classification == "Missense_Mutation"]
ccle_mut[isDeleterious == TRUE, uniqueN(Hugo_Symbol)]
ccle_mut[isDeleterious == TRUE]
|
|
197 |
|
|
|
198 |
|
|
|
199 |
# TODO: Use CGC to check for overlap with CCLE cell lines, then collapse to whether each of the 700 genes for
# that cell line has a mutation listed in the CGC
length(unique(cgc$`Mutation genome position`)) # ~922,000 unique mutations
# NOTE: NCBI_Build is not among the columns kept when ccle_mut was reduced to
# its relevant features, so this returns NULL here; inspect the raw mutation
# table to confirm the build.
unique(ccle_mut$NCBI_Build) # CCLE is with GRCh 37
unique(cgc$GRCh) # CGC has GRCh 38
# We must "lift over" the mutations from 37 to 38 before checking for overlap
if (!requireNamespace("liftOver", quietly = TRUE)) {
  BiocManager::install("liftOver")
}
# Attach both packages unconditionally: import.chain() comes from rtracklayer
# and is needed whether or not liftOver had to be installed first.
library(liftOver)
library(rtracklayer)
# liftOver requires a chain file to convert 37 to 38: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/

chain_path <- paste0(path, "Data/hg19ToHg38.over.chain")
grch_37_38_chain <- import.chain(chain_path)

# Must add "chr" to start of chromosome names
ccle_mut$Chromosome <- paste0("chr", ccle_mut$Chromosome)
# Must convert positions to GRanges
ccle_mut_gr <- makeGRangesFromDataFrame(df = ccle_mut, keep.extra.columns = TRUE,
                                        seqnames.field = "Chromosome", start.field = "Start_position",
                                        end.field = "End_position", strand.field = "Strand")
length(unique(ccle_mut_gr$DepMap_ID))

# Lift over
lifted_ccle_mut <- liftOver(x = ccle_mut_gr, chain = grch_37_38_chain)
# Convert GRangesList to GRanges
lifted_ccle_mut <- unlist(lifted_ccle_mut)
# Convert back to data.table
lifted_ccle_mut <- as.data.table(lifted_ccle_mut)
# Note: Genome_Change is now out of date!
# Remove chr from seqnames
lifted_ccle_mut$seqnames <- gsub("chr", "", lifted_ccle_mut$seqnames)
# Can find the overlap of Mutation genome position in CGC with a newly created column based on CCLE positions
lifted_ccle_mut[, Mutation_Position := paste0(seqnames, ':', start, '-', end)]

# Same position key on the original (not lifted) coordinates, for comparison
ccle_mut$seqnames <- gsub("chr", "", ccle_mut$Chromosome)
ccle_mut[, Mutation_Position := paste0(seqnames, ':', as.character(Start_position), '-', as.character(End_position))]


length(unique(lifted_ccle_mut$DepMap_ID))

# Number of original-coordinate positions that appear in the CGC table
sum(ccle_mut$Mutation_Position %in% unique(cgc$`Genome Location`))

# Now find the overlap with CGC (which already has GRCh38)
subset <- lifted_ccle_mut[Mutation_Position %in% unique(cgc$`Genome Location`)]
table(subset$Variant_Type)
length(unique(subset$DepMap_ID))
# IMPORTANT! There is a loss of 8 cell lines (which do not have a mutation that is in
# CGC) using the Tier 1 data only
|
|
249 |
|
|
|
250 |
# Alternative (March 2021) ====
# Take those mutations that are COSMIC or TCGA hotspots, ignoring CGC
subset <- ccle_mut[isTCGAhotspot | isCOSMIChotspot]

### Create a vector of mutations for each cell line: one row per cell line,
### one column per gene, each cell counting that line's hotspot mutations
length(unique(subset$Hugo_Symbol))
sub_dcast <- dcast.data.table(data = subset[, c("DepMap_ID", "Hugo_Symbol")],
                              formula = DepMap_ID ~ Hugo_Symbol, fun.aggregate = length, value.var = "DepMap_ID")
dim(sub_dcast)
sub_dcast[1:5, 1:50]
sum(sub_dcast$A1BG)
sum(sub_dcast$A1CF)

# Attach cell line name and primary disease and move the ID columns to the front
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
sub_dcast <- merge(sub_dcast, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
                   by = "DepMap_ID")
setcolorder(sub_dcast, c("DepMap_ID", "stripped_cell_line_name", "primary_disease"))
sub_dcast[1:5, 1:50]

# Save
fwrite(sub_dcast, paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv"), sep = ",")
# These checks previously referenced `cgc_muts`, which no active line defines
# (it only appears in the commented-out code below), so they stopped the
# script; they now inspect the table that was just written.
dim(sub_dcast)
sub_dcast[1:5, 1:5]
typeof(sub_dcast[[4]]) # first gene column, after the three ID columns

# Read the saved table back from the same location it was written to
temp <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Mutations_by_Cell.csv"))
dim(temp)
temp[1:5, 1:50]

# # Attach the cell line name and primary disease
# # cgc_muts <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"))
# depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
# cgc_muts <- merge(cgc_muts, depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease")],
#                   by = "DepMap_ID")
# setcolorder(cgc_muts, neworder = c("stripped_cell_line_name", colnames(cgc_muts)[-ncol(cgc_muts)]))
#
# # Save
# fwrite(cgc_muts, paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"), sep = ',')
# cgc_muts <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv"))
# cgc_muts[1:5, 1:5]
# cgc_muts[, DepMap_ID := NULL]
# DIMENSIONS OF CGC MUTATIONAL DATA: 1733 X 697
|
|
292 |
|
|
|
293 |
|
|
|
294 |
|
|
|
295 |
# ==== miRNA Data Cleanup ====
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
library(data.table)
# Read the sample sheet from the folder it was written to, instead of relying
# on the working directory being `path`
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))

ccle_mirna <- fread(paste0(path, "Data/DepMap/Extra/CCLE_miRNA_20181103.gct"))
dim(ccle_mirna)
anyNA(ccle_mirna)
ccle_mirna[1:5, 1:5]

# Value range, ignoring the first two (non-measurement) columns
min(ccle_mirna[, -c(1:2)], na.rm = TRUE)
max(ccle_mirna[, -c(1:2)], na.rm = TRUE)

# Transpose so that each original column becomes a row; the original column
# names are kept in a new first column called "Name"
ccle_mirna <- transpose(ccle_mirna, keep.names = "Name")
dim(ccle_mirna)
ccle_mirna[1:5, 1:5]
ccle_mirna$Name
# Row 2 supplies the new column names below, so check it for duplicates
sum(duplicated(unlist(ccle_mirna[2, ])))
# Drop the row that came from the original first column ...
ccle_mirna <- ccle_mirna[-1, ]
ccle_mirna[1:5, 1:5]
# ... and promote the next row (from the original second column) to header
colnames(ccle_mirna) <- unlist(ccle_mirna[1, ])
ccle_mirna <- ccle_mirna[-1, ]

# Clean cell line name: strip everything from the first underscore
ccle_mirna$Description <- gsub(pattern = "\\_.+", replacement = "", ccle_mirna$Description)
colnames(ccle_mirna)[1] <- "stripped_cell_line_name"
ccle_mirna <- merge(ccle_mirna, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
                    by = "stripped_cell_line_name")
dim(ccle_mirna)
ccle_mirna[1:5, 1:5]
setcolorder(ccle_mirna, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

fwrite(ccle_mirna, paste0(path, "Data/DRP_Training_Data/DepMap_2019_miRNA.csv"), sep = ",")

rm(ccle_mirna)
|
|
330 |
|
|
|
331 |
|
|
|
332 |
# ==== Metabolomics Data Cleanup ====
# Read the sample sheet from the folder it was written to, instead of relying
# on the working directory being `path`
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_21Q2_Line_Info.csv"))
ccle_metab <- fread(paste0(path, "Data/DepMap/Extra/CCLE_metabolomics_20190502.csv"))

dim(ccle_metab)
ccle_metab[1:5, 1:5]

# Value range, ignoring the first two (identifier) columns
min(ccle_metab[, -c(1:2)], na.rm = TRUE)
max(ccle_metab[, -c(1:2)], na.rm = TRUE)

# Locate missing values
anyNA(ccle_metab)
sum(is.na(ccle_metab))
na_pos <- which(is.na(ccle_metab), arr.ind = TRUE)
na_pos
# A data.table cannot be indexed with a (row, col) matrix - that is an error -
# so show the complete rows that contain a missing value instead
ccle_metab[unique(na_pos[, "row"])]
ccle_metab[554, 2] # DepMap_ID is NA

min(ccle_metab[, -c(1:2)])
# Drop the first column and attach the sample annotations by DepMap ID; rows
# whose DepMap_ID has no match in the sample sheet are dropped by the merge
ccle_metab <- merge(ccle_metab[, -1], depmap_samples[, c("DepMap_ID", "stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
                    by = "DepMap_ID")

setcolorder(ccle_metab, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))
ccle_metab$DepMap_ID <- NULL
dim(ccle_metab)

fwrite(ccle_metab, paste0(path, "Data/DRP_Training_Data/DepMap_2019_Metabolomics.csv"), sep = ",")
rm(ccle_metab)
|
|
358 |
# ==== RPPA Data Cleanup ====
# Load the RPPA table and take a first look
ccle_rppa <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RPPA_20181003.csv")
dim(ccle_rppa)
ccle_rppa[1:5, 1:5]
anyNA(ccle_rppa)

# Value range over everything but the first two columns
min(ccle_rppa[, -c(1:2)], na.rm = TRUE) # has negative values
max(ccle_rppa[, -c(1:2)], na.rm = TRUE)

# The first column (V1) identifies the cell line: cut it at the first
# underscore and give it the name used in the sample sheet
ccle_rppa$V1 <- sub(pattern = "\\_.+", replacement = "", x = ccle_rppa$V1)
setnames(ccle_rppa, 1L, "stripped_cell_line_name")
# Attach disease and lineage annotations by cell line name
ccle_rppa <- merge(
  x = ccle_rppa,
  y = depmap_samples[, .(stripped_cell_line_name, primary_disease, lineage, lineage_subtype)],
  by = "stripped_cell_line_name"
)
dim(ccle_rppa)
ccle_rppa[1:5, 1:10]
# Annotation columns first, measurements after
setcolorder(ccle_rppa, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

fwrite(ccle_rppa,
       file = paste0(path, "Data/DRP_Training_Data/DepMap_2019_RPPA.csv"),
       sep = ",")

rm(ccle_rppa)
|
|
378 |
|
|
|
379 |
# ==== Chromatin Profiling Data Cleanup ====
# Load the global chromatin profiling table and take a first look
ccle_chrom <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_GlobalChromatinProfiling_20181130.csv")
dim(ccle_chrom)
ccle_chrom[1:5, 1:10]

# Value range over everything but the first two columns
min(ccle_chrom[, -c(1:2)], na.rm = TRUE) # has negative values
max(ccle_chrom[, -c(1:2)], na.rm = TRUE)


# Missing values: how many, and in which columns
anyNA(ccle_chrom)
sum(is.na(ccle_chrom)) # 842 NA values

na_cols <- unique(which(is.na(ccle_chrom), arr.ind = TRUE)[, 2])
na_cols
length(na_cols) # 26 columns have NAs

# Convert NA to 0 in exactly those columns (modifies ccle_chrom in place)
setnafill(ccle_chrom, fill = 0, cols = na_cols)
anyNA(ccle_chrom)

# Cut the cell line identifier at the first underscore and rename the first
# column to the name used in the sample sheet
ccle_chrom$CellLineName <- sub(pattern = "\\_.+", replacement = "", x = ccle_chrom$CellLineName)
setnames(ccle_chrom, 1L, "stripped_cell_line_name")
dim(ccle_chrom)
# Attach disease and lineage annotations by cell line name
ccle_chrom <- merge(
  x = ccle_chrom,
  y = depmap_samples[, .(stripped_cell_line_name, primary_disease, lineage, lineage_subtype)],
  by = "stripped_cell_line_name"
)
ccle_chrom$BroadID <- NULL
dim(ccle_chrom)
ccle_chrom[1:5, 1:10]
# Annotation columns first, measurements after
setcolorder(ccle_chrom, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

fwrite(ccle_chrom,
       file = paste0(path, "Data/DRP_Training_Data/DepMap_2019_ChromatinProfiling.csv"),
       sep = ",")

rm(ccle_chrom)
|
|
411 |
|
|
|
412 |
# ==== Fusion Data Cleanup ====
# Load the fusion calls and summarise them
ccle_fusion <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_fusions.csv")
dim(ccle_fusion)
ccle_fusion[1:5, 1:17]
uniqueN(ccle_fusion$FusionName)
uniqueN(ccle_fusion$DepMap_ID)
unique(ccle_fusion$SpliceType)
quantile(ccle_fusion$FFPM)

# Cut the cell line identifier at the first underscore, then rename the first
# column to the name used in the sample sheet
# NOTE(review): this assumes the first column is CellLineName - confirm
# against the file
ccle_fusion$CellLineName <- sub(pattern = "\\_.+", replacement = "", x = ccle_fusion$CellLineName)
setnames(ccle_fusion, 1L, "stripped_cell_line_name")
dim(ccle_fusion)
# Attach disease and lineage annotations by cell line name
ccle_fusion <- merge(
  x = ccle_fusion,
  y = depmap_samples[, .(stripped_cell_line_name, primary_disease, lineage, lineage_subtype)],
  by = "stripped_cell_line_name"
)
ccle_fusion$BroadID <- NULL
dim(ccle_fusion)
ccle_fusion[1:5, 1:10]
# Annotation columns first, fusion details after
setcolorder(ccle_fusion, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

fwrite(ccle_fusion,
       file = paste0(path, "Data/DRP_Training_Data/DepMap_2019_GeneFusion.csv"),
       sep = ",")

rm(ccle_fusion)
|
|
434 |
|
|
|
435 |
# ==== Exon Usage Ratio Data Cleanup ====
# library() fails immediately when the package is missing (require() would not)
library(data.table)
setDTthreads(8)
ccle_exon <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RNAseq_ExonUsageRatio_20180929.gct")
dim(ccle_exon)
ccle_exon[1:10, 1:17]

# Preview only: the transposed table is printed, not assigned
transpose(ccle_exon, keep.names = "exon")
# NOTE(review): the column names used from here on (FusionName, DepMap_ID,
# SpliceType, FFPM, CellLineName, BroadID) are the same ones the fusion section
# uses; presumably this part was copied from it and still has to be adapted to
# the exon usage table - confirm against the file before relying on it. The
# final fwrite() is still commented out, so nothing is saved.
length(unique(ccle_exon$FusionName))
length(unique(ccle_exon$DepMap_ID))
unique(ccle_exon$SpliceType)
quantile(ccle_exon$FFPM)

ccle_exon$CellLineName <- gsub(pattern = "\\_.+", replacement = "", ccle_exon$CellLineName)
colnames(ccle_exon)[1] <- "stripped_cell_line_name"
dim(ccle_exon)
ccle_exon <- merge(ccle_exon, depmap_samples[, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype")],
                   by = "stripped_cell_line_name")
ccle_exon$BroadID <- NULL
dim(ccle_exon)
ccle_exon[1:5, 1:10]
setcolorder(ccle_exon, c("stripped_cell_line_name", "primary_disease", "lineage", "lineage_subtype"))

# fwrite(ccle_exon, paste0(path, "Data/DRP_Training_Data/DepMap_2019_ExonUsageRatio.csv"), sep = ',')

rm(ccle_exon)
|
|
461 |
|
|
|
462 |
# ==== RRBS Profiling Data Cleanup ====
# library() fails immediately when the package is missing (require() would not)
library(data.table)
setDTthreads(8)

# === TSS
ccle_tss <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE_RRBS_TSS1kb_20181022.txt")
dim(ccle_tss)
ccle_tss[1:5, 1:5]
length(unique(ccle_tss$cluster_id))

# Drop the second column, transpose (original column names go into the new
# first column "cluster_id"), then promote the first row to header
ccle_tss <- transpose(ccle_tss[, -2], keep.names = "cluster_id")
colnames(ccle_tss) <- unlist(ccle_tss[1, ])
ccle_tss <- ccle_tss[-1, ]

# === Promoter
# NOTE(review): the file name below stops at ".../Extra/CCLE" and looks
# truncated - fill in the promoter file before running. The result also
# overwrites ccle_tss, and neither table is written to disk in this section.
ccle_tss <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/DepMap/Extra/CCLE")
dim(ccle_tss)
ccle_tss[1:5, 1:5]
length(unique(ccle_tss$cluster_id))

# Same reshaping as for the TSS table
ccle_tss <- transpose(ccle_tss[, -2], keep.names = "cluster_id")
colnames(ccle_tss) <- unlist(ccle_tss[1, ])
ccle_tss <- ccle_tss[-1, ]
# === Enhancers
|
|
486 |
|
|
|
487 |
# ==== Drug Sensitivity Data Cleanup ====
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
# library() fails immediately when a needed package is missing
library(data.table)
library(webchem)
# BiocManager::install("ChemmineR")
# ChemmineR is not called in this section, so it stays a non-fatal require()
require(ChemmineR)
# NOTE(review): this API key is committed in the source; consider rotating it
# and loading it from an environment variable instead.
options(chemspider_key = "N98K4aOip0VpcSc8F9GilqIIktLt0hux")
# Previously prepared AUC + SMILES tables, read from the training data folder
# under `path` rather than relative to the working directory
ctrp <- fread(paste0(path, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt"))
gdsc1 <- fread(paste0(path, "Data/DRP_Training_Data/GDSC1_AUC_SMILES.txt"))
gdsc2 <- fread(paste0(path, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt"))


# --- GDSC1 compounds ---
# Clean up duplicate with missing pubchem
cpd_info_1 <- fread(paste0(path, "Data/GDSC/GDSC1_Drug_Info.csv"))
# Show every drug name listed with more than one distinct pubchem entry
# (the previous anyDuplicated() lookup only surfaced the first such name)
dup_names_1 <- unique(cpd_info_1[, c("drug_name", "pubchem")])[, .N, by = drug_name][N > 1, drug_name]
cpd_info_1[drug_name %in% dup_names_1]
# Drug IDs removed by hand; rows with a missing drug_id are dropped as well,
# exactly as the chained `drug_id != x` filters did
drop_ids_1 <- c(476, 1490, 1496, 1386, 1402, 1393)
cpd_info_1 <- cpd_info_1[!is.na(drug_id) & !drug_id %in% drop_ids_1]
nrow(cpd_info_1[pubchem == "-"])
sum(cpd_info_1$drug_name %in% unique(ctrp$cpd_name))
# Subset for valid pubchem IDs
invalid_pubchem <- c("-", "none", "several")
cpd_info_1 <- cpd_info_1[!is.na(pubchem) & !pubchem %in% invalid_pubchem]
cpd_info_1$pubchem <- as.numeric(cpd_info_1$pubchem)

# Query PubChem for the canonical SMILES of every remaining compound
cpd_1_smiles <- webchem::pc_prop(cid = cpd_info_1$pubchem, properties = "CanonicalSMILES")
cpd_info_1 <- merge(cpd_info_1, cpd_1_smiles, by.x = "pubchem", by.y = "CID")
# Save
fwrite(cpd_info_1, paste0(path, "Data/GDSC/GDSC1_VALID_Drug_Info.csv"))


# --- GDSC2 compounds ---
cpd_info_2 <- fread(paste0(path, "Data/GDSC/GDSC2_Drug_Info.csv"))
dup_names_2 <- unique(cpd_info_2[, c("drug_name", "pubchem")])[, .N, by = drug_name][N > 1, drug_name]
cpd_info_2[drug_name %in% dup_names_2]
drop_ids_2 <- c(1811, 1806, 1819, 1816)
cpd_info_2 <- cpd_info_2[!is.na(drug_id) & !drug_id %in% drop_ids_2]
# Entries listing two pubchem IDs: keep the first ID (update by reference)
cpd_info_2[pubchem == "25227436, 42602260", pubchem := "25227436"]
cpd_info_2[pubchem == "11719003, 58641927", pubchem := "11719003"]
cpd_info_2[pubchem == "66577015, 16654980", pubchem := "66577015"]

nrow(cpd_info_2[pubchem == "-"])
# Share of GDSC2 compounds whose pubchem ID also appears in GDSC1
sum(cpd_info_2$pubchem %in% cpd_info_1$pubchem) / nrow(cpd_info_2)

cpd_info_2 <- cpd_info_2[!is.na(pubchem) & !pubchem %in% invalid_pubchem]

cpd_info_2$pubchem <- as.numeric(cpd_info_2$pubchem)

cpd_2_smiles <- webchem::pc_prop(cid = cpd_info_2$pubchem, properties = "CanonicalSMILES")
cpd_info_2 <- merge(cpd_info_2, cpd_2_smiles, by.x = "pubchem", by.y = "CID")
# Save
fwrite(cpd_info_2, paste0(path, "Data/GDSC/GDSC2_VALID_Drug_Info.csv"))


# NOTE(review): this reads the 20Q2 sample sheet, while the cell line info
# section of this script writes DepMap_21Q2_Line_Info.csv - confirm which
# release is intended here.
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
|
|
550 |
|
|
|
551 |
# ==== GDSC ==== |
|
|
552 |
require(stringr) |
|
|
553 |
gdsc1 <- fread(paste0(path, "Data/GDSC/GDSC1_Fitted_Dose_Response.csv")) |
|
|
554 |
sum(unique(gdsc1$CELL_LINE_NAME) %in% depmap_samples$stripped_cell_line_name) / length(unique(gdsc1$CELL_LINE_NAME)) # 0.22 |
|
|
555 |
sum(toupper(unique(gdsc1$CELL_LINE_NAME)) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc1$CELL_LINE_NAME)) # 0.24 |
|
|
556 |
sum(str_remove_all(toupper(unique(gdsc1$CELL_LINE_NAME)), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc1$CELL_LINE_NAME)) # 0.9696049 |
|
|
557 |
|
|
|
558 |
dim(gdsc1) # 310K Combinations |
|
|
559 |
colnames(gdsc1) |
|
|
560 |
sum(gdsc1$AUC == 0) |
|
|
561 |
min(gdsc1$AUC) |
|
|
562 |
max(gdsc1$AUC) |
|
|
563 |
|
|
|
564 |
# Count unique combinations in GDSC1 |
|
|
565 |
length(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")])$CELL_LINE_NAME)) # 987 |
|
|
566 |
length(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")])$DRUG_NAME)) # 345 |
|
|
567 |
nrow(unique(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME")]))) # 292,849 |
|
|
568 |
|
|
|
569 |
|
|
|
570 |
gdsc1_final <- merge(unique(gdsc1[, c("DRUG_NAME", "CELL_LINE_NAME", "AUC")]), unique(cpd_info_1[, c("drug_name", "CanonicalSMILES")]), by.x = "DRUG_NAME", by.y = "drug_name") |
|
|
571 |
colnames(gdsc1_final) <- c("cpd_name", "ccl_name", "area_under_curve", "cpd_smiles") |
|
|
572 |
# Save |
|
|
573 |
fwrite(gdsc1_final, "Data/DRP_Training_Data/GDSC1_AUC_SMILES.txt") |
|
|
574 |
|
|
|
575 |
unique(gdsc1_pubchem$DRUG_NAME) |
|
|
576 |
# gdsc1_cs_ids <- webchem::get_csid(query = unique(gdsc1$DRUG_NAME), from = "name", match = "all", verbose = T) |
|
|
577 |
gdsc1_cs_ids <- webchem::cir_query(identifier = unique(gdsc1$DRUG_NAME), representation = "smiles", verbose = T, ) |
|
|
578 |
|
|
|
579 |
# Count unique combinations in GDSC2 |
|
|
580 |
gdsc2 <- fread(paste0(path, "Data/GDSC/GDSC2_Fitted_Dose_Response.csv")) |
|
|
581 |
sum(unique(gdsc2$CELL_LINE_NAME) %in% depmap_samples$stripped_cell_line_name) / length(unique(gdsc2$CELL_LINE_NAME)) # 0.2311496 |
|
|
582 |
sum(toupper(unique(gdsc2$CELL_LINE_NAME)) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc2$CELL_LINE_NAME)) # 0.2546354 |
|
|
583 |
sum(str_remove_all(toupper(unique(gdsc2$CELL_LINE_NAME)), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(unique(gdsc2$CELL_LINE_NAME)) # 0.9678616 |
|
|
584 |
|
|
|
585 |
gdsc2_cpd_smiles <- webchem::cir_query(identifier = unique(gdsc2$DRUG_NAME), representation = "smiles", verbose = T) |
|
|
586 |
|
|
|
587 |
dim(gdsc2) # 135K Combinations |
|
|
588 |
colnames(gdsc2) |
|
|
589 |
length(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")])$CELL_LINE_NAME)) # 809 |
|
|
590 |
length(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")])$DRUG_NAME)) # 192 |
|
|
591 |
nrow(unique(unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME")]))) # 131,108 |
|
|
592 |
|
|
|
593 |
# Attach canonical SMILES to every unique GDSC2 (drug, cell line, AUC) row.
# merge() keeps only drugs that are present in both tables.
gdsc2_final <- merge(
  unique(gdsc2[, c("DRUG_NAME", "CELL_LINE_NAME", "AUC")]),
  unique(cpd_info_2[, c("drug_name", "CanonicalSMILES")]),
  by.x = "DRUG_NAME",
  by.y = "drug_name"
)
# Rename all four columns, in order, to the training-data naming scheme
setnames(gdsc2_final, c("cpd_name", "ccl_name", "area_under_curve", "cpd_smiles"))

# Save
fwrite(gdsc2_final, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")
|
|
597 |
|
|
|
598 |
|
|
|
599 |
# Count overlap of drugs and cell lines between GDSC1 and GDSC2
# (intersect() de-duplicates, so each shared name is counted once)
length(intersect(gdsc1$DRUG_NAME, gdsc2$DRUG_NAME)) # Drug Overlap: 88
length(intersect(gdsc1$CELL_LINE_NAME, gdsc2$CELL_LINE_NAME)) # Cell Line Overlap: 808
|
|
602 |
|
|
|
603 |
# ==== CTRP ====
# library() stops with an error when data.table is missing; require() would
# only return FALSE and let the script fail later at the first fread().
library(data.table)
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"

# NOTE: Newer and better AUC calculation in PharmacoGx.R file!
# Load the CTRP v20 curve table plus per-experiment and per-cell-line metadata
ctrp_curves <- fread(paste0(path, "Data/CTRP/v20.data.curves_post_qc.txt"))
exper_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_experiment.txt"))
cell_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_cell_line.txt"))
# Tabulate the values of the cell line availability column
table(cell_data$ccl_availability)
|
|
612 |
|
|
|
613 |
# Merge sensitivity, experimental and cell line data:
# (experiment, compound) pairs -> cell line id per experiment -> cell line name
temp <- merge(
  unique(ctrp_curves[, c("experiment_id", "master_cpd_id")]),
  unique(exper_data[, c("experiment_id", "master_ccl_id")]),
  by = "experiment_id"
)
ctrp <- merge(
  temp,
  cell_data[, c("master_ccl_id", "ccl_name")],
  by = "master_ccl_id"
)

# Fraction of CTRP cell line names found among DepMap's stripped names:
# exact, case-insensitive, and case-insensitive with all hyphens removed
ctrp_line_names <- unique(ctrp$ccl_name)
sum(ctrp_line_names %in% depmap_samples$stripped_cell_line_name) / length(ctrp_line_names) # 0.9492672
sum(toupper(ctrp_line_names) %in% toupper(depmap_samples$stripped_cell_line_name)) / length(ctrp_line_names) # 0.9492672
sum(str_remove_all(toupper(ctrp_line_names), "-") %in% toupper(depmap_samples$stripped_cell_line_name)) / length(ctrp_line_names) # 0.9503946
|
|
621 |
|
|
|
622 |
|
|
|
623 |
# Add compound information (name and SMILES) by compound id
cpd_data <- fread(paste0(path, "Data/CTRP/v20.meta.per_compound.txt"))
ctrp <- merge(ctrp, cpd_data[, c("master_cpd_id", "cpd_name", "cpd_smiles")], by = "master_cpd_id")

# Add AUC curve information.
# The AUC values live in v20.data.curves_post_qc.txt, which is already loaded
# as ctrp_curves, so reuse that table instead of parsing the file a second
# time. The ctrp_auc name is kept for any code that refers to it.
ctrp_auc <- ctrp_curves

ctrp <- merge(
  ctrp,
  ctrp_auc[, c("experiment_id", "master_cpd_id", "area_under_curve")],
  by = c("experiment_id", "master_cpd_id")
)

# Save
fwrite(ctrp, paste0(path, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt"))
|
|
635 |
|
|
|
636 |
# Add primary disease information. NOTE: This removes some DR data as 45 cell lines in CTRPv2 cannot be paired with DepMap!!!
# NOTE(review): these inputs are named DepMap_20Q2_Line_Info.csv and
# CTRP_AAC_SMILES.txt, whereas this script writes DepMap_21Q2_Line_Info.csv and
# CTRP_AUC_SMILES.txt -- confirm these are the intended files.
line_info <- fread("Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv")
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")

sum(unique(ctrp$ccl_name) %in% unique(line_info$stripped_cell_line_name)) # 150

# Build a normalized join key: upper case with EVERY hyphen removed.
# str_replace() only removes the first hyphen, so names with several hyphens
# would keep the rest; str_replace_all() strips them all on both sides.
line_info$other_ccl_name <- str_replace_all(toupper(line_info$stripped_cell_line_name), "-", "")
ctrp$other_ccl_name <- str_replace_all(toupper(ctrp$ccl_name), "-", "")

# Inner join on the normalized key. unique() drops exact duplicate
# (key, disease) rows so they cannot duplicate dose-response rows.
ctrp <- merge(ctrp, unique(line_info[, c("other_ccl_name", "primary_disease")]), by = "other_ccl_name")
ctrp$other_ccl_name <- NULL
# Move the listed columns to the front; any remaining columns keep their order
setcolorder(ctrp, neworder = c("cpd_name", "ccl_name", "primary_disease", "area_under_curve", "cpd_smiles"))

fwrite(ctrp, "Data/DRP_Training_Data/CTRP_AUC_SMILES.txt")
|
|
650 |
|
|
|
651 |
|
|
|
652 |
# Experiment ID: distinct (cell line id, experiment id) pairs, followed by the
# number of distinct values in each identifier column
unique(ctrp[, c("master_ccl_id", "experiment_id")])
uniqueN(ctrp$master_ccl_id)
uniqueN(ctrp$experiment_id)
uniqueN(ctrp$ccl_name)
uniqueN(ctrp$master_cpd_id)

# Check overlap with GDSC 1 and 2
ctrp_unique_lines <- unique(ctrp$ccl_name)
sum(ctrp_unique_lines %in% gdsc1$CELL_LINE_NAME)
sum(ctrp_unique_lines %in% gdsc2$CELL_LINE_NAME)

dim(ctrp) # 395K Combinations
|
|
664 |
|
|
|
665 |
|
|
|
666 |
|
|
|
667 |
# ==== Chemical Data Cleanup ====
# library() fails immediately if data.table is not installed (require() would
# only return FALSE).
library(data.table)
path <- "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"

# Load the ChEMBL 27 chemical representations table
chembl <- fread(paste0(path, "Data/chembl_27_chemreps.txt"))
|
|
672 |
|
|
|
673 |
|
|
|
674 |
|
|
|
675 |
# ==== EDA ======
# Use library() so a missing package stops the script here instead of
# surfacing later as a "could not find function" error.
library(data.table)
library(stringr)
library(ggplot2)

# Load the prepared training tables (paths are relative to the working directory)
line_info <- fread("Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv")
ctrp <- fread("Data/DRP_Training_Data/CTRP_AUC_SMILES.txt")
gdsc2 <- fread("Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")
# NOTE: `exp` shadows base::exp() for the rest of the session
exp <- fread("Data/DRP_Training_Data/DepMap_20Q2_Expression.csv")
mut <- fread("Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv")
cnv <- fread("Data/DRP_Training_Data/DepMap_20Q2_CopyNumber.csv")
prot <- fread("Data/DRP_Training_Data/DepMap_20Q2_No_NA_ProteinQuant.csv")

# Cell annotation table: keep only rows that have both a CTRPv2 and a CCLE id
pdb_table <- fread("Data/cell_annotation_table_1.1.1.csv")
pdb_sub <- pdb_table[, c("CTRPv2.cellid", "CCLE.cellid")]
pdb_sub <- pdb_sub[!is.na(CTRPv2.cellid) & !is.na(CCLE.cellid)]

# Peek at the top-left corner of the expression table (was `1:5.`, a typo)
exp[1:5, 1:5]
|
|
692 |
|
|
|
693 |
# Number of distinct cell lines and compounds in the CTRP table
length(unique(ctrp$ccl_name))
length(unique(ctrp$cpd_name))

# Fraction of CTRP cell line names matching DepMap line info as-is
sum(unique(ctrp$ccl_name) %in% line_info$stripped_cell_line_name) / length(unique(ctrp$ccl_name))

# Normalized CTRP names: upper case with every hyphen removed.
# str_replace_all() is used throughout this section because str_replace()
# removes only the first hyphen of each name.
ccl_names <- toupper(ctrp$ccl_name)
ccl_names <- unique(str_replace_all(ccl_names, "-", ""))
length(ccl_names)

sum(ccl_names %in% line_info$stripped_cell_line_name) / length(ccl_names)

# DepMap lines with no CTRP counterpart, and two spot checks by name pattern
line_info[!(stripped_cell_line_name %in% ccl_names)]
ctrp[ccl_name %like% "NIHOVCAR3"]
ctrp[ccl_name %like% "HEL"]

sum(exp$stripped_cell_line_name %in% pdb_sub$CCLE.cellid)

# Remove hyphens and convert all to upper case
pdb_ccl_names <- pdb_sub$CCLE.cellid
pdb_ccl_names <- str_replace_all(toupper(pdb_ccl_names), "-", "")

ctrp$ccl_name <- str_replace_all(toupper(ctrp$ccl_name), "-", "")

exp_ccl_names <- exp$stripped_cell_line_name
exp_ccl_names <- str_replace_all(toupper(exp_ccl_names), "-", "")

mut_ccl_names <- mut$stripped_cell_line_name
mut_ccl_names <- str_replace_all(toupper(mut_ccl_names), "-", "")

cnv_ccl_names <- cnv$stripped_cell_line_name
cnv_ccl_names <- str_replace_all(toupper(cnv_ccl_names), "-", "")

# Expression lines found in each reference set, relative to the set's size
sum(exp_ccl_names %in% ccl_names) / length(unique(ccl_names))
sum(exp_ccl_names %in% pdb_ccl_names) / length(unique(pdb_ccl_names))

# Number of mutation lines found in each reference set. The original divided
# by the set size and multiplied by it again, which is just this count.
sum(mut_ccl_names %in% ccl_names)
sum(mut_ccl_names %in% pdb_ccl_names)

ctrp[ccl_name %in% mut_ccl_names[mut_ccl_names %in% ccl_names]] ### 302K!!!!! Not 144K
ctrp[ccl_name %in% mut_ccl_names[mut_ccl_names %in% pdb_ccl_names]] ### 302K!!!!! Not 144K

# Copy number vs CTRP, then expression vs copy number in both directions
sum(cnv_ccl_names %in% ccl_names) / length(unique(ccl_names))
sum(exp_ccl_names %in% cnv_ccl_names) / length(unique(exp_ccl_names))
sum(cnv_ccl_names %in% exp_ccl_names) / length(unique(cnv_ccl_names))
|
|
737 |
|
|
|
738 |
|
|
|
739 |
# Output folders for the figures
dir.create("Plots")
dir.create("Plots/DepMap")

# Bar chart: number of DepMap cell lines per primary disease
ggplot(line_info) +
  geom_bar(aes(x = primary_disease), stat = "count") +
  labs(
    x = "Primary Disease",
    y = "# of cell lines",
    title = "Proportion of Cancer Types in DepMap Data (overall)",
    subtitle = "20Q2 Version"
  ) +
  # Slant the disease labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# No plot is passed, so ggsave() writes the last plot created
ggsave("Plots/DepMap/DepMap_Cell_Lines_Proportion.pdf", device = "pdf")
|
|
749 |
|
|
|
750 |
# Peek at the first protein columns and list the protein cell line names
prot[, 1:5]
unique(prot$stripped_cell_line_name)

# Normalize cell line names in place: upper case, all hyphens removed.
# str_replace() would strip only the first hyphen of each name, so
# str_replace_all() is used to remove every one.
mut$stripped_cell_line_name <- str_replace_all(toupper(mut$stripped_cell_line_name), "-", "")
cnv$stripped_cell_line_name <- str_replace_all(toupper(cnv$stripped_cell_line_name), "-", "")
exp$stripped_cell_line_name <- str_replace_all(toupper(exp$stripped_cell_line_name), "-", "")
prot$stripped_cell_line_name <- str_replace_all(toupper(prot$stripped_cell_line_name), "-", "")
ctrp$ccl_name <- str_replace_all(toupper(ctrp$ccl_name), "-", "")
|
|
758 |
|
|
|
759 |
# Return the (cell line, primary disease) rows of line_info whose cell line
# appears in `cell_names`, tagged with `label` in a data_type column.
tag_line_info <- function(cell_names, label) {
  tagged <- line_info[stripped_cell_line_name %in% unique(cell_names)]
  tagged <- tagged[, c("stripped_cell_line_name", "primary_disease")]
  tagged$data_type <- label
  tagged
}

# One tagged table per data source
mut_line_info <- tag_line_info(mut$stripped_cell_line_name, "Mutational")
cnv_line_info <- tag_line_info(cnv$stripped_cell_line_name, "Copy Number")
exp_line_info <- tag_line_info(exp$stripped_cell_line_name, "Gene Expression")
prot_line_info <- tag_line_info(prot$stripped_cell_line_name, "Protein Quantification")
ctrp_line_info <- tag_line_info(ctrp$ccl_name, "Dose-Response")

# Stack them into one long table for plotting
datatype_line_info <- rbindlist(list(
  mut_line_info,
  cnv_line_info,
  exp_line_info,
  prot_line_info,
  ctrp_line_info
))
|
|
781 |
|
|
|
782 |
# Bar chart: cell lines per primary disease, one dodged bar per data type
ggplot(datatype_line_info) +
  geom_bar(
    aes(x = primary_disease, fill = data_type),
    stat = "count",
    position = "dodge"
  ) +
  labs(
    x = "Primary Disease",
    y = "# of cell lines",
    fill = "Data Type",
    title = "Proportion of Cancer Types in DepMap Data",
    subtitle = "By data type, 20Q2 Version - Overlap with CTRPv2: 79%"
  ) +
  # Slant the disease labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# No plot is passed, so ggsave() writes the last plot created
ggsave("Plots/DepMap/DepMap_CTRP_Cell_Lines_Proportion.pdf", device = "pdf")
|
|
791 |
|
|
|
792 |
|
|
|
793 |
# Install VennDiagram only when it is missing, so re-running the script does
# not trigger a reinstall every time.
if (!requireNamespace("VennDiagram", quietly = TRUE)) {
  BiocManager::install("VennDiagram")
}
# library() errors out if the package still cannot be loaded
library(VennDiagram)

library(RColorBrewer)
# Five pastel fill colours, one per set in the Venn diagram
myCol <- brewer.pal(5, "Pastel2")
|
|
798 |
|
|
|
799 |
# NOTE: The CTRPv2 here is from before ctrp was merged with cell line info to add primary disease!
# Five-set Venn diagram of cell line names per data type, written to a PNG.
# The dangling comma that used to follow the last argument (leaving an empty
# argument in the call) has been removed.
venn.diagram(
  x = list(
    mut_line_info$stripped_cell_line_name,
    cnv_line_info$stripped_cell_line_name,
    exp_line_info$stripped_cell_line_name,
    prot_line_info$stripped_cell_line_name,
    unique(ctrp$ccl_name)
  ),
  category.names = c("Mutational", "Copy Number", "Gene Expression", "Protein Quantification", "CTRPv2 Dose-Response"),
  filename = "Plots/DepMap/DepMap_CTRP_Cell_Lines_Venn.png",
  imagetype = "png",
  output = TRUE,
  height = 3000,
  width = 3000,
  resolution = 600,
  # Circles
  lwd = 2,
  # lty = 'blank',
  fill = myCol,
  # Numbers
  cex = .6,
  fontface = "bold",
  fontfamily = "sans",

  # Set names
  cat.cex = 0.6,
  cat.fontface = "bold",
  cat.default.pos = "outer",
  cat.pos = c(0, 0, -130, 150, 0),
  cat.dist = c(0.2, 0.2, 0.2, 0.2, 0.2),
  cat.fontfamily = "sans"
  # rotation = 1
)