Switch to unified view

a b/R/Read_Achilles_DRIVE_RNAi.R
1
# Read_Achilles_DRIVE_RNAi.R
2
# install.packages("fastmatch")
3
library(data.table)
4
library(fastmatch)
5
library(cmapR)
6
if (!require(biomaRt)) {
7
  BiocManager::install("biomaRt", version = "3.8")
8
  library(biomaRt)
9
}
10
library(HGNChelper)
11
if (!require(Biostrings)) {
12
    BiocManager::install("Biostrings", version = "3.8")
13
    library(Biostrings)
14
}
15
# cur_map <- getCurrentHumanMap()
16
# saveRDS(cur_map, "Data/HGNC_Map.rds")
17
cur_map <- readRDS("Data/HGNC_Map.rds")
18
19
# ==== Expression Data ====
20
ccle_rna <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_mRNA_expression.csv")
21
ccle_rna[1:5, 1:5]
22
dim(ccle_rna)
23
# ccle_drug_data <- fread("Data/DepMap/CCLE/CCLE_NP24.2009_Drug_data_2015.02.24.csv")
24
ccle_line_info <- fread("Data/DepMap/20Q1/sample_info.csv")
25
26
# Merge DepMap_ID and CCLE_Name with RNA data
27
ccle_rna_with_names <- merge(ccle_line_info[, c("DepMap_ID", "CCLE_Name")],
28
                             ccle_rna, by.x = "DepMap_ID", by.y = "V1")
29
# Get tumor type and subtype
30
ccle_sub_info <- ccle_line_info[match(ccle_rna_with_names$CCLE_Name, ccle_line_info$CCLE_Name),]
31
ccle_sub_info <- ccle_sub_info[, c("CCLE_Name", "disease", "disease_subtype")]
32
33
ccle_rna_with_names$cell_line <- gsub("_.*", "", ccle_rna_with_names$CCLE_Name)
34
ccle_sub_info$cell_line <- gsub("_.*", "", ccle_sub_info$CCLE_Name)
35
ccle_rna_with_names[, c("DepMap_ID", "CCLE_Name") := NULL]
36
ccle_sub_info[, CCLE_Name := NULL]
37
38
setcolorder(ccle_rna_with_names, c("cell_line",
39
                                   colnames(ccle_rna_with_names)[-ncol(ccle_rna_with_names)]))
40
41
dim(ccle_sub_info)
42
dim(ccle_rna_with_names)
43
all(ccle_rna_with_names$cell_line == ccle_sub_info$cell_line) # TRUE
44
# colnames(ccle_rna_with_names)[ncol(ccle_rna_with_names)]
45
length(unique(ccle_sub_info$disease))
46
unique(ccle_sub_info$disease)
47
unique(ccle_sub_info$disease_subtype)
48
anyNA(ccle_sub_info)
49
50
fwrite(ccle_sub_info, "Data/RNAi/Train_Data/ccle_line_info.txt")
51
fwrite(ccle_rna_with_names, "Data/RNAi/Train_Data/ccle_exp_data.txt")
52
53
ccle_long_exp <- melt.data.table(ccle_rna_with_names, id.vars = "cell_line",
54
                                 measure.vars = colnames(ccle_rna_with_names)[-1],
55
                                 value.name = "exp")
56
colnames(ccle_long_exp)[2] <- "gene_name"
57
fwrite(ccle_long_exp, "Data/RNAi/Train_Data/ccle_long_exp.txt")
58
59
# Get expression data gene sequences
60
ccle_rna_with_names <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt")
61
anyDuplicated(colnames(ccle_rna_with_names))
62
cur_genes <- colnames(ccle_rna_with_names)[-1]
63
cur_genes <- gsub(".+ \\((ENSG\\d+)\\)", "\\1", cur_genes)
64
anyDuplicated(cur_genes)  # None
65
grch37 <- useMart(host = "http://grch37.ensembl.org",
66
                  biomart='ENSEMBL_MART_ENSEMBL', 
67
                  dataset='hsapiens_gene_ensembl')
68
biomaRt::searchAttributes(grch37, pattern = "gene")
69
# Get the transcripts of the current genes
70
if (!require(TxDb.Hsapiens.UCSC.hg38.knownGene)) {
71
  BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")
72
  library(TxDb.Hsapiens.UCSC.hg38.knownGene)
73
}
74
if (!require(BSgenome.Hsapiens.UCSC.hg38)) {
75
  BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
76
  library(BSgenome.Hsapiens.UCSC.hg38)
77
}
78
if (!require(AnnotationHub)) {
79
  BiocManager::install("AnnotationHub")
80
  library(AnnotationHub)
81
}
82
83
# Get gene boundaries from 
84
ahub <- AnnotationHub()
85
qh <- query(ahub, c("hg38", "transcript", "homo sapiens"))
86
genes <- qh[["AH5046"]]
87
anyDuplicated(genes$name)
88
89
hg19 <- BSgenome.Hsapiens.UCSC.hg19
90
# cdsBy(txdb, by = "tx", use.names = T)
91
tran_seqs <- extractTranscriptSeqs(hg19, txdb, use.names = T)
92
length(unique(names(tran_seqs)))
93
keytypes(txdb)
94
# Get ENTREZ IDs
95
gene_ids <- select(txdb, keys = names(tran_seqs), columns="GENEID", keytype="TXNAME")
96
length(unique(gene_ids$GENEID))
97
biomaRt::searchAttributes(grch37, pattern = "entrez")
98
# Translate to ENSG and ENST
99
ensembl_ids <- getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id", "transcript_length", "entrezgene"),
100
                     filters = "entrezgene", values = gene_ids$GENEID, mart = grch37)
101
# Get ENST of genes in CCLE
102
ccle_genes <- gsub(".+ \\((ENSG\\d+)\\)", "\\1", colnames(ccle_rna_with_names)[-1])
103
sum(ccle_genes %in% ensembl_ids$ensembl_gene_id) 
104
length(unique(ccle_genes))
105
length(unique(ensembl_ids$ensembl_gene_id))
106
length(unique(ensembl_ids$ensembl_transcript_id))
107
sum(duplicated(ensembl_ids$ensembl_gene_id))
108
ccle_match <- ensembl_ids[ensembl_ids$ensembl_gene_id %in% ccle_genes, ]
109
length(unique(ccle_match$ensembl_transcript_id))
110
length(unique(ccle_match$ensembl_gene_id))
111
112
length(unique(merge_seq$Annotation_Transcript))
113
merge_seq$Tumor_Sample_Barcode <- gsub("\\_.+", "", merge_seq$Tumor_Sample_Barcode)
114
length(unique(merge_seq$Tumor_Sample_Barcode))
115
length(unique(ccle_rna_with_names$cell_line))
116
117
sum(drive_cells %in% merge_seq$Tumor_Sample_Barcode) / length(drive_cells)
118
119
ensembl_trans <- Biostrings::readDNAStringSet("Data/RNAi/Train_Data/Homo_sapiens.GRCh37.cdna.all.fa")
120
names(ensembl_trans)[1]
121
tran_names <- gsub(" .*", "", names(ensembl_trans))
122
gene_names <- gsub(".*gene:(ENSG\\d+) .*", "\\1", names(ensembl_trans))
123
124
names(ensembl_trans) <- gsub("\\.\\d+", "", names(ensembl_trans))
125
sum(ensembl_ids$ensembl_transcript_id %in% tran_names) / length(ensembl_ids$ensembl_transcript_id)
126
sum(unique(gene_names) %in% ccle_genes)
127
# All mutated transcripts are available in the data
128
sum(unique(merge_seq$Annotation_Transcript) %in% tran_names) /length(unique(merge_seq$Annotation_Transcript))
129
130
names(ensembl_trans) <- tran_names
131
sub_trans <- ensembl_trans[names(ensembl_trans) %in% merge_seq$Annotation_Transcript]
132
133
merge_seq
134
135
136
137
promoters(genes)
138
transcripts(genes)
139
trans_seqs <- Views(hg19, genes$blocks)
140
141
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
142
txdb
143
144
all_trans <- transcriptsBy(txdb, by = c("gene"))
145
all_trans <- transcripts(txdb)
146
head(all_trans)
147
148
gene_transcripts = getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id"),
149
                    filters = "ensembl_gene_id", values = cur_genes, mart = grch37)
150
anyDuplicated(gene_transcripts$ensembl_transcript_id)
151
transcript_seqs <- getSequence(id = gene_transcripts$ensembl_transcript_id,
152
                               type = "ensembl_transcript_id", seqType = "cdna",
153
                               mart = grch37)
154
dim(transcript_seqs)
155
156
getSequence()
157
158
159
# ==== CCLE Mutation Data ====
160
ccle_mutation <- fread("Data/RNAi/CCLE_mutation_data.csv")
161
unique(ccle_mutation$Variant_Classification)
162
# length(unique(ccle_mutation$Hugo_Symbol))
163
# length(unique(ccle_mutation$Annotation_Transcript))
164
cur_mutation_data <- ccle_mutation[, c("Variant_Classification", "Hugo_Symbol", "Tumor_Sample_Barcode")]
165
cur_mutation_data$cell_name <- gsub(pattern = "_.*", replacement = "",
166
                                    x = cur_mutation_data$Tumor_Sample_Barcode)
167
cur_mutation_data[, Tumor_Sample_Barcode := NULL]
168
cur_mutation_data <- unique(cur_mutation_data)
169
cur_mutation_data$Variant_Labels <- as.numeric(as.factor(cur_mutation_data$Variant_Classification))
170
cur_mutation_data[, Variant_Classification := NULL]
171
colnames(cur_mutation_data)[2] <- "cell_line"
172
fwrite(cur_mutation_data, "Data/RNAi/Train_Data/labeled_ccle_mut_data.txt")
173
174
dcast_mut_data <- dcast.data.table(cur_mutation_data,
175
                                   cell_line ~ Hugo_Symbol + Variant_Labels,
176
                                   drop = F, # Ensures all possible combinations are kept
177
                                   fill = 0, fun.aggregate = length)
178
sum(dcast_mut_data$A1CF_6)
179
fwrite(dcast_mut_data, "Data/RNAi/Train_Data/simple_ccle_mut_data.txt")
180
181
dcast_mut_data <- dcast.data.table(cur_mutation_data,
182
                                   cell_line + Hugo_Symbol ~ Variant_Labels,
183
                                   drop = F, # Ensures all possible combinations are kept
184
                                   fill = 0, fun.aggregate = length)
185
fwrite(dcast_mut_data, "Data/RNAi/Train_Data/long_ccle_mut_data.txt")
186
187
188
### Generate per gene features
189
length(unique(ccle_mutation$Hugo_Symbol))
190
length(unique(ccle_mutation$Annotation_Transcript))
191
# Check and update current HUGO symbols
192
cur_hgnc <- HGNChelper::checkGeneSymbols(x = ccle_mutation$Hugo_Symbol, map = cur_map)
193
ccle_mutation$updated_hugo <- cur_hgnc$Suggested.Symbol
194
# Get gene lengths from biomaRt
195
hugo_list <- unique(ccle_mutation$updated_hugo)
196
enst_list <- ccle_mutation$Annotation_Transcript
197
enst_list <- gsub("\\.\\d+", "", enst_list)
198
ccle_mutation$Annotation_Transcript <- enst_list
199
200
201
hugo_list <- sort(hugo_list)
202
nchar(hugo_list[1:5])
203
grch37 <- useMart(host = "http://grch37.ensembl.org",
204
                  biomart='ENSEMBL_MART_ENSEMBL', 
205
                  dataset='hsapiens_gene_ensembl')
206
biomaRt::searchAttributes(grch37, pattern = "length")
207
biomaRt::searchAttributes(grch37, pattern = "sequence")
208
# t1 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "coding", mart = grch37)
209
# t2 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "transcript_exon_intron", mart = grch37)
210
# t3 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "gene_exon_intron", mart = grch37)
211
# t4 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "gene_exon", mart = grch37)
212
# t5 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "cdna", mart = grch37)
213
# t6 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "3utr", mart = grch37)
214
# t7 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "5utr", mart = grch37)
215
# 
216
# t1$coding == t2$transcript_exon_intron
217
# nchar(t2$transcript_exon_intron)
218
# nchar(t4$gene_exon)
219
# nchar(t5$cdna)
220
# nchar(t6$`3utr`) + nchar(t1$coding) + nchar(t7$`5utr`)
221
222
gene_coords = getBM(attributes = c("ensembl_transcript_id", "transcript_length"),
223
                    filters = "ensembl_transcript_id", values = enst_list, mart = grch37)
224
gene_coords <- as.data.table(gene_coords)
225
226
transcript_seqs <- getSequence(id = gene_coords$ensembl_transcript_id,
227
                               type = "ensembl_transcript_id", seqType = "cdna",
228
                               mart = grch37)
229
transcript_seqs <- as.data.table(transcript_seqs)
230
# Save
231
fwrite(transcript_seqs, "Data/RNAi/Train_Data/transcript_seqs.txt")
232
transcript_seqs <- fread("Data/RNAi/Train_Data/transcript_seqs.txt")
233
colnames(transcript_seqs)
234
head(transcript_seqs$ensembl_transcript_id)
235
head(ccle_mutation$Annotation_Transcript)
236
# Merge sequences with mutational data
237
merge_seq <- merge(ccle_mutation[, c("Annotation_Transcript",
238
                                     "cDNA_Change", "Variant_Type",
239
                                     "Tumor_Sample_Barcode")],
240
                   transcript_seqs,
241
                   by.x = "Annotation_Transcript", by.y = "ensembl_transcript_id")
242
colnames(merge_seq)[5] <- "ref_cdna"
243
merge_seq$alt_cdna <- merge_seq$ref_cdna
244
dim(merge_seq)
245
246
 library(stringr)
247
mutations <- merge_seq$cDNA_Change
248
vartypes <- merge_seq$Variant_Type
249
cur_mutations <- gsub("c\\.", "", mutations[vartypes == "SNP"])
250
snps <- gsub("(\\d+)(\\w)(>)(\\w)", "\\1 \\2 \\4", cur_mutations)
251
snp_split <- str_split(snps, " ", simplify = T)
252
subseq(merge_seq[vartypes == "SNP"]$alt_cdna,
253
          as.integer(snp_split[, 1]), as.integer(snp_split[, 1])) <- snp_split[, 3]
254
sum(merge_seq$ref_cdna == merge_seq$alt_cdna) / nrow(merge_seq)
255
256
cur_mutations <- gsub("c\\.", "", mutations[vartypes == "INS"])
257
ins <- gsub("(\\d+)(\\_)(\\d+)ins(\\w+)", "\\1 \\3 \\4", cur_mutations)
258
ins_split <- str_split(ins, " ", simplify = T)
259
# For insertion in a sequence, the second position is 1 minus the first position
260
subseq(merge_seq[vartypes == "INS"]$alt_cdna,
261
          as.integer(ins_split[, 1]), as.integer(ins_split[, 1])-1) <- ins_split[, 3]
262
sum(merge_seq$ref_cdna == merge_seq$alt_cdna) / nrow(merge_seq)
263
264
cur_mutations <- gsub("c\\.", "", mutations[vartypes == "DEL"])
265
dels <- gsub("(\\d+)\\_+(\\d+)*del\\w+", "\\1 \\2", cur_mutations)
266
dels <- gsub("(\\d+)del\\w+", "\\1", dels)
267
268
dels_split <- str_split(dels, " ", simplify = T)
269
# For deletion in a sequence, copy the first position if there only a single deletion
270
dels_split[dels_split[, 2] == "", 2] <- dels_split[dels_split[, 2] == "", 1]
271
subseq(merge_seq[vartypes == "DEL"]$alt_cdna,
272
       as.integer(dels_split[, 1]), as.integer(dels_split[, 2])) <- ""
273
sum(merge_seq$ref_cdna == merge_seq$alt_cdna) / nrow(merge_seq)
274
275
# The remaining variants are outside the transcript, but may affect the transcript indirectly,
276
# e.g. splice site variants
277
anyNA(merge_seq$cDNA_Change)
278
sum(merge_seq$cDNA_Change == "")
279
ccle_mutation[cDNA_Change == ""]
280
281
# Remove invalid cDNA changes that result in empty alt seqs
282
sum(merge_seq$alt_len == 0)
283
sum(merge_seq$cDNA_Change == "")
284
merge_seq <- merge_seq[cDNA_Change != ""]
285
merge_seq[, ref_len := nchar(ref_cdna), by = "Annotation_Transcript"]
286
colnames(merge_seq)
287
# Save 
288
fwrite(merge_seq, "Data/RNAi/Train_Data/transcripts_alt_and_ref.txt")
289
290
merge_seq <- fread("Data/RNAi/Train_Data/transcripts_alt_and_ref.txt")
291
colnames(merge_seq)
292
(sum(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len) / 8) * 32 * 4 / 1e6
293
294
rm(list = c("dels", "dels_split", "ins", "ins_split", "snps", "snp_split"))
295
296
merge_seq <- fread("Data/RNAi/Train_Data/transcripts_alt_and_ref.txt")
297
colnames(merge_seq)
298
length(unique(merge_seq[,Tumor_Sample_Barcode]))
299
300
# Save only the transcripts as well
301
library(data.table)
302
alt_transcripts <- unique(merge_seq[, alt_cdna])
303
ref_transcripts <- unique(merge_seq[, ref_cdna])
304
length(alt_transcripts)
305
length(ref_transcripts)
306
all_transcripts <-
307
  rbindlist(list(data.table(alt_transcripts), data.table(ref_transcripts)), use.names = F)
308
dim(all_transcripts)
309
colnames(all_transcripts) <- "transcript"
310
all_transcripts <- unique(all_transcripts)
311
all_transcripts[, len := nchar(transcript)]
312
# Remove longest 1% transcripts
313
quantile(unique(all_transcripts$len), 0.99)
314
all_transcripts <- all_transcripts[len < 20326]
315
# Order by increasing length (want to train on smaller sequences first)
316
data.table::setorder(all_transcripts, -len)
317
# all_transcripts$len
318
319
library(data.table)
320
fwrite(unique(all_transcripts[, 'transcript']), "Data/RNAi/Train_Data/all_transcript_seqs.txt")
321
all_transcripts <- fread("Data/RNAi/Train_Data/all_transcript_seqs.txt")
322
all_transcripts
323
324
nrow(all_transcripts)
325
326
unique(merge_seq[is.nan(cDNA_Change)])
327
which(is.na(merge_seq$alt_cdna))
328
merge_seq[34,]
329
merge_seq[Annotation_Transcript == "ENST00000001146"]
330
sum(merge)
331
sum(merge_seq$ref_len == 0)
332
colnames(merge_seq)
333
length(unique(merge_seq[Variant_Type == "SNP"]$Annotation_Transcript))
334
merge_seq[, ref_len := nchar(ref_cdna), by = "Annotation_Transcript"]
335
merge_seq[, alt_len := nchar(alt_cdna), by = "Annotation_Transcript"]
336
merge_seq[Variant_Type == "SNP", "Annotation_Transcript"]
337
sum(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len)
338
max(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len)
339
min(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len)
340
median(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len)
341
plot(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len)
342
343
sum(nchar(merge_seq$alt_cdna))
344
345
346
colnames(merge_seq)
347
unique(ccle_mutation$Variant_Type)
348
349
mean(gene_coords$transcript_length)
350
median(gene_coords$transcript_length)
351
max(gene_coords$transcript_length)
352
min(gene_coords$transcript_length)
353
354
gene_coords[, size := as.integer(mean(size)), by = "hgnc_symbol"]
355
# Update CCLE mutation data
356
ccle_mutation$Annotation_Transcript <- gsub("\\.\\d+", "", ccle_mutation$Annotation_Transcript)
357
updated_ccle_mutation <- merge(ccle_mutation, gene_coords,
358
                          by.x = "Annotation_Transcript",
359
                          by.y = "ensembl_transcript_id")
360
361
ccle_mut <- updated_ccle_mutation  # make copy
362
# Number of silent mutations per gene divided by length
363
all_var_types <- unique(ccle_mut$Variant_Classification)
364
unique(ccle_mut$Variant_Type)
365
# c("Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "Frame_Shift_Del", "Frame_Shift_Ins")
366
for (var_type in c("SNP", "DEL", "INS")) {
367
  ccle_mut[Variant_Type == var_type,
368
           paste0("Num_", var_type) := nrow(.SD),
369
           by = c("Annotation_Transcript", "Tumor_Sample_Barcode")]
370
}
371
ncol(ccle_mut)
372
# Replace NAs with 0
373
ccle_mut[, 34:36][is.na(ccle_mut[, 34:36, with = F])] <- 0
374
375
mut_sub <- as.data.table(ccle_mut[, c("Annotation_Transcript", "Tumor_Sample_Barcode", "Num_SNP", "Num_DEL", "Num_INS")])
376
mut_sub$cell_line <- gsub("\\_.+", "", mut_sub$Tumor_Sample_Barcode)
377
setkey(mut_sub, "cell_line")
378
379
fwrite(mut_sub[, -2], "ccle_muts_by_type.txt")
380
381
# Get original transcript sequence
382
human <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
383
listMarts(host = "http://grch37.ensembl.org")
384
listEnsemblArchives()
385
listDatasets(human)
386
387
listDatasets(grch37)
388
389
biomaRt::searchAttributes(human, pattern = "length")
390
biomaRt::searchAttributes(human, pattern = "transcript")
391
392
393
# ==== DRIVE Data ====
394
drive_count <- readRDS("Data/RNAi/drive-raw-data/DriveCountData.RDS")
395
drive_count <- as.data.table(drive_count)
396
unique(drive_count$CLEANNAME)
397
drive_cells <- toupper(unique(drive_count$CLEANNAME))
398
399
shrna_map <- readRDS("Data/RNAi/drive-raw-data/ShrnaGeneMap.RDS")
400
shrna_map <- as.data.table(shrna_map)
401
length(unique(drive_count$EXPERIMENT_ID))
402
length(unique(shrna_map$GENESYMBOLS))
403
404
405
all_merged <- merge(drive_count, shrna_map, by = "SEQ")
406
407
# Get DRIVE cell line mutation data from CCLE
408
drive_mutation <- cur_mutation_data[cell_name %in% drive_cells]
409
410
# Get DRIVE cell line expression data from CCLE
411
ccle_rna_with_names$CCLE_Name
412
drive_rna <- ccle_rna_with_names[CCLE_Short_Name %in% drive_cells]
413
dim(drive_rna)
414
415
# Order shRNA screens with mutation and expression data
416
all_merged[PLASMID_COUNT > 0, num_shrna_exp := .N, by = "EXPERIMENT_ID"]
417
418
temp_exp <- all_merged[EXPERIMENT_ID == 1004182]
419
temp <- temp_exp[SAMPLE_COUNT > 1000 & !is.na(GENESYMBOLS)]
420
length(unique(temp$GENESYMBOLS))
421
all_merged
422
423
all_merged[POOL == "BGPD" & !is.na(PLASMID_COUNT)]
424
drive_rna[1:5, 1:5]
425
426
427
# ==== Read LFCs ====
428
achilles_batch_1 <- fread("Data/RNAi/LFCs/achilles55kbatch1repcollapsedlfc.csv")
429
achilles_batch_2 <- fread("Data/RNAi/LFCs/achilles55kbatch2repcollapsedlfc.csv")
430
achilles_batch_3 <- fread("Data/RNAi/LFCs/achilles98krepcollapsedlfc.csv")
431
colnames(achilles_batch_1)[-1] <- gsub("_.*", "", colnames(achilles_batch_1)[-1])
432
colnames(achilles_batch_2)[-1] <- gsub("_.*", "", colnames(achilles_batch_2)[-1])
433
colnames(achilles_batch_3)[-1] <- gsub("_.*", "", colnames(achilles_batch_3)[-1])
434
all_achilles <- Reduce(function(...) merge(..., all = T, by = "V1"), list(achilles_batch_1, achilles_batch_2, achilles_batch_3))
435
dim(all_achilles)
436
all_molten <- melt(data = all_achilles, id.vars = "V1",
437
     measure.vars = colnames(all_achilles)[-1],
438
     variable.name = "cell_line", value.name = "lfc")
439
colnames(all_molten)[1] <- "shRNA"
440
441
anyNA(all_achilles)
442
anyNA(all_molten$shRNA)
443
anyNA(all_molten$cell_line)
444
anyNA(all_molten$lfc)
445
all_molten <- all_molten[!is.na(lfc),]
446
setkey(all_molten, "cell_line")
447
fwrite(all_molten, "Data/RNAi/Train_Data/achilles_shrna_cell_lfc.txt")
448
rm(list("achilles_batch_1", "achilles_batch_2", "achilles_batch_3"))
449
450
# Match cell line expression values
451
# ccle_rna_with_names[fmatch(all_molten$cell_line, ccle_rna_with_names$CCLE_Short_Name),]
452
453
# Read DRIVE data
454
drive_bgpd <- fread("Data/RNAi/LFCs/drivebgpdlfcmat.csv")
455
drive_a <- fread("Data/RNAi/LFCs/drivepoolalfcmat.csv")
456
drive_b <- fread("Data/RNAi/LFCs/drivepoolblfcmat.csv")
457
458
dim(drive_bgpd)
459
dim(drive_a)
460
dim(drive_b)
461
colnames(drive_bgpd)[-1] <- gsub("_.*", "", colnames(drive_bgpd)[-1])
462
colnames(drive_a)[-1] <- gsub("_.*", "", colnames(drive_a)[-1])
463
colnames(drive_b)[-1] <- gsub("_.*", "", colnames(drive_b)[-1])
464
465
all_drive <- Reduce(function(...) merge(..., all = T, by = "V1"), list(drive_bgpd, drive_a, drive_b))
466
dim(all_drive)
467
468
all_drive[, c("U87MG.y", "U87MG.x")]
469
drive_molten <- melt(data = all_drive, id.vars = "V1",
470
                   measure.vars = colnames(all_drive)[-1],
471
                   variable.name = "cell_line", value.name = "lfc")
472
drive_molten <- drive_molten[!is.na(lfc),]
473
drive_molten$cell_line <- gsub("\\.x", "", drive_molten$cell_line)
474
unique(drive_molten$cell_line)
475
drive_molten$cell_line <- gsub("\\.y", "", drive_molten$cell_line)
476
unique(drive_molten$cell_line)
477
unique(drive_molten[, c("V1", "cell_line")])
478
# Average lfc by cell line for each shRNA
479
drive_molten[, avg_lfc := mean(lfc), by = c("V1", "cell_line")]
480
drive_molten[, lfc := NULL]
481
colnames(drive_molten) <- c("shRNA", "cell_line", "lfc")
482
drive_molten <- unique(drive_molten)
483
setkey(drive_molten, "cell_line")
484
485
fwrite(drive_molten, "Data/RNAi/Train_Data/drive_shrna_cell_lfc.txt")
486
487
488
# Read Marcotte Data
489
marcotte <- fread("Data/RNAi/LFCs/Marcotte_LFC_matrix.csv")
490
dim(marcotte)
491
colnames(marcotte)[-1] <- gsub("_.*", "", colnames(marcotte)[-1])
492
493
molten_marcotte <- melt(data = marcotte, id.vars = "V1",
494
                       measure.vars = colnames(marcotte)[-1],
495
                       variable.name = "cell_line", value.name = "lfc")
496
colnames(molten_marcotte)[1] <- "shRNA"
497
unique(molten_marcotte[, c("shRNA", "cell_line")])
498
fwrite(molten_marcotte, "Data/RNAi/Train_Data/marcotte_shrna_cell_lfc.txt")
499
500
501
# ==== Batch and Match Training Data ====
502
ccle_exp <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt")
503
ccle_long_exp <- fread("Data/RNAi/Train_Data/ccle_long_exp.txt")
504
typeof(ccle_exp$`TSPAN6 (ENSG00000000003)`[1])
505
506
min(ccle_exp)
507
max(ccle_exp)
508
509
ccle_mut <- fread("Data/RNAi/Train_Data/simple_ccle_mut_data.txt")
510
511
achilles <- fread("Data/RNAi/Train_Data/achilles_shrna_cell_lfc.txt")
512
achilles[, sh_length := nchar(shRNA), by = c("cell_line", "lfc")]
513
unique(achilles$sh_length)
514
515
achilles_cells <- unique(achilles$cell_line)
516
ccle_cells <- unique(ccle_long_exp$cell_line)
517
sum(achilles_cells %in% ccle_cells)
518
achilles_cells[!(achilles_cells %in% ccle_cells)]
519
rm(list=c("ccle_long_exp", "achilles"))
520
drive <- fread("Data/RNAi/Train_Data/drive_shrna_cell_lfc.txt")
521
drive_cells <- unique(drive$cell_line)
522
sum(drive_cells %in% ccle_cells)
523
drive_cells[!(drive_cells %in% ccle_cells)]
524
rm(drive)
525
526
marcotte <- fread("Data/RNAi/Train_Data/marcotte_shrna_cell_lfc.txt")
527
marcotte_cells <- unique(marcotte$cell_line)
528
sum(marcotte_cells %in% ccle_cells)
529
marcotte_cells[!(marcotte_cells %in% ccle_cells)]
530
531
shared_cells <- unique(c(achilles_cells, drive_cells, marcotte_cells))
532
533
# Subset the expression data based on these cells
534
ccle_exp <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt")
535
achilles_exp <- ccle_exp[cell_line %in% achilles_cells]
536
dim(achilles_exp)
537
fwrite(achilles_exp, "Data/RNAi/Train_Data/achilles_exp.txt")
538
rm(achilles_exp)
539
540
drive_exp <- ccle_exp[cell_line %in% drive_cells]
541
dim(drive_exp)
542
fwrite(drive_exp, "Data/RNAi/Train_Data/drive_exp.txt")
543
rm(drive_exp)
544
545
marcotte_exp <- ccle_exp[cell_line %in% marcotte_cells]
546
dim(marcotte_exp)
547
fwrite(marcotte_exp, "Data/RNAi/Train_Data/marcotte_exp.txt")
548
549
550
551
552
553
554
# Batch 1 million training cases
555
for (sub in seq(1, nrow(achilles) - 1e6, by = 1e6)) {
556
  print(sub)
557
  cur_sub <- achilles[sub:(sub + 1e6),]
558
  # Match cell line exp
559
  exp_sub <- ccle_exp[fmatch(cur_sub$cell_line, ccle_exp$cell_line),]
560
}
561
achilles
562
563
564
# ==== Merge all shRNA sequences for training, subset to match sequencing data ====
565
library(data.table)
566
# Consider sequencing data the reference with which everything is matched
567
merge_seq <- fread("Data/RNAi/Train_Data/transcripts_alt_and_ref.txt")
568
# Subset only cell lines available in the CCLE sequencing data
569
ccle_lines <- unique(merge_seq$Tumor_Sample_Barcode)
570
ccle_data <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt")
571
# length(ccle_rna_with_names$cell_line)
572
ccle_data <- ccle_data[cell_line %in% ccle_lines]
573
# Update CCLE lines with those that have both sequencing and expression data
574
ccle_lines <- ccle_data$cell_line
575
576
achilles <- fread("Data/RNAi/Train_Data/achilles_shrna_cell_lfc.txt")
577
achilles <- achilles[cell_line %in% ccle_lines]
578
length(unique(achilles$cell_line))
579
drive <- fread("Data/RNAi/Train_Data/drive_shrna_cell_lfc.txt")
580
drive <- drive[cell_line %in% ccle_lines]
581
length(unique(drive$cell_line))
582
marcotte <- fread("Data/RNAi/Train_Data/marcotte_shrna_cell_lfc.txt")
583
marcotte <- marcotte[cell_line %in% ccle_lines]
584
length(unique(marcotte$cell_line))
585
586
# Extract unique sequences for autoencoder training
587
all_shrna_seqs <- rbindlist(list(achilles[,'shRNA'], drive[,'shRNA'], marcotte[,'shRNA']))
588
all_shrna_seqs <- unique(all_shrna_seqs)
589
590
# Save
591
fwrite(all_shrna_seqs, "Data/RNAi/Train_Data/all_shrna_seqs.txt")
592
593
594
# Extract all info for matching with sequencing and expression data
595
ccle_shrna_seqs <- rbindlist(list(achilles, drive, marcotte))
596
ccle_shrna_seqs <- unique(ccle_shrna_seqs)
597
598
# Save CCLE ShRNA
599
fwrite(ccle_shrna_seqs, "Data/RNAi/Train_Data/ccle_shrna_seqs.txt")
600
fread()
601
602
# Separate each cell line shRNA data
603
dir.create("Data/RNAi/Train_Data/shRNA_by_line")
604
all_lines <- unique(ccle_shrna_seqs$cell_line)
605
for (line in all_lines) {
606
  cur_sub <- ccle_shrna_seqs[cell_line == line]
607
  fwrite(cur_sub, paste0("Data/RNAi/Train_Data/shRNA_by_line/", line, ".txt"))
608
}
609
610
# Save CCLE Expression Data
611
fwrite(ccle_data, "Data/RNAi/Train_Data/ccle_sub_exp.txt")
612
613
rm(list = c("achilles", "drive", "marcotte"))
614
# Get an index in the 'all_transcript_seqs' files for every cell line for easier subsetting
615
# All cell lines should have the same number of transcripts; those that don't have a mutated
616
# transcript must be assigned the reference sequence
617
merge_sub <- merge_seq[Variant_Type == "SNP"]
618
619
length(merge_sub[Tumor_Sample_Barcode == 'OELE']$alt_cdna)
620
length(merge_sub[Tumor_Sample_Barcode == 'LN18']$alt_cdna)
621
colnames(merge_sub)
622
head(merge_sub$Tumor_Sample_Barcode)
623
colnames(ccle_data)
624
all_transcripts <- fread("Data/RNAi/Train_Data/all_transcript_seqs.txt")
625
626
all_reference <- unique(merge_sub[, c("Annotation_Transcript", "ref_cdna")])
627
nrow(all_reference)
628
merge_sub <- merge_sub[ref_len < 20326]
629
library(fastmatch)
630
# Find indices of reference/normal transcripts in 'all_transcripts'
631
ref_idx <- fmatch(x = all_reference$ref_cdna, table = all_transcripts$transcript)
632
ref_idx <- data.table(ref_id = all_reference$Annotation_Transcript, ref_idx = ref_idx)
633
634
dir.create("Data/RNAi/Train_Data/Cell_Line_Indices")
635
for (line in ccle_lines) {
636
  # Find the indices of the altered sequences in 'all_transcripts'
637
  cur_idx <- fmatch(x = merge_sub[Tumor_Sample_Barcode == line]$alt_cdna,
638
                table = all_transcripts$transcript)
639
  cur_idx <- data.table(alt_id = merge_sub[Tumor_Sample_Barcode == line]$Annotation_Transcript,
640
                        alt_idx = cur_idx)
641
  ref_copy <- ref_idx
642
  # Merge and replace transcript ids in the reference; this now represents the indices of
643
  # transcripts in 'all_transcripts' that are relevant to this cell line
644
  matched_idx <- fmatch(cur_idx$alt_id, ref_idx$ref_id)
645
  ref_copy[matched_idx,] <- cur_idx
646
  
647
  fwrite(ref_copy, paste0("Data/RNAi/Train_Data/Cell_Line_Indices/", line, ".txt"))
648
}
649
# fread("Data/RNAi/Train_Data/Cell_Line_Indices/22RV1.txt")
650
651
line_name_indices <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt")
652
colnames(line_name_indices)[1]
653
line_name_indices <- data.table(seq_along(line_name_indices$cell_line),
654
                                line_name_indices$cell_line)
655
colnames(line_name_indices)[1] <- "index"
656
fwrite(line_name_indices, "Data/RNAi/Train_Data/ccle_name_index.txt")
657
658
colnames(merge_seq)
659
fmatch(unique(merge_seq$Annotation_Transcript), merge_seq$Annotation_Transcript)
660
sum(merge_seq[fmatch(unique(merge_seq$Annotation_Transcript), merge_seq$Annotation_Transcript)]$ref_len) / 8
661
662
663
# ==== shRNA mapping ====
664
library(stringr)
665
sh_map <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/RNAi/Train_Data/shRNAmapping.csv")
666
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene Symbol` <- "NO_CURRENT"
667
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene ID` <- "NO_CURRENT"
668
sh_map[`Gene Symbol` %like% "-"]$`Gene Symbol`[781]
669
sh_map[`Gene Symbol` %like% "-"]$`Gene ID`[781]
670
# colnames(sh_map) <- c("shRNA", "HGNC", "ENTREZ")
671
sh_map <- sh_map[, c(1,3)]
672
673
# sh_split <- str_split(sh_map$`Gene Symbol`, "-", simplify = T)
674
# head(sh_split)
675
# sh_map$`Gene ID`
676
# 
677
# temp <- sh_map[, strsplit(as.character(`Gene Symbol`), ",", fixed=TRUE),
678
#                      by = .(`Barcode Sequence`, `Gene Symbol`)][, `Gene Symbol` := NULL][
679
#                        , setnames(.SD, "Barcode Sequence", "Gene Symbol")]
680
681
682
# length(unique(sh_map$`Gene Symbol`))
683
# length(unique(sh_map$`Barcode Sequence`))
684
# anyDuplicated(sh_map$`Barcode Sequence`)
685
# which(duplicated(sh_map$`Barcode Sequence`))
686
# sh_map[c(28,29),]
687
688
colnames(sh_map) <- c("shRNA", "ENTREZ")
689
# sh_long <- melt(data = sh_map, id.vars = c("shRNA", "Gene"))
690
# sh_long <- dcast.data.table(sh_map, formula = shRNA ~ Gene, fill = 0,
691
#                             fun.aggregate = function(x) {1L})
692
# dim(sh_long)
693
# (sh_long[1:5, 2:5])
694
# 
695
# sum(sh_long[1,2:5000])
696
697
# Create a one-hot vector
698
# library(caret)
699
# Pair shRNA sequences with one-hot encoded gene target vector
700
# Each shRNA sequence will have a ~22000 length vector indicating its target
701
ccle_shrna_seqs <- fread("Data/RNAi/Train_Data/ccle_shrna_seqs.txt")
702
setkey(ccle_shrna_seqs, shRNA)
703
ccle_shrna_seqs$INDEX <- 1:nrow(ccle_shrna_seqs)
704
setkey(sh_map, shRNA)
705
706
length(unique(ccle_shrna_seqs$shRNA))
707
sh_map <- sh_map[shRNA %in% unique(ccle_shrna_seqs$shRNA)]
708
temp <- merge(ccle_shrna_seqs[, c(1,4)], sh_map, by = "shRNA", allow.cartesian = TRUE)
709
temp <- unique(temp)
710
711
# anyDuplicated(temp$INDEX)
712
713
# install.packages("onehot")
714
library(onehot)
715
716
# cur_sub$ENTREZ <- as.factor(cur_sub$ENTREZ)
717
# cur_sub$shRNA <- as.factor(cur_sub$shRNA)
718
# class(cur_sub$ENTREZ)
719
sh_map$ENTREZ <- as.factor(sh_map$ENTREZ)
720
sh_map$shRNA <- as.factor(sh_map$shRNA)
721
class(sh_map$ENTREZ)
722
723
# Separate into files by indices
724
# cur_dummy <- dummyVars(formula = '~ ENTREZ', data = sh_map,
725
#                        fullRank = T, sep = "_", levelsOnly = F)
726
727
onehot_encoder <- onehot::onehot(data = as.data.frame(sh_map),
728
                                 max_levels = length(unique(sh_map$ENTREZ)))
729
options(scipen=999)
730
dir.create("Data/RNAi/Train_Data/shRNA_by_index")
731
732
# all_results <- onehot_results
733
for (idx in seq(1, nrow(temp), by = 100000)[-(1:230)]) {
734
  # cur_sub <- sh_map[cell_line == line]
735
  cur_sub <- temp[idx:(idx+100000-1),]
736
  onehot_results <- predict(onehot_encoder, cur_sub)
737
  # dim(onehot_results)
738
  rownames(onehot_results) <- cur_sub$shRNA
739
  onehot_results <- data.table(onehot_results, keep.rownames = T)
740
  colnames(onehot_results)[1] <- "shRNA"
741
  
742
  onehot_results <- onehot_results[, lapply(.SD, sum), by = shRNA, .SDcols = colnames(onehot_results)[-1]]
743
  
744
  fwrite(onehot_results, paste0("Data/RNAi/Train_Data/shRNA_by_index/", idx, "_",
745
                         idx+100000-1, ".txt"))
746
}
747
library(data.table)
748
all_shrna_files <- list.files("Data/RNAi/Train_Data/shRNA_by_index/", full.names = T)
749
all_shrna_indices <- fread(all_shrna_files[1])
750
751
for (i in 2:length(all_shrna_files)) {
752
  cur_file <- fread(all_shrna_files[i])
753
  all_shrna_indices <- rbindlist(list(all_shrna_indices, cur_file))
754
}
755
all_shrna_indices <- unique(all_shrna_indices)
756
757
fwrite(x = all_shrna_indices, "Data/RNAi/Train_Data/all_shrna_indices.txt")
758
759
# dool <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index/1_100000.txt")
760
# dim(dool)
761
dir.create("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index")
762
library(stringr)
763
sh_map <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/RNAi/shRNAmapping.csv")
764
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene Symbol` <- "NO_CURRENT"
765
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene ID` <- "NO_CURRENT"
766
sh_map[`Gene Symbol` %like% "-"]$`Gene Symbol`[781]
767
sh_map[`Gene Symbol` %like% "-"]$`Gene ID`[781]
768
colnames(sh_map) <- c("shRNA", "HGNC", "ENTREZ")
769
770
dummification <- function(dummifier, shrna_data, idx) {
771
  cur_sub <- temp[idx:(idx+100000-1),]
772
  cur_onehot <- data.frame(predict(cur_dummy, cur_sub))
773
  fwrite(cur_onehot, paste0("Data/RNAi/Train_Data/shRNA_by_index/", idx, "_",
774
                            idx+100000-1, ".txt"))
775
  
776
}
777
778
mclapply(X = seq(1, nrow(temp), by = 100000), FUN = dummification,
779
         mc.cores = 6, mc.cleanup = T)
780
781
782
783
ccle_shrna_seqs <- ccle_shrna_seqs[, 1:2]
784
ccle_shrna_seqs <- unique(ccle_shrna_seqs)
785
all_merger <- merge(ccle_shrna_seqs, sh_map, by = "shRNA")
786
787
length(unique(ccle_shrna_seqs$shRNA))
788
# Add cell line to sh map
789
temp <- unique(ccle_shrna_seqs[, 1:2])
790
temp <- unique(temp)
791
sh_map <- unique(sh_map)
792
# Put all genes targeted by an shRNA in the same column
793
sh_map[, all_genes := paste0(Gene, collapse = ","), by = shRNA]
794
which(sh_map$Gene != sh_map$all_genes)
795
sh_map[27:28,]
796
sh_map[, Gene := NULL]
797
sh_map <- unique(sh_map)
798
799
sh_map$shRNA <- factor(sh_map$shRNA)
800
sh_map$Gene <- factor(sh_map$Gene)
801
802
sh_line_gene <- merge(temp, sh_map, by = "shRNA")
803
sh_line_gene$all_genes <- factor(sh_line_gene$all_genes)
804
sh_line_gene$shRNA <- factor(sh_line_gene$shRNA)
805
806
temp <- sh_line_gene[, strsplit(as.character(all_genes), ",", fixed=TRUE),
807
   by = .(shRNA, cell_line, all_genes)][, all_genes := NULL][
808
     , setnames(.SD, "shRNA", "cell_line", "Genes")]
809
810
811
merge_sub <- fread("Data/RNAi/Train_Data/transcripts_alt_and_ref.txt")
812
merge_sub <- merge_sub[Variant_Type == "SNP"]
813
colnames(merge_sub)
814
merge_sub <- merge_sub[ref_len < 20326]
815
816
# Separate each cell line shRNA data
817
line <- "TEN"
818
dir.create("Data/RNAi/Train_Data/shRNA_by_line_and_target")
819
all_lines <- unique(ccle_shrna_seqs$cell_line)
820
for (line in all_lines) {
821
  # cur_sub <- sh_map[cell_line == line]
822
  cur_sub <- sh_map
823
  sh_map$ENTREZ <- factor(sh_map$ENTREZ)
824
  sh_map$shRNA <- factor(sh_map$shRNA)
825
  sh_map[, HGNC := NULL]
826
  levels(sh_map$ENTREZ)
827
  cur_dummy <- dummyVars(formula = '~ ENTREZ', data = sh_map,
828
                         fullRank = T, sep = "_", levelsOnly = F)
829
  temp <- data.frame(predict(cur_dummy, sh_map[1:100,]))
830
  dim(temp)
831
  temp[1:5, 10:15]
832
  fwrite(cur_sub, paste0("Data/RNAi/Train_Data/shRNA_by_line/", line, ".txt"))
833
}
834
835
when <- data.frame(time = c("afternoon", "night", "afternoon",
836
                            "morning", "morning", "morning",
837
                            "morning", "afternoon", "afternoon"),
838
                   day = c("Mon", "Mon", "Mon",
839
                           "Wed", "Wed", "Fri",
840
                           "Sat", "Sat", "Fri"))
841
842
levels(when$time) <- list(morning="morning",
843
                          afternoon="afternoon",
844
                          night="night")
845
levels(when$day) <- list(Mon="Mon", Tue="Tue", Wed="Wed", Thu="Thu",
846
                         Fri="Fri", Sat="Sat", Sun="Sun")
847
model.matrix(~day, when)
848
849
interactionModel <- dummyVars(~ day + time + day:time,
850
                              data = when,
851
                              sep = ".")
852
predict(interactionModel, when[1:3,])
853
noNames <- dummyVars(~ day + time + day:time,
854
                     data = when,
855
                     levelsOnly = TRUE)
856
857
858
859
860
sh_onehot <- dummyVars(formula = " shRNA~ . ", data = sh_map)
861
temp <- data.table(predict(sh_onehot, sh_map[1:10000, ]), keep.rownames = T)
862
dim(temp)
863
temp[1:5, 1:5]
864
rownames(temp)[1:5]
865
866
View(sh_map)
867
868