|
a |
|
b/R/Read_Achilles_DRIVE_RNAi.R |
|
|
1 |
# Read_Achilles_DRIVE_RNAi.R |
|
|
2 |
# install.packages("fastmatch") |
|
|
3 |
library(data.table) |
|
|
4 |
library(fastmatch) |
|
|
5 |
library(cmapR) |
|
|
6 |
if (!require(biomaRt)) { |
|
|
7 |
BiocManager::install("biomaRt", version = "3.8") |
|
|
8 |
library(biomaRt) |
|
|
9 |
} |
|
|
10 |
library(HGNChelper) |
|
|
11 |
if (!require(Biostrings)) { |
|
|
12 |
BiocManager::install("Biostrings", version = "3.8") |
|
|
13 |
library(Biostrings) |
|
|
14 |
} |
|
|
15 |
# cur_map <- getCurrentHumanMap() |
|
|
16 |
# saveRDS(cur_map, "Data/HGNC_Map.rds") |
|
|
17 |
cur_map <- readRDS("Data/HGNC_Map.rds") |
|
|
18 |
|
|
|
19 |
# ==== Expression Data ==== |
|
|
20 |
ccle_rna <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_mRNA_expression.csv") |
|
|
21 |
ccle_rna[1:5, 1:5] |
|
|
22 |
dim(ccle_rna) |
|
|
23 |
# ccle_drug_data <- fread("Data/DepMap/CCLE/CCLE_NP24.2009_Drug_data_2015.02.24.csv") |
|
|
24 |
ccle_line_info <- fread("Data/DepMap/20Q1/sample_info.csv") |
|
|
25 |
|
|
|
26 |
# Merge DepMap_ID and CCLE_Name with RNA data |
|
|
27 |
ccle_rna_with_names <- merge(ccle_line_info[, c("DepMap_ID", "CCLE_Name")], |
|
|
28 |
ccle_rna, by.x = "DepMap_ID", by.y = "V1") |
|
|
29 |
# Get tumor type and subtype |
|
|
30 |
ccle_sub_info <- ccle_line_info[match(ccle_rna_with_names$CCLE_Name, ccle_line_info$CCLE_Name),] |
|
|
31 |
ccle_sub_info <- ccle_sub_info[, c("CCLE_Name", "disease", "disease_subtype")] |
|
|
32 |
|
|
|
33 |
ccle_rna_with_names$cell_line <- gsub("_.*", "", ccle_rna_with_names$CCLE_Name) |
|
|
34 |
ccle_sub_info$cell_line <- gsub("_.*", "", ccle_sub_info$CCLE_Name) |
|
|
35 |
ccle_rna_with_names[, c("DepMap_ID", "CCLE_Name") := NULL] |
|
|
36 |
ccle_sub_info[, CCLE_Name := NULL] |
|
|
37 |
|
|
|
38 |
setcolorder(ccle_rna_with_names, c("cell_line", |
|
|
39 |
colnames(ccle_rna_with_names)[-ncol(ccle_rna_with_names)])) |
|
|
40 |
|
|
|
41 |
dim(ccle_sub_info) |
|
|
42 |
dim(ccle_rna_with_names) |
|
|
43 |
all(ccle_rna_with_names$cell_line == ccle_sub_info$cell_line) # TRUE |
|
|
44 |
# colnames(ccle_rna_with_names)[ncol(ccle_rna_with_names)] |
|
|
45 |
length(unique(ccle_sub_info$disease)) |
|
|
46 |
unique(ccle_sub_info$disease) |
|
|
47 |
unique(ccle_sub_info$disease_subtype) |
|
|
48 |
anyNA(ccle_sub_info) |
|
|
49 |
|
|
|
50 |
fwrite(ccle_sub_info, "Data/RNAi/Train_Data/ccle_line_info.txt") |
|
|
51 |
fwrite(ccle_rna_with_names, "Data/RNAi/Train_Data/ccle_exp_data.txt") |
|
|
52 |
|
|
|
53 |
ccle_long_exp <- melt.data.table(ccle_rna_with_names, id.vars = "cell_line", |
|
|
54 |
measure.vars = colnames(ccle_rna_with_names)[-1], |
|
|
55 |
value.name = "exp") |
|
|
56 |
colnames(ccle_long_exp)[2] <- "gene_name" |
|
|
57 |
fwrite(ccle_long_exp, "Data/RNAi/Train_Data/ccle_long_exp.txt") |
|
|
58 |
|
|
|
59 |
# Get expression data gene sequences |
|
|
60 |
ccle_rna_with_names <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt") |
|
|
61 |
anyDuplicated(colnames(ccle_rna_with_names)) |
|
|
62 |
cur_genes <- colnames(ccle_rna_with_names)[-1] |
|
|
63 |
cur_genes <- gsub(".+ \\((ENSG\\d+)\\)", "\\1", cur_genes) |
|
|
64 |
anyDuplicated(cur_genes) # None |
|
|
65 |
grch37 <- useMart(host = "http://grch37.ensembl.org", |
|
|
66 |
biomart='ENSEMBL_MART_ENSEMBL', |
|
|
67 |
dataset='hsapiens_gene_ensembl') |
|
|
68 |
biomaRt::searchAttributes(grch37, pattern = "gene") |
|
|
69 |
# Get the transcripts of the current genes |
|
|
70 |
if (!require(TxDb.Hsapiens.UCSC.hg38.knownGene)) { |
|
|
71 |
BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene") |
|
|
72 |
library(TxDb.Hsapiens.UCSC.hg38.knownGene) |
|
|
73 |
} |
|
|
74 |
if (!require(BSgenome.Hsapiens.UCSC.hg38)) { |
|
|
75 |
BiocManager::install("BSgenome.Hsapiens.UCSC.hg38") |
|
|
76 |
library(BSgenome.Hsapiens.UCSC.hg38) |
|
|
77 |
} |
|
|
78 |
if (!require(AnnotationHub)) { |
|
|
79 |
BiocManager::install("AnnotationHub") |
|
|
80 |
library(AnnotationHub) |
|
|
81 |
} |
|
|
82 |
|
|
|
83 |
# Get gene boundaries from |
|
|
84 |
ahub <- AnnotationHub() |
|
|
85 |
qh <- query(ahub, c("hg38", "transcript", "homo sapiens")) |
|
|
86 |
genes <- qh[["AH5046"]] |
|
|
87 |
anyDuplicated(genes$name) |
|
|
88 |
|
|
|
89 |
hg19 <- BSgenome.Hsapiens.UCSC.hg19 |
|
|
90 |
# cdsBy(txdb, by = "tx", use.names = T) |
|
|
91 |
tran_seqs <- extractTranscriptSeqs(hg19, txdb, use.names = T) |
|
|
92 |
length(unique(names(tran_seqs))) |
|
|
93 |
keytypes(txdb) |
|
|
94 |
# Get ENTREZ IDs |
|
|
95 |
gene_ids <- select(txdb, keys = names(tran_seqs), columns="GENEID", keytype="TXNAME") |
|
|
96 |
length(unique(gene_ids$GENEID)) |
|
|
97 |
biomaRt::searchAttributes(grch37, pattern = "entrez") |
|
|
98 |
# Translate to ENSG and ENST |
|
|
99 |
ensembl_ids <- getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id", "transcript_length", "entrezgene"), |
|
|
100 |
filters = "entrezgene", values = gene_ids$GENEID, mart = grch37) |
|
|
101 |
# Get ENST of genes in CCLE |
|
|
102 |
ccle_genes <- gsub(".+ \\((ENSG\\d+)\\)", "\\1", colnames(ccle_rna_with_names)[-1]) |
|
|
103 |
sum(ccle_genes %in% ensembl_ids$ensembl_gene_id) |
|
|
104 |
length(unique(ccle_genes)) |
|
|
105 |
length(unique(ensembl_ids$ensembl_gene_id)) |
|
|
106 |
length(unique(ensembl_ids$ensembl_transcript_id)) |
|
|
107 |
sum(duplicated(ensembl_ids$ensembl_gene_id)) |
|
|
108 |
ccle_match <- ensembl_ids[ensembl_ids$ensembl_gene_id %in% ccle_genes, ] |
|
|
109 |
length(unique(ccle_match$ensembl_transcript_id)) |
|
|
110 |
length(unique(ccle_match$ensembl_gene_id)) |
|
|
111 |
|
|
|
112 |
length(unique(merge_seq$Annotation_Transcript)) |
|
|
113 |
merge_seq$Tumor_Sample_Barcode <- gsub("\\_.+", "", merge_seq$Tumor_Sample_Barcode) |
|
|
114 |
length(unique(merge_seq$Tumor_Sample_Barcode)) |
|
|
115 |
length(unique(ccle_rna_with_names$cell_line)) |
|
|
116 |
|
|
|
117 |
sum(drive_cells %in% merge_seq$Tumor_Sample_Barcode) / length(drive_cells) |
|
|
118 |
|
|
|
119 |
ensembl_trans <- Biostrings::readDNAStringSet("Data/RNAi/Train_Data/Homo_sapiens.GRCh37.cdna.all.fa") |
|
|
120 |
names(ensembl_trans)[1] |
|
|
121 |
tran_names <- gsub(" .*", "", names(ensembl_trans)) |
|
|
122 |
gene_names <- gsub(".*gene:(ENSG\\d+) .*", "\\1", names(ensembl_trans)) |
|
|
123 |
|
|
|
124 |
names(ensembl_trans) <- gsub("\\.\\d+", "", names(ensembl_trans)) |
|
|
125 |
sum(ensembl_ids$ensembl_transcript_id %in% tran_names) / length(ensembl_ids$ensembl_transcript_id) |
|
|
126 |
sum(unique(gene_names) %in% ccle_genes) |
|
|
127 |
# All mutated transcripts are available in the data |
|
|
128 |
sum(unique(merge_seq$Annotation_Transcript) %in% tran_names) /length(unique(merge_seq$Annotation_Transcript)) |
|
|
129 |
|
|
|
130 |
names(ensembl_trans) <- tran_names |
|
|
131 |
sub_trans <- ensembl_trans[names(ensembl_trans) %in% merge_seq$Annotation_Transcript] |
|
|
132 |
|
|
|
133 |
merge_seq |
|
|
134 |
|
|
|
135 |
|
|
|
136 |
|
|
|
137 |
promoters(genes) |
|
|
138 |
transcripts(genes) |
|
|
139 |
trans_seqs <- Views(hg19, genes$blocks) |
|
|
140 |
|
|
|
141 |
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene |
|
|
142 |
txdb |
|
|
143 |
|
|
|
144 |
all_trans <- transcriptsBy(txdb, by = c("gene")) |
|
|
145 |
all_trans <- transcripts(txdb) |
|
|
146 |
head(all_trans) |
|
|
147 |
|
|
|
148 |
gene_transcripts = getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id"), |
|
|
149 |
filters = "ensembl_gene_id", values = cur_genes, mart = grch37) |
|
|
150 |
anyDuplicated(gene_transcripts$ensembl_transcript_id) |
|
|
151 |
transcript_seqs <- getSequence(id = gene_transcripts$ensembl_transcript_id, |
|
|
152 |
type = "ensembl_transcript_id", seqType = "cdna", |
|
|
153 |
mart = grch37) |
|
|
154 |
dim(transcript_seqs) |
|
|
155 |
|
|
|
156 |
getSequence() |
|
|
157 |
|
|
|
158 |
|
|
|
159 |
# ==== CCLE Mutation Data ==== |
|
|
160 |
ccle_mutation <- fread("Data/RNAi/CCLE_mutation_data.csv") |
|
|
161 |
unique(ccle_mutation$Variant_Classification) |
|
|
162 |
# length(unique(ccle_mutation$Hugo_Symbol)) |
|
|
163 |
# length(unique(ccle_mutation$Annotation_Transcript)) |
|
|
164 |
cur_mutation_data <- ccle_mutation[, c("Variant_Classification", "Hugo_Symbol", "Tumor_Sample_Barcode")] |
|
|
165 |
cur_mutation_data$cell_name <- gsub(pattern = "_.*", replacement = "", |
|
|
166 |
x = cur_mutation_data$Tumor_Sample_Barcode) |
|
|
167 |
cur_mutation_data[, Tumor_Sample_Barcode := NULL] |
|
|
168 |
cur_mutation_data <- unique(cur_mutation_data) |
|
|
169 |
cur_mutation_data$Variant_Labels <- as.numeric(as.factor(cur_mutation_data$Variant_Classification)) |
|
|
170 |
cur_mutation_data[, Variant_Classification := NULL] |
|
|
171 |
colnames(cur_mutation_data)[2] <- "cell_line" |
|
|
172 |
fwrite(cur_mutation_data, "Data/RNAi/Train_Data/labeled_ccle_mut_data.txt") |
|
|
173 |
|
|
|
174 |
dcast_mut_data <- dcast.data.table(cur_mutation_data, |
|
|
175 |
cell_line ~ Hugo_Symbol + Variant_Labels, |
|
|
176 |
drop = F, # Ensures all possible combinations are kept |
|
|
177 |
fill = 0, fun.aggregate = length) |
|
|
178 |
sum(dcast_mut_data$A1CF_6) |
|
|
179 |
fwrite(dcast_mut_data, "Data/RNAi/Train_Data/simple_ccle_mut_data.txt") |
|
|
180 |
|
|
|
181 |
dcast_mut_data <- dcast.data.table(cur_mutation_data, |
|
|
182 |
cell_line + Hugo_Symbol ~ Variant_Labels, |
|
|
183 |
drop = F, # Ensures all possible combinations are kept |
|
|
184 |
fill = 0, fun.aggregate = length) |
|
|
185 |
fwrite(dcast_mut_data, "Data/RNAi/Train_Data/long_ccle_mut_data.txt") |
|
|
186 |
|
|
|
187 |
|
|
|
188 |
### Generate per gene features |
|
|
189 |
length(unique(ccle_mutation$Hugo_Symbol)) |
|
|
190 |
length(unique(ccle_mutation$Annotation_Transcript)) |
|
|
191 |
# Check and update current HUGO symbols |
|
|
192 |
cur_hgnc <- HGNChelper::checkGeneSymbols(x = ccle_mutation$Hugo_Symbol, map = cur_map) |
|
|
193 |
ccle_mutation$updated_hugo <- cur_hgnc$Suggested.Symbol |
|
|
194 |
# Get gene lengths from biomaRt |
|
|
195 |
hugo_list <- unique(ccle_mutation$updated_hugo) |
|
|
196 |
enst_list <- ccle_mutation$Annotation_Transcript |
|
|
197 |
enst_list <- gsub("\\.\\d+", "", enst_list) |
|
|
198 |
ccle_mutation$Annotation_Transcript <- enst_list |
|
|
199 |
|
|
|
200 |
|
|
|
201 |
hugo_list <- sort(hugo_list) |
|
|
202 |
nchar(hugo_list[1:5]) |
|
|
203 |
grch37 <- useMart(host = "http://grch37.ensembl.org", |
|
|
204 |
biomart='ENSEMBL_MART_ENSEMBL', |
|
|
205 |
dataset='hsapiens_gene_ensembl') |
|
|
206 |
biomaRt::searchAttributes(grch37, pattern = "length") |
|
|
207 |
biomaRt::searchAttributes(grch37, pattern = "sequence") |
|
|
208 |
# t1 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "coding", mart = grch37) |
|
|
209 |
# t2 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "transcript_exon_intron", mart = grch37) |
|
|
210 |
# t3 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "gene_exon_intron", mart = grch37) |
|
|
211 |
# t4 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "gene_exon", mart = grch37) |
|
|
212 |
# t5 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "cdna", mart = grch37) |
|
|
213 |
# t6 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "3utr", mart = grch37) |
|
|
214 |
# t7 <- getSequence(id = gene_coords$ensembl_transcript_id[2], type = "ensembl_transcript_id", seqType = "5utr", mart = grch37) |
|
|
215 |
# |
|
|
216 |
# t1$coding == t2$transcript_exon_intron |
|
|
217 |
# nchar(t2$transcript_exon_intron) |
|
|
218 |
# nchar(t4$gene_exon) |
|
|
219 |
# nchar(t5$cdna) |
|
|
220 |
# nchar(t6$`3utr`) + nchar(t1$coding) + nchar(t7$`5utr`) |
|
|
221 |
|
|
|
222 |
gene_coords = getBM(attributes = c("ensembl_transcript_id", "transcript_length"), |
|
|
223 |
filters = "ensembl_transcript_id", values = enst_list, mart = grch37) |
|
|
224 |
gene_coords <- as.data.table(gene_coords) |
|
|
225 |
|
|
|
226 |
transcript_seqs <- getSequence(id = gene_coords$ensembl_transcript_id, |
|
|
227 |
type = "ensembl_transcript_id", seqType = "cdna", |
|
|
228 |
mart = grch37) |
|
|
229 |
transcript_seqs <- as.data.table(transcript_seqs) |
|
|
230 |
# Save |
|
|
231 |
fwrite(transcript_seqs, "Data/RNAi/Train_Data/transcript_seqs.txt") |
|
|
232 |
transcript_seqs <- fread("Data/RNAi/Train_Data/transcript_seqs.txt") |
|
|
233 |
colnames(transcript_seqs) |
|
|
234 |
head(transcript_seqs$ensembl_transcript_id) |
|
|
235 |
head(ccle_mutation$Annotation_Transcript) |
|
|
236 |
# Merge sequences with mutational data |
|
|
237 |
merge_seq <- merge(ccle_mutation[, c("Annotation_Transcript", |
|
|
238 |
"cDNA_Change", "Variant_Type", |
|
|
239 |
"Tumor_Sample_Barcode")], |
|
|
240 |
transcript_seqs, |
|
|
241 |
by.x = "Annotation_Transcript", by.y = "ensembl_transcript_id") |
|
|
242 |
colnames(merge_seq)[5] <- "ref_cdna" |
|
|
243 |
merge_seq$alt_cdna <- merge_seq$ref_cdna |
|
|
244 |
dim(merge_seq) |
|
|
245 |
|
|
|
246 |
library(stringr) |
|
|
247 |
mutations <- merge_seq$cDNA_Change |
|
|
248 |
vartypes <- merge_seq$Variant_Type |
|
|
249 |
cur_mutations <- gsub("c\\.", "", mutations[vartypes == "SNP"]) |
|
|
250 |
snps <- gsub("(\\d+)(\\w)(>)(\\w)", "\\1 \\2 \\4", cur_mutations) |
|
|
251 |
snp_split <- str_split(snps, " ", simplify = T) |
|
|
252 |
subseq(merge_seq[vartypes == "SNP"]$alt_cdna, |
|
|
253 |
as.integer(snp_split[, 1]), as.integer(snp_split[, 1])) <- snp_split[, 3] |
|
|
254 |
sum(merge_seq$ref_cdna == merge_seq$alt_cdna) / nrow(merge_seq) |
|
|
255 |
|
|
|
256 |
cur_mutations <- gsub("c\\.", "", mutations[vartypes == "INS"]) |
|
|
257 |
ins <- gsub("(\\d+)(\\_)(\\d+)ins(\\w+)", "\\1 \\3 \\4", cur_mutations) |
|
|
258 |
ins_split <- str_split(ins, " ", simplify = T) |
|
|
259 |
# For insertion in a sequence, the second position is 1 minus the first position |
|
|
260 |
subseq(merge_seq[vartypes == "INS"]$alt_cdna, |
|
|
261 |
as.integer(ins_split[, 1]), as.integer(ins_split[, 1])-1) <- ins_split[, 3] |
|
|
262 |
sum(merge_seq$ref_cdna == merge_seq$alt_cdna) / nrow(merge_seq) |
|
|
263 |
|
|
|
264 |
cur_mutations <- gsub("c\\.", "", mutations[vartypes == "DEL"]) |
|
|
265 |
dels <- gsub("(\\d+)\\_+(\\d+)*del\\w+", "\\1 \\2", cur_mutations) |
|
|
266 |
dels <- gsub("(\\d+)del\\w+", "\\1", dels) |
|
|
267 |
|
|
|
268 |
dels_split <- str_split(dels, " ", simplify = T) |
|
|
269 |
# For deletion in a sequence, copy the first position if there only a single deletion |
|
|
270 |
dels_split[dels_split[, 2] == "", 2] <- dels_split[dels_split[, 2] == "", 1] |
|
|
271 |
subseq(merge_seq[vartypes == "DEL"]$alt_cdna, |
|
|
272 |
as.integer(dels_split[, 1]), as.integer(dels_split[, 2])) <- "" |
|
|
273 |
sum(merge_seq$ref_cdna == merge_seq$alt_cdna) / nrow(merge_seq) |
|
|
274 |
|
|
|
275 |
# The remaining variants are outside the transcript, but may affect the transcript indirectly, |
|
|
276 |
# e.g. splice site variants |
|
|
277 |
anyNA(merge_seq$cDNA_Change) |
|
|
278 |
sum(merge_seq$cDNA_Change == "") |
|
|
279 |
ccle_mutation[cDNA_Change == ""] |
|
|
280 |
|
|
|
281 |
# Remove invalid cDNA changes that result in empty alt seqs |
|
|
282 |
sum(merge_seq$alt_len == 0) |
|
|
283 |
sum(merge_seq$cDNA_Change == "") |
|
|
284 |
merge_seq <- merge_seq[cDNA_Change != ""] |
|
|
285 |
merge_seq[, ref_len := nchar(ref_cdna), by = "Annotation_Transcript"] |
|
|
286 |
colnames(merge_seq) |
|
|
287 |
# Save |
|
|
288 |
fwrite(merge_seq, "Data/RNAi/Train_Data/transcripts_alt_and_ref.txt") |
|
|
289 |
|
|
|
290 |
merge_seq <- fread("Data/RNAi/Train_Data/transcripts_alt_and_ref.txt") |
|
|
291 |
colnames(merge_seq) |
|
|
292 |
(sum(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len) / 8) * 32 * 4 / 1e6 |
|
|
293 |
|
|
|
294 |
rm(list = c("dels", "dels_split", "ins", "ins_split", "snps", "snp_split")) |
|
|
295 |
|
|
|
296 |
merge_seq <- fread("Data/RNAi/Train_Data/transcripts_alt_and_ref.txt") |
|
|
297 |
colnames(merge_seq) |
|
|
298 |
length(unique(merge_seq[,Tumor_Sample_Barcode])) |
|
|
299 |
|
|
|
300 |
# Save only the transcripts as well |
|
|
301 |
library(data.table) |
|
|
302 |
alt_transcripts <- unique(merge_seq[, alt_cdna]) |
|
|
303 |
ref_transcripts <- unique(merge_seq[, ref_cdna]) |
|
|
304 |
length(alt_transcripts) |
|
|
305 |
length(ref_transcripts) |
|
|
306 |
all_transcripts <- |
|
|
307 |
rbindlist(list(data.table(alt_transcripts), data.table(ref_transcripts)), use.names = F) |
|
|
308 |
dim(all_transcripts) |
|
|
309 |
colnames(all_transcripts) <- "transcript" |
|
|
310 |
all_transcripts <- unique(all_transcripts) |
|
|
311 |
all_transcripts[, len := nchar(transcript)] |
|
|
312 |
# Remove longest 1% transcripts |
|
|
313 |
quantile(unique(all_transcripts$len), 0.99) |
|
|
314 |
all_transcripts <- all_transcripts[len < 20326] |
|
|
315 |
# Order by increasing length (want to train on smaller sequences first) |
|
|
316 |
data.table::setorder(all_transcripts, -len) |
|
|
317 |
# all_transcripts$len |
|
|
318 |
|
|
|
319 |
library(data.table) |
|
|
320 |
fwrite(unique(all_transcripts[, 'transcript']), "Data/RNAi/Train_Data/all_transcript_seqs.txt") |
|
|
321 |
all_transcripts <- fread("Data/RNAi/Train_Data/all_transcript_seqs.txt") |
|
|
322 |
all_transcripts |
|
|
323 |
|
|
|
324 |
nrow(all_transcripts) |
|
|
325 |
|
|
|
326 |
unique(merge_seq[is.nan(cDNA_Change)]) |
|
|
327 |
which(is.na(merge_seq$alt_cdna)) |
|
|
328 |
merge_seq[34,] |
|
|
329 |
merge_seq[Annotation_Transcript == "ENST00000001146"] |
|
|
330 |
sum(merge) |
|
|
331 |
sum(merge_seq$ref_len == 0) |
|
|
332 |
colnames(merge_seq) |
|
|
333 |
length(unique(merge_seq[Variant_Type == "SNP"]$Annotation_Transcript)) |
|
|
334 |
merge_seq[, ref_len := nchar(ref_cdna), by = "Annotation_Transcript"] |
|
|
335 |
merge_seq[, alt_len := nchar(alt_cdna), by = "Annotation_Transcript"] |
|
|
336 |
merge_seq[Variant_Type == "SNP", "Annotation_Transcript"] |
|
|
337 |
sum(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len) |
|
|
338 |
max(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len) |
|
|
339 |
min(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len) |
|
|
340 |
median(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len) |
|
|
341 |
plot(unique(merge_seq[, c("Annotation_Transcript", "ref_len")])$ref_len) |
|
|
342 |
|
|
|
343 |
sum(nchar(merge_seq$alt_cdna)) |
|
|
344 |
|
|
|
345 |
|
|
|
346 |
colnames(merge_seq) |
|
|
347 |
unique(ccle_mutation$Variant_Type) |
|
|
348 |
|
|
|
349 |
mean(gene_coords$transcript_length) |
|
|
350 |
median(gene_coords$transcript_length) |
|
|
351 |
max(gene_coords$transcript_length) |
|
|
352 |
min(gene_coords$transcript_length) |
|
|
353 |
|
|
|
354 |
gene_coords[, size := as.integer(mean(size)), by = "hgnc_symbol"] |
|
|
355 |
# Update CCLE mutation data |
|
|
356 |
ccle_mutation$Annotation_Transcript <- gsub("\\.\\d+", "", ccle_mutation$Annotation_Transcript) |
|
|
357 |
updated_ccle_mutation <- merge(ccle_mutation, gene_coords, |
|
|
358 |
by.x = "Annotation_Transcript", |
|
|
359 |
by.y = "ensembl_transcript_id") |
|
|
360 |
|
|
|
361 |
ccle_mut <- updated_ccle_mutation # make copy |
|
|
362 |
# Number of silent mutations per gene divided by length |
|
|
363 |
all_var_types <- unique(ccle_mut$Variant_Classification) |
|
|
364 |
unique(ccle_mut$Variant_Type) |
|
|
365 |
# c("Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "Frame_Shift_Del", "Frame_Shift_Ins") |
|
|
366 |
for (var_type in c("SNP", "DEL", "INS")) { |
|
|
367 |
ccle_mut[Variant_Type == var_type, |
|
|
368 |
paste0("Num_", var_type) := nrow(.SD), |
|
|
369 |
by = c("Annotation_Transcript", "Tumor_Sample_Barcode")] |
|
|
370 |
} |
|
|
371 |
ncol(ccle_mut) |
|
|
372 |
# Replace NAs with 0 |
|
|
373 |
ccle_mut[, 34:36][is.na(ccle_mut[, 34:36, with = F])] <- 0 |
|
|
374 |
|
|
|
375 |
mut_sub <- as.data.table(ccle_mut[, c("Annotation_Transcript", "Tumor_Sample_Barcode", "Num_SNP", "Num_DEL", "Num_INS")]) |
|
|
376 |
mut_sub$cell_line <- gsub("\\_.+", "", mut_sub$Tumor_Sample_Barcode) |
|
|
377 |
setkey(mut_sub, "cell_line") |
|
|
378 |
|
|
|
379 |
fwrite(mut_sub[, -2], "ccle_muts_by_type.txt") |
|
|
380 |
|
|
|
381 |
# Get original transcript sequence |
|
|
382 |
human <- useMart("ensembl", dataset="hsapiens_gene_ensembl") |
|
|
383 |
listMarts(host = "http://grch37.ensembl.org") |
|
|
384 |
listEnsemblArchives() |
|
|
385 |
listDatasets(human) |
|
|
386 |
|
|
|
387 |
listDatasets(grch37) |
|
|
388 |
|
|
|
389 |
biomaRt::searchAttributes(human, pattern = "length") |
|
|
390 |
biomaRt::searchAttributes(human, pattern = "transcript") |
|
|
391 |
|
|
|
392 |
|
|
|
393 |
# ==== DRIVE Data ==== |
|
|
394 |
drive_count <- readRDS("Data/RNAi/drive-raw-data/DriveCountData.RDS") |
|
|
395 |
drive_count <- as.data.table(drive_count) |
|
|
396 |
unique(drive_count$CLEANNAME) |
|
|
397 |
drive_cells <- toupper(unique(drive_count$CLEANNAME)) |
|
|
398 |
|
|
|
399 |
shrna_map <- readRDS("Data/RNAi/drive-raw-data/ShrnaGeneMap.RDS") |
|
|
400 |
shrna_map <- as.data.table(shrna_map) |
|
|
401 |
length(unique(drive_count$EXPERIMENT_ID)) |
|
|
402 |
length(unique(shrna_map$GENESYMBOLS)) |
|
|
403 |
|
|
|
404 |
|
|
|
405 |
all_merged <- merge(drive_count, shrna_map, by = "SEQ") |
|
|
406 |
|
|
|
407 |
# Get DRIVE cell line mutation data from CCLE |
|
|
408 |
drive_mutation <- cur_mutation_data[cell_name %in% drive_cells] |
|
|
409 |
|
|
|
410 |
# Get DRIVE cell line expression data from CCLE |
|
|
411 |
ccle_rna_with_names$CCLE_Name |
|
|
412 |
drive_rna <- ccle_rna_with_names[CCLE_Short_Name %in% drive_cells] |
|
|
413 |
dim(drive_rna) |
|
|
414 |
|
|
|
415 |
# Order shRNA screens with mutation and expression data |
|
|
416 |
all_merged[PLASMID_COUNT > 0, num_shrna_exp := .N, by = "EXPERIMENT_ID"] |
|
|
417 |
|
|
|
418 |
temp_exp <- all_merged[EXPERIMENT_ID == 1004182] |
|
|
419 |
temp <- temp_exp[SAMPLE_COUNT > 1000 & !is.na(GENESYMBOLS)] |
|
|
420 |
length(unique(temp$GENESYMBOLS)) |
|
|
421 |
all_merged |
|
|
422 |
|
|
|
423 |
all_merged[POOL == "BGPD" & !is.na(PLASMID_COUNT)] |
|
|
424 |
drive_rna[1:5, 1:5] |
|
|
425 |
|
|
|
426 |
|
|
|
427 |
# ==== Read LFCs ==== |
|
|
428 |
achilles_batch_1 <- fread("Data/RNAi/LFCs/achilles55kbatch1repcollapsedlfc.csv") |
|
|
429 |
achilles_batch_2 <- fread("Data/RNAi/LFCs/achilles55kbatch2repcollapsedlfc.csv") |
|
|
430 |
achilles_batch_3 <- fread("Data/RNAi/LFCs/achilles98krepcollapsedlfc.csv") |
|
|
431 |
colnames(achilles_batch_1)[-1] <- gsub("_.*", "", colnames(achilles_batch_1)[-1]) |
|
|
432 |
colnames(achilles_batch_2)[-1] <- gsub("_.*", "", colnames(achilles_batch_2)[-1]) |
|
|
433 |
colnames(achilles_batch_3)[-1] <- gsub("_.*", "", colnames(achilles_batch_3)[-1]) |
|
|
434 |
all_achilles <- Reduce(function(...) merge(..., all = T, by = "V1"), list(achilles_batch_1, achilles_batch_2, achilles_batch_3)) |
|
|
435 |
dim(all_achilles) |
|
|
436 |
all_molten <- melt(data = all_achilles, id.vars = "V1", |
|
|
437 |
measure.vars = colnames(all_achilles)[-1], |
|
|
438 |
variable.name = "cell_line", value.name = "lfc") |
|
|
439 |
colnames(all_molten)[1] <- "shRNA" |
|
|
440 |
|
|
|
441 |
anyNA(all_achilles) |
|
|
442 |
anyNA(all_molten$shRNA) |
|
|
443 |
anyNA(all_molten$cell_line) |
|
|
444 |
anyNA(all_molten$lfc) |
|
|
445 |
all_molten <- all_molten[!is.na(lfc),] |
|
|
446 |
setkey(all_molten, "cell_line") |
|
|
447 |
fwrite(all_molten, "Data/RNAi/Train_Data/achilles_shrna_cell_lfc.txt") |
|
|
448 |
rm(list("achilles_batch_1", "achilles_batch_2", "achilles_batch_3")) |
|
|
449 |
|
|
|
450 |
# Match cell line expression values |
|
|
451 |
# ccle_rna_with_names[fmatch(all_molten$cell_line, ccle_rna_with_names$CCLE_Short_Name),] |
|
|
452 |
|
|
|
453 |
# Read DRIVE data |
|
|
454 |
drive_bgpd <- fread("Data/RNAi/LFCs/drivebgpdlfcmat.csv") |
|
|
455 |
drive_a <- fread("Data/RNAi/LFCs/drivepoolalfcmat.csv") |
|
|
456 |
drive_b <- fread("Data/RNAi/LFCs/drivepoolblfcmat.csv") |
|
|
457 |
|
|
|
458 |
dim(drive_bgpd) |
|
|
459 |
dim(drive_a) |
|
|
460 |
dim(drive_b) |
|
|
461 |
colnames(drive_bgpd)[-1] <- gsub("_.*", "", colnames(drive_bgpd)[-1]) |
|
|
462 |
colnames(drive_a)[-1] <- gsub("_.*", "", colnames(drive_a)[-1]) |
|
|
463 |
colnames(drive_b)[-1] <- gsub("_.*", "", colnames(drive_b)[-1]) |
|
|
464 |
|
|
|
465 |
all_drive <- Reduce(function(...) merge(..., all = T, by = "V1"), list(drive_bgpd, drive_a, drive_b)) |
|
|
466 |
dim(all_drive) |
|
|
467 |
|
|
|
468 |
all_drive[, c("U87MG.y", "U87MG.x")] |
|
|
469 |
drive_molten <- melt(data = all_drive, id.vars = "V1", |
|
|
470 |
measure.vars = colnames(all_drive)[-1], |
|
|
471 |
variable.name = "cell_line", value.name = "lfc") |
|
|
472 |
drive_molten <- drive_molten[!is.na(lfc),] |
|
|
473 |
drive_molten$cell_line <- gsub("\\.x", "", drive_molten$cell_line) |
|
|
474 |
unique(drive_molten$cell_line) |
|
|
475 |
drive_molten$cell_line <- gsub("\\.y", "", drive_molten$cell_line) |
|
|
476 |
unique(drive_molten$cell_line) |
|
|
477 |
unique(drive_molten[, c("V1", "cell_line")]) |
|
|
478 |
# Average lfc by cell line for each shRNA |
|
|
479 |
drive_molten[, avg_lfc := mean(lfc), by = c("V1", "cell_line")] |
|
|
480 |
drive_molten[, lfc := NULL] |
|
|
481 |
colnames(drive_molten) <- c("shRNA", "cell_line", "lfc") |
|
|
482 |
drive_molten <- unique(drive_molten) |
|
|
483 |
setkey(drive_molten, "cell_line") |
|
|
484 |
|
|
|
485 |
fwrite(drive_molten, "Data/RNAi/Train_Data/drive_shrna_cell_lfc.txt") |
|
|
486 |
|
|
|
487 |
|
|
|
488 |
# Read Marcotte Data |
|
|
489 |
marcotte <- fread("Data/RNAi/LFCs/Marcotte_LFC_matrix.csv") |
|
|
490 |
dim(marcotte) |
|
|
491 |
colnames(marcotte)[-1] <- gsub("_.*", "", colnames(marcotte)[-1]) |
|
|
492 |
|
|
|
493 |
molten_marcotte <- melt(data = marcotte, id.vars = "V1", |
|
|
494 |
measure.vars = colnames(marcotte)[-1], |
|
|
495 |
variable.name = "cell_line", value.name = "lfc") |
|
|
496 |
colnames(molten_marcotte)[1] <- "shRNA" |
|
|
497 |
unique(molten_marcotte[, c("shRNA", "cell_line")]) |
|
|
498 |
fwrite(molten_marcotte, "Data/RNAi/Train_Data/marcotte_shrna_cell_lfc.txt") |
|
|
499 |
|
|
|
500 |
|
|
|
501 |
# ==== Batch and Match Training Data ==== |
|
|
502 |
ccle_exp <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt") |
|
|
503 |
ccle_long_exp <- fread("Data/RNAi/Train_Data/ccle_long_exp.txt") |
|
|
504 |
typeof(ccle_exp$`TSPAN6 (ENSG00000000003)`[1]) |
|
|
505 |
|
|
|
506 |
min(ccle_exp) |
|
|
507 |
max(ccle_exp) |
|
|
508 |
|
|
|
509 |
ccle_mut <- fread("Data/RNAi/Train_Data/simple_ccle_mut_data.txt") |
|
|
510 |
|
|
|
511 |
achilles <- fread("Data/RNAi/Train_Data/achilles_shrna_cell_lfc.txt") |
|
|
512 |
achilles[, sh_length := nchar(shRNA), by = c("cell_line", "lfc")] |
|
|
513 |
unique(achilles$sh_length) |
|
|
514 |
|
|
|
515 |
achilles_cells <- unique(achilles$cell_line) |
|
|
516 |
ccle_cells <- unique(ccle_long_exp$cell_line) |
|
|
517 |
sum(achilles_cells %in% ccle_cells) |
|
|
518 |
achilles_cells[!(achilles_cells %in% ccle_cells)] |
|
|
519 |
rm(list=c("ccle_long_exp", "achilles")) |
|
|
520 |
drive <- fread("Data/RNAi/Train_Data/drive_shrna_cell_lfc.txt") |
|
|
521 |
drive_cells <- unique(drive$cell_line) |
|
|
522 |
sum(drive_cells %in% ccle_cells) |
|
|
523 |
drive_cells[!(drive_cells %in% ccle_cells)] |
|
|
524 |
rm(drive) |
|
|
525 |
|
|
|
526 |
marcotte <- fread("Data/RNAi/Train_Data/marcotte_shrna_cell_lfc.txt") |
|
|
527 |
marcotte_cells <- unique(marcotte$cell_line) |
|
|
528 |
sum(marcotte_cells %in% ccle_cells) |
|
|
529 |
marcotte_cells[!(marcotte_cells %in% ccle_cells)] |
|
|
530 |
|
|
|
531 |
shared_cells <- unique(c(achilles_cells, drive_cells, marcotte_cells)) |
|
|
532 |
|
|
|
533 |
# Subset the expression data based on these cells |
|
|
534 |
ccle_exp <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt") |
|
|
535 |
achilles_exp <- ccle_exp[cell_line %in% achilles_cells] |
|
|
536 |
dim(achilles_exp) |
|
|
537 |
fwrite(achilles_exp, "Data/RNAi/Train_Data/achilles_exp.txt") |
|
|
538 |
rm(achilles_exp) |
|
|
539 |
|
|
|
540 |
drive_exp <- ccle_exp[cell_line %in% drive_cells] |
|
|
541 |
dim(drive_exp) |
|
|
542 |
fwrite(drive_exp, "Data/RNAi/Train_Data/drive_exp.txt") |
|
|
543 |
rm(drive_exp) |
|
|
544 |
|
|
|
545 |
marcotte_exp <- ccle_exp[cell_line %in% marcotte_cells] |
|
|
546 |
dim(marcotte_exp) |
|
|
547 |
fwrite(marcotte_exp, "Data/RNAi/Train_Data/marcotte_exp.txt") |
|
|
548 |
|
|
|
549 |
|
|
|
550 |
|
|
|
551 |
|
|
|
552 |
|
|
|
553 |
|
|
|
554 |
# Batch 1 million training cases |
|
|
555 |
for (sub in seq(1, nrow(achilles) - 1e6, by = 1e6)) { |
|
|
556 |
print(sub) |
|
|
557 |
cur_sub <- achilles[sub:(sub + 1e6),] |
|
|
558 |
# Match cell line exp |
|
|
559 |
exp_sub <- ccle_exp[fmatch(cur_sub$cell_line, ccle_exp$cell_line),] |
|
|
560 |
} |
|
|
561 |
achilles |
|
|
562 |
|
|
|
563 |
|
|
|
564 |
# ==== Merge all shRNA sequences for training, subset to match sequencing data ==== |
|
|
565 |
library(data.table) |
|
|
566 |
# Consider sequencing data the reference with which everything is matched |
|
|
567 |
merge_seq <- fread("Data/RNAi/Train_Data/transcripts_alt_and_ref.txt") |
|
|
568 |
# Subset only cell lines available in the CCLE sequencing data |
|
|
569 |
ccle_lines <- unique(merge_seq$Tumor_Sample_Barcode) |
|
|
570 |
ccle_data <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt") |
|
|
571 |
# length(ccle_rna_with_names$cell_line) |
|
|
572 |
ccle_data <- ccle_data[cell_line %in% ccle_lines] |
|
|
573 |
# Update CCLE lines with those that have both sequencing and expression data |
|
|
574 |
ccle_lines <- ccle_data$cell_line |
|
|
575 |
|
|
|
576 |
achilles <- fread("Data/RNAi/Train_Data/achilles_shrna_cell_lfc.txt") |
|
|
577 |
achilles <- achilles[cell_line %in% ccle_lines] |
|
|
578 |
length(unique(achilles$cell_line)) |
|
|
579 |
drive <- fread("Data/RNAi/Train_Data/drive_shrna_cell_lfc.txt") |
|
|
580 |
drive <- drive[cell_line %in% ccle_lines] |
|
|
581 |
length(unique(drive$cell_line)) |
|
|
582 |
marcotte <- fread("Data/RNAi/Train_Data/marcotte_shrna_cell_lfc.txt") |
|
|
583 |
marcotte <- marcotte[cell_line %in% ccle_lines] |
|
|
584 |
length(unique(marcotte$cell_line)) |
|
|
585 |
|
|
|
586 |
# Extract unique sequences for autoencoder training |
|
|
587 |
all_shrna_seqs <- rbindlist(list(achilles[,'shRNA'], drive[,'shRNA'], marcotte[,'shRNA'])) |
|
|
588 |
all_shrna_seqs <- unique(all_shrna_seqs) |
|
|
589 |
|
|
|
590 |
# Save |
|
|
591 |
fwrite(all_shrna_seqs, "Data/RNAi/Train_Data/all_shrna_seqs.txt") |
|
|
592 |
|
|
|
593 |
|
|
|
594 |
# Extract all info for matching with sequencing and expression data |
|
|
595 |
ccle_shrna_seqs <- rbindlist(list(achilles, drive, marcotte)) |
|
|
596 |
ccle_shrna_seqs <- unique(ccle_shrna_seqs) |
|
|
597 |
|
|
|
598 |
# Save CCLE ShRNA |
|
|
599 |
fwrite(ccle_shrna_seqs, "Data/RNAi/Train_Data/ccle_shrna_seqs.txt") |
|
|
600 |
fread() |
|
|
601 |
|
|
|
602 |
# Separate each cell line shRNA data |
|
|
603 |
dir.create("Data/RNAi/Train_Data/shRNA_by_line") |
|
|
604 |
all_lines <- unique(ccle_shrna_seqs$cell_line) |
|
|
605 |
for (line in all_lines) { |
|
|
606 |
cur_sub <- ccle_shrna_seqs[cell_line == line] |
|
|
607 |
fwrite(cur_sub, paste0("Data/RNAi/Train_Data/shRNA_by_line/", line, ".txt")) |
|
|
608 |
} |
|
|
609 |
|
|
|
610 |
# Save CCLE Expression Data |
|
|
611 |
fwrite(ccle_data, "Data/RNAi/Train_Data/ccle_sub_exp.txt") |
|
|
612 |
|
|
|
613 |
rm(list = c("achilles", "drive", "marcotte")) |
|
|
614 |
# Get an index in the 'all_transcript_seqs' files for every cell line for easier subsetting |
|
|
615 |
# All cell lines should have the same number of transcripts; those that don't have a mutated |
|
|
616 |
# transcript must be assigned the reference sequence |
|
|
617 |
merge_sub <- merge_seq[Variant_Type == "SNP"] |
|
|
618 |
|
|
|
619 |
length(merge_sub[Tumor_Sample_Barcode == 'OELE']$alt_cdna) |
|
|
620 |
length(merge_sub[Tumor_Sample_Barcode == 'LN18']$alt_cdna) |
|
|
621 |
colnames(merge_sub) |
|
|
622 |
head(merge_sub$Tumor_Sample_Barcode) |
|
|
623 |
colnames(ccle_data) |
|
|
624 |
all_transcripts <- fread("Data/RNAi/Train_Data/all_transcript_seqs.txt") |
|
|
625 |
|
|
|
626 |
all_reference <- unique(merge_sub[, c("Annotation_Transcript", "ref_cdna")]) |
|
|
627 |
nrow(all_reference) |
|
|
628 |
merge_sub <- merge_sub[ref_len < 20326] |
|
|
629 |
library(fastmatch) |
|
|
630 |
# Find indices of reference/normal transcripts in 'all_transcripts' |
|
|
631 |
ref_idx <- fmatch(x = all_reference$ref_cdna, table = all_transcripts$transcript) |
|
|
632 |
ref_idx <- data.table(ref_id = all_reference$Annotation_Transcript, ref_idx = ref_idx) |
|
|
633 |
|
|
|
634 |
dir.create("Data/RNAi/Train_Data/Cell_Line_Indices") |
|
|
635 |
for (line in ccle_lines) { |
|
|
636 |
# Find the indices of the altered sequences in 'all_transcripts' |
|
|
637 |
cur_idx <- fmatch(x = merge_sub[Tumor_Sample_Barcode == line]$alt_cdna, |
|
|
638 |
table = all_transcripts$transcript) |
|
|
639 |
cur_idx <- data.table(alt_id = merge_sub[Tumor_Sample_Barcode == line]$Annotation_Transcript, |
|
|
640 |
alt_idx = cur_idx) |
|
|
641 |
ref_copy <- ref_idx |
|
|
642 |
# Merge and replace transcript ids in the reference; this now represents the indices of |
|
|
643 |
# transcripts in 'all_transcripts' that are relevant to this cell line |
|
|
644 |
matched_idx <- fmatch(cur_idx$alt_id, ref_idx$ref_id) |
|
|
645 |
ref_copy[matched_idx,] <- cur_idx |
|
|
646 |
|
|
|
647 |
fwrite(ref_copy, paste0("Data/RNAi/Train_Data/Cell_Line_Indices/", line, ".txt")) |
|
|
648 |
} |
|
|
649 |
# fread("Data/RNAi/Train_Data/Cell_Line_Indices/22RV1.txt") |
|
|
650 |
|
|
|
651 |
line_name_indices <- fread("Data/RNAi/Train_Data/ccle_exp_data.txt") |
|
|
652 |
colnames(line_name_indices)[1] |
|
|
653 |
line_name_indices <- data.table(seq_along(line_name_indices$cell_line), |
|
|
654 |
line_name_indices$cell_line) |
|
|
655 |
colnames(line_name_indices)[1] <- "index" |
|
|
656 |
fwrite(line_name_indices, "Data/RNAi/Train_Data/ccle_name_index.txt") |
|
|
657 |
|
|
|
658 |
colnames(merge_seq) |
|
|
659 |
fmatch(unique(merge_seq$Annotation_Transcript), merge_seq$Annotation_Transcript) |
|
|
660 |
sum(merge_seq[fmatch(unique(merge_seq$Annotation_Transcript), merge_seq$Annotation_Transcript)]$ref_len) / 8 |
|
|
661 |
|
|
|
662 |
|
|
|
663 |
# ==== shRNA mapping ==== |
|
|
664 |
library(stringr) |
|
|
665 |
sh_map <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/RNAi/Train_Data/shRNAmapping.csv") |
|
|
666 |
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene Symbol` <- "NO_CURRENT" |
|
|
667 |
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene ID` <- "NO_CURRENT" |
|
|
668 |
sh_map[`Gene Symbol` %like% "-"]$`Gene Symbol`[781] |
|
|
669 |
sh_map[`Gene Symbol` %like% "-"]$`Gene ID`[781] |
|
|
670 |
# colnames(sh_map) <- c("shRNA", "HGNC", "ENTREZ") |
|
|
671 |
sh_map <- sh_map[, c(1,3)] |
|
|
672 |
|
|
|
673 |
# sh_split <- str_split(sh_map$`Gene Symbol`, "-", simplify = T) |
|
|
674 |
# head(sh_split) |
|
|
675 |
# sh_map$`Gene ID` |
|
|
676 |
# |
|
|
677 |
# temp <- sh_map[, strsplit(as.character(`Gene Symbol`), ",", fixed=TRUE), |
|
|
678 |
# by = .(`Barcode Sequence`, `Gene Symbol`)][, `Gene Symbol` := NULL][ |
|
|
679 |
# , setnames(.SD, "Barcode Sequence", "Gene Symbol")] |
|
|
680 |
|
|
|
681 |
|
|
|
682 |
# length(unique(sh_map$`Gene Symbol`)) |
|
|
683 |
# length(unique(sh_map$`Barcode Sequence`)) |
|
|
684 |
# anyDuplicated(sh_map$`Barcode Sequence`) |
|
|
685 |
# which(duplicated(sh_map$`Barcode Sequence`)) |
|
|
686 |
# sh_map[c(28,29),] |
|
|
687 |
|
|
|
688 |
colnames(sh_map) <- c("shRNA", "ENTREZ") |
|
|
689 |
# sh_long <- melt(data = sh_map, id.vars = c("shRNA", "Gene")) |
|
|
690 |
# sh_long <- dcast.data.table(sh_map, formula = shRNA ~ Gene, fill = 0, |
|
|
691 |
# fun.aggregate = function(x) {1L}) |
|
|
692 |
# dim(sh_long) |
|
|
693 |
# (sh_long[1:5, 2:5]) |
|
|
694 |
# |
|
|
695 |
# sum(sh_long[1,2:5000]) |
|
|
696 |
|
|
|
697 |
# Create a one-hot vector |
|
|
698 |
# library(caret) |
|
|
699 |
# Pair shRNA sequences with one-hot encoded gene target vector |
|
|
700 |
# Each shRNA sequence will have a ~22000 length vector indicating its target |
|
|
701 |
ccle_shrna_seqs <- fread("Data/RNAi/Train_Data/ccle_shrna_seqs.txt") |
|
|
702 |
setkey(ccle_shrna_seqs, shRNA) |
|
|
703 |
ccle_shrna_seqs$INDEX <- 1:nrow(ccle_shrna_seqs) |
|
|
704 |
setkey(sh_map, shRNA) |
|
|
705 |
|
|
|
706 |
length(unique(ccle_shrna_seqs$shRNA)) |
|
|
707 |
sh_map <- sh_map[shRNA %in% unique(ccle_shrna_seqs$shRNA)] |
|
|
708 |
temp <- merge(ccle_shrna_seqs[, c(1,4)], sh_map, by = "shRNA", allow.cartesian = TRUE) |
|
|
709 |
temp <- unique(temp) |
|
|
710 |
|
|
|
711 |
# anyDuplicated(temp$INDEX) |
|
|
712 |
|
|
|
713 |
# install.packages("onehot") |
|
|
714 |
library(onehot) |
|
|
715 |
|
|
|
716 |
# cur_sub$ENTREZ <- as.factor(cur_sub$ENTREZ) |
|
|
717 |
# cur_sub$shRNA <- as.factor(cur_sub$shRNA) |
|
|
718 |
# class(cur_sub$ENTREZ) |
|
|
719 |
sh_map$ENTREZ <- as.factor(sh_map$ENTREZ) |
|
|
720 |
sh_map$shRNA <- as.factor(sh_map$shRNA) |
|
|
721 |
class(sh_map$ENTREZ) |
|
|
722 |
|
|
|
723 |
# Separate into files by indices |
|
|
724 |
# cur_dummy <- dummyVars(formula = '~ ENTREZ', data = sh_map, |
|
|
725 |
# fullRank = T, sep = "_", levelsOnly = F) |
|
|
726 |
|
|
|
727 |
onehot_encoder <- onehot::onehot(data = as.data.frame(sh_map), |
|
|
728 |
max_levels = length(unique(sh_map$ENTREZ))) |
|
|
729 |
options(scipen=999) |
|
|
730 |
dir.create("Data/RNAi/Train_Data/shRNA_by_index") |
|
|
731 |
|
|
|
732 |
# all_results <- onehot_results |
|
|
733 |
for (idx in seq(1, nrow(temp), by = 100000)[-(1:230)]) { |
|
|
734 |
# cur_sub <- sh_map[cell_line == line] |
|
|
735 |
cur_sub <- temp[idx:(idx+100000-1),] |
|
|
736 |
onehot_results <- predict(onehot_encoder, cur_sub) |
|
|
737 |
# dim(onehot_results) |
|
|
738 |
rownames(onehot_results) <- cur_sub$shRNA |
|
|
739 |
onehot_results <- data.table(onehot_results, keep.rownames = T) |
|
|
740 |
colnames(onehot_results)[1] <- "shRNA" |
|
|
741 |
|
|
|
742 |
onehot_results <- onehot_results[, lapply(.SD, sum), by = shRNA, .SDcols = colnames(onehot_results)[-1]] |
|
|
743 |
|
|
|
744 |
fwrite(onehot_results, paste0("Data/RNAi/Train_Data/shRNA_by_index/", idx, "_", |
|
|
745 |
idx+100000-1, ".txt")) |
|
|
746 |
} |
|
|
747 |
library(data.table) |
|
|
748 |
all_shrna_files <- list.files("Data/RNAi/Train_Data/shRNA_by_index/", full.names = T) |
|
|
749 |
all_shrna_indices <- fread(all_shrna_files[1]) |
|
|
750 |
|
|
|
751 |
for (i in 2:length(all_shrna_files)) { |
|
|
752 |
cur_file <- fread(all_shrna_files[i]) |
|
|
753 |
all_shrna_indices <- rbindlist(list(all_shrna_indices, cur_file)) |
|
|
754 |
} |
|
|
755 |
all_shrna_indices <- unique(all_shrna_indices) |
|
|
756 |
|
|
|
757 |
fwrite(x = all_shrna_indices, "Data/RNAi/Train_Data/all_shrna_indices.txt") |
|
|
758 |
|
|
|
759 |
# dool <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index/1_100000.txt") |
|
|
760 |
# dim(dool) |
|
|
761 |
dir.create("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index") |
|
|
762 |
library(stringr) |
|
|
763 |
sh_map <- fread("/Users/ftaj/OneDrive - University of Toronto/Drug_Response/Data/RNAi/shRNAmapping.csv") |
|
|
764 |
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene Symbol` <- "NO_CURRENT" |
|
|
765 |
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene ID` <- "NO_CURRENT" |
|
|
766 |
sh_map[`Gene Symbol` %like% "-"]$`Gene Symbol`[781] |
|
|
767 |
sh_map[`Gene Symbol` %like% "-"]$`Gene ID`[781] |
|
|
768 |
colnames(sh_map) <- c("shRNA", "HGNC", "ENTREZ") |
|
|
769 |
|
|
|
770 |
dummification <- function(dummifier, shrna_data, idx) { |
|
|
771 |
cur_sub <- temp[idx:(idx+100000-1),] |
|
|
772 |
cur_onehot <- data.frame(predict(cur_dummy, cur_sub)) |
|
|
773 |
fwrite(cur_onehot, paste0("Data/RNAi/Train_Data/shRNA_by_index/", idx, "_", |
|
|
774 |
idx+100000-1, ".txt")) |
|
|
775 |
|
|
|
776 |
} |
|
|
777 |
|
|
|
778 |
mclapply(X = seq(1, nrow(temp), by = 100000), FUN = dummification, |
|
|
779 |
mc.cores = 6, mc.cleanup = T) |
|
|
780 |
|
|
|
781 |
|
|
|
782 |
|
|
|
783 |
ccle_shrna_seqs <- ccle_shrna_seqs[, 1:2] |
|
|
784 |
ccle_shrna_seqs <- unique(ccle_shrna_seqs) |
|
|
785 |
all_merger <- merge(ccle_shrna_seqs, sh_map, by = "shRNA") |
|
|
786 |
|
|
|
787 |
length(unique(ccle_shrna_seqs$shRNA)) |
|
|
788 |
# Add cell line to sh map |
|
|
789 |
temp <- unique(ccle_shrna_seqs[, 1:2]) |
|
|
790 |
temp <- unique(temp) |
|
|
791 |
sh_map <- unique(sh_map) |
|
|
792 |
# Put all genes targeted by an shRNA in the same column |
|
|
793 |
sh_map[, all_genes := paste0(Gene, collapse = ","), by = shRNA] |
|
|
794 |
which(sh_map$Gene != sh_map$all_genes) |
|
|
795 |
sh_map[27:28,] |
|
|
796 |
sh_map[, Gene := NULL] |
|
|
797 |
sh_map <- unique(sh_map) |
|
|
798 |
|
|
|
799 |
sh_map$shRNA <- factor(sh_map$shRNA) |
|
|
800 |
sh_map$Gene <- factor(sh_map$Gene) |
|
|
801 |
|
|
|
802 |
sh_line_gene <- merge(temp, sh_map, by = "shRNA") |
|
|
803 |
sh_line_gene$all_genes <- factor(sh_line_gene$all_genes) |
|
|
804 |
sh_line_gene$shRNA <- factor(sh_line_gene$shRNA) |
|
|
805 |
|
|
|
806 |
temp <- sh_line_gene[, strsplit(as.character(all_genes), ",", fixed=TRUE), |
|
|
807 |
by = .(shRNA, cell_line, all_genes)][, all_genes := NULL][ |
|
|
808 |
, setnames(.SD, "shRNA", "cell_line", "Genes")] |
|
|
809 |
|
|
|
810 |
|
|
|
811 |
merge_sub <- fread("Data/RNAi/Train_Data/transcripts_alt_and_ref.txt") |
|
|
812 |
merge_sub <- merge_sub[Variant_Type == "SNP"] |
|
|
813 |
colnames(merge_sub) |
|
|
814 |
merge_sub <- merge_sub[ref_len < 20326] |
|
|
815 |
|
|
|
816 |
# Separate each cell line shRNA data |
|
|
817 |
line <- "TEN" |
|
|
818 |
dir.create("Data/RNAi/Train_Data/shRNA_by_line_and_target") |
|
|
819 |
all_lines <- unique(ccle_shrna_seqs$cell_line) |
|
|
820 |
for (line in all_lines) { |
|
|
821 |
# cur_sub <- sh_map[cell_line == line] |
|
|
822 |
cur_sub <- sh_map |
|
|
823 |
sh_map$ENTREZ <- factor(sh_map$ENTREZ) |
|
|
824 |
sh_map$shRNA <- factor(sh_map$shRNA) |
|
|
825 |
sh_map[, HGNC := NULL] |
|
|
826 |
levels(sh_map$ENTREZ) |
|
|
827 |
cur_dummy <- dummyVars(formula = '~ ENTREZ', data = sh_map, |
|
|
828 |
fullRank = T, sep = "_", levelsOnly = F) |
|
|
829 |
temp <- data.frame(predict(cur_dummy, sh_map[1:100,])) |
|
|
830 |
dim(temp) |
|
|
831 |
temp[1:5, 10:15] |
|
|
832 |
fwrite(cur_sub, paste0("Data/RNAi/Train_Data/shRNA_by_line/", line, ".txt")) |
|
|
833 |
} |
|
|
834 |
|
|
|
835 |
when <- data.frame(time = c("afternoon", "night", "afternoon", |
|
|
836 |
"morning", "morning", "morning", |
|
|
837 |
"morning", "afternoon", "afternoon"), |
|
|
838 |
day = c("Mon", "Mon", "Mon", |
|
|
839 |
"Wed", "Wed", "Fri", |
|
|
840 |
"Sat", "Sat", "Fri")) |
|
|
841 |
|
|
|
842 |
levels(when$time) <- list(morning="morning", |
|
|
843 |
afternoon="afternoon", |
|
|
844 |
night="night") |
|
|
845 |
levels(when$day) <- list(Mon="Mon", Tue="Tue", Wed="Wed", Thu="Thu", |
|
|
846 |
Fri="Fri", Sat="Sat", Sun="Sun") |
|
|
847 |
model.matrix(~day, when) |
|
|
848 |
|
|
|
849 |
interactionModel <- dummyVars(~ day + time + day:time, |
|
|
850 |
data = when, |
|
|
851 |
sep = ".") |
|
|
852 |
predict(interactionModel, when[1:3,]) |
|
|
853 |
noNames <- dummyVars(~ day + time + day:time, |
|
|
854 |
data = when, |
|
|
855 |
levelsOnly = TRUE) |
|
|
856 |
|
|
|
857 |
|
|
|
858 |
|
|
|
859 |
|
|
|
860 |
sh_onehot <- dummyVars(formula = " shRNA~ . ", data = sh_map) |
|
|
861 |
temp <- data.table(predict(sh_onehot, sh_map[1:10000, ]), keep.rownames = T) |
|
|
862 |
dim(temp) |
|
|
863 |
temp[1:5, 1:5] |
|
|
864 |
rownames(temp)[1:5] |
|
|
865 |
|
|
|
866 |
View(sh_map) |
|
|
867 |
|
|
|
868 |
|