|
a |
|
b/R/shrna_by_index_script.R |
|
|
1 |
library(data.table) |
|
|
2 |
library(parallel) |
|
|
3 |
# library(fastmatch) |
|
|
4 |
# library(cmapR) |
|
|
5 |
# if (!require(biomaRt)) { |
|
|
6 |
# BiocManager::install("biomaRt", version = "3.8") |
|
|
7 |
# library(biomaRt) |
|
|
8 |
# } |
|
|
9 |
# library(HGNChelper) |
|
|
10 |
# if (!require(Biostrings)) { |
|
|
11 |
# BiocManager::install("Biostrings", version = "3.8") |
|
|
12 |
# library(Biostrings) |
|
|
13 |
# } |
|
|
14 |
if (!require(onehot)) { |
|
|
15 |
install.packages("onehot") |
|
|
16 |
library(onehot) |
|
|
17 |
} |
|
|
18 |
library(stringr) |
|
|
19 |
print(paste0("Total number of cores: ", detectCores())) |
|
|
20 |
|
|
|
21 |
sh_map <- fread("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNAmapping.csv") |
|
|
22 |
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene Symbol` <- "NO_CURRENT" |
|
|
23 |
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene ID` <- "NO_CURRENT" |
|
|
24 |
sh_map[`Gene Symbol` %like% "-"]$`Gene Symbol`[781] |
|
|
25 |
sh_map[`Gene Symbol` %like% "-"]$`Gene ID`[781] |
|
|
26 |
# colnames(sh_map) <- c("shRNA", "HGNC", "ENTREZ") |
|
|
27 |
sh_map <- sh_map[, c(1,3)] |
|
|
28 |
|
|
|
29 |
# sh_split <- str_split(sh_map$`Gene Symbol`, "-", simplify = T) |
|
|
30 |
# head(sh_split) |
|
|
31 |
# sh_map$`Gene ID` |
|
|
32 |
# |
|
|
33 |
# temp <- sh_map[, strsplit(as.character(`Gene Symbol`), ",", fixed=TRUE), |
|
|
34 |
# by = .(`Barcode Sequence`, `Gene Symbol`)][, `Gene Symbol` := NULL][ |
|
|
35 |
# , setnames(.SD, "Barcode Sequence", "Gene Symbol")] |
|
|
36 |
|
|
|
37 |
|
|
|
38 |
# length(unique(sh_map$`Gene Symbol`)) |
|
|
39 |
# length(unique(sh_map$`Barcode Sequence`)) |
|
|
40 |
# anyDuplicated(sh_map$`Barcode Sequence`) |
|
|
41 |
# which(duplicated(sh_map$`Barcode Sequence`)) |
|
|
42 |
# sh_map[c(28,29),] |
|
|
43 |
|
|
|
44 |
colnames(sh_map) <- c("shRNA", "ENTREZ") |
|
|
45 |
# sh_long <- melt(data = sh_map, id.vars = c("shRNA", "Gene")) |
|
|
46 |
# sh_long <- dcast.data.table(sh_map, formula = shRNA ~ Gene, fill = 0, |
|
|
47 |
# fun.aggregate = function(x) {1L}) |
|
|
48 |
# dim(sh_long) |
|
|
49 |
# (sh_long[1:5, 2:5]) |
|
|
50 |
# |
|
|
51 |
# sum(sh_long[1,2:5000]) |
|
|
52 |
|
|
|
53 |
# Create a one-hot vector |
|
|
54 |
# library(caret) |
|
|
55 |
# Pair shRNA sequences with one-hot encoded gene target vector |
|
|
56 |
# Each shRNA sequence will have a ~22000 length vector indicating its target |
|
|
57 |
ccle_shrna_seqs <- fread("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/ccle_shrna_seqs.txt") |
|
|
58 |
setkey(ccle_shrna_seqs, shRNA) |
|
|
59 |
ccle_shrna_seqs$INDEX <- 1:nrow(ccle_shrna_seqs) |
|
|
60 |
setkey(sh_map, shRNA) |
|
|
61 |
|
|
|
62 |
length(unique(ccle_shrna_seqs$shRNA)) |
|
|
63 |
sh_map <- sh_map[shRNA %in% unique(ccle_shrna_seqs$shRNA)] |
|
|
64 |
temp <- merge(ccle_shrna_seqs[, c(1,4)], sh_map, by = "shRNA", allow.cartesian = TRUE) |
|
|
65 |
temp <- unique(temp) |
|
|
66 |
|
|
|
67 |
# anyDuplicated(temp$INDEX) |
|
|
68 |
|
|
|
69 |
# install.packages("onehot") |
|
|
70 |
library(onehot) |
|
|
71 |
|
|
|
72 |
# cur_sub$ENTREZ <- as.factor(cur_sub$ENTREZ) |
|
|
73 |
# cur_sub$shRNA <- as.factor(cur_sub$shRNA) |
|
|
74 |
# class(cur_sub$ENTREZ) |
|
|
75 |
sh_map$ENTREZ <- as.factor(sh_map$ENTREZ) |
|
|
76 |
sh_map$shRNA <- as.factor(sh_map$shRNA) |
|
|
77 |
class(sh_map$ENTREZ) |
|
|
78 |
|
|
|
79 |
# Separate into files by indices |
|
|
80 |
# cur_dummy <- dummyVars(formula = '~ ENTREZ', data = sh_map, |
|
|
81 |
# fullRank = T, sep = "_", levelsOnly = F) |
|
|
82 |
|
|
|
83 |
onehot_encoder <- onehot::onehot(data = as.data.frame(sh_map), |
|
|
84 |
max_levels = length(unique(sh_map$ENTREZ))) |
|
|
85 |
options(scipen=999) |
|
|
86 |
dir.create("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index") |
|
|
87 |
|
|
|
88 |
onehot_encoder <- onehot::onehot(data = as.data.frame(sh_map), |
|
|
89 |
max_levels = length(unique(sh_map$ENTREZ))) |
|
|
90 |
|
|
|
91 |
dummification <- function(idx, encoder, shrna_data) { |
|
|
92 |
# for (idx in seq(1, nrow(temp), by = 100000)) { |
|
|
93 |
# cur_sub <- sh_map[cell_line == line] |
|
|
94 |
cur_sub <- shrna_data[idx:(idx+100000-1),] |
|
|
95 |
print(paste0("Encoding ", as.character(idx))) |
|
|
96 |
onehot_results <- predict(encoder, cur_sub) |
|
|
97 |
dim(onehot_results) |
|
|
98 |
head(onehot_results) |
|
|
99 |
rownames(onehot_results) <- cur_sub$shRNA |
|
|
100 |
onehot_results <- data.table(onehot_results, keep.rownames = T) |
|
|
101 |
colnames(onehot_results)[1] <- "shRNA" |
|
|
102 |
|
|
|
103 |
onehot_results <- onehot_results[, lapply(.SD, sum), by = shRNA, .SDcols = colnames(onehot_results)[-1]] |
|
|
104 |
fwrite(onehot_results, paste0("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index/", idx, "_", |
|
|
105 |
idx+100000-1, ".txt")) |
|
|
106 |
# } |
|
|
107 |
} |
|
|
108 |
# [-(1:230)] |
|
|
109 |
mc_results <- mclapply(X = seq(1, nrow(temp), by = 100000)[1:4], FUN = dummification, encoder = onehot_encoder, |
|
|
110 |
shrna_data = temp, mc.cores = 2, |
|
|
111 |
mc.cleanup = T, mc.preschedule = F) |
|
|
112 |
print(mc_results) |