Diff of /R/shrna_by_index_script.R [000000] .. [c3b4f8]

Switch to unified view

a b/R/shrna_by_index_script.R
1
library(data.table)
2
library(parallel)
3
# library(fastmatch)
4
# library(cmapR)
5
# if (!require(biomaRt)) {
6
#   BiocManager::install("biomaRt", version = "3.8")
7
#   library(biomaRt)
8
# }
9
# library(HGNChelper)
10
# if (!require(Biostrings)) {
11
#   BiocManager::install("Biostrings", version = "3.8")
12
#   library(Biostrings)
13
# }
14
if (!require(onehot)) {
15
  install.packages("onehot")
16
  library(onehot)
17
}
18
library(stringr)
19
print(paste0("Total number of cores: ", detectCores()))
20
21
sh_map <- fread("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNAmapping.csv")
22
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene Symbol` <- "NO_CURRENT"
23
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene ID` <- "NO_CURRENT"
24
sh_map[`Gene Symbol` %like% "-"]$`Gene Symbol`[781]
25
sh_map[`Gene Symbol` %like% "-"]$`Gene ID`[781]
26
# colnames(sh_map) <- c("shRNA", "HGNC", "ENTREZ")
27
sh_map <- sh_map[, c(1,3)]
28
29
# sh_split <- str_split(sh_map$`Gene Symbol`, "-", simplify = T)
30
# head(sh_split)
31
# sh_map$`Gene ID`
32
# 
33
# temp <- sh_map[, strsplit(as.character(`Gene Symbol`), ",", fixed=TRUE),
34
#                      by = .(`Barcode Sequence`, `Gene Symbol`)][, `Gene Symbol` := NULL][
35
#                        , setnames(.SD, "Barcode Sequence", "Gene Symbol")]
36
37
38
# length(unique(sh_map$`Gene Symbol`))
39
# length(unique(sh_map$`Barcode Sequence`))
40
# anyDuplicated(sh_map$`Barcode Sequence`)
41
# which(duplicated(sh_map$`Barcode Sequence`))
42
# sh_map[c(28,29),]
43
44
colnames(sh_map) <- c("shRNA", "ENTREZ")
45
# sh_long <- melt(data = sh_map, id.vars = c("shRNA", "Gene"))
46
# sh_long <- dcast.data.table(sh_map, formula = shRNA ~ Gene, fill = 0,
47
#                             fun.aggregate = function(x) {1L})
48
# dim(sh_long)
49
# (sh_long[1:5, 2:5])
50
# 
51
# sum(sh_long[1,2:5000])
52
53
# Create a one-hot vector
54
# library(caret)
55
# Pair shRNA sequences with one-hot encoded gene target vector
56
# Each shRNA sequence will have a ~22000 length vector indicating its target
57
ccle_shrna_seqs <- fread("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/ccle_shrna_seqs.txt")
58
setkey(ccle_shrna_seqs, shRNA)
59
ccle_shrna_seqs$INDEX <- 1:nrow(ccle_shrna_seqs)
60
setkey(sh_map, shRNA)
61
62
length(unique(ccle_shrna_seqs$shRNA))
63
sh_map <- sh_map[shRNA %in% unique(ccle_shrna_seqs$shRNA)]
64
temp <- merge(ccle_shrna_seqs[, c(1,4)], sh_map, by = "shRNA", allow.cartesian = TRUE)
65
temp <- unique(temp)
66
67
# anyDuplicated(temp$INDEX)
68
69
# install.packages("onehot")
70
library(onehot)
71
72
# cur_sub$ENTREZ <- as.factor(cur_sub$ENTREZ)
73
# cur_sub$shRNA <- as.factor(cur_sub$shRNA)
74
# class(cur_sub$ENTREZ)
75
sh_map$ENTREZ <- as.factor(sh_map$ENTREZ)
76
sh_map$shRNA <- as.factor(sh_map$shRNA)
77
class(sh_map$ENTREZ)
78
79
# Separate into files by indices
80
# cur_dummy <- dummyVars(formula = '~ ENTREZ', data = sh_map,
81
#                        fullRank = T, sep = "_", levelsOnly = F)
82
83
onehot_encoder <- onehot::onehot(data = as.data.frame(sh_map),
84
                                 max_levels = length(unique(sh_map$ENTREZ)))
85
options(scipen=999)
86
dir.create("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index")
87
88
onehot_encoder <- onehot::onehot(data = as.data.frame(sh_map),
89
                                 max_levels = length(unique(sh_map$ENTREZ)))
90
91
dummification <- function(idx, encoder, shrna_data) {
92
  # for (idx in seq(1, nrow(temp), by = 100000)) {
93
    # cur_sub <- sh_map[cell_line == line]
94
  cur_sub <- shrna_data[idx:(idx+100000-1),]
95
  print(paste0("Encoding ", as.character(idx)))
96
  onehot_results <- predict(encoder, cur_sub)
97
  dim(onehot_results)
98
  head(onehot_results)
99
  rownames(onehot_results) <- cur_sub$shRNA
100
  onehot_results <- data.table(onehot_results, keep.rownames = T)
101
  colnames(onehot_results)[1] <- "shRNA"
102
  
103
  onehot_results <- onehot_results[, lapply(.SD, sum), by = shRNA, .SDcols = colnames(onehot_results)[-1]]
104
  fwrite(onehot_results, paste0("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index/", idx, "_",
105
                                idx+100000-1, ".txt"))
106
  # }  
107
}
108
# [-(1:230)]
109
mc_results <- mclapply(X = seq(1, nrow(temp), by = 100000)[1:4], FUN = dummification, encoder = onehot_encoder,
110
         shrna_data = temp, mc.cores = 2,
111
         mc.cleanup = T, mc.preschedule = F)
112
print(mc_results)