[c3b4f8]: / R / shrna_by_index_script.R

Download this file

113 lines (97 with data), 4.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
library(data.table)
library(parallel)
# library(fastmatch)
# library(cmapR)
# if (!require(biomaRt)) {
# BiocManager::install("biomaRt", version = "3.8")
# library(biomaRt)
# }
# library(HGNChelper)
# if (!require(Biostrings)) {
# BiocManager::install("Biostrings", version = "3.8")
# library(Biostrings)
# }
if (!require(onehot)) {
install.packages("onehot")
library(onehot)
}
library(stringr)
# Report how many cores parallel::detectCores() sees on this machine.
print(paste0("Total number of cores: ", detectCores()))
# Read the shRNA mapping table; the code below refers to its `Gene Symbol`
# and `Gene ID` columns.
sh_map <- fread("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNAmapping.csv")
# For every row whose `Gene Symbol` contains "NO_CURRENT", overwrite the symbol
# and then the gene ID with the literal "NO_CURRENT". The second filter still
# matches after the first assignment because the replacement value itself
# contains "NO_CURRENT".
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene Symbol` <- "NO_CURRENT"
sh_map[`Gene Symbol` %like% "NO_CURRENT"]$`Gene ID` <- "NO_CURRENT"
# The next two expressions pick element 781 of the rows whose symbol contains
# "-"; the values are not assigned anywhere.
# NOTE(review): looks like leftover interactive inspection - confirm before
# removing.
sh_map[`Gene Symbol` %like% "-"]$`Gene Symbol`[781]
sh_map[`Gene Symbol` %like% "-"]$`Gene ID`[781]
# colnames(sh_map) <- c("shRNA", "HGNC", "ENTREZ")
# Keep columns 1 and 3 by position; they are renamed to shRNA / ENTREZ below.
# NOTE(review): presumably the barcode sequence and `Gene ID` columns - verify
# against the CSV header.
sh_map <- sh_map[, c(1,3)]
# sh_split <- str_split(sh_map$`Gene Symbol`, "-", simplify = T)
# head(sh_split)
# sh_map$`Gene ID`
#
# temp <- sh_map[, strsplit(as.character(`Gene Symbol`), ",", fixed=TRUE),
# by = .(`Barcode Sequence`, `Gene Symbol`)][, `Gene Symbol` := NULL][
# , setnames(.SD, "Barcode Sequence", "Gene Symbol")]
# length(unique(sh_map$`Gene Symbol`))
# length(unique(sh_map$`Barcode Sequence`))
# anyDuplicated(sh_map$`Barcode Sequence`)
# which(duplicated(sh_map$`Barcode Sequence`))
# sh_map[c(28,29),]
# Rename the two retained columns to the names used by the rest of the script.
colnames(sh_map) <- c("shRNA", "ENTREZ")
# sh_long <- melt(data = sh_map, id.vars = c("shRNA", "Gene"))
# sh_long <- dcast.data.table(sh_map, formula = shRNA ~ Gene, fill = 0,
# fun.aggregate = function(x) {1L})
# dim(sh_long)
# (sh_long[1:5, 2:5])
#
# sum(sh_long[1,2:5000])
# Create a one-hot vector
# library(caret)
# Pair shRNA sequences with one-hot encoded gene target vector
# Each shRNA sequence will have a ~22000 length vector indicating its target
# Load the shRNA sequence table and key it by shRNA (setkey() also sorts the
# rows by that column).
ccle_shrna_seqs <- fread("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/ccle_shrna_seqs.txt")
setkey(ccle_shrna_seqs, shRNA)
# Number the rows after keying, so INDEX follows the sorted order.
# seq_len(.N) is used instead of 1:nrow() so an empty table gets an empty
# column rather than the length-2 vector c(1, 0).
ccle_shrna_seqs[, INDEX := seq_len(.N)]
setkey(sh_map, shRNA)
# Number of distinct shRNA sequences (printed when run as a script).
length(unique(ccle_shrna_seqs$shRNA))
# Restrict the mapping to shRNAs present in the sequence table; %in% already
# ignores duplicates on its right-hand side, so no unique() is needed.
sh_map <- sh_map[shRNA %in% ccle_shrna_seqs$shRNA]
# Join columns 1 and 4 of the sequence table onto the mapping by shRNA.
# allow.cartesian = TRUE lets the join produce many-to-many matches.
# NOTE(review): column 4 is presumably the INDEX column added above - confirm
# the input file has exactly three columns.
temp <- merge(ccle_shrna_seqs[, c(1, 4)], sh_map, by = "shRNA", allow.cartesian = TRUE)
# Drop exact duplicate rows produced by the join.
temp <- unique(temp)
# anyDuplicated(temp$INDEX)
# install.packages("onehot")
library(onehot)
# cur_sub$ENTREZ <- as.factor(cur_sub$ENTREZ)
# cur_sub$shRNA <- as.factor(cur_sub$shRNA)
# class(cur_sub$ENTREZ)
# Convert both mapping columns to factors before fitting the encoder.
sh_map$ENTREZ <- as.factor(sh_map$ENTREZ)
sh_map$shRNA <- as.factor(sh_map$shRNA)
class(sh_map$ENTREZ)
# Separate into files by indices
# cur_dummy <- dummyVars(formula = '~ ENTREZ', data = sh_map,
# fullRank = T, sep = "_", levelsOnly = F)
# Penalise scientific notation so large numbers are rendered in full when
# they are pasted into strings.
options(scipen = 999)
# Create the output directory. showWarnings = FALSE keeps re-runs quiet when
# it already exists; recursive = TRUE also creates missing parent directories.
dir.create("/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index",
           showWarnings = FALSE, recursive = TRUE)
# Fit the one-hot encoder once. max_levels is raised to the number of distinct
# ENTREZ values so that column is not rejected for having too many levels.
# (The script previously ran this identical fit twice.)
onehot_encoder <- onehot::onehot(data = as.data.frame(sh_map),
                                 max_levels = length(unique(sh_map$ENTREZ)))
#' One-hot encode one chunk of rows and write the result to disk.
#'
#' Takes rows `idx` to `idx + chunk_size - 1` of `shrna_data` (clamped to the
#' last available row), encodes them with `predict(encoder, ...)`, sums the
#' encoded columns per shRNA and writes the table with `fwrite()` to
#' `<out_dir>/<idx>_<idx + chunk_size - 1>.txt`.
#'
#' @param idx 1-based row of `shrna_data` at which the chunk starts.
#' @param encoder Fitted encoder object handed to `predict()`.
#' @param shrna_data data.table with a `shRNA` column plus whatever columns
#'   `encoder` needs.
#' @param chunk_size Number of rows per chunk (default 100000, the value that
#'   used to be hard-coded).
#' @param out_dir Directory that receives the chunk file (defaults to the
#'   previously hard-coded location).
#' @return The value of `fwrite()`; the function is called for its side effect.
dummification <- function(idx, encoder, shrna_data, chunk_size = 100000,
                          out_dir = "/u/ftaj/anaconda3/envs/Drug_Response/Data/RNAi/Train_Data/shRNA_by_index") {
  n_rows <- nrow(shrna_data)
  stopifnot(
    length(idx) == 1L, !is.na(idx), idx >= 1, idx <= n_rows,
    length(chunk_size) == 1L, !is.na(chunk_size), chunk_size >= 1
  )

  # The nominal end is kept for the file name so every chunk file follows the
  # same naming scheme; the rows actually read stop at the end of the table.
  # Indexing a data.table past its last row would yield all-NA rows.
  nominal_end <- idx + chunk_size - 1
  last_row <- min(nominal_end, n_rows)
  cur_sub <- shrna_data[idx:last_row, ]

  # Format numbers explicitly so neither the log line nor the file name can
  # fall back to scientific notation (e.g. "1e+05"), whatever options(scipen)
  # is set to in the calling process.
  start_label <- format(idx, scientific = FALSE, trim = TRUE)
  end_label <- format(nominal_end, scientific = FALSE, trim = TRUE)
  print(paste0("Encoding ", start_label))

  onehot_results <- predict(encoder, cur_sub)
  # Carry the shRNA identifiers through as row names so they become the first
  # column of the data.table.
  rownames(onehot_results) <- cur_sub$shRNA
  onehot_results <- data.table(onehot_results, keep.rownames = TRUE)
  setnames(onehot_results, 1L, "shRNA")

  # Collapse repeated shRNA rows by summing each encoded column.
  encoded_cols <- colnames(onehot_results)[-1]
  onehot_results <- onehot_results[, lapply(.SD, sum), by = shRNA, .SDcols = encoded_cols]

  fwrite(onehot_results, file.path(out_dir, paste0(start_label, "_", end_label, ".txt")))
}
# [-(1:230)]
# Start rows of the 100000-row chunks of `temp`. An empty table gets no
# chunks; seq(1, 0, by = 100000) would raise an error instead.
chunk_starts <- if (nrow(temp) > 0) seq(1, nrow(temp), by = 100000) else numeric(0)
# Encode at most the first four chunks on two workers. head() is used instead
# of [1:4] because positional indexing pads with NA when fewer than four
# chunks exist, and those NA start rows would be passed to dummification().
# mc.preschedule = FALSE dispatches each chunk as its own job.
mc_results <- mclapply(X = head(chunk_starts, 4), FUN = dummification, encoder = onehot_encoder,
                       shrna_data = temp, mc.cores = 2,
                       mc.cleanup = TRUE, mc.preschedule = FALSE)
print(mc_results)