Switch to side-by-side view

--- a
+++ b/4-Multi-Omic Integration/scripts/4.KO2cmpd.r
@@ -0,0 +1,124 @@
+# files required:
+# 1) "ko01000.keg" downloaded from KEGG
+# 2) "metacyc_reactions.txt" storing information on susbstrates, products of EC numbers
+
+# output:
+# "KO2EC.list.RData", "EC2CMPD.lists.RData", "KO2CMPD.lists.RData" stored in the "database" directory
+
+library(data.table)
+library(dplyr)
+
+## generate KO2EC list #############
+dbDir = "database"
+fileName <- paste(dbDir,"/ko01000.keg",sep = "")
+conn <- file(fileName,open="r")
+linn <-readLines(conn)
+#for (i in 1:length(linn)){
+#  print(linn[i])
+#}
+close(conn)
+
+
+
+KO2EC_list <- vector("list",length=length(linn))
+for(i in c(1:length(linn))){
+  l = linn[i]
+  
+  if(!grepl("K\\d{5}",l,perl = T)) next
+  ko <- sub(".*(K\\d{5}).*","\\1",l)
+  
+  if(grepl("\\[EC\\:(.*)\\]",l, perl = T)) ecs <- strsplit( sub(".*\\[EC\\:(.*)\\]$", "\\1", l, perl = T), " ", fixed = T)[[1]] else ecs <- ""
+  
+  if(ko %in% names(KO2EC_list)) {
+    i_exist = which(names(KO2EC_list) == ko) 
+    updated_ecs <-  unique(c( KO2EC_list[[i_exist]], ecs))
+    updated_ecs <- updated_ecs[updated_ecs != ""]
+    KO2EC_list[[i_exist]] <- updated_ecs
+    next
+  }
+  
+  KO2EC_list[[i]] <- ecs
+  names(KO2EC_list)[i] <- ko
+  
+}
+
+
+KO2EC_list <- KO2EC_list[!sapply(KO2EC_list, is.null) ] 
+save(KO2EC_list, file = paste(dbDir,"/KO2EC.list.RData",sep = "")) 
+
+
+## generate EC2CMPD lists ##################
+metacyc_rxns <- fread(paste(dbDir,"/metacyc_reactions.txt",sep = "")) %>% filter(`EC-NUMBER` != "")
+EC.Substrates_list <- vector("list", length = nrow(metacyc_rxns))
+EC.Products_list <- vector("list", length = nrow(metacyc_rxns))
+EC.SubsProd_list <- vector("list", length = nrow(metacyc_rxns))
+
+for(i in c(1:nrow(metacyc_rxns))){
+  
+  ecs <- metacyc_rxns$`EC-NUMBER`[[i]]
+  ecs <- sub("\\|?EC\\-(.*)\\|?", "\\1", ecs)
+  rxn_drct = metacyc_rxns$`REACTION-DIRECTION`[[i]]
+  
+  
+  if(grepl('LEFT-TO-RIGHT', rxn_drct) | rxn_drct == "") {
+    substrates <- strsplit(metacyc_rxns$LEFT[i],";", fixed = T)[[1]]
+    products <- strsplit(metacyc_rxns$RIGHT[i],";",fixed = T)[[1]]
+    subs.prod <- NA
+  }else if(grepl("RIGHT-TO-LEFT", rxn_drct)){
+    substrates <- strsplit(metacyc_rxns$RIGHT[i], ";", fixed = T)[[1]]
+    products <- strsplit(metacyc_rxns$LEFT[i],";",fixed = T)[[1]]
+    subs.prod <- NA
+  }else if(rxn_drct == "REVERSIBLE"){
+    substrates <- NA
+    products <- NA
+    subs.prod <- unique(c(strsplit(metacyc_rxns$RIGHT[i], ";", fixed = T)[[1]], strsplit(metacyc_rxns$LEFT[i],";",fixed = T)[[1]] ))
+  }
+  
+  EC.Substrates_list[[i]] <- substrates; names(EC.Substrates_list)[i] <- ecs
+  EC.Products_list[[i]] <- products; names(EC.Products_list)[i] <- ecs
+  EC.SubsProd_list[[i]] <- subs.prod; names(EC.SubsProd_list)[i] <- ecs
+  
+}
+
+save(EC.Substrates_list, EC.Products_list, EC.SubsProd_list, 
+     file = paste(dbDir,"/EC2CMPD.lists.RData",sep = ""))
+
+
+## generate KO.Substrates, KO.Products and KO.SubsProd lists #########
+KO.Substrates_list <- vector("list", length = length(KO2EC_list))
+KO.Products_list <- vector("list", length = length(KO2EC_list))
+KO.SubsProd_list <- vector("list", length = length(KO2EC_list))
+
+for(i in c(1:length(KO2EC_list))){
+  
+  ko <- names(KO2EC_list)[i]
+  ecs <- KO2EC_list[[ko]]
+  
+  products <- vector("character")
+  substrates <- vector("character")
+  subs.prod <- vector("character")
+  
+  for(ec in ecs){
+    list.indexes <- which(names(EC.Products_list) == ec)
+    
+    for(l_ind in list.indexes){
+      products <- unique(append(products, EC.Products_list[[l_ind]]) )
+      substrates <- unique(append(substrates, EC.Substrates_list[[l_ind]]) )
+      subs.prod <- unique(append(subs.prod, EC.SubsProd_list[[l_ind]]) )
+    }
+  }
+  
+  s.p <- intersect(products, substrates) 
+  subs.prod <- unique(c(subs.prod, s.p)[!is.na(c(subs.prod, s.p))] )  # remove na and then unique
+  
+  substrates <- substrates[!is.na(substrates) & !(substrates %in% subs.prod)] # remove na and remove subs.prod
+  products <- products[!is.na(products) & !(products %in% subs.prod)]
+  
+  KO.Substrates_list[[i]] <- substrates; names(KO.Substrates_list)[i] <- ko
+  KO.Products_list[[i]] <- products; names(KO.Products_list)[i] <- ko
+  KO.SubsProd_list[[i]] <- subs.prod; names(KO.SubsProd_list)[i] <- ko
+  
+}
+
+save(KO.SubsProd_list, KO.Products_list, KO.Substrates_list,
+     file = paste(dbDir,"/KO2CMPD.lists.RData",sep = ""))