--- a +++ b/4-Multi-Omic Integration/scripts/5.LOSO.KO.r @@ -0,0 +1,99 @@ +# files required: +# 1) "geneDepth.txt": gene abundance table, with the first column being gene ids which are formatted as "scaffold_number" +# 2) "ko.txt" file giving informaiton on gene ids and KO number. +# 3) "bin_membership.txt" file giving information on bins and scaffold id +# 4) "bin_species.txt" file giving information on bins and Species annotaion + +# output: +# A directory "5_LOSO_ko.abund" containing many "ko.abund_rm.species.gct" files, one for each species removed. +# Each KO abundance files will go through ssGSEA to generate abundance table for KEGG modules. +# For information of ssGSEA, see https://github.com/broadinstitute/ssGSEA2.0 + + + +## ##################################################### +## +## load data and delete unused data to save memory +## +## ##################################################### +log.file='LOSO.ko.log' + +cat(paste(as.character(Sys.time()), '\n'), file=log.file, append=T) +cat('Performing LOSO.ko analysis: \n', file=log.file, append=T) +cat('Importing data: \n', file=log.file, append=T) + +library(data.table) +library(tibble) +library(reshape2) +library(dplyr) + +gene.KO_df <- fread("ko.txt", header = T) %>% dplyr::select(Gene, KOnumber) + +geneDepth_df <- fread("geneDepth.txt", data.table = F) +colnames(geneDepth_df)[1] <- "GeneID" + +geneDepth_df <- geneDepth_df %>% # ,nrows = 300 to test + dplyr::filter(GeneID %in% gene.KO_df$Gene) %>% + tibble::column_to_rownames("GeneID") + +gc() + +geneDepth_df <- merge(geneDepth_df, + gene.KO_df, by.x = 0, by.y="Gene") +colnames(geneDepth_df)[which(colnames(geneDepth_df) == "Row.names")] <- "Gene" +remove(gene.KO_df) +gc() + + + +## ##################################################### +## +## create species - scaffold file +## +## ##################################################### +bin.scaf_df <- fread("bin_membership.txt") %>% dplyr::select(Scaffold, Bin) +bin.species_df <- fread("bin_species.txt") %>% dplyr::select(Bin, Species) + +species.scaffold_df <- merge(bin.species_df, bin.scaf_df, by="Bin") +remove(bin.scaf_df, bin.species_df) +gc() + +if(!dir.exists("LOSO_ko.abund") ) dir.create("LOSO_ko.abund") + +uniqSpecies = unique(species.scaffold_df$Species) +rows.scaf = sub("(_\\d+$)","", geneDepth_df$Gene) + +cat(paste('Excluding', length(uniqSpecies),' species one by one: \n'), file=log.file, append=T) +for(species in uniqSpecies){ + i=which(uniqSpecies == species) + cat(paste(i,". ", species, ' \n', sep = ""), file=log.file, append=T) + + # species = uniqSpecies[3] + + scafs = species.scaffold_df$Scaffold[species.scaffold_df$Species == species] + + + geneDepth_df_sub <- geneDepth_df[!(rows.scaf %in% scafs),] + + res <- geneDepth_df_sub %>% + dplyr::select(-Gene) %>% + # reshape2::melt(id.var = "KO", ) + dplyr::group_by(KOnumber) %>% + dplyr::summarise_if(is.numeric, sum, na.rm = TRUE) %>% + tibble::column_to_rownames("KOnumber") + + + ## export gct + gct.tmp <- new('GCT') + gct.tmp@mat <- data.matrix( res ) + gct.tmp@rid <- rownames(res) + gct.tmp@cid <- colnames(res) + gct.tmp@cdesc <- data.frame(id=colnames(res)) + gct.tmp@rdesc <- data.frame(row.names = rownames(res), Description = rownames(res)) + fnn = paste("5_LOSO_ko.abund/ko.abund_rm.", species, ".gct", sep = "") + gct.tmp@src <- fnn + + write.gct(gct.tmp, ofile=fnn, appenddim = F) +} + +