Download this file

100 lines (71 with data), 3.4 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# files required:
# 1) "geneDepth.txt": gene abundance table, with the first column being gene ids which are formatted as "scaffold_number"
# 2) "ko.txt" file giving informaiton on gene ids and KO number.
# 3) "bin_membership.txt" file giving information on bins and scaffold id
# 4) "bin_species.txt" file giving information on bins and Species annotaion
# output:
# A directory "5_LOSO_ko.abund" containing many "ko.abund_rm.species.gct" files, one for each species removed.
# Each KO abundance files will go through ssGSEA to generate abundance table for KEGG modules.
# For information of ssGSEA, see https://github.com/broadinstitute/ssGSEA2.0
## #####################################################
##
## load data and delete unused data to save memory
##
## #####################################################
log.file='LOSO.ko.log'
cat(paste(as.character(Sys.time()), '\n'), file=log.file, append=T)
cat('Performing LOSO.ko analysis: \n', file=log.file, append=T)
cat('Importing data: \n', file=log.file, append=T)
library(data.table)
library(tibble)
library(reshape2)
library(dplyr)
gene.KO_df <- fread("ko.txt", header = T) %>% dplyr::select(Gene, KOnumber)
geneDepth_df <- fread("geneDepth.txt", data.table = F)
colnames(geneDepth_df)[1] <- "GeneID"
geneDepth_df <- geneDepth_df %>% # ,nrows = 300 to test
dplyr::filter(GeneID %in% gene.KO_df$Gene) %>%
tibble::column_to_rownames("GeneID")
gc()
geneDepth_df <- merge(geneDepth_df,
gene.KO_df, by.x = 0, by.y="Gene")
colnames(geneDepth_df)[which(colnames(geneDepth_df) == "Row.names")] <- "Gene"
remove(gene.KO_df)
gc()
## #####################################################
##
## create species - scaffold file
##
## #####################################################
bin.scaf_df <- fread("bin_membership.txt") %>% dplyr::select(Scaffold, Bin)
bin.species_df <- fread("bin_species.txt") %>% dplyr::select(Bin, Species)
species.scaffold_df <- merge(bin.species_df, bin.scaf_df, by="Bin")
remove(bin.scaf_df, bin.species_df)
gc()
if(!dir.exists("LOSO_ko.abund") ) dir.create("LOSO_ko.abund")
uniqSpecies = unique(species.scaffold_df$Species)
rows.scaf = sub("(_\\d+$)","", geneDepth_df$Gene)
cat(paste('Excluding', length(uniqSpecies),' species one by one: \n'), file=log.file, append=T)
for(species in uniqSpecies){
i=which(uniqSpecies == species)
cat(paste(i,". ", species, ' \n', sep = ""), file=log.file, append=T)
# species = uniqSpecies[3]
scafs = species.scaffold_df$Scaffold[species.scaffold_df$Species == species]
geneDepth_df_sub <- geneDepth_df[!(rows.scaf %in% scafs),]
res <- geneDepth_df_sub %>%
dplyr::select(-Gene) %>%
# reshape2::melt(id.var = "KO", )
dplyr::group_by(KOnumber) %>%
dplyr::summarise_if(is.numeric, sum, na.rm = TRUE) %>%
tibble::column_to_rownames("KOnumber")
## export gct
gct.tmp <- new('GCT')
gct.tmp@mat <- data.matrix( res )
gct.tmp@rid <- rownames(res)
gct.tmp@cid <- colnames(res)
gct.tmp@cdesc <- data.frame(id=colnames(res))
gct.tmp@rdesc <- data.frame(row.names = rownames(res), Description = rownames(res))
fnn = paste("5_LOSO_ko.abund/ko.abund_rm.", species, ".gct", sep = "")
gct.tmp@src <- fnn
write.gct(gct.tmp, ofile=fnn, appenddim = F)
}