--- a +++ b/4-Multi-Omic Integration/scripts/4.HostT.HostP.link.r @@ -0,0 +1,160 @@ +# files required: +# 1) "3_HostT_affects_NEU_through_HostP.txt": resulting data frame generated from script "4.mediation_hostT.2.hostP.r" +# 2) "transcriptome.txt" transcriptomic abundance table on the feature level +# 3) "sputum_proteome.txt" proteomic abundance table on the feature level +# 4) "1_hostT.module_assign.txt" module assignment file of the transcriptomic featrues, generated from script "1.WGCNA_transcriptomics.r" +# 5) "protein_info.txt" protein names and corresponding gene names +# 6) "human_pathway.gmt" Human pathway information file (.gmt file) + +# output: +# "4_HostT.module_HostP.protein.linked.txt" + + +output.dir = "Output" +log.file = "HostT.HostP.link.log" + + +## ###################################################################### +## +## import data +## +## ##################################################################### +library(data.table) +library(dplyr) + +cat(paste("\n\n",as.character(Sys.time()), '\n'), file=log.file, append=T) +cat("Importing data : \n", file=log.file, append=T) + +mediation.res <- fread("3_HosT_affects_NEU_through_HosP.txt") + + +# impport host t data +m1 <- data.frame(fread("transcriptome.txt"), row.names=1) +feature.abb_df1 <- cbind.data.frame(feature = rownames(m1), + abb = paste("feature",seq(1,nrow(m1),1),sep = ""), + stringsAsFactors = F) +rownames(m1) <- sapply(rownames(m1), function(x) feature.abb_df1$abb[which(feature.abb_df1$feature == x)]) +m1 <- t(m1) %>% as.data.frame(stringsAsFactors=F) +colnames(m1) <- sapply(colnames(m1), function(x) feature.abb_df1$feature[which(feature.abb_df1$abb == x)]) + +HostT.dat <- m1 + + + +# import host p data +m2 = data.frame(fread("sputum_proteome.txt"), row.names = 1 ) +feature.abb_df2 <- cbind.data.frame(feature = rownames(m2), + abb = paste("feature",seq(1,nrow(m2),1),sep = ""), + stringsAsFactors = F) +rownames(m2) <- sapply(rownames(m2), function(x) feature.abb_df2$abb[which(feature.abb_df2$feature == x)]) +m2 <- t(m2) %>% as.data.frame(stringsAsFactors=F) +colnames(m2) <- sapply(colnames(m2), function(x) feature.abb_df2$feature[which(feature.abb_df2$abb == x)]) + +HostP.dat <- m2 + +# hostT module - feature assignment + +HostT_module.feature <- fread("1_hostT.module_assign.txt", data.table = F, col.names =c("Feature","Module")) + +# protein - gene information + +temp <- fread("database/protein_info.txt", data.table = T) %>% dplyr::select(Target, GeneName) %>% unique() +Protein.Gene_list <- vector("list", length = nrow(temp)) +names(Protein.Gene_list) <- temp$Target +for(i in c(1:nrow(temp))){ + #i=1 + prot = temp$Target[i] + genenames = strsplit(temp$GeneName[i], split = '[\\s\\/]', perl = T )[[1]] + Protein.Gene_list[[prot]] <- genenames + +} + +# pathway gene information + +PTWY_list <- Read.GeneSets.db2("database/human_pathway.gmt") +Pthwy.genes_list <- PTWY_list$gs # the gene names +PTWY_list$gs.desc[1:3] # the pathway id + + +# merge data +HostT.HostP.dat <- merge(HostT.dat, HostP.dat, by=0) + + + +## ############################################### +## +## perform linke analysis +## +## ############################################### +cat("Performing link analysis : \n", file=log.file, append=T) +# identify HostT-HostP module pairs +# HostT.HostP.modPairs <- strsplit((mediation.res %>% dplyr::filter(ACME.p <= ACME.p.co))$Treat_Mediator_Y,"_",fixed = T) +ACME.p.co = 0.10 +HostT.HostP.modPairs <-(mediation.res %>% dplyr::filter(ACME.p <= ACME.p.co))$Treat_Mediator_Y + +# identify links +links <- NULL +for(mdp in HostT.HostP.modPairs){ + # mdp = HostT.HostP.modPairs[1] + + + parts = strsplit(mdp,"_", fixed = T)[[1]] + + + + #cat(paste("\n\nAnalyzing module pairs: ", parts[1], "_",parts[2],"\n", sep = ""), file=log.file, append=T) + i_mdp <- which(HostT.HostP.modPairs == mdp) + if(i_mdp %% 100 == 0) cat(paste(" -------------------- progress: ", i_mdp, " out of ", length(HostT.HostP.modPairs)," module pairs -------------------------- \n", sep = ""), file=log.file, append=T) + + + + + + + # identify hostp associated gene + hostp.pro = sub("ME(.*)$","\\1",sub("HostP\\.(.*)$","\\1",parts[2]) ) + hostp.gene = Protein.Gene_list[[hostp.pro]] + + # identify hostt associated gene + # identify hostt associated gene by module assignment + hostt.md = sub("ME(.*)$","\\1",sub("HostT\\.(.*)$","\\1",parts[1]) ) + hostt.ftr = HostT_module.feature$Feature[HostT_module.feature$Module == hostt.md] + + # identify hostt associated gene through pathway enrichment analysis + pathwy.pvalue <- NULL + for(pthy in names(Pthwy.genes_list)[!grepl("^GENEGO", names(Pthwy.genes_list))]){ + #pthy = names(Pthwy.genes_list)[!grepl("^GENEGO", names(Pthwy.genes_list))][1] + + pthy.genes = Pthwy.genes_list[[pthy]] + overlaped = intersect(pthy.genes, hostt.ftr) + totalgene.no = ncol(HostT.dat) + + pvalue <-dhyper(x=length(overlaped), m=length(pthy.genes), n= totalgene.no - length(pthy.genes), k = length(hostt.ftr)) + p.vec_temp <- c(pthy, pvalue) + names(p.vec_temp) <- c("Pathway","dhyper.pvalue") + pathwy.pvalue <- bind_rows(pathwy.pvalue, p.vec_temp) + } + pathwy.pvalue <- pathwy.pvalue %>% + mutate(fdr = p.adjust(dhyper.pvalue, method = "fdr")) %>% + dplyr::arrange(fdr) %>% filter(fdr <= 0.25) + + if(nrow(pathwy.pvalue) == 0) { + hostt.ftr.integrated <- hostt.ftr + } else { + hostt.ftr.pthwy = Pthwy.genes_list[[pathwy.pvalue$Pathway[1]]] + hostt.ftr.integrated <- unique(c(hostt.ftr,hostt.ftr.pthwy )) + } + + + if(length( intersect(hostt.ftr.integrated, hostp.gene)) > 0) { + link = c(paste(parts[1], parts[2], sep = "_" ), paste(intersect(hostt.ftr.integrated, hostp.gene), collapse = ";") ) + names(link) <- c("HostT.module_HostP.protein_pair", "Genes.association") + links <- bind_rows(links,link) + remove(link) + } + +} # loop through module pairs + + +if(!dir.exists(output.dir)) dir.create(output.dir) +write.table(links, file = paste(output.dir,"/HostT.module_HostP.protein.NEU.linked.txt",sep = ""), sep = "\t", quote = F, row.names = F)