"""
Implementation of tissue-specific graph walk with RWR
"""
import sys
import pandas as pd
import numpy as np
import networkx as nx
import argparse
import sklearn.preprocessing
# convergence criterion - when vector L1 norm drops below 10^(-6)
# (this is the same as the original RWR paper)
CONV_THRESHOLD = 0.000001
def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")
def isNum(x):
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        return False

class Walker:
    """ Class for multi-graph walk to convergence, using matrix computation.

    Random walk with restart (RWR) algorithm adapted from:

    Kohler S, Bauer S, Horn D, Robinson PN. Walking the interactome for
    prioritization of candidate disease genes. The American Journal of Human
    Genetics. 2008 Apr 11;82(4):949-58.

    Attributes:
    -----------
        og_matrix (np.array)  : The normalized adjacency matrix of the
                                original graph
        tg_matrix (np.array)  : The normalized adjacency matrix of the
                                teleport graph, which contributes the
                                additional edges used for teleporting
        restart_prob (float)  : The probability of restarting from the seed
                                nodes at each step (r in the original Kohler
                                et al. RWR formulation)
        teleport_prob (float) : The probability of stepping through the
                                teleport graph instead of the original graph
        normalize (bool)      : Whether to rescale p_0 (and each p_t) to
                                sum to 1
    """
    def __init__(self, original_ppi, teleport_ppi, remove_nodes=[], constantWeight=False, absWeight=False, addBidirectionEdge=False):
        self._build_matrices(original_ppi, teleport_ppi, remove_nodes, constantWeight, absWeight, addBidirectionEdge)
        self.dic_node2idx = {node: i for i, node in enumerate(self.OG.nodes())}
    def run_exp(self, seed2weight, restart_prob, teleport_prob=0.5, normalize=False, node_list=[]):
        """ Run one network-propagation experiment (one sample) and return
        the results.

        Parameters:
        -----------
            seed2weight (dict)   : Maps each seed node (e.g. an Entrez gene
                                   ID) to its initial weight
            restart_prob (float) : As above
            teleport_prob (float): As above
            normalize (bool)     : As above
            node_list (list)     : If given, results are returned only for
                                   these nodes

        Returns a list of [node, final probability, full probability trace].
        """
        self.restart_prob = restart_prob
        self.teleport_prob = teleport_prob
        # set up the starting probability vector
        p_0 = self._set_up_p0(seed2weight)
        if normalize:
            p_0 /= np.sum(p_0)  # rescale to sum to 1
        diff_norm = 1
        # this needs to be a deep copy, since we're reusing p_0 later
        p_t = np.copy(p_0)
        # arr_p accumulates every p_t, for tracing the propagation
        arr_p = np.empty((len(p_t), 1))
        arr_p[:, 0] = p_t
        while diff_norm > CONV_THRESHOLD:
            # first, calculate p^(t + 1) from p^(t)
            p_t_1 = self._calculate_next_p(p_t, p_0)
            if normalize:
                p_t_1 /= np.sum(p_t_1)  # rescale to sum to 1
            # calculate the L1 norm of the difference between p^(t + 1) and
            # p^(t), for checking the convergence condition
            diff_norm = np.linalg.norm(np.subtract(p_t_1, p_t), 1)
            # then, set p^(t) = p^(t + 1), and loop again if necessary
            # (no deep copy needed here, we're just rebinding the name)
            p_t = p_t_1
            # append p_t to arr_p
            arr_p = np.c_[arr_p, p_t]
            # safety cap: stop after 50,000 iterations even without convergence
            if arr_p.shape[1] >= 50000:
                break
        print('%d iterated' % (arr_p.shape[1]))
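        # arr_p holds one column per iteration (column k is p^(k), starting
        # from p_0 in column 0), so arr_p[:, -1] is the converged vector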
        # finally, generate the result list from the probability trace
        if node_list:
            # return propagation results only for the requested nodes
            nodes = node_list
        else:
            nodes = sorted(self.OG.nodes())
        output = []
        for node in nodes:
            i = self.dic_node2idx[node]
            output.append([node, arr_p[i, -1], arr_p[i, :].tolist()])
        return output
    def _generate_prob_list(self, p_t, node_list):
        gene_probs = dict(zip(self.OG.nodes(), p_t.tolist()))
        for node in node_list:
            yield node, gene_probs[node]

    def _generate_rank_list(self, p_t):
        """ Yield a rank list, generated from the final probability vector.

        The gene rank list is ordered from highest to lowest probability.
        """
        gene_probs = zip(self.OG.nodes(), p_t.tolist())
        # sort by probability (from largest to smallest), and yield
        # (gene ID, probability) pairs
        for s in sorted(gene_probs, key=lambda x: x[1], reverse=True):
            yield s[0], s[1]
    def _calculate_next_p(self, p_t, p_0):
        """ Calculate the next probability vector. """
        epsilon = np.squeeze(np.asarray(np.dot(self.og_matrix, p_t)))
        no_restart = epsilon * (1 - self.restart_prob)
        restart = p_0 * self.restart_prob
        if self.teleport_prob is None:
            # plain RWR step, no teleport graph
            return np.add(no_restart, restart)
        # mix the RWR step with a step through the teleport graph
        epsilon_teleport = np.squeeze(np.asarray(np.dot(self.tg_matrix, p_t)))
        return (np.add(no_restart, restart) * (1 - self.teleport_prob)
                + epsilon_teleport * self.teleport_prob)
    def _set_up_p0(self, seed2weight):
        """ Set up and return the 0th probability vector. """
        p_0 = np.zeros(self.OG.number_of_nodes())
        for seed, weight in seed2weight.items():
            if seed not in self.dic_node2idx:
                # the seed node is not in the original graph; ignore it
                continue
            p_0[self.dic_node2idx[seed]] = weight
        return p_0
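    # Example (gene IDs are illustrative): _set_up_p0({"TP53": 0.8, "EGFR": 0.2})
    # returns a vector that is zero everywhere except at the indices of those
    # two seeds; seeds absent from the graph are silently skipped.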
    def _build_matrices(self, original_ppi, teleport_ppi, remove_nodes, constantWeight, absWeight, addBidirectionEdge):
        """ Build the normalized adjacency matrix for each graph.

        NOTE: these are normalized adjacency matrices (not nx graphs),
        used to compute each p-vector.
        """
        original_graph = self._build_og(original_ppi, constantWeight, absWeight, addBidirectionEdge)
        if remove_nodes:
            # remove nodes, then keep the largest (weakly) connected
            # component of what remains
            original_graph.remove_nodes_from(remove_nodes)
            original_graph = original_graph.subgraph(
                max(nx.weakly_connected_components(original_graph), key=len)).copy()
        self.OG = original_graph
        og_not_normalized = nx.to_numpy_array(original_graph)
        self.og_matrix = self._normalize_rows(np.transpose(og_not_normalized))
        teleport_graph = self._build_og(teleport_ppi, constantWeight, absWeight, addBidirectionEdge)
        if remove_nodes:
            # same treatment for the teleport graph
            teleport_graph.remove_nodes_from(remove_nodes)
            teleport_graph = teleport_graph.subgraph(
                max(nx.weakly_connected_components(teleport_graph), key=len)).copy()
        self.TG = teleport_graph  # nx object
        # align the teleport adjacency matrix with the original graph's node
        # order, filling zeros for nodes absent from the teleport graph
        tg_tmp = nx.to_pandas_adjacency(teleport_graph)
        tg_df = pd.DataFrame(np.zeros(og_not_normalized.shape),
                             index=original_graph.nodes, columns=original_graph.nodes)
        tg_df.update(tg_tmp)
        tg_not_normalized = tg_df.to_numpy()
        self.tg_matrix = self._normalize_rows(np.transpose(tg_not_normalized))
    def _build_og(self, original_ppi, constantWeight=False, absWeight=False, addBidirectionEdge=False):
        """ Build the original graph, without any nodes removed. """
        try:
            graph_fp = open(original_ppi, 'r')
        except IOError:
            sys.exit("Could not open file: {}".format(original_ppi))
        G = nx.DiGraph()
        edge_list = []
        # parse network input
        for line in graph_fp.readlines():
            split_line = line.rstrip().split('\t')
            if len(split_line) < 3:
                # assume the input graph is a simple edge list without weights
                weight = 1.0
            else:
                # assume the input graph is a simple edge list with weights
                weight = float(split_line[2])
            if constantWeight:
                weight = 1.0
            if absWeight:
                weight = abs(weight)
            edge_list.append((split_line[0], split_line[1], float(weight)))
            if addBidirectionEdge:
                edge_list.append((split_line[1], split_line[0], float(weight)))
        G.add_weighted_edges_from(edge_list)
        graph_fp.close()
        return G
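    # Edge-list format consumed by _build_og above (tab-separated; the third
    # column is optional and defaults to 1.0; identifiers are illustrative):
    #
    #     geneA    geneB    0.75
    #     geneB    geneC    -1.2
    #     geneC    geneA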
    def _normalize_cols(self, matrix):
        """ L1-normalize the columns of the adjacency matrix """
        return sklearn.preprocessing.normalize(matrix, norm='l1', axis=0)

    def _normalize_rows(self, matrix):
        """ L1-normalize the rows of the adjacency matrix """
        return sklearn.preprocessing.normalize(matrix, norm='l1', axis=1)
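
# Programmatic use of Walker outside the CLI (a minimal sketch; file names,
# gene IDs, and weights are illustrative):
#
#     wk = Walker("network.tsv", "tissue_network.tsv")
#     results = wk.run_exp({"TP53": 1.0, "EGFR": 0.5},
#                          restart_prob=0.1, teleport_prob=0.5)
#     for node, final_prob, trace in results[:10]:
#         print(node, final_prob)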

def main_propagation(argv):
    # set up argument parsing
    parser = argparse.ArgumentParser(usage='''\
python %(prog)s input_graph teleport_graph seed -o myout -e 0.01
''')
    parser.add_argument('input_graph', help='Original graph input file, in edge list format')
    parser.add_argument('teleport_graph', help='Teleport graph input file, in edge list format')
    parser.add_argument('seed', help='Seed file, to pull start nodes from')
    parser.add_argument('-o', required=True, help='Output file')
    parser.add_argument('-e', '--restart_prob', type=float, default=0.1, help='Restart probability for the random walk')
    parser.add_argument('--teleport_prob', type=float, default=0.5, help='Probability of stepping through the teleport graph')
    parser.add_argument('-constantWeight', default='False', choices=['True', 'False'], help='Whether to use a constant edge weight')
    parser.add_argument('-absoluteWeight', default='False', choices=['True', 'False'], help='Whether to use absolute edge weights')
    parser.add_argument('-addBidirectionEdge', default='False', choices=['True', 'False'], help='Whether to add edges in both directions')
    parser.add_argument('-normalize', default='False', choices=['True', 'False'], help='Whether to normalize p0')
    args = parser.parse_args()
    try:
        fp = open(args.seed, "r")
    except IOError:
        sys.exit("Error opening file {}".format(args.seed))
    lst_columnName = ['0']
    lst_seed = []
    lst_weights = []
    for line in fp.readlines():
        s = line.rstrip().split()
        if len(s) >= 2:
            if not isNum(s[1]):
                # header line: read the column names, nothing else to parse
                lst_columnName = s[1:]
                continue
            else:
                lst_columnName = [str(i) for i in np.arange(len(s[1:])) + 1]
        seed = s[0]
        if len(s) == 1:
            # only gene IDs are given; default every weight to 1
            weights = [1.0]
        else:
            weights = list(map(float, s[1:]))
        lst_seed.append(seed)
        lst_weights.append(weights)
    arr_weights = np.array(lst_weights)
    fp.close()
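    # The seed file parsed above is whitespace-separated: one gene per row,
    # one weight column per sample, and an optional header line of column
    # names (values are illustrative):
    #
    #     Gene    sample1    sample2
    #     TP53    0.8        0.1
    #     EGFR    0.0        0.5
    #
    # A single-column seed file assigns every listed gene a weight of 1.0.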
    # run the experiments, and write the propagated weights to the outfile
    dic_node2weights = {}
    set_nodes = set()
    lst_wk = []
    # one Walker per input graph
    wk = Walker(args.input_graph, args.teleport_graph,
                constantWeight=str2bool(args.constantWeight),
                absWeight=str2bool(args.absoluteWeight),
                addBidirectionEdge=str2bool(args.addBidirectionEdge))
    set_nodes |= set(wk.OG.nodes())
    lst_wk.append(wk)
    column_name = []
    for idx, wk in enumerate(lst_wk):  # in case there are multiple input graphs
        for j in range(arr_weights.shape[1]):  # iterate over samples
            column_name.append(lst_columnName[j])
            if sum(np.abs(arr_weights[:, j])) == 0.0:
                # all-zero seed weights: the propagation result is all zeros
                for node in set_nodes:
                    dic_node2weights.setdefault(node, []).append(0.0)
                continue
            seed2weight = dict()
            for ii in range(len(lst_seed)):
                seed2weight[lst_seed[ii]] = arr_weights[ii, j]
            lst_node_weight = wk.run_exp(seed2weight, args.restart_prob,
                                         teleport_prob=args.teleport_prob,
                                         normalize=str2bool(args.normalize))
            set_tmpNodes = set()
            for node, weight, all_weight in lst_node_weight:
                dic_node2weights.setdefault(node, []).append(weight)
                set_tmpNodes.add(node)
            # nodes missing from this run get weight 0
            for node in set_nodes - set_tmpNodes:
                dic_node2weights.setdefault(node, []).append(0.0)
    OF = open(args.o, 'w')
    #OF.write('Gene\t' + '\t'.join(column_name) + '\n')
    for node, weights in dic_node2weights.items():
        OF.write('\t'.join(map(str, [node] + weights)) + '\n')
        OF.flush()
    OF.close()

if __name__ == '__main__':
    main_propagation(sys.argv)