import networkx as nx
import networkx.algorithms.isomorphism as iso
import numpy as np
import pandas as pd
from scipy.sparse.csgraph import laplacian
from scipy.linalg import eigh
from scipy.integrate import quad
from sklearn.metrics import pairwise_distances
import warnings
import numba
from numba import jit, float32
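# Evaluation utilities for trajectory inference: graph-topology scores (isomorphism,
# graph edit distance, Ipsen-Mikhailov), milestone-assignment scores (ARI, GRI) and
# pseudotime correlation, followed by a driver that scores Monocle, PAGA and Slingshot
# on the benchmark datasets.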
def IM_dist(G1, G2):
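    '''
    Ipsen-Mikhailov distance between two graphs with the same number of nodes.

    The non-zero Laplacian eigenvalues of each graph are smoothed with Lorentzian
    kernels (half-width at half-maximum 0.08) into spectral densities, and the
    distance is the L2 norm of their difference over [0, inf).
    '''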
adj1 = nx.to_numpy_array(G1)
adj2 = nx.to_numpy_array(G2)
hwhm = 0.08
N = len(adj1)
# get laplacian matrix
L1 = laplacian(adj1, normed=False)
L2 = laplacian(adj2, normed=False)
# get the modes for the positive-semidefinite laplacian
w1 = np.sqrt(np.abs(eigh(L1)[0][1:]))
w2 = np.sqrt(np.abs(eigh(L2)[0][1:]))
    # normalization constants for the two spectral densities
norm1 = (N - 1) * np.pi / 2 - np.sum(np.arctan(-w1 / hwhm))
norm2 = (N - 1) * np.pi / 2 - np.sum(np.arctan(-w2 / hwhm))
# define both spectral densities
density1 = lambda w: np.sum(hwhm / ((w - w1) ** 2 + hwhm ** 2)) / norm1
density2 = lambda w: np.sum(hwhm / ((w - w2) ** 2 + hwhm ** 2)) / norm2
func = lambda w: (density1(w) - density2(w)) ** 2
return np.sqrt(quad(func, 0, np.inf, limit=100)[0])
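# A minimal sanity check (hypothetical usage, not part of the pipeline below):
# identical graphs give a distance of (numerically) zero, different topologies a
# positive value; the pipeline only calls it when both graphs have the same size, e.g.
#   IM_dist(nx.path_graph(5), nx.path_graph(5))   # ~0.0
#   IM_dist(nx.path_graph(5), nx.cycle_graph(5))  # > 0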
def build_milestone_net(subgraph, init_node):
    '''
    Args:
        subgraph - a connected component of the milestone graph (networkx graph
                   whose edges carry a 'weight' attribute)
        init_node - root node
    Returns:
        milestone_net - array with one row per non-root node:
                        [parent, node, hop distance from the root]
    '''
if len(subgraph)==1:
        warnings.warn('Singleton node.')
return []
else:
# Dijkstra's Algorithm
unvisited = {node: {'parent':None,
'score':np.inf,
'distance':np.inf} for node in subgraph.nodes}
current = init_node
currentScore = 0
currentDistance = 0
unvisited[current]['score'] = currentScore
milestone_net = []
while True:
for neighbour in subgraph.neighbors(current):
if neighbour not in unvisited: continue
newScore = currentScore + subgraph[current][neighbour]['weight']
if unvisited[neighbour]['score'] > newScore:
unvisited[neighbour]['score'] = newScore
unvisited[neighbour]['parent'] = current
unvisited[neighbour]['distance'] = currentDistance+1
if len(unvisited)<len(subgraph):
milestone_net.append([unvisited[current]['parent'],
current,
unvisited[current]['distance']])
del unvisited[current]
if not unvisited: break
current, currentScore, currentDistance = \
sorted([(i[0],i[1]['score'],i[1]['distance']) for i in unvisited.items()],
key = lambda x: x[1])[0]
return np.array(milestone_net)
def comp_pseudotime(G, node, df):
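    '''
    Assign a pseudotime to every cell in df.

    Hop distances from the root are computed on the connected component containing
    `node`; a cell on edge (u, v) gets pseudotime = hop distance of u plus its
    'percentage' along the edge (1 - percentage for cells stored on the reversed
    edge). Cells on edges unreachable from the root keep -1.

    Args:
        G    - milestone graph whose edges carry a 'weight' attribute
        node - root milestone
        df   - per-cell dataframe with columns 'from', 'to' and 'percentage'
    Returns:
        pseudotime - array of length len(df); -1 for unassigned cells
    '''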
connected_comps = nx.node_connected_component(G, node)
subG = G.subgraph(connected_comps)
milestone_net = build_milestone_net(subG,node)
# compute pseudotime
pseudotime = - np.ones(len(df))
for i in range(len(milestone_net)):
_from, _to = milestone_net[i,:2]
_from, _to = int(_from), int(_to)
idc = (df['from']==_from)&(df['to']==_to)
if np.sum(idc)>0:
pseudotime[idc] = df['percentage'].values[idc] + milestone_net[i,-1] - 1
idc = (df['from']==_to)&(df['to']==_from)
if np.sum(idc)>0:
pseudotime[idc] = 1-df['percentage'].values[idc] + milestone_net[i,-1] - 1
if np.any(df['from']==_from):
idc = (df['from']==_from)&(df['to']==_from)
pseudotime[idc] = milestone_net[i,-1] - 1
if len(milestone_net)>0 and np.any((df['from']==_to)&(df['to']==_to)):
idc = (df['from']==_to)&(df['to']==_to)
pseudotime[idc] = milestone_net[i,-1]
return pseudotime
def topology(G_true, G_pred, is_GED=True):
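    '''
    Compare the true and predicted milestone graphs.

    Returns a dict with
        'ISO score' - 1 if the graphs are isomorphic with matching root ('is_init') flags, else 0
        'GED score' - 1 - graph_edit_distance / |V(G_true)|, clipped to [0, 1];
                      NaN when either graph has more than 10 nodes
        'score_IM'  - 1 - Ipsen-Mikhailov distance (floored at 0) when the graphs have
                      the same number of nodes, else 0
    The is_GED flag is currently unused.
    '''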
res = {}
# 1. Isomorphism with same initial node
    # two nodes match iff they agree on whether they are the root
    def comparison(N1, N2):
        return N1['is_init'] == N2['is_init']
score_isomorphism = int(nx.is_isomorphic(G_true, G_pred, node_match=comparison))
res['ISO score'] = score_isomorphism
# 2. GED (graph edit distance)
if len(G_true)>10 or len(G_pred)>10:
warnings.warn("Didn't calculate graph edit distances for large graphs.")
res['GED score'] = np.nan
else:
max_num_oper = len(G_true)
score_GED = 1 - np.min([nx.graph_edit_distance(G_pred, G_true, node_match=comparison),
max_num_oper]) / max_num_oper
res['GED score'] = score_GED
# 3. Hamming-Ipsen-Mikhailov distance
if len(G_true)==len(G_pred):
score_IM = 1-IM_dist(G_true, G_pred)
score_IM = np.maximum(0, score_IM)
else:
score_IM = 0
res['score_IM'] = score_IM
return res
@jit((float32[:,:], float32[:,:]), nopython=True, nogil=True)
def _rand_index(true, pred):
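    '''
    Generalized Rand index for (soft) cluster assignments.

    Rows of `true` and `pred` are square roots of membership proportions; for each
    pair of cells the inner products below give Bhattacharyya-style co-assignment
    scores, and the pair contributes 1 minus the absolute difference of the two
    scores. With hard one-hot labels this reduces to the ordinary Rand index.
    '''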
n = true.shape[0]
m_true = true.shape[1]
m_pred = pred.shape[1]
RI = 0.0
    # iterate over all unordered pairs of samples (i < j)
    for i in range(n - 1):
        for j in range(i + 1, n):
RI_ij = 0.0
for k in range(m_true):
RI_ij += true[i,k]*true[j,k]
for k in range(m_pred):
RI_ij -= pred[i,k]*pred[j,k]
RI += 1-np.abs(RI_ij)
return RI / (n*(n-1)/2.0)
def get_GRI(true, pred):
    '''
    Params:
        true - [n_samples, n_cluster_1] proportions, or [n_samples, ] hard grouping labels
        pred - [n_samples, n_cluster_2] estimated proportions, or [n_samples, ] hard grouping labels
    '''
if len(true)!=len(pred):
raise ValueError('Inputs should have same lengths!')
if len(true.shape)==1:
true = pd.get_dummies(true).values
if len(pred.shape)==1:
pred = pd.get_dummies(pred).values
true = np.sqrt(true).astype(np.float32)
pred = np.sqrt(pred).astype(np.float32)
return _rand_index(true, pred)
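# Hypothetical usage (not executed here): with hard labels on both sides the GRI
# coincides with the ordinary Rand index, e.g.
#   get_GRI(np.array([0, 0, 1]), np.array([0, 1, 1]))   # -> 1/3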
import sys
sys.path.append('../..')
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.preprocessing import LabelEncoder
from VITAE import load_data
type_dict = {
# dyno
'dentate':'UMI',
'immune':'UMI',
'planaria_full':'UMI',
'planaria_muscle':'UMI',
'aging':'non-UMI',
'cell_cycle':'non-UMI',
'fibroblast':'non-UMI',
'germline':'non-UMI',
'human_embryos':'non-UMI',
'mesoderm':'non-UMI',
# dyngen
"cycle_1":'non-UMI',
"cycle_2":'non-UMI',
"cycle_3":'non-UMI',
"linear_1":'non-UMI',
"linear_2":'non-UMI',
"linear_3":'non-UMI',
"trifurcating_1":'non-UMI',
"trifurcating_2":'non-UMI',
"bifurcating_1":'non-UMI',
'bifurcating_2':'non-UMI',
"bifurcating_3":'non-UMI',
"converging_1":'non-UMI',
# our model
'linear':'UMI',
'bifurcation':'UMI',
'multifurcating':'UMI',
'tree':'UMI',
}
source_dict = {
'dentate':'real',
'immune':'real',
'planaria_muscle':'real',
'planaria_full':'real',
'aging':'real',
'cell_cycle':'real',
'fibroblast':'real',
'germline':'real',
'human_embryos':'real',
'mesoderm':'real',
"cycle_1":'dyngen',
"cycle_2":'dyngen',
"cycle_3":'dyngen',
"linear_1":'dyngen',
"linear_2":'dyngen',
"linear_3":'dyngen',
"trifurcating_1":'dyngen',
"trifurcating_2":'dyngen',
"bifurcating_1":'dyngen',
'bifurcating_2':'dyngen',
"bifurcating_3":'dyngen',
"converging_1":'dyngen',
'linear':'our model',
'bifurcation':'our model',
'multifurcating':'our model',
'tree':'our model',
}
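# Score every method on every dataset: rebuild the true milestone graph, load each
# method's predicted milestone network and per-cell progressions, and collect the
# topology, ARI/GRI and pseudotime metrics into one dataframe.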
df = pd.DataFrame()
for data_name in type_dict.keys():
data = load_data('data/', data_name)
milestone_net = data['milestone_network']
le = LabelEncoder()
grouping = le.fit_transform(data['grouping'])
begin_node_true = le.transform([data['root_milestone_id']])[0]
if 'w' in milestone_net.columns:
grouping = None
if milestone_net is not None:
milestone_net['from'] = le.transform(milestone_net['from'])
milestone_net['to'] = le.transform(milestone_net['to'])
G_true = nx.Graph()
G_true.add_nodes_from(np.unique(milestone_net[['from','to']].values.flatten()))
if grouping is None:
G_true.add_edges_from(list(
milestone_net[~pd.isna(milestone_net['w'])].groupby(['from', 'to']).count().index))
# otherwise, 'milestone_net' indicates edges
else:
if milestone_net is not None:
G_true.add_edges_from(list(
milestone_net.groupby(['from', 'to']).count().index))
G_true.remove_edges_from(nx.selfloop_edges(G_true))
nx.set_node_attributes(G_true, False, 'is_init')
G_true.nodes[begin_node_true]['is_init'] = True
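    # Each method's output is a per-cell progressions table ('from', 'to', 'percentage')
    # and a predicted milestone network listing the inferred edges.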
for method in ['monocle','paga','slingshot']:
_df = pd.read_csv('result/other methods/%s/%s_progressions.csv'%(data_name,method),
index_col=0).astype({'from':str,'to':str})
pred_milestone_net = pd.read_csv('result/other methods/%s/%s_milestone_network.csv'%(data_name,method),
index_col=0).sort_index().astype({'from':str,'to':str})
        if method=='paga':
            # PAGA milestone ids may contain a '-'; keep only the part before it
            _df['from'] = _df['from'].apply(lambda x: int(x.split('-')[0]) if isinstance(x, str) else x)
            _df['to'] = _df['to'].apply(lambda x: int(x.split('-')[0]) if isinstance(x, str) else x)
            pred_milestone_net['from'] = pred_milestone_net['from'].apply(
                lambda x: int(x.split('-')[0]) if isinstance(x, str) else x)
            pred_milestone_net['to'] = pred_milestone_net['to'].apply(
                lambda x: int(x.split('-')[0]) if isinstance(x, str) else x)
le = LabelEncoder()
le.fit(np.unique(pred_milestone_net[['from','to']].values.flatten()))
pred_milestone_net['from'] = le.transform(pred_milestone_net['from'])
pred_milestone_net['to'] = le.transform(pred_milestone_net['to'])
_df['from'] = le.transform(_df['from'])
_df['to'] = le.transform(_df['to'])
G_pred = nx.Graph()
G_pred.add_nodes_from(np.unique(_df[['from', 'to']].values.flatten()))
G_pred.add_edges_from(pred_milestone_net[['from', 'to']].values)
G_pred.remove_edges_from(nx.selfloop_edges(G_pred))
nx.set_node_attributes(G_pred, False, 'is_init')
G_pred.nodes[pred_milestone_net.iloc[0]['from']]['is_init'] = True
pseudotime = comp_pseudotime(nx.from_numpy_array(nx.to_numpy_array(G_pred)),
pred_milestone_net.iloc[0]['from'],
_df.copy())
# 1. topology
res = topology(G_true, G_pred)
# 2. Milestones assignment
if grouping is None:
milestones_true = milestone_net['from'].values.copy()
milestones_true[(milestone_net['from']!=milestone_net['to'])
&(milestone_net['w']<0.5)] = milestone_net[(milestone_net['from']!=milestone_net['to'])
&(milestone_net['w']<0.5)]['to'].values
else:
milestones_true = grouping
milestones_true = milestones_true[pseudotime!=-1]
milestones_pred = _df['from'].values.copy()
milestones_pred[_df['percentage']>0.5] = _df[_df['percentage']>0.5]['to'].values.copy()
milestones_pred = milestones_pred[pseudotime!=-1]
res['ARI'] = (adjusted_rand_score(milestones_true, milestones_pred) + 1)/2
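        # GRI: build soft membership matrices that split each cell's weight between the
        # 'from' and 'to' milestones of its edge according to 'percentage'.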
n_samples = len(pseudotime)
w = np.zeros((n_samples,n_samples))
w[np.arange(n_samples), _df['to']] = _df['percentage']
w[np.arange(n_samples), _df['from']] = 1 - _df['percentage']
if grouping is None:
n_samples = len(milestone_net)
prop = np.zeros((n_samples,n_samples))
prop[np.arange(n_samples), milestone_net['to']] = 1-milestone_net['w']
prop[np.arange(n_samples), milestone_net['from']] = np.where(np.isnan(milestone_net['w']), 1, milestone_net['w'])
res['GRI'] = get_GRI(prop, w)
else:
res['GRI'] = get_GRI(grouping, w)
# 3. Correlation between geodesic distances / Pseudotime
if 'cycle' in data_name or 'immune' in data_name:
res['PDT score'] = np.nan
else:
if grouping is None:
pseudotime_true = milestone_net['from'].values + 1 - milestone_net['w'].values
pseudotime_true[np.isnan(pseudotime_true)] = milestone_net[pd.isna(milestone_net['w'])]['from'].values
else:
pseudotime_true = - np.ones(len(grouping))
nx.set_edge_attributes(G_true, values = 1, name = 'weight')
connected_comps = nx.node_connected_component(G_true, begin_node_true)
subG = G_true.subgraph(connected_comps)
milestone_net_true = build_milestone_net(subG, begin_node_true)
if len(milestone_net_true)>0:
pseudotime_true[grouping==int(milestone_net_true[0,0])] = 0
for i in range(len(milestone_net_true)):
pseudotime_true[grouping==int(milestone_net_true[i,1])] = milestone_net_true[i,-1]
pseudotime_true = pseudotime_true[pseudotime>-1]
pseudotime_pred = pseudotime[pseudotime>-1]
if np.std(pseudotime_true)==0 or np.std(pseudotime_pred)==0:
res['PDT score'] = 0
else:
res['PDT score'] = (np.corrcoef(pseudotime_true,pseudotime_pred)[0,1]+1)/2
res['method'] = method
res['data'] = data_name
res['source'] = source_dict[data_name]
        df = pd.concat([df, pd.DataFrame(res, index=[0])], ignore_index=True)
df.to_csv('result/result_other_methods.csv')