--- a +++ b/paper/Benchmarking with real and synthetic datasets/evaluate_other_methods.py @@ -0,0 +1,365 @@ +import networkx as nx +import networkx.algorithms.isomorphism as iso +import numpy as np +import pandas as pd +from scipy.sparse.csgraph import laplacian +from scipy.linalg import eigh +from scipy.integrate import quad +from sklearn.metrics import pairwise_distances +import warnings + +import numba +from numba import jit, float32 + +def IM_dist(G1, G2): + adj1 = nx.to_numpy_array(G1) + adj2 = nx.to_numpy_array(G2) + hwhm = 0.08 + + N = len(adj1) + # get laplacian matrix + L1 = laplacian(adj1, normed=False) + L2 = laplacian(adj2, normed=False) + + # get the modes for the positive-semidefinite laplacian + w1 = np.sqrt(np.abs(eigh(L1)[0][1:])) + w2 = np.sqrt(np.abs(eigh(L2)[0][1:])) + + # we calculate the norm for both spectrum + norm1 = (N - 1) * np.pi / 2 - np.sum(np.arctan(-w1 / hwhm)) + norm2 = (N - 1) * np.pi / 2 - np.sum(np.arctan(-w2 / hwhm)) + + # define both spectral densities + density1 = lambda w: np.sum(hwhm / ((w - w1) ** 2 + hwhm ** 2)) / norm1 + density2 = lambda w: np.sum(hwhm / ((w - w2) ** 2 + hwhm ** 2)) / norm2 + + func = lambda w: (density1(w) - density2(w)) ** 2 + return np.sqrt(quad(func, 0, np.inf, limit=100)[0]) + +def build_milestone_net(subgraph, init_node): + ''' + Args: + subgraph - a connected component of the graph, csr_matrix + init_node - root node + Returns: + df_subgraph - dataframe of milestone network + ''' + + if len(subgraph)==1: + warnings.warn('Singular node.') + return [] + else: + # Dijkstra's Algorithm + unvisited = {node: {'parent':None, + 'score':np.inf, + 'distance':np.inf} for node in subgraph.nodes} + current = init_node + currentScore = 0 + currentDistance = 0 + unvisited[current]['score'] = currentScore + + milestone_net = [] + while True: + for neighbour in subgraph.neighbors(current): + if neighbour not in unvisited: continue + newScore = currentScore + subgraph[current][neighbour]['weight'] + if unvisited[neighbour]['score'] > newScore: + unvisited[neighbour]['score'] = newScore + unvisited[neighbour]['parent'] = current + unvisited[neighbour]['distance'] = currentDistance+1 + + if len(unvisited)<len(subgraph): + milestone_net.append([unvisited[current]['parent'], + current, + unvisited[current]['distance']]) + del unvisited[current] + if not unvisited: break + current, currentScore, currentDistance = \ + sorted([(i[0],i[1]['score'],i[1]['distance']) for i in unvisited.items()], + key = lambda x: x[1])[0] + return np.array(milestone_net) + +def comp_pseudotime(G, node, df): + connected_comps = nx.node_connected_component(G, node) + subG = G.subgraph(connected_comps) + milestone_net = build_milestone_net(subG,node) + + # compute pseudotime + pseudotime = - np.ones(len(df)) + for i in range(len(milestone_net)): + _from, _to = milestone_net[i,:2] + _from, _to = int(_from), int(_to) + + idc = (df['from']==_from)&(df['to']==_to) + if np.sum(idc)>0: + pseudotime[idc] = df['percentage'].values[idc] + milestone_net[i,-1] - 1 + + idc = (df['from']==_to)&(df['to']==_from) + if np.sum(idc)>0: + pseudotime[idc] = 1-df['percentage'].values[idc] + milestone_net[i,-1] - 1 + + if np.any(df['from']==_from): + idc = (df['from']==_from)&(df['to']==_from) + pseudotime[idc] = milestone_net[i,-1] - 1 + if len(milestone_net)>0 and np.any((df['from']==_to)&(df['to']==_to)): + idc = (df['from']==_to)&(df['to']==_to) + pseudotime[idc] = milestone_net[i,-1] + return pseudotime + +def topology(G_true, G_pred, is_GED=True): + res = {} + + # 1. Isomorphism with same initial node + def comparison(N1, N2): + if N1['is_init'] != N2['is_init']: + return False + else: + return True + score_isomorphism = int(nx.is_isomorphic(G_true, G_pred, node_match=comparison)) + res['ISO score'] = score_isomorphism + + # 2. GED (graph edit distance) + if len(G_true)>10 or len(G_pred)>10: + warnings.warn("Didn't calculate graph edit distances for large graphs.") + res['GED score'] = np.nan + else: + max_num_oper = len(G_true) + score_GED = 1 - np.min([nx.graph_edit_distance(G_pred, G_true, node_match=comparison), + max_num_oper]) / max_num_oper + res['GED score'] = score_GED + + # 3. Hamming-Ipsen-Mikhailov distance + if len(G_true)==len(G_pred): + score_IM = 1-IM_dist(G_true, G_pred) + score_IM = np.maximum(0, score_IM) + else: + score_IM = 0 + res['score_IM'] = score_IM + return res + +@jit((float32[:,:], float32[:,:]), nopython=True, nogil=True) +def _rand_index(true, pred): + n = true.shape[0] + m_true = true.shape[1] + m_pred = pred.shape[1] + RI = 0.0 + for i in range(1, n-1): + for j in range(i, n): + RI_ij = 0.0 + for k in range(m_true): + RI_ij += true[i,k]*true[j,k] + for k in range(m_pred): + RI_ij -= pred[i,k]*pred[j,k] + RI += 1-np.abs(RI_ij) + return RI / (n*(n-1)/2.0) + + +def get_GRI(true, pred): + ''' + Params: + ture - [n_samples, n_cluster_1] for proportions or [n_samples, ] for grouping + pred - [n_samples, n_cluster_2] for estimated proportions + ''' + if len(true)!=len(pred): + raise ValueError('Inputs should have same lengths!') + + if len(true.shape)==1: + true = pd.get_dummies(true).values + if len(pred.shape)==1: + pred = pd.get_dummies(pred).values + + true = np.sqrt(true).astype(np.float32) + pred = np.sqrt(pred).astype(np.float32) + + return _rand_index(true, pred) + +import numpy as np +import pandas as pd +from sklearn.metrics.cluster import adjusted_rand_score +from sklearn.preprocessing import LabelEncoder +import sys#; sys.path.insert(0, '../..') +sys.path.append('../..') +from VITAE import load_data + +type_dict = { + # dyno + 'dentate':'UMI', + 'immune':'UMI', + 'planaria_full':'UMI', + 'planaria_muscle':'UMI', + 'aging':'non-UMI', + 'cell_cycle':'non-UMI', + 'fibroblast':'non-UMI', + 'germline':'non-UMI', + 'human_embryos':'non-UMI', + 'mesoderm':'non-UMI', + + # dyngen + "cycle_1":'non-UMI', + "cycle_2":'non-UMI', + "cycle_3":'non-UMI', + "linear_1":'non-UMI', + "linear_2":'non-UMI', + "linear_3":'non-UMI', + "trifurcating_1":'non-UMI', + "trifurcating_2":'non-UMI', + "bifurcating_1":'non-UMI', + 'bifurcating_2':'non-UMI', + "bifurcating_3":'non-UMI', + "converging_1":'non-UMI', + + # our model + 'linear':'UMI', + 'bifurcation':'UMI', + 'multifurcating':'UMI', + 'tree':'UMI', +} +source_dict = { + 'dentate':'real', + 'immune':'real', + 'planaria_muscle':'real', + 'planaria_full':'real', + 'aging':'real', + 'cell_cycle':'real', + 'fibroblast':'real', + 'germline':'real', + 'human_embryos':'real', + 'mesoderm':'real', + + "cycle_1":'dyngen', + "cycle_2":'dyngen', + "cycle_3":'dyngen', + "linear_1":'dyngen', + "linear_2":'dyngen', + "linear_3":'dyngen', + "trifurcating_1":'dyngen', + "trifurcating_2":'dyngen', + "bifurcating_1":'dyngen', + 'bifurcating_2':'dyngen', + "bifurcating_3":'dyngen', + "converging_1":'dyngen', + + 'linear':'our model', + 'bifurcation':'our model', + 'multifurcating':'our model', + 'tree':'our model', +} +df = pd.DataFrame() +for data_name in type_dict.keys(): + data = load_data('data/', data_name) + milestone_net = data['milestone_network'] + le = LabelEncoder() + grouping = le.fit_transform(data['grouping']) + begin_node_true = le.transform([data['root_milestone_id']])[0] + if 'w' in milestone_net.columns: + grouping = None + if milestone_net is not None: + milestone_net['from'] = le.transform(milestone_net['from']) + milestone_net['to'] = le.transform(milestone_net['to']) + G_true = nx.Graph() + G_true.add_nodes_from(np.unique(milestone_net[['from','to']].values.flatten())) + if grouping is None: + G_true.add_edges_from(list( + milestone_net[~pd.isna(milestone_net['w'])].groupby(['from', 'to']).count().index)) + # otherwise, 'milestone_net' indicates edges + else: + if milestone_net is not None: + G_true.add_edges_from(list( + milestone_net.groupby(['from', 'to']).count().index)) + G_true.remove_edges_from(nx.selfloop_edges(G_true)) + nx.set_node_attributes(G_true, False, 'is_init') + G_true.nodes[begin_node_true]['is_init'] = True + + + for method in ['monocle','paga','slingshot']: + + _df = pd.read_csv('result/other methods/%s/%s_progressions.csv'%(data_name,method), + index_col=0).astype({'from':str,'to':str}) + pred_milestone_net = pd.read_csv('result/other methods/%s/%s_milestone_network.csv'%(data_name,method), + index_col=0).sort_index().astype({'from':str,'to':str}) + if method=='paga': + _df['from'] = _df['from'].apply(lambda x:int(x.split('-')[0]) if type(x)==str else x) + _df['to'] = _df['to'].apply(lambda x:int(x.split('-')[0]) if type(x)==str else x) + pred_milestone_net['from'] = pred_milestone_net['from'].apply( + lambda x:int(x.split('-')[0]) if type(x)==str else x) + pred_milestone_net['to'] = pred_milestone_net['to'].apply( + lambda x:int(x.split('-')[0]) if type(x)==str else x) + + le = LabelEncoder() + le.fit(np.unique(pred_milestone_net[['from','to']].values.flatten())) + pred_milestone_net['from'] = le.transform(pred_milestone_net['from']) + pred_milestone_net['to'] = le.transform(pred_milestone_net['to']) + _df['from'] = le.transform(_df['from']) + _df['to'] = le.transform(_df['to']) + + G_pred = nx.Graph() + G_pred.add_nodes_from(np.unique(_df[['from', 'to']].values.flatten())) + G_pred.add_edges_from(pred_milestone_net[['from', 'to']].values) + G_pred.remove_edges_from(nx.selfloop_edges(G_pred)) + nx.set_node_attributes(G_pred, False, 'is_init') + G_pred.nodes[pred_milestone_net.iloc[0]['from']]['is_init'] = True + + pseudotime = comp_pseudotime(nx.from_numpy_array(nx.to_numpy_array(G_pred)), + pred_milestone_net.iloc[0]['from'], + _df.copy()) + + # 1. topology + res = topology(G_true, G_pred) + + # 2. Milestones assignment + if grouping is None: + milestones_true = milestone_net['from'].values.copy() + milestones_true[(milestone_net['from']!=milestone_net['to']) + &(milestone_net['w']<0.5)] = milestone_net[(milestone_net['from']!=milestone_net['to']) + &(milestone_net['w']<0.5)]['to'].values + else: + milestones_true = grouping + milestones_true = milestones_true[pseudotime!=-1] + milestones_pred = _df['from'].values.copy() + milestones_pred[_df['percentage']>0.5] = _df[_df['percentage']>0.5]['to'].values.copy() + milestones_pred = milestones_pred[pseudotime!=-1] + res['ARI'] = (adjusted_rand_score(milestones_true, milestones_pred) + 1)/2 + + n_samples = len(pseudotime) + w = np.zeros((n_samples,n_samples)) + w[np.arange(n_samples), _df['to']] = _df['percentage'] + w[np.arange(n_samples), _df['from']] = 1 - _df['percentage'] + if grouping is None: + n_samples = len(milestone_net) + prop = np.zeros((n_samples,n_samples)) + prop[np.arange(n_samples), milestone_net['to']] = 1-milestone_net['w'] + prop[np.arange(n_samples), milestone_net['from']] = np.where(np.isnan(milestone_net['w']), 1, milestone_net['w']) + res['GRI'] = get_GRI(prop, w) + else: + res['GRI'] = get_GRI(grouping, w) + + # 3. Correlation between geodesic distances / Pseudotime + if 'cycle' in data_name or 'immune' in data_name: + res['PDT score'] = np.nan + else: + if grouping is None: + pseudotime_true = milestone_net['from'].values + 1 - milestone_net['w'].values + pseudotime_true[np.isnan(pseudotime_true)] = milestone_net[pd.isna(milestone_net['w'])]['from'].values + else: + pseudotime_true = - np.ones(len(grouping)) + nx.set_edge_attributes(G_true, values = 1, name = 'weight') + connected_comps = nx.node_connected_component(G_true, begin_node_true) + subG = G_true.subgraph(connected_comps) + milestone_net_true = build_milestone_net(subG, begin_node_true) + if len(milestone_net_true)>0: + pseudotime_true[grouping==int(milestone_net_true[0,0])] = 0 + for i in range(len(milestone_net_true)): + pseudotime_true[grouping==int(milestone_net_true[i,1])] = milestone_net_true[i,-1] + pseudotime_true = pseudotime_true[pseudotime>-1] + pseudotime_pred = pseudotime[pseudotime>-1] + if np.std(pseudotime_true)==0 or np.std(pseudotime_pred)==0: + res['PDT score'] = 0 + else: + res['PDT score'] = (np.corrcoef(pseudotime_true,pseudotime_pred)[0,1]+1)/2 + + res['method'] = method + res['data'] = data_name + res['source'] = source_dict[data_name] + df = df.append(pd.DataFrame(res, index=[0]),ignore_index=True) + +df.to_csv('result/result_other_methods.csv')