a b/paper/Benchmarking with real and synthetic datasets/evaluate_other_methods.py
1
import networkx as nx
2
import networkx.algorithms.isomorphism as iso
3
import numpy as np
4
import pandas as pd
5
from scipy.sparse.csgraph import laplacian
6
from scipy.linalg import eigh
7
from scipy.integrate import quad
8
from sklearn.metrics import pairwise_distances
9
import warnings
10
11
import numba
12
from numba import jit, float32
13
14
def IM_dist(G1, G2):
15
    adj1 = nx.to_numpy_array(G1)
16
    adj2 = nx.to_numpy_array(G2)
17
    hwhm = 0.08
18
    
19
    N = len(adj1)
20
    # get laplacian matrix
21
    L1 = laplacian(adj1, normed=False)
22
    L2 = laplacian(adj2, normed=False)
23
24
    # get the modes for the positive-semidefinite laplacian
25
    w1 = np.sqrt(np.abs(eigh(L1)[0][1:]))
26
    w2 = np.sqrt(np.abs(eigh(L2)[0][1:]))
27
28
    # we calculate the norm for both spectrum
29
    norm1 = (N - 1) * np.pi / 2 - np.sum(np.arctan(-w1 / hwhm))
30
    norm2 = (N - 1) * np.pi / 2 - np.sum(np.arctan(-w2 / hwhm))
31
32
    # define both spectral densities
33
    density1 = lambda w: np.sum(hwhm / ((w - w1) ** 2 + hwhm ** 2)) / norm1
34
    density2 = lambda w: np.sum(hwhm / ((w - w2) ** 2 + hwhm ** 2)) / norm2
35
36
    func = lambda w: (density1(w) - density2(w)) ** 2
37
    return np.sqrt(quad(func, 0, np.inf, limit=100)[0])
38
39
def build_milestone_net(subgraph, init_node):
40
    '''
41
    Args:
42
        subgraph     - a connected component of the graph, csr_matrix
43
        init_node    - root node
44
    Returns:
45
        df_subgraph  - dataframe of milestone network
46
    '''
47
48
    if len(subgraph)==1:
49
        warnings.warn('Singular node.')
50
        return []
51
    else:
52
        # Dijkstra's Algorithm
53
        unvisited = {node: {'parent':None,
54
                            'score':np.inf,
55
                            'distance':np.inf} for node in subgraph.nodes}
56
        current = init_node
57
        currentScore = 0
58
        currentDistance = 0
59
        unvisited[current]['score'] = currentScore
60
61
        milestone_net = []
62
        while True:
63
            for neighbour in subgraph.neighbors(current):
64
                if neighbour not in unvisited: continue
65
                newScore = currentScore + subgraph[current][neighbour]['weight']
66
                if unvisited[neighbour]['score'] > newScore:
67
                    unvisited[neighbour]['score'] = newScore
68
                    unvisited[neighbour]['parent'] = current
69
                    unvisited[neighbour]['distance'] = currentDistance+1
70
71
            if len(unvisited)<len(subgraph):
72
                milestone_net.append([unvisited[current]['parent'],
73
                                      current,
74
                                      unvisited[current]['distance']])
75
            del unvisited[current]
76
            if not unvisited: break
77
            current, currentScore, currentDistance = \
78
                sorted([(i[0],i[1]['score'],i[1]['distance']) for i in unvisited.items()],
79
                        key = lambda x: x[1])[0]
80
        return np.array(milestone_net)
81
82
def comp_pseudotime(G, node, df):
83
    connected_comps = nx.node_connected_component(G, node)
84
    subG = G.subgraph(connected_comps)
85
    milestone_net = build_milestone_net(subG,node)
86
87
    # compute pseudotime
88
    pseudotime = - np.ones(len(df))
89
    for i in range(len(milestone_net)):
90
        _from, _to = milestone_net[i,:2]
91
        _from, _to = int(_from), int(_to)
92
93
        idc = (df['from']==_from)&(df['to']==_to)
94
        if np.sum(idc)>0:
95
            pseudotime[idc] = df['percentage'].values[idc] + milestone_net[i,-1] - 1
96
            
97
        idc = (df['from']==_to)&(df['to']==_from)
98
        if np.sum(idc)>0:
99
            pseudotime[idc] = 1-df['percentage'].values[idc] + milestone_net[i,-1] - 1        
100
101
        if np.any(df['from']==_from):
102
            idc = (df['from']==_from)&(df['to']==_from)
103
            pseudotime[idc] =  milestone_net[i,-1] - 1
104
    if len(milestone_net)>0 and np.any((df['from']==_to)&(df['to']==_to)):
105
        idc = (df['from']==_to)&(df['to']==_to)
106
        pseudotime[idc] =  milestone_net[i,-1]
107
    return pseudotime
108
109
def topology(G_true, G_pred, is_GED=True):   
110
    res = {}
111
    
112
    # 1. Isomorphism with same initial node
113
    def comparison(N1, N2):    
114
        if N1['is_init'] != N2['is_init']:
115
            return False
116
        else:
117
            return True
118
    score_isomorphism = int(nx.is_isomorphic(G_true, G_pred, node_match=comparison))
119
    res['ISO score'] = score_isomorphism
120
    
121
    # 2. GED (graph edit distance)
122
    if len(G_true)>10 or len(G_pred)>10:
123
        warnings.warn("Didn't calculate graph edit distances for large graphs.")
124
        res['GED score'] = np.nan  
125
    else:
126
        max_num_oper = len(G_true)
127
        score_GED = 1 - np.min([nx.graph_edit_distance(G_pred, G_true, node_match=comparison),
128
                            max_num_oper]) / max_num_oper
129
        res['GED score'] = score_GED
130
        
131
    # 3. Hamming-Ipsen-Mikhailov distance
132
    if len(G_true)==len(G_pred):
133
        score_IM = 1-IM_dist(G_true, G_pred)
134
        score_IM = np.maximum(0, score_IM)
135
    else:
136
        score_IM = 0
137
    res['score_IM'] = score_IM
138
    return res
139
140
@jit((float32[:,:], float32[:,:]), nopython=True, nogil=True)
141
def _rand_index(true, pred):
142
    n = true.shape[0]
143
    m_true = true.shape[1]
144
    m_pred = pred.shape[1]
145
    RI = 0.0
146
    for i in range(1, n-1):
147
        for j in range(i, n):
148
            RI_ij = 0.0
149
            for k in range(m_true):
150
                RI_ij += true[i,k]*true[j,k]
151
            for k in range(m_pred):
152
                RI_ij -= pred[i,k]*pred[j,k]     
153
            RI += 1-np.abs(RI_ij)
154
    return RI / (n*(n-1)/2.0)
155
156
157
def get_GRI(true, pred):
158
    '''
159
    Params:
160
        ture - [n_samples, n_cluster_1] for proportions or [n_samples, ] for grouping
161
        pred - [n_samples, n_cluster_2] for estimated proportions
162
    '''    
163
    if len(true)!=len(pred):
164
        raise ValueError('Inputs should have same lengths!')
165
        
166
    if len(true.shape)==1:
167
        true = pd.get_dummies(true).values
168
    if len(pred.shape)==1:
169
        pred = pd.get_dummies(pred).values
170
        
171
    true = np.sqrt(true).astype(np.float32)
172
    pred = np.sqrt(pred).astype(np.float32)
173
174
    return _rand_index(true, pred)
175
176
import numpy as np
177
import pandas as pd
178
from sklearn.metrics.cluster import adjusted_rand_score
179
from sklearn.preprocessing import LabelEncoder
180
import sys#; sys.path.insert(0, '../..')
181
sys.path.append('../..')
182
from VITAE import load_data
183
184
type_dict = {
185
    # dyno
186
    'dentate':'UMI', 
187
    'immune':'UMI', 
188
    'planaria_full':'UMI', 
189
    'planaria_muscle':'UMI',
190
    'aging':'non-UMI', 
191
    'cell_cycle':'non-UMI',
192
    'fibroblast':'non-UMI', 
193
    'germline':'non-UMI',    
194
    'human_embryos':'non-UMI', 
195
    'mesoderm':'non-UMI',
196
    
197
    # dyngen    
198
    "cycle_1":'non-UMI', 
199
    "cycle_2":'non-UMI', 
200
    "cycle_3":'non-UMI',
201
    "linear_1":'non-UMI', 
202
    "linear_2":'non-UMI', 
203
    "linear_3":'non-UMI', 
204
    "trifurcating_1":'non-UMI', 
205
    "trifurcating_2":'non-UMI', 
206
    "bifurcating_1":'non-UMI', 
207
    'bifurcating_2':'non-UMI',
208
    "bifurcating_3":'non-UMI', 
209
    "converging_1":'non-UMI',
210
    
211
    # our model
212
    'linear':'UMI',
213
    'bifurcation':'UMI',
214
    'multifurcating':'UMI',
215
    'tree':'UMI',
216
}
217
source_dict = {
218
    'dentate':'real', 
219
    'immune':'real', 
220
    'planaria_muscle':'real',
221
    'planaria_full':'real',
222
    'aging':'real', 
223
    'cell_cycle':'real',
224
    'fibroblast':'real', 
225
    'germline':'real',    
226
    'human_embryos':'real', 
227
    'mesoderm':'real',
228
        
229
    "cycle_1":'dyngen', 
230
    "cycle_2":'dyngen', 
231
    "cycle_3":'dyngen',
232
    "linear_1":'dyngen', 
233
    "linear_2":'dyngen', 
234
    "linear_3":'dyngen', 
235
    "trifurcating_1":'dyngen', 
236
    "trifurcating_2":'dyngen', 
237
    "bifurcating_1":'dyngen', 
238
    'bifurcating_2':'dyngen',
239
    "bifurcating_3":'dyngen', 
240
    "converging_1":'dyngen',
241
    
242
    'linear':'our model',
243
    'bifurcation':'our model',
244
    'multifurcating':'our model',
245
    'tree':'our model',
246
}
247
df = pd.DataFrame()
248
for data_name in type_dict.keys():
249
    data = load_data('data/', data_name)
250
    milestone_net = data['milestone_network']    
251
    le = LabelEncoder()
252
    grouping = le.fit_transform(data['grouping'])
253
    begin_node_true = le.transform([data['root_milestone_id']])[0]
254
    if 'w' in milestone_net.columns:
255
        grouping = None
256
    if milestone_net is not None:
257
        milestone_net['from'] = le.transform(milestone_net['from'])
258
        milestone_net['to'] = le.transform(milestone_net['to'])  
259
    G_true = nx.Graph()
260
    G_true.add_nodes_from(np.unique(milestone_net[['from','to']].values.flatten()))
261
    if grouping is None:
262
        G_true.add_edges_from(list(
263
            milestone_net[~pd.isna(milestone_net['w'])].groupby(['from', 'to']).count().index))
264
    # otherwise, 'milestone_net' indicates edges
265
    else:
266
        if milestone_net is not None:             
267
            G_true.add_edges_from(list(
268
                milestone_net.groupby(['from', 'to']).count().index))
269
    G_true.remove_edges_from(nx.selfloop_edges(G_true))
270
    nx.set_node_attributes(G_true, False, 'is_init')
271
    G_true.nodes[begin_node_true]['is_init'] = True
272
273
        
274
    for method in ['monocle','paga','slingshot']:
275
                
276
        _df = pd.read_csv('result/other methods/%s/%s_progressions.csv'%(data_name,method), 
277
                          index_col=0).astype({'from':str,'to':str})   
278
        pred_milestone_net = pd.read_csv('result/other methods/%s/%s_milestone_network.csv'%(data_name,method), 
279
                                         index_col=0).sort_index().astype({'from':str,'to':str})
280
        if method=='paga':
281
            _df['from'] = _df['from'].apply(lambda x:int(x.split('-')[0]) if type(x)==str else x)    
282
            _df['to'] = _df['to'].apply(lambda x:int(x.split('-')[0]) if type(x)==str else x)  
283
            pred_milestone_net['from'] = pred_milestone_net['from'].apply(
284
                lambda x:int(x.split('-')[0]) if type(x)==str else x)  
285
            pred_milestone_net['to'] = pred_milestone_net['to'].apply(
286
                lambda x:int(x.split('-')[0]) if type(x)==str else x)              
287
288
        le = LabelEncoder()
289
        le.fit(np.unique(pred_milestone_net[['from','to']].values.flatten()))
290
        pred_milestone_net['from'] = le.transform(pred_milestone_net['from'])
291
        pred_milestone_net['to'] = le.transform(pred_milestone_net['to'])
292
        _df['from'] = le.transform(_df['from'])
293
        _df['to'] = le.transform(_df['to'])
294
        
295
        G_pred = nx.Graph()
296
        G_pred.add_nodes_from(np.unique(_df[['from', 'to']].values.flatten()))
297
        G_pred.add_edges_from(pred_milestone_net[['from', 'to']].values)
298
        G_pred.remove_edges_from(nx.selfloop_edges(G_pred))  
299
        nx.set_node_attributes(G_pred, False, 'is_init')
300
        G_pred.nodes[pred_milestone_net.iloc[0]['from']]['is_init'] = True
301
        
302
        pseudotime = comp_pseudotime(nx.from_numpy_array(nx.to_numpy_array(G_pred)), 
303
                                     pred_milestone_net.iloc[0]['from'], 
304
                                     _df.copy())
305
        
306
        # 1. topology
307
        res = topology(G_true, G_pred)
308
309
        # 2. Milestones assignment
310
        if grouping is None:
311
            milestones_true = milestone_net['from'].values.copy()
312
            milestones_true[(milestone_net['from']!=milestone_net['to'])
313
                           &(milestone_net['w']<0.5)] = milestone_net[(milestone_net['from']!=milestone_net['to'])
314
                                                                      &(milestone_net['w']<0.5)]['to'].values
315
        else:
316
            milestones_true = grouping
317
        milestones_true = milestones_true[pseudotime!=-1]
318
        milestones_pred = _df['from'].values.copy()
319
        milestones_pred[_df['percentage']>0.5] = _df[_df['percentage']>0.5]['to'].values.copy()
320
        milestones_pred = milestones_pred[pseudotime!=-1]
321
        res['ARI'] = (adjusted_rand_score(milestones_true, milestones_pred) + 1)/2
322
323
        n_samples = len(pseudotime)
324
        w = np.zeros((n_samples,n_samples))
325
        w[np.arange(n_samples), _df['to']] = _df['percentage']
326
        w[np.arange(n_samples), _df['from']] = 1 - _df['percentage']
327
        if grouping is None:
328
            n_samples = len(milestone_net)
329
            prop = np.zeros((n_samples,n_samples))
330
            prop[np.arange(n_samples), milestone_net['to']] = 1-milestone_net['w']
331
            prop[np.arange(n_samples), milestone_net['from']] = np.where(np.isnan(milestone_net['w']), 1, milestone_net['w'])
332
            res['GRI'] = get_GRI(prop, w)
333
        else:
334
            res['GRI'] = get_GRI(grouping, w)
335
        
336
        # 3. Correlation between geodesic distances / Pseudotime   
337
        if 'cycle' in data_name or 'immune' in data_name:
338
            res['PDT score'] = np.nan
339
        else:
340
            if grouping is None:
341
                pseudotime_true = milestone_net['from'].values + 1 - milestone_net['w'].values
342
                pseudotime_true[np.isnan(pseudotime_true)] = milestone_net[pd.isna(milestone_net['w'])]['from'].values            
343
            else:
344
                pseudotime_true = - np.ones(len(grouping))
345
                nx.set_edge_attributes(G_true, values = 1, name = 'weight')
346
                connected_comps = nx.node_connected_component(G_true, begin_node_true)
347
                subG = G_true.subgraph(connected_comps)
348
                milestone_net_true = build_milestone_net(subG, begin_node_true)
349
                if len(milestone_net_true)>0:
350
                    pseudotime_true[grouping==int(milestone_net_true[0,0])] = 0
351
                    for i in range(len(milestone_net_true)):
352
                        pseudotime_true[grouping==int(milestone_net_true[i,1])] = milestone_net_true[i,-1]
353
            pseudotime_true = pseudotime_true[pseudotime>-1]
354
            pseudotime_pred = pseudotime[pseudotime>-1]
355
            if np.std(pseudotime_true)==0 or np.std(pseudotime_pred)==0:
356
                res['PDT score'] = 0
357
            else:
358
                res['PDT score'] = (np.corrcoef(pseudotime_true,pseudotime_pred)[0,1]+1)/2
359
    
360
        res['method'] = method
361
        res['data'] = data_name            
362
        res['source'] = source_dict[data_name]       
363
        df = df.append(pd.DataFrame(res, index=[0]),ignore_index=True)        
364
365
df.to_csv('result/result_other_methods.csv')