|
a |
|
b/paper/Benchmarking with real and synthetic datasets/evaluate_other_methods.py |
|
|
1 |
import networkx as nx |
|
|
2 |
import networkx.algorithms.isomorphism as iso |
|
|
3 |
import numpy as np |
|
|
4 |
import pandas as pd |
|
|
5 |
from scipy.sparse.csgraph import laplacian |
|
|
6 |
from scipy.linalg import eigh |
|
|
7 |
from scipy.integrate import quad |
|
|
8 |
from sklearn.metrics import pairwise_distances |
|
|
9 |
import warnings |
|
|
10 |
|
|
|
11 |
import numba |
|
|
12 |
from numba import jit, float32 |
|
|
13 |
|
|
|
14 |
def IM_dist(G1, G2): |
|
|
15 |
adj1 = nx.to_numpy_array(G1) |
|
|
16 |
adj2 = nx.to_numpy_array(G2) |
|
|
17 |
hwhm = 0.08 |
|
|
18 |
|
|
|
19 |
N = len(adj1) |
|
|
20 |
# get laplacian matrix |
|
|
21 |
L1 = laplacian(adj1, normed=False) |
|
|
22 |
L2 = laplacian(adj2, normed=False) |
|
|
23 |
|
|
|
24 |
# get the modes for the positive-semidefinite laplacian |
|
|
25 |
w1 = np.sqrt(np.abs(eigh(L1)[0][1:])) |
|
|
26 |
w2 = np.sqrt(np.abs(eigh(L2)[0][1:])) |
|
|
27 |
|
|
|
28 |
# we calculate the norm for both spectrum |
|
|
29 |
norm1 = (N - 1) * np.pi / 2 - np.sum(np.arctan(-w1 / hwhm)) |
|
|
30 |
norm2 = (N - 1) * np.pi / 2 - np.sum(np.arctan(-w2 / hwhm)) |
|
|
31 |
|
|
|
32 |
# define both spectral densities |
|
|
33 |
density1 = lambda w: np.sum(hwhm / ((w - w1) ** 2 + hwhm ** 2)) / norm1 |
|
|
34 |
density2 = lambda w: np.sum(hwhm / ((w - w2) ** 2 + hwhm ** 2)) / norm2 |
|
|
35 |
|
|
|
36 |
func = lambda w: (density1(w) - density2(w)) ** 2 |
|
|
37 |
return np.sqrt(quad(func, 0, np.inf, limit=100)[0]) |
|
|
38 |
|
|
|
39 |
def build_milestone_net(subgraph, init_node): |
|
|
40 |
''' |
|
|
41 |
Args: |
|
|
42 |
subgraph - a connected component of the graph, csr_matrix |
|
|
43 |
init_node - root node |
|
|
44 |
Returns: |
|
|
45 |
df_subgraph - dataframe of milestone network |
|
|
46 |
''' |
|
|
47 |
|
|
|
48 |
if len(subgraph)==1: |
|
|
49 |
warnings.warn('Singular node.') |
|
|
50 |
return [] |
|
|
51 |
else: |
|
|
52 |
# Dijkstra's Algorithm |
|
|
53 |
unvisited = {node: {'parent':None, |
|
|
54 |
'score':np.inf, |
|
|
55 |
'distance':np.inf} for node in subgraph.nodes} |
|
|
56 |
current = init_node |
|
|
57 |
currentScore = 0 |
|
|
58 |
currentDistance = 0 |
|
|
59 |
unvisited[current]['score'] = currentScore |
|
|
60 |
|
|
|
61 |
milestone_net = [] |
|
|
62 |
while True: |
|
|
63 |
for neighbour in subgraph.neighbors(current): |
|
|
64 |
if neighbour not in unvisited: continue |
|
|
65 |
newScore = currentScore + subgraph[current][neighbour]['weight'] |
|
|
66 |
if unvisited[neighbour]['score'] > newScore: |
|
|
67 |
unvisited[neighbour]['score'] = newScore |
|
|
68 |
unvisited[neighbour]['parent'] = current |
|
|
69 |
unvisited[neighbour]['distance'] = currentDistance+1 |
|
|
70 |
|
|
|
71 |
if len(unvisited)<len(subgraph): |
|
|
72 |
milestone_net.append([unvisited[current]['parent'], |
|
|
73 |
current, |
|
|
74 |
unvisited[current]['distance']]) |
|
|
75 |
del unvisited[current] |
|
|
76 |
if not unvisited: break |
|
|
77 |
current, currentScore, currentDistance = \ |
|
|
78 |
sorted([(i[0],i[1]['score'],i[1]['distance']) for i in unvisited.items()], |
|
|
79 |
key = lambda x: x[1])[0] |
|
|
80 |
return np.array(milestone_net) |
|
|
81 |
|
|
|
82 |
def comp_pseudotime(G, node, df): |
|
|
83 |
connected_comps = nx.node_connected_component(G, node) |
|
|
84 |
subG = G.subgraph(connected_comps) |
|
|
85 |
milestone_net = build_milestone_net(subG,node) |
|
|
86 |
|
|
|
87 |
# compute pseudotime |
|
|
88 |
pseudotime = - np.ones(len(df)) |
|
|
89 |
for i in range(len(milestone_net)): |
|
|
90 |
_from, _to = milestone_net[i,:2] |
|
|
91 |
_from, _to = int(_from), int(_to) |
|
|
92 |
|
|
|
93 |
idc = (df['from']==_from)&(df['to']==_to) |
|
|
94 |
if np.sum(idc)>0: |
|
|
95 |
pseudotime[idc] = df['percentage'].values[idc] + milestone_net[i,-1] - 1 |
|
|
96 |
|
|
|
97 |
idc = (df['from']==_to)&(df['to']==_from) |
|
|
98 |
if np.sum(idc)>0: |
|
|
99 |
pseudotime[idc] = 1-df['percentage'].values[idc] + milestone_net[i,-1] - 1 |
|
|
100 |
|
|
|
101 |
if np.any(df['from']==_from): |
|
|
102 |
idc = (df['from']==_from)&(df['to']==_from) |
|
|
103 |
pseudotime[idc] = milestone_net[i,-1] - 1 |
|
|
104 |
if len(milestone_net)>0 and np.any((df['from']==_to)&(df['to']==_to)): |
|
|
105 |
idc = (df['from']==_to)&(df['to']==_to) |
|
|
106 |
pseudotime[idc] = milestone_net[i,-1] |
|
|
107 |
return pseudotime |
|
|
108 |
|
|
|
109 |
def topology(G_true, G_pred, is_GED=True): |
|
|
110 |
res = {} |
|
|
111 |
|
|
|
112 |
# 1. Isomorphism with same initial node |
|
|
113 |
def comparison(N1, N2): |
|
|
114 |
if N1['is_init'] != N2['is_init']: |
|
|
115 |
return False |
|
|
116 |
else: |
|
|
117 |
return True |
|
|
118 |
score_isomorphism = int(nx.is_isomorphic(G_true, G_pred, node_match=comparison)) |
|
|
119 |
res['ISO score'] = score_isomorphism |
|
|
120 |
|
|
|
121 |
# 2. GED (graph edit distance) |
|
|
122 |
if len(G_true)>10 or len(G_pred)>10: |
|
|
123 |
warnings.warn("Didn't calculate graph edit distances for large graphs.") |
|
|
124 |
res['GED score'] = np.nan |
|
|
125 |
else: |
|
|
126 |
max_num_oper = len(G_true) |
|
|
127 |
score_GED = 1 - np.min([nx.graph_edit_distance(G_pred, G_true, node_match=comparison), |
|
|
128 |
max_num_oper]) / max_num_oper |
|
|
129 |
res['GED score'] = score_GED |
|
|
130 |
|
|
|
131 |
# 3. Hamming-Ipsen-Mikhailov distance |
|
|
132 |
if len(G_true)==len(G_pred): |
|
|
133 |
score_IM = 1-IM_dist(G_true, G_pred) |
|
|
134 |
score_IM = np.maximum(0, score_IM) |
|
|
135 |
else: |
|
|
136 |
score_IM = 0 |
|
|
137 |
res['score_IM'] = score_IM |
|
|
138 |
return res |
|
|
139 |
|
|
|
140 |
@jit((float32[:,:], float32[:,:]), nopython=True, nogil=True) |
|
|
141 |
def _rand_index(true, pred): |
|
|
142 |
n = true.shape[0] |
|
|
143 |
m_true = true.shape[1] |
|
|
144 |
m_pred = pred.shape[1] |
|
|
145 |
RI = 0.0 |
|
|
146 |
for i in range(1, n-1): |
|
|
147 |
for j in range(i, n): |
|
|
148 |
RI_ij = 0.0 |
|
|
149 |
for k in range(m_true): |
|
|
150 |
RI_ij += true[i,k]*true[j,k] |
|
|
151 |
for k in range(m_pred): |
|
|
152 |
RI_ij -= pred[i,k]*pred[j,k] |
|
|
153 |
RI += 1-np.abs(RI_ij) |
|
|
154 |
return RI / (n*(n-1)/2.0) |
|
|
155 |
|
|
|
156 |
|
|
|
157 |
def get_GRI(true, pred): |
|
|
158 |
''' |
|
|
159 |
Params: |
|
|
160 |
ture - [n_samples, n_cluster_1] for proportions or [n_samples, ] for grouping |
|
|
161 |
pred - [n_samples, n_cluster_2] for estimated proportions |
|
|
162 |
''' |
|
|
163 |
if len(true)!=len(pred): |
|
|
164 |
raise ValueError('Inputs should have same lengths!') |
|
|
165 |
|
|
|
166 |
if len(true.shape)==1: |
|
|
167 |
true = pd.get_dummies(true).values |
|
|
168 |
if len(pred.shape)==1: |
|
|
169 |
pred = pd.get_dummies(pred).values |
|
|
170 |
|
|
|
171 |
true = np.sqrt(true).astype(np.float32) |
|
|
172 |
pred = np.sqrt(pred).astype(np.float32) |
|
|
173 |
|
|
|
174 |
return _rand_index(true, pred) |
|
|
175 |
|
|
|
176 |
import numpy as np |
|
|
177 |
import pandas as pd |
|
|
178 |
from sklearn.metrics.cluster import adjusted_rand_score |
|
|
179 |
from sklearn.preprocessing import LabelEncoder |
|
|
180 |
import sys#; sys.path.insert(0, '../..') |
|
|
181 |
sys.path.append('../..') |
|
|
182 |
from VITAE import load_data |
|
|
183 |
|
|
|
184 |
type_dict = { |
|
|
185 |
# dyno |
|
|
186 |
'dentate':'UMI', |
|
|
187 |
'immune':'UMI', |
|
|
188 |
'planaria_full':'UMI', |
|
|
189 |
'planaria_muscle':'UMI', |
|
|
190 |
'aging':'non-UMI', |
|
|
191 |
'cell_cycle':'non-UMI', |
|
|
192 |
'fibroblast':'non-UMI', |
|
|
193 |
'germline':'non-UMI', |
|
|
194 |
'human_embryos':'non-UMI', |
|
|
195 |
'mesoderm':'non-UMI', |
|
|
196 |
|
|
|
197 |
# dyngen |
|
|
198 |
"cycle_1":'non-UMI', |
|
|
199 |
"cycle_2":'non-UMI', |
|
|
200 |
"cycle_3":'non-UMI', |
|
|
201 |
"linear_1":'non-UMI', |
|
|
202 |
"linear_2":'non-UMI', |
|
|
203 |
"linear_3":'non-UMI', |
|
|
204 |
"trifurcating_1":'non-UMI', |
|
|
205 |
"trifurcating_2":'non-UMI', |
|
|
206 |
"bifurcating_1":'non-UMI', |
|
|
207 |
'bifurcating_2':'non-UMI', |
|
|
208 |
"bifurcating_3":'non-UMI', |
|
|
209 |
"converging_1":'non-UMI', |
|
|
210 |
|
|
|
211 |
# our model |
|
|
212 |
'linear':'UMI', |
|
|
213 |
'bifurcation':'UMI', |
|
|
214 |
'multifurcating':'UMI', |
|
|
215 |
'tree':'UMI', |
|
|
216 |
} |
|
|
217 |
source_dict = { |
|
|
218 |
'dentate':'real', |
|
|
219 |
'immune':'real', |
|
|
220 |
'planaria_muscle':'real', |
|
|
221 |
'planaria_full':'real', |
|
|
222 |
'aging':'real', |
|
|
223 |
'cell_cycle':'real', |
|
|
224 |
'fibroblast':'real', |
|
|
225 |
'germline':'real', |
|
|
226 |
'human_embryos':'real', |
|
|
227 |
'mesoderm':'real', |
|
|
228 |
|
|
|
229 |
"cycle_1":'dyngen', |
|
|
230 |
"cycle_2":'dyngen', |
|
|
231 |
"cycle_3":'dyngen', |
|
|
232 |
"linear_1":'dyngen', |
|
|
233 |
"linear_2":'dyngen', |
|
|
234 |
"linear_3":'dyngen', |
|
|
235 |
"trifurcating_1":'dyngen', |
|
|
236 |
"trifurcating_2":'dyngen', |
|
|
237 |
"bifurcating_1":'dyngen', |
|
|
238 |
'bifurcating_2':'dyngen', |
|
|
239 |
"bifurcating_3":'dyngen', |
|
|
240 |
"converging_1":'dyngen', |
|
|
241 |
|
|
|
242 |
'linear':'our model', |
|
|
243 |
'bifurcation':'our model', |
|
|
244 |
'multifurcating':'our model', |
|
|
245 |
'tree':'our model', |
|
|
246 |
} |
|
|
247 |
df = pd.DataFrame() |
|
|
248 |
for data_name in type_dict.keys(): |
|
|
249 |
data = load_data('data/', data_name) |
|
|
250 |
milestone_net = data['milestone_network'] |
|
|
251 |
le = LabelEncoder() |
|
|
252 |
grouping = le.fit_transform(data['grouping']) |
|
|
253 |
begin_node_true = le.transform([data['root_milestone_id']])[0] |
|
|
254 |
if 'w' in milestone_net.columns: |
|
|
255 |
grouping = None |
|
|
256 |
if milestone_net is not None: |
|
|
257 |
milestone_net['from'] = le.transform(milestone_net['from']) |
|
|
258 |
milestone_net['to'] = le.transform(milestone_net['to']) |
|
|
259 |
G_true = nx.Graph() |
|
|
260 |
G_true.add_nodes_from(np.unique(milestone_net[['from','to']].values.flatten())) |
|
|
261 |
if grouping is None: |
|
|
262 |
G_true.add_edges_from(list( |
|
|
263 |
milestone_net[~pd.isna(milestone_net['w'])].groupby(['from', 'to']).count().index)) |
|
|
264 |
# otherwise, 'milestone_net' indicates edges |
|
|
265 |
else: |
|
|
266 |
if milestone_net is not None: |
|
|
267 |
G_true.add_edges_from(list( |
|
|
268 |
milestone_net.groupby(['from', 'to']).count().index)) |
|
|
269 |
G_true.remove_edges_from(nx.selfloop_edges(G_true)) |
|
|
270 |
nx.set_node_attributes(G_true, False, 'is_init') |
|
|
271 |
G_true.nodes[begin_node_true]['is_init'] = True |
|
|
272 |
|
|
|
273 |
|
|
|
274 |
for method in ['monocle','paga','slingshot']: |
|
|
275 |
|
|
|
276 |
_df = pd.read_csv('result/other methods/%s/%s_progressions.csv'%(data_name,method), |
|
|
277 |
index_col=0).astype({'from':str,'to':str}) |
|
|
278 |
pred_milestone_net = pd.read_csv('result/other methods/%s/%s_milestone_network.csv'%(data_name,method), |
|
|
279 |
index_col=0).sort_index().astype({'from':str,'to':str}) |
|
|
280 |
if method=='paga': |
|
|
281 |
_df['from'] = _df['from'].apply(lambda x:int(x.split('-')[0]) if type(x)==str else x) |
|
|
282 |
_df['to'] = _df['to'].apply(lambda x:int(x.split('-')[0]) if type(x)==str else x) |
|
|
283 |
pred_milestone_net['from'] = pred_milestone_net['from'].apply( |
|
|
284 |
lambda x:int(x.split('-')[0]) if type(x)==str else x) |
|
|
285 |
pred_milestone_net['to'] = pred_milestone_net['to'].apply( |
|
|
286 |
lambda x:int(x.split('-')[0]) if type(x)==str else x) |
|
|
287 |
|
|
|
288 |
le = LabelEncoder() |
|
|
289 |
le.fit(np.unique(pred_milestone_net[['from','to']].values.flatten())) |
|
|
290 |
pred_milestone_net['from'] = le.transform(pred_milestone_net['from']) |
|
|
291 |
pred_milestone_net['to'] = le.transform(pred_milestone_net['to']) |
|
|
292 |
_df['from'] = le.transform(_df['from']) |
|
|
293 |
_df['to'] = le.transform(_df['to']) |
|
|
294 |
|
|
|
295 |
G_pred = nx.Graph() |
|
|
296 |
G_pred.add_nodes_from(np.unique(_df[['from', 'to']].values.flatten())) |
|
|
297 |
G_pred.add_edges_from(pred_milestone_net[['from', 'to']].values) |
|
|
298 |
G_pred.remove_edges_from(nx.selfloop_edges(G_pred)) |
|
|
299 |
nx.set_node_attributes(G_pred, False, 'is_init') |
|
|
300 |
G_pred.nodes[pred_milestone_net.iloc[0]['from']]['is_init'] = True |
|
|
301 |
|
|
|
302 |
pseudotime = comp_pseudotime(nx.from_numpy_array(nx.to_numpy_array(G_pred)), |
|
|
303 |
pred_milestone_net.iloc[0]['from'], |
|
|
304 |
_df.copy()) |
|
|
305 |
|
|
|
306 |
# 1. topology |
|
|
307 |
res = topology(G_true, G_pred) |
|
|
308 |
|
|
|
309 |
# 2. Milestones assignment |
|
|
310 |
if grouping is None: |
|
|
311 |
milestones_true = milestone_net['from'].values.copy() |
|
|
312 |
milestones_true[(milestone_net['from']!=milestone_net['to']) |
|
|
313 |
&(milestone_net['w']<0.5)] = milestone_net[(milestone_net['from']!=milestone_net['to']) |
|
|
314 |
&(milestone_net['w']<0.5)]['to'].values |
|
|
315 |
else: |
|
|
316 |
milestones_true = grouping |
|
|
317 |
milestones_true = milestones_true[pseudotime!=-1] |
|
|
318 |
milestones_pred = _df['from'].values.copy() |
|
|
319 |
milestones_pred[_df['percentage']>0.5] = _df[_df['percentage']>0.5]['to'].values.copy() |
|
|
320 |
milestones_pred = milestones_pred[pseudotime!=-1] |
|
|
321 |
res['ARI'] = (adjusted_rand_score(milestones_true, milestones_pred) + 1)/2 |
|
|
322 |
|
|
|
323 |
n_samples = len(pseudotime) |
|
|
324 |
w = np.zeros((n_samples,n_samples)) |
|
|
325 |
w[np.arange(n_samples), _df['to']] = _df['percentage'] |
|
|
326 |
w[np.arange(n_samples), _df['from']] = 1 - _df['percentage'] |
|
|
327 |
if grouping is None: |
|
|
328 |
n_samples = len(milestone_net) |
|
|
329 |
prop = np.zeros((n_samples,n_samples)) |
|
|
330 |
prop[np.arange(n_samples), milestone_net['to']] = 1-milestone_net['w'] |
|
|
331 |
prop[np.arange(n_samples), milestone_net['from']] = np.where(np.isnan(milestone_net['w']), 1, milestone_net['w']) |
|
|
332 |
res['GRI'] = get_GRI(prop, w) |
|
|
333 |
else: |
|
|
334 |
res['GRI'] = get_GRI(grouping, w) |
|
|
335 |
|
|
|
336 |
# 3. Correlation between geodesic distances / Pseudotime |
|
|
337 |
if 'cycle' in data_name or 'immune' in data_name: |
|
|
338 |
res['PDT score'] = np.nan |
|
|
339 |
else: |
|
|
340 |
if grouping is None: |
|
|
341 |
pseudotime_true = milestone_net['from'].values + 1 - milestone_net['w'].values |
|
|
342 |
pseudotime_true[np.isnan(pseudotime_true)] = milestone_net[pd.isna(milestone_net['w'])]['from'].values |
|
|
343 |
else: |
|
|
344 |
pseudotime_true = - np.ones(len(grouping)) |
|
|
345 |
nx.set_edge_attributes(G_true, values = 1, name = 'weight') |
|
|
346 |
connected_comps = nx.node_connected_component(G_true, begin_node_true) |
|
|
347 |
subG = G_true.subgraph(connected_comps) |
|
|
348 |
milestone_net_true = build_milestone_net(subG, begin_node_true) |
|
|
349 |
if len(milestone_net_true)>0: |
|
|
350 |
pseudotime_true[grouping==int(milestone_net_true[0,0])] = 0 |
|
|
351 |
for i in range(len(milestone_net_true)): |
|
|
352 |
pseudotime_true[grouping==int(milestone_net_true[i,1])] = milestone_net_true[i,-1] |
|
|
353 |
pseudotime_true = pseudotime_true[pseudotime>-1] |
|
|
354 |
pseudotime_pred = pseudotime[pseudotime>-1] |
|
|
355 |
if np.std(pseudotime_true)==0 or np.std(pseudotime_pred)==0: |
|
|
356 |
res['PDT score'] = 0 |
|
|
357 |
else: |
|
|
358 |
res['PDT score'] = (np.corrcoef(pseudotime_true,pseudotime_pred)[0,1]+1)/2 |
|
|
359 |
|
|
|
360 |
res['method'] = method |
|
|
361 |
res['data'] = data_name |
|
|
362 |
res['source'] = source_dict[data_name] |
|
|
363 |
df = df.append(pd.DataFrame(res, index=[0]),ignore_index=True) |
|
|
364 |
|
|
|
365 |
df.to_csv('result/result_other_methods.csv') |