aggmap/map.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 25 20:29:36 2019
@author: wanxiang.shen@u.nus.edu
main aggmap code
"""
from aggmap.utils.logtools import print_info, print_warn, print_error
from aggmap.utils.matrixopt import Scatter2Grid, Scatter2Array, smartpadding
from aggmap.utils import vismap, summary, calculator
from aggmap.utils.gen_nwk import mp2newick
from sklearn.manifold import TSNE, MDS, Isomap, LocallyLinearEmbedding, SpectralEmbedding
from joblib import Parallel, delayed, load, dump
from scipy.spatial.distance import squareform, cdist, pdist
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram
import matplotlib.pylab as plt
import seaborn as sns
from umap import UMAP
from tqdm import tqdm
from copy import copy, deepcopy
import pandas as pd
import numpy as np


class Base:
    def __init__(self):
        pass

    def _save(self, filename):
        return dump(self, filename)

    def _load(self, filename):
        return load(filename)

    def MinMaxScaleClip(self, x, xmin, xmax):
        scaled = (x - xmin) / ((xmax - xmin) + 1e-8)
        return scaled

    def StandardScaler(self, x, xmean, xstd):
        return (x - xmean) / (xstd + 1e-8)


class Random_2DEmbedding:
    """Random 2D embedding: scatter the feature points uniformly at random within a unit circle."""

    def __init__(self, random_state=123, n_components=2):
        self.random_state = random_state
        self.n_components = n_components

    def fit(self, X):
        M, N = X.shape
        np.random.seed(self.random_state)
        rho = np.sqrt(np.random.uniform(0, 1, N))
        phi = np.random.uniform(0, 4 * np.pi, N)
        x = rho * np.cos(phi)
        y = rho * np.sin(phi)
        rd = pd.DataFrame([x, y]).T.sample(frac=1, random_state=123).reset_index(drop=True)
        self.embedding_ = rd.values
        return self


def _get_df_scatter(mp):
    """Build the scatter dataframe (x, y, subtype, color) from a fitted AggMap object."""
    xy = mp.embedded.embedding_
    colormaps = mp.colormaps
    df = pd.DataFrame(xy, columns=['x', 'y'])
    bitsinfo = mp.bitsinfo.set_index('IDs')
    df = df.join(bitsinfo.loc[mp.flist].reset_index())
    df['colors'] = df['Subtypes'].map(colormaps)
    return df


def _get_df_grid(mp):
    """Build the grid dataframe (x, y, feature name, subtype, color) after the grid assignment."""
    p, q = mp._S.fmap_shape
    position = np.zeros(mp._S.fmap_shape, dtype='O').reshape(p * q, )
    position[mp._S.col_asses] = mp.flist
    position = position.reshape(p, q)

    if mp.fmap_shape is not None:
        m, n = mp.fmap_shape
        if (m > p) | (n > q):
            position = smartpadding(position, (m, n), constant_values=0)

    M, N = position.shape
    x = []
    y = []
    for i in range(M):
        for j in range(N):
            x.append(j)
            y.append(i)
    v = position.reshape(M * N, )

    df = pd.DataFrame(list(zip(x, y, v)), columns=['x', 'y', 'v'])
    bitsinfo = mp.bitsinfo
    subtypedict = bitsinfo.set_index('IDs')['Subtypes'].to_dict()
    subtypedict.update({0: 'NaN'})
    df['Subtypes'] = df.v.map(subtypedict)
    df['colors'] = df['Subtypes'].map(mp.colormaps)

    # empty grid cells (value 0) get unique placeholder names: 'NaN-<cell index>'
    feature_list = df.v
    feature_names = []
    for i, j in feature_list.items():
        if j == 0:
            j = 'NaN-' + str(i)
        feature_names.append(j)
    df.v = feature_names
    return df


class AggMap(Base):
    """The feature-restructuring class AggMap.

    Parameters
    ----------
    dfx: pandas DataFrame
        Input data frame.
    metric: string, default: 'correlation'
        Measurement of feature distance, supports {'cosine', 'correlation', 'euclidean', 'jaccard', 'hamming', 'dice'}.
    info_distance: numpy array, default: None
        A vector-form distance vector of the feature points; its shape should be (n*(n-1)/2,), where n is the number of features. Useful when you have your own vector-form distance to pass in.
    by_scipy: bool, default: False
        Calculate the distance using the scipy pdist function. Useful when dfx.shape[1] > 20000, i.e., when the number of features is very large, since pdist speeds up the distance calculation.
    n_cpus: int, default: 16
        Number of CPU cores to use for the distance calculation.
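
    Examples
    --------
    A minimal, illustrative sketch; the toy DataFrame and its column names below are made up for demonstration only:

    >>> import numpy as np
    >>> import pandas as pd
    >>> from aggmap.map import AggMap
    >>> dfx = pd.DataFrame(np.random.rand(100, 32),
    ...                    columns=['f%02d' % i for i in range(32)])
    >>> mp = AggMap(dfx, metric='correlation')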
"""
def __init__(self,
dfx,
metric = 'correlation',
by_scipy = False,
n_cpus = 16,
info_distance = None,
):
assert type(dfx) == pd.core.frame.DataFrame, 'input dfx must be pandas DataFrame!'
super().__init__()
self.metric = metric
self.by_scipy = by_scipy
self.isfit = False
self.alist = dfx.columns.tolist()
self.ftype = 'feature points'
self.cluster_flag = False
m,n = dfx.shape
info_distance_length = int(n*(n-1)/2)
assert len(dfx.columns.unique()) == n, 'the column names of dataframe must be unique !'
## calculating distance
if np.array(info_distance).any():
assert len(info_distance) == info_distance_length, 'shape of info_distance must be (%s,)' % info_distance_length
print_info('Skipping the distance calculation, using the customized vector-form distance...')
self.info_distance = np.array(info_distance)
self.metric = 'precomputed'
else:
print_info('Calculating distance ...')
if self.by_scipy:
D = pdist(dfx.T, metric=metric)
D = np.nan_to_num(D,copy=False)
self.info_distance = D.clip(0, np.inf)
else:
D = calculator.pairwise_distance(dfx.values, n_cpus=n_cpus, method=metric)
D = np.nan_to_num(D,copy=False)
D_ = squareform(D)
self.info_distance = D_.clip(0, np.inf)
## statistic info
S = summary.Summary(n_jobs = 10)
res= []
for i in tqdm(range(dfx.shape[1]), ascii=True):
r = S._statistics_one(dfx.values, i)
res.append(r)
dfs = pd.DataFrame(res, index = self.alist)
self.info_scale = dfs

    def _fit_embedding(self,
                       dist_matrix,
                       emb_method='umap',
                       n_components=2,
                       random_state=32,
                       verbose=2,
                       n_neighbors=15,
                       min_dist=0.1,
                       a=1.576943460405378,
                       b=0.8950608781227859,
                       **kwargs):
        """
        Parameters
        ----------
        dist_matrix: distance matrix to fit
        emb_method: {'tsne', 'umap', 'mds', 'isomap', 'random', 'lle', 'se'}, algorithm to embed high-D data into 2D
        kwargs: the extra parameters for the corresponding algorithm
        """
        if 'metric' in kwargs.keys():
            metric = kwargs.get('metric')
            kwargs.pop('metric')
        else:
            metric = 'precomputed'

        if emb_method == 'tsne':
            embedded = TSNE(n_components=n_components,
                            random_state=random_state,
                            metric=metric,
                            verbose=verbose,
                            **kwargs)
            embedded = embedded.fit(dist_matrix)

        elif emb_method == 'umap':
            embedded = UMAP(n_components=n_components,
                            n_neighbors=n_neighbors,
                            min_dist=min_dist,
                            a=a,
                            b=b,
                            verbose=verbose,
                            random_state=random_state,
                            metric=metric, **kwargs)
            embedded = embedded.fit(dist_matrix)

        elif emb_method == 'mds':
            if 'metric' in kwargs.keys():
                kwargs.pop('metric')
            if 'dissimilarity' in kwargs.keys():
                dissimilarity = kwargs.get('dissimilarity')
                kwargs.pop('dissimilarity')
            else:
                dissimilarity = 'precomputed'
            embedded = MDS(metric=True,
                           n_components=n_components,
                           verbose=verbose,
                           dissimilarity=dissimilarity,
                           random_state=random_state, **kwargs)
            embedded = embedded.fit(dist_matrix)

        elif emb_method == 'random':
            embedded = Random_2DEmbedding(random_state=random_state,
                                          n_components=n_components)
            embedded = embedded.fit(dist_matrix)

        elif emb_method == 'isomap':
            embedded = Isomap(n_neighbors=n_neighbors,
                              n_components=n_components,
                              metric=metric,
                              **kwargs)
            embedded = embedded.fit(dist_matrix)

        elif emb_method == 'lle':
            embedded = LocallyLinearEmbedding(random_state=random_state,
                                              n_neighbors=n_neighbors,
                                              n_components=n_components,
                                              **kwargs)
            embedded = embedded.fit(dist_matrix)

        elif emb_method == 'se':
            embedded = SpectralEmbedding(random_state=random_state,
                                         n_neighbors=n_neighbors,
                                         n_components=n_components,
                                         affinity=metric,
                                         **kwargs)
            affinity_matrix = np.exp(-dist_matrix ** 2)  # make the embedding more uniform
            embedded = embedded.fit(affinity_matrix)

        return embedded

    def fit(self,
            feature_group_list=[],
            cluster_channels=5,
            var_thr=-1,
            split_channels=True,
            fmap_shape=None,
            emb_method='umap',
            min_dist=0.1,
            n_neighbors=15,
            a=1.576943460405378,
            b=0.8950608781227859,
            verbose=2,
            random_state=32,
            group_color_dict={},
            lnk_method='complete',
            **kwargs):
        """
        Parameters
        ----------
        feature_group_list: list of the group names for each feature point
        cluster_channels: int, number of channels (clusters) if feature_group_list is empty
        var_thr: float, default -1; a feature is included only if its variance is larger than this value. Since some features have very low variance, they can be removed by increasing this threshold.
        split_channels: bool, if True, the output will be split into separate channels according to the feature types
        fmap_shape: None or tuple, size of the feature map; if None, the size is calculated automatically
        emb_method: {'umap', 'tsne', 'mds', 'isomap', 'random', 'lle', 'se'}, algorithm to embed high-D data into 2D
        min_dist: float, UMAP parameter for the effective minimum distance between embedded points
        n_neighbors: int, UMAP parameter controlling the embedding. Larger values give a more global view of the manifold, while smaller values preserve more local structure.
        a: float, UMAP parameter controlling the embedding. If None, it is determined automatically from ``min_dist`` and ``spread``.
        b: float, UMAP parameter controlling the embedding. If None, it is determined automatically from ``min_dist`` and ``spread``.
        group_color_dict: dict of the group colors; keys are the group names, values are the colors
        lnk_method: {'complete', 'average', 'single', 'weighted', 'centroid'}, linkage method
        kwargs: the extra parameters for the corresponding embedding method
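
        Examples
        --------
        An illustrative sketch (assumes `mp` was constructed as in the class-level example):

        >>> mp = mp.fit(cluster_channels=5, emb_method='umap', verbose=0)
        >>> mp.fmap_shape      # (rows, cols) of the assigned grid
        >>> mp.df_grid.head()  # feature-to-grid-cell assignment table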
"""
if 'n_components' in kwargs.keys():
kwargs.pop('n_components')
## embedding into a 2d
assert emb_method in ['tsne', 'umap', 'mds', 'isomap', 'random', 'lle', 'se'], 'No Such Method Supported: %s' % emb_method
self.feature_group_list = feature_group_list
self.var_thr = var_thr
self.split_channels = split_channels
self.fmap_shape = fmap_shape
self.emb_method = emb_method
self.lnk_method = lnk_method
self.random_state = random_state
if fmap_shape != None:
assert len(fmap_shape) == 2, "fmap_shape must be a tuple with two elements!"
# flist and distance
flist = self.info_scale[self.info_scale['var'] > self.var_thr].index.tolist()
dfd = pd.DataFrame(squareform(self.info_distance),
index=self.alist,
columns=self.alist)
dist_matrix = dfd.loc[flist][flist]
self.flist = flist
self.x_mean = self.info_scale['mean'].values
self.x_std = self.info_scale['std'].values
self.x_min = self.info_scale['min'].values
self.x_max = self.info_scale['max'].values
#bitsinfo
dfb = pd.DataFrame(self.flist, columns = ['IDs'])
if feature_group_list != []:
self.cluster_flag = False
assert len(feature_group_list) == len(self.alist), "the length of the input group list is not equal to length of the feature list"
self.cluster_channels = len(set(feature_group_list))
self.feature_group_list_ = feature_group_list
dfb['Subtypes'] = dfb['IDs'].map(pd.Series(feature_group_list, index = self.alist))
if set(feature_group_list).issubset(set(group_color_dict.keys())):
self.group_color_dict = group_color_dict
dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
else:
unique_types = dfb['Subtypes'].unique()
color_list = sns.color_palette("hsv", len(unique_types)).as_hex()
group_color_dict = dict(zip(unique_types, color_list))
dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
self.group_color_dict = group_color_dict
else:
self.cluster_channels = cluster_channels
print_info('applying hierarchical clustering to obtain group information ...')
self.cluster_flag = True
Z = linkage(squareform(dist_matrix.values), lnk_method)
labels = fcluster(Z, cluster_channels, criterion='maxclust')
feature_group_list_ = ['cluster_%s' % str(i).zfill(2) for i in labels]
dfb['Subtypes'] = feature_group_list_
dfb = dfb.sort_values('Subtypes')
unique_types = dfb['Subtypes'].unique()
if not set(unique_types).issubset(set(group_color_dict.keys())):
color_list = sns.color_palette("hsv", len(unique_types)).as_hex()
group_color_dict = dict(zip(unique_types, color_list))
dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
self.group_color_dict = group_color_dict
self.Z = Z
self.feature_group_list_ = feature_group_list_
self.bitsinfo = dfb
colormaps = dfb.set_index('Subtypes')['colors'].to_dict()
colormaps.update({'NaN': '#000000'})
self.colormaps = colormaps
self._S = Scatter2Grid()
## 2d embedding first
embedded = self._fit_embedding(dist_matrix,
emb_method = emb_method,
n_neighbors = n_neighbors,
random_state = random_state,
min_dist = min_dist,
a = a,
b = b,
verbose = verbose,
n_components = 2, **kwargs)
self.embedded = embedded
df = pd.DataFrame(embedded.embedding_, index = self.flist,columns=['x', 'y'])
typemap = self.bitsinfo.set_index('IDs')
df = df.join(typemap)
df['Channels'] = df['Subtypes']
self.df_embedding = df
## linear assignment algorithm
print_info('Applying grid assignment of feature points, this may take several minutes(1~30 min)')
self._S.fit(self.df_embedding, self.split_channels, channel_col = 'Channels')
print_info('Finished')
## fit flag
self.isfit = True
if self.fmap_shape == None:
self.fmap_shape = self._S.fmap_shape
else:
m, n = self.fmap_shape
p, q = self._S.fmap_shape
assert (m >= p) & (n >=q), "fmap_shape's width must >= %s, height >= %s " % (p, q)
self.df_scatter = _get_df_scatter(self)
self.df_grid = _get_df_grid(self)
self.df_grid_reshape = _get_df_grid(self)
self.feature_names_reshape = self.df_grid.v.tolist()
return self

    def refit_c(self, cluster_channels=10, lnk_method='complete', group_color_dict={}):
        """
        Re-fit the aggmap object to update the number of channels.

        Parameters
        ----------
        cluster_channels: int, number of channels (clusters)
        group_color_dict: dict of the group colors; keys are the group names, values are the colors
        lnk_method: {'complete', 'average', 'single', 'weighted', 'centroid'}, linkage method
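
        Examples
        --------
        An illustrative sketch (assumes a previously fitted AggMap object `mp`):

        >>> mp = mp.refit_c(cluster_channels=10)  # regroup the same embedding into 10 channels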
"""
if not self.isfit:
print_error('please fit first!')
return
self.split_channels = True
self.lnk_method = lnk_method
self.cluster_channels = cluster_channels
dfd = pd.DataFrame(squareform(self.info_distance),
index=self.alist,
columns=self.alist)
dist_matrix = dfd.loc[self.flist][self.flist]
dfb = pd.DataFrame(self.flist, columns = ['IDs'])
print_info('applying hierarchical clustering to obtain group information ...')
self.cluster_flag = True
Z = linkage(squareform(dist_matrix.values), self.lnk_method)
labels = fcluster(Z, self.cluster_channels, criterion='maxclust')
feature_group_list_ = ['cluster_%s' % str(i).zfill(2) for i in labels]
dfb['Subtypes'] = feature_group_list_
dfb = dfb.sort_values('Subtypes')
unique_types = dfb['Subtypes'].unique()
if not set(unique_types).issubset(set(group_color_dict.keys())):
color_list = sns.color_palette("hsv", len(unique_types)).as_hex()
group_color_dict = dict(zip(unique_types, color_list))
dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
self.group_color_dict = group_color_dict
self.Z = Z
self.feature_group_list_ = feature_group_list_
# update self.bitsinfo
self.bitsinfo = dfb
colormaps = dfb.set_index('Subtypes')['colors'].to_dict()
colormaps.update({'NaN': '#000000'})
self.colormaps = colormaps
# update self.df_embedding
df = pd.DataFrame(self.embedded.embedding_, index = self.flist,columns=['x', 'y'])
typemap = self.bitsinfo.set_index('IDs')
df = df.join(typemap)
df['Channels'] = df['Subtypes']
self.df_embedding = df
## linear assignment not performed, only refit the channel number
print_info('skipping grid assignment of feature points, fitting to target channel number')
self._S.refit_c(self.df_embedding)
print_info('Finished')
if self.fmap_shape == None:
self.fmap_shape = self._S.fmap_shape
else:
m, n = self.fmap_shape
p, q = self._S.fmap_shape
assert (m >= p) & (n >=q), "fmap_shape's width must >= %s, height >= %s " % (p, q)
self.df_scatter = _get_df_scatter(self)
self.df_grid = _get_df_grid(self)
self.df_grid_reshape = _get_df_grid(self)
self.feature_names_reshape = self.df_grid.v.tolist()
return self

    def transform_mpX_to_df(self, X):
        '''
        Convert the transformed 4D array X of shape (n, w, h, c) back into a 2D dataframe of shape (n, w*h) by summing over the channel axis.
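
        Example (an illustrative sketch, assuming `X = mp.batch_transform(dfx.values)`):

        >>> df_back = mp.transform_mpX_to_df(X)  # columns follow mp.feature_names_reshape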
        '''
        n, w, h, c = X.shape
        X_2D = X.sum(axis=-1).reshape(n, w * h)
        return pd.DataFrame(X_2D, columns=self.feature_names_reshape)

    def transform(self,
                  arr_1d,
                  scale=True,
                  scale_method='minmax',
                  fillnan=0):
        """
        Parameters
        ----------
        arr_1d: 1D numpy array of feature points
        scale: bool, if True, apply MinMax scaling using the precomputed values
        scale_method: {'minmax', 'standard'}
        fillnan: value used to fill NaNs, default: 0
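
        Examples
        --------
        An illustrative sketch (assumes a fitted AggMap object `mp` and the toy `dfx` from the class-level example):

        >>> fmap = mp.transform(dfx.values[0])
        >>> fmap.shape  # (w, h, c): one 2D feature map per channel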
"""
if not self.isfit:
print_error('please fit first!')
return
if scale:
if scale_method == 'standard':
arr_1d = self.StandardScaler(arr_1d, self.x_mean, self.x_std)
else:
arr_1d = self.MinMaxScaleClip(arr_1d, self.x_min, self.x_max)
df = pd.DataFrame(arr_1d).T
df.columns = self.alist
df = df[self.flist]
vector_1d = df.values[0] #shape = (N, )
fmap = self._S.transform(vector_1d)
p, q, c = fmap.shape
if self.fmap_shape != None:
m, n = self.fmap_shape
if (m > p) | (n > q):
fps = []
for i in range(c):
fp = smartpadding(fmap[:,:,i], self.fmap_shape)
fps.append(fp)
fmap = np.stack(fps, axis=-1)
return np.nan_to_num(fmap, nan = fillnan)

    def batch_transform(self,
                        array_2d,
                        scale=True,
                        scale_method='minmax',
                        n_jobs=4,
                        fillnan=0):
        """
        Parameters
        ----------
        array_2d: 2D numpy array of feature points, M (samples) x N (feature points)
        scale: bool, if True, apply MinMax scaling using the precomputed values
        scale_method: {'minmax', 'standard'}
        n_jobs: number of parallel jobs
        fillnan: value used to fill NaNs, default: 0
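
        Examples
        --------
        An illustrative sketch (assumes a fitted AggMap object `mp` and the toy `dfx` from the class-level example):

        >>> X = mp.batch_transform(dfx.values, n_jobs=4)
        >>> X.shape  # (n_samples, w, h, c), suitable as CNN-style input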
"""
if not self.isfit:
print_error('please fit first!')
return
assert type(array_2d) == np.ndarray, 'input must be numpy ndarray!'
assert array_2d.ndim == 2, 'input must be 2-D numpy array!'
P = Parallel(n_jobs=n_jobs)
res = P(delayed(self.transform)(arr_1d,
scale,
scale_method,
fillnan) for arr_1d in tqdm(array_2d, ascii=True))
X = np.stack(res)
return X

    def plot_scatter(self, htmlpath='./', htmlname=None, radius=2, enabled_data_labels=False):
        """Scatter plot; radius is the size of the scatter points and must be an int."""
        H_scatter = vismap.plot_scatter(self,
                                        htmlpath=htmlpath,
                                        htmlname=htmlname,
                                        radius=radius,
                                        enabled_data_labels=enabled_data_labels)
        return H_scatter

    def plot_grid(self, htmlpath='./', htmlname=None, enabled_data_labels=False):
        """Grid plot"""
        H_grid = vismap.plot_grid(self,
                                  htmlpath=htmlpath,
                                  htmlname=htmlname,
                                  enabled_data_labels=enabled_data_labels)
        return H_grid

    def plot_tree(self, figsize=(16, 8), add_leaf_labels=True, leaf_font_size=18, leaf_rotation=90):
        """Dendrogram plot of the feature clustering tree."""
        fig = plt.figure(figsize=figsize)

        if self.cluster_flag:
            Z = self.Z
            D_leaf_colors = self.bitsinfo['colors'].to_dict()
            link_cols = {}
            for i, i12 in enumerate(Z[:, :2].astype(int)):
                c1, c2 = (link_cols[x] if x > len(Z) else D_leaf_colors[x] for x in i12)
                link_cols[i + 1 + len(Z)] = c1

            if add_leaf_labels:
                labels = self.alist
            else:
                labels = None
            P = dendrogram(Z, labels=labels,
                           leaf_rotation=leaf_rotation,
                           leaf_font_size=leaf_font_size,
                           link_color_func=lambda x: link_cols[x])
        return fig

    def to_nwk_tree(self, treefile='mytree', leaf_names=None):
        '''
        Convert the mp object to a newick tree and the data files to submit to the iTOL server.
        '''
        return mp2newick(self, treefile=treefile, leaf_names=leaf_names)

    def copy(self):
        """Return a deep copy of self."""
        return deepcopy(self)

    def load(self, filename):
        """Load a saved AggMap object from a file."""
        return self._load(filename)

    def save(self, filename):
        """Save the AggMap object to a file."""
        return self._save(filename)