|
a |
|
b/aggmap/utils/calculator.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
""" |
|
|
4 |
Created on Sat Aug 17 16:54:12 2019 |
|
|
5 |
|
|
|
6 |
@author: wanxiang.shen@u.nus.edu |
|
|
7 |
|
|
|
8 |
@usecase: calculate various distances |
|
|
9 |
""" |
|
|
10 |
from aggmap.utils.distances import named_distances |
|
|
11 |
from aggmap.utils.multiproc import MultiProcessUnorderedBarRun |
|
|
12 |
|
|
|
13 |
import numpy as np |
|
|
14 |
from tqdm import tqdm |
|
|
15 |
|
|
|
16 |
|
|
|
17 |
def _yield_combinations(N): |
|
|
18 |
for i1 in range(N): |
|
|
19 |
for i2 in range(i1): |
|
|
20 |
yield (i1,i2) |
|
|
21 |
|
|
|
22 |
def _calculate(i1, i2): |
|
|
23 |
x1 = data[:, i1] |
|
|
24 |
x2 = data[:, i2] |
|
|
25 |
## dropna |
|
|
26 |
X = np.stack([x1,x2], axis=1) |
|
|
27 |
X = X[~np.isnan(X).any(axis=1)] |
|
|
28 |
x1 = X[:, 0] |
|
|
29 |
x2 = X[:, 1] |
|
|
30 |
|
|
|
31 |
# x1 = np.nan_to_num(x1) |
|
|
32 |
# x2 = np.nan_to_num(x2) |
|
|
33 |
if x1.any(): |
|
|
34 |
dist = func(x1, x2) |
|
|
35 |
else: |
|
|
36 |
dist = np.nan |
|
|
37 |
return (i1, i2, dist) |
|
|
38 |
|
|
|
39 |
def _fuc(x):
    """Single-argument adapter around ``_calculate``.

    ``x`` is unpacked into exactly two column indices, which are forwarded
    to ``_calculate``; its ``(i1, i2, dist)`` result is returned unchanged.
    """
    first, second = x
    return _calculate(first, second)
|
|
42 |
|
|
|
43 |
|
|
|
44 |
def pairwise_distance(npydata, n_cpus=8, method='correlation'):
    """Compute the symmetric matrix of pairwise distances between columns.

    Parameters
    ----------
    npydata : np.array or np.memmap
        2-D data.  Distances are calculated between the column vectors, not
        between the rows (samples).  To calculate distances between samples,
        pass ``data.T`` instead of ``data``.
    n_cpus : int, default 8
        Forwarded to ``MultiProcessUnorderedBarRun``.
    method : str, default 'correlation'
        Key looked up in ``named_distances``.  Documented choices:
        {'euclidean', 'manhattan', 'canberra', 'chebyshev',
        'cosine', 'braycurtis', 'correlation',
        'jaccard', 'rogerstanimoto', 'hamming', 'dice', 'kulsinski',
        'sokal_sneath'}

    Returns
    -------
    np.ndarray
        Array of shape ``(N, N)`` with ``N = npydata.shape[1]``.  The diagonal
        is left at 0; every off-diagonal entry holds whatever ``_calculate``
        reported for that pair (which may be NaN) and is mirrored across the
        diagonal.

    Raises
    ------
    ValueError
        If ``method`` is not found in ``named_distances``.

    Usage
    --------------
    >>> import numpy as np
    >>> data = np.random.random_sample(size=(10000, 10))
    >>> dist_matrix = pairwise_distance(data)
    >>> dist_matrix.shape
    (10, 10)
    """
    global data, func

    metric = named_distances.get(method)
    if metric is None:
        # Fail here with a clear message instead of letting the pairwise
        # computation try to call ``None`` later on.
        raise ValueError('Unknown distance method: %r' % (method,))

    # `_calculate` reads these two module-level globals.
    # NOTE(review): presumably this lets worker processes see the array
    # without shipping it with every task -- confirm against the
    # implementation of MultiProcessUnorderedBarRun.
    func = metric
    data = npydata

    N = data.shape[1]
    # Materialised as a list, as the task argument has always been passed.
    lst = list(_yield_combinations(N))
    res = MultiProcessUnorderedBarRun(_fuc, lst, n_cpus=n_cpus)

    dist_matrix = np.zeros(shape=(N, N))
    for x, y, v in tqdm(res, ascii=True):
        # Only pairs with x > y are computed; mirror to fill both triangles.
        dist_matrix[x, y] = v
        dist_matrix[y, x] = v
    return dist_matrix
|
|
73 |
|
|
|
74 |
|
|
|
75 |
|
|
|
76 |
if __name__ == '__main__':
    # Manual demo: build a random (1000000, 40) matrix, compute the pairwise
    # distances between its columns, embed that precomputed distance matrix
    # with UMAP and plot the first two embedding coordinates.
    import numpy as np
    import pandas as pd
    from umap import UMAP
    import matplotlib.pyplot as plt

    X = np.random.random_sample(size=(1000000, 40))
    distmatrix = pairwise_distance(X, n_cpus=6)

    embedding = UMAP(metric='precomputed', random_state=10)
    coords = embedding.fit_transform(distmatrix)
    df = pd.DataFrame(coords)

    ax = plt.plot(df[0], df[1], 'bo')