Switch to unified view

a b/aggmap/utils/calculator.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Sat Aug 17 16:54:12 2019
5
6
@author: wanxiang.shen@u.nus.edu
7
8
@usecase: calculate various distances
9
"""
10
from aggmap.utils.distances import named_distances
11
from aggmap.utils.multiproc import MultiProcessUnorderedBarRun
12
13
import numpy as np
14
from tqdm import tqdm
15
16
17
def _yield_combinations(N):
    """Yield every index pair ``(i1, i2)`` with ``0 <= i2 < i1 < N``.

    Each unordered pair of distinct indices is produced exactly once, with
    the larger index first. Nothing is yielded when ``N`` is less than 2.
    """
    for i1 in range(N):
        for i2 in range(i1):
            yield (i1, i2)
21
            
22
def _calculate(i1, i2):
    """Compute the distance between columns ``i1`` and ``i2`` of ``data``.

    Reads two module-level globals that must be set before the call:
    ``data`` (a 2-D array indexed as ``data[:, i]``) and ``func`` (a callable
    taking two 1-D arrays and returning their distance).

    Rows in which either of the two columns is NaN are dropped before the
    distance is evaluated.

    Returns
    -------
    tuple
        ``(i1, i2, dist)``. ``dist`` is ``np.nan`` when no row survives the
        NaN filter.
    """
    # Stack the two columns side by side so a NaN in either one removes the
    # whole row from both vectors.
    pair = np.stack([data[:, i1], data[:, i2]], axis=1)
    pair = pair[~np.isnan(pair).any(axis=1)]
    x1 = pair[:, 0]
    x2 = pair[:, 1]

    # Check for remaining rows, not for non-zero values: ``x1.any()`` is also
    # False for an all-zero column, which wrongly produced NaN for a valid
    # vector (and only when the zero column happened to be ``i1``).
    if x1.size:
        dist = func(x1, x2)
    else:
        dist = np.nan
    return (i1, i2, dist)
38
39
def _fuc(x):
    """Unpack an ``(i1, i2)`` pair and forward it to ``_calculate``.

    Single-argument adapter so that one work item (a 2-tuple of column
    indices) can be handed to a mapping-style runner.
    """
    i1, i2 = x
    return _calculate(i1, i2)
42
43
44
def pairwise_distance(npydata, n_cpus=8, method='correlation'):
    """
    Compute the symmetric matrix of pairwise distances between the columns of npydata.

    parameters
    ---------------
    npydata: np.array or np.memmap, 2-D. Note that by default the distances are calculated between column vectors rather than between samples (rows); if you wish to calculate distances between samples, pass data.T instead of data
    n_cpus: int, forwarded to MultiProcessUnorderedBarRun as ``n_cpus``
    method: {'euclidean', 'manhattan', 'canberra', 'chebyshev',
             'cosine', 'braycurtis', 'correlation',
             'jaccard', 'rogerstanimoto', 'hamming', 'dice', 'kulsinski', 'sokal_sneath'}

    returns
    ---------------
    np.ndarray of shape (N, N) with N = npydata.shape[1]. Entry [i, j] holds the distance between columns i and j; the diagonal is left at 0, and a pair with no rows free of NaN gets np.nan.

    raises
    ---------------
    ValueError: if ``method`` is not a key of ``named_distances``

    Usage
    --------------
    >>> import numpy as np
    >>> data = np.random.random_sample(size=(10000, 10))
    >>> dist_matrix = pairwise_distance(data)
    >>> dist_matrix.shape
    (10, 10)
    """
    # _calculate reads ``data`` and ``func`` from module scope.
    # NOTE(review): this presumably relies on the worker processes inheriting
    # these globals (e.g. fork start method) -- confirm against
    # MultiProcessUnorderedBarRun.
    global data, func

    # Resolve the metric first so an unknown name fails here with a clear
    # message, instead of later as "'NoneType' object is not callable", and
    # without clobbering the globals of a previous successful call.
    distance_func = named_distances.get(method)
    if distance_func is None:
        raise ValueError('unknown distance method: %r' % (method,))

    func = distance_func
    data = npydata
    N = data.shape[1]

    # Materialise the index pairs as a list before handing them to the runner.
    lst = list(_yield_combinations(N))
    res = MultiProcessUnorderedBarRun(_fuc, lst, n_cpus=n_cpus)

    dist_matrix = np.zeros(shape=(N, N))
    for x, y, v in tqdm(res, ascii=True):
        # Only pairs with y < x are computed; mirror each value so the matrix
        # is symmetric.
        dist_matrix[x, y] = v
        dist_matrix[y, x] = v
    return dist_matrix
73
74
    
75
    
76
if __name__ == '__main__':
    # Demo: build the column-by-column distance matrix of random data, embed
    # it in 2-D with UMAP using the precomputed distances, and scatter-plot
    # the embedding. numpy (np) is already imported at module level.
    import pandas as pd
    from umap import UMAP
    import matplotlib.pyplot as plt

    X = np.random.random_sample(size=(1000000, 40))
    distmatrix = pairwise_distance(X, n_cpus=6)
    embedding = UMAP(metric='precomputed', random_state=10)
    df = pd.DataFrame(embedding.fit_transform(distmatrix))
    # One blue dot per column of X, positioned by its 2-D embedding.
    plt.plot(df[0], df[1], 'bo')