Diff of /aggmap/utils/distances.py [000000] .. [9e8054]

Switch to unified view

a b/aggmap/utils/distances.py
1
import numpy as np
2
import numba
3
4
5
6
################### numeric data #########################
7
@numba.njit(fastmath=True)
def euclidean(x, y):
    r"""Standard Euclidean (l2) distance.

    .. math::
        D(x, y) = \sqrt{\sum_i (x_i - y_i)^2}

    Parameters
    ----------
    x, y : array-like, indexed along the first axis
        Input vectors. Iteration runs over ``x.shape[0]`` positions; the
        length of ``y`` is not checked here.

    Returns
    -------
    float
        Square root of the summed squared element-wise differences
        (0.0 when ``x`` is empty).
    """
    # The docstring is a raw string so that the LaTeX backslashes are not
    # parsed as (invalid) string escape sequences.
    result = 0.0
    for i in range(x.shape[0]):
        result += (x[i] - y[i]) ** 2
    return np.sqrt(result)
17
18
@numba.njit(fastmath=True)
def sqeuclidean(x, y):
    r"""Squared Euclidean distance (no square root is taken).

    .. math::
        D(x, y) = \sum_i (x_i - y_i)^2

    Parameters
    ----------
    x, y : array-like, indexed along the first axis
        Input vectors. Iteration runs over ``x.shape[0]`` positions; the
        length of ``y`` is not checked here.

    Returns
    -------
    float
        Sum of the squared element-wise differences (0.0 when ``x`` is
        empty).
    """
    # The docstring is a raw string so that the LaTeX backslashes are not
    # parsed as (invalid) string escape sequences.
    result = 0.0
    for i in range(x.shape[0]):
        result += (x[i] - y[i]) ** 2
    return result
28
29
30
31
@numba.njit()
def manhattan(x, y):
    r"""Manhattan, taxicab, or l1 distance.

    .. math::
        D(x, y) = \sum_i |x_i - y_i|

    Parameters
    ----------
    x, y : array-like, indexed along the first axis
        Input vectors. Iteration runs over ``x.shape[0]`` positions; the
        length of ``y`` is not checked here.

    Returns
    -------
    float
        Sum of the absolute element-wise differences (0.0 when ``x`` is
        empty).
    """
    # The docstring is a raw string so that the LaTeX backslashes are not
    # parsed as (invalid) string escape sequences.
    result = 0.0
    for i in range(x.shape[0]):
        result += np.abs(x[i] - y[i])

    return result
42
43
44
@numba.njit()
def canberra(x, y):
    """Canberra distance.

    Sums ``|x_i - y_i| / (|x_i| + |y_i|)`` over the ``x.shape[0]``
    positions. Positions where ``|x_i| + |y_i|`` is not strictly positive
    contribute nothing, which avoids a division by zero.

    Returns the accumulated sum as a float (0.0 when ``x`` is empty).
    """
    total = 0.0
    for idx in range(x.shape[0]):
        xi = x[idx]
        yi = y[idx]
        scale = np.abs(xi) + np.abs(yi)
        if scale > 0:
            total += np.abs(xi - yi) / scale

    return total
53
54
55
56
@numba.njit()
def chebyshev(x, y):
    r"""Chebyshev or l-infinity distance.

    .. math::
        D(x, y) = \max_i |x_i - y_i|

    Parameters
    ----------
    x, y : array-like, indexed along the first axis
        Input vectors. Iteration runs over ``x.shape[0]`` positions; the
        length of ``y`` is not checked here.

    Returns
    -------
    float
        Largest absolute element-wise difference. The running maximum
        starts at 0.0, so that is the result when ``x`` is empty.
    """
    # The docstring is a raw string so that the LaTeX backslashes are not
    # parsed as (invalid) string escape sequences.
    result = 0.0
    for i in range(x.shape[0]):
        result = max(result, np.abs(x[i] - y[i]))

    return result
67
68
69
70
############### binary data ################
71
@numba.njit()
def jaccard(x, y):
    """Jaccard distance on the non-zero patterns of ``x`` and ``y``.

    An entry counts as "on" when it is ``!= 0``. With ``U`` the number of
    positions where at least one vector is on and ``B`` the number where
    both are on, the result is ``(U - B) / U``.

    Returns 0.0 when no position is on in either vector.
    """
    union_count = 0.0
    both_count = 0.0
    for idx in range(x.shape[0]):
        x_on = x[idx] != 0
        y_on = y[idx] != 0
        if x_on or y_on:
            union_count += 1.0
        if x_on and y_on:
            both_count += 1.0

    # Nothing set in either vector: define the distance as zero instead of
    # dividing by zero.
    if union_count == 0.0:
        return 0.0
    return (union_count - both_count) / union_count
85
    
86
@numba.njit()
def rogers_tanimoto(x, y):
    """Rogers-Tanimoto dissimilarity on the non-zero patterns of two vectors.

    An entry counts as "on" when it is ``!= 0``. With ``R`` the number of
    positions where exactly one of the vectors is on and ``n = x.shape[0]``,
    the result is ``2R / (n + R)``.

    Returns
    -------
    float
        The dissimilarity, or 0.0 for empty input (the denominator would
        otherwise be zero).
    """
    n = x.shape[0]
    # Empty vectors have nothing to disagree on; return 0.0 rather than
    # dividing by zero, matching the zero-denominator handling of the other
    # binary distances in this module.
    if n == 0:
        return 0.0

    num_not_equal = 0.0
    for i in range(n):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_not_equal += x_true != y_true

    return (2.0 * num_not_equal) / (n + num_not_equal)
95
96
97
98
@numba.njit()
def hamming(x, y):
    """Hamming distance: the fraction of positions where ``x`` and ``y`` differ.

    Entries are compared with ``!=`` over the ``x.shape[0]`` positions and
    the mismatch count is divided by ``x.shape[0]``.

    Returns
    -------
    float
        Proportion of mismatching positions, or 0.0 for empty input (the
        denominator would otherwise be zero).
    """
    n = x.shape[0]
    # Empty vectors have no mismatches; return 0.0 rather than dividing by
    # zero.
    if n == 0:
        return 0.0

    mismatches = 0.0
    for i in range(n):
        if x[i] != y[i]:
            mismatches += 1.0

    return mismatches / n
106
107
108
@numba.njit()
def dice(x, y):
    """Dice dissimilarity on the non-zero patterns of ``x`` and ``y``.

    An entry counts as "on" when it is ``!= 0``. With ``B`` the number of
    positions where both vectors are on and ``R`` the number where exactly
    one is on, the result is ``R / (2B + R)``.

    Returns 0.0 when the two on/off patterns agree everywhere.
    """
    both_on = 0.0
    differ = 0.0
    for idx in range(x.shape[0]):
        x_on = x[idx] != 0
        y_on = y[idx] != 0
        if x_on and y_on:
            both_on += 1.0
        if x_on != y_on:
            differ += 1.0

    # Identical patterns: short-circuit, which also avoids 0 / 0 when
    # nothing is on in either vector.
    if differ == 0.0:
        return 0.0
    return differ / (2.0 * both_on + differ)
122
123
124
@numba.njit()
def kulsinski(x, y):
    """Kulsinski dissimilarity on the non-zero patterns of ``x`` and ``y``.

    An entry counts as "on" when it is ``!= 0``. With ``B`` the number of
    positions where both vectors are on, ``R`` the number where exactly one
    is on and ``n = x.shape[0]``, the result is ``(R - B + n) / (R + n)``.

    Returns 0.0 when the two on/off patterns agree everywhere.
    """
    size = x.shape[0]
    both_on = 0.0
    differ = 0.0
    for idx in range(size):
        x_on = x[idx] != 0
        y_on = y[idx] != 0
        if x_on and y_on:
            both_on += 1.0
        if x_on != y_on:
            differ += 1.0

    # Identical patterns are treated as distance zero before the formula is
    # applied.
    if differ == 0:
        return 0.0
    return (differ - both_on + size) / (differ + size)
140
    
141
@numba.njit()
def sokal_sneath(x, y):
    """Sokal-Sneath dissimilarity on the non-zero patterns of ``x`` and ``y``.

    An entry counts as "on" when it is ``!= 0``. With ``B`` the number of
    positions where both vectors are on and ``R`` the number where exactly
    one is on, the result is ``R / (0.5 * B + R)``.

    Returns 0.0 when the two on/off patterns agree everywhere.
    """
    both_on = 0.0
    differ = 0.0
    for idx in range(x.shape[0]):
        x_on = x[idx] != 0
        y_on = y[idx] != 0
        if x_on and y_on:
            both_on += 1.0
        if x_on != y_on:
            differ += 1.0

    # Identical patterns: short-circuit, which also avoids 0 / 0 when
    # nothing is on in either vector.
    if differ == 0.0:
        return 0.0
    return differ / (0.5 * both_on + differ)
155
156
    
157
158
    
159
################### both numeric and binary data #############
160
@numba.njit()
def bray_curtis(x, y):
    """Bray-Curtis dissimilarity.

    Computes ``sum_i |x_i - y_i| / sum_i |x_i + y_i|`` over the
    ``x.shape[0]`` positions.

    Returns 0.0 when the denominator is not strictly positive.
    """
    abs_diff_total = 0.0
    abs_sum_total = 0.0
    for idx in range(x.shape[0]):
        xi = x[idx]
        yi = y[idx]
        abs_diff_total += np.abs(xi - yi)
        abs_sum_total += np.abs(xi + yi)

    if abs_sum_total > 0.0:
        return abs_diff_total / abs_sum_total
    # Degenerate denominator: report zero distance instead of dividing.
    return 0.0
172
173
174
@numba.njit()
def cosine(x, y):
    """Cosine distance: one minus the cosine of the angle between ``x`` and ``y``.

    Special cases, decided on the squared norms:

    * both squared norms are exactly zero -> 0.0
    * exactly one squared norm is zero -> 1.0
    """
    dot = 0.0
    sq_norm_x = 0.0
    sq_norm_y = 0.0
    for idx in range(x.shape[0]):
        xi = x[idx]
        yi = y[idx]
        dot += xi * yi
        sq_norm_x += xi ** 2
        sq_norm_y += yi ** 2

    x_is_zero = sq_norm_x == 0.0
    y_is_zero = sq_norm_y == 0.0
    if x_is_zero and y_is_zero:
        return 0.0
    if x_is_zero or y_is_zero:
        return 1.0
    return 1.0 - (dot / np.sqrt(sq_norm_x * sq_norm_y))
190
191
192
@numba.njit()
def correlation(x, y):
    """Correlation distance: one minus the correlation of ``x`` and ``y``.

    Both vectors are centred on their own mean (both means are taken over
    ``x.shape[0]`` positions), then the cosine distance of the centred
    vectors is returned.

    Returns
    -------
    float
        * 0.0 for empty input (the means would otherwise divide by zero);
        * 0.0 when both centred vectors have zero squared norm;
        * 1.0 when the centred dot product is exactly zero, which covers the
          case of a single zero-norm vector and so avoids a zero division;
        * ``1 - dot / sqrt(norm_x * norm_y)`` otherwise.
    """
    n = x.shape[0]
    # Guard the mean computation below against a division by zero.
    if n == 0:
        return 0.0

    mu_x = 0.0
    mu_y = 0.0
    norm_x = 0.0
    norm_y = 0.0
    dot_product = 0.0

    # First pass: means.
    for i in range(n):
        mu_x += x[i]
        mu_y += y[i]

    mu_x /= n
    mu_y /= n

    # Second pass: squared norms and dot product of the centred vectors.
    for i in range(n):
        shifted_x = x[i] - mu_x
        shifted_y = y[i] - mu_y
        norm_x += shifted_x ** 2
        norm_y += shifted_y ** 2
        dot_product += shifted_x * shifted_y

    if norm_x == 0.0 and norm_y == 0.0:
        return 0.0
    elif dot_product == 0.0:
        return 1.0
    else:
        return 1.0 - (dot_product / np.sqrt(norm_x * norm_y))
220
    
221
    
222
    
223
# (function, name) pairs for the distances registered under the
# "descriptors" label.
descriptors_dist = [
    (euclidean, 'euclidean'),
    (sqeuclidean, 'sqeuclidean'),
    (manhattan, 'manhattan'),
    (canberra, 'canberra'),
    (chebyshev, 'chebyshev'),
    (cosine, 'cosine'),
    (correlation, 'correlation'),
    (bray_curtis, 'braycurtis'),
]
231
232
233
234
# (function, name) pairs for the distances registered under the
# "fingerprint" label.
fingerprint_dist = [
    (jaccard, 'jaccard'),
    (rogers_tanimoto, 'rogers_tanimoto'),
    (hamming, 'hamming'),
    (dice, 'dice'),
    (kulsinski, 'kulsinski'),
    (sokal_sneath, 'sokal_sneath'),
    (cosine, 'cosine'),
    (correlation, 'correlation'),
    (bray_curtis, 'braycurtis'),
]
243
244
245
246
247
def GenNamedDist(descriptors_dist, fingerprint_dist):
    """Build a ``{name: function}`` lookup from two lists of pairs.

    Each entry is indexed as ``entry[0]`` (the function) and ``entry[1]``
    (the name). ``descriptors_dist`` is copied before being extended, so
    neither argument is modified. When a name occurs more than once, the
    last occurrence wins.
    """
    combined = descriptors_dist.copy()
    combined.extend(fingerprint_dist)
    return {entry[1]: entry[0] for entry in combined}
254
255
256
# Name -> distance function lookup built from both lists of
# (function, name) pairs.
named_distances = GenNamedDist(descriptors_dist, fingerprint_dist)
257
258
259
260
261
if __name__ == '__main__':
    # Smoke test: evaluate every registered distance on one pair of random
    # columns and print the results as a pandas Series.
    import pandas as pd

    x = np.random.random_sample(size=(100, 2))
    # Rounded copy of the sample, used for the fingerprint distances.
    x1 = x.round()

    res = {}
    for func, name in descriptors_dist:
        res['descriptors-' + name] = func(x[:, 0], x[:, 1])

    for func, name in fingerprint_dist:
        res['fingerprint-' + name] = func(x1[:, 0], x1[:, 1])

    print(pd.Series(res))