[090c8c]: / src / .ipynb_checkpoints / utils-checkpoint.py

Download this file

72 lines (60 with data), 2.7 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
Authors: Pinar Demetci, Rebecca Santorella
12 February 2020
Utils for SCOT
"""
import numpy as np
import scipy as sp
from scipy.sparse.csgraph import dijkstra
from scipy.sparse import csr_matrix
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
def unit_normalize(data, norm="l2", bySample=True):
"""
Default norm used is l2-norm. Other options: "l1", and "max"
If bySample==True, then we independently normalize each sample. If bySample==False, then we independently normalize each feature
"""
assert (norm in ["l1","l2","max"]), "Norm argument has to be either one of 'max', 'l1', or 'l2'."
if bySample==True:
axis=1
else:
axis=0
return normalize(data, norm=norm, axis=axis)
def zscore_standardize(data):
scaler=StandardScaler()
scaledData=scaler.fit_transform(data)
return scaledData
def get_spatial_distance_matrix(data, metric="eucledian"):
Cdata= sp.spatial.distance.cdist(data,data,metric=metric)
return Cdata/Cdata.max()
def get_graph_distance_matrix(data, num_neighbors, mode="distance", metric="minkowski"):
"""
The default distance metric used with sklearn kneighbors_graph is ‘euclidean’ (‘minkowski’ metric with the p param equal to 2.).
That's why metric is set to "minkowski". If set to something else, it's possible we might need to input other params.
I have not played with it to see if it makes any difference or if another metric makes more sense.
"""
assert (mode in ["connectivity", "distance"]), "Norm argument has to be either one of 'connectivity', or 'distance'. "
if mode=="connectivity":
include_self=True
else:
include_self=False
graph_data=kneighbors_graph(data, num_neighbors, mode=mode, metric=metric, include_self=include_self)
shortestPath_data= dijkstra(csgraph= csr_matrix(graph_data), directed=False, return_predecessors=False)
shortestPath_max= np.nanmax(shortestPath_data[shortestPath_data != np.inf])
shortestPath_data[shortestPath_data > shortestPath_max] = shortestPath_max
shortestPath_data=shortestPath_data/shortestPath_data.max()
return shortestPath_data
def transport_data(source, target, couplingMatrix, transposeCoupling=False):
"""
Given: data in the target space, data in the source space, a coupling matrix learned via Gromow-Wasserstein OT
Returns:
transposeCoupling would need to be True only when the coupling matrix is of the form
"""
if transposeCoupling == False:
P = (couplingMatrix.T/couplingMatrix.sum(1)).T
transported_data= np.matmul(P, target)
else:
P = (couplingMatrix/couplingMatrix.sum(0)).T
transported_data=np.matmul(P, source)
return transported_data