aggmap/aggmodel/explain_dev.py

# -*- coding: utf-8 -*-
"""
Created on Tue Feb  2 14:54:38 2021

@author: wanxiang.shen@u.nus.edu
"""


import numpy as np
import pandas as pd

from tqdm import tqdm
from copy import copy

from aggmap.utils.matrixopt import conv2
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import StandardScaler


def islice(lst, n):
    '''Split lst into consecutive chunks of length n (the last chunk may be shorter).'''
    return [lst[i:i + n] for i in range(0, len(lst), n)]

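# For example (hypothetical values, not part of the original module):
# islice([1, 2, 3, 4, 5], 2) returns [[1, 2], [3, 4], [5]].
# GlobalIMP uses it below to split one batched prediction array back into
# per-permutation blocks of N samples.
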
def GlobalIMP(clf, mp, X, Y, task_type='classification',
              binary_task=False,
              sigmoidy=False,
              apply_logrithm=False,
              apply_smoothing=False,
              kernel_size=5,
              sigma=1.6):
    '''
    Forward-propagation feature importance.

    Masks one fmap grid cell at a time with its background value (the
    training-set minimum), re-predicts, and scores each cell by the
    resulting increase in loss.

    sigmoidy: apply a sigmoid to the raw predictions before scoring.
    apply_logrithm: log-transform the raw importance scores.
    apply_smoothing: apply a Gaussian 2D convolution to smooth the importance map.
    '''

    if task_type == 'classification':
        f = log_loss
    else:
        f = mean_squared_error

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    scaler = StandardScaler()
    df_grid = mp.df_grid_reshape
    background = mp.transform_mpX_to_df(clf.X_).min().values  # per-feature minimum over the training set

    dfY = pd.DataFrame(Y)
    Y_true = Y
    Y_prob = clf._model.predict(X)
    N, W, H, C = X.shape
    T = len(df_grid)
    nX = 20  # number of masked copies of X to batch into one prediction call

    if sigmoidy and (task_type == 'classification'):
        Y_prob = sigmoid(Y_prob)

    final_res = {}
    for k, col in enumerate(dfY.columns):
        if (task_type == 'classification') and binary_task:
            if k == 0:
                continue  # for a binary task, score only the positive-class column
        print('calculating feature importance for column %s ...' % col)
        results = []
        loss = f(Y_true[:, k].tolist(), Y_prob[:, k].tolist())

        tmp_X = []
        flag = 0
        for i in tqdm(range(T), ascii=True):
            ts = df_grid.iloc[i]
            y = ts.y
            x = ts.x

            ## step 1: make permutations: mask cell (y, x) with its background value
            X1 = np.array(X)
            vmin = background[i]
            X1[:, y, x, :] = np.full(X1[:, y, x, :].shape, fill_value=vmin)
            tmp_X.append(X1)

            if (flag == nX) or (i == T - 1):
                X2p = np.concatenate(tmp_X)
                ## step 2: make predictions in one batch (predicting one by one is inefficient)
                Y_pred_prob = clf._model.predict(X2p)
                if sigmoidy and (task_type == 'classification'):
                    Y_pred_prob = sigmoid(Y_pred_prob)

                ## step 3: calculate the change in loss for each masked copy
                for Y_pred in islice(Y_pred_prob, N):
                    mut_loss = f(Y_true[:, k].tolist(), Y_pred[:, k].tolist())
                    res = mut_loss - loss  # res > 0: the feature is important; otherwise it is not
                    results.append(res)

                flag = 0
                tmp_X = []
            flag += 1

        ## step 4: apply scaling and optional smoothing
        s = pd.DataFrame(results).values
        if apply_logrithm:
            s = np.log(s)
        smin = np.nanmin(s[s != -np.inf])
        smax = np.nanmax(s[s != np.inf])
        s = np.nan_to_num(s, nan=smin, posinf=smax, neginf=smin)  # replace NaN/-inf with smin and +inf with smax
        a = scaler.fit_transform(s)
        a = a.reshape(*mp._S.fmap_shape)
        if apply_smoothing:
            covda = conv2(a, kernel_size=kernel_size, sigma=sigma)
            results = covda.reshape(-1,).tolist()
        else:
            results = a.reshape(-1,).tolist()

        final_res.update({col: results})

    df = pd.DataFrame(final_res)
    df.columns = df.columns.astype(str)
    df.columns = 'col_' + df.columns + '_importance'
    df = df_grid.join(df)
    return df


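# Usage sketch (an illustrative assumption, not part of the original module):
# `mp` is a fitted aggmap.AggMap and `clf` is a trained AggMap model exposing
# `_model` (the underlying Keras model) and the training tensor `X_`, as
# GlobalIMP assumes above. Hypothetical call:
#
#     dfe = GlobalIMP(clf, mp, testX, testY,
#                     task_type='classification',
#                     apply_smoothing=True,
#                     kernel_size=5, sigma=1.6)
#
# dfe extends mp.df_grid_reshape with one 'col_<k>_importance' column per
# output column of Y, one row per fmap grid cell.
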
def LocalIMP(clf, mp, X, Y,
             task_type='classification',
             binary_task=False,
             sigmoidy=False,
             apply_logrithm=False,
             apply_smoothing=False,
             kernel_size=3, sigma=1.2):
    '''
    Forward-propagation feature importance for a single sample.
    '''

    assert len(X) == 1, 'LocalIMP explains one sample (one fmap image) at a time!'

    if task_type == 'classification':
        f = log_loss
    else:
        f = mean_squared_error

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    scaler = StandardScaler()
    df_grid = mp.df_grid_reshape
    background = mp.transform_mpX_to_df(clf.X_).min().values  # per-feature minimum over the training set

    Y_true = Y
    Y_prob = clf._model.predict(X)
    N, W, H, C = X.shape

    if sigmoidy and (task_type == 'classification'):
        Y_prob = sigmoid(Y_prob)

    results = []
    loss = f(Y_true.ravel().tolist(), Y_prob.ravel().tolist())

    ## mask each grid cell with its background value, one cell at a time
    all_X1 = []
    for i in tqdm(range(len(df_grid)), ascii=True):
        ts = df_grid.iloc[i]
        y = ts.y
        x = ts.x
        X1 = np.array(X)
        vmin = background[i]
        X1[:, y, x, :] = np.full(X1[:, y, x, :].shape, fill_value=vmin)
        all_X1.append(X1)

    ## predict all masked copies in one batch
    all_X = np.concatenate(all_X1)
    all_Y_pred_prob = clf._model.predict(all_X)

    for Y_pred_prob in all_Y_pred_prob:
        if sigmoidy and (task_type == 'classification'):
            Y_pred_prob = sigmoid(Y_pred_prob)
        mut_loss = f(Y_true.ravel().tolist(), Y_pred_prob.ravel().tolist())
        res = mut_loss - loss  # res > 0: the feature is important; otherwise it is not
        results.append(res)

    ## apply scaling and optional smoothing
    s = pd.DataFrame(results).values
    if apply_logrithm:
        s = np.log(s)
    smin = np.nanmin(s[s != -np.inf])
    smax = np.nanmax(s[s != np.inf])
    s = np.nan_to_num(s, nan=smin, posinf=smax, neginf=smin)  # replace NaN/-inf with smin and +inf with smax
    a = scaler.fit_transform(s)
    a = a.reshape(*mp._S.fmap_shape)
    if apply_smoothing:
        covda = conv2(a, kernel_size=kernel_size, sigma=sigma)
        results = covda.reshape(-1,).tolist()
    else:
        results = a.reshape(-1,).tolist()

    df = pd.DataFrame(results, columns=['imp'])
    df = df_grid.join(df)
    return df
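

# Usage sketch (an illustrative assumption, not part of the original module):
# LocalIMP takes a single sample, so X must have shape (1, W, H, C).
# Hypothetical call:
#
#     dfe = LocalIMP(clf, mp, testX[[0]], testY[[0]],
#                    task_type='classification',
#                    apply_smoothing=False)
#
# dfe extends mp.df_grid_reshape with an 'imp' column holding the per-cell
# saliency for that one sample.


if __name__ == '__main__':
    # A minimal, self-contained toy demo of the occlusion idea shared by
    # GlobalIMP and LocalIMP (toy data and a toy linear "model", not the
    # AggMap API): mask one grid cell with a background value, re-predict,
    # and take the increase in loss as that cell's importance.
    rng = np.random.RandomState(0)
    X_toy = rng.rand(16, 4, 4, 1)                   # 16 toy samples on a 4x4 grid
    w_toy = np.zeros((4, 4, 1))
    w_toy[1, 2, 0] = 3.0                            # only cell (y=1, x=2) matters
    predict = lambda x: (x * w_toy).sum(axis=(1, 2, 3))
    y_toy = predict(X_toy)
    base_loss = mean_squared_error(y_toy, predict(X_toy))  # zero by construction
    imp = np.zeros((4, 4))
    for yy in range(4):
        for xx in range(4):
            X1 = X_toy.copy()
            X1[:, yy, xx, :] = X_toy[:, yy, xx, :].min()   # background fill
            imp[yy, xx] = mean_squared_error(y_toy, predict(X1)) - base_loss
    print(np.round(imp, 3))                         # importance peaks at cell (1, 2)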