Diff of /model_tester.py [000000] .. [8d2107]

Switch to unified view

a b/model_tester.py
1
import logging
2
import os
3
import re
4
import sys
5
import json
6
7
import numpy as np
8
import cProfile
9
from sklearn.base import TransformerMixin
10
from sklearn.cross_validation import train_test_split
11
from sklearn.linear_model import LogisticRegression
12
from sklearn.svm import SVC
13
from sklearn.tree import DecisionTreeClassifier
14
from sklearn.ensemble import AdaBoostClassifier
15
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, f1_score, confusion_matrix
16
from sklearn.pipeline import FeatureUnion, Pipeline
17
18
from extract_data import get_doc_rel_dates, get_operation_date, get_ef_values
19
from extract_data import get_operation_date,  is_note_doc, get_date_key
20
from language_processing import parse_date 
21
from loader import get_data
22
from decision_model import ClinicalDecisionModel
23
from mix_of_exp import MixtureOfExperts
24
logger = logging.getLogger("DaemonLog")
25
26
def _specificity_score(y,y_hat):
27
    mc = confusion_matrix(y,y_hat)
28
    return float(mc[0,0])/np.sum(mc[0,:])
29
30
def get_preprocessed_patients(sample_size = 25, rebuild_cache=False):
31
    cache_file = '/PHShome/ju601/crt/data/patient_cache.json'
32
    
33
    # Build cache
34
    if not os.path.isfile(cache_file) or rebuild_cache:
35
        patients_out = []
36
        delta_efs_out = []
37
        patient_nums = range(906)
38
        for i in patient_nums:
39
            if i%100 == 0:
40
                logger.info(str(i) + '/' + str(patient_nums[-1]))
41
            patient_data = get_data([i])[0]
42
            if patient_data is not None:
43
                ef_delta = get_ef_delta(patient_data)
44
                if ef_delta is not None:
45
                    patients_out.append(patient_data['NEW_EMPI'])
46
                    delta_efs_out.append(ef_delta)
47
        with open(cache_file, 'w') as cache:
48
            cache_obj = {
49
                'patients': patients_out,
50
                'delta_efs': delta_efs_out
51
            }
52
            json.dump(cache_obj, cache)
53
54
    # Load from cache
55
    with open(cache_file, 'r') as f:
56
        cached = json.load(f)
57
    n = min(sample_size, len(cached['patients']))
58
    return cached['patients'][:n], cached['delta_efs'][:n]
59
60
61
def change_ef_values_to_categories(ef_values):
62
    output = []
63
64
    for value in ef_values:
65
        if value < 5:
66
            output.append(1)
67
        else:
68
            output.append(0)
69
        '''
70
        if value <-2:
71
            output.append("reduction")
72
        elif value < 5:
73
            output.append("non-responder")
74
        elif value < 15:
75
            output.append("responder")
76
        else:
77
            output.append("super-responder")
78
        '''
79
    return output
80
            
81
def get_ef_delta(patient_data):
82
    after_threshold = 12*30 #traub: changed to 12mo optimal measurement
83
    ef_values = get_ef_values(patient_data)
84
    sorted_ef = sorted(ef_values)
85
    before = None
86
    after = None
87
    dist_from_thresh = float('inf')
88
    for (rel_date, ef_value) in sorted_ef:
89
        if rel_date <= 0:
90
            before = ef_value
91
        else:
92
            dist = abs(rel_date - after_threshold)
93
            if dist < dist_from_thresh:
94
                after = ef_value
95
                dist_from_thresh = dist
96
97
    if before is not None and after is not None:
98
        return after - before
99
    else:
100
        return None
101
102
103
class PrintTransformer(TransformerMixin):
104
    def fit(self, X, y=None, **fit_params):
105
        return self
106
    def transform(self, X, **transform_params):
107
        logger.info(X.shape)
108
        print X[0].shape
109
        return X
110
111
class FeaturePipeline(Pipeline):
112
113
    def get_feature_names(self):
114
        return self.steps[-1][1].get_feature_names()
115
116
117
def display_summary(name, values):
118
    logger.info(name)
119
    logger.info("\tmean:\t", 1.* sum(values) / len(values))
120
    logger.info("\tmin:\t", min(values))
121
    logger.info("\tmax:\t", max(values))
122
123
def get_mu_std(values):
124
    mu = 1. * sum(values) / len(values)
125
    sq_dev = [(x - mu)**2 for x in values]
126
    std = (sum(sq_dev) / len(values))**.5
127
    return (mu, std)
128
129
############################################
130
# Inputs:
131
#       clf: some model object with a fit(X,y) and predict(X) function
132
#       data_size: num patients
133
#       num_cv_splits: number of cv runs
134
#       status_file: opened status file (status_file.write("hello") should work)
135
# Outputs: a dictionary with the following
136
#       precision_mean(_std)
137
#       recall_mean(_std)
138
#       f1_mean(_std)
139
#       accuracy_mean(_std)
140
#       important_features: a string of the most 100 important features 
141
############################################
142
143
def execute_test(clf, data_size, num_cv_splits): 
144
    
145
    logger.info('Preprocessing...')
146
    X, Y = get_preprocessed_patients(data_size)
147
    Y = change_ef_values_to_categories(Y)
148
149
    logger.info(str(len(X)) + " patients in dataset")
150
    
151
    counts = {}
152
    for y in Y:
153
        if y not in counts:
154
            counts[y] = 0
155
        counts[y] += 1
156
    logger.info("Summary:")
157
    logger.info(counts)
158
159
    precision = []
160
    precision_train = []
161
    recall = []
162
    recall_train = []
163
    f1 = []
164
    specificity = []
165
    f1_train = []
166
    accuracy = []    
167
    accuracy_train = []    
168
169
    logger.info("Beginning runs")
170
    
171
    for cv_run in range(int(num_cv_splits)):
172
173
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .33)
174
        logger.info("fitting " + str(len(X_train)) + " patients...")
175
        if type(clf.steps[-1][1]) == MixtureOfExperts:
176
            logger.info("alex debug, mixture of experts")
177
            Y_real = np.zeros((len(Y_train), 2))
178
            for i in range(len(Y_train)):    
179
                Y_real[i, Y_train[i]] = 1
180
            Y_train = Y_real
181
182
        logger.info("Fitting")
183
        clf.fit(X_train, Y_train)
184
        logger.info("Predicting on test set")
185
        Y_predict = clf.predict(X_test)
186
        logger.info("Predicting on train set")
187
        Y_train_predict = clf.predict(X_train)
188
189
        precision += [precision_score(Y_test, Y_predict)]
190
        precision_train += [precision_score(Y_train, Y_train_predict)]
191
        recall += [recall_score(Y_test, Y_predict)]
192
        recall_train += [recall_score(Y_train, Y_train_predict)]
193
        f1 += [f1_score(Y_test, Y_predict)]
194
        f1_train += [f1_score(Y_train, Y_train_predict)]
195
        accuracy += [accuracy_score(Y_test, Y_predict)]
196
        specificity += [_specificity_score(Y_test, Y_predict)]
197
        accuracy_train += [accuracy_score(Y_train, Y_train_predict)]
198
199
        logger.info("CV Split #" + str(cv_run + 1))
200
        logger.info("\tPrecision: " + str(precision[-1]))
201
        logger.info("\tRecall: " + str(recall[-1]))
202
        logger.info("\tF1 Score: " + str(f1[-1]))
203
        logger.info("\tAccuracy: " + str(accuracy[-1]))
204
        logger.info("\tSpecificity: " + str(specificity[-1]))
205
        logger.info("\tTrain Precision: " + str(precision_train[-1]))
206
        logger.info("\tTrain Recall: " + str(recall_train[-1]))
207
        logger.info("\tTrain F1 Score: " + str(f1_train[-1]))
208
        logger.info("\tTrain Accuracy: " + str(accuracy_train[-1]))
209
210
    try:
211
        features, model = (clf.steps[0][1], clf.steps[-1][1])
212
        column_names = features.get_feature_names()
213
        feature_importances = model.coef_[0] if not type(model) in [type(AdaBoostClassifier()), type(DecisionTreeClassifier)]  else model.feature_importances_
214
        if len(column_names) == len(feature_importances):
215
            Z = zip(column_names, feature_importances)
216
            Z.sort(key = lambda x: abs(x[1]), reverse = True)
217
            important_features = ""
218
            for z in  Z[:min(100, len(Z))]:
219
                important_features += str(z[1]) + ": " + str(z[0]) + "\\n" 
220
    except Exception as e:
221
        logger.error(e)
222
        important_features = "error"
223
224
    result = dict()
225
    result['mode'] = max([1. * x/ sum(counts.values()) for x in counts.values()])
226
    result['precision_mean'], result['precision_std'] = get_mu_std(precision)
227
    result['recall_mean'], result['recall_std'] = get_mu_std(recall)
228
    result['f1_mean'], result['f1_std'] = get_mu_std(f1)
229
    result['accuracy_mean'], result['accuracy_std'] = get_mu_std(accuracy)
230
    result['train_precision_mean'], result['train_precision_std'] = get_mu_std(precision_train)
231
    result['train_recall_mean'], result['train_recall_std'] = get_mu_std(recall_train)
232
    result['train_f1_mean'], result['train_f1_std'] = get_mu_std(f1_train)
233
    result['train_accuracy_mean'], result['train_accuracy_std'] = get_mu_std(accuracy_train)
234
    result['important_features'] = important_features
235
     
236
    return result    
237
238
# DEPRECATED
239
def test_model(features, data_size = 25, num_cv_splits = 5, method = 'logistic regression', show_progress = True, model_args = dict()):
240
241
    if method in ['logistic regression', 'lr', 'logitr', 'logistic']:
242
        is_regression = False
243
        clf = LogisticRegression(**model_args)
244
    elif method in ['svm']:
245
        is_regression = False
246
        clf = SVC(**model_args)
247
    elif method in ['boosting', 'adaboost']:
248
        is_regression = False
249
        clf = AdaBoostClassifier(**model_args)
250
    elif method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
251
        is_regression = False
252
        clf = ClinicalDecisionModel()
253
    else:
254
        raise ValueError("'" + method + "' is not a supported classification method")
255
256
    logger.info('Preprocessing...')
257
    X, Y = get_preprocessed_patients(data_size)
258
    Y = change_ef_values_to_categories(Y)
259
    logger.info(str(len(X)) + " patients in dataset")
260
    
261
    if not is_regression:
262
        counts = {}
263
        for y in Y:
264
            if y not in counts:
265
                counts[y] = 0
266
            counts[y] += 1
267
        logger.info("Summary:")
268
        logger.info(counts)
269
    
270
    pipeline =  Pipeline([
271
        ('feature_union', features),
272
        ('Classifier', clf)
273
    ])
274
275
    #If using the ClinicalDecisionModel then no pipeline needed
276
    if method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
277
        pipeline = clf
278
279
    mse = []
280
    r2 = []
281
    precision = []
282
    recall = []
283
    f1 = []
284
    accuracy = []
285
    specificity = []    
286
287
    for cv_run in range(num_cv_splits):
288
289
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .33)
290
        pipeline.fit(X_train, Y_train)
291
        Y_predict = pipeline.predict(X_test)
292
        if is_regression:
293
            mse += [mean_squared_error(Y_test, Y_predict)]
294
            r2 += [r2_score(Y_test, Y_predict)]
295
            if show_progress:
296
                logger.info("CV Split #" + str(cv_run + 1))
297
                logger.info("\tMean Squared Error: ", mse[-1])
298
                logger.info("\tR2 Score: ", r2[-1])
299
        else:
300
            precision += [precision_score(Y_test, Y_predict)]
301
            recall += [recall_score(Y_test, Y_predict)]
302
            f1 += [f1_score(Y_test, Y_predict)]
303
            accuracy += [accuracy_score(Y_test, Y_predict)]
304
            specificity += [_specificity_score(Y_test, Y_predict)]
305
            if show_progress:
306
                logger.info("CV Split #" + str(cv_run + 1))
307
                logger.info("\tPrecision: " + str(precision[-1]))
308
                logger.info("\tRecall: " + str(recall[-1]))
309
                logger.info("\tF1 Score: " + str(f1[-1]))
310
                logger.info("\tAccuracy: " + str(accuracy[-1]))
311
                logger.info("\tSpecificity: " +  str(specificity[-1]))
312
    logger.info("\n---------------------------------------")
313
    logger.info("Overall (" + str(num_cv_splits) +  " cv cuts)")
314
    if is_regression:
315
        display_summary("Mean Squared Error", mse)
316
        display_summary("R2 Score", r2)
317
    else:
318
        display_summary("Precision", precision)
319
        display_summary("Recall", recall)
320
        display_summary("F1 Score", f1)
321
        display_summary("Accuracy", accuracy)
322
        display_summary("Specificity", specificity)
323
324
    try:
325
        column_names = features.get_feature_names()
326
        logger.info("Number of column names: " + str( len(column_names)))
327
        feature_importances = clf.coef_[0] if not method in ['boosting', 'adaboost'] else clf.feature_importances_
328
        Z = zip(column_names, feature_importances)
329
        Z.sort(key = lambda x: abs(x[1]), reverse = True)
330
        logger.info("100 biggest theta components of last CV run:")
331
        for z in Z[:min(100, len(Z))]:
332
            logger.info(str(z[1]) + "\t" + z[0])
333
    except Exception as e:
334
        logger.info("Feature name extraction failed")
335
        logger.info(e)
336
 
337
338
if __name__ == "__main__":
339
    main()
340
    #cProfile.run('main()')