|
a |
|
b/model_tester.py |
|
|
1 |
import logging |
|
|
2 |
import os |
|
|
3 |
import re |
|
|
4 |
import sys |
|
|
5 |
import json |
|
|
6 |
|
|
|
7 |
import numpy as np |
|
|
8 |
import cProfile |
|
|
9 |
from sklearn.base import TransformerMixin |
|
|
10 |
from sklearn.cross_validation import train_test_split |
|
|
11 |
from sklearn.linear_model import LogisticRegression |
|
|
12 |
from sklearn.svm import SVC |
|
|
13 |
from sklearn.tree import DecisionTreeClassifier |
|
|
14 |
from sklearn.ensemble import AdaBoostClassifier |
|
|
15 |
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, f1_score, confusion_matrix |
|
|
16 |
from sklearn.pipeline import FeatureUnion, Pipeline |
|
|
17 |
|
|
|
18 |
from extract_data import get_doc_rel_dates, get_operation_date, get_ef_values |
|
|
19 |
from extract_data import get_operation_date, is_note_doc, get_date_key |
|
|
20 |
from language_processing import parse_date |
|
|
21 |
from loader import get_data |
|
|
22 |
from decision_model import ClinicalDecisionModel |
|
|
23 |
from mix_of_exp import MixtureOfExperts |
|
|
24 |
logger = logging.getLogger("DaemonLog") |
|
|
25 |
|
|
|
26 |
def _specificity_score(y,y_hat): |
|
|
27 |
mc = confusion_matrix(y,y_hat) |
|
|
28 |
return float(mc[0,0])/np.sum(mc[0,:]) |
|
|
29 |
|
|
|
30 |
def get_preprocessed_patients(sample_size = 25, rebuild_cache=False): |
|
|
31 |
cache_file = '/PHShome/ju601/crt/data/patient_cache.json' |
|
|
32 |
|
|
|
33 |
# Build cache |
|
|
34 |
if not os.path.isfile(cache_file) or rebuild_cache: |
|
|
35 |
patients_out = [] |
|
|
36 |
delta_efs_out = [] |
|
|
37 |
patient_nums = range(906) |
|
|
38 |
for i in patient_nums: |
|
|
39 |
if i%100 == 0: |
|
|
40 |
logger.info(str(i) + '/' + str(patient_nums[-1])) |
|
|
41 |
patient_data = get_data([i])[0] |
|
|
42 |
if patient_data is not None: |
|
|
43 |
ef_delta = get_ef_delta(patient_data) |
|
|
44 |
if ef_delta is not None: |
|
|
45 |
patients_out.append(patient_data['NEW_EMPI']) |
|
|
46 |
delta_efs_out.append(ef_delta) |
|
|
47 |
with open(cache_file, 'w') as cache: |
|
|
48 |
cache_obj = { |
|
|
49 |
'patients': patients_out, |
|
|
50 |
'delta_efs': delta_efs_out |
|
|
51 |
} |
|
|
52 |
json.dump(cache_obj, cache) |
|
|
53 |
|
|
|
54 |
# Load from cache |
|
|
55 |
with open(cache_file, 'r') as f: |
|
|
56 |
cached = json.load(f) |
|
|
57 |
n = min(sample_size, len(cached['patients'])) |
|
|
58 |
return cached['patients'][:n], cached['delta_efs'][:n] |
|
|
59 |
|
|
|
60 |
|
|
|
61 |
def change_ef_values_to_categories(ef_values): |
|
|
62 |
output = [] |
|
|
63 |
|
|
|
64 |
for value in ef_values: |
|
|
65 |
if value < 5: |
|
|
66 |
output.append(1) |
|
|
67 |
else: |
|
|
68 |
output.append(0) |
|
|
69 |
''' |
|
|
70 |
if value <-2: |
|
|
71 |
output.append("reduction") |
|
|
72 |
elif value < 5: |
|
|
73 |
output.append("non-responder") |
|
|
74 |
elif value < 15: |
|
|
75 |
output.append("responder") |
|
|
76 |
else: |
|
|
77 |
output.append("super-responder") |
|
|
78 |
''' |
|
|
79 |
return output |
|
|
80 |
|
|
|
81 |
def get_ef_delta(patient_data): |
|
|
82 |
after_threshold = 12*30 #traub: changed to 12mo optimal measurement |
|
|
83 |
ef_values = get_ef_values(patient_data) |
|
|
84 |
sorted_ef = sorted(ef_values) |
|
|
85 |
before = None |
|
|
86 |
after = None |
|
|
87 |
dist_from_thresh = float('inf') |
|
|
88 |
for (rel_date, ef_value) in sorted_ef: |
|
|
89 |
if rel_date <= 0: |
|
|
90 |
before = ef_value |
|
|
91 |
else: |
|
|
92 |
dist = abs(rel_date - after_threshold) |
|
|
93 |
if dist < dist_from_thresh: |
|
|
94 |
after = ef_value |
|
|
95 |
dist_from_thresh = dist |
|
|
96 |
|
|
|
97 |
if before is not None and after is not None: |
|
|
98 |
return after - before |
|
|
99 |
else: |
|
|
100 |
return None |
|
|
101 |
|
|
|
102 |
|
|
|
103 |
class PrintTransformer(TransformerMixin): |
|
|
104 |
def fit(self, X, y=None, **fit_params): |
|
|
105 |
return self |
|
|
106 |
def transform(self, X, **transform_params): |
|
|
107 |
logger.info(X.shape) |
|
|
108 |
print X[0].shape |
|
|
109 |
return X |
|
|
110 |
|
|
|
111 |
class FeaturePipeline(Pipeline): |
|
|
112 |
|
|
|
113 |
def get_feature_names(self): |
|
|
114 |
return self.steps[-1][1].get_feature_names() |
|
|
115 |
|
|
|
116 |
|
|
|
117 |
def display_summary(name, values): |
|
|
118 |
logger.info(name) |
|
|
119 |
logger.info("\tmean:\t", 1.* sum(values) / len(values)) |
|
|
120 |
logger.info("\tmin:\t", min(values)) |
|
|
121 |
logger.info("\tmax:\t", max(values)) |
|
|
122 |
|
|
|
123 |
def get_mu_std(values): |
|
|
124 |
mu = 1. * sum(values) / len(values) |
|
|
125 |
sq_dev = [(x - mu)**2 for x in values] |
|
|
126 |
std = (sum(sq_dev) / len(values))**.5 |
|
|
127 |
return (mu, std) |
|
|
128 |
|
|
|
129 |
############################################ |
|
|
130 |
# Inputs: |
|
|
131 |
# clf: some model object with a fit(X,y) and predict(X) function |
|
|
132 |
# data_size: num patients |
|
|
133 |
# num_cv_splits: number of cv runs |
|
|
134 |
# status_file: opened status file (status_file.write("hello") should work) |
|
|
135 |
# Outputs: a dictionary with the following |
|
|
136 |
# precision_mean(_std) |
|
|
137 |
# recall_mean(_std) |
|
|
138 |
# f1_mean(_std) |
|
|
139 |
# accuracy_mean(_std) |
|
|
140 |
# important_features: a string of the most 100 important features |
|
|
141 |
############################################ |
|
|
142 |
|
|
|
143 |
def execute_test(clf, data_size, num_cv_splits): |
|
|
144 |
|
|
|
145 |
logger.info('Preprocessing...') |
|
|
146 |
X, Y = get_preprocessed_patients(data_size) |
|
|
147 |
Y = change_ef_values_to_categories(Y) |
|
|
148 |
|
|
|
149 |
logger.info(str(len(X)) + " patients in dataset") |
|
|
150 |
|
|
|
151 |
counts = {} |
|
|
152 |
for y in Y: |
|
|
153 |
if y not in counts: |
|
|
154 |
counts[y] = 0 |
|
|
155 |
counts[y] += 1 |
|
|
156 |
logger.info("Summary:") |
|
|
157 |
logger.info(counts) |
|
|
158 |
|
|
|
159 |
precision = [] |
|
|
160 |
precision_train = [] |
|
|
161 |
recall = [] |
|
|
162 |
recall_train = [] |
|
|
163 |
f1 = [] |
|
|
164 |
specificity = [] |
|
|
165 |
f1_train = [] |
|
|
166 |
accuracy = [] |
|
|
167 |
accuracy_train = [] |
|
|
168 |
|
|
|
169 |
logger.info("Beginning runs") |
|
|
170 |
|
|
|
171 |
for cv_run in range(int(num_cv_splits)): |
|
|
172 |
|
|
|
173 |
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .33) |
|
|
174 |
logger.info("fitting " + str(len(X_train)) + " patients...") |
|
|
175 |
if type(clf.steps[-1][1]) == MixtureOfExperts: |
|
|
176 |
logger.info("alex debug, mixture of experts") |
|
|
177 |
Y_real = np.zeros((len(Y_train), 2)) |
|
|
178 |
for i in range(len(Y_train)): |
|
|
179 |
Y_real[i, Y_train[i]] = 1 |
|
|
180 |
Y_train = Y_real |
|
|
181 |
|
|
|
182 |
logger.info("Fitting") |
|
|
183 |
clf.fit(X_train, Y_train) |
|
|
184 |
logger.info("Predicting on test set") |
|
|
185 |
Y_predict = clf.predict(X_test) |
|
|
186 |
logger.info("Predicting on train set") |
|
|
187 |
Y_train_predict = clf.predict(X_train) |
|
|
188 |
|
|
|
189 |
precision += [precision_score(Y_test, Y_predict)] |
|
|
190 |
precision_train += [precision_score(Y_train, Y_train_predict)] |
|
|
191 |
recall += [recall_score(Y_test, Y_predict)] |
|
|
192 |
recall_train += [recall_score(Y_train, Y_train_predict)] |
|
|
193 |
f1 += [f1_score(Y_test, Y_predict)] |
|
|
194 |
f1_train += [f1_score(Y_train, Y_train_predict)] |
|
|
195 |
accuracy += [accuracy_score(Y_test, Y_predict)] |
|
|
196 |
specificity += [_specificity_score(Y_test, Y_predict)] |
|
|
197 |
accuracy_train += [accuracy_score(Y_train, Y_train_predict)] |
|
|
198 |
|
|
|
199 |
logger.info("CV Split #" + str(cv_run + 1)) |
|
|
200 |
logger.info("\tPrecision: " + str(precision[-1])) |
|
|
201 |
logger.info("\tRecall: " + str(recall[-1])) |
|
|
202 |
logger.info("\tF1 Score: " + str(f1[-1])) |
|
|
203 |
logger.info("\tAccuracy: " + str(accuracy[-1])) |
|
|
204 |
logger.info("\tSpecificity: " + str(specificity[-1])) |
|
|
205 |
logger.info("\tTrain Precision: " + str(precision_train[-1])) |
|
|
206 |
logger.info("\tTrain Recall: " + str(recall_train[-1])) |
|
|
207 |
logger.info("\tTrain F1 Score: " + str(f1_train[-1])) |
|
|
208 |
logger.info("\tTrain Accuracy: " + str(accuracy_train[-1])) |
|
|
209 |
|
|
|
210 |
try: |
|
|
211 |
features, model = (clf.steps[0][1], clf.steps[-1][1]) |
|
|
212 |
column_names = features.get_feature_names() |
|
|
213 |
feature_importances = model.coef_[0] if not type(model) in [type(AdaBoostClassifier()), type(DecisionTreeClassifier)] else model.feature_importances_ |
|
|
214 |
if len(column_names) == len(feature_importances): |
|
|
215 |
Z = zip(column_names, feature_importances) |
|
|
216 |
Z.sort(key = lambda x: abs(x[1]), reverse = True) |
|
|
217 |
important_features = "" |
|
|
218 |
for z in Z[:min(100, len(Z))]: |
|
|
219 |
important_features += str(z[1]) + ": " + str(z[0]) + "\\n" |
|
|
220 |
except Exception as e: |
|
|
221 |
logger.error(e) |
|
|
222 |
important_features = "error" |
|
|
223 |
|
|
|
224 |
result = dict() |
|
|
225 |
result['mode'] = max([1. * x/ sum(counts.values()) for x in counts.values()]) |
|
|
226 |
result['precision_mean'], result['precision_std'] = get_mu_std(precision) |
|
|
227 |
result['recall_mean'], result['recall_std'] = get_mu_std(recall) |
|
|
228 |
result['f1_mean'], result['f1_std'] = get_mu_std(f1) |
|
|
229 |
result['accuracy_mean'], result['accuracy_std'] = get_mu_std(accuracy) |
|
|
230 |
result['train_precision_mean'], result['train_precision_std'] = get_mu_std(precision_train) |
|
|
231 |
result['train_recall_mean'], result['train_recall_std'] = get_mu_std(recall_train) |
|
|
232 |
result['train_f1_mean'], result['train_f1_std'] = get_mu_std(f1_train) |
|
|
233 |
result['train_accuracy_mean'], result['train_accuracy_std'] = get_mu_std(accuracy_train) |
|
|
234 |
result['important_features'] = important_features |
|
|
235 |
|
|
|
236 |
return result |
|
|
237 |
|
|
|
238 |
# DEPRECATED |
|
|
239 |
def test_model(features, data_size = 25, num_cv_splits = 5, method = 'logistic regression', show_progress = True, model_args = dict()): |
|
|
240 |
|
|
|
241 |
if method in ['logistic regression', 'lr', 'logitr', 'logistic']: |
|
|
242 |
is_regression = False |
|
|
243 |
clf = LogisticRegression(**model_args) |
|
|
244 |
elif method in ['svm']: |
|
|
245 |
is_regression = False |
|
|
246 |
clf = SVC(**model_args) |
|
|
247 |
elif method in ['boosting', 'adaboost']: |
|
|
248 |
is_regression = False |
|
|
249 |
clf = AdaBoostClassifier(**model_args) |
|
|
250 |
elif method in ['clinical', 'decision', 'cdm', 'clinical decision model']: |
|
|
251 |
is_regression = False |
|
|
252 |
clf = ClinicalDecisionModel() |
|
|
253 |
else: |
|
|
254 |
raise ValueError("'" + method + "' is not a supported classification method") |
|
|
255 |
|
|
|
256 |
logger.info('Preprocessing...') |
|
|
257 |
X, Y = get_preprocessed_patients(data_size) |
|
|
258 |
Y = change_ef_values_to_categories(Y) |
|
|
259 |
logger.info(str(len(X)) + " patients in dataset") |
|
|
260 |
|
|
|
261 |
if not is_regression: |
|
|
262 |
counts = {} |
|
|
263 |
for y in Y: |
|
|
264 |
if y not in counts: |
|
|
265 |
counts[y] = 0 |
|
|
266 |
counts[y] += 1 |
|
|
267 |
logger.info("Summary:") |
|
|
268 |
logger.info(counts) |
|
|
269 |
|
|
|
270 |
pipeline = Pipeline([ |
|
|
271 |
('feature_union', features), |
|
|
272 |
('Classifier', clf) |
|
|
273 |
]) |
|
|
274 |
|
|
|
275 |
#If using the ClinicalDecisionModel then no pipeline needed |
|
|
276 |
if method in ['clinical', 'decision', 'cdm', 'clinical decision model']: |
|
|
277 |
pipeline = clf |
|
|
278 |
|
|
|
279 |
mse = [] |
|
|
280 |
r2 = [] |
|
|
281 |
precision = [] |
|
|
282 |
recall = [] |
|
|
283 |
f1 = [] |
|
|
284 |
accuracy = [] |
|
|
285 |
specificity = [] |
|
|
286 |
|
|
|
287 |
for cv_run in range(num_cv_splits): |
|
|
288 |
|
|
|
289 |
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .33) |
|
|
290 |
pipeline.fit(X_train, Y_train) |
|
|
291 |
Y_predict = pipeline.predict(X_test) |
|
|
292 |
if is_regression: |
|
|
293 |
mse += [mean_squared_error(Y_test, Y_predict)] |
|
|
294 |
r2 += [r2_score(Y_test, Y_predict)] |
|
|
295 |
if show_progress: |
|
|
296 |
logger.info("CV Split #" + str(cv_run + 1)) |
|
|
297 |
logger.info("\tMean Squared Error: ", mse[-1]) |
|
|
298 |
logger.info("\tR2 Score: ", r2[-1]) |
|
|
299 |
else: |
|
|
300 |
precision += [precision_score(Y_test, Y_predict)] |
|
|
301 |
recall += [recall_score(Y_test, Y_predict)] |
|
|
302 |
f1 += [f1_score(Y_test, Y_predict)] |
|
|
303 |
accuracy += [accuracy_score(Y_test, Y_predict)] |
|
|
304 |
specificity += [_specificity_score(Y_test, Y_predict)] |
|
|
305 |
if show_progress: |
|
|
306 |
logger.info("CV Split #" + str(cv_run + 1)) |
|
|
307 |
logger.info("\tPrecision: " + str(precision[-1])) |
|
|
308 |
logger.info("\tRecall: " + str(recall[-1])) |
|
|
309 |
logger.info("\tF1 Score: " + str(f1[-1])) |
|
|
310 |
logger.info("\tAccuracy: " + str(accuracy[-1])) |
|
|
311 |
logger.info("\tSpecificity: " + str(specificity[-1])) |
|
|
312 |
logger.info("\n---------------------------------------") |
|
|
313 |
logger.info("Overall (" + str(num_cv_splits) + " cv cuts)") |
|
|
314 |
if is_regression: |
|
|
315 |
display_summary("Mean Squared Error", mse) |
|
|
316 |
display_summary("R2 Score", r2) |
|
|
317 |
else: |
|
|
318 |
display_summary("Precision", precision) |
|
|
319 |
display_summary("Recall", recall) |
|
|
320 |
display_summary("F1 Score", f1) |
|
|
321 |
display_summary("Accuracy", accuracy) |
|
|
322 |
display_summary("Specificity", specificity) |
|
|
323 |
|
|
|
324 |
try: |
|
|
325 |
column_names = features.get_feature_names() |
|
|
326 |
logger.info("Number of column names: " + str( len(column_names))) |
|
|
327 |
feature_importances = clf.coef_[0] if not method in ['boosting', 'adaboost'] else clf.feature_importances_ |
|
|
328 |
Z = zip(column_names, feature_importances) |
|
|
329 |
Z.sort(key = lambda x: abs(x[1]), reverse = True) |
|
|
330 |
logger.info("100 biggest theta components of last CV run:") |
|
|
331 |
for z in Z[:min(100, len(Z))]: |
|
|
332 |
logger.info(str(z[1]) + "\t" + z[0]) |
|
|
333 |
except Exception as e: |
|
|
334 |
logger.info("Feature name extraction failed") |
|
|
335 |
logger.info(e) |
|
|
336 |
|
|
|
337 |
|
|
|
338 |
if __name__ == "__main__": |
|
|
339 |
main() |
|
|
340 |
#cProfile.run('main()') |