--- a/model_tester.py
+++ b/model_tester.py
@@ -0,0 +1,340 @@
+import logging
+import os
+import re
+import sys
+import json
+
+import numpy as np
+import cProfile
+from sklearn.base import TransformerMixin
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, f1_score, confusion_matrix
+from sklearn.pipeline import FeatureUnion, Pipeline
+
+from extract_data import get_doc_rel_dates, get_operation_date, get_ef_values
+from extract_data import is_note_doc, get_date_key
+from language_processing import parse_date
+from loader import get_data
+from decision_model import ClinicalDecisionModel
+from mix_of_exp import MixtureOfExperts
+
+logger = logging.getLogger("DaemonLog")
+
+def _specificity_score(y, y_hat):
+    # Specificity = TN / (TN + FP), i.e. row 0 of the confusion matrix.
+    mc = confusion_matrix(y, y_hat)
+    return float(mc[0, 0]) / np.sum(mc[0, :])
+
+def get_preprocessed_patients(sample_size=25, rebuild_cache=False):
+    cache_file = '/PHShome/ju601/crt/data/patient_cache.json'
+
+    # Build cache
+    if not os.path.isfile(cache_file) or rebuild_cache:
+        patients_out = []
+        delta_efs_out = []
+        patient_nums = range(906)
+        for i in patient_nums:
+            if i % 100 == 0:
+                logger.info(str(i) + '/' + str(patient_nums[-1]))
+            patient_data = get_data([i])[0]
+            if patient_data is not None:
+                ef_delta = get_ef_delta(patient_data)
+                if ef_delta is not None:
+                    patients_out.append(patient_data['NEW_EMPI'])
+                    delta_efs_out.append(ef_delta)
+        with open(cache_file, 'w') as cache:
+            cache_obj = {
+                'patients': patients_out,
+                'delta_efs': delta_efs_out
+            }
+            json.dump(cache_obj, cache)
+
+    # Load from cache
+    with open(cache_file, 'r') as f:
+        cached = json.load(f)
+    n = min(sample_size, len(cached['patients']))
+    return cached['patients'][:n], cached['delta_efs'][:n]
+
+
+def change_ef_values_to_categories(ef_values):
+    # Binary labels: 1 if the EF delta is below 5 points, else 0.
+    output = []
+
+    for value in ef_values:
+        if value < 5:
+            output.append(1)
+        else:
+            output.append(0)
+    '''
+    if value < -2:
+        output.append("reduction")
+    elif value < 5:
+        output.append("non-responder")
+    elif value < 15:
+        output.append("responder")
+    else:
+        output.append("super-responder")
+    '''
+    return output
+
+def get_ef_delta(patient_data):
+    after_threshold = 12 * 30  # traub: changed to 12mo optimal measurement
+    ef_values = get_ef_values(patient_data)
+    sorted_ef = sorted(ef_values)
+    before = None
+    after = None
+    dist_from_thresh = float('inf')
+    for (rel_date, ef_value) in sorted_ef:
+        if rel_date <= 0:
+            before = ef_value
+        else:
+            dist = abs(rel_date - after_threshold)
+            if dist < dist_from_thresh:
+                after = ef_value
+                dist_from_thresh = dist
+
+    if before is not None and after is not None:
+        return after - before
+    else:
+        return None
+
+
+class PrintTransformer(TransformerMixin):
+    # Debugging pass-through transformer: logs the shape of the data.
+    def fit(self, X, y=None, **fit_params):
+        return self
+    def transform(self, X, **transform_params):
+        logger.info(X.shape)
+        print(X[0].shape)
+        return X
+
+class FeaturePipeline(Pipeline):
+
+    def get_feature_names(self):
+        return self.steps[-1][1].get_feature_names()
+
+
+def display_summary(name, values):
+    logger.info(name)
+    logger.info("\tmean:\t" + str(1. * sum(values) / len(values)))
+    logger.info("\tmin:\t" + str(min(values)))
+    logger.info("\tmax:\t" + str(max(values)))
+
+def get_mu_std(values):
+    # Population mean and standard deviation.
+    mu = 1. * sum(values) / len(values)
+    sq_dev = [(x - mu)**2 for x in values]
+    std = (sum(sq_dev) / len(values))**.5
+    return (mu, std)
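+
+# Example (illustrative values; note this is the population, not the
+# sample, standard deviation):
+#   >>> get_mu_std([1.0, 2.0, 3.0])
+#   (2.0, 0.816496580927726)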
+
+############################################
+# Inputs:
+#   clf: a model object with fit(X, y) and predict(X) methods
+#   data_size: number of patients to load
+#   num_cv_splits: number of cross-validation runs
+# Outputs: a dictionary with the following keys
+#   precision_mean(_std)
+#   recall_mean(_std)
+#   f1_mean(_std)
+#   accuracy_mean(_std)
+#   train_* variants of the above
+#   mode: majority-class baseline (proportion of the most common label)
+#   important_features: a string of the 100 most important features
+############################################
+
+def execute_test(clf, data_size, num_cv_splits):
+
+    logger.info('Preprocessing...')
+    X, Y = get_preprocessed_patients(data_size)
+    Y = change_ef_values_to_categories(Y)
+
+    logger.info(str(len(X)) + " patients in dataset")
+
+    counts = {}
+    for y in Y:
+        if y not in counts:
+            counts[y] = 0
+        counts[y] += 1
+    logger.info("Summary:")
+    logger.info(counts)
+
+    precision = []
+    precision_train = []
+    recall = []
+    recall_train = []
+    f1 = []
+    specificity = []
+    f1_train = []
+    accuracy = []
+    accuracy_train = []
+
+    logger.info("Beginning runs")
+
+    for cv_run in range(int(num_cv_splits)):
+
+        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33)
+        logger.info("fitting " + str(len(X_train)) + " patients...")
+
+        # MixtureOfExperts expects one-hot targets for fit; the metrics
+        # below still compare against the original integer labels.
+        Y_fit = Y_train
+        if isinstance(clf.steps[-1][1], MixtureOfExperts):
+            logger.info("converting labels to one-hot for MixtureOfExperts")
+            Y_fit = np.zeros((len(Y_train), 2))
+            for i in range(len(Y_train)):
+                Y_fit[i, Y_train[i]] = 1
+
+        logger.info("Fitting")
+        clf.fit(X_train, Y_fit)
+        logger.info("Predicting on test set")
+        Y_predict = clf.predict(X_test)
+        logger.info("Predicting on train set")
+        Y_train_predict = clf.predict(X_train)
+
+        precision += [precision_score(Y_test, Y_predict)]
+        precision_train += [precision_score(Y_train, Y_train_predict)]
+        recall += [recall_score(Y_test, Y_predict)]
+        recall_train += [recall_score(Y_train, Y_train_predict)]
+        f1 += [f1_score(Y_test, Y_predict)]
+        f1_train += [f1_score(Y_train, Y_train_predict)]
+        accuracy += [accuracy_score(Y_test, Y_predict)]
+        specificity += [_specificity_score(Y_test, Y_predict)]
+        accuracy_train += [accuracy_score(Y_train, Y_train_predict)]
+
+        logger.info("CV Split #" + str(cv_run + 1))
+        logger.info("\tPrecision: " + str(precision[-1]))
+        logger.info("\tRecall: " + str(recall[-1]))
+        logger.info("\tF1 Score: " + str(f1[-1]))
+        logger.info("\tAccuracy: " + str(accuracy[-1]))
+        logger.info("\tSpecificity: " + str(specificity[-1]))
+        logger.info("\tTrain Precision: " + str(precision_train[-1]))
+        logger.info("\tTrain Recall: " + str(recall_train[-1]))
+        logger.info("\tTrain F1 Score: " + str(f1_train[-1]))
+        logger.info("\tTrain Accuracy: " + str(accuracy_train[-1]))
+
+    important_features = ""
+    try:
+        features, model = (clf.steps[0][1], clf.steps[-1][1])
+        column_names = features.get_feature_names()
+        # Tree-based models expose feature_importances_; linear models expose coef_.
+        if isinstance(model, (AdaBoostClassifier, DecisionTreeClassifier)):
+            feature_importances = model.feature_importances_
+        else:
+            feature_importances = model.coef_[0]
+        if len(column_names) == len(feature_importances):
+            Z = sorted(zip(column_names, feature_importances),
+                       key=lambda x: abs(x[1]), reverse=True)
+            for z in Z[:min(100, len(Z))]:
+                important_features += str(z[1]) + ": " + str(z[0]) + "\n"
+    except Exception as e:
+        logger.error(e)
+        important_features = "error"
+
+    result = dict()
+    # Majority-class baseline: proportion of the most common label.
+    result['mode'] = max([1. * x / sum(counts.values()) for x in counts.values()])
+    result['precision_mean'], result['precision_std'] = get_mu_std(precision)
+    result['recall_mean'], result['recall_std'] = get_mu_std(recall)
+    result['f1_mean'], result['f1_std'] = get_mu_std(f1)
+    result['accuracy_mean'], result['accuracy_std'] = get_mu_std(accuracy)
+    result['train_precision_mean'], result['train_precision_std'] = get_mu_std(precision_train)
+    result['train_recall_mean'], result['train_recall_std'] = get_mu_std(recall_train)
+    result['train_f1_mean'], result['train_f1_std'] = get_mu_std(f1_train)
+    result['train_accuracy_mean'], result['train_accuracy_std'] = get_mu_std(accuracy_train)
+    result['important_features'] = important_features
+
+    return result
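+
+# Usage sketch (hypothetical: 'note_features' stands in for the project's
+# real FeatureUnion of note extractors; execute_test expects a Pipeline
+# whose last step is the classifier):
+#
+#   clf = Pipeline([('feature_union', note_features),
+#                   ('Classifier', LogisticRegression())])
+#   result = execute_test(clf, data_size=100, num_cv_splits=5)
+#   logger.info(result['f1_mean'])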
+
+# DEPRECATED: use execute_test instead.
+def test_model(features, data_size=25, num_cv_splits=5, method='logistic regression', show_progress=True, model_args=dict()):
+
+    if method in ['logistic regression', 'lr', 'logitr', 'logistic']:
+        is_regression = False
+        clf = LogisticRegression(**model_args)
+    elif method in ['svm']:
+        is_regression = False
+        clf = SVC(**model_args)
+    elif method in ['boosting', 'adaboost']:
+        is_regression = False
+        clf = AdaBoostClassifier(**model_args)
+    elif method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
+        is_regression = False
+        clf = ClinicalDecisionModel()
+    else:
+        raise ValueError("'" + method + "' is not a supported classification method")
+
+    logger.info('Preprocessing...')
+    X, Y = get_preprocessed_patients(data_size)
+    Y = change_ef_values_to_categories(Y)
+    logger.info(str(len(X)) + " patients in dataset")
+
+    if not is_regression:
+        counts = {}
+        for y in Y:
+            if y not in counts:
+                counts[y] = 0
+            counts[y] += 1
+        logger.info("Summary:")
+        logger.info(counts)
+
+    pipeline = Pipeline([
+        ('feature_union', features),
+        ('Classifier', clf)
+    ])
+
+    # If using the ClinicalDecisionModel, no feature pipeline is needed
+    if method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
+        pipeline = clf
+
+    mse = []
+    r2 = []
+    precision = []
+    recall = []
+    f1 = []
+    accuracy = []
+    specificity = []
+
+    for cv_run in range(num_cv_splits):
+
+        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33)
+        pipeline.fit(X_train, Y_train)
+        Y_predict = pipeline.predict(X_test)
+        if is_regression:
+            mse += [mean_squared_error(Y_test, Y_predict)]
+            r2 += [r2_score(Y_test, Y_predict)]
+            if show_progress:
+                logger.info("CV Split #" + str(cv_run + 1))
+                logger.info("\tMean Squared Error: " + str(mse[-1]))
+                logger.info("\tR2 Score: " + str(r2[-1]))
+        else:
+            precision += [precision_score(Y_test, Y_predict)]
+            recall += [recall_score(Y_test, Y_predict)]
+            f1 += [f1_score(Y_test, Y_predict)]
+            accuracy += [accuracy_score(Y_test, Y_predict)]
+            specificity += [_specificity_score(Y_test, Y_predict)]
+            if show_progress:
+                logger.info("CV Split #" + str(cv_run + 1))
+                logger.info("\tPrecision: " + str(precision[-1]))
+                logger.info("\tRecall: " + str(recall[-1]))
+                logger.info("\tF1 Score: " + str(f1[-1]))
+                logger.info("\tAccuracy: " + str(accuracy[-1]))
+                logger.info("\tSpecificity: " + str(specificity[-1]))
+
+    logger.info("\n---------------------------------------")
+    logger.info("Overall (" + str(num_cv_splits) + " cv splits)")
+    if is_regression:
+        display_summary("Mean Squared Error", mse)
+        display_summary("R2 Score", r2)
+    else:
+        display_summary("Precision", precision)
+        display_summary("Recall", recall)
+        display_summary("F1 Score", f1)
+        display_summary("Accuracy", accuracy)
+        display_summary("Specificity", specificity)
+
+    try:
+        column_names = features.get_feature_names()
+        logger.info("Number of column names: " + str(len(column_names)))
+        # Boosted models expose feature_importances_; linear models expose coef_.
+        feature_importances = clf.feature_importances_ if method in ['boosting', 'adaboost'] else clf.coef_[0]
+        Z = sorted(zip(column_names, feature_importances),
+                   key=lambda x: abs(x[1]), reverse=True)
+        logger.info("100 biggest theta components of last CV run:")
+        for z in Z[:min(100, len(Z))]:
+            logger.info(str(z[1]) + "\t" + z[0])
+    except Exception as e:
+        logger.info("Feature name extraction failed")
+        logger.info(e)
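+
+
+# Minimal driver sketch for the __main__ guard below, which calls a main()
+# that this module does not otherwise define. The FeatureUnion contents are
+# placeholders (assumptions); substitute the project's real note extractors.
+def main():
+    logging.basicConfig(level=logging.INFO)
+    features = FeatureUnion([
+        # ('feature_name', SomeTransformer()),  # fill in real extractors
+    ])
+    clf = Pipeline([
+        ('feature_union', features),
+        ('Classifier', LogisticRegression())
+    ])
+    logger.info(execute_test(clf, data_size=25, num_cv_splits=5))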
+
+
+if __name__ == "__main__":
+    main()
+    # cProfile.run('main()')