Diff of /model_tester.py [000000] .. [8d2107]

--- a
+++ b/model_tester.py
@@ -0,0 +1,340 @@
+import json
+import logging
+import os
+
+import numpy as np
+import cProfile
+from sklearn.base import TransformerMixin
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, f1_score, confusion_matrix
+from sklearn.pipeline import FeatureUnion, Pipeline
+
+from extract_data import get_doc_rel_dates, get_operation_date, get_ef_values
+from extract_data import is_note_doc, get_date_key
+from language_processing import parse_date
+from loader import get_data
+from decision_model import ClinicalDecisionModel
+from mix_of_exp import MixtureOfExperts
+logger = logging.getLogger("DaemonLog")
+
+def _specificity_score(y, y_hat):
+    # Specificity = TN / (TN + FP): row 0 of the confusion matrix holds the
+    # true negatives and false positives when class 0 is the negative class.
+    mc = confusion_matrix(y, y_hat)
+    return float(mc[0, 0]) / np.sum(mc[0, :])
+
+def get_preprocessed_patients(sample_size=25, rebuild_cache=False):
+    cache_file = '/PHShome/ju601/crt/data/patient_cache.json'
+    
+    # Build cache
+    if not os.path.isfile(cache_file) or rebuild_cache:
+        patients_out = []
+        delta_efs_out = []
+        patient_nums = range(906)
+        for i in patient_nums:
+            if i%100 == 0:
+                logger.info(str(i) + '/' + str(patient_nums[-1]))
+            patient_data = get_data([i])[0]
+            if patient_data is not None:
+                ef_delta = get_ef_delta(patient_data)
+                if ef_delta is not None:
+                    patients_out.append(patient_data['NEW_EMPI'])
+                    delta_efs_out.append(ef_delta)
+        with open(cache_file, 'w') as cache:
+            cache_obj = {
+                'patients': patients_out,
+                'delta_efs': delta_efs_out
+            }
+            json.dump(cache_obj, cache)
+
+    # Load from cache
+    with open(cache_file, 'r') as f:
+        cached = json.load(f)
+    n = min(sample_size, len(cached['patients']))
+    return cached['patients'][:n], cached['delta_efs'][:n]
+
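+# Example (hypothetical values): with a warm cache,
+#   patients, delta_efs = get_preprocessed_patients(sample_size=3)
+# might return (['10001', '10002', '10003'], [7.5, -2.0, 16.0]), i.e.
+# EMPI identifiers paired with each patient's change in ejection fraction.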
+
+def change_ef_values_to_categories(ef_values):
+    output = []
+
+    for value in ef_values:
+        if value < 5:
+            output.append(1)
+        else:
+            output.append(0)
+        '''
+        if value <-2:
+            output.append("reduction")
+        elif value < 5:
+            output.append("non-responder")
+        elif value < 15:
+            output.append("responder")
+        else:
+            output.append("super-responder")
+        '''
+    return output
+            
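+# For example, change_ef_values_to_categories([-3.0, 2.0, 10.0]) returns
+# [1, 1, 0]: deltas below 5 points are labeled 1 (EF improved by less than
+# 5 points, i.e. non-response), everything else 0.
+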
+def get_ef_delta(patient_data):
+    after_threshold = 12 * 30  # traub: changed to 12mo optimal measurement
+    ef_values = get_ef_values(patient_data)
+    sorted_ef = sorted(ef_values)
+    before = None
+    after = None
+    dist_from_thresh = float('inf')
+    for (rel_date, ef_value) in sorted_ef:
+        if rel_date <= 0:
+            before = ef_value
+        else:
+            dist = abs(rel_date - after_threshold)
+            if dist < dist_from_thresh:
+                after = ef_value
+                dist_from_thresh = dist
+
+    if before is not None and after is not None:
+        return after - before
+    else:
+        return None
+
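+# Example (hypothetical EF series): for relative-date/EF pairs
+# [(-30, 25.0), (180, 30.0), (360, 35.0)], the baseline is 25.0 (the last
+# reading on or before the operation date), the follow-up is 35.0 (the
+# reading closest to the 12-month mark), and the returned delta is 10.0.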
+
+class PrintTransformer(TransformerMixin):
+    """Identity transformer that logs the shape of whatever passes through."""
+    def fit(self, X, y=None, **fit_params):
+        return self
+    def transform(self, X, **transform_params):
+        logger.info(X.shape)
+        logger.info(X[0].shape)
+        return X
+
+class FeaturePipeline(Pipeline):
+
+    def get_feature_names(self):
+        return self.steps[-1][1].get_feature_names()
+
+
+def display_summary(name, values):
+    logger.info(name)
+    logger.info("\tmean:\t" + str(1. * sum(values) / len(values)))
+    logger.info("\tmin:\t" + str(min(values)))
+    logger.info("\tmax:\t" + str(max(values)))
+
+def get_mu_std(values):
+    mu = 1. * sum(values) / len(values)
+    sq_dev = [(x - mu)**2 for x in values]
+    std = (sum(sq_dev) / len(values))**.5
+    return (mu, std)
+
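+# e.g. get_mu_std([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]) == (5.0, 2.0);
+# note this is the population standard deviation (dividing by n, not n - 1).
+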
+############################################
+# Inputs:
+#       clf: a Pipeline whose last step is a classifier with fit(X, y)
+#            and predict(X) methods
+#       data_size: number of patients to load
+#       num_cv_splits: number of cross-validation runs
+# Outputs: a dictionary with the following keys
+#       mode: accuracy of always predicting the majority class
+#       precision_mean(_std), recall_mean(_std), f1_mean(_std),
+#       accuracy_mean(_std), plus train_* variants of each
+#       important_features: a string of the 100 most important features
+############################################
+
+def execute_test(clf, data_size, num_cv_splits): 
+    
+    logger.info('Preprocessing...')
+    X, Y = get_preprocessed_patients(data_size)
+    Y = change_ef_values_to_categories(Y)
+
+    logger.info(str(len(X)) + " patients in dataset")
+    
+    counts = {}
+    for y in Y:
+        if y not in counts:
+            counts[y] = 0
+        counts[y] += 1
+    logger.info("Summary:")
+    logger.info(counts)
+
+    precision = []
+    precision_train = []
+    recall = []
+    recall_train = []
+    f1 = []
+    specificity = []
+    f1_train = []
+    accuracy = []    
+    accuracy_train = []    
+
+    logger.info("Beginning runs")
+    
+    for cv_run in range(int(num_cv_splits)):
+
+        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33)
+        logger.info("fitting " + str(len(X_train)) + " patients...")
+        Y_train_labels = Y_train  # keep the 1-D labels around for train scoring
+        if isinstance(clf.steps[-1][1], MixtureOfExperts):
+            logger.info("Mixture of experts: one-hot encoding training labels")
+            Y_real = np.zeros((len(Y_train), 2))
+            for i in range(len(Y_train)):
+                Y_real[i, Y_train[i]] = 1
+            Y_train = Y_real
+
+        logger.info("Fitting")
+        clf.fit(X_train, Y_train)
+        logger.info("Predicting on test set")
+        Y_predict = clf.predict(X_test)
+        logger.info("Predicting on train set")
+        Y_train_predict = clf.predict(X_train)
+
+        precision += [precision_score(Y_test, Y_predict)]
+        precision_train += [precision_score(Y_train_labels, Y_train_predict)]
+        recall += [recall_score(Y_test, Y_predict)]
+        recall_train += [recall_score(Y_train_labels, Y_train_predict)]
+        f1 += [f1_score(Y_test, Y_predict)]
+        f1_train += [f1_score(Y_train_labels, Y_train_predict)]
+        accuracy += [accuracy_score(Y_test, Y_predict)]
+        specificity += [_specificity_score(Y_test, Y_predict)]
+        accuracy_train += [accuracy_score(Y_train_labels, Y_train_predict)]
+
+        logger.info("CV Split #" + str(cv_run + 1))
+        logger.info("\tPrecision: " + str(precision[-1]))
+        logger.info("\tRecall: " + str(recall[-1]))
+        logger.info("\tF1 Score: " + str(f1[-1]))
+        logger.info("\tAccuracy: " + str(accuracy[-1]))
+        logger.info("\tSpecificity: " + str(specificity[-1]))
+        logger.info("\tTrain Precision: " + str(precision_train[-1]))
+        logger.info("\tTrain Recall: " + str(recall_train[-1]))
+        logger.info("\tTrain F1 Score: " + str(f1_train[-1]))
+        logger.info("\tTrain Accuracy: " + str(accuracy_train[-1]))
+
+    important_features = ""  # default so the result key is always set
+    try:
+        features, model = (clf.steps[0][1], clf.steps[-1][1])
+        column_names = features.get_feature_names()
+        # Tree models expose feature_importances_; linear models expose coef_.
+        if isinstance(model, (AdaBoostClassifier, DecisionTreeClassifier)):
+            feature_importances = model.feature_importances_
+        else:
+            feature_importances = model.coef_[0]
+        if len(column_names) == len(feature_importances):
+            Z = sorted(zip(column_names, feature_importances),
+                       key=lambda x: abs(x[1]), reverse=True)
+            for z in Z[:100]:
+                important_features += str(z[1]) + ": " + str(z[0]) + "\\n"
+    except Exception as e:
+        logger.error(e)
+        important_features = "error"
+
+    result = dict()
+    # Majority-class baseline: the accuracy of always predicting the most common label.
+    result['mode'] = max([1. * x / sum(counts.values()) for x in counts.values()])
+    result['precision_mean'], result['precision_std'] = get_mu_std(precision)
+    result['recall_mean'], result['recall_std'] = get_mu_std(recall)
+    result['f1_mean'], result['f1_std'] = get_mu_std(f1)
+    result['accuracy_mean'], result['accuracy_std'] = get_mu_std(accuracy)
+    result['train_precision_mean'], result['train_precision_std'] = get_mu_std(precision_train)
+    result['train_recall_mean'], result['train_recall_std'] = get_mu_std(recall_train)
+    result['train_f1_mean'], result['train_f1_std'] = get_mu_std(f1_train)
+    result['train_accuracy_mean'], result['train_accuracy_std'] = get_mu_std(accuracy_train)
+    result['important_features'] = important_features
+     
+    return result    
+
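+# A minimal usage sketch for execute_test (build_features() is a
+# hypothetical helper standing in for whatever FeatureUnion of transformers
+# an experiment uses; any pipeline whose first step exposes
+# get_feature_names() and whose last step is the classifier will work):
+#
+#   clf = Pipeline([
+#       ('feature_union', build_features()),
+#       ('classifier', LogisticRegression()),
+#   ])
+#   results = execute_test(clf, data_size=100, num_cv_splits=5)
+#   logger.info("accuracy: " + str(results['accuracy_mean']))
+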
+# DEPRECATED
+def test_model(features, data_size=25, num_cv_splits=5, method='logistic regression', show_progress=True, model_args=dict()):
+
+    if method in ['logistic regression', 'lr', 'logitr', 'logistic']:
+        is_regression = False
+        clf = LogisticRegression(**model_args)
+    elif method in ['svm']:
+        is_regression = False
+        clf = SVC(**model_args)
+    elif method in ['boosting', 'adaboost']:
+        is_regression = False
+        clf = AdaBoostClassifier(**model_args)
+    elif method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
+        is_regression = False
+        clf = ClinicalDecisionModel()
+    else:
+        raise ValueError("'" + method + "' is not a supported classification method")
+
+    logger.info('Preprocessing...')
+    X, Y = get_preprocessed_patients(data_size)
+    Y = change_ef_values_to_categories(Y)
+    logger.info(str(len(X)) + " patients in dataset")
+    
+    if not is_regression:
+        counts = {}
+        for y in Y:
+            if y not in counts:
+                counts[y] = 0
+            counts[y] += 1
+        logger.info("Summary:")
+        logger.info(counts)
+    
+    pipeline = Pipeline([
+        ('feature_union', features),
+        ('Classifier', clf)
+    ])
+
+    # If using the ClinicalDecisionModel, no pipeline is needed
+    if method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
+        pipeline = clf
+
+    mse = []
+    r2 = []
+    precision = []
+    recall = []
+    f1 = []
+    accuracy = []
+    specificity = []    
+
+    for cv_run in range(num_cv_splits):
+
+        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .33)
+        pipeline.fit(X_train, Y_train)
+        Y_predict = pipeline.predict(X_test)
+        if is_regression:
+            mse += [mean_squared_error(Y_test, Y_predict)]
+            r2 += [r2_score(Y_test, Y_predict)]
+            if show_progress:
+                logger.info("CV Split #" + str(cv_run + 1))
+                logger.info("\tMean Squared Error: ", mse[-1])
+                logger.info("\tR2 Score: ", r2[-1])
+        else:
+            precision += [precision_score(Y_test, Y_predict)]
+            recall += [recall_score(Y_test, Y_predict)]
+            f1 += [f1_score(Y_test, Y_predict)]
+            accuracy += [accuracy_score(Y_test, Y_predict)]
+            specificity += [_specificity_score(Y_test, Y_predict)]
+            if show_progress:
+                logger.info("CV Split #" + str(cv_run + 1))
+                logger.info("\tPrecision: " + str(precision[-1]))
+                logger.info("\tRecall: " + str(recall[-1]))
+                logger.info("\tF1 Score: " + str(f1[-1]))
+                logger.info("\tAccuracy: " + str(accuracy[-1]))
+                logger.info("\tSpecificity: " +  str(specificity[-1]))
+    logger.info("\n---------------------------------------")
+    logger.info("Overall (" + str(num_cv_splits) +  " cv cuts)")
+    if is_regression:
+        display_summary("Mean Squared Error", mse)
+        display_summary("R2 Score", r2)
+    else:
+        display_summary("Precision", precision)
+        display_summary("Recall", recall)
+        display_summary("F1 Score", f1)
+        display_summary("Accuracy", accuracy)
+        display_summary("Specificity", specificity)
+
+    try:
+        column_names = features.get_feature_names()
+        logger.info("Number of column names: " + str( len(column_names)))
+        feature_importances = clf.coef_[0] if method not in ['boosting', 'adaboost'] else clf.feature_importances_
+        Z = sorted(zip(column_names, feature_importances), key=lambda x: abs(x[1]), reverse=True)
+        logger.info("100 biggest theta components of last CV run:")
+        for z in Z[:100]:
+            logger.info(str(z[1]) + "\t" + z[0])
+    except Exception as e:
+        logger.info("Feature name extraction failed")
+        logger.info(e)
+ 
+
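+# A minimal driver (an assumption; the original main() is not part of this
+# file): evaluate the ClinicalDecisionModel end to end, wrapped in a
+# Pipeline because execute_test inspects clf.steps.
+def main():
+    logging.basicConfig(level=logging.INFO)
+    clf = Pipeline([('classifier', ClinicalDecisionModel())])
+    logger.info(execute_test(clf, data_size=25, num_cv_splits=5))
+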
+if __name__ == "__main__":
+    main()
+    #cProfile.run('main()')