import logging
import os
import re
import sys
import json
import numpy as np
import cProfile
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (accuracy_score, mean_squared_error, r2_score,
                             precision_score, recall_score, f1_score,
                             confusion_matrix)
from sklearn.pipeline import FeatureUnion, Pipeline
from extract_data import get_doc_rel_dates, get_operation_date, get_ef_values
from extract_data import is_note_doc, get_date_key
from language_processing import parse_date
from loader import get_data
from decision_model import ClinicalDecisionModel
from mix_of_exp import MixtureOfExperts
logger = logging.getLogger("DaemonLog")
def _specificity_score(y, y_hat):
    # Specificity = TN / (TN + FP): row 0 of the confusion matrix holds
    # the true negatives and false positives for the negative class.
    mc = confusion_matrix(y, y_hat)
    return float(mc[0, 0]) / np.sum(mc[0, :])

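# Quick sanity check (hypothetical values): with y = [0, 0, 1, 1] and
# y_hat = [0, 1, 1, 1], the confusion matrix row for class 0 is [1, 1],
# so specificity = TN / (TN + FP) = 1 / 2 = 0.5.
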
def get_preprocessed_patients(sample_size=25, rebuild_cache=False):
    cache_file = '/PHShome/ju601/crt/data/patient_cache.json'
    # Build cache
    if not os.path.isfile(cache_file) or rebuild_cache:
        patients_out = []
        delta_efs_out = []
        patient_nums = range(906)
        for i in patient_nums:
            if i % 100 == 0:
                logger.info(str(i) + '/' + str(patient_nums[-1]))
            patient_data = get_data([i])[0]
            if patient_data is not None:
                ef_delta = get_ef_delta(patient_data)
                if ef_delta is not None:
                    patients_out.append(patient_data['NEW_EMPI'])
                    delta_efs_out.append(ef_delta)
        with open(cache_file, 'w') as cache:
            cache_obj = {
                'patients': patients_out,
                'delta_efs': delta_efs_out
            }
            json.dump(cache_obj, cache)
    # Load from cache
    with open(cache_file, 'r') as f:
        cached = json.load(f)
    n = min(sample_size, len(cached['patients']))
    return cached['patients'][:n], cached['delta_efs'][:n]

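# Example usage (hypothetical sample size): the first call builds the JSON
# cache on disk; later calls read from it unless rebuild_cache=True.
#   patients, delta_efs = get_preprocessed_patients(sample_size=100)
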
def change_ef_values_to_categories(ef_values):
    # Binary labels: 1 = non-responder (delta EF < 5), 0 = responder.
    output = []
    for value in ef_values:
        if value < 5:
            output.append(1)
        else:
            output.append(0)
        # Earlier four-way labeling, kept for reference:
        # if value < -2:
        #     output.append("reduction")
        # elif value < 5:
        #     output.append("non-responder")
        # elif value < 15:
        #     output.append("responder")
        # else:
        #     output.append("super-responder")
    return output

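# Example (hypothetical values): change_ef_values_to_categories([-3, 4, 10])
# returns [1, 1, 0], since both -3 and 4 fall below the 5-point cutoff.
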
def get_ef_delta(patient_data):
    after_threshold = 12 * 30  # traub: changed to 12mo optimal measurement
    ef_values = get_ef_values(patient_data)
    sorted_ef = sorted(ef_values)
    before = None
    after = None
    dist_from_thresh = float('inf')
    for (rel_date, ef_value) in sorted_ef:
        if rel_date <= 0:
            # Last EF reading on or before the operation date
            before = ef_value
        else:
            # Post-op reading closest to the 12-month threshold
            dist = abs(rel_date - after_threshold)
            if dist < dist_from_thresh:
                after = ef_value
                dist_from_thresh = dist
    if before is not None and after is not None:
        return after - before
    else:
        return None

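# Worked example (hypothetical readings): EF measurements at relative days
# -30 (EF 25), +400 (EF 35), and +700 (EF 35) give delta = 35 - 25 = 10;
# the +400 reading is used because |400 - 360| < |700 - 360|.
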
class PrintTransformer(TransformerMixin):
    """Pass-through transformer that logs the shape of X for debugging."""
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X, **transform_params):
        logger.info(X.shape)
        print(X[0].shape)
        return X

class FeaturePipeline(Pipeline):
    def get_feature_names(self):
        return self.steps[-1][1].get_feature_names()

def display_summary(name, values):
    logger.info(name)
    logger.info("\tmean:\t" + str(1. * sum(values) / len(values)))
    logger.info("\tmin:\t" + str(min(values)))
    logger.info("\tmax:\t" + str(max(values)))

def get_mu_std(values):
    # Population mean and standard deviation (dividing by N, not N - 1)
    mu = 1. * sum(values) / len(values)
    sq_dev = [(x - mu) ** 2 for x in values]
    std = (sum(sq_dev) / len(values)) ** .5
    return (mu, std)

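# Example (hypothetical values): get_mu_std([1.0, 2.0, 3.0]) returns
# (2.0, 0.816...), i.e. the population standard deviation, matching
# np.std(values) rather than np.std(values, ddof=1).
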
############################################
# Inputs:
#   clf: a model object with fit(X, y) and predict(X) methods
#   data_size: number of patients
#   num_cv_splits: number of cross-validation runs
#   status_file: opened status file (status_file.write("hello") should work;
#                documented here but not currently a parameter of execute_test)
# Outputs: a dictionary with the following
#   precision_mean(_std)
#   recall_mean(_std)
#   f1_mean(_std)
#   accuracy_mean(_std)
#   important_features: a string of the 100 most important features
# See the usage sketch after this function.
############################################
def execute_test(clf, data_size, num_cv_splits):
    logger.info('Preprocessing...')
    X, Y = get_preprocessed_patients(data_size)
    Y = change_ef_values_to_categories(Y)
    logger.info(str(len(X)) + " patients in dataset")
    # Class distribution summary
    counts = {}
    for y in Y:
        if y not in counts:
            counts[y] = 0
        counts[y] += 1
    logger.info("Summary:")
    logger.info(counts)
    precision = []
    precision_train = []
    recall = []
    recall_train = []
    f1 = []
    specificity = []
    f1_train = []
    accuracy = []
    accuracy_train = []
    logger.info("Beginning runs")
    for cv_run in range(int(num_cv_splits)):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33)
        logger.info("fitting " + str(len(X_train)) + " patients...")
        if isinstance(clf.steps[-1][1], MixtureOfExperts):
            # MixtureOfExperts expects one-hot targets; note that Y_train
            # remains one-hot for the train metrics below in this case.
            logger.info("alex debug, mixture of experts")
            Y_real = np.zeros((len(Y_train), 2))
            for i in range(len(Y_train)):
                Y_real[i, Y_train[i]] = 1
            Y_train = Y_real
        logger.info("Fitting")
        clf.fit(X_train, Y_train)
        logger.info("Predicting on test set")
        Y_predict = clf.predict(X_test)
        logger.info("Predicting on train set")
        Y_train_predict = clf.predict(X_train)
        precision += [precision_score(Y_test, Y_predict)]
        precision_train += [precision_score(Y_train, Y_train_predict)]
        recall += [recall_score(Y_test, Y_predict)]
        recall_train += [recall_score(Y_train, Y_train_predict)]
        f1 += [f1_score(Y_test, Y_predict)]
        f1_train += [f1_score(Y_train, Y_train_predict)]
        accuracy += [accuracy_score(Y_test, Y_predict)]
        specificity += [_specificity_score(Y_test, Y_predict)]
        accuracy_train += [accuracy_score(Y_train, Y_train_predict)]
        logger.info("CV Split #" + str(cv_run + 1))
        logger.info("\tPrecision: " + str(precision[-1]))
        logger.info("\tRecall: " + str(recall[-1]))
        logger.info("\tF1 Score: " + str(f1[-1]))
        logger.info("\tAccuracy: " + str(accuracy[-1]))
        logger.info("\tSpecificity: " + str(specificity[-1]))
        logger.info("\tTrain Precision: " + str(precision_train[-1]))
        logger.info("\tTrain Recall: " + str(recall_train[-1]))
        logger.info("\tTrain F1 Score: " + str(f1_train[-1]))
        logger.info("\tTrain Accuracy: " + str(accuracy_train[-1]))
    important_features = ""
    try:
        features, model = (clf.steps[0][1], clf.steps[-1][1])
        column_names = features.get_feature_names()
        # Tree ensembles expose feature_importances_; linear models expose coef_
        if isinstance(model, (AdaBoostClassifier, DecisionTreeClassifier)):
            feature_importances = model.feature_importances_
        else:
            feature_importances = model.coef_[0]
        if len(column_names) == len(feature_importances):
            Z = sorted(zip(column_names, feature_importances),
                       key=lambda x: abs(x[1]), reverse=True)
            for z in Z[:min(100, len(Z))]:
                important_features += str(z[1]) + ": " + str(z[0]) + "\n"
    except Exception as e:
        logger.error(e)
        important_features = "error"
    result = dict()
    result['mode'] = max([1. * x / sum(counts.values()) for x in counts.values()])
    result['precision_mean'], result['precision_std'] = get_mu_std(precision)
    result['recall_mean'], result['recall_std'] = get_mu_std(recall)
    result['f1_mean'], result['f1_std'] = get_mu_std(f1)
    result['accuracy_mean'], result['accuracy_std'] = get_mu_std(accuracy)
    result['train_precision_mean'], result['train_precision_std'] = get_mu_std(precision_train)
    result['train_recall_mean'], result['train_recall_std'] = get_mu_std(recall_train)
    result['train_f1_mean'], result['train_f1_std'] = get_mu_std(f1_train)
    result['train_accuracy_mean'], result['train_accuracy_std'] = get_mu_std(accuracy_train)
    result['important_features'] = important_features
    return result

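# Minimal usage sketch for execute_test (hypothetical feature union and
# sizes; the real feature extraction steps live elsewhere in this project):
#
#   clf = Pipeline([
#       ('feature_union', FeatureUnion([...])),
#       ('Classifier', LogisticRegression()),
#   ])
#   result = execute_test(clf, data_size=100, num_cv_splits=5)
#   logger.info(result['f1_mean'])
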
# DEPRECATED
def test_model(features, data_size=25, num_cv_splits=5, method='logistic regression', show_progress=True, model_args=dict()):
    if method in ['logistic regression', 'lr', 'logitr', 'logistic']:
        is_regression = False
        clf = LogisticRegression(**model_args)
    elif method in ['svm']:
        is_regression = False
        clf = SVC(**model_args)
    elif method in ['boosting', 'adaboost']:
        is_regression = False
        clf = AdaBoostClassifier(**model_args)
    elif method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
        is_regression = False
        clf = ClinicalDecisionModel()
    else:
        raise ValueError("'" + method + "' is not a supported classification method")
    logger.info('Preprocessing...')
    X, Y = get_preprocessed_patients(data_size)
    Y = change_ef_values_to_categories(Y)
    logger.info(str(len(X)) + " patients in dataset")
    if not is_regression:
        counts = {}
        for y in Y:
            if y not in counts:
                counts[y] = 0
            counts[y] += 1
        logger.info("Summary:")
        logger.info(counts)
    pipeline = Pipeline([
        ('feature_union', features),
        ('Classifier', clf)
    ])
    # If using the ClinicalDecisionModel then no pipeline is needed
    if method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
        pipeline = clf
    # is_regression is always False for the supported methods; the
    # regression branches below are kept for completeness.
    mse = []
    r2 = []
    precision = []
    recall = []
    f1 = []
    accuracy = []
    specificity = []
    for cv_run in range(num_cv_splits):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33)
        pipeline.fit(X_train, Y_train)
        Y_predict = pipeline.predict(X_test)
        if is_regression:
            mse += [mean_squared_error(Y_test, Y_predict)]
            r2 += [r2_score(Y_test, Y_predict)]
            if show_progress:
                logger.info("CV Split #" + str(cv_run + 1))
                logger.info("\tMean Squared Error: " + str(mse[-1]))
                logger.info("\tR2 Score: " + str(r2[-1]))
        else:
            precision += [precision_score(Y_test, Y_predict)]
            recall += [recall_score(Y_test, Y_predict)]
            f1 += [f1_score(Y_test, Y_predict)]
            accuracy += [accuracy_score(Y_test, Y_predict)]
            specificity += [_specificity_score(Y_test, Y_predict)]
            if show_progress:
                logger.info("CV Split #" + str(cv_run + 1))
                logger.info("\tPrecision: " + str(precision[-1]))
                logger.info("\tRecall: " + str(recall[-1]))
                logger.info("\tF1 Score: " + str(f1[-1]))
                logger.info("\tAccuracy: " + str(accuracy[-1]))
                logger.info("\tSpecificity: " + str(specificity[-1]))
    logger.info("\n---------------------------------------")
    logger.info("Overall (" + str(num_cv_splits) + " cv cuts)")
    if is_regression:
        display_summary("Mean Squared Error", mse)
        display_summary("R2 Score", r2)
    else:
        display_summary("Precision", precision)
        display_summary("Recall", recall)
        display_summary("F1 Score", f1)
        display_summary("Accuracy", accuracy)
        display_summary("Specificity", specificity)
    try:
        column_names = features.get_feature_names()
        logger.info("Number of column names: " + str(len(column_names)))
        feature_importances = clf.coef_[0] if method not in ['boosting', 'adaboost'] else clf.feature_importances_
        Z = sorted(zip(column_names, feature_importances), key=lambda x: abs(x[1]), reverse=True)
        logger.info("100 biggest theta components of last CV run:")
        for z in Z[:min(100, len(Z))]:
            logger.info(str(z[1]) + "\t" + z[0])
    except Exception as e:
        logger.info("Feature name extraction failed")
        logger.info(e)

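# `main()` is invoked below but was never defined in this module. The
# following is a hypothetical minimal entry point (an assumption, not the
# original authors' code): it runs execute_test with a bare
# LogisticRegression; a real run would put the project's feature
# extractors ahead of the classifier in the Pipeline.
def main():
    logging.basicConfig(level=logging.INFO)
    clf = Pipeline([
        ('Classifier', LogisticRegression()),
    ])
    result = execute_test(clf, data_size=25, num_cv_splits=5)
    logger.info(result)
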
if __name__ == "__main__":
    main()
    # cProfile.run('main()')