[8d2107]: / model_builder.py

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from decision_model import ClinicalDecisionModel
from model_tester import FeaturePipeline
from doc2vec_transformer import Doc2Vec_Note_Transformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from baseline_transformer import (SexTransformer, GetConcatenatedNotesTransformer, GetLatestNotesTransformer,
                                  GetEncountersFeaturesTransformer, GetLabsCountsDictTransformer,
                                  GetLabsLowCountsDictTransformer, GetLabsHighCountsDictTransformer,
                                  GetLabsLatestHighDictTransformer, GetLabsLatestLowDictTransformer,
                                  GetLabsHistoryDictTransformer, GetLatestLabValuesTransformer)
from icd_transformer import ICD9_Transformer
from value_extractor_transformer import EFTransformer, LBBBTransformer, SinusRhythmTransformer, QRSTransformer, NYHATransformer, NICMTransformer
import logging
from mix_of_exp import MixtureOfExperts
from neural_network import NeuralNetwork, NeuralLogistic
logger = logging.getLogger("DaemonLog")
#This should make adding transformers easier. You could add a transformer like:
# features_add = [control_features[x] for x in ['lab_high_recent', 'car_tfidf']]
control_features = {
    'all_ef': ('all_ef', EFTransformer, {'method': 'all', 'num_horizon': 1}),
    'mean_ef': ('mean_ef', EFTransformer, {'method': 'mean', 'num_horizon': 5}),
    'max_ef': ('max_ef', EFTransformer, {'method': 'max', 'num_horizon': 5}),
    'lbbb': ('lbbb', LBBBTransformer, {'time_horizon': 30*3}),
    'sr': ('sr', SinusRhythmTransformer, {'time_horizon': 30*3}),
    'nyha': ('nyha', NYHATransformer, {'time_horizon': 30*3}),
    'nicm': ('nicm', NICMTransformer, {'time_horizon': 30*3}),
    'all_qrs': ('all_qrs', QRSTransformer, {'method': 'all', 'num_horizon': 1}),
    'icd9': ('icd9', ICD9_Transformer, {'depth': 2}),
    'sex': ('sex', SexTransformer, {}),
    'car_d2v': ('car_d2v', Doc2Vec_Note_Transformer, {'note_type': 'Car', 'model_file': '/home/ubuntu/josh_project/doc2vec_models/car_1.model', 'dbow_file': '/home/ubuntu/josh_project/doc2vec_models/car_dbow.model', 'max_notes': 5}),
    'lno_d2v': ('lno_d2v', Doc2Vec_Note_Transformer, {'note_type': 'Lno', 'model_file': '/home/ubuntu/josh_project/doc2vec_models/lno_1.model', 'dbow_file': '/home/ubuntu/josh_project/doc2vec_models/lno_dbow.model', 'max_notes': 5}),
    'car_tfidf': ('car_tfidf', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car'}),
                                                 ('tfidf_car', TfidfTransformer, {})]),
    'lno_tfidf': ('lno_tfidf', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type': 'Lno'}),
                                                 ('tfidf_lno', TfidfTransformer, {})]),
    'car_trigram': ('car_ngram', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car'}),
                                                   ('ngram_car', CountVectorizer, {'ngram_range': (3, 3), 'min_df': .05})]),
    'lno_trigram': ('lno_ngram', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type': 'Lno'}),
                                                   ('ngram_lno', CountVectorizer, {'ngram_range': (3, 3), 'min_df': .05})]),
    'car_bigram': ('car_ngram', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car', 'look_back_months': 12}),
                                                  ('ngram_car', CountVectorizer, {'ngram_range': (2, 2), 'min_df': .05})]),
    'lno_bigram': ('lno_ngram', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type': 'Lno', 'look_back_months': 12}),
                                                  ('ngram_lno', CountVectorizer, {'ngram_range': (2, 2), 'min_df': .05})]),
    'enc': ('enc', GetEncountersFeaturesTransformer, {'max_encounters': 5}),
    'lab_all': ('lab_all', FeaturePipeline, [('lab_to_dict', GetLabsCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_low': ('lab_low', FeaturePipeline, [('lab_to_dict', GetLabsLowCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_high': ('lab_high', FeaturePipeline, [('lab_to_dict', GetLabsHighCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_low_recent': ('lab_low_recent', FeaturePipeline, [('lab_to_dict', GetLabsLatestLowDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_high_recent': ('lab_high_recent', FeaturePipeline, [('lab_to_dict', GetLabsLatestHighDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_values': ('lab_values', FeaturePipeline, [('lab_to_dict', GetLatestLabValuesTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_hist': ('lab_hist', FeaturePipeline, [('lab_to_dict', GetLabsHistoryDictTransformer, {'time_thresholds_months': [1]}), ('dict_to_vect', DictVectorizer, {})]),
}
control_groups = {
    'regex': ['all_ef', 'mean_ef', 'max_ef', 'lbbb', 'sr', 'nyha', 'nicm', 'all_qrs'],
    'structured_only': ['sex', 'icd9', 'enc', 'lab_values'],
    'notes_tfidf': ['car_tfidf', 'lno_tfidf'],
    'labs': ['lab_all', 'lab_low', 'lab_high', 'lab_low_recent', 'lab_high_recent', 'lab_hist'],
    'd2v': ['car_d2v', 'lno_d2v']
}
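#A sketch of the glue between the two dicts above: control_groups holds feature
#names, and mapping them through control_features yields the (name, class, args)
#triples that build_model's features_add argument expects (see the notes before
#build_model below):
#    features_add = [control_features[x] for x in control_groups['regex'] + ['icd9']]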
#These baselines have empty feature sets, but might be useful as starting points.
adaboost_baseline = {'method': 'adaboost', 'model_args': {'n_estimators': 500}, 'features': {}}
lr_baseline = {'method': 'lr', 'model_args': {'C': 1}, 'features': {}}
nn_baseline2 = {'method': 'nn', 'model_args': {'layers': [(10, 'logistic'), (None, 'softmax')], 'obj_fun': 'maxent'}, 'features': {}}
alex_baseline = {'method': 'svm',
                 'model_args': {},
                 'features': {}
                 }
def build_alex_baseline():
    #for n in control_groups['notes_tfidf']:
    for n in ['lno_bigram', 'car_bigram']:
        alex_baseline['features'][n] = (control_features[n][1], control_features[n][2])
        nn_baseline2['features'][n] = (control_features[n][1], control_features[n][2])
    for r in control_groups['regex']:
        alex_baseline['features'][r] = (control_features[r][1], control_features[r][2])
        nn_baseline2['features'][r] = (control_features[r][1], control_features[r][2])

build_alex_baseline()
alex_ada = {'method': 'adaboost',
            'model_args': {'n_estimators': 500},
            'features': alex_baseline['features']
            }
#The variants below share alex_baseline's features dict by reference, varying only the SVM class weights.
alex_baseline_10 = {'method': 'svm', 'model_args': {'class_weight': {1: 10}}, 'features': alex_baseline['features']}
alex_baseline_100 = {'method': 'svm', 'model_args': {'class_weight': {1: 100}}, 'features': alex_baseline['features']}
alex_baseline_1000 = {'method': 'svm', 'model_args': {'class_weight': {1: 1000}}, 'features': alex_baseline['features']}
alex_baseline_01 = {'method': 'svm', 'model_args': {'class_weight': {1: .1}}, 'features': alex_baseline['features']}
alex_baseline_001 = {'method': 'svm', 'model_args': {'class_weight': {1: .01}}, 'features': alex_baseline['features']}
alex_baseline_0001 = {'method': 'svm', 'model_args': {'class_weight': {1: .001}}, 'features': alex_baseline['features']}
regex_baseline = {'method': 'adaboost',
                  'model_args': {'n_estimators': 500},
                  'features': {'all_ef': (EFTransformer, {'method': 'all', 'num_horizon': 1}),
                               'mean_ef': (EFTransformer, {'method': 'mean', 'num_horizon': 5}),
                               'max_ef': (EFTransformer, {'method': 'max', 'num_horizon': 5}),
                               'lbbb': (LBBBTransformer, {'time_horizon': 30*3}),
                               'sr': (SinusRhythmTransformer, {'time_horizon': 30*3}),
                               'nyha': (NYHATransformer, {'time_horizon': 30*3}),
                               'nicm': (NICMTransformer, {'time_horizon': 30*3}),
                               'all_qrs': (QRSTransformer, {'method': 'all', 'num_horizon': 1})
                               }
                  }
struct_baseline = {
    'method': 'adaboost',
    'model_args': {'n_estimators': 200},
    'features': {
        'lab_values': (FeaturePipeline, [('lab_to_dict', GetLatestLabValuesTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
        'enc': (GetEncountersFeaturesTransformer, {'max_encounters': 5}),
        'icd9': (ICD9_Transformer, {'depth': 2}),
        'sex': (SexTransformer, {}),
    }
}
def __add_to_baseline(base, feature_names):
    for name in feature_names:
        feature_triple = control_features[name]
        base['features'][name] = (feature_triple[1], feature_triple[2])
    return base
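#A sketch of __add_to_baseline in use: extend the regex baseline, in place, with
#every lab-count feature defined above.
#    regex_plus_labs = __add_to_baseline(regex_baseline, control_groups['labs'])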
##################
# control : the pre-loaded dict of method, model_args, and features. If any of the
#     arguments below are None, they default to the control's values.
#
# method : a string that is interpreted as a switch statement below
#
# model_args : the **kwargs dict passed to your model, e.g. AdaBoostClassifier(**model_args)
#
# features : a dictionary of transformer name : (transformer class, transformer args).
#     Each entry eventually becomes a tuple (name, class(**args)) to go into a FeatureUnion.
#     However, I have included functionality to handle FeaturePipelines. In this one case,
#     the transformer args are a list of (name, class, args) triples. For example:
#     'Car' : (FeaturePipeline, [('car_notes', CarNotesTransformer, {'horizon' : 3}), ('tfidf', TfidfTransformer, {})])
#
# ADD For features_add, you must give a triple (name, class, args). You can write this
#     yourself, or reference the dicts above:
#     features_add = [control_features[x] for x in control_groups['labs'] + ['icd9', 'lbbb']]
#
# CHANGE Besides adding features, you can also change them. I thought this was an intuitive
#     option. The simplest change just names the transformer and assigns a new value to one
#     of its args. For example:
#     features_change = {'all_ef' : {'num_horizon' : 10, 'time_horizon' : 30*4}}
#     But you can also change things inside feature pipelines by step index. For example,
#     to change the 'Car' feature above:
#     features_change = {'Car' : {0 : {'horizon' : 10}}}
#
# REMOVE Lastly, you can remove features. This is easy. No error is thrown if you remove
#     a feature that is not included.
#     features_remove = ['icd9', 'max_ef', 'lab_high']
#     features_remove = control_groups['regex'] + control_groups['labs']
################
def build_model(control, method=None, model_args=None, features=None, features_add=None, features_change=None, features_remove=None):
    #Modify method
    if method is None:
        method = control['method']
        method_diff = False
    else:
        method_diff = not method == control['method']
    #Modify model args
    if model_args is None and method_diff:
        model_args = dict()
        args_diff = True
    elif model_args is None:
        model_args = control['model_args']
        args_diff = False
    else:
        args_diff = True
    #Modify the model features
    #keep in mind that features are a dictionary of tuples, e.g. 'name': (transformer_class, transformer_args)
    feature_diff = {'+': [], '-': []}  #bookkeeping of feature changes; currently not returned
    if features is None:
        features = dict(control['features'])  #copy, so the edits below do not mutate the control
    #add features
    if features_add is not None:
        for add in features_add:
            features[add[0]] = (add[1], add[2])
        feature_diff['+'] += features_add
    #remove features
    if features_remove is not None:
        feature_diff['-'] += features_remove
        for remove in features_remove:
            if remove in features:
                features.pop(remove)
    #change features
    if features_change is not None:
        for name, new_params in features_change.items():  #not robust to handle changing FeaturePipeline
            if name in features:
                transformer, args = features[name]
                feature_diff['-'] += [(name, args)]
                args = change_args(transformer, args, new_params)  #recursive change call
                features[name] = (transformer, args)
                feature_diff['+'] += [(name, args)]
    #select classifier and build pipeline
    if method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
        is_regression = False
        clf = ClinicalDecisionModel()
        model = clf
    else:
        if method in ['logistic regression', 'lr', 'logitr', 'logistic']:
            is_regression = False
            clf = LogisticRegression(**model_args)
        elif method in ['svm']:
            is_regression = False
            clf = SVC(**model_args)
        elif method in ['boosting', 'adaboost']:
            is_regression = False
            clf = AdaBoostClassifier(**model_args)
        elif method in ['decision tree', 'dtree']:
            is_regression = False
            clf = DecisionTreeClassifier(**model_args)
        elif method in ['nn', 'neural', 'net', 'neuralnet', 'network']:
            is_regression = False
            clf = NeuralNetwork(**model_args)
        elif method in ['me', 'mixexp', 'mixture of experts']:
            is_regression = False
            print(model_args)
            clf = MixtureOfExperts(**model_args)
        else:
            raise ValueError("'" + method + "' is not a supported classification method")
        #build the transformers
        transformer_list = []
        for feature_name in features:
            transformer_list += [build_transformer(feature_name, features[feature_name])]  #recursive build call
        #assemble pipeline
        model = Pipeline([
            ('feature_union', FeatureUnion(transformer_list)),  #, n_jobs = min(2, len(transformer_list)))),
            ('Classifier', clf)
        ])
    return model
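#A usage sketch for build_model. X and y are hypothetical (a feature matrix and
#labels produced elsewhere in the project); the keyword arguments are real
#entries from the dicts above.
#    model = build_model(struct_baseline,
#                        features_add=[control_features['lbbb']],
#                        features_change={'icd9': {'depth': 3}},
#                        features_remove=['sex'])
#    model.fit(X, y)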
def build_transformer(transformer_name, transformer_values):
    if issubclass(transformer_values[0], Pipeline):
        steps = [build_transformer(step[0], (step[1], step[2])) for step in transformer_values[1]]
        transformer = transformer_values[0](steps)
    else:
        transformer = transformer_values[0](**transformer_values[1])
    return (transformer_name, transformer)
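#A sketch of the recursive case: a FeaturePipeline entry carries (name, class,
#args) step triples as its "args", and build_transformer expands each step into
#a (name, instance) pair before instantiating the pipeline itself.
#    name, transformer = build_transformer('lab_values', control_features['lab_values'][1:])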
def change_args(transformer, args, params):
    for param in params:
        if issubclass(transformer, Pipeline):
            #args is a list of (name, class, args) step triples; param indexes the step to change
            args[param] = (args[param][0], args[param][1], change_args(args[param][1], args[param][2], params[param]))
        else:
            args[param] = params[param]
    return args
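
if __name__ == '__main__':
    #A minimal sketch of change_args on a pipeline: lower the note look-back of
    #the first step inside 'car_bigram'. Note that this mutates the shared
    #control_features entry in place, just as features_change does in build_model.
    _, pipeline_cls, steps = control_features['car_bigram']
    change_args(pipeline_cls, steps, {0: {'look_back_months': 6}})
    print(steps[0])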