Diff of /model_builder.py [000000] .. [8d2107]

Switch to unified view

a b/model_builder.py
1
from sklearn.linear_model import LogisticRegression
2
from sklearn.svm import SVC
3
from sklearn.tree import DecisionTreeClassifier
4
from sklearn.ensemble import AdaBoostClassifier
5
from sklearn.pipeline import FeatureUnion, Pipeline
6
from decision_model import ClinicalDecisionModel
7
from model_tester import FeaturePipeline
8
from doc2vec_transformer import Doc2Vec_Note_Transformer
9
from sklearn.feature_extraction import DictVectorizer
10
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
11
from baseline_transformer import SexTransformer, GetConcatenatedNotesTransformer, GetLatestNotesTransformer, GetEncountersFeaturesTransformer, GetLabsCountsDictTransformer, GetLabsLowCountsDictTransformer, GetLabsHighCountsDictTransformer, GetLabsLatestHighDictTransformer, GetLabsLatestLowDictTransformer, GetLabsHistoryDictTransformer, GetLatestLabValuesTransformer
12
from icd_transformer import ICD9_Transformer
13
from value_extractor_transformer import EFTransformer, LBBBTransformer, SinusRhythmTransformer, QRSTransformer, NYHATransformer, NICMTransformer
14
import logging
15
from mix_of_exp import MixtureOfExperts
16
from neural_network import NeuralNetwork, NeuralLogistic
17
logger = logging.getLogger("DaemonLog")
18
19
#This should make adding transformers easier. You could add a transformer like
20
# features_add = [baseline_features[x] for x in ['Labs_Latest_High', 'Car_tfidf']
21
22
# Registry of every available feature transformer, keyed by a short name.
# Each value is a (name, transformer_class, transformer_args) triple:
#   * for plain transformers, args is the **kwargs dict for the constructor;
#   * for FeaturePipeline entries, args is a list of (step_name, step_class,
#     step_kwargs) triples that build_transformer() expands recursively.
control_features = {   'all_ef' :  ('all_ef', EFTransformer, {'method' : 'all', 'num_horizon' : 1}),
                        'mean_ef' : ('mean_ef', EFTransformer, {'method' : 'mean', 'num_horizon' : 5}),
                        'max_ef' :  ('max_ef', EFTransformer, {'method' : 'max', 'num_horizon' : 5}),
                        'lbbb':     ('lbbb', LBBBTransformer, {'time_horizon' : 30*3}),
                        'sr':       ('sr', SinusRhythmTransformer, {'time_horizon' : 30*3}),
                        'nyha':     ('nyha', NYHATransformer, {'time_horizon' : 30*3}),
                        'nicm':     ('nicm', NICMTransformer, {'time_horizon' : 30*3}),
                        'all_qrs':  ('all_qrs', QRSTransformer, {'method' : 'all', 'num_horizon' : 1}),
                        'icd9':     ('icd9', ICD9_Transformer, {'depth' : 2}),
                        'sex':      ('sex', SexTransformer, {}),
                        # doc2vec note features.
                        # NOTE(review): the model paths are absolute to one machine;
                        # consider making them configurable.
                        'car_d2v':  ('car_d2v', Doc2Vec_Note_Transformer, {'note_type':'Car', 'model_file':'/home/ubuntu/josh_project/doc2vec_models/car_1.model', 'dbow_file':'/home/ubuntu/josh_project/doc2vec_models/car_dbow.model', 'max_notes':5}),
                        'lno_d2v':  ('lno_d2v',Doc2Vec_Note_Transformer, {'note_type':'Lno', 'model_file':'/home/ubuntu/josh_project/doc2vec_models/lno_1.model', 'dbow_file':'/home/ubuntu/josh_project/doc2vec_models/lno_dbow.model', 'max_notes':5}),
                        'car_tfidf':('car_tfidf', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type' : 'Car'}),
                                                                    ('tfidf_car', TfidfTransformer, {})]),
                        'lno_tfidf':('lno_tfidf', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type' : 'Lno'}),
                                                                    ('tfidf_lno', TfidfTransformer, {})]),
                        # NOTE(review): the trigram and bigram entries share internal
                        # names ('car_ngram' / 'lno_ngram'); selecting both forms in a
                        # single model would presumably collide in the FeatureUnion --
                        # confirm before combining them.
                        'car_trigram':('car_ngram', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type' : 'Car'}),
                                                                    ('ngram_car', CountVectorizer, {'ngram_range' : (3, 3), 'min_df' : .05})]),
                        'lno_trigram':('lno_ngram', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type' : 'Lno'}),
                                                                    ('ngram_lno', CountVectorizer, {'ngram_range' : (3, 3), 'min_df' : .05})]),
                        'car_bigram':('car_ngram', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type' : 'Car', 'look_back_months': 12}),
                                                                    ('ngram_car', CountVectorizer, {'ngram_range' : (2, 2), 'min_df' : .05})]),
                        'lno_bigram':('lno_ngram', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type' : 'Lno', 'look_back_months': 12}),
                                                                    ('ngram_lno', CountVectorizer, {'ngram_range' : (2, 2), 'min_df' : .05})]),
                        'enc':      ('enc', GetEncountersFeaturesTransformer, {'max_encounters' : 5}),
                        # Lab features: each pipeline converts labs to a dict of
                        # counts/values, then vectorizes it with DictVectorizer.
                        'lab_all' : ('lab_all', FeaturePipeline, [('lab_to_dict', GetLabsCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
                        'lab_low' : ('lab_low', FeaturePipeline, [('lab_to_dict', GetLabsLowCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
                        'lab_high' : ('lab_high', FeaturePipeline, [('lab_to_dict', GetLabsHighCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
                        'lab_low_recent' : ('lab_low_recent', FeaturePipeline, [('lab_to_dict', GetLabsLatestLowDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
                        'lab_high_recent' : ('lab_high_recent', FeaturePipeline, [('lab_to_dict', GetLabsLatestHighDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
                        'lab_values': ('lab_values', FeaturePipeline, [('lab_to_dict', GetLatestLabValuesTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
                        'lab_hist' : ('lab_hist', FeaturePipeline, [('lab_to_dict', GetLabsHistoryDictTransformer, {'time_thresholds_months' : [1]}), ('dict_to_vect', DictVectorizer, {})]),
                     }
55
56
# Convenience groupings of control_features keys, for building feature sets in
# bulk (e.g. build_alex_baseline() pulls in all of 'regex').
control_groups = { 'regex' : ['all_ef', 'mean_ef', 'max_ef', 'lbbb', 'sr', 'nyha', 'nicm', 'all_qrs'],
                   'structured_only' : ['sex', 'icd9', 'enc', 'lab_values'],
                   'notes_tfidf' : ['car_tfidf', 'lno_tfidf'],
                   'labs':  ['lab_all', 'lab_low', 'lab_high', 'lab_low_recent', 'lab_high_recent', 'lab_hist'],
                   'd2v' : ['car_d2v', 'lno_d2v']
                }
62
63
# Baseline model specs consumed by build_model(): each is a dict with
#   'method'     -- classifier switch string (see build_model)
#   'model_args' -- **kwargs for the classifier constructor
#   'features'   -- {name: (transformer_class, transformer_args)}
# These start with empty feature dicts, but might be useful.
adaboost_baseline = {  'method' : 'adaboost', 'model_args' : {'n_estimators' : 500}, 'features' : {} }
lr_baseline = {  'method' : 'lr', 'model_args' : {'C' : 1}, 'features' : {} }

# nn_baseline2 and alex_baseline start empty; build_alex_baseline() below
# fills their 'features' dicts in place at import time.
nn_baseline2 = { 'method' : 'nn',  'model_args' : {'layers' : [(10, 'logistic'), (None, 'softmax')], 'obj_fun' : 'maxent'}, 'features' : {}}

alex_baseline = {   'method':'svm',
                    'model_args': {},
                    'features':{}
                }
73
def build_alex_baseline():
    """Populate the (initially empty) feature dicts of ``alex_baseline`` and
    ``nn_baseline2`` in place from the ``control_features`` registry.

    Adds the two bigram note features plus every feature in
    ``control_groups['regex']``.  Mutates module-level state; called once at
    import time below.
    """
    # Bug fix: the original second loop iterated `r` over the regex group but
    # wrote nn_baseline2['features'][n] with the stale loop variable from the
    # first loop, so nn_baseline2 never received any regex feature (it just
    # re-wrote the last bigram entry repeatedly).
    for name in ['lno_bigram', 'car_bigram'] + control_groups['regex']:
        _, transformer_cls, transformer_args = control_features[name]
        alex_baseline['features'][name] = (transformer_cls, transformer_args)
        nn_baseline2['features'][name] = (transformer_cls, transformer_args)
81
82
build_alex_baseline()

# Variants of alex_baseline with a different classifier or SVM class weights.
# NOTE(review): every variant below shares the *same* 'features' dict object
# as alex_baseline (no copy), so mutating one mutates them all -- confirm
# this aliasing is intended.
alex_ada = {    'method':'adaboost',
                'model_args': {'n_estimators':500},
                'features': alex_baseline['features']
           }

alex_baseline_10 = {'method':'svm', 'model_args':{'class_weight':{1:10}}, 'features':alex_baseline['features']}

alex_baseline_100 = {'method':'svm', 'model_args':{'class_weight':{1:100}}, 'features':alex_baseline['features']}

alex_baseline_1000 = {'method':'svm', 'model_args':{'class_weight':{1:1000}}, 'features':alex_baseline['features']}

# NOTE(review): this one weights class label 2, unlike every sibling which
# weights label 1; given the '_01' name it looks like it may have been meant
# as {1: .1} -- confirm before relying on it.
alex_baseline_01 = {'method':'svm', 'model_args':{'class_weight':{2:10}}, 'features':alex_baseline['features']}

alex_baseline_001 = {'method':'svm', 'model_args':{'class_weight':{1:.01}}, 'features':alex_baseline['features']}

alex_baseline_0001 = {'method':'svm', 'model_args':{'class_weight':{1:.001}}, 'features':alex_baseline['features']}
100
101
# AdaBoost over only the regex/value-extraction features, spelled inline as
# {name: (transformer_class, kwargs)} rather than via control_features.
regex_baseline = {  'method' : 'adaboost',
                    'model_args' : {'n_estimators' : 500},
                    'features' : {  'all_ef' :  (EFTransformer, {'method' : 'all', 'num_horizon' : 1}),
                                    'mean_ef' : (EFTransformer, {'method' : 'mean', 'num_horizon' : 5}),
                                    'max_ef' :  (EFTransformer, {'method' : 'max', 'num_horizon' : 5}),
                                    'lbbb':     (LBBBTransformer, {'time_horizon' : 30*3}),
                                    'sr':       (SinusRhythmTransformer, {'time_horizon' : 30*3}),
                                    'nyha':     (NYHATransformer, {'time_horizon' : 30*3}),
                                    'nicm':     (NICMTransformer, {'time_horizon' : 30*3}),
                                    'all_qrs':  (QRSTransformer, {'method' : 'all', 'num_horizon' : 1})
                                 }
                 }
113
114
# AdaBoost over structured-data features only (labs, encounters, ICD9 codes,
# sex) -- no free-text note features.
struct_baseline = {
                               'method' : 'adaboost',
                               'model_args' : {'n_estimators' : 200},
                               'features' : {
                                                'lab_values': (FeaturePipeline, [('lab_to_dict', GetLatestLabValuesTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
                                                'enc':      (GetEncountersFeaturesTransformer, {'max_encounters' : 5}),
                                                'icd9':     (ICD9_Transformer, {'depth' : 2}),
                                                'sex':      (SexTransformer, {}),
                                            }
                            }
124
125
def __add_to_baseline(base, feature_names):
    """Attach the named control_features entries to *base*'s feature dict.

    base          -- a model spec dict with a 'features' key
    feature_names -- iterable of control_features keys

    Mutates *base* in place and also returns it for convenience.
    """
    for feat in feature_names:
        _, transformer_cls, transformer_args = control_features[feat]
        base['features'][feat] = (transformer_cls, transformer_args)
    return base
130
131
132
##################
# control : this is the pre-loaded set of method, model_args, and features. If any of the below are None, they default to these.
#
# method : this is a string that is interpreted as a switch statement below
# model_args : the **kwargs dict that is passed to your model, e.g. AdaBoostClassifier(**model_args)
# features: this is a dictionary of transformer name : (transformer class, transformer args)
#           eventually this will become a tuple (name, class(**args)) to go into a FeatureUnion.
#           However, I have included functionality to handle FeaturePipelines. In this one case,
#           the transformer args are a list of (name, class, args) triples. For example:
#           'Car' : (FeaturePipeline, [('car_notes', CarNotesTransformer, {'horizon' : 3}), ('tfidf', TfidfTransformer, {})])
#
#   ADD     For features_add, you must give a triple (name, class, args). You can write these yourself, or reference the registry above:
#               features_add = [control_features[x] for x in control_groups['labs'] + ['icd9', 'lbbb']]
#
#   CHANGE  Besides adding features, you can also change them. I thought this was an intuitive option. The simplest change
#           just refers to the transformer by name and assigns a new value to one of its args. For example:
#               features_change = {'ef_all' : {'num_horizon' : 10, 'time_horizon' : 30*4}}
#           But you can also change things inside feature pipelines. For example, to change the feature above:
#               features_change = {('Car', 0) : {'horizon' : 10}}
#
#   REMOVE  Lastly, you can remove features. This is easy. No error will be thrown if you remove a feature not included.
#               features_remove = ['icd9', 'max_ef', 'lab_high']
#               features_remove = control_groups['regex'] + control_groups['labs']
################
156
157
def build_model(control, method = None, model_args = None, features = None, features_add = None, features_change = None, features_remove = None):
    """Assemble a classifier pipeline from a control spec, with optional overrides.

    control          -- dict with 'method', 'model_args' and 'features' keys
                        (see the usage comment above for the full format)
    method           -- classifier switch string; defaults to control['method']
    model_args       -- **kwargs for the classifier; defaults to
                        control['model_args'], unless the method was overridden,
                        in which case the control's args are assumed stale and
                        an empty dict is used instead
    features         -- full replacement feature dict {name: (class, args)};
                        when None, a copy of the control's features is used and
                        the add/change/remove arguments are applied to it
    features_add     -- iterable of (name, class, args) triples to add
    features_change  -- {name: new_params} dict (or list of (name, new_params)
                        pairs) of argument overrides applied via change_args()
    features_remove  -- iterable of feature names to drop (absent names are
                        silently ignored)

    Returns a sklearn Pipeline of FeatureUnion + classifier, or a bare
    ClinicalDecisionModel for the 'clinical' family of methods.

    Raises ValueError for an unrecognized method string.
    """

    # Resolve the method; remember whether it differs from the control's.
    if method is None:
        method = control['method']
        method_diff = False
    else:
        method_diff = method != control['method']

    # Resolve model args. When the method changed, the control's args were
    # written for a different classifier, so start from an empty dict.
    if model_args is None:
        model_args = dict() if method_diff else control['model_args']

    # Build the feature dict.
    # Keep in mind features is a dictionary of tuples, e.g.
    # 'name': (transformer_class, transformer_args).
    feature_diff = {'+': [], '-' : []}  # record of edits; kept for debugging
    if features is None:
        # Bug fix: the original aliased control['features'] directly, so the
        # add/remove/change edits below leaked into the shared control spec
        # and corrupted later builds. Copy the dict instead.
        features = dict(control['features'])
        # add features
        if features_add is not None:
            for add in features_add:
                features[add[0]] = (add[1], add[2])
            feature_diff['+'] += list(features_add)
        # remove features
        if features_remove is not None:
            feature_diff['-'] += list(features_remove)
            for remove in features_remove:
                features.pop(remove, None)  # silently ignore absent names
        # change features
        if features_change is not None:
            # Accept the documented {name: new_params} dict form, or a list of
            # (name, new_params) pairs. (Bug fix: the original indexed
            # change[0]/change[1] while iterating, which for the dict form read
            # characters of the key string and never used the new params at all.)
            if isinstance(features_change, dict):
                change_items = features_change.items()
            else:
                change_items = features_change
            for name, new_params in change_items: # not robust to changing FeaturePipeline steps by name
                if name in features:
                    transformer, args = features[name]
                    feature_diff['-'].append((name, args))
                    args = change_args(transformer, args, new_params) # recursive change call
                    features[name] = (transformer, args)
                    feature_diff['+'].append((name, args))

    # Select the classifier and build the pipeline.
    if method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
        # The clinical decision model is self-contained: no feature union.
        model = ClinicalDecisionModel()
    else:
        if method in ['logistic regression', 'lr', 'logitr', 'logistic']:
            clf = LogisticRegression(**model_args)
        elif method in ['svm']:
            clf = SVC(**model_args)
        elif method in ['boosting', 'adaboost']:
            clf = AdaBoostClassifier(**model_args)
        elif method in ['decision tree', 'dtree']:
            clf = DecisionTreeClassifier(**model_args)
        elif method in ['nn', 'neural', 'net', 'neuralnet', 'network']:
            clf = NeuralNetwork(**model_args)
        elif method in ['me', 'mixexp', 'mixture of experts']:
            # was a bare Python-2 `print model_args` debug statement
            logger.debug("MixtureOfExperts args: %s", model_args)
            clf = MixtureOfExperts(**model_args)
        else:
            raise ValueError("'" + method + "' is not a supported classification method")

        # Instantiate every transformer (recursively for FeaturePipelines).
        transformer_list = []
        for feature_name in features:
            transformer_list += [build_transformer(feature_name, features[feature_name])] #recursive build call

        # Assemble the feature-union + classifier pipeline.
        model =  Pipeline([
                ('feature_union', FeatureUnion(transformer_list)),#, n_jobs = min(2, len(transformer_list)))),
                ('Classifier', clf)
            ])
    return model
241
242
def build_transformer(transformer_name, transformer_values):
    """Instantiate one (name, transformer) pair for a FeatureUnion.

    transformer_values is (transformer_class, transformer_args). When the
    class is a Pipeline subclass (e.g. FeaturePipeline), the args are a list
    of (step_name, step_class, step_kwargs) triples, each built recursively;
    otherwise the args are the constructor's **kwargs dict.
    """
    transformer_cls, spec = transformer_values
    if issubclass(transformer_cls, Pipeline):
        built_steps = [build_transformer(step_name, (step_cls, step_kwargs))
                       for step_name, step_cls, step_kwargs in spec]
        instance = transformer_cls(built_steps)
    else:
        instance = transformer_cls(**spec)
    return (transformer_name, instance)
249
250
def change_args(transformer, args, params):
    """Apply parameter overrides to a transformer's args and return the
    (mutated) args.

    For a Pipeline subclass, ``args`` is a list of (name, class, kwargs) step
    triples and each key of ``params`` selects a step whose override is applied
    recursively; otherwise ``args`` is a plain kwargs dict updated key-by-key.
    """
    # issubclass() is loop-invariant, so evaluate it once up front.
    is_pipeline = issubclass(transformer, Pipeline)
    for key in params:
        if is_pipeline:
            step_name, step_cls, step_args = args[key]
            args[key] = (step_name, step_cls,
                         change_args(step_cls, step_args, params[key]))
        else:
            args[key] = params[key]
    return args