[8d2107]: / model_builder.py

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from decision_model import ClinicalDecisionModel
from model_tester import FeaturePipeline
from doc2vec_transformer import Doc2Vec_Note_Transformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from baseline_transformer import (SexTransformer, GetConcatenatedNotesTransformer, GetLatestNotesTransformer,
                                  GetEncountersFeaturesTransformer, GetLabsCountsDictTransformer,
                                  GetLabsLowCountsDictTransformer, GetLabsHighCountsDictTransformer,
                                  GetLabsLatestHighDictTransformer, GetLabsLatestLowDictTransformer,
                                  GetLabsHistoryDictTransformer, GetLatestLabValuesTransformer)
from icd_transformer import ICD9_Transformer
from value_extractor_transformer import EFTransformer, LBBBTransformer, SinusRhythmTransformer, QRSTransformer, NYHATransformer, NICMTransformer
import logging
from mix_of_exp import MixtureOfExperts
from neural_network import NeuralNetwork, NeuralLogistic
logger = logging.getLogger("DaemonLog")
#This should make adding transformers easier. You could add a transformer like:
# features_add = [control_features[x] for x in ['lab_high_recent', 'car_tfidf']]
control_features = {
    'all_ef': ('all_ef', EFTransformer, {'method': 'all', 'num_horizon': 1}),
    'mean_ef': ('mean_ef', EFTransformer, {'method': 'mean', 'num_horizon': 5}),
    'max_ef': ('max_ef', EFTransformer, {'method': 'max', 'num_horizon': 5}),
    'lbbb': ('lbbb', LBBBTransformer, {'time_horizon': 30*3}),
    'sr': ('sr', SinusRhythmTransformer, {'time_horizon': 30*3}),
    'nyha': ('nyha', NYHATransformer, {'time_horizon': 30*3}),
    'nicm': ('nicm', NICMTransformer, {'time_horizon': 30*3}),
    'all_qrs': ('all_qrs', QRSTransformer, {'method': 'all', 'num_horizon': 1}),
    'icd9': ('icd9', ICD9_Transformer, {'depth': 2}),
    'sex': ('sex', SexTransformer, {}),
    'car_d2v': ('car_d2v', Doc2Vec_Note_Transformer, {'note_type': 'Car', 'model_file': '/home/ubuntu/josh_project/doc2vec_models/car_1.model', 'dbow_file': '/home/ubuntu/josh_project/doc2vec_models/car_dbow.model', 'max_notes': 5}),
    'lno_d2v': ('lno_d2v', Doc2Vec_Note_Transformer, {'note_type': 'Lno', 'model_file': '/home/ubuntu/josh_project/doc2vec_models/lno_1.model', 'dbow_file': '/home/ubuntu/josh_project/doc2vec_models/lno_dbow.model', 'max_notes': 5}),
    'car_tfidf': ('car_tfidf', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car'}),
                                                 ('tfidf_car', TfidfTransformer, {})]),
    'lno_tfidf': ('lno_tfidf', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type': 'Lno'}),
                                                 ('tfidf_lno', TfidfTransformer, {})]),
    'car_trigram': ('car_ngram', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car'}),
                                                   ('ngram_car', CountVectorizer, {'ngram_range': (3, 3), 'min_df': .05})]),
    'lno_trigram': ('lno_ngram', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type': 'Lno'}),
                                                   ('ngram_lno', CountVectorizer, {'ngram_range': (3, 3), 'min_df': .05})]),
    'car_bigram': ('car_ngram', FeaturePipeline, [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car', 'look_back_months': 12}),
                                                  ('ngram_car', CountVectorizer, {'ngram_range': (2, 2), 'min_df': .05})]),
    'lno_bigram': ('lno_ngram', FeaturePipeline, [('notes_lno', GetConcatenatedNotesTransformer, {'note_type': 'Lno', 'look_back_months': 12}),
                                                  ('ngram_lno', CountVectorizer, {'ngram_range': (2, 2), 'min_df': .05})]),
    'enc': ('enc', GetEncountersFeaturesTransformer, {'max_encounters': 5}),
    'lab_all': ('lab_all', FeaturePipeline, [('lab_to_dict', GetLabsCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_low': ('lab_low', FeaturePipeline, [('lab_to_dict', GetLabsLowCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_high': ('lab_high', FeaturePipeline, [('lab_to_dict', GetLabsHighCountsDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_low_recent': ('lab_low_recent', FeaturePipeline, [('lab_to_dict', GetLabsLatestLowDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_high_recent': ('lab_high_recent', FeaturePipeline, [('lab_to_dict', GetLabsLatestHighDictTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_values': ('lab_values', FeaturePipeline, [('lab_to_dict', GetLatestLabValuesTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
    'lab_hist': ('lab_hist', FeaturePipeline, [('lab_to_dict', GetLabsHistoryDictTransformer, {'time_thresholds_months': [1]}), ('dict_to_vect', DictVectorizer, {})]),
}
control_groups = {
    'regex': ['all_ef', 'mean_ef', 'max_ef', 'lbbb', 'sr', 'nyha', 'nicm', 'all_qrs'],
    'structured_only': ['sex', 'icd9', 'enc', 'lab_values'],
    'notes_tfidf': ['car_tfidf', 'lno_tfidf'],
    'labs': ['lab_all', 'lab_low', 'lab_high', 'lab_low_recent', 'lab_high_recent', 'lab_hist'],
    'd2v': ['car_d2v', 'lno_d2v']
}
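#A sketch of the glue between the two dicts above: control_groups holds feature
#names, and mapping them through control_features yields the (name, class, args)
#triples that build_model's features_add argument expects (see the notes before
#build_model below):
#    features_add = [control_features[x] for x in control_groups['regex'] + ['icd9']]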
#These baselines have empty feature sets, but might be useful as starting points.
adaboost_baseline = {'method': 'adaboost', 'model_args': {'n_estimators': 500}, 'features': {}}
lr_baseline = {'method': 'lr', 'model_args': {'C': 1}, 'features': {}}
nn_baseline2 = {'method': 'nn', 'model_args': {'layers': [(10, 'logistic'), (None, 'softmax')], 'obj_fun': 'maxent'}, 'features': {}}
alex_baseline = {'method': 'svm',
                 'model_args': {},
                 'features': {}
                 }
def build_alex_baseline():
    #for n in control_groups['notes_tfidf']:
    for n in ['lno_bigram', 'car_bigram']:
        alex_baseline['features'][n] = (control_features[n][1], control_features[n][2])
        nn_baseline2['features'][n] = (control_features[n][1], control_features[n][2])
    for r in control_groups['regex']:
        alex_baseline['features'][r] = (control_features[r][1], control_features[r][2])
        nn_baseline2['features'][r] = (control_features[r][1], control_features[r][2])

build_alex_baseline()
alex_ada = {'method': 'adaboost',
            'model_args': {'n_estimators': 500},
            'features': alex_baseline['features']
            }
#The variants below share alex_baseline's features dict by reference, varying only the SVM class weights.
alex_baseline_10 = {'method': 'svm', 'model_args': {'class_weight': {1: 10}}, 'features': alex_baseline['features']}
alex_baseline_100 = {'method': 'svm', 'model_args': {'class_weight': {1: 100}}, 'features': alex_baseline['features']}
alex_baseline_1000 = {'method': 'svm', 'model_args': {'class_weight': {1: 1000}}, 'features': alex_baseline['features']}
alex_baseline_01 = {'method': 'svm', 'model_args': {'class_weight': {1: .1}}, 'features': alex_baseline['features']}
alex_baseline_001 = {'method': 'svm', 'model_args': {'class_weight': {1: .01}}, 'features': alex_baseline['features']}
alex_baseline_0001 = {'method': 'svm', 'model_args': {'class_weight': {1: .001}}, 'features': alex_baseline['features']}
regex_baseline = {'method': 'adaboost',
                  'model_args': {'n_estimators': 500},
                  'features': {'all_ef': (EFTransformer, {'method': 'all', 'num_horizon': 1}),
                               'mean_ef': (EFTransformer, {'method': 'mean', 'num_horizon': 5}),
                               'max_ef': (EFTransformer, {'method': 'max', 'num_horizon': 5}),
                               'lbbb': (LBBBTransformer, {'time_horizon': 30*3}),
                               'sr': (SinusRhythmTransformer, {'time_horizon': 30*3}),
                               'nyha': (NYHATransformer, {'time_horizon': 30*3}),
                               'nicm': (NICMTransformer, {'time_horizon': 30*3}),
                               'all_qrs': (QRSTransformer, {'method': 'all', 'num_horizon': 1})
                               }
                  }
struct_baseline = {
    'method': 'adaboost',
    'model_args': {'n_estimators': 200},
    'features': {
        'lab_values': (FeaturePipeline, [('lab_to_dict', GetLatestLabValuesTransformer, {}), ('dict_to_vect', DictVectorizer, {})]),
        'enc': (GetEncountersFeaturesTransformer, {'max_encounters': 5}),
        'icd9': (ICD9_Transformer, {'depth': 2}),
        'sex': (SexTransformer, {}),
    }
}
def __add_to_baseline(base, feature_names):
    for name in feature_names:
        feature_triple = control_features[name]
        base['features'][name] = (feature_triple[1], feature_triple[2])
    return base
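#A sketch of __add_to_baseline in use: extend the regex baseline, in place, with
#every lab-count feature defined above.
#    regex_plus_labs = __add_to_baseline(regex_baseline, control_groups['labs'])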
##################
# control : the pre-loaded dict of method, model_args, and features. If any of the
#     arguments below are None, they default to the control's values.
#
# method : a string that is interpreted as a switch statement below
#
# model_args : the **kwargs dict passed to your model, e.g. AdaBoostClassifier(**model_args)
#
# features : a dictionary of transformer name : (transformer class, transformer args).
#     Each entry eventually becomes a tuple (name, class(**args)) to go into a FeatureUnion.
#     However, I have included functionality to handle FeaturePipelines. In this one case,
#     the transformer args are a list of (name, class, args) triples. For example:
#     'Car' : (FeaturePipeline, [('car_notes', CarNotesTransformer, {'horizon' : 3}), ('tfidf', TfidfTransformer, {})])
#
# ADD For features_add, you must give a triple (name, class, args). You can write this
#     yourself, or reference the dicts above:
#     features_add = [control_features[x] for x in control_groups['labs'] + ['icd9', 'lbbb']]
#
# CHANGE Besides adding features, you can also change them. I thought this was an intuitive
#     option. The simplest change just names the transformer and assigns a new value to one
#     of its args. For example:
#     features_change = {'all_ef' : {'num_horizon' : 10, 'time_horizon' : 30*4}}
#     But you can also change things inside feature pipelines by step index. For example,
#     to change the 'Car' feature above:
#     features_change = {'Car' : {0 : {'horizon' : 10}}}
#
# REMOVE Lastly, you can remove features. This is easy. No error is thrown if you remove
#     a feature that is not included.
#     features_remove = ['icd9', 'max_ef', 'lab_high']
#     features_remove = control_groups['regex'] + control_groups['labs']
################
def build_model(control, method=None, model_args=None, features=None, features_add=None, features_change=None, features_remove=None):
    #Modify method
    if method is None:
        method = control['method']
        method_diff = False
    else:
        method_diff = not method == control['method']
    #Modify model args
    if model_args is None and method_diff:
        model_args = dict()
        args_diff = True
    elif model_args is None:
        model_args = control['model_args']
        args_diff = False
    else:
        args_diff = True
    #Modify the model features
    #keep in mind that features are a dictionary of tuples, e.g. 'name': (transformer_class, transformer_args)
    feature_diff = {'+': [], '-': []}  #bookkeeping of feature changes; currently not returned
    if features is None:
        features = dict(control['features'])  #copy, so the edits below do not mutate the control
    #add features
    if features_add is not None:
        for add in features_add:
            features[add[0]] = (add[1], add[2])
        feature_diff['+'] += features_add
    #remove features
    if features_remove is not None:
        feature_diff['-'] += features_remove
        for remove in features_remove:
            if remove in features:
                features.pop(remove)
    #change features
    if features_change is not None:
        for name, new_params in features_change.items():  #not robust to handle changing FeaturePipeline
            if name in features:
                transformer, args = features[name]
                feature_diff['-'] += [(name, args)]
                args = change_args(transformer, args, new_params)  #recursive change call
                features[name] = (transformer, args)
                feature_diff['+'] += [(name, args)]
    #select classifier and build pipeline
    if method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
        is_regression = False
        clf = ClinicalDecisionModel()
        model = clf
    else:
        if method in ['logistic regression', 'lr', 'logitr', 'logistic']:
            is_regression = False
            clf = LogisticRegression(**model_args)
        elif method in ['svm']:
            is_regression = False
            clf = SVC(**model_args)
        elif method in ['boosting', 'adaboost']:
            is_regression = False
            clf = AdaBoostClassifier(**model_args)
        elif method in ['decision tree', 'dtree']:
            is_regression = False
            clf = DecisionTreeClassifier(**model_args)
        elif method in ['nn', 'neural', 'net', 'neuralnet', 'network']:
            is_regression = False
            clf = NeuralNetwork(**model_args)
        elif method in ['me', 'mixexp', 'mixture of experts']:
            is_regression = False
            print(model_args)
            clf = MixtureOfExperts(**model_args)
        else:
            raise ValueError("'" + method + "' is not a supported classification method")
        #build the transformers
        transformer_list = []
        for feature_name in features:
            transformer_list += [build_transformer(feature_name, features[feature_name])]  #recursive build call
        #assemble pipeline
        model = Pipeline([
            ('feature_union', FeatureUnion(transformer_list)),  #, n_jobs = min(2, len(transformer_list)))),
            ('Classifier', clf)
        ])
    return model
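#A usage sketch for build_model. X and y are hypothetical (a feature matrix and
#labels produced elsewhere in the project); the keyword arguments are real
#entries from the dicts above.
#    model = build_model(struct_baseline,
#                        features_add=[control_features['lbbb']],
#                        features_change={'icd9': {'depth': 3}},
#                        features_remove=['sex'])
#    model.fit(X, y)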
def build_transformer(transformer_name, transformer_values):
    if issubclass(transformer_values[0], Pipeline):
        steps = [build_transformer(step[0], (step[1], step[2])) for step in transformer_values[1]]
        transformer = transformer_values[0](steps)
    else:
        transformer = transformer_values[0](**transformer_values[1])
    return (transformer_name, transformer)
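#A sketch of the recursive case: a FeaturePipeline entry carries (name, class,
#args) step triples as its "args", and build_transformer expands each step into
#a (name, instance) pair before instantiating the pipeline itself.
#    name, transformer = build_transformer('lab_values', control_features['lab_values'][1:])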
def change_args(transformer, args, params):
    for param in params:
        if issubclass(transformer, Pipeline):
            #args is a list of (name, class, args) step triples; param indexes the step to change
            args[param] = (args[param][0], args[param][1], change_args(args[param][1], args[param][2], params[param]))
        else:
            args[param] = params[param]
    return args
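
if __name__ == '__main__':
    #A minimal sketch of change_args on a pipeline: lower the note look-back of
    #the first step inside 'car_bigram'. Note that this mutates the shared
    #control_features entry in place, just as features_change does in build_model.
    _, pipeline_cls, steps = control_features['car_bigram']
    change_args(pipeline_cls, steps, {0: {'look_back_months': 6}})
    print(steps[0])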