model_builder.py

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import logging

from decision_model import ClinicalDecisionModel
from model_tester import FeaturePipeline
from doc2vec_transformer import Doc2Vec_Note_Transformer
from baseline_transformer import (SexTransformer, GetConcatenatedNotesTransformer,
    GetLatestNotesTransformer, GetEncountersFeaturesTransformer,
    GetLabsCountsDictTransformer, GetLabsLowCountsDictTransformer,
    GetLabsHighCountsDictTransformer, GetLabsLatestHighDictTransformer,
    GetLabsLatestLowDictTransformer, GetLabsHistoryDictTransformer,
    GetLatestLabValuesTransformer)
from icd_transformer import ICD9_Transformer
from value_extractor_transformer import (EFTransformer, LBBBTransformer,
    SinusRhythmTransformer, QRSTransformer, NYHATransformer, NICMTransformer)
from mix_of_exp import MixtureOfExperts
from neural_network import NeuralNetwork, NeuralLogistic

logger = logging.getLogger("DaemonLog")

# This should make adding transformers easier. You could add a transformer like:
#   features_add = [control_features[x] for x in ['lab_high_recent', 'car_tfidf']]

control_features = {
    'all_ef':  ('all_ef', EFTransformer, {'method': 'all', 'num_horizon': 1}),
    'mean_ef': ('mean_ef', EFTransformer, {'method': 'mean', 'num_horizon': 5}),
    'max_ef':  ('max_ef', EFTransformer, {'method': 'max', 'num_horizon': 5}),
    'lbbb':    ('lbbb', LBBBTransformer, {'time_horizon': 30*3}),
    'sr':      ('sr', SinusRhythmTransformer, {'time_horizon': 30*3}),
    'nyha':    ('nyha', NYHATransformer, {'time_horizon': 30*3}),
    'nicm':    ('nicm', NICMTransformer, {'time_horizon': 30*3}),
    'all_qrs': ('all_qrs', QRSTransformer, {'method': 'all', 'num_horizon': 1}),
    'icd9':    ('icd9', ICD9_Transformer, {'depth': 2}),
    'sex':     ('sex', SexTransformer, {}),
    'car_d2v': ('car_d2v', Doc2Vec_Note_Transformer,
                {'note_type': 'Car',
                 'model_file': '/home/ubuntu/josh_project/doc2vec_models/car_1.model',
                 'dbow_file': '/home/ubuntu/josh_project/doc2vec_models/car_dbow.model',
                 'max_notes': 5}),
    'lno_d2v': ('lno_d2v', Doc2Vec_Note_Transformer,
                {'note_type': 'Lno',
                 'model_file': '/home/ubuntu/josh_project/doc2vec_models/lno_1.model',
                 'dbow_file': '/home/ubuntu/josh_project/doc2vec_models/lno_dbow.model',
                 'max_notes': 5}),
    'car_tfidf': ('car_tfidf', FeaturePipeline,
                  [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car'}),
                   ('tfidf_car', TfidfTransformer, {})]),
    'lno_tfidf': ('lno_tfidf', FeaturePipeline,
                  [('notes_lno', GetConcatenatedNotesTransformer, {'note_type': 'Lno'}),
                   ('tfidf_lno', TfidfTransformer, {})]),
    'car_trigram': ('car_ngram', FeaturePipeline,
                    [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car'}),
                     ('ngram_car', CountVectorizer, {'ngram_range': (3, 3), 'min_df': .05})]),
    'lno_trigram': ('lno_ngram', FeaturePipeline,
                    [('notes_lno', GetConcatenatedNotesTransformer, {'note_type': 'Lno'}),
                     ('ngram_lno', CountVectorizer, {'ngram_range': (3, 3), 'min_df': .05})]),
    'car_bigram': ('car_ngram', FeaturePipeline,
                   [('notes_car', GetConcatenatedNotesTransformer,
                     {'note_type': 'Car', 'look_back_months': 12}),
                    ('ngram_car', CountVectorizer, {'ngram_range': (2, 2), 'min_df': .05})]),
    'lno_bigram': ('lno_ngram', FeaturePipeline,
                   [('notes_lno', GetConcatenatedNotesTransformer,
                     {'note_type': 'Lno', 'look_back_months': 12}),
                    ('ngram_lno', CountVectorizer, {'ngram_range': (2, 2), 'min_df': .05})]),
    'enc': ('enc', GetEncountersFeaturesTransformer, {'max_encounters': 5}),
    'lab_all':  ('lab_all', FeaturePipeline,
                 [('lab_to_dict', GetLabsCountsDictTransformer, {}),
                  ('dict_to_vect', DictVectorizer, {})]),
    'lab_low':  ('lab_low', FeaturePipeline,
                 [('lab_to_dict', GetLabsLowCountsDictTransformer, {}),
                  ('dict_to_vect', DictVectorizer, {})]),
    'lab_high': ('lab_high', FeaturePipeline,
                 [('lab_to_dict', GetLabsHighCountsDictTransformer, {}),
                  ('dict_to_vect', DictVectorizer, {})]),
    'lab_low_recent':  ('lab_low_recent', FeaturePipeline,
                        [('lab_to_dict', GetLabsLatestLowDictTransformer, {}),
                         ('dict_to_vect', DictVectorizer, {})]),
    'lab_high_recent': ('lab_high_recent', FeaturePipeline,
                        [('lab_to_dict', GetLabsLatestHighDictTransformer, {}),
                         ('dict_to_vect', DictVectorizer, {})]),
    'lab_values': ('lab_values', FeaturePipeline,
                   [('lab_to_dict', GetLatestLabValuesTransformer, {}),
                    ('dict_to_vect', DictVectorizer, {})]),
    'lab_hist': ('lab_hist', FeaturePipeline,
                 [('lab_to_dict', GetLabsHistoryDictTransformer, {'time_thresholds_months': [1]}),
                  ('dict_to_vect', DictVectorizer, {})]),
}

control_groups = {
    'regex': ['all_ef', 'mean_ef', 'max_ef', 'lbbb', 'sr', 'nyha', 'nicm', 'all_qrs'],
    'structured_only': ['sex', 'icd9', 'enc', 'lab_values'],
    'notes_tfidf': ['car_tfidf', 'lno_tfidf'],
    'labs': ['lab_all', 'lab_low', 'lab_high', 'lab_low_recent', 'lab_high_recent', 'lab_hist'],
    'd2v': ['car_d2v', 'lno_d2v'],
}
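
# Example (illustrative): the two tables above compose. A group is just a list
# of control_features keys, so a ready-made list of (name, class, args) triples
# for a whole group plus a couple of extras would be:
#   features_add = [control_features[x] for x in control_groups['labs'] + ['icd9', 'sex']]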

# These start with empty feature sets, but might be useful as templates.
adaboost_baseline = {'method': 'adaboost', 'model_args': {'n_estimators': 500}, 'features': {}}
lr_baseline = {'method': 'lr', 'model_args': {'C': 1}, 'features': {}}

nn_baseline2 = {'method': 'nn',
                'model_args': {'layers': [(10, 'logistic'), (None, 'softmax')],
                               'obj_fun': 'maxent'},
                'features': {}}

alex_baseline = {'method': 'svm',
                 'model_args': {},
                 'features': {}}

def build_alex_baseline():
    #for n in control_groups['notes_tfidf']:
    for n in ['lno_bigram', 'car_bigram']:
        alex_baseline['features'][n] = (control_features[n][1], control_features[n][2])
        nn_baseline2['features'][n] = (control_features[n][1], control_features[n][2])
    for r in control_groups['regex']:
        alex_baseline['features'][r] = (control_features[r][1], control_features[r][2])
        nn_baseline2['features'][r] = (control_features[r][1], control_features[r][2])

build_alex_baseline()

alex_ada = {'method': 'adaboost',
            'model_args': {'n_estimators': 500},
            'features': alex_baseline['features']}

# SVM variants that re-weight the positive class (1) up or down.
alex_baseline_10 = {'method': 'svm', 'model_args': {'class_weight': {1: 10}}, 'features': alex_baseline['features']}

alex_baseline_100 = {'method': 'svm', 'model_args': {'class_weight': {1: 100}}, 'features': alex_baseline['features']}

alex_baseline_1000 = {'method': 'svm', 'model_args': {'class_weight': {1: 1000}}, 'features': alex_baseline['features']}

alex_baseline_01 = {'method': 'svm', 'model_args': {'class_weight': {1: .1}}, 'features': alex_baseline['features']}

alex_baseline_001 = {'method': 'svm', 'model_args': {'class_weight': {1: .01}}, 'features': alex_baseline['features']}

alex_baseline_0001 = {'method': 'svm', 'model_args': {'class_weight': {1: .001}}, 'features': alex_baseline['features']}

regex_baseline = {'method': 'adaboost',
                  'model_args': {'n_estimators': 500},
                  'features': {'all_ef': (EFTransformer, {'method': 'all', 'num_horizon': 1}),
                               'mean_ef': (EFTransformer, {'method': 'mean', 'num_horizon': 5}),
                               'max_ef': (EFTransformer, {'method': 'max', 'num_horizon': 5}),
                               'lbbb': (LBBBTransformer, {'time_horizon': 30*3}),
                               'sr': (SinusRhythmTransformer, {'time_horizon': 30*3}),
                               'nyha': (NYHATransformer, {'time_horizon': 30*3}),
                               'nicm': (NICMTransformer, {'time_horizon': 30*3}),
                               'all_qrs': (QRSTransformer, {'method': 'all', 'num_horizon': 1})
                               }
                  }

struct_baseline = {
    'method': 'adaboost',
    'model_args': {'n_estimators': 200},
    'features': {
        'lab_values': (FeaturePipeline, [('lab_to_dict', GetLatestLabValuesTransformer, {}),
                                         ('dict_to_vect', DictVectorizer, {})]),
        'enc': (GetEncountersFeaturesTransformer, {'max_encounters': 5}),
        'icd9': (ICD9_Transformer, {'depth': 2}),
        'sex': (SexTransformer, {}),
    }
}

def __add_to_baseline(base, feature_names):
    for name in feature_names:
        feature_triple = control_features[name]
        base['features'][name] = (feature_triple[1], feature_triple[2])
    return base
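
# Example (illustrative): extend a copy of the structured baseline with the lab
# count features; deep-copying first keeps struct_baseline itself unchanged.
#   import copy
#   labs_baseline = __add_to_baseline(copy.deepcopy(struct_baseline), control_groups['labs'])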

##################
# control : this is the pre-loaded set of method, model_args, and features. If any of the
#           arguments below are None, they default to the control's values.
#
# method : this is a string that is interpreted by the switch-like chain below.
#
# model_args : the **kwargs dict that is passed to your model, e.g. AdaBoostClassifier(**model_args)
#
# features : this is a dictionary of transformer name : (transformer class, transformer args).
#            Eventually each entry becomes a tuple (name, class(**args)) to go into a FeatureUnion.
#            However, I have included functionality to handle FeaturePipelines. In this one case,
#            the transformer args are a list of (name, class, args) triples. For example:
#              'Car' : (FeaturePipeline, [('car_notes', CarNotesTransformer, {'horizon' : 3}),
#                                         ('tfidf', TfidfTransformer, {})])
#
# ADD     For features_add, you must give triples (name, class, args). You can write these
#         yourself, or reference the tables above:
#           features_add = [control_features[x] for x in control_groups['labs'] + ['icd9', 'lbbb']]
#
# CHANGE  Besides adding features, you can also change them. I thought this was an intuitive
#         option. The simplest change just names the transformer and assigns a new value to
#         one of its args. For example:
#           features_change = {'all_ef' : {'num_horizon' : 10, 'time_horizon' : 30*4}}
#         But you can also change things inside feature pipelines by giving the step index.
#         For example, to change the 'Car' pipeline above:
#           features_change = {'Car' : {0 : {'horizon' : 10}}}
#
# REMOVE  Lastly, you can remove features. This is easy. No error will be thrown if you
#         remove a feature that is not included:
#           features_remove = ['icd9', 'max_ef', 'lab_high']
#           features_remove = control_groups['regex'] + control_groups['labs']
################
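
# Illustrative sketch of the options documented above; the feature choices here
# are arbitrary and the function is only an example (it is never called).
def _example_build_model():
    return build_model(regex_baseline,
                       method='lr',
                       model_args={'C': 1},
                       features_add=[control_features[x] for x in ['icd9', 'sex']],
                       features_change={'all_ef': {'num_horizon': 10}},
                       features_remove=['max_ef'])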

def build_model(control, method=None, model_args=None, features=None,
                features_add=None, features_change=None, features_remove=None):

    # Modify method
    if method is None:
        method = control['method']
        method_diff = False
    else:
        method_diff = (method != control['method'])

    # Modify model args
    if model_args is None and method_diff:
        model_args = dict()
        args_diff = True
    elif model_args is None:
        model_args = control['model_args']
        args_diff = False
    else:
        args_diff = True

    # Modify the model features.
    # Keep in mind that features are a dictionary of tuples,
    # e.g. 'name': (transformer_class, transformer_args)
    feature_diff = {'+': [], '-': []}
    if features is None:
        features = dict(control['features'])  # shallow copy, so adds/removes don't touch the control config
    # add features
    if features_add is not None:
        for add in features_add:
            features[add[0]] = (add[1], add[2])
        feature_diff['+'] += features_add
    # remove features
    if features_remove is not None:
        feature_diff['-'] += features_remove
        for remove in features_remove:
            if remove in features:
                features.pop(remove)
    # change features
    if features_change is not None:
        for name, params in features_change.items():  # not fully robust for changing a FeaturePipeline
            if name in features:
                transformer, args = features[name]
                feature_diff['-'] += [(name, args)]
                args = change_args(transformer, args, params)  # recursive change call
                features[name] = (transformer, args)
                feature_diff['+'] += [(name, args)]

    # select classifier and build pipeline
    if method in ['clinical', 'decision', 'cdm', 'clinical decision model']:
        is_regression = False
        clf = ClinicalDecisionModel()
        model = clf
    else:
        if method in ['logistic regression', 'lr', 'logitr', 'logistic']:
            is_regression = False
            clf = LogisticRegression(**model_args)
        elif method in ['svm']:
            is_regression = False
            clf = SVC(**model_args)
        elif method in ['boosting', 'adaboost']:
            is_regression = False
            clf = AdaBoostClassifier(**model_args)
        elif method in ['decision tree', 'dtree']:
            is_regression = False
            clf = DecisionTreeClassifier(**model_args)
        elif method in ['nn', 'neural', 'net', 'neuralnet', 'network']:
            is_regression = False
            clf = NeuralNetwork(**model_args)
        elif method in ['me', 'mixexp', 'mixture of experts']:
            is_regression = False
            print(model_args)
            clf = MixtureOfExperts(**model_args)
        else:
            raise ValueError("'" + method + "' is not a supported classification method")

        # build the transformers
        transformer_list = []
        for feature_name in features:
            transformer_list += [build_transformer(feature_name, features[feature_name])]  # recursive build call

        # assemble pipeline
        model = Pipeline([
            ('feature_union', FeatureUnion(transformer_list)),  # , n_jobs = min(2, len(transformer_list)))),
            ('Classifier', clf)
        ])
    return model
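
# Example (illustrative): the returned object is a standard sklearn Pipeline, so
# the usual estimator API applies (X, y being whatever your transformers expect):
#   model = build_model(struct_baseline)
#   model.fit(X_train, y_train)
#   predictions = model.predict(X_test)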

def build_transformer(transformer_name, transformer_values):
    if issubclass(transformer_values[0], Pipeline):
        steps = [build_transformer(step[0], (step[1], step[2])) for step in transformer_values[1]]
        transformer = transformer_values[0](steps)
    else:
        transformer = transformer_values[0](**transformer_values[1])
    return (transformer_name, transformer)
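
# Example (illustrative): a FeaturePipeline entry expands recursively, e.g.
#   build_transformer('car_tfidf', (FeaturePipeline,
#       [('notes_car', GetConcatenatedNotesTransformer, {'note_type': 'Car'}),
#        ('tfidf_car', TfidfTransformer, {})]))
# returns ('car_tfidf', FeaturePipeline([('notes_car', <transformer>), ('tfidf_car', <transformer>)])),
# ready to drop into a FeatureUnion.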

def change_args(transformer, args, params):
    for param in params:
        if issubclass(transformer, Pipeline):
            # args is a list of (name, class, args) triples; param is a step index
            args[param] = (args[param][0], args[param][1],
                           change_args(args[param][1], args[param][2], params[param]))
        else:
            args[param] = params[param]
    return args
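
# Example (illustrative): changing an arg nested inside a FeaturePipeline. For
# the 'car_bigram' feature above, this bumps look_back_months on step 0:
#   change_args(FeaturePipeline,
#               [('notes_car', GetConcatenatedNotesTransformer,
#                 {'note_type': 'Car', 'look_back_months': 12}),
#                ('ngram_car', CountVectorizer, {'ngram_range': (2, 2), 'min_df': .05})],
#               {0: {'look_back_months': 24}})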