|
a |
|
b/queue_test.py |
|
|
1 |
import sys |
|
|
2 |
import csv |
|
|
3 |
import itertools |
|
|
4 |
|
|
|
5 |
################################ |
|
|
6 |
# |
|
|
7 |
# Description: adds a test to the queue |
|
|
8 |
# |
|
|
9 |
# example of queue: |
|
|
10 |
# python queue_test.py -mes 'This is a test' -n 500 -cv 3 -d 'from model_builder import build_model, regex_baseline' -r "build_model(regex_baseline, method = 'SVM', features_remove = ['NYHA'], features_add = {'min_ef' : (EFTransformer, {'method':'min', 'num_horizon': 2})})" |
|
|
11 |
# |
|
|
12 |
# Keys Description Default |
|
|
13 |
# -h displays help message |
|
|
14 |
# -mes message/description "" |
|
|
15 |
# -n number of patients 900 |
|
|
16 |
# -cv number of CV splits 5 |
|
|
17 |
# -d dependency string to be executed "" (this dependency is only added on to default) |
|
|
18 |
# -con control regex_baseline |
|
|
19 |
# -met method to override control adaboost |
|
|
20 |
# -arg* arguments to override control model params None |
|
|
21 |
# -a* features to add None |
|
|
22 |
# -c* features to change None |
|
|
23 |
# -r* features to remove None |
|
|
24 |
# -g the grid of features None |
|
|
25 |
# |
|
|
26 |
# The keys with the * indicate that the string can be formatted according to the grid. |
|
|
27 |
# For example, I could pass the following: |
|
|
28 |
# -arg "{'n_estimators' : <<N>>}" -g "{'N' : [5, 50, 500]}" |
|
|
29 |
# which would mean that the model would be run 3 times, with n_estimators replaced each time. |
|
|
30 |
# |
|
|
31 |
################################ |
|
|
32 |
|
|
|
33 |
|
|
|
34 |
|
|
|
35 |
# CSV file that queue_test reads (to count existing lines) and appends rows to.
# The path is relative, so it is resolved against the current working directory.
queue_file_path = "../experiments/experiments.csv"

# Import statement that queue_grid_search puts in front of any user-supplied
# dependency string before queueing a test.
base_dependency = (
    "from model_builder import build_model, regex_baseline, "
    "control_features, control_groups, struct_baseline, "
    "adaboost_baseline, lr_baseline"
)
|
|
38 |
|
|
|
39 |
def queue_test(message="", num_patients=900, cv_splits=5,
               build_model_string="build_model(regex_baseline)",
               dependencies="from model_builder import build_model, regex_baseline"):
    """Append one test row to the queue CSV at ``queue_file_path``.

    The row written is::

        [test_id, message, cv_splits, patients, dependencies,
         build_model_string, '0']

    where ``test_id`` is the number of lines already present in the file and
    ``patients`` is ``num_patients`` capped at 906.

    NOTE(review): the trailing ``'0'`` column is presumably a "not yet run"
    marker consumed elsewhere -- confirm against the queue reader.

    Args:
        message: free-text description stored with the test.
        num_patients: requested number of patients (values above 906 are
            stored as 906).
        cv_splits: number of cross-validation splits.
        build_model_string: source text describing the model to build.
        dependencies: source text of the imports the model string needs.
    """
    with open(queue_file_path, 'r+b') as handle:
        # Walking over every existing line yields the next id and also leaves
        # the file position at end-of-file, so the row below is appended.
        test_id = 0
        for _ in handle:
            test_id += 1

        capped_patients = min(num_patients, 906)
        row = [test_id, message, cv_splits, capped_patients,
               dependencies, build_model_string, '0']
        csv.writer(handle).writerow(row)
|
|
44 |
|
|
|
45 |
def queue_grid_search(grid=None, message="", num_patients=900, cv_splits=5,
                      dependencies="", control_string='regex_baseline',
                      method_string='None', model_args='None',
                      features_add_string='None',
                      features_change_string='None',
                      features_remove_string='None'):
    """Queue one test per combination of grid-expanded argument strings.

    ``model_args`` and the three ``features_*_string`` templates are each
    expanded with ``build_options`` when ``grid`` is given (otherwise each is
    used as-is).  Every combination of the expanded values is turned into a
    model string with ``make_model_string`` and handed to ``queue_test``.

    Args:
        grid: mapping of placeholder name -> list of values, or ``None``.
        message: description stored with every queued test.
        num_patients: number of patients passed through to ``queue_test``.
        cv_splits: number of CV splits passed through to ``queue_test``.
        dependencies: extra dependency source appended after
            ``base_dependency`` (joined with ``"; "``).
        control_string: expression naming the control model.
        method_string: method name, or the string ``'None'`` to omit it.
        model_args: model-argument template, or ``'None'`` to omit it.
        features_add_string: features-to-add template, or ``'None'``.
        features_change_string: features-to-change template, or ``'None'``.
        features_remove_string: features-to-remove template, or ``'None'``.
    """
    templates = (model_args, features_add_string,
                 features_change_string, features_remove_string)

    if grid is None:
        option_lists = [[template] for template in templates]
    else:
        option_lists = [build_options(grid, template) for template in templates]

    # product() varies the last list fastest, i.e. the same order as nesting
    # the loops args -> add -> change -> remove.  All model strings are built
    # before anything is queued.
    model_strings = [
        make_model_string(control_string, method_string, args, add, change, remove)
        for args, add, change, remove in itertools.product(*option_lists)
    ]

    for model_string in model_strings:
        queue_test(message, num_patients, cv_splits, model_string,
                   base_dependency + "; " + dependencies)
|
|
66 |
|
|
|
67 |
def make_model_string(control, method, args, add, change, remove):
    """Build the source text of a ``build_model(...)`` call.

    ``control`` is always emitted.  Each of the remaining arguments is skipped
    when it equals the string ``'None'``.  Only ``method`` is wrapped in single
    quotes; every other value is inserted verbatim as an expression.

    The keyword arguments are joined with ``", "`` so the result never carries
    a dangling separator before the closing parenthesis (previously a missing
    ``remove`` produced e.g. ``build_model(control = x, )``).

    Args:
        control: expression naming the control model.
        method: method name, or ``'None'``.
        args: expression for ``model_args``, or ``'None'``.
        add: expression for ``features_add``, or ``'None'``.
        change: expression for ``features_change``, or ``'None'``.
        remove: expression for ``features_remove``, or ``'None'``.

    Returns:
        The call as a string, e.g.
        ``"build_model(control = regex_baseline, method = 'SVM')"``.
    """
    call_args = ["control = " + control]

    if method != 'None':
        call_args.append("method = '" + method + "'")

    optional_expressions = (
        ("model_args", args),
        ("features_add", add),
        ("features_change", change),
        ("features_remove", remove),
    )
    for keyword, expression in optional_expressions:
        if expression != 'None':
            call_args.append(keyword + " = " + expression)

    return "build_model(" + ", ".join(call_args) + ")"
|
|
82 |
|
|
|
83 |
def update_string(mapping, string):
    """Fill the ``<<name>>`` placeholders of ``string`` from ``mapping``.

    Literal braces in ``string`` are kept as-is; only ``<<name>>`` markers are
    substituted, using ``str.format`` with ``mapping`` as keyword arguments.

    Args:
        mapping: dict of placeholder name -> value.
        string: template text containing zero or more ``<<name>>`` markers.

    Returns:
        The template with every marker replaced by its formatted value.

    Raises:
        KeyError: if a marker names a key that is missing from ``mapping``.
    """
    # Double the literal braces first so str.format leaves them alone ...
    escaped = string.replace('{', '{{').replace('}', '}}')
    # ... then turn the <<name>> markers into {name} replacement fields.
    template = escaped.replace('<<', '{').replace('>>', '}')
    return template.format(**mapping)
|
|
88 |
|
|
|
89 |
def build_options(mapping, string):
    """Expand a template into every variant the grid allows.

    Only keys whose ``<<key>>`` marker actually occurs in ``string`` take part
    in the expansion.  For each combination of their values the template is
    filled in with ``update_string``.

    Duplicates are removed while keeping first-seen order, so the result
    follows the order of the grid values.  (The previous ``list(set(...))``
    returned the variants in arbitrary hash order.)

    Args:
        mapping: dict of placeholder name -> iterable of candidate values.
        string: template text, possibly containing ``<<key>>`` markers.

    Returns:
        A list of expanded strings; ``[string]`` when no marker of
        ``mapping`` appears in the template.
    """
    relevant_keys = [key for key in mapping if "<<" + key + ">>" in string]
    if not relevant_keys:
        return [string]

    value_lists = [mapping[key] for key in relevant_keys]

    options = []
    seen = set()
    for combo in itertools.product(*value_lists):
        option = update_string(dict(zip(relevant_keys, combo)), string)
        if option not in seen:
            seen.add(option)
            options.append(option)
    return options
|
|
102 |
|
|
|
103 |
def main():
    """Parse ``sys.argv`` and queue the requested grid of tests.

    A lone ``-h`` or ``help`` argument prints the help text.  Otherwise the
    arguments must come as ``flag value`` pairs; each recognised flag is
    translated to the matching ``queue_grid_search`` keyword argument.  The
    values of ``-g``, ``-n`` and ``-cv`` are evaluated as Python expressions,
    and ``-n`` is additionally capped at 906.

    Raises:
        ValueError: if the number of arguments is odd, or a flag is unknown.
    """
    inputs = sys.argv[1:]
    if len(inputs) == 1 and inputs[0] in ['-h', 'help']:
        show_help()
        return

    if len(inputs) % 2 == 1:
        raise ValueError("Uninterpretable input: " + str(inputs))

    args_converter = {
        "-con": "control_string",
        "-a": "features_add_string",
        "-met": 'method_string',
        '-arg': 'model_args',
        '-c': 'features_change_string',
        "-g": "grid",
        "-mes": "message",
        "-n": "num_patients",
        "-r": "features_remove_string",
        "-cv": "cv_splits",
        "-d": "dependencies",
    }
    evaluated_keys = ('-g', '-n', '-cv')

    queue_args = dict()
    # Pair each flag with the value that follows it.  Slicing avoids the
    # `len(inputs) / 2` index arithmetic, which yields a float (and makes
    # range() fail) under true division.
    for key, value in zip(inputs[0::2], inputs[1::2]):
        if key not in args_converter:
            raise ValueError("Unkown key: " + key)
        if key in evaluated_keys:
            # NOTE(security): eval() executes arbitrary code taken from the
            # command line.  Acceptable only while this script is run by a
            # trusted user; consider ast.literal_eval if that ever changes.
            value = eval(value)
            if key == '-n':
                value = min(906, value)
        queue_args[args_converter[key]] = value

    queue_grid_search(**queue_args)
|
|
125 |
def show_help():
    """Print the command-line help text to standard output.

    The text lists every flag with its description and default, explains the
    ``<<NAME>>`` grid placeholders, and names the variables from
    ``model_builder.py`` that can be used in argument strings.  ANSI colour
    escape sequences are embedded directly in the lines.

    Each line is emitted with a single-argument ``print(...)`` call, which
    behaves the same whether ``print`` is a statement or a function.
    """
    help_lines = (
        "\033[95m Keys Description Default\033[0m",
        ' -mes message/description ""',
        ' -n number of patients 900',
        ' -cv number of CV splits 5',
        ' -d dependency string to be executed "" (this dependency is only added on to default)',
        ' -con control regex_baseline',
        ' -met method to override control adaboost',
        ' -arg* arguments to override control model params None',
        ' -a* features to add None',
        ' -c* features to change None',
        ' -r* features to remove None',
        ' -g the grid of features None',
        '\033[92m The keys with the * indicate that the string can be formatted according to the grid.',
        ' For example, I could pass the following:',
        ' -arg "{\'n_estimators\' : <<N>>}" -g "{\'N\' : [5, 50, 500]}"',
        ' which would mean that the model would be ran 3 times, with n_estimators replaced each time.\033[0m',
        '\033[96m You also have the following variables availible to use (they are in model_builder.py):\033[0m',
        ' control_features a dict of all features and their (name, class, args) tuple',
        ' control_groups a dict of some groups of names, which are "regex", "structured_only", "notes_tfidf", and "labs" for now',
        ' adaboost_baseline (for -con) baseline with no features but has adaboost with 500 weak learners',
        ' struct_baseline (for -con) baseline with Enc, Sex, lab_values, ICD9, has adaboost with 200 weak learners',
        ' lr_baseline (for -con) baseline with no features but has logisitic regression',
        ' regex_baseline (for -con) adaboost_baseline with control_groups["regex"] loaded',
        # This header now ends with the reset code, like the cyan header
        # above; it used to end with a second \033[96m, which left the two
        # example lines below coloured cyan.
        '\033[96m Use these in combintion with grid to easily gain control over our experiments:\033[0m',
        ' Run control, without regex features, without notes: -r "<<G>>" -g "{\'G\' : [\'None\', \'control_groups[\\"regex\\"]\', \'control_groups[\\"notes_tfidf\\"]\']}"',
        ' Run with regex added in, labs added in: -a "[control_features[x] for x in control_groups[\'<<G>>\']]" -g "{\'G\' : [\'regex\', \'labs\']}"',
        ' \033[92m See model_builder.py for more information about these structures and how to handle FeaturePipeline changes',
        '\033[0m',
    )
    for line in help_lines:
        print(line)
|
|
155 |
# Run the command-line interface only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()