[8d2107]: / queue_test.py

Download this file

157 lines (139 with data), 8.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import sys
import csv
import itertools
################################
#
# Description: adds a test to the queue
#
# example of queue:
# python queue_test.py -mes 'This is a test' -n 500 -cv 3 -d 'from model_builder import build_model, regex_baseline' -con regex_baseline -met SVM -r "['NYHA']" -a "{'min_ef' : (EFTransformer, {'method':'min', 'num_horizon': 2})}"
#
# Keys Description Default
# -h displays help message
# -mes message/description ""
# -n number of patients 900
# -cv number of CV splits 5
# -d dependency string to be executed "" (this dependency is only added on to default)
# -con control regex_baseline
# -met method to override control adaboost
# -arg* arguments to override control model params None
# -a* features to add None
# -c* features to change None
# -r* features to remove None
# -g the grid of features None
#
# The keys with the * indicate that the string can be formatted according to the grid.
# For example, I could pass the following:
# -arg "{'n_estimators' : <<N>>}" -g "{'N' : [5, 50, 500]}"
# which would mean that the model would be run 3 times, with n_estimators replaced each time.
#
################################
# CSV file that holds the experiment queue; being a relative path, it is
# resolved against the current working directory when opened.
queue_file_path = "../experiments/experiments.csv"

# Import statement that queue_grid_search puts in front of any user-supplied
# dependency string (the two are joined with "; ").
base_dependency = (
    "from model_builder import "
    "build_model, regex_baseline, control_features, control_groups, "
    "struct_baseline, adaboost_baseline, lr_baseline"
)
def queue_test(message="", num_patients=900, cv_splits=5,
               build_model_string="build_model(regex_baseline)",
               dependencies="from model_builder import build_model, regex_baseline",
               queue_path=None):
    """Append one experiment row to the queue CSV and return its id.

    The id is the number of lines already present in the file, so with a
    one-line header the first queued experiment gets id 1.

    Args:
        message: free-text description stored with the experiment.
        num_patients: requested number of patients; capped at 906
            (presumably the size of the available cohort -- confirm).
        cv_splits: number of cross-validation splits.
        build_model_string: source text of the model-building expression.
        dependencies: source text of the import statement(s) it needs.
        queue_path: CSV file to append to; defaults to the module-level
            ``queue_file_path``.

    Returns:
        The id written in the first column of the new row.

    Raises:
        IOError / OSError: if the queue file does not exist ('r+' never
            creates it).
    """
    path = queue_file_path if queue_path is None else queue_path

    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        queue_file = open(path, 'r+', newline='')
    else:
        queue_file = open(path, 'r+b')

    with queue_file:
        test_id = 0
        last_line = ''
        for last_line in queue_file:
            test_id += 1

        # Switching from reading to writing on the same handle needs an
        # explicit reposition; do not rely on the read having ended at EOF.
        queue_file.seek(0, 2)

        # If the file does not end with a line break, terminate the last
        # line first so the new row is not glued onto it.
        if last_line and not last_line.endswith(('\n', '\r')):
            queue_file.write('\r\n')

        # The trailing '0' is written verbatim (presumably a "not yet run"
        # status flag -- confirm against the queue consumer).
        csv.writer(queue_file).writerow([
            test_id,
            message,
            cv_splits,
            min(num_patients, 906),
            dependencies,
            build_model_string,
            '0',
        ])
    return test_id
def queue_grid_search(grid=None, message="", num_patients=900, cv_splits=5,
                      dependencies="", control_string='regex_baseline',
                      method_string='None', model_args='None',
                      features_add_string='None',
                      features_change_string='None',
                      features_remove_string='None'):
    """Queue one experiment per combination of grid-expanded arguments.

    ``model_args`` and the three ``features_*_string`` arguments are
    templates that may contain ``<<KEY>>`` placeholders.  When ``grid`` is
    given, each template is expanded with ``build_options``; otherwise each
    template is used as-is.  Every combination of the expanded values is
    turned into a model string with ``make_model_string`` and handed to
    ``queue_test``.

    Args:
        grid: dict mapping placeholder names to the values to try, or None.
        message, num_patients, cv_splits: passed through to ``queue_test``.
        dependencies: extra import text appended to ``base_dependency``
            after a "; " separator.
        control_string: source text naming the control model.
        method_string: method name, or the string 'None' to keep the
            control's method.
        model_args, features_add_string, features_change_string,
        features_remove_string: source-text templates, or the string
            'None' to omit that argument.
    """
    templates = (model_args, features_add_string,
                 features_change_string, features_remove_string)
    if grid is not None:
        option_lists = [build_options(grid, template) for template in templates]
    else:
        option_lists = [[template] for template in templates]

    # product() varies the last list fastest, i.e. the same order as nested
    # loops over args / add / change / remove.  All model strings are built
    # before anything is queued, so a failure here queues nothing.
    model_strings = [
        make_model_string(control_string, method_string, args, add, change, remove)
        for args, add, change, remove in itertools.product(*option_lists)
    ]

    full_dependencies = base_dependency + "; " + dependencies
    for model_string in model_strings:
        queue_test(message, num_patients, cv_splits, model_string, full_dependencies)
def make_model_string(control, method, args, add, change, remove):
    """Assemble the source text of a ``build_model(...)`` call.

    Every argument is a string of Python source.  An argument equal to the
    string 'None' is left out of the call; ``control`` is always included.
    ``method`` is wrapped in single quotes, the others are inserted
    verbatim.  Note that when ``remove`` is 'None' the text ends in ", )".
    """
    pieces = ["build_model(", "control = " + control + ", "]

    if method != 'None':
        pieces.append("method = '" + method + "', ")

    # These three keyword arguments share the same "name = value, " layout.
    verbatim_arguments = (
        ("model_args = ", args),
        ("features_add = ", add),
        ("features_change = ", change),
    )
    for prefix, value in verbatim_arguments:
        if value != 'None':
            pieces.append(prefix + value + ", ")

    # features_remove comes last and therefore carries no trailing comma.
    if remove != 'None':
        pieces.append("features_remove = " + remove)

    pieces.append(")")
    return "".join(pieces)
def update_string(mapping, string):
    """Fill the ``<<name>>`` placeholders of ``string`` from ``mapping``.

    Literal braces are doubled so that ``str.format`` leaves them alone,
    then each ``<<name>>`` marker is turned into a ``{name}`` format field
    and the result is formatted with ``mapping`` as keyword arguments.
    """
    # Order matters: escape the real braces before creating new ones.
    escaped = string.replace('{', '{{').replace('}', '}}')
    template = escaped.replace('<<', '{').replace('>>', '}')
    return template.format(**mapping)
def build_options(mapping, string):
    """Expand a ``<<KEY>>`` template into every concrete string of the grid.

    Args:
        mapping: dict from placeholder name to an iterable of values.
        string: template that may contain ``<<KEY>>`` placeholders.

    Returns:
        A list of strings.  If none of the mapping's keys occur in
        ``string`` the list holds just ``string`` itself.  Otherwise it
        holds one string per combination of the values of the keys that do
        occur, without duplicates, in the order the combinations are
        generated.
    """
    relevant_keys = [key for key in mapping if "<<" + key + ">>" in string]
    if not relevant_keys:
        return [string]

    value_lists = [mapping[key] for key in relevant_keys]

    # De-duplicate while keeping first-seen order: going through set() would
    # make the order of the queued experiments (and so their ids) vary from
    # run to run.
    seen = set()
    options = []
    for combo in itertools.product(*value_lists):
        option = update_string(dict(zip(relevant_keys, combo)), string)
        if option not in seen:
            seen.add(option)
            options.append(option)
    return options
def main():
    """Parse ``sys.argv`` and queue the requested experiment(s).

    A single ``-h`` or ``help`` argument prints the help text.  Otherwise
    the arguments must be ``flag value`` pairs; each flag is translated to
    the matching keyword argument of ``queue_grid_search``.

    Raises:
        ValueError: on an odd number of arguments or an unrecognised flag.
    """
    inputs = sys.argv[1:]
    if len(inputs) == 1 and inputs[0] in ['-h', 'help']:
        show_help()
        return

    if len(inputs) % 2 == 1:
        raise ValueError("Uninterpretable input: " + str(inputs))

    # Command-line flag -> queue_grid_search keyword argument.
    args_converter = {
        "-con": "control_string",
        "-a": "features_add_string",
        "-met": "method_string",
        "-arg": "model_args",
        "-c": "features_change_string",
        "-g": "grid",
        "-mes": "message",
        "-n": "num_patients",
        "-r": "features_remove_string",
        "-cv": "cv_splits",
        "-d": "dependencies",
    }

    queue_args = {}
    # Even positions hold flags, odd positions their values.  Slicing avoids
    # range(len(inputs) / 2), which is a float (TypeError) on Python 3.
    for key, value in zip(inputs[0::2], inputs[1::2]):
        if key not in args_converter:
            raise ValueError("Unkown key: " + key)
        if key in ('-g', '-n', '-cv'):
            # NOTE(security): eval() executes arbitrary code taken from the
            # command line.  Acceptable for a trusted local user; do not
            # expose this script to untrusted input.
            value = eval(value)
            if key == '-n':
                # Same 906 cap that queue_test applies.
                value = min(906, value)
        queue_args[args_converter[key]] = value

    queue_grid_search(**queue_args)
def show_help():
    """Print the command-line help, coloured with ANSI escape sequences.

    Each line is printed with a single-argument ``print(...)`` call, which
    behaves the same under the Python 2 print statement and the Python 3
    print function.
    """
    help_lines = (
        "\033[95m Keys Description Default\033[0m",
        ' -mes message/description ""',
        ' -n number of patients 900',
        ' -cv number of CV splits 5',
        ' -d dependency string to be executed "" (this dependency is only added on to default)',
        ' -con control regex_baseline',
        ' -met method to override control adaboost',
        ' -arg* arguments to override control model params None',
        ' -a* features to add None',
        ' -c* features to change None',
        ' -r* features to remove None',
        ' -g the grid of features None',
        '\033[92m The keys with the * indicate that the string can be formatted according to the grid.',
        ' For example, I could pass the following:',
        ' -arg "{\'n_estimators\' : <<N>>}" -g "{\'N\' : [5, 50, 500]}"',
        ' which would mean that the model would be ran 3 times, with n_estimators replaced each time.\033[0m',
        '\033[96m You also have the following variables availible to use (they are in model_builder.py):\033[0m',
        ' control_features a dict of all features and their (name, class, args) tuple',
        ' control_groups a dict of some groups of names, which are "regex", "structured_only", "notes_tfidf", and "labs" for now',
        ' adaboost_baseline (for -con) baseline with no features but has adaboost with 500 weak learners',
        ' struct_baseline (for -con) baseline with Enc, Sex, lab_values, ICD9, has adaboost with 200 weak learners',
        ' lr_baseline (for -con) baseline with no features but has logisitic regression',
        ' regex_baseline (for -con) adaboost_baseline with control_groups["regex"] loaded',
        # This heading now ends with the reset code, like the cyan heading
        # above; it used to end with the cyan code again, so the colour bled
        # into the example lines that follow.
        '\033[96m Use these in combintion with grid to easily gain control over our experiments:\033[0m',
        ' Run control, without regex features, without notes: -r "<<G>>" -g "{\'G\' : [\'None\', \'control_groups[\\"regex\\"]\', \'control_groups[\\"notes_tfidf\\"]\']}"',
        ' Run with regex added in, labs added in: -a "[control_features[x] for x in control_groups[\'<<G>>\']]" -g "{\'G\' : [\'regex\', \'labs\']}"',
        ' \033[92m See model_builder.py for more information about these structures and how to handle FeaturePipeline changes',
        '\033[0m',
    )
    for line in help_lines:
        print(line)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()