--- a +++ b/queue_test.py @@ -0,0 +1,156 @@ +import sys +import csv +import itertools + +################################ +# +# Description: adds a test to the queue +# +# example of queue: +# python queue_test.py -mes 'This is a test' -n 500 -cv 3 -d 'from model_builder import build_model, regex_baseline' -r "build_model(regex_baseline, method = 'SVM', features_remove = ['NYHA'], features_add = {'min_ef' : (EFTransformer, {'method':'min', 'num_horizon': 2})})" +# +# Keys Description Default +# -h displays help message +# -mes message/description "" +# -n number of patients 900 +# -cv number of CV splits 5 +# -d dependency string to be executed "" (this dependency is only added on to default) +# -con control regex_baseline +# -met method to override control adaboost +# -arg* arguments to override control model params None +# -a* features to add None +# -c* features to change None +# -r* features to remove None +# -g the grid of features None +# +# The keys with the * indicate that the string can be formatted according to the grid. +# For example, I could pass the following: +# -arg "{'n_estimators' : <<N>>}" -g "{'N' : [5, 50, 500]}" +# which would mean that the model would be ran 3 times, with n_estimators replaced each time. +# +################################ + + + +queue_file_path = "../experiments/experiments.csv" + +base_dependency = "from model_builder import build_model, regex_baseline, control_features, control_groups, struct_baseline, adaboost_baseline, lr_baseline" + +def queue_test(message = "", num_patients = 900, cv_splits = 5, build_model_string = "build_model(regex_baseline)", dependencies = "from model_builder import build_model, regex_baseline"): + with open(queue_file_path, 'r+b') as queue_file: + test_id = sum(1 for line in queue_file) + writer = csv.writer(queue_file) + writer.writerow([test_id, message, cv_splits, min(num_patients, 906), dependencies, build_model_string, '0']) + +def queue_grid_search(grid = None, message = "", num_patients = 900, cv_splits = 5, dependencies = "", control_string = 'regex_baseline', method_string = 'None', model_args = 'None', features_add_string = 'None', features_change_string = 'None', features_remove_string = 'None'): + + if not grid == None: + args_options = build_options(grid, model_args) + add_options = build_options(grid, features_add_string) + change_options = build_options(grid, features_change_string) + remove_options = build_options(grid, features_remove_string) + else: + args_options = [model_args] + add_options = [features_add_string] + change_options = [features_change_string] + remove_options = [features_remove_string] + results = [] + for args in args_options: + for add in add_options: + for change in change_options: + for remove in remove_options: + results += [make_model_string(control_string, method_string, args, add, change, remove)] + + for model_string in results: + queue_test(message, num_patients, cv_splits, model_string, base_dependency + "; " + dependencies) + +def make_model_string(control, method, args, add, change, remove): + result = "build_model(" + result += "control = " + control + ", " + if not method == 'None': + result += "method = '" + method + "', " + if not args == 'None': + result += "model_args = " + args + ", " + if not add == 'None': + result += "features_add = " + add + ", " + if not change == 'None': + result += "features_change = " + change + ", " + if not remove == 'None': + result += "features_remove = " + remove + result += ")" + return result + +def update_string(mapping, string): + replace_pairs = [('{', '{{'), ('}', '}}'), ('<<', '{'), ('>>', '}')] + for replace_pair in replace_pairs: + string = string.replace(replace_pair[0], replace_pair[1]) + return string.format(**mapping) + +def build_options(mapping, string): + result = [] + keys = list(mapping.keys()) + relevant_keys = [key for key in keys if "<<" + key + ">>" in string] + if len(relevant_keys) > 0: + values = [mapping[key] for key in relevant_keys] + combo_iterator = itertools.product(*values) + for combo in combo_iterator: + single_map = dict(zip(relevant_keys, combo)) + result += [update_string(single_map, string)] + return list(set(result)) + else: + return [string] + +def main(): + inputs = sys.argv[1:] + if len(inputs) == 1 and inputs[0] in ['-h', 'help']: + show_help() + else: + if len(inputs) % 2 == 1: + raise ValueError("Uninterpretable input: " + str(inputs)) + + queue_args = dict() + args_converter = {"-con" : "control_string", "-a" : "features_add_string", "-met" : 'method_string', '-arg' : 'model_args', '-c' : 'features_change_string', "-g" : "grid", "-mes" : "message", "-n" : "num_patients", "-r" : "features_remove_string", "-cv" : "cv_splits", "-d" : "dependencies"} + + for i in range(len(inputs) / 2): + key = inputs[2*i] + value = inputs[2*i + 1] + if key in args_converter: + if key in ['-g', '-n', '-cv' ]: + value = eval(value) if key != '-n' else min(906, eval(value)) + queue_args[args_converter[key]] = value + else: + raise ValueError("Unkown key: " + key) + + queue_grid_search(**queue_args) +def show_help(): + + print "\033[95m Keys Description Default\033[0m" + print ' -mes message/description ""' + print ' -n number of patients 900' + print ' -cv number of CV splits 5' + print ' -d dependency string to be executed "" (this dependency is only added on to default)' + print ' -con control regex_baseline' + print ' -met method to override control adaboost' + print ' -arg* arguments to override control model params None' + print ' -a* features to add None' + print ' -c* features to change None' + print ' -r* features to remove None' + print ' -g the grid of features None' + print '\033[92m The keys with the * indicate that the string can be formatted according to the grid.' + print ' For example, I could pass the following:' + print ' -arg "{\'n_estimators\' : <<N>>}" -g "{\'N\' : [5, 50, 500]}"' + print ' which would mean that the model would be ran 3 times, with n_estimators replaced each time.\033[0m' + print '\033[96m You also have the following variables availible to use (they are in model_builder.py):\033[0m' + print ' control_features a dict of all features and their (name, class, args) tuple' + print ' control_groups a dict of some groups of names, which are "regex", "structured_only", "notes_tfidf", and "labs" for now' + print ' adaboost_baseline (for -con) baseline with no features but has adaboost with 500 weak learners' + print ' struct_baseline (for -con) baseline with Enc, Sex, lab_values, ICD9, has adaboost with 200 weak learners' + print ' lr_baseline (for -con) baseline with no features but has logisitic regression' + print ' regex_baseline (for -con) adaboost_baseline with control_groups["regex"] loaded' + print '\033[96m Use these in combintion with grid to easily gain control over our experiments:\033[96m' + print ' Run control, without regex features, without notes: -r "<<G>>" -g "{\'G\' : [\'None\', \'control_groups[\\"regex\\"]\', \'control_groups[\\"notes_tfidf\\"]\']}"' + print ' Run with regex added in, labs added in: -a "[control_features[x] for x in control_groups[\'<<G>>\']]" -g "{\'G\' : [\'regex\', \'labs\']}"' + print ' \033[92m See model_builder.py for more information about these structures and how to handle FeaturePipeline changes' + print '\033[0m' +if __name__ == '__main__': + main()