NLP_CRT / Git / Diff of /queue

Models:

philipB/

NLP_CRT

Downloads: 1

Diff of /queue_test.py [000000] .. [8d2107]

Switch to unified view

 b/queue_test.py
+import sys
+import csv
+import itertools
+################################
+#
+# Description: adds a test to the queue
+#
+# example of queue:
+#   python queue_test.py -mes 'This is a test' -n 500 -cv 3 -d 'from model_builder import build_model, regex_baseline' -r "build_model(regex_baseline, method = 'SVM', features_remove = ['NYHA'], features_add = {'min_ef' : (EFTransformer, {'method':'min', 'num_horizon': 2})})"
+#
+#  Keys    Description                                 Default
+#   -h      displays help message
+#   -mes    message/description                         ""
+#   -n      number of patients                          900
+#   -cv     number of CV splits                         5
+#   -d      dependency string to be executed            "" (this dependency is only added on to default)
+#   -con    control                                     regex_baseline
+#   -met    method to override control                  adaboost
+#   -arg*   arguments to override control model params  None
+#   -a*     features to add                             None
+#   -c*     features to change                          None
+#   -r*     features to remove                          None
+#   -g      the grid of features                        None
+#
+# The keys with the * indicate that the string can be formatted according to the grid.
+# For example, I could pass the following:
+#   -arg "{'n_estimators' : <<N>>}" -g "{'N' : [5, 50, 500]}"
+# which would mean that the model would be ran 3 times, with n_estimators replaced each time.
+#
+################################
+queue_file_path =  "../experiments/experiments.csv"
+base_dependency = "from model_builder import build_model, regex_baseline, control_features, control_groups, struct_baseline, adaboost_baseline, lr_baseline"
+def queue_test(message = "", num_patients = 900, cv_splits = 5, build_model_string = "build_model(regex_baseline)", dependencies = "from model_builder import build_model, regex_baseline"):
+    with open(queue_file_path, 'r+b') as queue_file:
+        test_id = sum(1 for line in queue_file)
+        writer = csv.writer(queue_file)
+        writer.writerow([test_id, message, cv_splits, min(num_patients, 906), dependencies, build_model_string, '0'])
+def queue_grid_search(grid = None, message = "", num_patients = 900, cv_splits = 5, dependencies = "", control_string = 'regex_baseline', method_string = 'None', model_args = 'None', features_add_string = 'None', features_change_string = 'None', features_remove_string = 'None'):
+    if not grid == None:
+        args_options = build_options(grid, model_args)
+        add_options = build_options(grid, features_add_string)
+        change_options = build_options(grid, features_change_string)
+        remove_options = build_options(grid, features_remove_string)
+    else:
+        args_options = [model_args]
+        add_options = [features_add_string]
+        change_options = [features_change_string]
+        remove_options = [features_remove_string]
+    results = []
+    for args in args_options:
+        for add in add_options:
+            for change in change_options:
+                for remove in remove_options:
+                    results += [make_model_string(control_string, method_string, args, add, change, remove)]
+    for model_string in results:
+        queue_test(message, num_patients, cv_splits, model_string, base_dependency  + "; " + dependencies)
+def make_model_string(control, method, args, add, change, remove):
+    result = "build_model("
+    result += "control = " + control + ", "
+    if not method == 'None':
+        result += "method = '" + method + "', "
+    if not args == 'None':
+        result += "model_args = " + args + ", "
+    if not add == 'None':
+        result += "features_add = " + add + ", "
+    if not change == 'None':
+        result += "features_change = " + change + ", "
+    if not remove == 'None':
+        result += "features_remove = " + remove
+    result += ")"
+    return result
+def update_string(mapping, string):
+    replace_pairs = [('{', '{{'), ('}', '}}'), ('<<', '{'), ('>>', '}')]
+    for replace_pair in replace_pairs:
+        string = string.replace(replace_pair[0], replace_pair[1])
+    return string.format(**mapping)
+def build_options(mapping, string):
+    result = []
+    keys = list(mapping.keys())
+    relevant_keys = [key for key in keys if "<<" + key + ">>" in string]
+    if len(relevant_keys) > 0:
+        values = [mapping[key] for key in relevant_keys]
+        combo_iterator = itertools.product(*values)
+        for combo in combo_iterator:
+            single_map = dict(zip(relevant_keys, combo))
+            result += [update_string(single_map, string)]
+        return list(set(result))
+    else:
+        return [string]
+def main():
+    inputs = sys.argv[1:]
+    if len(inputs) == 1 and inputs[0] in ['-h', 'help']:
+       show_help()
+    else:
+        if len(inputs) % 2 == 1:
+            raise ValueError("Uninterpretable input: " + str(inputs))
+        queue_args = dict()
+        args_converter = {"-con" : "control_string", "-a" : "features_add_string", "-met" : 'method_string', '-arg' : 'model_args', '-c' : 'features_change_string', "-g" : "grid", "-mes" : "message", "-n" : "num_patients", "-r" : "features_remove_string", "-cv" : "cv_splits", "-d" : "dependencies"}
+        for i in range(len(inputs) / 2):
+            key = inputs[2*i]
+            value = inputs[2*i + 1]
+            if key in args_converter:
+                if key in ['-g', '-n', '-cv' ]:
+                    value = eval(value) if key != '-n' else min(906, eval(value))
+                queue_args[args_converter[key]] = value
+            else:
+                raise ValueError("Unkown key: " + key)
+        queue_grid_search(**queue_args)
+def show_help():
+    print "\033[95m   Keys    Description                                 Default\033[0m"
+    print '   -mes    message/description                         ""'
+    print '   -n      number of patients                          900'
+    print '   -cv     number of CV splits                         5'
+    print '   -d      dependency string to be executed            "" (this dependency is only added on to default)'
+    print '   -con    control                                     regex_baseline'
+    print '   -met    method to override control                  adaboost'
+    print '   -arg*   arguments to override control model params  None'
+    print '   -a*     features to add                             None'
+    print '   -c*     features to change                          None'
+    print '   -r*     features to remove                          None'
+    print '   -g      the grid of features                        None'
+    print '\033[92m   The keys with the * indicate that the string can be formatted according to the grid.'
+    print '   For example, I could pass the following:'
+    print '         -arg "{\'n_estimators\' : <<N>>}" -g "{\'N\' : [5, 50, 500]}"'
+    print '   which would mean that the model would be ran 3 times, with n_estimators replaced each time.\033[0m'
+    print '\033[96m   You also have the following variables availible to use (they are in model_builder.py):\033[0m'
+    print '   control_features      a dict of all features and their (name, class, args) tuple'
+    print '   control_groups        a dict of some groups of names, which are "regex", "structured_only", "notes_tfidf", and "labs" for now'
+    print '   adaboost_baseline     (for -con) baseline with no features but has adaboost with 500 weak learners'
+    print '   struct_baseline       (for -con) baseline with Enc, Sex, lab_values, ICD9, has adaboost with 200 weak learners'
+    print '   lr_baseline           (for -con) baseline with no features but has logisitic regression'
+    print '   regex_baseline        (for -con) adaboost_baseline with control_groups["regex"] loaded'
+    print '\033[96m   Use these in combintion with grid to easily gain control over our experiments:\033[96m'
+    print '   Run control, without regex features, without notes: -r "<<G>>" -g "{\'G\' : [\'None\', \'control_groups[\\"regex\\"]\', \'control_groups[\\"notes_tfidf\\"]\']}"'
+    print '   Run with regex added in, labs added in: -a "[control_features[x] for x in control_groups[\'<<G>>\']]" -g "{\'G\' : [\'regex\', \'labs\']}"'
+    print '  \033[92m See model_builder.py for more information about these structures and how to handle FeaturePipeline changes'
+    print '\033[0m'
+if __name__ == '__main__':
+    main()