Diff of /queue_test.py [000000] .. [8d2107]

Switch to unified view

a b/queue_test.py
1
import sys
2
import csv
3
import itertools
4
5
################################
6
#
7
# Description: adds a test to the queue
8
#
9
# example of queue:
10
#   python queue_test.py -mes 'This is a test' -n 500 -cv 3 -d 'from model_builder import build_model, regex_baseline' -r "build_model(regex_baseline, method = 'SVM', features_remove = ['NYHA'], features_add = {'min_ef' : (EFTransformer, {'method':'min', 'num_horizon': 2})})"
11
#
12
#  Keys    Description                                 Default                    
13
#   -h      displays help message                       
14
#   -mes    message/description                         ""
15
#   -n      number of patients                          900
16
#   -cv     number of CV splits                         5
17
#   -d      dependency string to be executed            "" (this dependency is only added on to default)
18
#   -con    control                                     regex_baseline
19
#   -met    method to override control                  adaboost
20
#   -arg*   arguments to override control model params  None
21
#   -a*     features to add                             None
22
#   -c*     features to change                          None
23
#   -r*     features to remove                          None
24
#   -g      the grid of features                        None
25
#
26
# The keys with the * indicate that the string can be formatted according to the grid. 
27
# For example, I could pass the following:
28
#   -arg "{'n_estimators' : <<N>>}" -g "{'N' : [5, 50, 500]}"
29
# which would mean that the model would be run 3 times, with n_estimators replaced each time.
30
#
31
################################
32
33
34
35
# CSV file that queue_test() reads to count existing rows and then appends to.
# The path is relative, so it is resolved against the current working directory.
queue_file_path = "../experiments/experiments.csv"

# Import statement that queue_grid_search() places in front of any
# user-supplied dependency string for every test it queues.
base_dependency = (
    "from model_builder import build_model, regex_baseline, control_features, "
    "control_groups, struct_baseline, adaboost_baseline, lr_baseline"
)
38
39
def queue_test(message="", num_patients=900, cv_splits=5,
               build_model_string="build_model(regex_baseline)",
               dependencies="from model_builder import build_model, regex_baseline"):
    """Append one test row to the queue CSV at ``queue_file_path``.

    The row written is
    ``[test_id, message, cv_splits, min(num_patients, 906), dependencies,
    build_model_string, '0']``.

    ``test_id`` is the number of lines already present in the file (any header
    line is counted too).  The trailing ``'0'`` is presumably a "not yet run"
    flag and 906 presumably the size of the patient cohort -- confirm against
    whatever consumes the queue.

    Args:
        message: free-text description of the test.
        num_patients: requested number of patients; capped at 906.
        cv_splits: number of cross-validation splits.
        build_model_string: source text of the ``build_model(...)`` call.
        dependencies: source text of the imports the call needs.

    Raises:
        IOError/OSError: if the queue file does not exist (``r+`` never
            creates it).
    """
    # The csv module needs a binary handle on Python 2, but a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        queue_file = open(queue_file_path, 'r+b')
    else:
        queue_file = open(queue_file_path, 'r+', newline='')

    with queue_file:
        test_id = 0
        last_line = ''
        for last_line in queue_file:
            test_id += 1

        # Explicitly reposition at end-of-file before switching the handle
        # from reading to writing.
        queue_file.seek(0, 2)

        # If the existing file does not end with a line terminator, add one so
        # the new row is not glued onto the last existing row.
        if last_line and last_line[-1] not in '\r\n':
            queue_file.write('\r\n')

        writer = csv.writer(queue_file)
        writer.writerow([test_id, message, cv_splits, min(num_patients, 906),
                         dependencies, build_model_string, '0'])
44
45
def queue_grid_search(grid=None, message="", num_patients=900, cv_splits=5,
                      dependencies="", control_string='regex_baseline',
                      method_string='None', model_args='None',
                      features_add_string='None',
                      features_change_string='None',
                      features_remove_string='None'):
    """Queue one test for every combination of grid-expanded options.

    ``model_args`` and the three ``features_*_string`` arguments are templates
    that may contain ``<<KEY>>`` placeholders.  When ``grid`` is given, each
    template is expanded independently with ``build_options``; otherwise each
    template is used as is.  One model string is built per element of the
    cross product of the four option lists and handed to ``queue_test``.

    Args:
        grid: dict mapping placeholder names to lists of values, or None.
        message, num_patients, cv_splits: forwarded unchanged to queue_test.
        dependencies: extra import text, appended to ``base_dependency``
            after ``"; "``.
        control_string: expression naming the control model.
        method_string: method name overriding the control, or ``'None'``.
        model_args, features_add_string, features_change_string,
        features_remove_string: templates; the string ``'None'`` means
            "leave this argument out".
    """
    templates = (model_args, features_add_string,
                 features_change_string, features_remove_string)

    if grid is not None:
        option_lists = [build_options(grid, template) for template in templates]
    else:
        option_lists = [[template] for template in templates]

    # Build every model string before queueing anything, so a failure while
    # building leaves the queue file untouched.  product() varies the last
    # list fastest, matching the order of the equivalent nested loops.
    model_strings = [
        make_model_string(control_string, method_string, args, add, change, remove)
        for args, add, change, remove in itertools.product(*option_lists)
    ]

    combined_dependencies = base_dependency + "; " + dependencies
    for model_string in model_strings:
        queue_test(message, num_patients, cv_splits, model_string,
                   combined_dependencies)
66
67
def make_model_string(control, method, args, add, change, remove):
    """Return the source text of a ``build_model(...)`` call.

    Every argument is a string.  ``control`` is always emitted; each of the
    others is emitted only when it is not the literal string ``'None'``.
    ``method`` is wrapped in single quotes, the rest are inserted verbatim.

    The pieces are joined with ``", "`` so the result never carries a dangling
    comma before the closing parenthesis, regardless of which optional
    arguments are present.

    Returns:
        str, e.g. ``"build_model(control = regex_baseline, method = 'SVM')"``.
    """
    pieces = ["control = " + control]
    if method != 'None':
        pieces.append("method = '" + method + "'")

    verbatim_arguments = (
        ("model_args", args),
        ("features_add", add),
        ("features_change", change),
        ("features_remove", remove),
    )
    for keyword, value in verbatim_arguments:
        if value != 'None':
            pieces.append(keyword + " = " + value)

    return "build_model(" + ", ".join(pieces) + ")"
82
83
def update_string(mapping, string):
    """Replace every ``<<KEY>>`` placeholder in ``string`` with ``mapping[KEY]``.

    Placeholders are substituted directly rather than being rewritten into
    ``str.format`` fields, so:

    * keys that ``str.format`` would mis-parse (``'0'``, ``'a.b'``, ``'x:y'``)
      work like any other key;
    * literal braces, and a lone ``<<`` or ``>>``, pass through untouched.

    Args:
        mapping: dict of placeholder name -> value; values are rendered with
            ``format(value, '')``, which is what ``str.format`` inserts.
        string: template text.

    Returns:
        The template with all placeholders filled in.

    Raises:
        KeyError: if the template names a placeholder missing from ``mapping``.
    """
    import re  # local import: the file does not import re at module level

    def _fill(match):
        return format(mapping[match.group(1)], '')

    # Non-greedy, so two placeholders on one line are matched separately.
    return re.sub(r"<<(.*?)>>", _fill, string, flags=re.DOTALL)
88
89
def build_options(mapping, string):
    """Expand a template into every distinct string the grid can produce.

    Only the grid keys whose ``<<KEY>>`` placeholder actually occurs in
    ``string`` take part.  The cross product of their value lists is
    enumerated and each combination is substituted with ``update_string``.

    Args:
        mapping: dict of placeholder name -> list of candidate values.
        string: template text, possibly containing ``<<KEY>>`` placeholders.

    Returns:
        A list of expanded strings without duplicates, in the order
        ``itertools.product`` generates them (so the queue order is
        reproducible).  If no grid key occurs in the template, ``[string]``.
    """
    relevant_keys = [key for key in mapping if "<<" + key + ">>" in string]
    if not relevant_keys:
        return [string]

    value_lists = [mapping[key] for key in relevant_keys]
    options = []
    seen = set()
    for combo in itertools.product(*value_lists):
        option = update_string(dict(zip(relevant_keys, combo)), string)
        # Drop duplicates while keeping first-seen order.
        if option not in seen:
            seen.add(option)
            options.append(option)
    return options
102
103
def main():
    """Parse ``sys.argv`` as alternating ``key value`` pairs and queue tests.

    A single ``-h`` or ``help`` argument prints the usage text instead.
    Otherwise each key is translated to the matching ``queue_grid_search``
    keyword argument; the values of ``-g``, ``-n`` and ``-cv`` are evaluated
    as Python expressions and ``-n`` is additionally capped at 906.  All
    other values are passed through as strings.

    Raises:
        ValueError: on an odd number of arguments or an unrecognised key.
    """
    inputs = sys.argv[1:]
    if len(inputs) == 1 and inputs[0] in ['-h', 'help']:
        show_help()
        return

    if len(inputs) % 2 == 1:
        raise ValueError("Uninterpretable input: " + str(inputs))

    args_converter = {
        "-con": "control_string",
        "-a": "features_add_string",
        "-met": 'method_string',
        '-arg': 'model_args',
        '-c': 'features_change_string',
        "-g": "grid",
        "-mes": "message",
        "-n": "num_patients",
        "-r": "features_remove_string",
        "-cv": "cv_splits",
        "-d": "dependencies",
    }

    queue_args = dict()
    # Even positions hold the keys, odd positions their values.
    for key, value in zip(inputs[0::2], inputs[1::2]):
        if key not in args_converter:
            raise ValueError("Unknown key: " + key)
        if key in ('-g', '-n', '-cv'):
            # NOTE(security): eval() runs arbitrary code taken from the command
            # line.  Acceptable only while the caller is a trusted operator;
            # ast.literal_eval would be the safe choice if plain literals suffice.
            value = eval(value)
            if key == '-n':
                value = min(906, value)
        queue_args[args_converter[key]] = value

    queue_grid_search(**queue_args)
125
def show_help():
    """Print the command-line usage text to standard output.

    The text contains ANSI colour escape sequences (``\\033[..m``) and is
    emitted one entry per line, in the order listed below.
    """
    help_lines = (
        "\033[95m   Keys    Description                                 Default\033[0m",
        '   -mes    message/description                         ""',
        '   -n      number of patients                          900',
        '   -cv     number of CV splits                         5',
        '   -d      dependency string to be executed            "" (this dependency is only added on to default)',
        '   -con    control                                     regex_baseline',
        '   -met    method to override control                  adaboost',
        '   -arg*   arguments to override control model params  None',
        '   -a*     features to add                             None',
        '   -c*     features to change                          None',
        '   -r*     features to remove                          None',
        '   -g      the grid of features                        None',
        '\033[92m   The keys with the * indicate that the string can be formatted according to the grid.',
        '   For example, I could pass the following:',
        '         -arg "{\'n_estimators\' : <<N>>}" -g "{\'N\' : [5, 50, 500]}"',
        '   which would mean that the model would be ran 3 times, with n_estimators replaced each time.\033[0m',
        '\033[96m   You also have the following variables availible to use (they are in model_builder.py):\033[0m',
        '   control_features      a dict of all features and their (name, class, args) tuple',
        '   control_groups        a dict of some groups of names, which are "regex", "structured_only", "notes_tfidf", and "labs" for now',
        '   adaboost_baseline     (for -con) baseline with no features but has adaboost with 500 weak learners',
        '   struct_baseline       (for -con) baseline with Enc, Sex, lab_values, ICD9, has adaboost with 200 weak learners',
        '   lr_baseline           (for -con) baseline with no features but has logisitic regression',
        '   regex_baseline        (for -con) adaboost_baseline with control_groups["regex"] loaded',
        '\033[96m   Use these in combintion with grid to easily gain control over our experiments:\033[96m',
        '   Run control, without regex features, without notes: -r "<<G>>" -g "{\'G\' : [\'None\', \'control_groups[\\"regex\\"]\', \'control_groups[\\"notes_tfidf\\"]\']}"',
        '   Run with regex added in, labs added in: -a "[control_features[x] for x in control_groups[\'<<G>>\']]" -g "{\'G\' : [\'regex\', \'labs\']}"',
        '  \033[92m See model_builder.py for more information about these structures and how to handle FeaturePipeline changes',
        '\033[0m',
    )
    for help_line in help_lines:
        # Single-argument call form: prints the same text as a Python 2 print
        # statement and is also valid syntax on Python 3.
        print(help_line)
155
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()