Diff of /analysis/analyze.py [000000] .. [23d48c]

import pandas as pd
import numpy as np
import argparse
import os, errno, sys
from joblib import Parallel, delayed  # sklearn.externals.joblib was removed; use joblib directly


if __name__ == '__main__':
    # parse command line arguments
    parser = argparse.ArgumentParser(description="An analyst for quick ML applications.",
                                     add_help=False)
    parser.add_argument('INPUT_FILE', type=str,
                        help='Data file to analyze; ensure that the '
                        'target/label column is labeled as "class".')
    parser.add_argument('-h', '--help', action='help',
                        help='Show this help message and exit.')
    parser.add_argument('-ml', action='store', dest='LEARNERS', default=None, type=str,
            help='Comma-separated list of ML methods to use (each should correspond to a .py file name in ml/)')
    parser.add_argument('--lsf', action='store_true', dest='LSF', default=False,
            help='Run on an LSF HPC (using bsub commands)')
    parser.add_argument('-metric', action='store', dest='METRIC', default='f1_macro', type=str,
            help='Metric to compare algorithms')
    parser.add_argument('-n_jobs', action='store', dest='N_JOBS', default=4, type=int,
            help='Number of parallel jobs')
    parser.add_argument('-n_trials', action='store', dest='N_TRIALS', default=1, type=int,
            help='Number of trials (one random seed per trial)')
    parser.add_argument('-trials', action='store', dest='TRIALS', default='', type=str,
            help='Comma-separated list of random seeds to use as trials (overrides -n_trials)')
    parser.add_argument('-rs', action='store', dest='RANDOM_STATE', default=None, type=int,
            help='Random state')
    parser.add_argument('-label', action='store', dest='LABEL', default='class', type=str,
            help='Name of class label column')
    parser.add_argument('-results', action='store', dest='RDIR', type=str, help='Results directory',
                        default='/project/moore/users/lacava/geis-ehr/results/')
    parser.add_argument('-q', action='store', dest='QUEUE', default='moore_normal', type=str, help='LSF queue')
    parser.add_argument('-m', action='store', dest='M', default=4096, type=int, help='LSF memory request and limit (MB)')
    parser.add_argument('--norare', action='store_true', dest='NORARE', default=False,
                        help='Do not include rare labs')

    args = parser.parse_args()
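
    # Example invocation (the input path and learner names below are hypothetical):
    #   python analyze.py /path/to/input_data -ml lr,rf -n_trials 3 -n_jobs 8 \
    #       -results results/ --lsf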

    #if args.RANDOM_STATE:
    #    random_state = args.RANDOM_STATE
    #else:
    #    random_state = np.random.randint(2**15 - 1)
    # 10 fixed, randomly generated seeds
    seeds = [11085, 14899, 27164, 3674, 16612, 9168, 7016, 30993, 4180, 1188]

    if len(args.TRIALS) > 0:
        args.N_TRIALS = len(args.TRIALS.split(','))
        seeds = [int(t) for t in args.TRIALS.split(',')]
    else:
        seeds = seeds[:args.N_TRIALS]

    learners = args.LEARNERS.split(',')   # list of learner names

    model_dir = 'ml'

    dataset = args.INPUT_FILE.split('/')[-1]
    dataset_prefix = 'geis' + dataset.split('icd9')[-1]
    dataset_path = args.INPUT_FILE + '/' + dataset_prefix
    print('dataset:', dataset_prefix)
    print('dataset path:', dataset_path)

    results_path = '/'.join([args.RDIR, dataset]) + '/'
    # make the results_path directory if it doesn't exist
    try:
        os.makedirs(results_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    rarity = '_noRare' if args.NORARE else ''
    # initialize output files: per-learner .csv results, .imp_score feature scores, and .roc curve points
    for ml in learners:
        # write headers only if the files don't already exist
        save_file = results_path + '/' + dataset + '_' + ml + rarity + '.csv'
        feat_file = save_file.split('.csv')[0] + '.imp_score'
        roc_file = save_file.split('.csv')[0] + '.roc'

        if not os.path.isfile(feat_file):
            with open(feat_file, 'w') as out:
                out.write('algorithm\talg-parameters\tseed\tfeature\tscore\n')

        if not os.path.isfile(roc_file):
            with open(roc_file, 'w') as out:
                out.write('algorithm\talg-parameters\tseed\tfpr\ttpr\tauc\n')

        if not os.path.isfile(save_file):
            with open(save_file, 'w') as out:
                # out.write('dataset\talgorithm\tparameters\taccuracy\tf1_macro\tseed\tbal_accuracy\troc_auc\ttime\n')
                out.write('dataset\talgorithm\tparameters\tseed\taccuracy\tf1_macro\tbal_accuracy\troc_auc\ttime\n')

    # write run commands
    all_commands = []
    job_info = []
    for t in seeds:
        random_state = t
        print('random_seed:', random_state)
        for ml in learners:
            save_file = results_path + '/' + dataset + '_' + ml + rarity + '.csv'

            all_commands.append(
                    'OMP_NUM_THREADS={N_CORES}; '
                    'export OMP_NUM_THREADS; '
                    'python -u {ML}.py {DATASET} {SAVEFILE} {RS} {RA}'.format(ML=model_dir + '/' + ml,
                                                                              DATASET=dataset_path,
                                                                              SAVEFILE=save_file,
                                                                              RS=random_state,
                                                                              N_CORES=args.N_JOBS,
                                                                              RA=not args.NORARE))

            job_info.append({'ml': ml, 'dataset': dataset, 'results_path': results_path})

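    # Each generated command runs a learner script positionally, i.e.
    #   python -u ml/<learner>.py <dataset_path> <save_file> <random_state> <include_rare>
    # where <include_rare> is formatted as the string 'True' or 'False'; the
    # ml/<learner>.py scripts are assumed to read their arguments in that order.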
    if args.LSF:    # bsub commands
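        # bsub flags: -o output log (%J expands to the LSF job ID), -n core count,
        # -J job name, -q queue, -R "span[hosts=1] rusage[mem=...]" keeps all cores
        # on one host and reserves memory (MB), -M sets the memory limit.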
        for i, run_cmd in enumerate(all_commands):
            job_name = job_info[i]['ml'] + '_' + job_info[i]['dataset']
            out_file = job_info[i]['results_path'] + job_name + '_%J.out'
            # error_file = out_file[:-4] + '.err'

            bsub_cmd = ('bsub -o {OUT_FILE} -n {N_CORES} -J {JOB_NAME} -q {QUEUE} '
                        '-R "span[hosts=1] rusage[mem={M}]" -M {M} ').format(OUT_FILE=out_file,
                                                                             JOB_NAME=job_name,
                                                                             QUEUE=args.QUEUE,
                                                                             N_CORES=args.N_JOBS,
                                                                             M=args.M)

            bsub_cmd += '"' + run_cmd + '"'
            print(bsub_cmd)
            os.system(bsub_cmd)     # submit jobs

    else:   # run locally
        Parallel(n_jobs=args.N_JOBS)(delayed(os.system)(run_cmd) for run_cmd in all_commands)