Diff of /analysis/analyze.py [000000] .. [23d48c]

Switch to side-by-side view

--- a
+++ b/analysis/analyze.py
@@ -0,0 +1,132 @@
"""Job launcher for quick ML analyses of a dataset.

For each (seed, learner) pair this script builds a shell command that runs
``ml/<learner>.py`` against the dataset and appends results to per-learner
tab-separated output files.  Commands are either submitted to an LSF cluster
with ``bsub`` (``--lsf``) or run locally in parallel via joblib.
"""
import pandas as pd
import numpy as np
import argparse
import os, errno, sys

# BUG FIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; import the standalone joblib package when available and
# fall back to the vendored copy for old environments.
try:
    from joblib import Parallel, delayed
except ImportError:
    from sklearn.externals.joblib import Parallel, delayed


if __name__ == '__main__':
    # ------------------------------------------------------------------
    # parse command line arguments
    # ------------------------------------------------------------------
    parser = argparse.ArgumentParser(description="An analyst for quick ML applications.",
                                     add_help=False)
    parser.add_argument('INPUT_FILE', type=str,
                        help='Data file to analyze; ensure that the '
                        'target/label column is labeled as "class".')
    parser.add_argument('-h', '--help', action='help',
                        help='Show this help message and exit.')
    parser.add_argument('-ml', action='store', dest='LEARNERS', default=None, type=str,
                        help='Comma-separated list of ML methods to use (should '
                        'correspond to a py file name in learners/)')
    parser.add_argument('--lsf', action='store_true', dest='LSF', default=False,
                        help='Run on an LSF HPC (using bsub commands)')
    parser.add_argument('-metric', action='store', dest='METRIC', default='f1_macro',
                        type=str, help='Metric to compare algorithms')
    parser.add_argument('-n_jobs', action='store', dest='N_JOBS', default=4, type=int,
                        help='Number of parallel jobs')
    # BUG FIX: the help strings for -n_trials and -trials were copy-pasted
    # from -n_jobs ('Number of parallel jobs'); describe what they really do.
    parser.add_argument('-n_trials', action='store', dest='N_TRIALS', default=1, type=int,
                        help='Number of repeated trials (one fixed seed per trial)')
    parser.add_argument('-trials', action='store', dest='TRIALS', default='', type=str,
                        help='Comma-separated list of seeds to run '
                        '(overrides -n_trials)')
    parser.add_argument('-rs', action='store', dest='RANDOM_STATE', default=None, type=int,
                        help='random state')
    parser.add_argument('-label', action='store', dest='LABEL', default='class', type=str,
                        help='Name of class label column')
    parser.add_argument('-results', action='store', dest='RDIR', type=str,
                        help='Results directory',
                        default='/project/moore/users/lacava/geis-ehr/results/')
    parser.add_argument('-q', action='store', dest='QUEUE', default='moore_normal',
                        type=str, help='LSF queue')
    parser.add_argument('-m', action='store', dest='M', default=4096, type=int,
                        help='LSF memory request and limit (MB)')
    parser.add_argument('--norare', action='store_true', dest='NORARE', default=False,
                        help='do not include rare labs')

    args = parser.parse_args()

    # 10 fixed, randomly generated seeds; at most N_TRIALS of them are used
    seeds = [11085, 14899, 27164, 3674, 16612, 9168, 7016, 30993, 4180, 1188]

    if args.TRIALS:
        # an explicit seed list overrides -n_trials
        seeds = [int(t) for t in args.TRIALS.split(',')]
        args.N_TRIALS = len(seeds)
    else:
        seeds = seeds[:args.N_TRIALS]

    # BUG FIX: -ml defaults to None, so omitting it crashed with an opaque
    # AttributeError on .split(); fail with a clear usage message instead.
    if not args.LEARNERS:
        parser.error('-ml is required (comma-separated list of learners)')
    learners = args.LEARNERS.split(',')

    # directory containing the per-learner runner scripts (ml/<learner>.py)
    model_dir = 'ml'

    dataset = args.INPUT_FILE.split('/')[-1]
    dataset_prefix = 'geis' + dataset.split('icd9')[-1]
    # NOTE(review): INPUT_FILE is used as a directory containing files named
    # <dataset_prefix>* -- confirm against the learner scripts' expectations.
    dataset_path = args.INPUT_FILE + '/' + dataset_prefix
    print('dataset:', dataset_prefix)
    print('dataset path:', dataset_path)

    results_path = '/'.join([args.RDIR, dataset]) + '/'
    # make the results_path directory if it doesn't exist
    # (exist_ok replaces the old try/except errno.EEXIST dance)
    os.makedirs(results_path, exist_ok=True)

    rarity = '_noRare' if args.NORARE else ''

    def _init_output(path, header):
        """Write *header* to *path* only if the file does not exist yet, so
        results from repeated runs keep appending to the same file."""
        if not os.path.isfile(path):
            with open(path, 'w') as out:
                out.write(header)

    # initialize output files with their column headers
    for ml in learners:
        save_file = results_path + '/' + dataset + '_' + ml + rarity + '.csv'
        feat_file = save_file.split('.csv')[0] + '.imp_score'
        roc_file = save_file.split('.csv')[0] + '.roc'

        _init_output(feat_file,
                     'algorithm\talg-parameters\tseed\tfeature\tscore\n')
        _init_output(roc_file,
                     'algorithm\talg-parameters\tseed\tfpr\ttpr\tauc\n')
        _init_output(save_file,
                     'dataset\talgorithm\tparameters\tseed\taccuracy\tf1_macro'
                     '\tbal_accuracy\troc_auc\ttime\n')

    # ------------------------------------------------------------------
    # build one run command per (seed, learner) pair
    # ------------------------------------------------------------------
    all_commands = []
    job_info = []
    for random_state in seeds:
        print('random_seed:', random_state)
        for ml in learners:
            save_file = results_path + '/' + dataset + '_' + ml + rarity + '.csv'

            # OMP_NUM_THREADS caps thread use of numeric libs in the child job
            all_commands.append(
                'OMP_NUM_THREADS={N_CORES}; '
                'export OMP_NUM_THREADS; '
                'python -u {ML}.py {DATASET} {SAVEFILE} {RS} {RA}'.format(
                    ML=model_dir + '/' + ml,
                    DATASET=dataset_path,
                    SAVEFILE=save_file,
                    RS=random_state,
                    N_CORES=args.N_JOBS,
                    RA=not args.NORARE))

            job_info.append({'ml': ml, 'dataset': dataset,
                             'results_path': results_path})

    if args.LSF:    # submit each command to the cluster via bsub
        for run_cmd, info in zip(all_commands, job_info):
            job_name = info['ml'] + '_' + info['dataset']
            # %J is expanded by LSF to the job id
            out_file = info['results_path'] + job_name + '_%J.out'

            bsub_cmd = ('bsub -o {OUT_FILE} -n {N_CORES} -J {JOB_NAME} -q {QUEUE} '
                        '-R "span[hosts=1] rusage[mem={M}]" -M {M} ').format(
                            OUT_FILE=out_file,
                            JOB_NAME=job_name,
                            QUEUE=args.QUEUE,
                            N_CORES=args.N_JOBS,
                            M=args.M)

            bsub_cmd += '"' + run_cmd + '"'
            print(bsub_cmd)
            os.system(bsub_cmd)     # submit job
    else:   # run locally, N_JOBS commands at a time
        Parallel(n_jobs=args.N_JOBS)(delayed(os.system)(run_cmd)
                                     for run_cmd in all_commands)