--- /dev/null
+++ b/analysis/analyze.py
@@ -0,0 +1,132 @@
+import pandas as pd
+import numpy as np
+import argparse
+import os, errno, sys
+from joblib import Parallel, delayed  # sklearn.externals.joblib is deprecated; import from joblib directly
+
+
+if __name__ == '__main__':
+    # parse command line arguments
+    parser = argparse.ArgumentParser(description="An analyst for quick ML applications.",
+                                     add_help=False)
+    parser.add_argument('INPUT_FILE', type=str,
+                        help='Data file to analyze; ensure that the '
+                        'target/label column is labeled as "class".')
+    parser.add_argument('-h', '--help', action='help',
+                        help='Show this help message and exit.')
+    parser.add_argument('-ml', action='store', dest='LEARNERS', default=None, type=str,
+                        help='Comma-separated list of ML methods to use (each should correspond to a .py file name under ml/)')
+    parser.add_argument('--lsf', action='store_true', dest='LSF', default=False,
+                        help='Run on an LSF HPC (using bsub commands)')
+    parser.add_argument('-metric', action='store', dest='METRIC', default='f1_macro', type=str,
+                        help='Metric to compare algorithms')
+    parser.add_argument('-n_jobs', action='store', dest='N_JOBS', default=4, type=int,
+                        help='Number of parallel jobs')
+    parser.add_argument('-n_trials', action='store', dest='N_TRIALS', default=1, type=int,
+                        help='Number of trials (repeated runs with different seeds)')
+    parser.add_argument('-trials', action='store', dest='TRIALS', default='', type=str,
+                        help='Comma-separated list of random seeds to use (overrides -n_trials)')
+    parser.add_argument('-rs', action='store', dest='RANDOM_STATE', default=None, type=int,
+                        help='Random state')
+    parser.add_argument('-label', action='store', dest='LABEL', default='class', type=str, help='Name of class label column')
+    parser.add_argument('-results', action='store', dest='RDIR', type=str, help='Results directory',
+                        default='/project/moore/users/lacava/geis-ehr/results/')
+    parser.add_argument('-q', action='store', dest='QUEUE', default='moore_normal', type=str, help='LSF queue')
+    parser.add_argument('-m', action='store', dest='M', default=4096, type=int, help='LSF memory request and limit (MB)')
+    parser.add_argument('--norare', action='store_true', dest='NORARE', default=False,
+                        help='Do not include rare labs')
+
+    args = parser.parse_args()
+
+    #if args.RANDOM_STATE:
+    #    random_state = args.RANDOM_STATE
+    #else:
+    #    random_state = np.random.randint(2**15 - 1)
+    # 10 fixed, randomly generated seeds
+    seeds = [11085, 14899, 27164, 3674, 16612, 9168, 7016, 30993, 4180, 1188]
+
+    if len(args.TRIALS) > 0:
+        args.N_TRIALS = len(args.TRIALS.split(','))
+        seeds = [int(t) for t in args.TRIALS.split(',')]
+    else:
+        seeds = seeds[:args.N_TRIALS]
+
+    learners = args.LEARNERS.split(',')  # ML methods to run
+
+    model_dir = 'ml'
+
+    dataset = args.INPUT_FILE.split('/')[-1]
+    dataset_prefix = 'geis' + dataset.split('icd9')[-1]
+    dataset_path = args.INPUT_FILE + '/' + dataset_prefix
+    print('dataset:', dataset_prefix)
+    print('dataset path:', dataset_path)
+
+    results_path = '/'.join([args.RDIR, dataset]) + '/'
+    # make the results_path directory if it doesn't exist
+    try:
+        os.makedirs(results_path)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+    rarity = '_noRare' if args.NORARE else ''
+    # initialize output files
+    for ml in learners:
+        # write headers
+        save_file = results_path + dataset + '_' + ml + rarity + '.csv'
+        feat_file = save_file.split('.csv')[0] + '.imp_score'
+        roc_file = save_file.split('.csv')[0] + '.roc'
+
+        if not os.path.isfile(feat_file):
+            with open(feat_file, 'w') as out:
+                out.write('algorithm\talg-parameters\tseed\tfeature\tscore\n')
+
+        if not os.path.isfile(roc_file):
+            with open(roc_file, 'w') as out:
+                out.write('algorithm\talg-parameters\tseed\tfpr\ttpr\tauc\n')
+
+        if not os.path.isfile(save_file):
+            with open(save_file, 'w') as out:
+                # out.write('dataset\talgorithm\tparameters\taccuracy\tf1_macro\tseed\tbal_accuracy\troc_auc\ttime\n')
+                out.write('dataset\talgorithm\tparameters\tseed\taccuracy\tf1_macro\tbal_accuracy\troc_auc\ttime\n')
+
+    # write run commands
+    all_commands = []
+    job_info = []
+    for t in seeds:
+        random_state = t
+        # each seed defines one trial per learner
+        print('random_seed:', random_state)
+        for ml in learners:
+            save_file = results_path + dataset + '_' + ml + rarity + '.csv'
+
+            all_commands.append(
+                'OMP_NUM_THREADS={N_CORES}; '
+                'export OMP_NUM_THREADS; '
+                'python -u {ML}.py {DATASET} {SAVEFILE} {RS} {RA}'.format(ML=model_dir + '/' + ml,
+                    DATASET=dataset_path,
+                    SAVEFILE=save_file,
+                    RS=random_state,
+                    N_CORES=args.N_JOBS,
+                    RA=not args.NORARE))
+
+            job_info.append({'ml': ml, 'dataset': dataset, 'results_path': results_path})
+
+    if args.LSF:  # submit bsub commands to the LSF scheduler
+        for i, run_cmd in enumerate(all_commands):
+            job_name = job_info[i]['ml'] + '_' + job_info[i]['dataset']
+            out_file = job_info[i]['results_path'] + job_name + '_%J.out'
+            # error_file = out_file[:-4] + '.err'
+
+            bsub_cmd = ('bsub -o {OUT_FILE} -n {N_CORES} -J {JOB_NAME} -q {QUEUE} '
+                        '-R "span[hosts=1] rusage[mem={M}]" -M {M} ').format(OUT_FILE=out_file,
+                            JOB_NAME=job_name,
+                            QUEUE=args.QUEUE,
+                            N_CORES=args.N_JOBS,
+                            M=args.M)
+
+            bsub_cmd += '"' + run_cmd + '"'
+            print(bsub_cmd)
+            os.system(bsub_cmd)  # submit jobs
+
+    else:  # run locally
+        Parallel(n_jobs=args.N_JOBS)(delayed(os.system)(run_cmd) for run_cmd in all_commands)
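
For reference, a hypothetical local invocation (the input file, results directory, and learner names below are placeholders; each learner name must match a script under ml/):

    python analysis/analyze.py data/geis_icd9.csv -ml lr,rf -n_trials 2 -n_jobs 4 -results results/

Passing --lsf instead submits one bsub job per (seed, learner) pair to the queue given by -q, with the memory request taken from -m.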