import pandas as pd
import numpy as np
import argparse
import os, errno, sys
from sklearn.externals.joblib import Parallel, delayed
if __name__ == '__main__':
    # -------------------------------------------------------------------
    # Command-line driver: builds one shell command per (seed, learner)
    # pair and either submits each as an LSF bsub job or runs them locally
    # in parallel. Each learner script writes results to a shared TSV whose
    # header is initialized here.
    # -------------------------------------------------------------------

    # parse command line arguments
    parser = argparse.ArgumentParser(
        description="An analyst for quick ML applications.",
        add_help=False)
    parser.add_argument('INPUT_FILE', type=str,
                        help='Data file to analyze; ensure that the '
                             'target/label column is labeled as "class".')
    parser.add_argument('-h', '--help', action='help',
                        help='Show this help message and exit.')
    parser.add_argument('-ml', action='store', dest='LEARNERS', default=None,
                        type=str,
                        help='Comma-separated list of ML methods to use '
                             '(should correspond to a py file name in learners/)')
    parser.add_argument('--lsf', action='store_true', dest='LSF', default=False,
                        help='Run on an LSF HPC (using bsub commands)')
    # NOTE(review): METRIC is parsed but not used below — presumably consumed
    # by the learner scripts or a future comparison step; kept for CLI
    # compatibility.
    parser.add_argument('-metric', action='store', dest='METRIC',
                        default='f1_macro', type=str,
                        help='Metric to compare algorithms')
    parser.add_argument('-n_jobs', action='store', dest='N_JOBS', default=4,
                        type=int,
                        help='Number of parallel jobs')
    # BUG FIX: the help strings for -n_trials and -trials were copy-pasted
    # from -n_jobs ('Number of parallel jobs'); corrected to describe what
    # the options actually control.
    parser.add_argument('-n_trials', action='store', dest='N_TRIALS', default=1,
                        type=int,
                        help='Number of trials (seeds) to run per learner')
    parser.add_argument('-trials', action='store', dest='TRIALS', default='',
                        type=str,
                        help='Comma-separated list of integer seeds to use '
                             '(overrides -n_trials)')
    # NOTE(review): RANDOM_STATE is parsed but not used — seeds come from the
    # fixed list / -trials below; kept for CLI compatibility.
    parser.add_argument('-rs', action='store', dest='RANDOM_STATE', default=None,
                        type=int,
                        help='random state')
    parser.add_argument('-label', action='store', dest='LABEL', default='class',
                        type=str,
                        help='Name of class label column')
    parser.add_argument('-results', action='store', dest='RDIR', type=str,
                        help='Results directory',
                        default='/project/moore/users/lacava/geis-ehr/results/')
    parser.add_argument('-q', action='store', dest='QUEUE',
                        default='moore_normal', type=str,
                        help='LSF queue')
    parser.add_argument('-m', action='store', dest='M', default=4096, type=int,
                        help='LSF memory request and limit (MB)')
    parser.add_argument('--norare', action='store_true', dest='NORARE',
                        default=False,
                        help='do not include rare labs')

    args = parser.parse_args()

    # 10 fixed, randomly generated seeds
    seeds = [11085, 14899, 27164, 3674, 16612, 9168, 7016, 30993, 4180, 1188]
    if len(args.TRIALS) > 0:
        # an explicit seed list overrides -n_trials
        seeds = [int(t) for t in args.TRIALS.split(',')]
        args.N_TRIALS = len(seeds)
    else:
        # take the first N_TRIALS of the fixed seeds
        seeds = seeds[:args.N_TRIALS]

    # BUG FIX: -ml defaults to None, so args.LEARNERS.split(',') used to die
    # with an opaque AttributeError; fail early with a usage message instead.
    if not args.LEARNERS:
        parser.error('-ml is required: supply a comma-separated list of learners')
    learners = args.LEARNERS.split(',')  # learner module names under ml/

    model_dir = 'ml'
    # dataset name is the last path component of INPUT_FILE; the file prefix
    # is derived by replacing everything up to 'icd9' with 'geis'
    # (assumes INPUT_FILE names contain 'icd9' — TODO confirm with callers)
    dataset = args.INPUT_FILE.split('/')[-1]
    dataset_prefix = 'geis' + dataset.split('icd9')[-1]
    dataset_path = args.INPUT_FILE + '/' + dataset_prefix
    print('dataset:', dataset_prefix)
    print('dataset path:', dataset_path)

    results_path = '/'.join([args.RDIR, dataset]) + '/'
    # make the results_path directory if it doesn't exist
    # (exist_ok replaces the old try/except-errno.EEXIST dance)
    os.makedirs(results_path, exist_ok=True)

    rarity = '_noRare' if args.NORARE else ''

    # initialize output files: write headers only when the file is absent so
    # that reruns append to existing results
    for ml in learners:
        save_file = results_path + '/' + dataset + '_' + ml + rarity + '.csv'
        base = save_file.split('.csv')[0]
        feat_file = base + '.imp_score'   # feature importances
        roc_file = base + '.roc'          # ROC curve points
        if not os.path.isfile(feat_file):
            with open(feat_file, 'w') as out:
                out.write('algorithm\talg-parameters\tseed\tfeature\tscore\n')
        if not os.path.isfile(roc_file):
            with open(roc_file, 'w') as out:
                out.write('algorithm\talg-parameters\tseed\tfpr\ttpr\tauc\n')
        if not os.path.isfile(save_file):
            with open(save_file, 'w') as out:
                out.write('dataset\talgorithm\tparameters\tseed\taccuracy\t'
                          'f1_macro\tbal_accuracy\troc_auc\ttime\n')

    # build one run command per (seed, learner) pair
    all_commands = []
    job_info = []
    for random_state in seeds:
        print('random_seed:', random_state)
        for ml in learners:
            save_file = results_path + '/' + dataset + '_' + ml + rarity + '.csv'
            all_commands.append(
                'OMP_NUM_THREADS={N_CORES}; '
                'export OMP_NUM_THREADS; '
                'python -u {ML}.py {DATASET} {SAVEFILE} {RS} {RA}'.format(
                    ML=model_dir + '/' + ml,
                    DATASET=dataset_path,
                    SAVEFILE=save_file,
                    RS=random_state,
                    N_CORES=args.N_JOBS,
                    RA=not args.NORARE))
            job_info.append({'ml': ml,
                             'dataset': dataset,
                             'results_path': results_path})

    if args.LSF:
        # submit each command as its own bsub job
        for i, run_cmd in enumerate(all_commands):
            job_name = job_info[i]['ml'] + '_' + job_info[i]['dataset']
            out_file = job_info[i]['results_path'] + job_name + '_%J.out'
            bsub_cmd = ('bsub -o {OUT_FILE} -n {N_CORES} -J {JOB_NAME} -q {QUEUE} '
                        '-R "span[hosts=1] rusage[mem={M}]" -M {M} ').format(
                            OUT_FILE=out_file,
                            JOB_NAME=job_name,
                            QUEUE=args.QUEUE,
                            N_CORES=args.N_JOBS,
                            M=args.M)
            bsub_cmd += '"' + run_cmd + '"'
            print(bsub_cmd)
            os.system(bsub_cmd)  # submit jobs
    else:
        # run locally, N_JOBS commands at a time
        # (Parallel/delayed come from sklearn.externals.joblib at file top;
        #  that path was removed in scikit-learn 0.23 — prefer `import joblib`)
        Parallel(n_jobs=args.N_JOBS)(
            delayed(os.system)(run_cmd) for run_cmd in all_commands)