analysis/analyze.py

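"""Launch a batch of ML analyses over a dataset: one job per (learner, seed)
pair, submitted to an LSF cluster with bsub or run locally via joblib.

Example invocation (a minimal sketch; the input path and learner names here
are hypothetical and must match .py files under ml/):

    python analyze.py /path/to/data_icd9_250 -ml lr,rf -n_trials 5 -n_jobs 4
"""
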
import argparse
import errno
import os

from joblib import Parallel, delayed


if __name__ == '__main__':
    # parse command line arguments
    parser = argparse.ArgumentParser(description="An analyst for quick ML applications.",
                                     add_help=False)
    parser.add_argument('INPUT_FILE', type=str,
                        help='Data file to analyze; ensure that the '
                             'target/label column is labeled as "class".')
    parser.add_argument('-h', '--help', action='help',
                        help='Show this help message and exit.')
    parser.add_argument('-ml', action='store', dest='LEARNERS', default=None, type=str,
                        help='Comma-separated list of ML methods to use (each should '
                             'correspond to a .py file name in ml/)')
    parser.add_argument('--lsf', action='store_true', dest='LSF', default=False,
                        help='Run on an LSF HPC (using bsub commands)')
    parser.add_argument('-metric', action='store', dest='METRIC', default='f1_macro',
                        type=str, help='Metric to compare algorithms')
    parser.add_argument('-n_jobs', action='store', dest='N_JOBS', default=4, type=int,
                        help='Number of parallel jobs')
    parser.add_argument('-n_trials', action='store', dest='N_TRIALS', default=1, type=int,
                        help='Number of trials (one seed per trial)')
    parser.add_argument('-trials', action='store', dest='TRIALS', default='', type=str,
                        help='Comma-separated list of seeds to use as trials '
                             '(overrides -n_trials)')
    parser.add_argument('-rs', action='store', dest='RANDOM_STATE', default=None, type=int,
                        help='Random state')
    parser.add_argument('-label', action='store', dest='LABEL', default='class', type=str,
                        help='Name of class label column')
    parser.add_argument('-results', action='store', dest='RDIR', type=str,
                        help='Results directory',
                        default='/project/moore/users/lacava/geis-ehr/results/')
    parser.add_argument('-q', action='store', dest='QUEUE', default='moore_normal',
                        type=str, help='LSF queue')
    parser.add_argument('-m', action='store', dest='M', default=4096, type=int,
                        help='LSF memory request and limit (MB)')
    parser.add_argument('--norare', action='store_true', dest='NORARE', default=False,
                        help='Do not include rare labs')

    args = parser.parse_args()

    # 10 fixed, randomly generated seeds
    seeds = [11085, 14899, 27164, 3674, 16612, 9168, 7016, 30993, 4180, 1188]

    if len(args.TRIALS) > 0:
        # an explicit seed list overrides -n_trials
        args.N_TRIALS = len(args.TRIALS.split(','))
        seeds = [int(t) for t in args.TRIALS.split(',')]
    else:
        seeds = seeds[:args.N_TRIALS]

    if args.LEARNERS is None:
        parser.error('no learners specified; pass -ml with a comma-separated list')
    learners = args.LEARNERS.split(',')

    model_dir = 'ml'

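    # INPUT_FILE is treated as a directory: its basename names the dataset,
    # and the files inside are assumed to share a 'geis' prefix built from
    # the part of the name that follows 'icd9'.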
    dataset = args.INPUT_FILE.split('/')[-1]
    dataset_prefix = 'geis' + dataset.split('icd9')[-1]
    dataset_path = args.INPUT_FILE + '/' + dataset_prefix
    print('dataset:', dataset_prefix)
    print('dataset path:', dataset_path)

    results_path = '/'.join([args.RDIR, dataset]) + '/'
    # make the results_path directory if it doesn't exist
    try:
        os.makedirs(results_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    rarity = '_noRare' if args.NORARE else ''

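    # Each learner writes three output files: <dataset>_<ml>.csv for summary
    # metrics, .imp_score for per-feature importance scores, and .roc for
    # ROC curve points.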
    # initialize output files
    for ml in learners:
        # write headers
        save_file = results_path + dataset + '_' + ml + rarity + '.csv'
        feat_file = save_file.split('.csv')[0] + '.imp_score'
        roc_file = save_file.split('.csv')[0] + '.roc'

        if not os.path.isfile(feat_file):
            with open(feat_file, 'w') as out:
                out.write('algorithm\talg-parameters\tseed\tfeature\tscore\n')

        if not os.path.isfile(roc_file):
            with open(roc_file, 'w') as out:
                out.write('algorithm\talg-parameters\tseed\tfpr\ttpr\tauc\n')

        if not os.path.isfile(save_file):
            with open(save_file, 'w') as out:
                out.write('dataset\talgorithm\tparameters\tseed\taccuracy\t'
                          'f1_macro\tbal_accuracy\troc_auc\ttime\n')

    # write run commands
    all_commands = []
    job_info = []
    for t in seeds:
        random_state = t
        print('random_seed:', random_state)
        for ml in learners:
            save_file = results_path + dataset + '_' + ml + rarity + '.csv'

            all_commands.append(
                'OMP_NUM_THREADS={N_CORES}; '
                'export OMP_NUM_THREADS; '
                'python -u {ML}.py {DATASET} {SAVEFILE} {RS} {RA}'.format(
                    ML=model_dir + '/' + ml,
                    DATASET=dataset_path,
                    SAVEFILE=save_file,
                    RS=random_state,
                    N_CORES=args.N_JOBS,
                    RA=not args.NORARE))

            job_info.append({'ml': ml, 'dataset': dataset,
                             'results_path': results_path})

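    # Each generated command has the form (learner name 'lr' is hypothetical):
    #   OMP_NUM_THREADS=4; export OMP_NUM_THREADS; \
    #   python -u ml/lr.py <dataset_path> <save_file> <seed> True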
    if args.LSF:  # submit bsub commands
        for i, run_cmd in enumerate(all_commands):
            job_name = job_info[i]['ml'] + '_' + job_info[i]['dataset']
            out_file = job_info[i]['results_path'] + job_name + '_%J.out'

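            # Wrap run_cmd in a bsub submission; with the default arguments
            # this expands to roughly:
            #   bsub -o <out_file> -n 4 -J <job_name> -q moore_normal \
            #        -R "span[hosts=1] rusage[mem=4096]" -M 4096 "<run_cmd>"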
            bsub_cmd = ('bsub -o {OUT_FILE} -n {N_CORES} -J {JOB_NAME} -q {QUEUE} '
                        '-R "span[hosts=1] rusage[mem={M}]" -M {M} ').format(
                            OUT_FILE=out_file,
                            JOB_NAME=job_name,
                            QUEUE=args.QUEUE,
                            N_CORES=args.N_JOBS,
                            M=args.M)

            bsub_cmd += '"' + run_cmd + '"'
            print(bsub_cmd)
            os.system(bsub_cmd)  # submit the job

    else:  # run locally
        Parallel(n_jobs=args.N_JOBS)(delayed(os.system)(run_cmd)
                                     for run_cmd in all_commands)