|
a |
|
b/analysis/compare.py |
|
|
1 |
import pandas as pd |
|
|
2 |
import numpy as np |
|
|
3 |
import matplotlib.pyplot as plt |
|
|
4 |
plt.switch_backend('agg') |
|
|
5 |
import seaborn as sns |
|
|
6 |
sns.set_style("whitegrid") |
|
|
7 |
import math |
|
|
8 |
import argparse |
|
|
9 |
from glob import glob |
|
|
10 |
import pdb |
|
|
11 |
def main():
    """Analyzes results and generates figures.

    Command line:
        RUN_DIR     directory holding the result files of one analysis run.
        -max_feat   maximum number of features shown in the importance plot
                    (default 10).

    Three kinds of tab-separated files are read from RUN_DIR, and each kind
    produces PDF figures that are written back into RUN_DIR:

    * ``*.csv`` (paths containing ``imp_score`` are skipped): one boxplot
      per non-restricted column, grouped by algorithm.
    * ``*.imp_score``: stacked bar chart of median importance scores.
    * ``*.roc``: one ROC curve per algorithm.

    A kind with no matching files is reported and skipped instead of
    aborting the whole run.
    """
    parser = argparse.ArgumentParser(description="An analyst for quick ML applications.",
                                     add_help=False)
    parser.add_argument('RUN_DIR', action='store', type=str,
                        help='Path to results from analysis.')
    parser.add_argument('-max_feat', action='store', dest='MAX_FEAT', default=10, type=int,
                        help='Max features to show in importance plots.')
    args = parser.parse_args()

    run_dir = args.RUN_DIR
    # endswith() is safe for an empty string, unlike indexing with [-1]
    if not run_dir.endswith('/'):
        run_dir += '/'
    # the dataset name is the last directory component of the run directory
    dataset = run_dir.split('/')[-2]
    print('dataset:', dataset)
    print('loading data from', run_dir)

    # ------------------------------------------------------------ boxplots
    df, count = _load_frames(run_dir + '*.csv', exclude='imp_score')
    if df is None:
        print('no result files (*.csv) found, skipping boxplots')
    else:
        print('loaded', count, 'result files with results from these learners:',
              df['algorithm'].unique())
        _plot_boxplots(df, run_dir, dataset)

    # --------------------------------------------- feature importance plot
    df, count = _load_frames(run_dir + '*.imp_score')
    if df is None:
        print('no feature importance files (*.imp_score) found, skipping importance plot')
    else:
        print('loaded', count, 'feature importance files with results from these learners:',
              df['algorithm'].unique())
        _plot_feature_importance(df, run_dir, dataset, args.MAX_FEAT)

    # ---------------------------------------------------------- roc curves
    df, count = _load_frames(run_dir + '*.roc')
    if df is None:
        print('no roc files (*.roc) found, skipping roc curves')
    else:
        print('loaded', count, 'roc files with results from these learners:',
              df['algorithm'].unique())
        _plot_roc_curves(df, run_dir, dataset)

    print('done!')


def _load_frames(pattern, exclude=None):
    """Read all tab-separated files matching *pattern* into one DataFrame.

    Paths containing the substring *exclude* (when given) are ignored.
    Returns ``(frame, n_files)``; ``frame`` is ``None`` when no file was
    read, because ``pd.concat`` raises on an empty list.
    """
    paths = [p for p in glob(pattern) if exclude is None or exclude not in p]
    if not paths:
        return None, 0
    frames = [pd.read_csv(p, sep='\t', index_col=False) for p in paths]
    return pd.concat(frames, join='outer', ignore_index=True), len(paths)


def _plot_boxplots(df, run_dir, dataset):
    """Save one per-algorithm boxplot PDF for every non-restricted column of *df*."""
    restricted_cols = ['prep_alg', 'preprocessor', 'prep-parameters', 'algorithm',
                       'alg-parameters', 'dataset', 'trial', 'seed', 'parameters']
    columns_to_plot = [c for c in df.columns if c not in restricted_cols]
    print('generating boxplots for these columns:', columns_to_plot)

    algorithms = df['algorithm'].unique()
    for col in columns_to_plot:
        fig = plt.figure()
        # np.float was only an alias of the builtin and no longer exists in NumPy
        df[col] = df[col].astype(float)
        sns.boxplot(data=df, x="algorithm", y=col)
        plt.gca().set_xticklabels(algorithms, size=14, rotation=45)
        plt.ylabel(col, size=16)
        # NOTE: fixed y-range; values outside 0.5-1.0 are not visible
        plt.ylim(0.5, 1.0)
        plt.xlabel('')
        fig.tight_layout()
        plt.savefig(run_dir + '_'.join([dataset, col, 'boxplots.pdf']))
        # release the figure; one figure per column would otherwise pile up
        plt.close(fig)


def _plot_feature_importance(df, run_dir, dataset, max_feat):
    """Save a stacked bar chart of the median 'score' of the top features.

    Features are ranked by the sum over algorithms of their median score;
    at most *max_feat* features are shown.
    """
    # rows: feature, columns: algorithm, values: median importance score
    med_score = df.groupby(['feature', 'algorithm'])['score'].median().unstack('algorithm')
    # rank on the 'score' column only, so other numeric columns cannot interfere
    ranking = med_score.sum(axis=1).sort_values(ascending=False)
    top_features = ranking.index[:max_feat]

    fig, ax = plt.subplots()
    med_score.loc[top_features].plot(kind='bar', stacked=True, ax=ax)
    leg = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylabel('Importance Score')
    fig.savefig(run_dir + '_'.join([dataset, 'importance_scores.pdf']),
                bbox_extra_artists=(leg, ax), bbox_inches='tight')
    plt.close(fig)


def _plot_roc_curves(df, run_dir, dataset):
    """Save one ROC curve per algorithm, taken from its median-AUC seed."""
    fig, ax = plt.subplots()
    # chance diagonal, kept out of the legend
    ax.plot([0, 1], [0, 1], '--k', label='_nolegend_')

    colors = plt.cm.Blues(np.linspace(0.1, 0.9, len(df['algorithm'].unique())))
    markers = ['o', 'v', '^', '<', '>', '8', 's',
               'p', 'P', '*', 'h', 'H', '+', 'x', 'X', 'D', 'd', '|', '_']

    for i, (alg, df_g) in enumerate(df.groupby('algorithm')):
        auc = df_g['auc'].median()
        # pick the seed whose AUC is closest to the median AUC (first on ties)
        dist = (df_g['auc'] - auc).abs()
        seed_med = df_g.loc[dist == dist.min(), 'seed'].iloc[0]
        curve = df_g.loc[df_g['seed'] == seed_med]

        # wrap the marker index so more algorithms than markers cannot overflow
        ax.plot(curve['fpr'], curve['tpr'], color=colors[i],
                marker=markers[i % len(markers)], linestyle='--', linewidth=1,
                label='{:s} (AUC = {:0.2f})'.format(alg, auc))

    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    fig.savefig(run_dir + '_'.join([dataset, 'roc_curves.pdf']), bbox_inches='tight')
    plt.close(fig)
|
|
139 |
|
|
|
140 |
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()