Diff of /analysis/compare.py [000000] .. [23d48c]

Switch to unified view

a b/analysis/compare.py
1
import pandas as pd
2
import numpy as np
3
import matplotlib.pyplot as plt
4
plt.switch_backend('agg')
5
import seaborn as sns
6
sns.set_style("whitegrid")
7
import math
8
import argparse
9
from glob import glob
10
import pdb
11
def _load_frames(pattern, skip=None):
    """Read every tab-separated file matching *pattern* into one DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern for the files to read.
    skip : str or None
        When given, any path containing this substring is ignored.

    Returns
    -------
    (DataFrame or None, int)
        The row-wise concatenation of all files (outer join on columns, fresh
        integer index) and the number of files read. The frame is None when no
        file matched, so callers can skip their plot instead of crashing in
        ``pd.concat``.
    """
    # Sort so the row order (and hence the plot order) does not depend on the
    # arbitrary order in which the filesystem lists the directory.
    paths = sorted(p for p in glob(pattern) if skip is None or skip not in p)
    if not paths:
        return None, 0
    frames = [pd.read_csv(p, sep='\t', index_col=False) for p in paths]
    return pd.concat(frames, join='outer', ignore_index=True), len(paths)


def _plot_boxplots(run_dir, dataset):
    """Save one per-algorithm boxplot PDF for every metric column in the *.csv results."""
    results, n_files = _load_frames(run_dir + '*.csv', skip='imp_score')
    if results is None:
        print('no result files found in', run_dir, '- skipping boxplots')
        return
    print('loaded', n_files, 'result files with results from these learners:',
          results['algorithm'].unique())

    # Bookkeeping columns that describe a run rather than measure it.
    restricted_cols = {'prep_alg', 'preprocessor', 'prep-parameters', 'algorithm',
                       'alg-parameters', 'dataset', 'trial', 'seed', 'parameters'}
    columns_to_plot = [c for c in results.columns if c not in restricted_cols]
    print('generating boxplots for these columns:', columns_to_plot)

    for col in columns_to_plot:
        fig, ax = plt.subplots()
        # Builtin float: the np.float alias was removed in NumPy 1.24.
        results[col] = results[col].astype(float)
        sns.boxplot(data=results, x="algorithm", y=col, ax=ax)
        # Restyle the tick labels seaborn produced instead of overwriting their
        # text, so a label can never end up under the wrong box.
        ax.tick_params(axis='x', labelsize=14, labelrotation=45)
        ax.set_ylabel(col, size=16)
        # NOTE(review): fixed y-range presumably assumes score-like metrics in
        # [0.5, 1]; values outside it are cut off - confirm this is intended.
        ax.set_ylim(0.5, 1.0)
        ax.set_xlabel('')
        fig.tight_layout()
        fig.savefig(run_dir + '_'.join([dataset, col, 'boxplots.pdf']))
        # Release the figure; otherwise one stays open per plotted column.
        plt.close(fig)


def _plot_importance(run_dir, dataset, max_feat):
    """Save a stacked bar chart of the top *max_feat* features from the *.imp_score files."""
    scores, n_files = _load_frames(run_dir + '*.imp_score')
    if scores is None:
        print('no feature importance files found in', run_dir,
              '- skipping importance plot')
        return
    print('loaded', n_files,
          'feature importance files with results from these learners:',
          scores['algorithm'].unique())

    # Median score per (feature, algorithm), laid out as features x algorithms.
    # Only 'score' is aggregated so other columns cannot leak into the ranking.
    median_scores = (scores.groupby(['feature', 'algorithm'])['score']
                     .median()
                     .unstack('algorithm'))
    # Rank features by their median importance summed over all algorithms.
    ranking = median_scores.sum(axis=1).sort_values(ascending=False)
    top = median_scores.loc[ranking.index[:max_feat]]

    ax = top.plot(kind='bar', stacked=True)
    leg = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylabel('Importance Score')
    # The legend sits outside the axes; pass it explicitly so the tight
    # bounding box includes it.
    ax.figure.savefig(run_dir + '_'.join([dataset, 'importance_scores.pdf']),
                      bbox_extra_artists=(leg, ax), bbox_inches='tight')
    plt.close(ax.figure)


def _plot_roc_curves(run_dir, dataset):
    """Save one ROC curve per algorithm (its median-AUC run) from the *.roc files."""
    roc, n_files = _load_frames(run_dir + '*.roc')
    if roc is None:
        print('no roc files found in', run_dir, '- skipping roc curves')
        return
    print('loaded', n_files, 'roc files with results from these learners:',
          roc['algorithm'].unique())

    fig, ax = plt.subplots()
    # Diagonal reference line; the '_nolegend_' label keeps it out of the legend.
    ax.plot([0, 1], [0, 1], '--k', label='_nolegend_')

    # One shade of blue per algorithm; nunique() matches the number of groups
    # produced by groupby below (both ignore missing algorithm names).
    n_algs = roc['algorithm'].nunique()
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, n_algs))
    markers = ['o', 'v', '^', '<', '>', '8', 's',
               'p', 'P', '*', 'h', 'H', '+', 'x', 'X', 'D', 'd', '|', '_']

    for i, (alg, runs) in enumerate(roc.groupby('algorithm')):
        median_auc = runs['auc'].median()
        # Pick the seed of the row whose AUC is closest to the median
        # (first such row on ties) and draw that seed's curve.
        gap = (runs['auc'] - median_auc).abs()
        seed_med = runs.loc[gap.idxmin(), 'seed']
        curve = runs.loc[runs['seed'] == seed_med]

        ax.plot(curve['fpr'], curve['tpr'], color=colors[i],
                # Cycle through the markers so more algorithms than marker
                # shapes no longer raises IndexError.
                marker=markers[i % len(markers)],
                linestyle='--', linewidth=1,
                label='{:s} (AUC = {:0.2f})'.format(alg, median_auc))

    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    fig.savefig(run_dir + '_'.join([dataset, 'roc_curves.pdf']), bbox_inches='tight')
    plt.close(fig)


def main():
    """Analyzes results and generates figures.

    Command line:
        RUN_DIR     directory holding the tab-separated result files
        -max_feat   maximum number of features in the importance plot (default 10)

    The dataset name is taken from the last path component of RUN_DIR. Three
    families of files are read from RUN_DIR and plotted into PDFs written back
    to the same directory:

    * ``*.csv`` (paths containing 'imp_score' excluded) -> one
      ``<dataset>_<column>_boxplots.pdf`` per metric column
    * ``*.imp_score`` -> ``<dataset>_importance_scores.pdf``
    * ``*.roc`` -> ``<dataset>_roc_curves.pdf``

    A family with no files is reported and skipped, so the remaining plots are
    still produced.
    """
    # add_help=False: the parser does not register the automatic -h/--help flag.
    parser = argparse.ArgumentParser(description="An analyst for quick ML applications.",
                                     add_help=False)
    parser.add_argument('RUN_DIR', action='store', type=str,
                        help='Path to results from analysis.')
    parser.add_argument('-max_feat', action='store', dest='MAX_FEAT', default=10, type=int,
                        help='Max features to show in importance plots.')
    args = parser.parse_args()

    # Normalise to a trailing slash so the second-to-last path component is
    # always the dataset name and file names can be appended directly.
    run_dir = args.RUN_DIR
    if not run_dir.endswith('/'):
        run_dir += '/'
    dataset = run_dir.split('/')[-2]
    print('dataset:', dataset)
    print('loading data from', run_dir)

    _plot_boxplots(run_dir, dataset)
    _plot_importance(run_dir, dataset, args.MAX_FEAT)
    _plot_roc_curves(run_dir, dataset)

    print('done!')
140
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()