interpret_ehr / Git / [23d48c] /analysis/compare.py

Models:
philipB/
interpret_ehr
Downloads: 1
[23d48c]: / analysis / compare.py
History
Download this file
142 lines (117 with data), 5.9 kB

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import seaborn as sns
sns.set_style("whitegrid")
import math
import argparse
from glob import glob
import pdb
def _load_results(pattern, skip_substring=None):
    """Read every tab-separated file matching *pattern* into one DataFrame.

    Args:
        pattern: glob pattern of the files to load.
        skip_substring: when given, files whose *base name* contains this
            text are ignored.

    Returns:
        A ``(frame, count)`` tuple; ``frame`` is ``None`` when nothing
        matched so callers can skip their section instead of crashing in
        ``pd.concat``.
    """
    import os  # local import: only this helper needs it

    # sorted() makes the concatenation (and hence plot) order reproducible;
    # glob itself returns files in arbitrary order.
    paths = [
        path for path in sorted(glob(pattern))
        # Test the base name only, so a run directory whose own path happens
        # to contain the substring does not exclude every file.
        if skip_substring is None
        or skip_substring not in os.path.basename(path)
    ]
    if not paths:
        return None, 0
    frames = [pd.read_csv(path, sep='\t', index_col=False) for path in paths]
    return pd.concat(frames, join='outer', ignore_index=True), len(frames)


def _plot_metric_boxplots(run_dir, dataset):
    """Save one per-algorithm boxplot PDF for every metric column in *.csv."""
    df, count = _load_results(run_dir + '*.csv', skip_substring='imp_score')
    if df is None:
        print('no result files (*.csv) found in', run_dir,
              '- skipping boxplots')
        return
    print('loaded', count,
          'result files with results from these learners:',
          df['algorithm'].unique())

    restricted_cols = {'prep_alg', 'preprocessor', 'prep-parameters',
                       'algorithm', 'alg-parameters', 'dataset',
                       'trial', 'seed', 'parameters'}
    columns_to_plot = [c for c in df.columns if c not in restricted_cols]
    print('generating boxplots for these columns:', columns_to_plot)

    # Pass the same explicit order to seaborn and to the tick labels so the
    # labels are guaranteed to line up with the boxes.
    order = [alg for alg in df['algorithm'].unique() if pd.notnull(alg)]

    for col in columns_to_plot:
        fig = plt.figure()
        # ``np.float`` was removed in NumPy 1.24; the builtin is equivalent.
        df[col] = df[col].astype(float)
        sns.boxplot(data=df, x="algorithm", y=col, order=order)
        plt.gca().set_xticklabels(order, size=14, rotation=45)
        plt.ylabel(col, size=16)
        # NOTE(review): fixed range presumably targets accuracy-like scores;
        # values outside [0.5, 1.0] are clipped from view.
        plt.ylim(0.5, 1.0)
        plt.xlabel('')
        fig.tight_layout()
        plt.savefig(run_dir + '_'.join([dataset, col, 'boxplots.pdf']))
        # Release the figure; otherwise one stays open per metric column.
        plt.close(fig)


def _plot_feature_importance(run_dir, dataset, max_feat):
    """Save a stacked bar chart of the top *max_feat* features by score."""
    df, count = _load_results(run_dir + '*.imp_score')
    if df is None:
        print('no feature importance files (*.imp_score) found in', run_dir,
              '- skipping importance plot')
        return
    print('loaded', count,
          'feature importance files with results from these learners:',
          df['algorithm'].unique())

    # Median score per (feature, algorithm): rows are features, columns are
    # algorithms.  Selecting 'score' explicitly keeps other columns (numeric
    # or not) out of both the aggregation and the ranking.
    median_scores = (df.groupby(['feature', 'algorithm'])['score']
                       .median()
                       .unstack('algorithm'))
    # Rank features by their median importance summed over algorithms.
    totals = median_scores.sum(axis=1).sort_values(ascending=False)
    top_features = totals.index[:max_feat]

    fig, ax = plt.subplots()
    median_scores.loc[top_features].plot(kind='bar', stacked=True, ax=ax)
    leg = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylabel('Importance Score')
    fig.savefig(run_dir + '_'.join([dataset, 'importance_scores.pdf']),
                bbox_extra_artists=(leg,), bbox_inches='tight')
    plt.close(fig)


def _plot_roc_curves(run_dir, dataset):
    """Save one ROC curve per algorithm, taken from its median-AUC run."""
    df, count = _load_results(run_dir + '*.roc')
    if df is None:
        print('no roc files (*.roc) found in', run_dir,
              '- skipping roc curves')
        return
    algorithms = df['algorithm'].unique()
    print('loaded', count,
          'roc files with results from these learners:', algorithms)

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], '--k', label='_nolegend_')  # chance diagonal
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, len(algorithms)))
    markers = ['o', 'v', '^', '<', '>', '8', 's',
               'p', 'P', '*', 'h', 'H', '+', 'x', 'X', 'D', 'd', '|', '_']

    for i, (alg, df_g) in enumerate(df.groupby('algorithm')):
        median_auc = df_g['auc'].median()
        # First row whose AUC is closest to the group median; its seed
        # identifies the run that is drawn.
        closest_row = (df_g['auc'] - median_auc).abs().idxmin()
        seed_med = df_g.loc[closest_row, 'seed']
        run = df_g.loc[df_g['seed'] == seed_med]

        # Cycle through markers/colors so a long list of algorithms cannot
        # run off the end of either sequence.
        ax.plot(run['fpr'], run['tpr'],
                color=colors[i % len(colors)],
                marker=markers[i % len(markers)],
                linestyle='--', linewidth=1,
                label='{} (AUC = {:0.2f})'.format(alg, median_auc))

    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    fig.savefig(run_dir + '_'.join([dataset, 'roc_curves.pdf']),
                bbox_inches='tight')
    plt.close(fig)


def main():
    """Analyze the results in a run directory and generate figures.

    Command line:
        RUN_DIR        directory holding the tab-separated result files
                       (``*.csv``, ``*.imp_score``, ``*.roc``).
        -max_feat N    maximum number of features in the importance plot
                       (default 10).

    Three families of PDFs are written into RUN_DIR, each prefixed with the
    directory's own name: metric boxplots, feature importance scores and ROC
    curves.  A family whose input files are missing is skipped with a
    message instead of aborting the whole run.
    """
    parser = argparse.ArgumentParser(
        description="An analyst for quick ML applications.", add_help=False)
    parser.add_argument('RUN_DIR', action='store', type=str,
                        help='Path to results from analysis.')
    parser.add_argument('-max_feat', action='store', dest='MAX_FEAT',
                        default=10, type=int,
                        help='Max features to show in importance plots.')
    args = parser.parse_args()

    run_dir = args.RUN_DIR
    # endswith() is safe on an empty string, unlike run_dir[-1].
    if not run_dir.endswith('/'):
        run_dir += '/'
    # The last path component names the dataset and prefixes every output.
    dataset = run_dir.split('/')[-2]
    print('dataset:', dataset)
    print('loading data from', run_dir)

    _plot_metric_boxplots(run_dir, dataset)
    _plot_feature_importance(run_dir, dataset, args.MAX_FEAT)
    _plot_roc_curves(run_dir, dataset)

    print('done!')

# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()