#evaluate.py
#Copyright (c) 2020 Rachel Lea Ballantyne Draelos

#MIT License

#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:

#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.

#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
#Imports
import os
import copy
import time
import bisect
import shutil
import operator
import itertools
import numpy as np
import pandas as pd
import sklearn.metrics
from itertools import cycle

import matplotlib
matplotlib.use('agg') #so that it does not attempt to display via SSH
import seaborn
import matplotlib.pyplot as plt
plt.ioff() #turn interactive plotting off

#suppress numpy warnings
import warnings
warnings.filterwarnings('ignore')

#######################
# Reporting Functions #---------------------------------------------------------
#######################
def initialize_evaluation_dfs(all_labels, num_epochs):
    """Create empty "eval_dfs_dict"
    Variables
    <all_labels>: a list of strings describing the labels in order
    <num_epochs>: int for total number of epochs"""
    if len(all_labels)==2:
        index = [all_labels[1]]
        numrows = 1
    else:
        index = all_labels
        numrows = len(all_labels)
    #Initialize empty pandas dataframe to store evaluation results across epochs
    #for accuracy, AUROC, and AP
    result_df = pd.DataFrame(data=np.zeros((numrows, num_epochs)),
                            index = index,
                            columns = ['epoch_'+str(n) for n in range(0,num_epochs)])
    #Initialize empty pandas dataframe to store evaluation results for top k
    top_k_result_df = pd.DataFrame(np.zeros((len(all_labels), num_epochs)),
                                   index=[x for x in range(1,len(all_labels)+1)], #e.g. 1,...,64 for len(all_labels)=64
                                   columns = ['epoch_'+str(n) for n in range(0,num_epochs)])
    
    #Make eval results dictionaries
    eval_results_valid = {'accuracy':copy.deepcopy(result_df),
        'auroc':copy.deepcopy(result_df),
        'avg_precision':copy.deepcopy(result_df),
        'top_k':top_k_result_df}
    eval_results_test = copy.deepcopy(eval_results_valid)
    return eval_results_valid, eval_results_test

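#A minimal usage sketch (illustrative only; the label names are invented):
#    valid_dfs, test_dfs = initialize_evaluation_dfs(['labelA','labelB','labelC'], num_epochs=2)
#    valid_dfs['auroc']   #3 x 2 DataFrame: index = label names, columns = ['epoch_0','epoch_1']
#    valid_dfs['top_k']   #3 x 2 DataFrame: index = k values 1, 2, 3, columns = ['epoch_0','epoch_1']
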
def save(eval_dfs_dict, results_dir, descriptor):
    """Variables
    <eval_dfs_dict> is a dict of pandas dataframes
    <results_dir> is a string path to the results directory
    <descriptor> is a string"""
    for k in eval_dfs_dict.keys():
        eval_dfs_dict[k].to_csv(os.path.join(results_dir, descriptor+'_'+k+'_Table.csv'))

def save_final_summary(eval_dfs_dict, best_valid_epoch, setname, results_dir):
    """Save to overall df and print summary of best epoch."""
    #final_descriptor is e.g. '2019-11-15-awesome-model_epoch15'
    final_descriptor = results_dir.replace('results/','')+'_epoch'+str(best_valid_epoch)
    if setname=='valid': print('***Summary for',setname,results_dir,'***')
    for metricname in list(eval_dfs_dict.keys()):
        #metricnames are accuracy, auroc, avg_precision, and top_k.
        #df holds a particular metric for the particular model we just ran.
        #for accuracy, auroc, and avg_precision, df index is diseases, columns are epochs.
        #for top_k, df index is the k value (an int) and columns are epochs.
        df = eval_dfs_dict[metricname]
        #all_df tracks results of all models in one giant table.
        #all_df has index of diseases or k value, and columns which are particular models.
        all_df_path = os.path.join('results',setname+'_'+metricname+'_all.csv') #e.g. valid_accuracy_all.csv
        if os.path.isfile(all_df_path):
            all_df = pd.read_csv(all_df_path,header=0,index_col=0)
            all_df[final_descriptor] = np.nan
        else: #all_df doesn't exist yet - create it.
            all_df = pd.DataFrame(np.empty((df.shape[0],1)),
                                  index = df.index.values.tolist(),
                                  columns = [final_descriptor])
        #Print off and save results for best_valid_epoch
        if setname=='valid': print('\tEpoch',best_valid_epoch,metricname)
        for label in df.index.values:
            #print off to console
            value = df.at[label,'epoch_'+str(best_valid_epoch)]
            if setname=='valid': print('\t\t',label,':',str( round(value, 3) ))
            #save in all_df
            all_df.at[label,final_descriptor] = value
        all_df.to_csv(all_df_path,header=True,index=True)

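#Illustrative layout of one all_df table (the model names and values below are
#invented, not real results): rows are diseases (or k values) and each column
#is one trained model at its best epoch, e.g.
#                2019-11-15-model-a_epoch15   2019-12-01-model-b_epoch8
#    disease_1                        0.87                        0.91
#    disease_2                        0.72                        0.69
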
def clean_up_output_files(best_valid_epoch, results_dir):
    """Delete output files that aren't from the best epoch"""
    #Delete all the backup parameters (they take a lot of space and you do not
    #need to have them)
    shutil.rmtree(os.path.join(results_dir,'backup'))
    #Delete all the extra output files:
    for subdir in ['heatmaps','curves','pred_probs']:
        #Clean up saved ROC and PR curves
        fullpath = os.path.join(results_dir,subdir)
        if os.path.exists(fullpath): #e.g. there may not be a heatmaps dir for a non-bottleneck model
            allfiles = os.listdir(fullpath)
            for filename in allfiles:
                if str(best_valid_epoch) not in filename:
                    os.remove(os.path.join(fullpath,filename))
    print('Output files all clean')

#########################
# Calculation Functions #-------------------------------------------------------
#########################
def evaluate_all(eval_dfs_dict, epoch, label_meanings,
                 true_labels_array, pred_probs_array):
    """Fill out the pandas dataframes in the dictionary <eval_dfs_dict>,
    which is created in cnn.py. The metrics calculated for the provided arrays
    are accuracy, AUROC, and average precision for each label, plus top k
    accuracy for each value of k.
    
    Variables:
    <eval_dfs_dict> is a dictionary of pandas dataframes created in cnn.py
    <epoch> is an integer indicating which epoch it is; it selects the
        'epoch_<epoch>' column of each dataframe
    <label_meanings>: list of strings describing the labels in order
    <true_labels_array>: array of true labels. examples x labels
    <pred_probs_array>: array of predicted probabilities. examples x labels"""
    #Accuracy, AUROC, and AP (iter over labels)
    for label_number in range(len(label_meanings)):
        which_label = label_meanings[label_number] #descriptive string for the label
        true_labels = true_labels_array[:,label_number]
        pred_probs = pred_probs_array[:,label_number]
        pred_labels = (pred_probs>=0.5).astype(dtype='int') #decision threshold of 0.5
        
        #Accuracy and confusion matrix (dependent on decision threshold)
        (eval_dfs_dict['accuracy']).at[which_label, 'epoch_'+str(epoch)] = compute_accuracy(true_labels, pred_labels)
        #confusion_matrix, sensitivity, specificity, ppv, npv = compute_confusion_matrix(true_labels, pred_labels)
        
        #AUROC and AP (sliding across multiple decision thresholds)
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true = true_labels,
                                         y_score = pred_probs,
                                         pos_label = 1)
        (eval_dfs_dict['auroc']).at[which_label, 'epoch_'+str(epoch)] = sklearn.metrics.auc(fpr, tpr)
        (eval_dfs_dict['avg_precision']).at[which_label, 'epoch_'+str(epoch)] = sklearn.metrics.average_precision_score(true_labels, pred_probs)
    
    #Top k eval metrics (iter over examples)
    eval_dfs_dict['top_k'] = evaluate_top_k(eval_dfs_dict['top_k'],
                                    epoch, true_labels_array, pred_probs_array)
    return eval_dfs_dict
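#A minimal usage sketch (illustrative only; a runnable version appears in the
#__main__ block at the end of this file): with 4 examples and 3 labels,
#<true_labels_array> and <pred_probs_array> are both 4 x 3 arrays, and one call
#per epoch fills in the 'epoch_<epoch>' column of each dataframe in eval_dfs_dict.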

#################
# Top K Metrics #---------------------------------------------------------------
#################
def evaluate_top_k(eval_top_k_df, epoch, true_labels_array,
                   pred_probs_array):
    """<eval_top_k_df> is a pandas dataframe with epoch number as columns and
        k values as rows, where k is an integer"""
    num_labels = true_labels_array.shape[1] #e.g. 64
    total_examples = true_labels_array.shape[0]
    vals = [0 for x in range(1,num_labels+2)] #e.g. length 65 list but the index of the last element is 64 for num_labels=64
    for example_number in range(total_examples):
        #iterate through individual examples (predictions for an individual CT)
        #rather than iterating through predicted labels
        true_labels = true_labels_array[example_number,:]
        pred_probs = pred_probs_array[example_number,:]
        for k in range(1,num_labels+1): #e.g. 1,...,64
            previous_value = vals[k]
            incremental_update = calculate_top_k_accuracy(true_labels, pred_probs, k)
            new_value = previous_value + incremental_update
            vals[k] = new_value
    #Now update the dataframe. Should reach 100% performance by the end.
    for k in range(1,num_labels+1):
        eval_top_k_df.at[k,'epoch_'+str(epoch)] = vals[k]/total_examples
    
    ##Now average over all the examples
    #eval_top_k_df.loc[:,'epoch_'+str(epoch)] = eval_top_k_df.loc[:,'epoch_'+str(epoch)] / total_examples
    return eval_top_k_df

def calculate_top_k_accuracy(true_labels, pred_probs, k):
    k = min(k, len(true_labels)) #avoid accessing array elements that don't exist
    #argpartition described here: https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
    #get the indices of the largest k probabilities
    ind = np.argpartition(pred_probs, -1*k)[-1*k:]
    #now figure out what percent of these top predictions were equal to 1 in the
    #true_labels.
    #Note that the denominator should not exceed the number of true labels, to
    #avoid penalizing the model inappropriately:
    denom = min(k, np.sum(true_labels))
    if denom == 0: #because np.sum(true_labels) is 0
        #super important! must return 1 to avoid dividing by 0 and producing nan
        #we don't return 0 because then the model can never get perfect performance
        #even at k=num_labels because it'll get 0 for anything that has no labels
        return 1
    else:
        return float(np.sum(true_labels[ind]))/denom

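#A worked example for calculate_top_k_accuracy (the values are invented, for
#illustration only):
#    true_labels = np.array([1, 0, 1, 0])
#    pred_probs  = np.array([0.9, 0.8, 0.3, 0.1])
#    calculate_top_k_accuracy(true_labels, pred_probs, k=2)
#returns 0.5: the top-2 predictions are indices 0 and 1, only one of which is a
#true label, and the denominator is min(k=2, number of positives=2) = 2.
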
######################
# Accuracy and AUROC #----------------------------------------------------------
######################
def compute_accuracy(true_labels, labels_pred):
    """Return the accuracy of the model on the dataset"""
    correct = (true_labels == labels_pred)
    correct_sum = correct.sum()
    return (float(correct_sum)/len(true_labels))

def compute_confusion_matrix(true_labels, labels_pred):
    """Return the confusion matrix (as a string) along with the sensitivity,
    specificity, PPV, and NPV"""
    cm = sklearn.metrics.confusion_matrix(y_true=true_labels,
                          y_pred=labels_pred)
    if cm.size < 4: #cm is too small to calculate anything
        return np.nan, np.nan, np.nan, np.nan, np.nan
    true_neg, false_pos, false_neg, true_pos = cm.ravel()
    sensitivity = float(true_pos)/(true_pos + false_neg)
    specificity = float(true_neg)/(true_neg + false_pos)
    ppv = float(true_pos)/(true_pos + false_pos)
    npv = float(true_neg)/(true_neg + false_neg)
    
    return((str(cm).replace("\n","_")), sensitivity, specificity, ppv, npv)

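#A worked example for compute_confusion_matrix (invented values, for
#illustration only):
#    true_labels = np.array([0, 0, 1, 1])
#    labels_pred = np.array([0, 1, 1, 1])
#gives true_neg=1, false_pos=1, false_neg=0, true_pos=2, and therefore
#sensitivity=1.0, specificity=0.5, ppv=0.67, npv=1.0.
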
def compute_partial_auroc(fpr, tpr, thresh = 0.2, trapezoid = False, verbose=False):
    fpr_thresh, tpr_thresh = get_fpr_tpr_for_thresh(fpr, tpr, thresh)
    if len(fpr_thresh) < 2: #can't calculate an AUC with only 1 data point
        return np.nan
    if verbose:
        print('fpr: '+str(fpr))
        print('fpr_thresh: '+str(fpr_thresh))
        print('tpr: '+str(tpr))
        print('tpr_thresh: '+str(tpr_thresh))
    return sklearn.metrics.auc(fpr_thresh, tpr_thresh)

def get_fpr_tpr_for_thresh(fpr, tpr, thresh):
    """The <fpr> and <tpr> are already sorted according to threshold (which is
    sorted from highest to lowest, and is NOT the same as <thresh>; threshold
    is the third output of sklearn.metrics.roc_curve and is a vector of the
    thresholds used to calculate FPR and TPR). This function figures out where
    to bisect the FPR so that the remaining elements are no greater than
    <thresh>. It bisects the TPR in the same place."""
    p = (bisect.bisect_left(fpr, thresh)-1) #subtract one so that the FPR
    #of the remaining elements is NO GREATER THAN <thresh>
    return fpr[: p + 1], tpr[: p + 1]

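#A worked example for compute_partial_auroc (invented values, for illustration
#only):
#    fpr = np.array([0.0, 0.1, 0.3, 1.0])
#    tpr = np.array([0.0, 0.6, 0.8, 1.0])
#    compute_partial_auroc(fpr, tpr, thresh=0.2)
#keeps only the points with FPR below 0.2 (here [0.0, 0.1]) and returns the
#trapezoidal area under that truncated curve, 0.03.
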
######################
# Plotting Functions #----------------------------------------------------------
######################
def plot_pr_and_roc_curves(results_dir, label_meanings, true_labels, pred_probs,
                           epoch):
    """Plot and save single-class PR and ROC curves for one label under
    <results_dir>."""
    #Plot Precision Recall Curve
    pr_outfilepath = os.path.join(results_dir, 'PR_ep'+str(epoch)+'.pdf') #assumed file name
    plot_pr_curve_single_class(true_labels, pred_probs, pr_outfilepath)
    
    #Plot ROC Curve
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true = true_labels,
                                         y_score = pred_probs,
                                         pos_label = 1)
    roc_outfilepath = os.path.join(results_dir, 'ROC_ep'+str(epoch)+'.pdf') #assumed file name
    plot_roc_curve_single_class(fpr, tpr, epoch, roc_outfilepath)


def plot_roc_curve_multi_class(label_meanings, y_test, y_score,
                               outdir, setname, epoch):
    """<label_meanings>: list of strings, one for each label
    <y_test>: matrix of ground truth
    <y_score>: matrix of predicted probabilities
    <outdir>: directory to save output file
    <setname>: string e.g. 'train' 'valid' or 'test'
    <epoch>: int for epoch"""
    #Modified from https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
    n_classes = len(label_meanings)
    lw = 2
    
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = sklearn.metrics.roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = sklearn.metrics.auc(fpr[i], tpr[i])
    
    #make order df (note that roc_auc is a dictionary with ints as keys
    #and AUCs as values)
    order = pd.DataFrame(np.zeros((n_classes,1)), index = [x for x in range(n_classes)],
                         columns = ['roc_auc'])
    for i in range(n_classes):
        order.at[i,'roc_auc'] = roc_auc[i]
    order = order.sort_values(by='roc_auc',ascending=False)
    
    #Plot all ROC curves
    #Plot from highest AUC to lowest AUC, cycling through the colors list
    plt.figure()
    colors_list = ['palevioletred','darkorange','yellowgreen','olive','deepskyblue','royalblue','navy']
    curves_plotted = 0
    for i in order.index.values.tolist()[0:10]: #only plot the top ten so the plot is readable
        color_idx = curves_plotted%len(colors_list) #cycle through the colors list in order of colors
        color = colors_list[color_idx]
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='{:5s} (area {:0.2f})'.format(label_meanings[i], roc_auc[i]))
        curves_plotted+=1
    
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(setname.lower().capitalize()+' ROC Epoch '+str(epoch))
    plt.legend(loc="lower right",prop={'size':6})
    outfilepath = os.path.join(outdir,setname+'_ROC_ep'+str(epoch)+'.pdf')
    plt.savefig(outfilepath)
    plt.close()

def plot_pr_curve_multi_class(label_meanings, y_test, y_score,
                              outdir, setname, epoch):
    """<label_meanings>: list of strings, one for each label
    <y_test>: matrix of ground truth
    <y_score>: matrix of predicted probabilities
    <outdir>: directory to save output file
    <setname>: string e.g. 'train' 'valid' or 'test'
    <epoch>: int for epoch"""
    #Modified from https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
    #https://stackoverflow.com/questions/29656550/how-to-plot-pr-curve-over-10-folds-of-cross-validation-in-scikit-learn
    n_classes = len(label_meanings)
    lw = 2
    
    #make order df.
    order = pd.DataFrame(np.zeros((n_classes,1)), index = [x for x in range(n_classes)],
                         columns = ['prc'])
    for i in range(n_classes):
        order.at[i,'prc'] = sklearn.metrics.average_precision_score(y_test[:,i], y_score[:,i])
    order = order.sort_values(by='prc',ascending=False)
    
    #Plot
    plt.figure()
    colors_list = ['palevioletred','darkorange','yellowgreen','olive','deepskyblue','royalblue','navy']
    curves_plotted = 0
    for i in order.index.values.tolist()[0:10]: #only plot the top ten so the plot is readable
        color_idx = curves_plotted%len(colors_list) #cycle through the colors list in order of colors
        color = colors_list[color_idx]
        average_precision = sklearn.metrics.average_precision_score(y_test[:,i], y_score[:,i])
        precision, recall, _ = sklearn.metrics.precision_recall_curve(y_test[:,i], y_score[:,i])
        plt.step(recall, precision, color=color, where='post',
                 label='{:5s} (area {:0.2f})'.format(label_meanings[i], average_precision))
        curves_plotted+=1
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(setname.lower().capitalize()+' PRC Epoch '+str(epoch))
    plt.legend(loc="lower right",prop={'size':6})
    outfilepath = os.path.join(outdir,setname+'_PR_ep'+str(epoch)+'.pdf')
    plt.savefig(outfilepath)
    plt.close()

def plot_pr_curve_single_class(true_labels, pred_probs, outfilepath):
    #http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
    average_precision = sklearn.metrics.average_precision_score(true_labels, pred_probs)
    precision, recall, _ = sklearn.metrics.precision_recall_curve(true_labels, pred_probs)
    plt.step(recall, precision, color='b', alpha=0.2,
             where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2,
                     color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
              average_precision))
    plt.savefig(outfilepath)
    plt.close()

def plot_roc_curve_single_class(fpr, tpr, epoch, outfilepath):
    #http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
    roc_auc = sklearn.metrics.auc(fpr, tpr)
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig(outfilepath)
    plt.close()

def plot_learning_curves(train_loss, valid_loss, results_dir, descriptor):
    """Variables
    <train_loss> and <valid_loss> are numpy arrays with one numerical entry
    for each epoch quantifying the loss for that epoch."""
    x = np.arange(0,len(train_loss))
    plt.plot(x, train_loss, color='blue', lw=2, label='train')
    plt.plot(x, valid_loss, color='green',lw = 2, label='valid')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend(loc='lower right')
    plt.savefig(os.path.join(results_dir, descriptor+'_Learning_Curves.png'))
    plt.close()
    #save numpy arrays of the losses
    np.save(os.path.join(results_dir,'train_loss.npy'),train_loss)
    np.save(os.path.join(results_dir,'valid_loss.npy'),valid_loss)

def plot_heatmap(outprefix, numeric_array, center, xticklabels, yticklabels):
    """Save a heatmap based on numeric_array"""
    seaborn.set(font_scale=0.6)
    seaplt = (seaborn.heatmap(numeric_array,
                           center=center,
                           xticklabels=xticklabels,
                           yticklabels=yticklabels)).get_figure()
    seaplt.savefig(outprefix+'.png')
    seaplt.clf()
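
#The following block is a minimal, self-contained smoke test sketch (added for
#illustration; it is not part of the training pipeline, which calls these
#functions from cnn.py, and the label names and arrays below are invented).
#It exercises initialize_evaluation_dfs, evaluate_all, and
#plot_roc_curve_multi_class on synthetic data.
if __name__ == '__main__':
    import tempfile
    label_meanings = ['labelA', 'labelB', 'labelC']
    true_labels_array = np.array([[1, 0, 1],
                                  [0, 1, 0],
                                  [1, 1, 0],
                                  [0, 0, 1]])
    pred_probs_array = np.array([[0.9, 0.2, 0.7],
                                 [0.1, 0.8, 0.4],
                                 [0.6, 0.7, 0.2],
                                 [0.3, 0.1, 0.9]])
    #Build empty metric dataframes for a single epoch and fill them in
    valid_dfs, _ = initialize_evaluation_dfs(label_meanings, num_epochs=1)
    valid_dfs = evaluate_all(valid_dfs, 0, label_meanings,
                             true_labels_array, pred_probs_array)
    print(valid_dfs['auroc'])
    print(valid_dfs['top_k'])
    #Write a multi-class ROC plot to a temporary directory
    outdir = tempfile.mkdtemp()
    plot_roc_curve_multi_class(label_meanings, true_labels_array,
                               pred_probs_array, outdir, 'valid', 0)
    print('Wrote', os.path.join(outdir, 'valid_ROC_ep0.pdf'))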