--- a +++ b/generate_figures_and_tables.py @@ -0,0 +1,484 @@ +# %% Import packages +import pandas as pd +import numpy as np +from sklearn.metrics import (confusion_matrix, + precision_score, recall_score, f1_score, + precision_recall_curve, average_precision_score) +import matplotlib.pyplot as plt +import seaborn as sns +import xarray as xr +from scipy.stats.distributions import chi2 +from itertools import combinations + + +# %% Auxiliar functions +def get_scores(y_true, y_pred, score_fun): + nclasses = np.shape(y_true)[1] + scores = [] + for name, fun in score_fun.items(): + scores += [[fun(y_true[:, k], y_pred[:, k]) for k in range(nclasses)]] + return np.array(scores).T + + +def specificity_score(y_true, y_pred): + m = confusion_matrix(y_true, y_pred, labels=[0, 1]) + spc = m[0, 0] * 1.0 / (m[0, 0] + m[0, 1]) + return spc + + +def get_optimal_precision_recall(y_true, y_score): + """Find precision and recall values that maximize f1 score.""" + n = np.shape(y_true)[1] + opt_precision = [] + opt_recall = [] + opt_threshold = [] + for k in range(n): + # Get precision-recall curve + precision, recall, threshold = precision_recall_curve(y_true[:, k], y_score[:, k]) + # Compute f1 score for each point (use nan_to_num to avoid nans messing up the results) + f1_score = np.nan_to_num(2 * precision * recall / (precision + recall)) + # Select threshold that maximize f1 score + index = np.argmax(f1_score) + opt_precision.append(precision[index]) + opt_recall.append(recall[index]) + t = threshold[index-1] if index != 0 else threshold[0]-1e-10 + opt_threshold.append(t) + return np.array(opt_precision), np.array(opt_recall), np.array(opt_threshold) + +def affer_results(y_true, y_pred): + """Return true positives, false positives, true negatives, false negatives. + + Parameters + ---------- + y_true : ndarray + True value + y_pred : ndarray + Predicted value + + Returns + ------- + tn, tp, fn, fp: ndarray + Boolean matrices containing true negatives, true positives, false negatives and false positives. + cm : ndarray + Matrix containing: 0 - true negative, 1 - true positive, + 2 - false negative, and 3 - false positive. + """ + + # True negative + tn = (y_true == y_pred) & (y_pred == 0) + # True positive + tp = (y_true == y_pred) & (y_pred == 1) + # False positive + fp = (y_true != y_pred) & (y_pred == 1) + # False negative + fn = (y_true != y_pred) & (y_pred == 0) + + # Generate matrix of "tp, fp, tn, fn" + m, n = np.shape(y_true) + cm = np.zeros((m, n), dtype=int) + cm[tn] = 0 + cm[tp] = 1 + cm[fn] = 2 + cm[fp] = 3 + return tn, tp, fn, fp, cm + + +# %% Constants +score_fun = {'Precision': precision_score, + 'Recall': recall_score, 'Specificity': specificity_score, + 'F1 score': f1_score} +diagnosis = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'AF', 'ST'] +nclasses = len(diagnosis) +predictor_names = ['DNN', 'cardio.', 'emerg.', 'stud.'] + +# %% Read datasets +# Get two annotators +y_cardiologist1 = pd.read_csv('./data/annotations/cardiologist1.csv').values +y_cardiologist2 = pd.read_csv('./data/annotations/cardiologist2.csv').values +# Get true values +y_true = pd.read_csv('./data/annotations/gold_standard.csv').values +# Get residents and students performance +y_cardio = pd.read_csv('./data/annotations/cardiology_residents.csv').values +y_emerg = pd.read_csv('./data/annotations/emergency_residents.csv').values +y_student = pd.read_csv('./data/annotations/medical_students.csv').values +# get y_score for different models +y_score_list = [np.load('./dnn_predicts/other_seeds/model_' + str(i+1) + '.npy') for i in range(10)] + + +# %% Get average model model +# Get micro average precision +micro_avg_precision = [average_precision_score(y_true[:, :6], y_score[:, :6], average='micro') + for y_score in y_score_list] +# get ordered index +index = np.argsort(micro_avg_precision) +print('Micro average precision') +print(np.array(micro_avg_precision)[index]) +# get 6th best model (immediatly above median) out 10 different models +k_dnn_best = index[5] +y_score_best = y_score_list[k_dnn_best] +# Get threshold that yield the best precision recall using "get_optimal_precision_recall" on validation set +# (we rounded it up to three decimal cases to make it easier to read...) +threshold = np.array([0.124, 0.07, 0.05, 0.278, 0.390, 0.174]) +mask = y_score_best > threshold +# Get neural network prediction +# This data was also saved in './data/annotations/dnn.csv' +y_neuralnet = np.zeros_like(y_score_best) +y_neuralnet[mask] = 1 +y_neuralnet[mask] = 1 + + +# %% Generate table with scores for the average model (Table 2) +scores_list = [] +for y_pred in [y_neuralnet, y_cardio, y_emerg, y_student]: + # Compute scores + scores = get_scores(y_true, y_pred, score_fun) + # Put them into a data frame + scores_df = pd.DataFrame(scores, index=diagnosis, columns=score_fun.keys()) + # Append + scores_list.append(scores_df) +# Concatenate dataframes +scores_all_df = pd.concat(scores_list, axis=1, keys=['DNN', 'cardio.', 'emerg.', 'stud.']) +# Change multiindex levels +scores_all_df = scores_all_df.swaplevel(0, 1, axis=1) +scores_all_df = scores_all_df.reindex(level=0, columns=score_fun.keys()) +# Save results +scores_all_df.to_excel("./outputs/tables/scores.xlsx", float_format='%.3f') +scores_all_df.to_csv("./outputs/tables/scores.csv", float_format='%.3f') + + +# %% Plot precision recall curves (Figure 2) +for k, name in enumerate(diagnosis): + precision_list = [] + recall_list = [] + threshold_list = [] + average_precision_list = [] + fig, ax = plt.subplots() + lw = 2 + t = ['bo', 'rv', 'gs', 'kd'] + for j, y_score in enumerate(y_score_list): + # Get precision-recall curve + precision, recall, threshold = precision_recall_curve(y_true[:, k], y_score[:, k]) + recall[np.isnan(recall)] = 0 # change nans to 0 + precision[np.isnan(precision)] = 0 # change nans to 0 + # Plot if is the choosen option + if j == k_dnn_best: + ax.plot(recall, precision, color='blue', alpha=0.7) + # Compute average precision + average_precision = average_precision_score(y_true[:, k], y_score[:, k]) + precision_list += [precision] + recall_list += [recall] + average_precision_list += [average_precision] + threshold_list += [threshold] + # Plot shaded region containing maximum and minimun from other executions + recall_all = np.concatenate(recall_list) + recall_all = np.sort(recall_all) # sort + recall_all = np.unique(recall_all) # remove repeated entries + recall_vec = [] + precision_min = [] + precision_max = [] + for r in recall_all: + p_max = [max(precision[recall == r]) for recall, precision in zip(recall_list, precision_list)] + p_min = [min(precision[recall == r]) for recall, precision in zip(recall_list, precision_list)] + recall_vec += [r, r] + precision_min += [min(p_max), min(p_min)] + precision_max += [max(p_max), max(p_min)] + ax.plot(recall_vec, precision_min, color='blue', alpha=0.3) + ax.plot(recall_vec, precision_max, color='blue', alpha=0.3) + ax.fill_between(recall_vec, precision_min, precision_max, + facecolor="blue", alpha=0.3) + # Plot iso-f1 curves + f_scores = np.linspace(0.1, 0.95, num=15) + for f_score in f_scores: + x = np.linspace(0.0000001, 1, 1000) + y = f_score * x / (2 * x - f_score) + ax.plot(x[y >= 0], y[y >= 0], color='gray', ls=':', lw=0.7, alpha=0.25) + # Plot values in + for npred in range(4): + ax.plot(scores_list[npred]['Recall'][k], scores_list[npred]['Precision'][k], + t[npred], label=predictor_names[npred]) + plt.xticks(fontsize=16) + plt.yticks(fontsize=16) + ax.set_xlim([0.0, 1.0]) + ax.set_ylim([0.0, 1.02]) + if k in [3, 4, 5]: + ax.set_xlabel('Recall (Sensitivity)', fontsize=17) + if k in [0, 3]: + ax.set_ylabel('Precision (PPV)', fontsize=17) + # plt.title('Precision-Recall curve (' + name + ')') + if k == 0: + plt.legend(loc="lower left", fontsize=17) + else: + ax.legend().remove() + plt.tight_layout() + plt.savefig('./outputs/figures/precision_recall_{0}.pdf'.format(name)) + +# %% Confusion matrices (Supplementary Table 1) + +M = [[confusion_matrix(y_true[:, k], y_pred[:, k], labels=[0, 1]) + for k in range(nclasses)] for y_pred in [y_neuralnet, y_cardio, y_emerg, y_student]] + +M_xarray = xr.DataArray(np.array(M), + dims=['predictor', 'diagnosis', 'true label', 'predicted label'], + coords={'predictor': ['DNN', 'cardio.', 'emerg.', 'stud.'], + 'diagnosis': diagnosis, + 'true label': ['not present', 'present'], + 'predicted label': ['not present', 'present']}) +confusion_matrices = M_xarray.to_dataframe('n') +confusion_matrices = confusion_matrices.reorder_levels([1, 2, 3, 0], axis=0) +confusion_matrices = confusion_matrices.unstack() +confusion_matrices = confusion_matrices.unstack() +confusion_matrices = confusion_matrices['n'] +confusion_matrices.to_excel("./outputs/tables/confusion matrices.xlsx", float_format='%.3f') +confusion_matrices.to_csv("./outputs/tables/confusion matrices.csv", float_format='%.3f') + + +#%% Compute scores and bootstraped version of these scores + +bootstrap_nsamples = 1000 +percentiles = [2.5, 97.5] +scores_resampled_list = [] +scores_percentiles_list = [] +for y_pred in [y_neuralnet, y_cardio, y_emerg, y_student]: + # Compute bootstraped samples + np.random.seed(123) # NEVER change this =P + n, _ = np.shape(y_true) + samples = np.random.randint(n, size=n * bootstrap_nsamples) + # Get samples + y_true_resampled = np.reshape(y_true[samples, :], (bootstrap_nsamples, n, nclasses)) + y_doctors_resampled = np.reshape(y_pred[samples, :], (bootstrap_nsamples, n, nclasses)) + # Apply functions + scores_resampled = np.array([get_scores(y_true_resampled[i, :, :], y_doctors_resampled[i, :, :], score_fun) + for i in range(bootstrap_nsamples)]) + # Sort scores + scores_resampled.sort(axis=0) + # Append + scores_resampled_list.append(scores_resampled) + + # Compute percentiles index + i = [int(p / 100.0 * bootstrap_nsamples) for p in percentiles] + # Get percentiles + scores_percentiles = scores_resampled[i, :, :] + # Convert percentiles to a dataframe + scores_percentiles_df = pd.concat([pd.DataFrame(x, index=diagnosis, columns=score_fun.keys()) + for x in scores_percentiles], keys=['p1', 'p2'], axis=1) + # Change multiindex levels + scores_percentiles_df = scores_percentiles_df.swaplevel(0, 1, axis=1) + scores_percentiles_df = scores_percentiles_df.reindex(level=0, columns=score_fun.keys()) + # Append + scores_percentiles_list.append(scores_percentiles_df) +# Concatenate dataframes +scores_percentiles_all_df = pd.concat(scores_percentiles_list, axis=1, keys=predictor_names) +# Change multiindex levels +scores_percentiles_all_df = scores_percentiles_all_df.reorder_levels([1, 0, 2], axis=1) +scores_percentiles_all_df = scores_percentiles_all_df.reindex(level=0, columns=score_fun.keys()) + + +#%% Print box plot (Supplementary Figure 1) +# Convert to xarray +scores_resampled_xr = xr.DataArray(np.array(scores_resampled_list), + dims=['predictor', 'n', 'diagnosis', 'score_fun'], + coords={ + 'predictor': predictor_names, + 'n': range(bootstrap_nsamples), + 'diagnosis': ['1dAVb', 'RBBB', 'LBBB', 'SB', 'AF', 'ST'], + 'score_fun': list(score_fun.keys())}) +# Remove everything except f1_score +for sf in score_fun: + fig, ax = plt.subplots() + f1_score_resampled_xr = scores_resampled_xr.sel(score_fun=sf) + # Convert to dataframe + f1_score_resampled_df = f1_score_resampled_xr.to_dataframe(name=sf).reset_index(level=[0, 1, 2]) + # Plot seaborn + ax = sns.boxplot(x="diagnosis", y=sf, hue="predictor", data=f1_score_resampled_df) + # Save results + plt.xticks(fontsize=16) + plt.yticks(fontsize=16) + plt.xlabel("") + plt.ylabel("", fontsize=16) + if sf == "F1 score": + plt.legend(fontsize=17) + else: + ax.legend().remove() + plt.tight_layout() + plt.savefig('./outputs/figures/boxplot_bootstrap_{}.pdf'.format(sf)) + + +scores_resampled_xr.to_dataframe(name='score').to_csv('./outputs/figures/boxplot_bootstrap_data.txt') + +#%% McNemar test (Supplementary Table 3) +# Get correct and wrong predictions for each of them (cm >= 2 correspond to wrong predictions) +wrong_predictions = np.array([affer_results(y_true, y_pred)[4] >= 2 + for y_pred in [y_neuralnet, y_cardio, y_emerg, y_student]]) + +# Compute McNemar score +names = ["DNN", "cardio.", "emerg.", "stud."] +mcnemar_name = [] +mcnemar_score = np.empty((6, 6)) +k = 0 +for i in range(4): + for j in range(i+1, 4): + a_not_b = np.sum(wrong_predictions[i, :, :] & ~wrong_predictions[j, :, :], axis=0) + b_not_a = np.sum(~wrong_predictions[i, :, :] & wrong_predictions[j, :, :], axis=0) + # An alterantive to the standard McNemar test is to include a + # continuity correction term, resulting in: + # mcnemar_corr_score = np.square(np.abs(a_not_b - b_not_a) - 1) / (a_not_b + b_not_a) + # I tested both and came the conclusion, that we cannot reject the null hypotesis + # for neither. The standard test however provide results that are easier to visualize. + mcnemar_score[k, :] = np.square(a_not_b - b_not_a) / (a_not_b + b_not_a) + k += 1 + mcnemar_name += [names[i] + " vs " + names[j]] + +mcnemar = pd.DataFrame(1-chi2.cdf(mcnemar_score, 1), index=mcnemar_name, columns=diagnosis) # p-value + +# Save results +mcnemar.to_excel("./outputs/tables/mcnemar.xlsx", float_format='%.3f') +mcnemar.to_csv("./outputs/tables/mcnemar.csv", float_format='%.3f') + +# %% Kappa score classifiers (Supplementary Table 2(a)) + +names = ["DNN", "cardio.", "emerg.", "stud."] +predictors = [y_neuralnet, y_cardio, y_emerg, y_student] +kappa_name = [] +kappa_score = np.empty((6, 6)) +k = 0 +for i in range(4): + for j in range(i+1, 4): + y_pred_1 = predictors[i] + y_pred_2 = predictors[j] + # Get "confusion matrix" + negative_negative, positive_positive, positive_negative, negative_positive, _ = \ + affer_results(y_pred_1, y_pred_2) + p_p = positive_positive.sum(axis=0) + p_n = positive_negative.sum(axis=0) + n_p = negative_positive.sum(axis=0) + n_n = negative_negative.sum(axis=0) + total_sum = p_p + p_n + n_p + n_n + # Relative agreement + r_agree = (p_p + n_n) / total_sum + # Empirical probability of both saying yes + p_yes = (p_p + p_n) * (p_p + n_p) / total_sum**2 + # Empirical probability of both saying no + p_no = (n_n + n_p) * (n_n + p_n) / total_sum**2 + # Empirical probability of agreement + p_agree = p_yes + p_no + # Kappa score + kappa_score[k, :] = (r_agree - p_agree) / (1 - p_agree) + k += 1 + kappa_name += [names[i] + " vs " + names[j]] + +kappa = pd.DataFrame(kappa_score, index=kappa_name, columns=diagnosis) # p-value + +# Save results +kappa.to_excel("./outputs/tables/kappa.xlsx", float_format='%.3f') +kappa.to_csv("./outputs/tables/kappa.csv", float_format='%.3f') + + +# %% Kappa score dataset generation (Supplementary Table 2(b)) + +# Compute kappa score +kappa_list = [] +names_list = [] +raters = [('DNN', y_neuralnet), ('Cert. cardiol. 1', y_cardiologist1), ('Certif. cardiol. 2', y_cardiologist2)] +for r1, r2 in combinations(raters, 2): + name1, y1 = r1 + name2, y2 = r2 + negative_negative, positive_positive, positive_negative, negative_positive, _ = \ + affer_results(y1, y2) + p_p = positive_positive.sum(axis=0) + p_n = positive_negative.sum(axis=0) + n_p = negative_positive.sum(axis=0) + n_n = negative_negative.sum(axis=0) + total_sum = p_p + p_n + n_p + n_n + # Relative agreement + r_agree = (p_p + n_n) / total_sum + # Empirical probability of both saying yes + p_yes = (p_p + p_n) * (p_p + n_p) / total_sum ** 2 + # Empirical probability of both saying no + p_no = (n_n + n_p) * (n_n + p_n) / total_sum ** 2 + # Empirical probability of agreement + p_agree = p_yes + p_no + # Kappa score + kappa = (r_agree - p_agree) / (1 - p_agree) + kappa_list.append(kappa) + names_list.append('{} vs {}'.format(name1, name2)) + +kappas_annotators_and_DNN = pd.DataFrame(np.stack(kappa_list), columns=diagnosis, index=names_list) +print(kappas_annotators_and_DNN) +kappas_annotators_and_DNN.to_excel("./outputs/tables/kappas_annotators_and_DNN.xlsx", float_format='%.3f') +kappas_annotators_and_DNN.to_csv("./outputs/tables/kappas_annotators_and_DNN.csv", float_format='%.3f') + +# %% Compute scores and bootstraped version of these scores on alternative splits +bootstrap_nsamples = 1000 +scores_resampled_list = [] +scores_percentiles_list = [] +for name in ['normal_order', 'date_order', 'individual_patients', 'base_model']: + print(name) + # Get data + yn_true = y_true + yn_score = np.load('./dnn_predicts/other_splits/model_'+name+'.npy') if not name == 'base_model' else y_score_best + # Compute threshold + nclasses = np.shape(yn_true)[1] + opt_precision, opt_recall, threshold = get_optimal_precision_recall(yn_true, yn_score) + mask_n = yn_score > threshold + yn_pred = np.zeros_like(yn_score) + yn_pred[mask_n] = 1 + # Compute bootstraped samples + np.random.seed(123) # NEVER change this =P + n, _ = np.shape(yn_true) + samples = np.random.randint(n, size=n * bootstrap_nsamples) + # Get samples + y_true_resampled = np.reshape(yn_true[samples, :], (bootstrap_nsamples, n, nclasses)) + y_doctors_resampled = np.reshape(yn_pred[samples, :], (bootstrap_nsamples, n, nclasses)) + # Apply functions + scores_resampled = np.array([get_scores(y_true_resampled[i, :, :], y_doctors_resampled[i, :, :], score_fun) + for i in range(bootstrap_nsamples)]) + # Sort scores + scores_resampled.sort(axis=0) + # Append + scores_resampled_list.append(scores_resampled) + + # Compute percentiles index + i = [int(p / 100.0 * bootstrap_nsamples) for p in percentiles] + # Get percentiles + scores_percentiles = scores_resampled[i, :, :] + # Convert percentiles to a dataframe + scores_percentiles_df = pd.concat([pd.DataFrame(x, index=diagnosis, columns=score_fun.keys()) + for x in scores_percentiles], keys=['p1', 'p2'], axis=1) + # Change multiindex levels + scores_percentiles_df = scores_percentiles_df.swaplevel(0, 1, axis=1) + scores_percentiles_df = scores_percentiles_df.reindex(level=0, columns=score_fun.keys()) + # Append + scores_percentiles_list.append(scores_percentiles_df) + +# %% Print box plot on alternative splits (Supplementary Figure 2 (a)) +scores_resampled_xr = xr.DataArray(np.array(scores_resampled_list), + dims=['predictor', 'n', 'diagnosis', 'score_fun'], + coords={ + 'predictor': ['random', 'by date', 'by patient', 'original DNN'], + 'n': range(bootstrap_nsamples), + 'diagnosis': ['1dAVb', 'RBBB', 'LBBB', 'SB', 'AF', 'ST'], + 'score_fun': list(score_fun.keys())}) +# Remove everything except f1_score +sf = 'F1 score' +fig, ax = plt.subplots() +f1_score_resampled_xr = scores_resampled_xr.sel(score_fun=sf) +# Convert to dataframe +f1_score_resampled_df = f1_score_resampled_xr.to_dataframe(name=sf).reset_index(level=[0, 1, 2]) +# Plot seaborn +ax = sns.boxplot(x="diagnosis", y=sf, hue="predictor", data=f1_score_resampled_df, + order=['1dAVb', 'SB', 'AF', 'ST', 'RBBB', 'LBBB'], + palette=sns.color_palette("Set1", n_colors=8)) +plt.axvline(3.5, color='black', ls='--') +plt.axvline(5.5, color='black', ls='--') +plt.axvspan(3.5, 5.5, alpha=0.1, color='gray') +# Save results +plt.xticks(fontsize=16) +plt.yticks(fontsize=16) +plt.xlabel("") +plt.ylabel("F1 score", fontsize=16) +plt.legend(fontsize=17) +plt.ylim([0.4, 1.05]) +plt.xlim([-0.5, 5.5]) +plt.tight_layout() +plt.savefig('./outputs/figures/boxplot_bootstrap_other_splits_{0}.pdf'.format(sf)) +f1_score_resampled_df.to_csv('./outputs/figures/boxplot_bootstrap_other_splits_data.txt', index=False) \ No newline at end of file