Diff of /ML_CV/validation.py [000000] .. [cd187b]

Switch to side-by-side view

--- a
+++ b/ML_CV/validation.py
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+# @Author  : chq_N
+# @Time    : 2020/8/01
+
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import StratifiedKFold
+
+
def cv_test(features, label, indices, fn_interval=5, num_K=100):
    """Search for the number of top-ranked features that maximizes CV AUC.

    For each candidate feature count k = fn_interval, 2*fn_interval, ...
    (capped at the total feature count), runs 5 repeats of stratified
    5-fold cross-validation with a random forest and tracks the candidate
    with the best mean per-fold AUC.

    Args:
        features: 2-D array of shape (n_samples, n_features).
        label: 1-D array of binary class labels.
        indices: feature indices ordered by importance (best first);
            the first k entries select the candidate feature subset.
        fn_interval: step size between candidate feature counts.
        num_K: maximum number of candidate counts to try.

    Returns:
        Tuple (best_k, best_auc): winning feature count and its mean AUC.
    """
    def cross_val(X, y, seed):
        # Seed both the global NumPy RNG and the splitter explicitly so
        # fold assignment is reproducible on every scikit-learn version.
        np.random.seed(seed)
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        prob_list = list()
        auc_list = list()
        y_list = list()
        for train_index, test_index in kf.split(X, y):
            # NOTE: max_features='auto' was removed in scikit-learn 1.3;
            # 'sqrt' is the identical behavior for classifiers.
            model = RandomForestClassifier(
                n_estimators=300, criterion='gini',
                random_state=seed + 5, max_features='sqrt')
            model.fit(X[train_index], y[train_index])
            y_pred = model.predict_proba(X[test_index])[:, 1]
            test_auc = roc_auc_score(y[test_index], y_pred)
            prob_list.append(y_pred)
            auc_list.append(test_auc)
            y_list.append(y[test_index])
        # Pooled out-of-fold probabilities/labels plus the per-fold mean AUC.
        return np.concatenate(prob_list), np.concatenate(y_list), np.mean(auc_list)

    best_k = 0
    best_auc = 0

    for ii in range(1, num_K + 1):
        # Stop once the previous step already used every available feature.
        if (ii - 1) * fn_interval >= features.shape[-1]:
            break
        f_n = min(ii * fn_interval, features.shape[-1])
        X_selected = features[:, indices[0:f_n]]
        auc_all = list()   # AUC on pooled out-of-fold predictions
        auc_mean = list()  # mean of per-fold AUCs
        for i in range(5):  # 5 repeats with distinct seeds
            y_pred, _y, auc = cross_val(X_selected, label, i * 10)
            test_auc = roc_auc_score(_y, y_pred)
            auc_all.append(test_auc)
            auc_mean.append(auc)
        auc_all = np.mean(auc_all)
        auc_mean = np.mean(auc_mean)
        print(f_n, 'auc all:', auc_all, 'auc mean:', auc_mean)
        # Selection criterion: mean of per-fold AUCs, as in the original.
        if auc_mean > best_auc:
            best_auc = auc_mean
            best_k = f_n
    return best_k, best_auc
+
+
def detail_test(features, label, indices, f_n, ppv_th=0.7):
    """Evaluate the top-f_n features with repeated stratified CV in detail.

    Runs 5 repeats of stratified 5-fold cross-validation with a random
    forest on the f_n highest-ranked features and collects, per repeat:
    pooled-prediction AUC, mean/std of per-fold AUCs, an interpolated ROC
    curve, and sensitivity/specificity/PPV/NPV/accuracy at the smallest
    threshold whose PPV reaches ppv_th.

    Args:
        features: 2-D array of shape (n_samples, n_features).
        label: 1-D array of binary class labels.
        indices: feature indices ordered by importance (best first).
        f_n: number of top-ranked features to use.
        ppv_th: PPV target for the operating-point threshold sweep.

    Returns:
        Tuple of lists (one entry per repeat):
        (tpr, auc_all, auc_mean, auc_std,
         sensitivity, specificity, ppv, npv, acc).
    """
    def cross_val(X, y, seed):
        # Seed both the global NumPy RNG and the splitter explicitly so
        # fold assignment is reproducible on every scikit-learn version.
        np.random.seed(seed)
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        prob_list = list()
        auc_list = list()
        y_list = list()
        for train_index, test_index in kf.split(X, y):
            # NOTE: max_features='auto' was removed in scikit-learn 1.3;
            # 'sqrt' is the identical behavior for classifiers.
            model = RandomForestClassifier(
                n_estimators=300, criterion='gini',
                random_state=seed + 5, max_features='sqrt')
            model.fit(X[train_index], y[train_index])
            y_pred = model.predict_proba(X[test_index])[:, 1]
            test_auc = roc_auc_score(y[test_index], y_pred)
            prob_list.append(y_pred)
            auc_list.append(test_auc)
            y_list.append(y[test_index])
        # Sample std (ddof=1) across the 5 folds.
        return (np.concatenate(prob_list), np.concatenate(y_list),
                np.mean(auc_list), np.std(auc_list, ddof=1))

    def get_sen_spe(pred, label):
        """Sweep thresholds 0.000..0.999 and report metrics at the first
        threshold whose PPV reaches ppv_th (or at 0.999 if none does)."""
        label = (label > 0).astype('int')

        def criteria(x, th):
            return (x > th).astype('int')

        for step in range(0, 1000, 1):
            th = step / 1000
            TP = ((label == 1) * (criteria(pred, th) == 1))
            TN = ((label == 0) * (criteria(pred, th) == 0))
            FP = ((label == 0) * (criteria(pred, th) == 1))
            FN = ((label == 1) * (criteria(pred, th) == 0))
            # 1e-9 guards against division by zero for degenerate splits.
            sensitivity = TP.sum() / (TP.sum() + FN.sum() + 1e-9)
            specificity = TN.sum() / (TN.sum() + FP.sum() + 1e-9)
            ppv = TP.sum() / (TP.sum() + FP.sum() + 1e-9)
            npv = TN.sum() / (TN.sum() + FN.sum() + 1e-9)
            acc = (TP.sum() + TN.sum()) / (
                TN.sum() + FP.sum() + TP.sum() + FN.sum() + 1e-9)
            if ppv >= ppv_th:
                break
        return sensitivity, specificity, ppv, npv, acc

    def draw_auc(y_pred, y):
        # Interpolate the ROC onto a fixed 1000-point FPR grid so the
        # curves from different repeats can be averaged point-wise.
        inter_fpr = np.linspace(0, 1, 1000)
        fpr, tpr, thresholds = roc_curve(y, y_pred)
        inter_tpr = np.interp(inter_fpr, fpr, tpr)
        inter_tpr[0] = 0.0
        inter_tpr[-1] = 1.0
        return inter_tpr

    X_selected = features[:, indices[0:f_n]]
    auc_all = list()
    auc_mean = list()
    auc_std = list()
    sensitivity = list()
    specificity = list()
    ppv = list()
    npv = list()
    acc = list()
    tpr = list()
    for i in range(5):  # 5 repeats with distinct seeds
        y_pred, _y, _auc_mean, _auc_std = cross_val(X_selected, label, i * 10)
        test_auc = roc_auc_score(_y, y_pred)
        _tpr = draw_auc(y_pred, _y)
        tpr.append(_tpr)
        auc_all.append(test_auc)
        auc_mean.append(_auc_mean)
        auc_std.append(_auc_std)
        _sen, _spe, _ppv, _npv, _acc = get_sen_spe(y_pred, _y)
        sensitivity.append(_sen)
        specificity.append(_spe)
        ppv.append(_ppv)
        npv.append(_npv)
        acc.append(_acc)
    return tpr, auc_all, auc_mean, auc_std, sensitivity, specificity, ppv, npv, acc
+
+
def draw_mean_auc(
        tpr, mean_sen, std_sen,
        mean_spe, std_spe,
        mean_auc, std_auc,
        save_name):
    """Plot the mean ROC curve with a ±1-std band and the PPV=0.7 point.

    Saves the figure to '<save_name>.pdf' and shows it interactively.

    Args:
        tpr: list/array of per-repeat TPR curves, all sampled on the same
            1000-point FPR grid (as produced by detail_test).
        mean_sen, std_sen: mean/std sensitivity at the operating point.
        mean_spe, std_spe: mean/std specificity at the operating point.
        mean_auc, std_auc: mean/std AUC shown in the legend.
        save_name: basename for the output PDF and the plot title.
    """
    tpr = np.asarray(tpr)
    fpr = np.linspace(0, 1, 1000)
    fig, ax = plt.subplots()
    ax.patch.set_facecolor('white')
    ax.grid(color='gray', linestyle='-.', linewidth=0.7)
    ax.spines['bottom'].set_color('black')
    ax.spines['left'].set_color('black')
    ax.tick_params(axis='x', colors='black')
    ax.tick_params(axis='y', colors='black')
    # Diagonal chance line.
    ax.plot([0, 1], [0, 1],
            linestyle='--', lw=2, color='r',
            label='Chance',
            alpha=.8)

    mean_tpr = np.mean(tpr, axis=0)
    ax.plot(fpr, mean_tpr, color='b',
            label=r'ROC (AUC = %0.2f$\pm$%0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    # ±1 sample-std band around the mean curve, clipped to [0, 1].
    std_tpr = np.std(tpr, axis=0, ddof=1)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.'
                    )

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title='ROC Curve of %s' % save_name
           )

    # Operating point at the PPV=0.7 threshold (x = 1 - specificity).
    # markersize is numeric (the original passed the string '7').
    ax.errorbar(1 - mean_spe, mean_sen, xerr=std_spe, yerr=std_sen,
                color='g', fmt='.', markersize=7, ecolor='red', elinewidth=2,
                capsize=4, label='Point with PPV=0.7')
    # Raw string: '\p' is an invalid escape in a plain string literal
    # (SyntaxWarning since Python 3.12); the runtime text is unchanged.
    ax.annotate(r'Sen=%0.1f%%$\pm$%0.2f%%' '\n' r'Spe=%0.1f%%$\pm$%0.2f%%' % (
        round(mean_sen * 100, 1), round(std_sen * 100, 2),
        round(mean_spe * 100, 1), round(std_spe * 100, 2)),
                (1 - mean_spe + 0.02, mean_sen - std_sen - 0.1))
    ax.legend(loc="lower right")
    plt.savefig(save_name + '.pdf')
    plt.show()