[f47f8b]: / notebooks / classification.ipynb

Download this file

1877 lines (1876 with data), 310.6 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[MLENS] backend: threading\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.kernel_approximation import Nystroem\n",
    "from sklearn.kernel_approximation import RBFSampler\n",
    "\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.base import clone\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectFromModel, chi2, SelectPercentile, f_classif\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, KFold\n",
    "\n",
    "from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report, roc_auc_score, roc_curve\n",
    "\n",
    "from sklearn.svm import SVC, LinearSVC\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier\n",
    "from sklearn.decomposition import PCA, NMF\n",
    "\n",
    "from mlens.visualization import pca_plot, pca_comp_plot\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "def get_models():\n",
    "    nb = GaussianNB()\n",
    "    svc = SVC(kernel='rbf', C=100, gamma=1e-3, probability=True, class_weight='balanced')\n",
    "    knn = KNeighborsClassifier(n_neighbors=8)\n",
    "    lr = LogisticRegression(C=100, random_state=seed)\n",
    "    nn = MLPClassifier((16, 8), solver='lbfgs', activation='relu', random_state=seed)\n",
    "    gb = GradientBoostingClassifier(n_estimators=32, random_state=seed)\n",
    "    rf = RandomForestClassifier(n_estimators=8, max_features=2, random_state=seed)\n",
    "    ab = AdaBoostClassifier(random_state=seed)\n",
    "    \n",
    "    models = {'svm': svc, 'knn': knn, 'naive bayes': nb,\n",
    "             'mlp': nn, 'random forest': rf, 'gradientboost': gb, 'logistic': lr,\n",
    "             'adaboost': ab}\n",
    "    return models\n",
    "\n",
    "def get_sex(PatientSex):\n",
    "    return 1 if PatientSex is 'M' else 0\n",
    "\n",
    "def get_sample_weight(y):\n",
    "    y = y.astype('int')\n",
    "    class_weight = len(y) / (2 * np.bincount(y))\n",
    "    sample_weight = (np.ones_like(y) * y * class_weight[1]) + (np.ones_like(y) * (1 - y) * class_weight[0] * 0.78)\n",
    "    return sample_weight\n",
    "        \n",
    "\n",
    "def train_predict(model_list):\n",
    "    P = np.zeros((y_test.shape[0], len(model_list)))\n",
    "    P = pd.DataFrame(P)\n",
    "    \n",
    "    print('Fitting models...')\n",
    "    cols = list()\n",
    "    for i, (name, m) in enumerate(model_list.items()):\n",
    "        print(\"%s...\" % name, end=\" \", flush=False)\n",
    "        try:\n",
    "            m.fit(x_train, y_train, sample_weight=get_sample_weight(y_train))\n",
    "        except:\n",
    "            print('no sample weight')\n",
    "            m.fit(x_train, y_train)\n",
    "        try:\n",
    "            print('\\n', name, m.feature_importances_)\n",
    "        except:\n",
    "            print('no feature importances')\n",
    "        P.iloc[:, i] = m.predict_proba(x_test)[:, 1]\n",
    "        cols.append(name)\n",
    "        print(\"done\")\n",
    "    \n",
    "    P.columns = cols\n",
    "    print(\"Done.\\n\")\n",
    "    return P\n",
    "\n",
    "def cross_val_models(model_list, cv=5, scoring='accuracy'):\n",
    "    P = np.zeros((cv, len(model_list)))\n",
    "    P = pd.DataFrame(P)\n",
    "\n",
    "    print('Fitting models...')\n",
    "    cols = list()\n",
    "    for i, (name, m) in enumerate(model_list.items()):\n",
    "        print(\"%s...\" % name, end=\" \", flush=False)\n",
    "        P.iloc[:, i] = cross_val_score(m, X, y, cv=cv, scoring=scoring)\n",
    "        cols.append(name)\n",
    "        print(\"done\")\n",
    "    P.columns = cols\n",
    "    print('Done.\\n')\n",
    "    return P\n",
    "\n",
    "def cross_val_acc(P, scoring='ACC'):\n",
    "    for m in P.columns:\n",
    "        score = np.mean(P.loc[:, m])\n",
    "        std = 2 * np.std(P.loc[:, m])\n",
    "        print('{} {} Result {} +/- {}'.format(m, scoring, score, std))\n",
    "\n",
    "def score_models(P, y):\n",
    "    print('ROC AUC SCORE')\n",
    "    for m in P.columns:\n",
    "        score = roc_auc_score(y, P.loc[:, m])\n",
    "        print(\"%-26s: %.3f\" % (m, score))\n",
    "    print('ACC SCORE')\n",
    "    for m in P.columns:\n",
    "        score = accuracy_score(y, np.float32(P.loc[:, m]>=0.5))\n",
    "        print(\"%-26s: %.3f\" % (m, score))\n",
    "\n",
    "def plot_roc_curve(y_test, P_base_learners, P_ensemble, labels, ens_label):\n",
    "    plt.plot([0, 1], [0, 1], 'k--')\n",
    "    #cm = [plt.cm.gray(i) for i in np.linspace(0, 1.0, P_base_learners.shape[1])]\n",
    "    cm = sns.color_palette(\"Blues\", P_base_learners.shape[1])\n",
    "    \n",
    "    for i in range(P_base_learners.shape[1]):\n",
    "        p = P_base_learners[:, i]\n",
    "        fpr, tpr, _ = roc_curve(y_test, p)\n",
    "        plt.plot(fpr, tpr, label=labels[i], c=cm[i])\n",
    "        \n",
    "    fpr, tpr, _ = roc_curve(y_test, P_ensemble)\n",
    "    plt.plot(fpr, tpr, label=ens_label, c='red')\n",
    "    \n",
    "    plt.xlabel('False positive rate')\n",
    "    plt.ylabel('True positive rate')\n",
    "    plt.title('ROC curve')\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "    \n",
    "def train_base_learners(base_learners, x_train, y_train):\n",
    "    print('Fitting models')\n",
    "    for i, (name, m) in enumerate(base_learners.items()):\n",
    "        print('%s...'% name, end='', flush=False)\n",
    "        try:\n",
    "            m.fit(x_train, y_train, sample_weight=get_sample_weight(y_train))\n",
    "        except:\n",
    "            m.fit(x_train, y_train)\n",
    "            print('no sample_weight')\n",
    "    print('done.')\n",
    "\n",
    "def predict_base_learners(base_learners, x):\n",
    "    P = np.zeros((x.shape[0], len(base_learners)))\n",
    "    print('Generating base learner predictions.')\n",
    "    for i, (name, m) in enumerate(base_learners.items()):\n",
    "        print('%s...'% name, end='', flush=False)\n",
    "        p = m.predict_proba(x)\n",
    "        P[:, i] = p[:, 1]\n",
    "    print('done.')\n",
    "    return P\n",
    "\n",
    "def ensemble_predict(base_learners, meta_learner, x):\n",
    "    P_pred = predict_base_learners(base_learners, x)\n",
    "    return P_pred, meta_learner.predict_proba(P_pred)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data = pd.read_csv('analysisAll.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Age', 'InstitutionName', 'Manufacturer', 'PatientBirthDate',\n",
       "       'PatientID', 'PatientSex', 'Severe', 'StudyDate', 'StudyID',\n",
       "       'StudyTime', 'check', 'consolidation', 'filename', 'left_lesion',\n",
       "       'left_lung', 'lesion', 'lesion_consolidation', 'lung', 'lung_lesion',\n",
       "       'patientID', 'ratio', 'right_lesion', 'right_lung', 'severe', 'shape',\n",
       "       'slice', 'spacing', 'studyDate', 'weighted_lesion',\n",
       "       'weighted_lung_lesion', 'z'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature = ['lung', 'lesion', 'ratio', 'lung_lesion', \n",
    "           #'left_lung', 'right_lung', 'left_lesion', 'right_lesion',\n",
    "          'weighted_lesion', 'weighted_lung_lesion',\n",
    "          'consolidation', 'lesion_consolidation',\n",
    "          'z', 'Age', 'sex']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data['sex'] = raw_data['PatientSex'].map(get_sex)\n",
    "cls_data = raw_data\n",
    "cls_data.loc[cls_data['Severe'] == 2, 'Severe'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = np.array(cls_data[feature]).astype(np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = np.array(cls_data['Severe'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,\n",
       "       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,\n",
       "       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,\n",
       "       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,\n",
       "       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,\n",
       "       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int64)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "seed = 2\n",
    "np.random.seed(seed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(434, 11)\n"
     ]
    }
   ],
   "source": [
    "print(X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(434, 11)\n",
      "(434, 11)\n"
     ]
    }
   ],
   "source": [
    "X = MinMaxScaler().fit_transform(X)\n",
    "#selector = VarianceThreshold(0.01)\n",
    "#selector.fit(X)\n",
    "#X = selector.transform(X)\n",
    "print(X.shape)\n",
    "#etc = GradientBoostingClassifier().fit(X, y)\n",
    "#model = SelectFromModel(etc, prefit=True)\n",
    "#X = model.transform(X)\n",
    "print(X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['lung',\n",
       " 'lesion',\n",
       " 'ratio',\n",
       " 'lung_lesion',\n",
       " 'weighted_lesion',\n",
       " 'weighted_lung_lesion',\n",
       " 'consolidation',\n",
       " 'lesion_consolidation',\n",
       " 'z',\n",
       " 'Age',\n",
       " 'sex']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\utils\\extmath.py:516: RuntimeWarning: invalid value encountered in multiply\n",
      "  v *= signs[:, np.newaxis]\n"
     ]
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 720x576 with 4 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "[<matplotlib.axes._subplots.AxesSubplot at 0x1dce63c9828>,\n",
       " <matplotlib.axes._subplots.AxesSubplot at 0x1dcf9bf59b0>,\n",
       " <matplotlib.axes._subplots.Axes3DSubplot at 0x1dcf9ed28d0>,\n",
       " <matplotlib.axes._subplots.Axes3DSubplot at 0x1dcf9f0f6a0>]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pca_comp_plot(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "样本集大小: (434, 11) (434,)\n",
      "训练集大小: (347, 11) (347,)\n",
      "测试集大小: (87, 11) (87,)\n"
     ]
    }
   ],
   "source": [
    "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)\n",
    "print('样本集大小:',X.shape,y.shape)\n",
    "print('训练集大小:',x_train.shape,y_train.shape)  # 训练集样本大小\n",
    "print('测试集大小:',x_test.shape,y_test.shape)  # 测试集样本大小"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting models...\n",
      "svm... no feature importances\n",
      "done\n",
      "knn... no sample weight\n",
      "no feature importances\n",
      "done\n",
      "naive bayes... no feature importances\n",
      "done\n",
      "mlp... no sample weight\n",
      "no feature importances\n",
      "done\n",
      "random forest... \n",
      " random forest [0.06556802 0.06267865 0.10769074 0.23554236 0.0675672  0.11787239\n",
      " 0.06881165 0.15614353 0.04052227 0.06618391 0.01141929]\n",
      "done\n",
      "gradientboost... \n",
      " gradientboost [0.02102846 0.28094372 0.08241749 0.04754522 0.05069486 0.02973711\n",
      " 0.23328546 0.20862373 0.02436599 0.02135795 0.        ]\n",
      "done\n",
      "logistic... no feature importances\n",
      "done\n",
      "adaboost... \n",
      " adaboost [0.08 0.06 0.1  0.12 0.16 0.04 0.18 0.08 0.08 0.1  0.  ]\n",
      "done\n",
      "Done.\n",
      "\n",
      "ROC AUC SCORE\n",
      "svm                       : 0.893\n",
      "knn                       : 0.905\n",
      "naive bayes               : 0.897\n",
      "mlp                       : 0.908\n",
      "random forest             : 0.862\n",
      "gradientboost             : 0.909\n",
      "logistic                  : 0.899\n",
      "adaboost                  : 0.848\n",
      "ACC SCORE\n",
      "svm                       : 0.828\n",
      "knn                       : 0.839\n",
      "naive bayes               : 0.885\n",
      "mlp                       : 0.839\n",
      "random forest             : 0.851\n",
      "gradientboost             : 0.793\n",
      "logistic                  : 0.851\n",
      "adaboost                  : 0.816\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "models = get_models()\n",
    "P = train_predict(models)\n",
    "score_models(P, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting models...\n",
      "svm... done\n",
      "knn... done\n",
      "naive bayes... done\n",
      "mlp... done\n",
      "random forest... done\n",
      "gradientboost... done\n",
      "logistic... "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done\n",
      "adaboost... done\n",
      "Done.\n",
      "\n",
      "svm ACC Result 0.8237315010570825 +/- 0.3017880613142109\n",
      "knn ACC Result 0.7478858350951375 +/- 0.2920472229603193\n",
      "naive bayes ACC Result 0.8306025369978858 +/- 0.26769702623988184\n",
      "mlp ACC Result 0.7806025369978858 +/- 0.3575818547409162\n",
      "random forest ACC Result 0.775475687103594 +/- 0.28610783615005586\n",
      "gradientboost ACC Result 0.7620507399577168 +/- 0.3986433824327496\n",
      "logistic ACC Result 0.8097251585623679 +/- 0.2858060473499701\n",
      "adaboost ACC Result 0.7620507399577167 +/- 0.35577830735987864\n"
     ]
    }
   ],
   "source": [
    "models = get_models()\n",
    "cv_results=cross_val_models(models)\n",
    "cross_val_acc(cv_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>svm</th>\n",
       "      <th>knn</th>\n",
       "      <th>naive bayes</th>\n",
       "      <th>mlp</th>\n",
       "      <th>random forest</th>\n",
       "      <th>gradientboost</th>\n",
       "      <th>logistic</th>\n",
       "      <th>adaboost</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.534091</td>\n",
       "      <td>0.477273</td>\n",
       "      <td>0.579545</td>\n",
       "      <td>0.477273</td>\n",
       "      <td>0.522727</td>\n",
       "      <td>0.386364</td>\n",
       "      <td>0.534091</td>\n",
       "      <td>0.443182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.863636</td>\n",
       "      <td>0.727273</td>\n",
       "      <td>0.840909</td>\n",
       "      <td>0.693182</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.772727</td>\n",
       "      <td>0.875000</td>\n",
       "      <td>0.715909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.930233</td>\n",
       "      <td>0.883721</td>\n",
       "      <td>0.895349</td>\n",
       "      <td>0.860465</td>\n",
       "      <td>0.848837</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>0.895349</td>\n",
       "      <td>0.918605</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>0.790698</td>\n",
       "      <td>0.860465</td>\n",
       "      <td>0.883721</td>\n",
       "      <td>0.802326</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>0.813953</td>\n",
       "      <td>0.802326</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.860465</td>\n",
       "      <td>0.976744</td>\n",
       "      <td>0.988372</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.976744</td>\n",
       "      <td>0.930233</td>\n",
       "      <td>0.930233</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        svm       knn  naive bayes       mlp  random forest  gradientboost  \\\n",
       "0  0.534091  0.477273     0.579545  0.477273       0.522727       0.386364   \n",
       "1  0.863636  0.727273     0.840909  0.693182       0.750000       0.772727   \n",
       "2  0.930233  0.883721     0.895349  0.860465       0.848837       0.837209   \n",
       "3  0.837209  0.790698     0.860465  0.883721       0.802326       0.837209   \n",
       "4  0.953488  0.860465     0.976744  0.988372       0.953488       0.976744   \n",
       "\n",
       "   logistic  adaboost  \n",
       "0  0.534091  0.443182  \n",
       "1  0.875000  0.715909  \n",
       "2  0.895349  0.918605  \n",
       "3  0.813953  0.802326  \n",
       "4  0.930233  0.930233  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting models...\n",
      "svm... done\n",
      "knn... done\n",
      "naive bayes... done\n",
      "mlp... done\n",
      "random forest... done\n",
      "gradientboost... done\n",
      "logistic... done\n",
      "adaboost... "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done\n",
      "Done.\n",
      "\n",
      "svm ROC AUC Result 0.8927574611181168 +/- 0.27380216453532236\n",
      "knn ROC AUC Result 0.8555520526832001 +/- 0.3006267128541013\n",
      "naive bayes ROC AUC Result 0.8604128718882815 +/- 0.29500499579004863\n",
      "mlp ROC AUC Result 0.8602090047172014 +/- 0.3352491413984705\n",
      "random forest ROC AUC Result 0.8340632151697726 +/- 0.2669250384669379\n",
      "gradientboost ROC AUC Result 0.856611554808276 +/- 0.34781065652988397\n",
      "logistic ROC AUC Result 0.8978590444164215 +/- 0.2869042332673911\n",
      "adaboost ROC AUC Result 0.8341810284433235 +/- 0.2870453702783311\n"
     ]
    }
   ],
   "source": [
    "cv_results_roc=cross_val_models(models, scoring='roc_auc')\n",
    "cross_val_acc(cv_results_roc, scoring='ROC AUC')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ensemble ROC-AUC score: 0.936\n",
      "Ensemble ACC score: 0.885\n"
     ]
    }
   ],
   "source": [
    "print(\"Ensemble ROC-AUC score: %.3f\" % roc_auc_score(y_test, P.mean(axis=1)))\n",
    "print(\"Ensemble ACC score: %.3f\" % accuracy_score(y_test, np.float32(P.mean(axis=1) > 0.5)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plot_roc_curve(y_test, P.values, P.mean(axis=1), list(P.columns), 'voting')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_learners = get_models()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "meta_learner = GradientBoostingClassifier(\n",
    "   n_estimators=128,\n",
    "   loss=\"exponential\",\n",
    "   max_features=4,\n",
    "   max_depth=3,\n",
    "   subsample=0.5,\n",
    "   learning_rate=0.005, \n",
    "   random_state=seed\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "train_base_learners(base_learners, x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n"
     ]
    }
   ],
   "source": [
    "P_base = predict_base_learners(base_learners, x_train)\n",
    "P_test = predict_base_learners(base_learners, x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
       "                           learning_rate=0.005, loss='exponential', max_depth=3,\n",
       "                           max_features=4, max_leaf_nodes=None,\n",
       "                           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                           min_samples_leaf=1, min_samples_split=2,\n",
       "                           min_weight_fraction_leaf=0.0, n_estimators=128,\n",
       "                           n_iter_no_change=None, presort='auto',\n",
       "                           random_state=2, subsample=0.5, tol=0.0001,\n",
       "                           validation_fraction=0.1, verbose=0,\n",
       "                           warm_start=False)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_learner.fit(P_base, y_train, sample_weight=get_sample_weight(y_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.838\n",
      "Ensemble ACC score: 0.828\n"
     ]
    }
   ],
   "source": [
    "P_pred, p = ensemble_predict(base_learners, meta_learner, x_test)\n",
    "print(\"\\nEnsemble ROC-AUC score: %.3f\" % roc_auc_score(y_test, p))\n",
    "print(\"Ensemble ACC score: %.3f\" % accuracy_score(y_test, np.float32(p > 0.5)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "svm ROC-AUC score: 0.893\n",
      "svm ACC score: 0.816 \n",
      "\n",
      "knn ROC-AUC score: 0.905\n",
      "knn ACC score: 0.851 \n",
      "\n",
      "naive bayes ROC-AUC score: 0.897\n",
      "naive bayes ACC score: 0.885 \n",
      "\n",
      "mlp ROC-AUC score: 0.908\n",
      "mlp ACC score: 0.839 \n",
      "\n",
      "random forest ROC-AUC score: 0.862\n",
      "random forest ACC score: 0.862 \n",
      "\n",
      "gradientboost ROC-AUC score: 0.909\n",
      "gradientboost ACC score: 0.793 \n",
      "\n",
      "logistic ROC-AUC score: 0.899\n",
      "logistic ACC score: 0.851 \n",
      "\n",
      "adaboost ROC-AUC score: 0.848\n",
      "adaboost ACC score: 0.816 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "c = 0\n",
    "for k, v in base_learners.items():\n",
    "    print(\"{} ROC-AUC score: %.3f\".format(k) % roc_auc_score(y_test, P_pred[:, c]))\n",
    "    print(\"{} ACC score: %.3f \\n\".format(k) % accuracy_score(y_test, np.float32(P_pred[:, c] > 0.5)))\n",
    "    c = c + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plot_roc_curve(y_test, P.values, p, list(P.columns), 'stacking')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic..."
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.876\n",
      "Ensemble ACC score: 0.851\n",
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.955\n",
      "Ensemble ACC score: 0.908\n",
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.890\n",
      "Ensemble ACC score: 0.828\n",
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.887\n",
      "Ensemble ACC score: 0.908\n",
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.836\n",
      "Ensemble ACC score: 0.837\n"
     ]
    }
   ],
   "source": [
    "idx = list(range(len(X)))\n",
    "np.random.shuffle(idx)\n",
    "X = X[idx]\n",
    "y = y[idx]\n",
    "cv_scores = []\n",
    "for i, (train_idx, test_idx) in enumerate(KFold(5).split(X)):\n",
    "    fold_x_train, fold_y_train = X[train_idx, :], y[train_idx]\n",
    "    fold_x_test, fold_y_test = X[test_idx, :], y[test_idx]\n",
    "    \n",
    "    base_learners = get_models()\n",
    "    meta_learner = GradientBoostingClassifier(\n",
    "       n_estimators=128,\n",
    "       loss=\"exponential\",\n",
    "       max_features=4,\n",
    "       max_depth=3,\n",
    "       subsample=0.5,\n",
    "       learning_rate=0.005, \n",
    "       random_state=seed\n",
    "    )\n",
    "    train_base_learners(base_learners, fold_x_train, fold_y_train)\n",
    "    P_base = predict_base_learners(base_learners, fold_x_train)\n",
    "    P_test = predict_base_learners(base_learners, fold_x_test)\n",
    "    \n",
    "    meta_learner.fit(P_base, fold_y_train, sample_weight=get_sample_weight(fold_y_train))\n",
    "    P_pred, p = ensemble_predict(base_learners, meta_learner, fold_x_test)\n",
    "\n",
    "    acc = accuracy_score(fold_y_test, np.float32(p > 0.5))\n",
    "    recall = recall_score(fold_y_test, np.float32(p > 0.5))\n",
    "    cm = confusion_matrix(fold_y_test, np.float32(p > 0.5))\n",
    "    roc = roc_auc_score(fold_y_test, p)\n",
    "    print(\"\\nEnsemble ROC-AUC score: %.3f\" % roc)\n",
    "    print(\"Ensemble ACC score: %.3f\" % acc)\n",
    "    \n",
    "    cv_scores.append({'acc': acc, 'recall': recall, 'confusion_matrix': cm, 'roc_auc_score': roc})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>acc</th>\n",
       "      <th>recall</th>\n",
       "      <th>confusion_matrix</th>\n",
       "      <th>roc_auc_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.850575</td>\n",
       "      <td>0.794118</td>\n",
       "      <td>[[47, 6], [7, 27]]</td>\n",
       "      <td>0.875971</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.908046</td>\n",
       "      <td>0.720000</td>\n",
       "      <td>[[61, 1], [7, 18]]</td>\n",
       "      <td>0.955484</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.827586</td>\n",
       "      <td>0.772727</td>\n",
       "      <td>[[55, 10], [5, 17]]</td>\n",
       "      <td>0.889860</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.908046</td>\n",
       "      <td>0.761905</td>\n",
       "      <td>[[63, 3], [5, 16]]</td>\n",
       "      <td>0.886724</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>0.733333</td>\n",
       "      <td>[[50, 6], [8, 22]]</td>\n",
       "      <td>0.836310</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        acc    recall     confusion_matrix  roc_auc_score\n",
       "0  0.850575  0.794118   [[47, 6], [7, 27]]       0.875971\n",
       "1  0.908046  0.720000   [[61, 1], [7, 18]]       0.955484\n",
       "2  0.827586  0.772727  [[55, 10], [5, 17]]       0.889860\n",
       "3  0.908046  0.761905   [[63, 3], [5, 16]]       0.886724\n",
       "4  0.837209  0.733333   [[50, 6], [8, 22]]       0.836310"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(cv_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[276,  26],\n",
       "       [ 32, 100]], dtype=int64)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(pd.DataFrame(cv_scores)['confusion_matrix'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5,1,'Stacking: Confusion Matrix')"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "cm = np.sum(pd.DataFrame(cv_scores)['confusion_matrix'])\n",
    "cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
    "sns.heatmap(cm_norm, cmap='Blues', annot=True)\n",
    "plt.xlabel('Predicted labels')\n",
    "plt.ylabel('True labels')\n",
    "plt.title('Stacking: Confusion Matrix')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_results['stacking'] = pd.DataFrame(cv_scores)['acc']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.956\n",
      "Ensemble ACC score: 0.851\n",
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp..."
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.901\n",
      "Ensemble ACC score: 0.816\n",
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp..."
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.971\n",
      "Ensemble ACC score: 0.839\n",
      "Fitting models\n",
      "svm...knn..."
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost..."
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.904\n",
      "Ensemble ACC score: 0.816\n",
      "Fitting models\n",
      "svm...knn...no sample_weight\n",
      "naive bayes...mlp...no sample_weight\n",
      "random forest...gradientboost...logistic...adaboost...done.\n",
      "Generating base learner predictions.\n",
      "svm...knn...naive bayes...mlp...random forest...gradientboost...logistic...adaboost...done.\n",
      "\n",
      "Ensemble ROC-AUC score: 0.932\n",
      "Ensemble ACC score: 0.860\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\hasee007\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "idx = list(range(len(X)))\n",
    "np.random.shuffle(idx)\n",
    "X = X[idx]\n",
    "y = y[idx]\n",
    "cv_scores_vote = []\n",
    "for i, (train_idx, test_idx) in enumerate(KFold(5).split(X)):\n",
    "    fold_x_train, fold_y_train = X[train_idx, :], y[train_idx]\n",
    "    fold_x_test, fold_y_test = X[test_idx, :], y[test_idx]\n",
    "    \n",
    "    base_learners = get_models()\n",
    "    meta_learner = GradientBoostingClassifier(\n",
    "       n_estimators=128,\n",
    "       loss=\"exponential\",\n",
    "       max_features=4,\n",
    "       max_depth=3,\n",
    "       subsample=0.5,\n",
    "       learning_rate=0.005, \n",
    "       random_state=seed\n",
    "    )\n",
    "    train_base_learners(base_learners, fold_x_train, fold_y_train)\n",
    "    P_test = predict_base_learners(base_learners, fold_x_test)\n",
    "    p_vote = P_test.mean(axis=1)\n",
    "    acc = accuracy_score(fold_y_test, np.float32(p_vote > 0.5))\n",
    "    recall = recall_score(fold_y_test, np.float32(p_vote > 0.5))\n",
    "    cm = confusion_matrix(fold_y_test, np.float32(p_vote > 0.5))\n",
    "    roc = roc_auc_score(fold_y_test, p_vote)\n",
    "    print(\"\\nEnsemble ROC-AUC score: %.3f\" % roc)\n",
    "    print(\"Ensemble ACC score: %.3f\" % acc)\n",
    "    \n",
    "    cv_scores_vote.append({'acc': acc, 'recall': recall, 'confusion_matrix': cm, 'roc_auc_score': roc})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_results['voting'] = pd.DataFrame(cv_scores_vote)['acc']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>acc</th>\n",
       "      <th>recall</th>\n",
       "      <th>confusion_matrix</th>\n",
       "      <th>roc_auc_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.850575</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>[[58, 5], [8, 16]]</td>\n",
       "      <td>0.956349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.816092</td>\n",
       "      <td>0.615385</td>\n",
       "      <td>[[55, 6], [10, 16]]</td>\n",
       "      <td>0.901009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.839080</td>\n",
       "      <td>0.566667</td>\n",
       "      <td>[[56, 1], [13, 17]]</td>\n",
       "      <td>0.971345</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.816092</td>\n",
       "      <td>0.818182</td>\n",
       "      <td>[[53, 12], [4, 18]]</td>\n",
       "      <td>0.904196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.860465</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>[[50, 6], [6, 24]]</td>\n",
       "      <td>0.932143</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        acc    recall     confusion_matrix  roc_auc_score\n",
       "0  0.850575  0.666667   [[58, 5], [8, 16]]       0.956349\n",
       "1  0.816092  0.615385  [[55, 6], [10, 16]]       0.901009\n",
       "2  0.839080  0.566667  [[56, 1], [13, 17]]       0.971345\n",
       "3  0.816092  0.818182  [[53, 12], [4, 18]]       0.904196\n",
       "4  0.860465  0.800000   [[50, 6], [6, 24]]       0.932143"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(cv_scores_vote)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[272,  30],\n",
       "       [ 41,  91]], dtype=int64)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(pd.DataFrame(cv_scores_vote)['confusion_matrix'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5,1,'Voting: Confusion Matrix')"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#cm = np.sum(pd.DataFrame(cv_scores)['confusion_matrix'])\n",
    "cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
    "sns.heatmap(cm_norm, cmap='Blues', annot=True)\n",
    "plt.xlabel('Predicted labels')\n",
    "plt.ylabel('True labels')\n",
    "plt.title('Voting: Confusion Matrix')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>svm</th>\n",
       "      <th>knn</th>\n",
       "      <th>naive bayes</th>\n",
       "      <th>mlp</th>\n",
       "      <th>random forest</th>\n",
       "      <th>gradientboost</th>\n",
       "      <th>logistic</th>\n",
       "      <th>adaboost</th>\n",
       "      <th>stacking</th>\n",
       "      <th>voting</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.534091</td>\n",
       "      <td>0.477273</td>\n",
       "      <td>0.579545</td>\n",
       "      <td>0.477273</td>\n",
       "      <td>0.522727</td>\n",
       "      <td>0.386364</td>\n",
       "      <td>0.534091</td>\n",
       "      <td>0.443182</td>\n",
       "      <td>0.850575</td>\n",
       "      <td>0.850575</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.863636</td>\n",
       "      <td>0.727273</td>\n",
       "      <td>0.840909</td>\n",
       "      <td>0.693182</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.772727</td>\n",
       "      <td>0.875000</td>\n",
       "      <td>0.715909</td>\n",
       "      <td>0.908046</td>\n",
       "      <td>0.816092</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.930233</td>\n",
       "      <td>0.883721</td>\n",
       "      <td>0.895349</td>\n",
       "      <td>0.860465</td>\n",
       "      <td>0.848837</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>0.895349</td>\n",
       "      <td>0.918605</td>\n",
       "      <td>0.827586</td>\n",
       "      <td>0.839080</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>0.790698</td>\n",
       "      <td>0.860465</td>\n",
       "      <td>0.883721</td>\n",
       "      <td>0.802326</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>0.813953</td>\n",
       "      <td>0.802326</td>\n",
       "      <td>0.908046</td>\n",
       "      <td>0.816092</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.860465</td>\n",
       "      <td>0.976744</td>\n",
       "      <td>0.988372</td>\n",
       "      <td>0.953488</td>\n",
       "      <td>0.976744</td>\n",
       "      <td>0.930233</td>\n",
       "      <td>0.930233</td>\n",
       "      <td>0.837209</td>\n",
       "      <td>0.860465</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        svm       knn  naive bayes       mlp  random forest  gradientboost  \\\n",
       "0  0.534091  0.477273     0.579545  0.477273       0.522727       0.386364   \n",
       "1  0.863636  0.727273     0.840909  0.693182       0.750000       0.772727   \n",
       "2  0.930233  0.883721     0.895349  0.860465       0.848837       0.837209   \n",
       "3  0.837209  0.790698     0.860465  0.883721       0.802326       0.837209   \n",
       "4  0.953488  0.860465     0.976744  0.988372       0.953488       0.976744   \n",
       "\n",
       "   logistic  adaboost  stacking    voting  \n",
       "0  0.534091  0.443182  0.850575  0.850575  \n",
       "1  0.875000  0.715909  0.908046  0.816092  \n",
       "2  0.895349  0.918605  0.827586  0.839080  \n",
       "3  0.813953  0.802326  0.908046  0.816092  \n",
       "4  0.930233  0.930233  0.837209  0.860465  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1152x648 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.subplots(figsize=(16, 9))\n",
    "colors = sns.color_palette(\"Blues\", len(base_learners.keys()))\n",
    "colors.append((0.8901960784313725, 0.10196078431372549, 0.10980392156862745))\n",
    "colors.append((0.8901960784313725, 0.10196078431372549 * 2, 0.10980392156862745 * 2))\n",
    "dx = sns.boxplot(data=cv_results, palette=colors, boxprops=dict(alpha=1.0),showfliers = False)\n",
    "dx = sns.stripplot(data=cv_results,\n",
    "                   jitter=True,\n",
    "                   color=\".3\",\n",
    "                   dodge=True,\n",
    "                   marker='o', \n",
    "                   alpha=1.0)\n",
    "dx.set(xlabel='Classifcation Methods',ylabel='Accuracy', title='5-Fold Cross Validation Accuracy')\n",
    "sns.set(style='white')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stacking ACC Result 0.8662924351777599 +/- 0.06972955779143106\n",
      "Stacking ROC-AUC Result 0.8888698129072086 +/- 0.07679570822999876\n"
     ]
    }
   ],
   "source": [
    "score = np.mean(pd.DataFrame(cv_scores)['acc'])\n",
    "std = 2 * np.std(pd.DataFrame(cv_scores)['acc'])\n",
    "print('{} ACC Result {} +/- {}'.format('Stacking', score, std))\n",
    "\n",
    "score = np.mean(pd.DataFrame(cv_scores)['roc_auc_score'])\n",
    "std = 2 * np.std(pd.DataFrame(cv_scores)['roc_auc_score'])\n",
    "print('{} ROC-AUC Result {} +/- {}'.format('Stacking', score, std))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Voting ACC Result 0.8364608393477679 +/- 0.03591158164506644\n",
      "Voting ROC-AUC Result 0.9330083448331938 +/- 0.055637238470835695\n"
     ]
    }
   ],
   "source": [
    "score = np.mean(pd.DataFrame(cv_scores_vote)['acc'])\n",
    "std = 2 * np.std(pd.DataFrame(cv_scores_vote)['acc'])\n",
    "print('{} ACC Result {} +/- {}'.format('Voting', score, std))\n",
    "\n",
    "score = np.mean(pd.DataFrame(cv_scores_vote)['roc_auc_score'])\n",
    "std = 2 * np.std(pd.DataFrame(cv_scores_vote)['roc_auc_score'])\n",
    "print('{} ROC-AUC Result {} +/- {}'.format('Voting', score, std))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}