[d6960c]: / modeling.ipynb

Download this file

1052 lines (1051 with data), 151.7 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## A simple worked example"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Radiomics Analysis for Prediction of EGFR Mutations and Ki-67 Proliferation Index in Patients with NSCLC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Python3\\lib\\site-packages\\numpy\\_distributor_init.py:32: UserWarning: loaded more than 1 DLL from .libs:\n",
      "D:\\Python3\\lib\\site-packages\\numpy\\.libs\\libopenblas.IPBC74C7KURV7CB2PKT5Z5FNR3SIBV4J.gfortran-win_amd64.dll\n",
      "D:\\Python3\\lib\\site-packages\\numpy\\.libs\\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll\n",
      "D:\\Python3\\lib\\site-packages\\numpy\\.libs\\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll\n",
      "  stacklevel=1)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import sklearn\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn import model_selection\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.feature_selection import RFE\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## EGFR Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 87 entries, 0 to 86\n",
      "Data columns (total 11 columns):\n",
      "EGFR                                                87 non-null int64\n",
      "wavelet-HLL_glcm_MaximumProbability                 87 non-null float64\n",
      "wavelet-LLL_glcm_MaximumProbability                 87 non-null float64\n",
      "original_glcm_SumEntropy                            87 non-null float64\n",
      "log-sigma-1-0-mm-3D_glcm_MaximumProbability         87 non-null float64\n",
      "wavelet-LHL_firstorder_Kurtosis                     87 non-null float64\n",
      "wavelet-LLL_firstorder_Skewness                     87 non-null float64\n",
      "log-sigma-2-0-mm-3D_firstorder_Kurtosis             87 non-null float64\n",
      "original_shape_Sphericity                           87 non-null float64\n",
      "wavelet-LHL_glszm_LargeAreaHighGrayLevelEmphasis    87 non-null float64\n",
      "original_glcm_ClusterTendency                       87 non-null float64\n",
      "dtypes: float64(10), int64(1)\n",
      "memory usage: 7.6 KB\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('./EGFR-radiomics.csv')\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>EGFR</th>\n",
       "      <th>wavelet-HLL_glcm_MaximumProbability</th>\n",
       "      <th>wavelet-LLL_glcm_MaximumProbability</th>\n",
       "      <th>original_glcm_SumEntropy</th>\n",
       "      <th>log-sigma-1-0-mm-3D_glcm_MaximumProbability</th>\n",
       "      <th>wavelet-LHL_firstorder_Kurtosis</th>\n",
       "      <th>wavelet-LLL_firstorder_Skewness</th>\n",
       "      <th>log-sigma-2-0-mm-3D_firstorder_Kurtosis</th>\n",
       "      <th>original_shape_Sphericity</th>\n",
       "      <th>wavelet-LHL_glszm_LargeAreaHighGrayLevelEmphasis</th>\n",
       "      <th>original_glcm_ClusterTendency</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0.007927</td>\n",
       "      <td>0.003456</td>\n",
       "      <td>6.124576</td>\n",
       "      <td>0.019309</td>\n",
       "      <td>5.800779</td>\n",
       "      <td>0.239530</td>\n",
       "      <td>2.254404</td>\n",
       "      <td>0.620527</td>\n",
       "      <td>2465.281536</td>\n",
       "      <td>411.620729</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.012674</td>\n",
       "      <td>0.006008</td>\n",
       "      <td>5.948736</td>\n",
       "      <td>0.031240</td>\n",
       "      <td>2.795991</td>\n",
       "      <td>0.466490</td>\n",
       "      <td>2.382641</td>\n",
       "      <td>0.689155</td>\n",
       "      <td>1824.505051</td>\n",
       "      <td>355.638540</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0.015902</td>\n",
       "      <td>0.008222</td>\n",
       "      <td>4.920900</td>\n",
       "      <td>0.040681</td>\n",
       "      <td>3.766194</td>\n",
       "      <td>0.403203</td>\n",
       "      <td>3.441538</td>\n",
       "      <td>0.631141</td>\n",
       "      <td>2935.840909</td>\n",
       "      <td>66.400015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0.032583</td>\n",
       "      <td>0.016863</td>\n",
       "      <td>5.066296</td>\n",
       "      <td>0.086125</td>\n",
       "      <td>7.706288</td>\n",
       "      <td>-1.581500</td>\n",
       "      <td>3.865254</td>\n",
       "      <td>0.552905</td>\n",
       "      <td>765983.082600</td>\n",
       "      <td>146.881644</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0.015743</td>\n",
       "      <td>0.008266</td>\n",
       "      <td>5.424894</td>\n",
       "      <td>0.021068</td>\n",
       "      <td>3.593316</td>\n",
       "      <td>0.383414</td>\n",
       "      <td>2.323130</td>\n",
       "      <td>0.678834</td>\n",
       "      <td>1008.337079</td>\n",
       "      <td>174.982589</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   EGFR  wavelet-HLL_glcm_MaximumProbability  \\\n",
       "0     1                             0.007927   \n",
       "1     0                             0.012674   \n",
       "2     1                             0.015902   \n",
       "3     0                             0.032583   \n",
       "4     1                             0.015743   \n",
       "\n",
       "   wavelet-LLL_glcm_MaximumProbability  original_glcm_SumEntropy  \\\n",
       "0                             0.003456                  6.124576   \n",
       "1                             0.006008                  5.948736   \n",
       "2                             0.008222                  4.920900   \n",
       "3                             0.016863                  5.066296   \n",
       "4                             0.008266                  5.424894   \n",
       "\n",
       "   log-sigma-1-0-mm-3D_glcm_MaximumProbability  \\\n",
       "0                                     0.019309   \n",
       "1                                     0.031240   \n",
       "2                                     0.040681   \n",
       "3                                     0.086125   \n",
       "4                                     0.021068   \n",
       "\n",
       "   wavelet-LHL_firstorder_Kurtosis  wavelet-LLL_firstorder_Skewness  \\\n",
       "0                         5.800779                         0.239530   \n",
       "1                         2.795991                         0.466490   \n",
       "2                         3.766194                         0.403203   \n",
       "3                         7.706288                        -1.581500   \n",
       "4                         3.593316                         0.383414   \n",
       "\n",
       "   log-sigma-2-0-mm-3D_firstorder_Kurtosis  original_shape_Sphericity  \\\n",
       "0                                 2.254404                   0.620527   \n",
       "1                                 2.382641                   0.689155   \n",
       "2                                 3.441538                   0.631141   \n",
       "3                                 3.865254                   0.552905   \n",
       "4                                 2.323130                   0.678834   \n",
       "\n",
       "   wavelet-LHL_glszm_LargeAreaHighGrayLevelEmphasis  \\\n",
       "0                                       2465.281536   \n",
       "1                                       1824.505051   \n",
       "2                                       2935.840909   \n",
       "3                                     765983.082600   \n",
       "4                                       1008.337079   \n",
       "\n",
       "   original_glcm_ClusterTendency  \n",
       "0                     411.620729  \n",
       "1                     355.638540  \n",
       "2                      66.400015  \n",
       "3                     146.881644  \n",
       "4                     174.982589  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 87 entries, 0 to 86\n",
      "Data columns (total 11 columns):\n",
      "EGFR                                                87 non-null int64\n",
      "wavelet-HLL_glcm_MaximumProbability                 87 non-null float64\n",
      "wavelet-LLL_glcm_MaximumProbability                 87 non-null float64\n",
      "original_glcm_SumEntropy                            87 non-null float64\n",
      "log-sigma-1-0-mm-3D_glcm_MaximumProbability         87 non-null float64\n",
      "wavelet-LHL_firstorder_Kurtosis                     87 non-null float64\n",
      "wavelet-LLL_firstorder_Skewness                     87 non-null float64\n",
      "log-sigma-2-0-mm-3D_firstorder_Kurtosis             87 non-null float64\n",
      "original_shape_Sphericity                           87 non-null float64\n",
      "wavelet-LHL_glszm_LargeAreaHighGrayLevelEmphasis    87 non-null float64\n",
      "original_glcm_ClusterTendency                       87 non-null float64\n",
      "dtypes: float64(10), int64(1)\n",
      "memory usage: 8.2 KB\n"
     ]
    }
   ],
   "source": [
    "df_drop = df.dropna(axis=0)\n",
    "df_drop.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>EGFR</th>\n",
       "      <th>wavelet-HLL_glcm_MaximumProbability</th>\n",
       "      <th>wavelet-LLL_glcm_MaximumProbability</th>\n",
       "      <th>original_glcm_SumEntropy</th>\n",
       "      <th>log-sigma-1-0-mm-3D_glcm_MaximumProbability</th>\n",
       "      <th>wavelet-LHL_firstorder_Kurtosis</th>\n",
       "      <th>wavelet-LLL_firstorder_Skewness</th>\n",
       "      <th>log-sigma-2-0-mm-3D_firstorder_Kurtosis</th>\n",
       "      <th>original_shape_Sphericity</th>\n",
       "      <th>wavelet-LHL_glszm_LargeAreaHighGrayLevelEmphasis</th>\n",
       "      <th>original_glcm_ClusterTendency</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0.007927</td>\n",
       "      <td>0.003456</td>\n",
       "      <td>6.124576</td>\n",
       "      <td>0.019309</td>\n",
       "      <td>5.800779</td>\n",
       "      <td>0.239530</td>\n",
       "      <td>2.254404</td>\n",
       "      <td>0.620527</td>\n",
       "      <td>2465.281536</td>\n",
       "      <td>411.620729</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.012674</td>\n",
       "      <td>0.006008</td>\n",
       "      <td>5.948736</td>\n",
       "      <td>0.031240</td>\n",
       "      <td>2.795991</td>\n",
       "      <td>0.466490</td>\n",
       "      <td>2.382641</td>\n",
       "      <td>0.689155</td>\n",
       "      <td>1824.505051</td>\n",
       "      <td>355.638540</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0.015902</td>\n",
       "      <td>0.008222</td>\n",
       "      <td>4.920900</td>\n",
       "      <td>0.040681</td>\n",
       "      <td>3.766194</td>\n",
       "      <td>0.403203</td>\n",
       "      <td>3.441538</td>\n",
       "      <td>0.631141</td>\n",
       "      <td>2935.840909</td>\n",
       "      <td>66.400015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0.032583</td>\n",
       "      <td>0.016863</td>\n",
       "      <td>5.066296</td>\n",
       "      <td>0.086125</td>\n",
       "      <td>7.706288</td>\n",
       "      <td>-1.581500</td>\n",
       "      <td>3.865254</td>\n",
       "      <td>0.552905</td>\n",
       "      <td>765983.082600</td>\n",
       "      <td>146.881644</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0.015743</td>\n",
       "      <td>0.008266</td>\n",
       "      <td>5.424894</td>\n",
       "      <td>0.021068</td>\n",
       "      <td>3.593316</td>\n",
       "      <td>0.383414</td>\n",
       "      <td>2.323130</td>\n",
       "      <td>0.678834</td>\n",
       "      <td>1008.337079</td>\n",
       "      <td>174.982589</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   EGFR  wavelet-HLL_glcm_MaximumProbability  \\\n",
       "0     1                             0.007927   \n",
       "1     0                             0.012674   \n",
       "2     1                             0.015902   \n",
       "3     0                             0.032583   \n",
       "4     1                             0.015743   \n",
       "\n",
       "   wavelet-LLL_glcm_MaximumProbability  original_glcm_SumEntropy  \\\n",
       "0                             0.003456                  6.124576   \n",
       "1                             0.006008                  5.948736   \n",
       "2                             0.008222                  4.920900   \n",
       "3                             0.016863                  5.066296   \n",
       "4                             0.008266                  5.424894   \n",
       "\n",
       "   log-sigma-1-0-mm-3D_glcm_MaximumProbability  \\\n",
       "0                                     0.019309   \n",
       "1                                     0.031240   \n",
       "2                                     0.040681   \n",
       "3                                     0.086125   \n",
       "4                                     0.021068   \n",
       "\n",
       "   wavelet-LHL_firstorder_Kurtosis  wavelet-LLL_firstorder_Skewness  \\\n",
       "0                         5.800779                         0.239530   \n",
       "1                         2.795991                         0.466490   \n",
       "2                         3.766194                         0.403203   \n",
       "3                         7.706288                        -1.581500   \n",
       "4                         3.593316                         0.383414   \n",
       "\n",
       "   log-sigma-2-0-mm-3D_firstorder_Kurtosis  original_shape_Sphericity  \\\n",
       "0                                 2.254404                   0.620527   \n",
       "1                                 2.382641                   0.689155   \n",
       "2                                 3.441538                   0.631141   \n",
       "3                                 3.865254                   0.552905   \n",
       "4                                 2.323130                   0.678834   \n",
       "\n",
       "   wavelet-LHL_glszm_LargeAreaHighGrayLevelEmphasis  \\\n",
       "0                                       2465.281536   \n",
       "1                                       1824.505051   \n",
       "2                                       2935.840909   \n",
       "3                                     765983.082600   \n",
       "4                                       1008.337079   \n",
       "\n",
       "   original_glcm_ClusterTendency  \n",
       "0                     411.620729  \n",
       "1                     355.638540  \n",
       "2                      66.400015  \n",
       "3                     146.881644  \n",
       "4                     174.982589  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_drop.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: 0, 1: 1}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class_mapping = {label: idx for idx, label in enumerate(np.unique(df['EGFR']))}\n",
    "class_mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(87,)\n",
      "(87, 10)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.preprocessing import label_binarize\n",
    "\n",
    "# Build the label vector and the feature matrix from the NaN-free frame (df_drop).\n",
    "# Nothing is written back into df/df_drop, so the frames displayed above stay valid\n",
    "# and re-running this cell always gives the same result.\n",
    "# LabelEncoder numbers the classes in sorted label order, which is the same numbering\n",
    "# that class_mapping (built from np.unique) describes.\n",
    "y = LabelEncoder().fit_transform(df_drop['EGFR'].values)\n",
    "# Every column except the EGFR label is a radiomics feature.\n",
    "X = df_drop.drop(columns='EGFR').values\n",
    "print(y.shape)\n",
    "print(X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(87, 10)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Standardise each feature column of X. The fitted scaler is kept in `std`\n",
    "# because later cells use the same scaler object again.\n",
    "std = StandardScaler().fit(X)\n",
    "X_std = std.transform(X)\n",
    "print(X_std.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Find_Optimal_Cutoff(TPR, FPR, threshold):\n",
    "    y = TPR - FPR\n",
    "    Youden_index = np.argmax(y)  # Only the first occurrence is returned.\n",
    "    optimal_threshold = threshold[Youden_index]\n",
    "    point = [FPR[Youden_index], TPR[Youden_index]]\n",
    "    return optimal_threshold, point"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 576x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sklearn.metrics import roc_curve\n",
    "from sklearn.metrics import auc\n",
    "from sklearn import model_selection\n",
    "# scipy.interp was a deprecated re-export of numpy.interp and cannot be imported from\n",
    "# recent SciPy releases; taking it from NumPy keeps the name `interp` available.\n",
    "from numpy import interp\n",
    "import matplotlib\n",
    "\n",
    "# Set the figure size BEFORE the first pyplot call: plt.xticks() implicitly creates the\n",
    "# figure, so changing rcParams afterwards only affects figures created later.\n",
    "matplotlib.rcParams['figure.figsize'] = (8.0, 8.0)\n",
    "\n",
    "clf = LogisticRegression(penalty='l2',C=1, multi_class='auto',solver='liblinear',random_state=1)\n",
    "# Five random 75/25 splits of the training data (ShuffleSplit).\n",
    "cv = model_selection.ShuffleSplit(n_splits = 5, test_size = 0.25, random_state = 0)\n",
    "\n",
    "tprs = []   # per-split TPR, resampled onto the common FPR grid\n",
    "aucs = []   # per-split AUC\n",
    "mean_fpr = np.linspace(0, 1, 100)\n",
    "plt.xticks(np.arange(0, 1.1, step=0.1))\n",
    "plt.yticks(np.arange(0, 1.1, step=0.1))\n",
    "\n",
    "for i, (train, test) in enumerate(cv.split(X_std, y)):\n",
    "    probas_ = clf.fit(X_std[train], y[train]).predict_proba(X_std[test])\n",
    "    # ROC curve and area under the curve for this split\n",
    "    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])\n",
    "    tprs.append(interp(mean_fpr, fpr, tpr))\n",
    "    tprs[-1][0] = 0.0   # make the resampled curve start at (0, 0)\n",
    "    roc_auc = auc(fpr, tpr)\n",
    "    # Youden-optimal cutoff of this split; computed but not drawn\n",
    "    optimal_th, optimal_point = Find_Optimal_Cutoff(TPR=tpr, FPR=fpr, threshold=thresholds)\n",
    "    aucs.append(roc_auc)\n",
    "    plt.plot(fpr, tpr, lw=1, alpha=0.8,label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))\n",
    "\n",
    "# Chance diagonal\n",
    "plt.plot([0, 1], [0, 1], linestyle='--', lw=1, color='gray', alpha=.6)\n",
    "\n",
    "# Mean ROC over the splits; its last point is pinned to TPR = 1\n",
    "mean_tpr = np.mean(tprs, axis=0)\n",
    "mean_tpr[-1] = 1.0\n",
    "mean_auc = auc(mean_fpr, mean_tpr)\n",
    "std_auc = np.std(aucs)\n",
    "plt.plot(mean_fpr, mean_tpr, color='b',label=r'Mean ROC (AUC = %0.2f $\\pm$ %0.2f)' % (mean_auc, std_auc),lw=2, alpha=.8)\n",
    "\n",
    "# +/- 1 standard deviation band around the mean curve, clipped to [0, 1]; computed but not drawn\n",
    "std_tpr = np.std(tprs, axis=0)\n",
    "tprs_upper = np.minimum(mean_tpr + std_tpr, 1)\n",
    "tprs_lower = np.maximum(mean_tpr - std_tpr, 0)\n",
    "\n",
    "plt.xlabel('1-Specificity', fontsize = 'x-large')\n",
    "plt.ylabel('Sensitivity', fontsize = 'x-large')\n",
    "plt.legend(loc=\"lower right\", fontsize = 'medium')\n",
    "\n",
    "plt.savefig('EGFR-ROC1.jpg',dpi=1500)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(45, 10)\n",
      "(45,)\n"
     ]
    }
   ],
   "source": [
    "df_test = pd.read_csv('./EGFR-test.csv')\n",
    "# .copy() makes the cleaned frame independent of df_test, so the column assignment\n",
    "# below cannot be a write into a view (SettingWithCopyWarning pattern).\n",
    "df_test_drop = df_test.dropna(axis=0).copy()\n",
    "\n",
    "# Encode the test labels with the mapping built from the TRAINING labels (class_mapping).\n",
    "# Re-deriving the codes from the test labels could number the classes differently,\n",
    "# e.g. when a class is missing from the test file.\n",
    "df_test_drop['EGFR'] = df_test_drop['EGFR'].map(class_mapping)\n",
    "assert df_test_drop['EGFR'].notna().all(), 'EGFR-test.csv contains a label that is absent from the training data'\n",
    "y_test = df_test_drop['EGFR'].astype(int).values\n",
    "X_test = df_test_drop.iloc[:,1:11].values\n",
    "\n",
    "# Apply the scaling learned on the training features. Refitting the scaler here would\n",
    "# standardise the test set with its own mean/variance, so the model would see features\n",
    "# on a different scale than the one it was trained on.\n",
    "X_test_std = std.transform(X_test)\n",
    "print(X_test_std.shape)\n",
    "print(y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 576x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "from matplotlib import style\n",
    "lw = 2\n",
    "plt.xticks(np.arange(0, 1.1, step=0.1))\n",
    "plt.yticks(np.arange(0, 1.1, step=0.1))\n",
    "\n",
    "# The cross-validation loop leaves `clf` fitted on the training part of its last split only.\n",
    "# Refit on the complete training set so the held-out evaluation (and clf.coef_) describes\n",
    "# the final model rather than an arbitrary split.\n",
    "clf.fit(X_std, y)\n",
    "\n",
    "probas_ = clf.predict_proba(X_test_std)\n",
    "fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])\n",
    "roc_auc = auc(fpr, tpr)\n",
    "# Youden-optimal cutoff on the held-out set; computed but not drawn\n",
    "optimal_th, optimal_point = Find_Optimal_Cutoff(TPR=tpr, FPR=fpr, threshold=thresholds)\n",
    "\n",
    "plt.plot(fpr, tpr, color='b', alpha=.8, lw=lw, label='ROC (AUC = %0.2f)' % roc_auc)\n",
    "# Chance diagonal\n",
    "plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--',alpha=.6)\n",
    "\n",
    "plt.xlabel('1-Specificity', fontsize = 'x-large')\n",
    "plt.ylabel('Sensitivity', fontsize = 'x-large')\n",
    "plt.legend(loc=\"lower right\", fontsize = 'medium')\n",
    "plt.savefig('EGFR-ROC2.jpg',dpi=1500)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.26637586, -0.36415474,  0.88922437,  0.35225551, -0.63638105,\n",
       "         0.3393241 ,  0.87795737, -0.30623909, -0.59557871, -0.26788923]])"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ki-67 Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 87 entries, 0 to 86\n",
      "Data columns (total 29 columns):\n",
      "Y                                                             87 non-null int64\n",
      "wavelet-HLL_gldm_LargeDependenceHighGrayLevelEmphasis         87 non-null float64\n",
      "log-sigma-1-0-mm-3D_glszm_SmallAreaLowGrayLevelEmphasis       87 non-null float64\n",
      "log-sigma-5-0-mm-3D_glcm_Idm                                  87 non-null float64\n",
      "log-sigma-5-0-mm-3D_glcm_InverseVariance                      87 non-null float64\n",
      "original_firstorder_Median                                    87 non-null float64\n",
      "log-sigma-2-0-mm-3D_glszm_SmallAreaLowGrayLevelEmphasis       87 non-null float64\n",
      "wavelet-LHL_glszm_LargeAreaLowGrayLevelEmphasis               87 non-null float64\n",
      "wavelet-LHL_firstorder_Maximum                                87 non-null float64\n",
      "wavelet-HLL_glszm_GrayLevelNonUniformityNormalized            87 non-null float64\n",
      "log-sigma-3-0-mm-3D_firstorder_Median                         87 non-null float64\n",
      "log-sigma-3-0-mm-3D_firstorder_90Percentile                   87 non-null float64\n",
      "log-sigma-4-0-mm-3D_glszm_SmallAreaEmphasis                   87 non-null float64\n",
      "wavelet-LHL_glszm_GrayLevelNonUniformityNormalized            87 non-null float64\n",
      "original_shape_SurfaceVolumeRatio                             87 non-null float64\n",
      "wavelet-LHH_firstorder_Kurtosis                               87 non-null float64\n",
      "wavelet-HHL_glrlm_LongRunHighGrayLevelEmphasis                87 non-null float64\n",
      "log-sigma-1-0-mm-3D_glcm_Correlation                          87 non-null float64\n",
      "wavelet-LHH_glcm_Correlation                                  87 non-null float64\n",
      "original_gldm_LargeDependenceLowGrayLevelEmphasis             87 non-null float64\n",
      "wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis          87 non-null float64\n",
      "wavelet-HLL_glszm_LargeAreaLowGrayLevelEmphasis               87 non-null float64\n",
      "wavelet-HLL_firstorder_Maximum                                87 non-null float64\n",
      "log-sigma-1-0-mm-3D_glrlm_GrayLevelNonUniformityNormalized    87 non-null float64\n",
      "wavelet-HHH_glcm_Correlation                                  87 non-null float64\n",
      "original_firstorder_Skewness                                  87 non-null float64\n",
      "wavelet-LHH_glcm_Idn                                          87 non-null float64\n",
      "wavelet-LHH_gldm_SmallDependenceLowGrayLevelEmphasis          87 non-null float64\n",
      "wavelet-HLH_glcm_Idn                                          87 non-null float64\n",
      "dtypes: float64(28), int64(1)\n",
      "memory usage: 19.8 KB\n"
     ]
    }
   ],
   "source": [
    "df1 = pd.read_csv('./Ki67-radiomics.csv', encoding = 'gb2312')\n",
    "df1.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: 0, 1: 1}"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class_mapping1 = {label: idx for idx, label in enumerate(np.unique(df1['Y']))}\n",
    "class_mapping1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(87,)\n",
      "(87, 28)\n",
      "(87, 28)\n"
     ]
    }
   ],
   "source": [
    "# .copy() gives a frame that is independent of df1, so assigning the encoded column\n",
    "# below cannot be a write into a view (SettingWithCopyWarning pattern).\n",
    "df1_drop = df1.dropna(axis=0).copy()\n",
    "df1_drop['Y'] = df1_drop['Y'].map(class_mapping1)\n",
    "y1 = LabelEncoder().fit_transform(df1_drop['Y'].values)\n",
    "# Columns 1..28 are the radiomics features; column 0 is the label Y.\n",
    "X1 = df1_drop.iloc[:,1:29].values\n",
    "print(y1.shape)\n",
    "print(X1.shape)\n",
    "# NOTE: this refits the scaler object `std` that the EGFR section also uses; from here on\n",
    "# `std` holds the statistics of these Ki-67 features, not of the EGFR features.\n",
    "X1_std = std.fit_transform(X1)\n",
    "print(X1_std.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 576x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "clf1 = LogisticRegression(penalty='l2',C=1, multi_class='auto',solver='liblinear',random_state=1)\n",
    "\n",
    "# Five random 75/25 splits of the training data (ShuffleSplit).\n",
    "cv = model_selection.ShuffleSplit(n_splits = 5, test_size = 0.25, random_state = 0)\n",
    "\n",
    "# Set the figure size BEFORE the first pyplot call: plt.xticks() implicitly creates the\n",
    "# figure, so changing rcParams after plotting has no effect on this figure.\n",
    "plt.rcParams['figure.figsize'] = (8.0, 8.0)\n",
    "\n",
    "tprs = []   # per-split TPR, resampled onto the common FPR grid\n",
    "aucs = []   # per-split AUC\n",
    "mean_fpr = np.linspace(0, 1, 100)\n",
    "plt.xticks(np.arange(0, 1.1, step=0.1))\n",
    "plt.yticks(np.arange(0, 1.1, step=0.1))\n",
    "\n",
    "for i, (train, test) in enumerate(cv.split(X1_std, y1)):\n",
    "    probas_ = clf1.fit(X1_std[train], y1[train]).predict_proba(X1_std[test])\n",
    "    # ROC curve and area under the curve for this split\n",
    "    fpr, tpr, thresholds = roc_curve(y1[test], probas_[:, 1])\n",
    "    # np.interp is used directly, so this cell does not rely on an `interp` name\n",
    "    # imported by another cell.\n",
    "    tprs.append(np.interp(mean_fpr, fpr, tpr))\n",
    "    tprs[-1][0] = 0.0   # make the resampled curve start at (0, 0)\n",
    "    roc_auc = auc(fpr, tpr)\n",
    "    # Youden-optimal cutoff of this split; computed but not drawn\n",
    "    optimal_th, optimal_point = Find_Optimal_Cutoff(TPR=tpr, FPR=fpr, threshold=thresholds)\n",
    "    aucs.append(roc_auc)\n",
    "    plt.plot(fpr, tpr, lw=1, alpha=0.8,label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))\n",
    "\n",
    "# Chance diagonal\n",
    "plt.plot([0, 1], [0, 1], linestyle='--', lw=1, color='gray', alpha=.6)\n",
    "\n",
    "# Mean ROC over the splits; its last point is pinned to TPR = 1\n",
    "mean_tpr = np.mean(tprs, axis=0)\n",
    "mean_tpr[-1] = 1.0\n",
    "mean_auc = auc(mean_fpr, mean_tpr)\n",
    "std_auc = np.std(aucs)\n",
    "plt.plot(mean_fpr, mean_tpr, color='b',label=r'Mean ROC (AUC = %0.2f $\\pm$ %0.2f)' % (mean_auc, std_auc),lw=2, alpha=.8)\n",
    "\n",
    "# +/- 1 standard deviation band around the mean curve, clipped to [0, 1]; computed but not drawn\n",
    "std_tpr = np.std(tprs, axis=0)\n",
    "tprs_upper = np.minimum(mean_tpr + std_tpr, 1)\n",
    "tprs_lower = np.maximum(mean_tpr - std_tpr, 0)\n",
    "\n",
    "plt.xlabel('1-Specificity', fontsize = 'x-large')\n",
    "plt.ylabel('Sensitivity', fontsize = 'x-large')\n",
    "plt.legend(loc=\"lower right\", fontsize = 'medium')\n",
    "\n",
    "plt.savefig('Ki-67-ROC1.jpg',dpi=1500)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(45, 28)\n",
      "(45,)\n"
     ]
    }
   ],
   "source": [
    "# Independent Ki-67 test cohort: load, drop incomplete rows, code the labels as integers.\n",
    "df1_test = pd.read_csv('./Ki67-test.csv')\n",
    "# .copy() so the label column below is written into an independent frame, not a slice of df1_test\n",
    "df1_test_drop = df1_test.dropna(axis=0).copy()\n",
    "class_mapping1 = {label: idx for idx, label in enumerate(np.unique(df1_test_drop['Ki-67']))}\n",
    "df1_test_drop['Ki-67'] = df1_test_drop['Ki-67'].map(class_mapping1)\n",
    "y1_test = LabelEncoder().fit_transform(df1_test_drop['Ki-67'].values)\n",
    "X1_test = df1_test_drop.iloc[:, 1:29].values\n",
    "\n",
    "# Apply the scaler that was fitted on the training features (transform, NOT fit_transform):\n",
    "# refitting on the test set would standardise it with different means/variances than the\n",
    "# ones the model was trained on and would leak test-set statistics into preprocessing.\n",
    "X1_test_std = std.transform(X1_test)\n",
    "print(X1_test_std.shape)\n",
    "print(y1_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 576x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ROC curve of the fitted Ki-67 model (clf1) on the independent test cohort.\n",
    "lw = 2\n",
    "# Explicit size so the figure does not depend on rcParams left behind by an earlier cell\n",
    "plt.figure(figsize=(8.0, 8.0))\n",
    "plt.xticks(np.arange(0, 1.1, step=0.1))\n",
    "plt.yticks(np.arange(0, 1.1, step=0.1))\n",
    "\n",
    "probas_ = clf1.predict_proba(X1_test_std)\n",
    "fpr, tpr, thresholds = roc_curve(y1_test, probas_[:, 1])\n",
    "roc_auc = auc(fpr, tpr)\n",
    "# Cut-off returned by the notebook's Find_Optimal_Cutoff helper; kept in the namespace\n",
    "# for inspection, it is not drawn on the figure.\n",
    "optimal_th, optimal_point = Find_Optimal_Cutoff(TPR=tpr, FPR=fpr, threshold=thresholds)\n",
    "\n",
    "plt.plot(fpr, tpr, color='b', alpha=.8, lw=lw, label='ROC (AUC = %0.2f)' % roc_auc)\n",
    "plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', alpha=.6)  # chance diagonal\n",
    "\n",
    "plt.xlabel('1-Specificity', fontsize='x-large')\n",
    "plt.ylabel('Sensitivity', fontsize='x-large')\n",
    "plt.legend(loc='lower right', fontsize='medium')\n",
    "plt.savefig('Ki-67-ROC2.jpg', dpi=1500)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.33933662, -0.24501119,  0.06149887,  0.66555733,  0.5809558 ,\n",
       "         0.41924958,  0.23932351,  0.77765524, -0.08881509, -0.63052453,\n",
       "        -0.57778378,  0.14979279, -0.19062698,  0.08261748,  0.48340228,\n",
       "         0.68332893,  0.6523309 ,  0.16729185, -0.24171584, -0.02439305,\n",
       "         0.29861927, -0.29627875,  0.41290256,  0.78760983, -0.56817862,\n",
       "         0.38850808, -0.18475777, -0.08823505]])"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Coefficients of the fitted Ki-67 model, labelled with the feature columns used to build X1\n",
    "# (one row per feature, one column per row of clf1.coef_).\n",
    "pd.DataFrame(clf1.coef_, columns=df1_drop.columns[1:29]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 36",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6rc1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}