{ "cells": [ { "cell_type": "code", "execution_count": 1, "source": [ "# Import libraries\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import getpass\n", "import pdvega\n", "import plotly.graph_objs as go\n", "\n", "from plotly.offline import iplot, init_notebook_mode\n", "import plotly.io as pio\n", "from plotly.graph_objs import *\n", "\n", "# for configuring connection \n", "from configobj import ConfigObj\n", "import os\n", "\n", "%matplotlib inline\n", "\n", "\n", "import os\n", "\n", "\n", "from sklearn import linear_model\n", "from sklearn import metrics\n", "from sklearn.model_selection import train_test_split\n", "\n", "#configure the notebook for use in offline mode\n", "init_notebook_mode(connected=True)" ], "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "" ], "text/vnd.plotly.v1+html": [ "" ] }, "metadata": {} } ], "metadata": {} }, { "cell_type": "code", "execution_count": 2, "source": [ "df2= pd.read_csv(\"analysis.csv\")" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 3, "source": [ "df2.head()" ], "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0hospitalidsodiumelectivesurgeryventdialysisgcsurinewbctemperature...m11_Truem12_Truem13_Truem14_Truem15_Truem16_Truem17_Truem18_Truem19_Truem20_True
0059.0139.0-1.00.00.015.0-1.014.736.1...1001100010
1173.0134.0-1.00.00.013.0-1.014.139.3...1001100010
2273.0-1.01.01.00.015.0-1.08.034.8...0010010100
3363.0137.0-1.00.00.015.0-1.010.936.6...1011100110
4463.0135.0-1.00.00.015.0-1.05.935.0...0010000100
\n", "

5 rows × 85 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 hospitalid sodium electivesurgery vent dialysis gcs \\\n", "0 0 59.0 139.0 -1.0 0.0 0.0 15.0 \n", "1 1 73.0 134.0 -1.0 0.0 0.0 13.0 \n", "2 2 73.0 -1.0 1.0 1.0 0.0 15.0 \n", "3 3 63.0 137.0 -1.0 0.0 0.0 15.0 \n", "4 4 63.0 135.0 -1.0 0.0 0.0 15.0 \n", "\n", " urine wbc temperature ... m11_True m12_True m13_True m14_True \\\n", "0 -1.0 14.7 36.1 ... 1 0 0 1 \n", "1 -1.0 14.1 39.3 ... 1 0 0 1 \n", "2 -1.0 8.0 34.8 ... 0 0 1 0 \n", "3 -1.0 10.9 36.6 ... 1 0 1 1 \n", "4 -1.0 5.9 35.0 ... 0 0 1 0 \n", "\n", " m15_True m16_True m17_True m18_True m19_True m20_True \n", "0 1 0 0 0 1 0 \n", "1 1 0 0 0 1 0 \n", "2 0 1 0 1 0 0 \n", "3 1 0 0 1 1 0 \n", "4 0 0 0 1 0 0 \n", "\n", "[5 rows x 85 columns]" ] }, "metadata": {}, "execution_count": 3 } ], "metadata": {} },
{ "cell_type": "code", "execution_count": 4, "source": [
  "# Drop the CSV index column and the hospital identifier BY NAME.\n",
  "# The previous version deleted 'hospitalid' and then dropped whatever column was at\n",
  "# position 0, so running the cell a second time raised KeyError or silently removed\n",
  "# the next feature. errors='ignore' makes a re-run a no-op.\n",
  "df2 = df2.drop(columns=['Unnamed: 0', 'hospitalid'], errors='ignore')"
 ], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": 5, "source": [ "df2.shape" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(95148, 83)" ] }, "metadata": {}, "execution_count": 5 } ], "metadata": {} },
{ "cell_type": "markdown", "source": [ "**We moved all the pre-processing including splitting>imputation>Standardization to the CV iterations**" ], "metadata": {} },
{ "cell_type": "code", "execution_count": 6, "source": [
  "# Continuous columns that are standardised inside each CV fold.\n",
  "cols_to_norm=['gcs', 'urine', 'wbc', 'sodium',\n",
  "              'temperature', 'respiratoryrate', 'heartrate', 'meanbp', 'creatinine',\n",
  "              'ph', 'hematocrit', 'albumin', 'pao2', 'pco2', 'bun', 'glucose',\n",
  "              'bilirubin', 'fio2', 'age', 'offset']\n",
  "\n",
  "# 'destcopy' is the target; everything else is a feature.\n",
  "# columns= is used because pandas 2.x no longer accepts axis as a positional argument.\n",
  "X=df2.drop(columns='destcopy')\n",
  "y=df2['destcopy']\n",
  "df_cols = list(X) #fancy impute removes column names."
], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": 8, "source": [
  "# Load in our libraries\n",
  "import pandas as pd\n",
  "import numpy as np\n",
  "import re\n",
  "import sklearn\n",
  "import xgboost as xgb\n",
  "import seaborn as sns\n",
  "import matplotlib.pyplot as plt\n",
  "%matplotlib inline\n",
  "\n",
  "import plotly.offline as py\n",
  "py.init_notebook_mode(connected=True)\n",
  "import plotly.graph_objs as go\n",
  "import plotly.tools as tls\n",
  "\n",
  "import warnings\n",
  "warnings.filterwarnings('ignore')\n",
  "\n",
  "# Going to use these 5 base models for the stacking\n",
  "from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, \n",
  "                              GradientBoostingClassifier, ExtraTreesClassifier)\n",
  "from sklearn.svm import SVC\n",
  "from sklearn.model_selection import KFold\n",
  "from sklearn.linear_model import LogisticRegression"
 ], "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "" ], "text/vnd.plotly.v1+html": [ "" ] }, "metadata": {} } ], "metadata": {} },
{ "cell_type": "code", "execution_count": 9, "source": [ "from sklearn.model_selection import StratifiedKFold" ], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": 10, "source": [
  "\n",
  "classes=['Death','Home','Nursing Home','Rehabilitation']\n",
  "\n",
  "# Stratified 10-fold splitter, used for the outer CV loop and again inside get_oof.\n",
  "kf_m = StratifiedKFold(n_splits=10)\n",
  "\n",
  "\n",
  "# Class to extend the Sklearn classifier\n",
  "class SklearnHelper(object):\n",
  "    # Wraps an estimator class so that every base learner exposes the same\n",
  "    # train / predict / fit / feature_importances interface.\n",
  "\n",
  "    def __init__(self, clf, seed=0, params=None):\n",
  "        # clf    : estimator class (not an instance)\n",
  "        # seed   : value passed to the estimator as random_state\n",
  "        # params : constructor keyword arguments for clf\n",
  "        # Work on a copy: the caller's dict is a shared module-level object and\n",
  "        # must not be mutated, and the default params=None has to be accepted.\n",
  "        params = dict(params or {})\n",
  "        params['random_state'] = seed\n",
  "        self.clf = clf(**params)\n",
  "\n",
  "    def train(self, x_train, y_train):\n",
  "        self.clf.fit(x_train, y_train)\n",
  "\n",
  "    def predict(self, x):\n",
  "        return self.clf.predict(x)\n",
  "\n",
  "    def fit(self, x, y):\n",
  "        return self.clf.fit(x, y)\n",
  "\n",
  "    def feature_importances(self, x, y):\n",
  "        # Refits the wrapped estimator on (x, y) and returns its feature_importances_.\n",
  "        return self.clf.fit(x, y).feature_importances_\n",
  "\n",
  "\n",
  "#-------------------------------------------------------------\n",
  "\n",
  "# Random Forest parameters\n",
  "rf_params = {\n",
  "    'n_jobs': -1,\n",
  "    'n_estimators': 400,\n",
  "    # Must be False: with warm_start=True a second fit() with the same n_estimators\n",
  "    # adds no trees, so every CV fold after the first would reuse the first fold's forest.\n",
  "    'warm_start': False,\n",
  "    #'max_features': 0.2,\n",
  "    'max_depth': 30,\n",
  "    'min_samples_leaf': 2,\n",
  "    'max_features' : 0.8,\n",
  "    'verbose': 0,\n",
  "    'criterion':'gini'\n",
  "}\n",
  "\n",
  "\n",
  "# Extra Trees Parameters\n",
  "et_params = {\n",
  "    'n_jobs': -1,\n",
  "    'n_estimators':500,\n",
  "    #'max_features': 0.5,\n",
  "    'max_depth': 8,\n",
  "    'min_samples_leaf': 2,\n",
  "    'verbose': 0\n",
  "}\n",
  "\n",
  "# AdaBoost parameters\n",
  "ada_params = {\n",
  "    'n_estimators': 500,\n",
  "    'learning_rate' : 0.75\n",
  "}\n",
  "\n",
  "# Gradient Boosting parameters\n",
  "gb_params = {\n",
  "    'n_estimators': 500,\n",
  "    #'max_features': 0.2,\n",
  "    'max_depth': 5,\n",
  "    'min_samples_leaf': 2,\n",
  "    'verbose': 0\n",
  "}\n",
  "\n",
  "\n",
  "# Logistic Regression parameters\n",
  "lr_params = {\n",
  "    'penalty' : 'l1',\n",
  "    'tol' : 6.75e-05,\n",
  "    'C' : 2.5,\n",
  "    'max_iter': 66\n",
  "}\n"
 ], "outputs": [], "metadata": {} },
{ "cell_type": "markdown", "source": [ "**Stacked ensemble: Extra Trees and Gradient Boosting base learners with a Random Forest meta-learner**" ], "metadata": {} },
{ "cell_type": "code", "execution_count": 11, "source": [ "from collections import Counter" ], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": 12, "source": [
  "from sklearn.model_selection import KFold\n",
  "from sklearn import preprocessing\n",
  "from imblearn.over_sampling import SMOTENC\n",
  "from imblearn.metrics import classification_report_imbalanced\n",
  "from sklearn.metrics import f1_score\n",
  "from yellowbrick.classifier import ROCAUC\n",
  "from sklearn.linear_model import LogisticRegression\n",
  "from numpy import loadtxt\n",
  "import os\n",
  "os.environ['KMP_DUPLICATE_LIB_OK']='True'\n",
  "from xgboost import XGBClassifier\n",
  "from sklearn.model_selection import train_test_split\n",
  "from sklearn.metrics import accuracy_score\n",
  "from sklearn.ensemble import AdaBoostClassifier\n",
  "from sklearn.datasets import make_classification\n",
  "from sklearn.model_selection import StratifiedKFold\n",
  "import io\n",
  "\n",
  "\n",
  "SEED = 0  # for reproducibility\n",
  "\n",
  "# Positions (within X) of the columns SMOTENC must treat as categorical: every column\n",
  "# that is not standardised. This reproduces the previous hand-typed index list and also\n",
  "# includes index 32 ('admitsource_1.0'), which that list left out.\n",
  "categorical_idx = [i for i, col in enumerate(X.columns) if col not in cols_to_norm]\n",
  "\n",
  "IMPORTANCE_COLUMNS = ['Extra Trees feature importances',\n",
  "                      'Gradient Boost feature importances']\n",
  "\n",
  "\n",
  "def get_oof(clf, x_train, y_train, x_test):\n",
  "    # Out-of-fold predictions of one base learner, used as meta-features.\n",
  "    #   clf             : SklearnHelper-like object with train() and predict()\n",
  "    #   x_train, y_train: training features / labels (array-like)\n",
  "    #   x_test          : held-out features (array-like)\n",
  "    # Returns (oof_train, oof_test) as column vectors of shape (n, 1).\n",
  "    # Convert up front so positional fancy indexing works whether the resampler\n",
  "    # returned numpy arrays or pandas objects.\n",
  "    x_train = np.asarray(x_train)\n",
  "    y_train = np.asarray(y_train)\n",
  "    x_test = np.asarray(x_test)\n",
  "\n",
  "    n_folds = kf_m.get_n_splits()\n",
  "    oof_train = np.zeros((x_train.shape[0],))\n",
  "    oof_test_skf = np.empty((n_folds, x_test.shape[0]))\n",
  "\n",
  "    for i, (tr_idx, te_idx) in enumerate(kf_m.split(x_train, y_train)):\n",
  "        clf.train(x_train[tr_idx], y_train[tr_idx])\n",
  "        oof_train[te_idx] = clf.predict(x_train[te_idx])\n",
  "        oof_test_skf[i, :] = clf.predict(x_test)\n",
  "\n",
  "    # NOTE(review): this averages predicted class labels across the inner folds, so the\n",
  "    # test meta-feature can be fractional. Kept as in the original; confirm it is intended.\n",
  "    oof_test = oof_test_skf.mean(axis=0)\n",
  "    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)\n",
  "\n",
  "\n",
  "def importance_layout(title):\n",
  "    # Layout shared by all feature-importance figures.\n",
  "    return go.Layout(\n",
  "        autosize=True,\n",
  "        title=title,\n",
  "        hovermode='closest',\n",
  "        yaxis=dict(\n",
  "            title='Feature Importance',\n",
  "            ticklen=5,\n",
  "            gridwidth=2\n",
  "        ),\n",
  "        showlegend=False\n",
  "    )\n",
  "\n",
  "\n",
  "def plot_feature_importance(feature_names, importances, title):\n",
  "    # Marker plot of one model's importances, one marker per feature.\n",
  "    trace = go.Scatter(\n",
  "        y=importances,\n",
  "        x=feature_names,\n",
  "        mode='markers',\n",
  "        marker=dict(\n",
  "            sizemode='diameter',\n",
  "            sizeref=1,\n",
  "            size=25,\n",
  "            color=importances,\n",
  "            colorscale='Portland',\n",
  "            showscale=True\n",
  "        ),\n",
  "        text=feature_names\n",
  "    )\n",
  "    fig = go.Figure(data=[trace], layout=importance_layout(title))\n",
  "    py.iplot(fig, filename='scatter2010')\n",
  "\n",
  "\n",
  "for fold, (train_index, test_index) in enumerate(kf_m.split(X, y), 1):\n",
  "    # .copy() so the scaled values are written to fold-local frames rather than to\n",
  "    # slices of X (avoids SettingWithCopy problems).\n",
  "    X_train = X.iloc[train_index].copy()\n",
  "    y_train = y.iloc[train_index]\n",
  "    X_test = X.iloc[test_index].copy()\n",
  "    y_test = y.iloc[test_index]\n",
  "\n",
  "    #---------------- Standardize using training-fold statistics only ----------------\n",
  "    std_scale = preprocessing.StandardScaler().fit(X_train[cols_to_norm])\n",
  "    X_train[cols_to_norm] = std_scale.transform(X_train[cols_to_norm])\n",
  "    X_test[cols_to_norm] = std_scale.transform(X_test[cols_to_norm])\n",
  "\n",
  "    #---------------- Oversample the training fold only ------------------------------\n",
  "    sm = SMOTENC(random_state=50, categorical_features=categorical_idx)\n",
  "    # fit_resample is the current name of the former fit_sample alias.\n",
  "    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)\n",
  "    print(sorted(Counter(y_train_oversampled).items()))\n",
  "\n",
  "    ntrain = X_train_oversampled.shape[0]\n",
  "    print(ntrain)\n",
  "    ntest = X_test.shape[0]\n",
  "\n",
  "    #---------------- Base learners ---------------------------------------------------\n",
  "    # Only Extra Trees and Gradient Boosting are enabled; rf_params, ada_params and\n",
  "    # lr_params are defined for the other candidates.\n",
  "    et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)\n",
  "    gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)\n",
  "\n",
  "    # Out-of-fold train and test predictions. These base results are the new features.\n",
  "    et_oof_train, et_oof_test = get_oof(et, X_train_oversampled, y_train_oversampled, X_test)  # Extra Trees\n",
  "    gb_oof_train, gb_oof_test = get_oof(gb, X_train_oversampled, y_train_oversampled, X_test)  # Gradient Boost\n",
  "\n",
  "    print('Training is complete')\n",
  "\n",
  "    #---------------- Feature importances ---------------------------------------------\n",
  "    et_features = et.feature_importances(X_train_oversampled, y_train_oversampled).tolist()\n",
  "    gb_features = gb.feature_importances(X_train_oversampled, y_train_oversampled).tolist()\n",
  "\n",
  "    cols = X.columns.values\n",
  "    feature_dataframe = pd.DataFrame({\n",
  "        'features': cols,\n",
  "        'Extra Trees feature importances': et_features,\n",
  "        'Gradient Boost feature importances': gb_features,\n",
  "    })\n",
  "\n",
  "    plot_feature_importance(feature_dataframe['features'].values,\n",
  "                            feature_dataframe['Extra Trees feature importances'].values,\n",
  "                            'Extra Trees Feature Importance')\n",
  "    plot_feature_importance(feature_dataframe['features'].values,\n",
  "                            feature_dataframe['Gradient Boost feature importances'].values,\n",
  "                            'Gradient Boosting Feature Importance')\n",
  "\n",
  "    # Row-wise mean over the numeric importance columns only; the string 'features'\n",
  "    # column must be excluded or newer pandas raises.\n",
  "    feature_dataframe['mean'] = feature_dataframe[IMPORTANCE_COLUMNS].mean(axis=1)\n",
  "\n",
  "    yv = feature_dataframe['mean'].values\n",
  "    x = feature_dataframe['features'].values\n",
  "    data = [go.Bar(\n",
  "        x=x,\n",
  "        y=yv,\n",
  "        width=0.5,\n",
  "        marker=dict(\n",
  "            color=feature_dataframe['mean'].values,\n",
  "            colorscale='Portland',\n",
  "            showscale=True,\n",
  "            reversescale=False\n",
  "        ),\n",
  "        opacity=0.6\n",
  "    )]\n",
  "    fig = go.Figure(data=data, layout=importance_layout('Barplots of Mean Feature Importance'))\n",
  "    py.iplot(fig, filename='bar-direct-labels')\n",
  "\n",
  "    #---------------- Correlation between base-learner predictions --------------------\n",
  "    base_predictions_train = pd.DataFrame({\n",
  "        'ExtraTrees': et_oof_train.ravel(),\n",
  "        'GradientBoost': gb_oof_train.ravel(),\n",
  "    })\n",
  "    data = [\n",
  "        go.Heatmap(\n",
  "            z=base_predictions_train.astype(float).corr().values,\n",
  "            x=base_predictions_train.columns.values,\n",
  "            y=base_predictions_train.columns.values,\n",
  "            colorscale='Viridis',\n",
  "            showscale=True,\n",
  "            reversescale=True\n",
  "        )\n",
  "    ]\n",
  "    py.iplot(data, filename='labelled-heatmap')\n",
  "\n",
  "    #---------------- Meta-learner on the stacked predictions -------------------------\n",
  "    x_train = np.concatenate((et_oof_train, gb_oof_train), axis=1)\n",
  "    x_test = np.concatenate((et_oof_test, gb_oof_test), axis=1)\n",
  "\n",
  "    # Seeded so the reported fold metrics are reproducible.\n",
  "    gbm = RandomForestClassifier(random_state=SEED).fit(x_train, y_train_oversampled)\n",
  "    y_pred = gbm.predict(x_test)\n",
  "    visualizer = ROCAUC(gbm, classes=classes)\n",
  "    visualizer.fit(x_train, y_train_oversampled)  # Fit the training data to the visualizer\n",
  "    visualizer.score(x_test, y_test)  # Evaluate the model on the test data\n",
  "    visualizer.poof('Ensembel_{}.pdf'.format(fold), clear_figure=True)\n",
  "    print(f'For fold {fold}:')\n",
  "    print(f'Accuracy: {gbm.score(x_test, y_test)}')\n",
  "    f1 = f1_score(y_test, y_pred, average='micro')\n",
  "    print(f'f-score: {f1}')\n",
  "\n",
  "    # Build the text report once, print it, then parse it back into a frame.\n",
  "    K = classification_report_imbalanced(y_test, y_pred)\n",
  "    print(K)\n",
  "    df = pd.read_fwf(io.StringIO(K))\n",
  "\n",
  "    # Append this fold's per-class rows and the average row to the running CSV files.\n",
  "    # NOTE(review): the label / position slices below depend on the layout that\n",
  "    # pd.read_fwf infers from the report text; confirm against the produced CSVs.\n",
  "    csv_kwargs = dict(sep=',', encoding='utf-8', doublequote=False, index=False, mode='a', header=False)\n",
  "    for row_label, suffix in (('1', 'D'), ('2', 'H'), ('3', 'N'), ('4', 'R')):\n",
  "        df.loc[row_label:row_label, 'pre':'sup'].to_csv('RF-Ensemble-{}.csv'.format(suffix), **csv_kwargs)\n",
  "    df.iloc[6:7, :].to_csv('RF-Ensemble-avg.csv', **csv_kwargs)"
 ], "outputs": [ {
"output_type": "stream", "name": "stdout", "text": [ "[(1, 59596), (2, 59596), (3, 59596), (4, 59596)]\n", "238384\n", "Training is complete\n" ] }, { "output_type": "display_data", "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "marker": { "color": [ 0.0018353646817492182, 0.05084310545549719, 0.09400970408912715, 0.0009886418699209191, 0.10234222172294358, 0.003640032842124243, 0.0013646024310890242, 0.003016195056874061, 0.007971256941994166, 0.004620637010899481, 0.005530885089654668, 0.004322134153054816, 0.019205862429324715, 0.002515465892167688, 0.0029068529574578974, 0.0034366571531583533, 0.007804291207740086, 0.009402396653016296, 0.0018958531833847049, 0.002110420701936828, 0.038700339733758235, 0.03423588600963781, 0.00817997089251627, 0.000002642167259916031, 0.00013126404080853596, 0.000013986071937127094, 0.00034378765813338706, 0.000020677034739334597, 0.0009338911010296967, 0.0001707054178427359, 0.0011383842741308104, 0.0009118602546884352, 0.02020044289867789, 0.006722478106964732, 0.00003362408612181299, 0.010061882271833413, 0.000022615551677283513, 0.0003677843800595879, 0.013704607900350243, 0.007208557871758271, 0.00026973301527727416, 0.008129624547063242, 0.014065248845423622, 0.004248744953256545, 0.06376424950946484, 0.011617612464246977, 0.05976497182742223, 0.0004453574258103735, 0.0005074781663826736, 0.033473028923271225, 0.014834521177518382, 0.00022639718661120363, 0.005806196675825123, 0.021814353144357184, 0.014162892666176269, 0.0010164493656439567, 0.005271419199920207, 0.03845638943926816, 0.021141805218045706, 0.006183774361108017, 0.0032385940401626267, 0.000004525283649994152, 0.0017687513793295886, 0.04464062601841449, 0.002590304435506645, 0.031068104371901287, 0.0031761755250237435, 0.0014848938806685972, 0.00014321109881672463, 0.00017648741137150546, 0.0001727334966912723, 0.002155924445924486, 
0.023218428487430897, 0.0022595503546761217, 0.003996836103398738, 0.019780160281208, 0.020686656005668783, 0.0021768086898221904, 0.0020361736675598993, 0.0051595512410654455, 0.018875900987376886, 0.013121385435198118 ], "colorscale": "Portland", "showscale": true, "size": 25, "sizemode": "diameter", "sizeref": 1 }, "mode": "markers", "text": [ "sodium", "electivesurgery", "vent", "dialysis", "gcs", "urine", "wbc", "temperature", "respiratoryrate", "heartrate", "meanbp", "creatinine", "ph", "hematocrit", "albumin", "pao2", "pco2", "bun", "glucose", "bilirubin", "fio2", "age", "thrombolytics", "aids", "hepaticfailure", "lymphoma", "metastaticcancer", "leukemia", "immunosuppression", "cirrhosis", "readmit", "offset", "admitsource_1.0", "admitsource_2.0", "admitsource_3.0", "admitsource_4.0", "admitsource_5.0", "admitsource_6.0", "admitsource_7.0", "admitsource_8.0", "diaggroup_ARF", "diaggroup_Asthma-Emphys", "diaggroup_CABG", "diaggroup_CHF", "diaggroup_CVA", "diaggroup_CVOther", "diaggroup_CardiacArrest", "diaggroup_ChestPainUnknown", "diaggroup_Coma", "diaggroup_DKA", "diaggroup_GIBleed", "diaggroup_GIObstruction", "diaggroup_Neuro", "diaggroup_Other", "diaggroup_Overdose", "diaggroup_PNA", "diaggroup_RespMedOther", "diaggroup_Sepsis", "diaggroup_Trauma", "diaggroup_ValveDz", "gender_Male", "gender_Other", "m1_True", "m2_True", "m3_True", "m4_True", "m5_True", "m6_True", "m7_True", "m8_True", "m9_True", "m10_True", "m11_True", "m12_True", "m13_True", "m14_True", "m15_True", "m16_True", "m17_True", "m18_True", "m19_True", "m20_True" ], "type": "scatter", "uid": "ed896635-15d8-49be-ab7b-626cd70d9f44", "x": [ "sodium", "electivesurgery", "vent", "dialysis", "gcs", "urine", "wbc", "temperature", "respiratoryrate", "heartrate", "meanbp", "creatinine", "ph", "hematocrit", "albumin", "pao2", "pco2", "bun", "glucose", "bilirubin", "fio2", "age", "thrombolytics", "aids", "hepaticfailure", "lymphoma", "metastaticcancer", "leukemia", "immunosuppression", "cirrhosis", 
"readmit", "offset", "admitsource_1.0", "admitsource_2.0", "admitsource_3.0", "admitsource_4.0", "admitsource_5.0", "admitsource_6.0", "admitsource_7.0", "admitsource_8.0", "diaggroup_ARF", "diaggroup_Asthma-Emphys", "diaggroup_CABG", "diaggroup_CHF", "diaggroup_CVA", "diaggroup_CVOther", "diaggroup_CardiacArrest", "diaggroup_ChestPainUnknown", "diaggroup_Coma", "diaggroup_DKA", "diaggroup_GIBleed", "diaggroup_GIObstruction", "diaggroup_Neuro", "diaggroup_Other", "diaggroup_Overdose", "diaggroup_PNA", "diaggroup_RespMedOther", "diaggroup_Sepsis", "diaggroup_Trauma", "diaggroup_ValveDz", "gender_Male", "gender_Other", "m1_True", "m2_True", "m3_True", "m4_True", "m5_True", "m6_True", "m7_True", "m8_True", "m9_True", "m10_True", "m11_True", "m12_True", "m13_True", "m14_True", "m15_True", "m16_True", "m17_True", "m18_True", "m19_True", "m20_True" ], "y": [ 0.0018353646817492182, 0.05084310545549719, 0.09400970408912715, 0.0009886418699209191, 0.10234222172294358, 0.003640032842124243, 0.0013646024310890242, 0.003016195056874061, 0.007971256941994166, 0.004620637010899481, 0.005530885089654668, 0.004322134153054816, 0.019205862429324715, 0.002515465892167688, 0.0029068529574578974, 0.0034366571531583533, 0.007804291207740086, 0.009402396653016296, 0.0018958531833847049, 0.002110420701936828, 0.038700339733758235, 0.03423588600963781, 0.00817997089251627, 0.000002642167259916031, 0.00013126404080853596, 0.000013986071937127094, 0.00034378765813338706, 0.000020677034739334597, 0.0009338911010296967, 0.0001707054178427359, 0.0011383842741308104, 0.0009118602546884352, 0.02020044289867789, 0.006722478106964732, 0.00003362408612181299, 0.010061882271833413, 0.000022615551677283513, 0.0003677843800595879, 0.013704607900350243, 0.007208557871758271, 0.00026973301527727416, 0.008129624547063242, 0.014065248845423622, 0.004248744953256545, 0.06376424950946484, 0.011617612464246977, 0.05976497182742223, 0.0004453574258103735, 0.0005074781663826736, 0.033473028923271225, 
0.014834521177518382, 0.00022639718661120363, 0.005806196675825123, 0.021814353144357184, 0.014162892666176269, 0.0010164493656439567, 0.005271419199920207, 0.03845638943926816, 0.021141805218045706, 0.006183774361108017, 0.0032385940401626267, 0.000004525283649994152, 0.0017687513793295886, 0.04464062601841449, 0.002590304435506645, 0.031068104371901287, 0.0031761755250237435, 0.0014848938806685972, 0.00014321109881672463, 0.00017648741137150546, 0.0001727334966912723, 0.002155924445924486, 0.023218428487430897, 0.0022595503546761217, 0.003996836103398738, 0.019780160281208, 0.020686656005668783, 0.0021768086898221904, 0.0020361736675598993, 0.0051595512410654455, 0.018875900987376886, 0.013121385435198118 ] } ], "layout": { "autosize": true, "hovermode": "closest", "showlegend": false, "title": { "text": "Extra Trees Feature Importance" }, "yaxis": { "gridwidth": 2, "ticklen": 5, "title": { "text": "Feature Importance" } } } }, "text/html": [ "
" ], "text/vnd.plotly.v1+html": [ "
" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "marker": { "color": [ 0.02895100008920424, 0.01940889181967343, 0.01713397112196333, 0.0006888048715839917, 0.19943724634884952, 0.02533979514667484, 0.01876847510296464, 0.059907484539022726, 0.03118646710646018, 0.03090449684447332, 0.030793189430916895, 0.02089485838046929, 0.00879895366559916, 0.01259603502926633, 0.014804259064389239, 0.006116415474074245, 0.006781288603417748, 0.04069577405038757, 0.016778793438278258, 0.01897283888878102, 0.01832390108747583, 0.07812599893362324, 0.002962254344109083, 0.000042048554744753824, 0.0002238103351767069, 0.00009523997474768742, 0.0005200448443643561, 0.00016872489921494353, 0.0005661422812375212, 0.0002791824804045882, 0.0007312175206979133, 0.04873564675659822, 0.01634749861630543, 0.0017569377442036523, 0.0001913455363586476, 0.006714635293616546, 0.00008449044682818052, 0.00030841006190384545, 0.0032686970169763445, 0.005871734202759891, 0.0019213571600408086, 0.004536967337591905, 0.006859050935355847, 0.005319202569766147, 0.02782732741683144, 0.00516748393921808, 0.021132005357644496, 0.0011644991203162115, 0.0027817190845017253, 0.006112789646132433, 0.008028640083417802, 0.0012692347381076123, 0.005863116227885496, 0.011362942040987284, 0.009588107560505321, 0.003230215272296569, 0.005562687657783538, 0.014814582870448472, 0.013788101856366882, 0.004404936157351929, 0.005287599611263047, 0.00001575200082121126, 0.00008180812195822928, 0.008547861985514006, 0.002768742233742037, 0.01292689711014233, 0.0004311241532687182, 0.00034245906354993526, 0.0001626202609141136, 0.00019376841710366558, 0.00005846849317033116, 0.0005971032709410448, 0.000006173066489507722, 0.0007804211444204773, 0.0006015691500332959, 0.000013881165602887428, 0.00004166988296844418, 0.00002905911173326329, 
0.001215075633558963, 0.0014923308590539296, 0.0000014489417273680594, 0.009392199341675855 ], "colorscale": "Portland", "showscale": true, "size": 25, "sizemode": "diameter", "sizeref": 1 }, "mode": "markers", "text": [ "sodium", "electivesurgery", "vent", "dialysis", "gcs", "urine", "wbc", "temperature", "respiratoryrate", "heartrate", "meanbp", "creatinine", "ph", "hematocrit", "albumin", "pao2", "pco2", "bun", "glucose", "bilirubin", "fio2", "age", "thrombolytics", "aids", "hepaticfailure", "lymphoma", "metastaticcancer", "leukemia", "immunosuppression", "cirrhosis", "readmit", "offset", "admitsource_1.0", "admitsource_2.0", "admitsource_3.0", "admitsource_4.0", "admitsource_5.0", "admitsource_6.0", "admitsource_7.0", "admitsource_8.0", "diaggroup_ARF", "diaggroup_Asthma-Emphys", "diaggroup_CABG", "diaggroup_CHF", "diaggroup_CVA", "diaggroup_CVOther", "diaggroup_CardiacArrest", "diaggroup_ChestPainUnknown", "diaggroup_Coma", "diaggroup_DKA", "diaggroup_GIBleed", "diaggroup_GIObstruction", "diaggroup_Neuro", "diaggroup_Other", "diaggroup_Overdose", "diaggroup_PNA", "diaggroup_RespMedOther", "diaggroup_Sepsis", "diaggroup_Trauma", "diaggroup_ValveDz", "gender_Male", "gender_Other", "m1_True", "m2_True", "m3_True", "m4_True", "m5_True", "m6_True", "m7_True", "m8_True", "m9_True", "m10_True", "m11_True", "m12_True", "m13_True", "m14_True", "m15_True", "m16_True", "m17_True", "m18_True", "m19_True", "m20_True" ], "type": "scatter", "uid": "28a65fda-6624-406f-8a1b-7e1dcb9f0d54", "x": [ "sodium", "electivesurgery", "vent", "dialysis", "gcs", "urine", "wbc", "temperature", "respiratoryrate", "heartrate", "meanbp", "creatinine", "ph", "hematocrit", "albumin", "pao2", "pco2", "bun", "glucose", "bilirubin", "fio2", "age", "thrombolytics", "aids", "hepaticfailure", "lymphoma", "metastaticcancer", "leukemia", "immunosuppression", "cirrhosis", "readmit", "offset", "admitsource_1.0", "admitsource_2.0", "admitsource_3.0", "admitsource_4.0", "admitsource_5.0", 
"admitsource_6.0", "admitsource_7.0", "admitsource_8.0", "diaggroup_ARF", "diaggroup_Asthma-Emphys", "diaggroup_CABG", "diaggroup_CHF", "diaggroup_CVA", "diaggroup_CVOther", "diaggroup_CardiacArrest", "diaggroup_ChestPainUnknown", "diaggroup_Coma", "diaggroup_DKA", "diaggroup_GIBleed", "diaggroup_GIObstruction", "diaggroup_Neuro", "diaggroup_Other", "diaggroup_Overdose", "diaggroup_PNA", "diaggroup_RespMedOther", "diaggroup_Sepsis", "diaggroup_Trauma", "diaggroup_ValveDz", "gender_Male", "gender_Other", "m1_True", "m2_True", "m3_True", "m4_True", "m5_True", "m6_True", "m7_True", "m8_True", "m9_True", "m10_True", "m11_True", "m12_True", "m13_True", "m14_True", "m15_True", "m16_True", "m17_True", "m18_True", "m19_True", "m20_True" ], "y": [ 0.02895100008920424, 0.01940889181967343, 0.01713397112196333, 0.0006888048715839917, 0.19943724634884952, 0.02533979514667484, 0.01876847510296464, 0.059907484539022726, 0.03118646710646018, 0.03090449684447332, 0.030793189430916895, 0.02089485838046929, 0.00879895366559916, 0.01259603502926633, 0.014804259064389239, 0.006116415474074245, 0.006781288603417748, 0.04069577405038757, 0.016778793438278258, 0.01897283888878102, 0.01832390108747583, 0.07812599893362324, 0.002962254344109083, 0.000042048554744753824, 0.0002238103351767069, 0.00009523997474768742, 0.0005200448443643561, 0.00016872489921494353, 0.0005661422812375212, 0.0002791824804045882, 0.0007312175206979133, 0.04873564675659822, 0.01634749861630543, 0.0017569377442036523, 0.0001913455363586476, 0.006714635293616546, 0.00008449044682818052, 0.00030841006190384545, 0.0032686970169763445, 0.005871734202759891, 0.0019213571600408086, 0.004536967337591905, 0.006859050935355847, 0.005319202569766147, 0.02782732741683144, 0.00516748393921808, 0.021132005357644496, 0.0011644991203162115, 0.0027817190845017253, 0.006112789646132433, 0.008028640083417802, 0.0012692347381076123, 0.005863116227885496, 0.011362942040987284, 0.009588107560505321, 0.003230215272296569, 
0.005562687657783538, 0.014814582870448472, 0.013788101856366882, 0.004404936157351929, 0.005287599611263047, 0.00001575200082121126, 0.00008180812195822928, 0.008547861985514006, 0.002768742233742037, 0.01292689711014233, 0.0004311241532687182, 0.00034245906354993526, 0.0001626202609141136, 0.00019376841710366558, 0.00005846849317033116, 0.0005971032709410448, 0.000006173066489507722, 0.0007804211444204773, 0.0006015691500332959, 0.000013881165602887428, 0.00004166988296844418, 0.00002905911173326329, 0.001215075633558963, 0.0014923308590539296, 0.0000014489417273680594, 0.009392199341675855 ] } ], "layout": { "autosize": true, "hovermode": "closest", "showlegend": false, "title": { "text": "Gradient Boosting Feature Importance" }, "yaxis": { "gridwidth": 2, "ticklen": 5, "title": { "text": "Feature Importance" } } } }, "text/html": [ "
" ], "text/vnd.plotly.v1+html": [ "
" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "marker": { "color": [ 0.015393182385476729, 0.03512599863758531, 0.05557183760554524, 0.0008387233707524554, 0.15088973403589656, 0.014489913994399542, 0.010066538767026833, 0.03146183979794839, 0.01957886202422717, 0.0177625669276864, 0.01816203726028578, 0.012608496266762054, 0.014002408047461938, 0.007555750460717009, 0.008855556010923568, 0.004776536313616299, 0.007292789905578917, 0.025049085351701934, 0.009337323310831482, 0.010541629795358925, 0.028512120410617032, 0.056180942471630524, 0.005571112618312676, 0.00002234536100233493, 0.00017753718799262143, 0.00005461302334240726, 0.0004319162512488716, 0.00009470096697713907, 0.0007500166911336089, 0.00022494394912366203, 0.0009348008974143619, 0.02482375350564333, 0.01827397075749166, 0.004239707925584192, 0.0001124848112402303, 0.00838825878272498, 0.00005355299925273202, 0.00033809722098171663, 0.008486652458663294, 0.006540146037259081, 0.0010955450876590413, 0.006333295942327574, 0.010462149890389735, 0.004783973761511346, 0.04579578846314814, 0.008392548201732528, 0.04044848859253336, 0.0008049282730632925, 0.0016445986254421995, 0.01979290928470183, 0.011431580630468091, 0.000747815962359408, 0.005834656451855309, 0.016588647592672236, 0.011875500113340796, 0.002123332318970263, 0.005417053428851872, 0.026635486154858314, 0.017464953537206295, 0.005294355259229973, 0.004263096825712837, 0.000010138642235602707, 0.000925279750643909, 0.026594244001964248, 0.002679523334624341, 0.02199750074102181, 0.0018036498391462309, 0.0009136764721092663, 0.0001529156798654191, 0.00018512791423758553, 0.00011560099493080172, 0.0013765138584327654, 0.011612300776960202, 0.0015199857495482995, 0.002299202626716017, 0.009897020723405443, 0.010364162944318613, 0.0011029339007777269, 
0.0016256246505594312, 0.0033259410500596875, 0.009438674964552127, 0.011256792388436986 ], "colorscale": "Portland", "reversescale": false, "showscale": true }, "opacity": 0.6, "type": "bar", "uid": "3aadb222-91ce-4d34-ae5b-f5b2e1bd2098", "width": 0.5, "x": [ "sodium", "electivesurgery", "vent", "dialysis", "gcs", "urine", "wbc", "temperature", "respiratoryrate", "heartrate", "meanbp", "creatinine", "ph", "hematocrit", "albumin", "pao2", "pco2", "bun", "glucose", "bilirubin", "fio2", "age", "thrombolytics", "aids", "hepaticfailure", "lymphoma", "metastaticcancer", "leukemia", "immunosuppression", "cirrhosis", "readmit", "offset", "admitsource_1.0", "admitsource_2.0", "admitsource_3.0", "admitsource_4.0", "admitsource_5.0", "admitsource_6.0", "admitsource_7.0", "admitsource_8.0", "diaggroup_ARF", "diaggroup_Asthma-Emphys", "diaggroup_CABG", "diaggroup_CHF", "diaggroup_CVA", "diaggroup_CVOther", "diaggroup_CardiacArrest", "diaggroup_ChestPainUnknown", "diaggroup_Coma", "diaggroup_DKA", "diaggroup_GIBleed", "diaggroup_GIObstruction", "diaggroup_Neuro", "diaggroup_Other", "diaggroup_Overdose", "diaggroup_PNA", "diaggroup_RespMedOther", "diaggroup_Sepsis", "diaggroup_Trauma", "diaggroup_ValveDz", "gender_Male", "gender_Other", "m1_True", "m2_True", "m3_True", "m4_True", "m5_True", "m6_True", "m7_True", "m8_True", "m9_True", "m10_True", "m11_True", "m12_True", "m13_True", "m14_True", "m15_True", "m16_True", "m17_True", "m18_True", "m19_True", "m20_True" ], "y": [ 0.015393182385476729, 0.03512599863758531, 0.05557183760554524, 0.0008387233707524554, 0.15088973403589656, 0.014489913994399542, 0.010066538767026833, 0.03146183979794839, 0.01957886202422717, 0.0177625669276864, 0.01816203726028578, 0.012608496266762054, 0.014002408047461938, 0.007555750460717009, 0.008855556010923568, 0.004776536313616299, 0.007292789905578917, 0.025049085351701934, 0.009337323310831482, 0.010541629795358925, 0.028512120410617032, 0.056180942471630524, 0.005571112618312676, 
0.00002234536100233493, 0.00017753718799262143, 0.00005461302334240726, 0.0004319162512488716, 0.00009470096697713907, 0.0007500166911336089, 0.00022494394912366203, 0.0009348008974143619, 0.02482375350564333, 0.01827397075749166, 0.004239707925584192, 0.0001124848112402303, 0.00838825878272498, 0.00005355299925273202, 0.00033809722098171663, 0.008486652458663294, 0.006540146037259081, 0.0010955450876590413, 0.006333295942327574, 0.010462149890389735, 0.004783973761511346, 0.04579578846314814, 0.008392548201732528, 0.04044848859253336, 0.0008049282730632925, 0.0016445986254421995, 0.01979290928470183, 0.011431580630468091, 0.000747815962359408, 0.005834656451855309, 0.016588647592672236, 0.011875500113340796, 0.002123332318970263, 0.005417053428851872, 0.026635486154858314, 0.017464953537206295, 0.005294355259229973, 0.004263096825712837, 0.000010138642235602707, 0.000925279750643909, 0.026594244001964248, 0.002679523334624341, 0.02199750074102181, 0.0018036498391462309, 0.0009136764721092663, 0.0001529156798654191, 0.00018512791423758553, 0.00011560099493080172, 0.0013765138584327654, 0.011612300776960202, 0.0015199857495482995, 0.002299202626716017, 0.009897020723405443, 0.010364162944318613, 0.0011029339007777269, 0.0016256246505594312, 0.0033259410500596875, 0.009438674964552127, 0.011256792388436986 ] } ], "layout": { "autosize": true, "hovermode": "closest", "showlegend": false, "title": { "text": "Barplots of Mean Feature Importance" }, "yaxis": { "gridwidth": 2, "ticklen": 5, "title": { "text": "Feature Importance" } } } }, "text/html": [ "
" ], "text/vnd.plotly.v1+html": [ "
" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "colorscale": "Viridis", "reversescale": true, "showscale": true, "type": "heatmap", "uid": "ca5a2a9a-2059-4187-a451-d643a7f5d4b9", "x": [ "ExtraTrees", "GradientBoost" ], "y": [ "ExtraTrees", "GradientBoost" ], "z": [ [ 1, 0.45313604730131996 ], [ 0.45313604730131996, 1 ] ] } ], "layout": {} }, "text/html": [ "
" ], "text/vnd.plotly.v1+html": [ "
" ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "For fold 1:\n", "Accuracy: 0.7351828499369483\n", "f-score: 0.7351828499369484\n" ] }, { "output_type": "error", "ename": "NameError", "evalue": "name 'classification_report_imbalanced' is not defined", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 269\u001b[0m \u001b[0mf1\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mf1_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maverage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'micro'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 270\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf'f-score: {f1}'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 271\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mclassification_report_imbalanced\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 272\u001b[0m \u001b[0mK\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mclassification_report_imbalanced\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 273\u001b[0m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_fwf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mStringIO\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mK\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mNameError\u001b[0m: name 'classification_report_imbalanced' is not defined" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ] }, "metadata": {} } ], "metadata": {} },
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "len(y_test)"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "# Absolute values of the logistic-regression coefficients\n",
  "# (lr_features is assigned from lr_fit.coef_ in a later cell).\n",
  "list(map(abs, lr_features))"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "visualizer"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "# Make sure the target folder exists so the export does not fail when images/ is missing.\n",
  "os.makedirs(\"images\", exist_ok=True)\n",
  "fig.write_image(\"images/fig1.png\")"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "#lr_fit= lr.fit(X_train_oversampled, y_train_oversampled).tolist()\n",
  "# Coefficient array of the fitted logistic regression; .size counts every coefficient.\n",
  "lr_features = lr_fit.coef_\n",
  "lr_features.size"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "# Fit AdaBoost on the oversampled training data and evaluate it on the test split.\n",
  "# NOTE(review): `fold` and `classes` are not assigned in this cell; it looks like a loop body\n",
  "# pasted out of a cross-validation loop - confirm both are defined before running.\n",
  "model = AdaBoostClassifier()\n",
  "model.fit(X_train_oversampled, y_train_oversampled)\n",
  "y_pred = model.predict(X_test.values)\n",
  "\n",
  "# ROC/AUC visualisation, written to one PDF per fold.\n",
  "visualizer = ROCAUC(model, classes=classes)\n",
  "visualizer.fit(X_train_oversampled, y_train_oversampled)  # Fit the training data to the visualizer\n",
  "visualizer.score(X_test.values, y_test)  # Evaluate the model on the test data\n",
  "visualizer.poof(\"Ada_Indicator_Replace_{}.pdf\".format(fold), clear_figure=True)\n",
  "\n",
  "print(f'For fold {fold}:')\n",
  "print(f'Accuracy: {model.score(X_test.values, y_test)}')\n",
  "# NOTE(review): for single-label predictions micro-averaged F1 equals accuracy (the stored\n",
  "# output for fold 1 prints the same value for both); consider average='macro' - confirm intent.\n",
  "f1 = f1_score(y_test, y_pred, average='micro')\n",
  "print(f'f-score: {f1}')\n",
  "# NOTE(review): a stored traceback in this notebook shows\n",
  "# \"NameError: name 'classification_report_imbalanced' is not defined\"; the name must be imported\n",
  "# before this line runs (presumably from imblearn.metrics - confirm).\n",
  "print(classification_report_imbalanced(y_test, y_pred))"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "# Row-wise mean of the per-model importances.\n",
  "# Only numeric columns are averaged and an existing 'mean' column is left out, so the\n",
  "# 'features' label column cannot break the computation and re-running the cell gives the same result.\n",
  "importance_values = feature_dataframe.select_dtypes(include='number').drop(columns='mean', errors='ignore')\n",
  "feature_dataframe['mean'] = importance_values.mean(axis=1)\n",
  "\n",
  "# Bar chart of the mean importance per feature, coloured by the same value.\n",
  "y = feature_dataframe['mean'].values\n",
  "x = feature_dataframe['features'].values\n",
  "data = [go.Bar(\n",
  "    x=x,\n",
  "    y=y,\n",
  "    width=0.5,\n",
  "    marker=dict(\n",
  "        color=y,\n",
  "        colorscale='Portland',\n",
  "        showscale=True,\n",
  "        reversescale=False\n",
  "    ),\n",
  "    opacity=0.6\n",
  ")]\n",
  "\n",
  "layout = go.Layout(\n",
  "    autosize=True,\n",
  "    title='Barplots of Mean Feature Importance',\n",
  "    hovermode='closest',\n",
  "    yaxis=dict(\n",
  "        title='Feature Importance',\n",
  "        ticklen=5,\n",
  "        gridwidth=2\n",
  "    ),\n",
  "    showlegend=False\n",
  ")\n",
  "fig = go.Figure(data=data, layout=layout)\n",
  "py.iplot(fig, filename='bar-direct-labels')\n",
  "\n",
  "# Correlation heatmap between the ExtraTrees and GradientBoost training predictions.\n",
  "base_predictions_train = pd.DataFrame({\n",
  "    'ExtraTrees': et_oof_train.ravel(),\n",
  "    'GradientBoost': gb_oof_train.ravel()\n",
  "})\n",
  "data = [\n",
  "    go.Heatmap(\n",
  "        z=base_predictions_train.astype(float).corr().values,\n",
  "        x=base_predictions_train.columns.values,\n",
  "        y=base_predictions_train.columns.values,\n",
  "        colorscale='Viridis',\n",
  "        showscale=True,\n",
  "        reversescale=True\n",
  "    )\n",
  "]\n",
  "py.iplot(data, filename='labelled-heatmap')\n",
  "\n",
  "#-------------------------------------------------------------------------------------\n",
  "# Stack the base-model predictions column-wise as inputs for a second-level model.\n",
  "# NOTE(review): the *_oof_* arrays are presumably out-of-fold predictions produced earlier in\n",
  "# the notebook (not visible in this cell) - confirm.\n",
  "x_train = np.concatenate((et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, lr_oof_train), axis=1)\n",
  "x_test = np.concatenate((et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, lr_oof_test), axis=1)\n",
  "\n",
  "# Second-level XGBoost classifier fitted on the stacked predictions.\n",
  "gbm = xgb.XGBClassifier(\n",
  "    #learning_rate = 0.02,\n",
  "    n_estimators=2000,\n",
  "    max_depth=4,\n",
  "    min_child_weight=2,\n",
  "    #gamma=1,\n",
  "    gamma=0.9,\n",
  "    subsample=0.8,\n",
  "    colsample_bytree=0.8,\n",
  "    objective='binary:logistic',\n",
  "    nthread=-1,\n",
  "    scale_pos_weight=1).fit(x_train, y_train_oversampled)\n",
  "predictions = gbm.predict(x_test)"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "len(lr_features)"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "# Feature column names: every column of df2 except the 'destcopy' column.\n",
  "# `columns=` replaces the positional axis argument, which newer pandas releases no longer accept.\n",
  "cols = df2.drop(columns='destcopy').columns.values"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "import plotly.graph_objects as go"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "fig.show()"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "# Self-contained demo: relative feature importance from logistic-regression coefficients\n",
  "# on synthetic data where x3 matters most and x1 least.\n",
  "import numpy as np\n",
  "from sklearn.linear_model import LogisticRegression\n",
  "from sklearn.preprocessing import StandardScaler\n",
  "import pandas as pd\n",
  "import matplotlib.pyplot as plt\n",
  "\n",
  "# Seeded generator so the synthetic example gives the same result on every run.\n",
  "rng = np.random.default_rng(0)\n",
  "n_samples = 100\n",
  "x1 = rng.standard_normal(n_samples)\n",
  "x2 = rng.standard_normal(n_samples)\n",
  "x3 = rng.standard_normal(n_samples)\n",
  "\n",
  "#Make difference in feature dependance\n",
  "# Noise is drawn once per sample; a single scalar draw would only shift the intercept.\n",
  "y = (3 + x1 + 2*x2 + 5*x3 + 0.2*rng.standard_normal(n_samples)) > 0\n",
  "\n",
  "X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3})\n",
  "\n",
  "#Scale your data\n",
  "scaler = StandardScaler()\n",
  "scaler.fit(X)\n",
  "X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)\n",
  "\n",
  "clf = LogisticRegression(random_state=0)\n",
  "clf.fit(X_scaled, y)\n",
  "\n",
  "# Importance = absolute coefficient, rescaled so the largest one is 100.\n",
  "feature_importance = np.abs(clf.coef_[0])\n",
  "feature_importance = 100.0 * (feature_importance / feature_importance.max())\n",
  "sorted_idx = np.argsort(feature_importance)\n",
  "pos = np.arange(sorted_idx.shape[0]) + .5\n",
  "\n",
  "# Horizontal bars, least important feature at the bottom.\n",
  "featfig, featax = plt.subplots()\n",
  "featax.barh(pos, feature_importance[sorted_idx], align='center')\n",
  "featax.set_yticks(pos)\n",
  "featax.set_yticklabels(np.array(X.columns)[sorted_idx], fontsize=8)\n",
  "featax.set_xlabel('Relative Feature Importance')\n",
  "\n",
  "plt.tight_layout()\n",
  "plt.show()"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [
  "feature_importance"
 ],
 "outputs": [],
 "metadata": {}
},
{
 "cell_type": "code",
 "execution_count": null,
 "source": [],
 "outputs": [],
 "metadata": {}
}
], "metadata": {
"kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }