[d8f241]: / Window Size = 2 min, Analysis.ipynb

Download this file

797 lines (796 with data), 22.3 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.45 s, sys: 310 ms, total: 1.76 s\n",
      "Wall time: 9.45 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 5s, sys: 6.39 s, total: 1min 12s\n",
      "Wall time: 1min 47s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train = pd.read_csv(\"2_min_train.csv\")\n",
    "test = pd.read_csv(\"2_min_test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5040000, 11)\n",
      "(26430603, 11)\n"
     ]
    }
   ],
   "source": [
    "print(train.shape)\n",
    "print(test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0          14786800\n",
       "1          14786801\n",
       "2          14786802\n",
       "3          14786803\n",
       "4          14786804\n",
       "             ...   \n",
       "5039995    26337496\n",
       "5039996    26337497\n",
       "5039997    26337498\n",
       "5039998    26337499\n",
       "5039999    26337500\n",
       "Name: Unnamed: 0, Length: 5040000, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['Unnamed: 0']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Unnamed: 0',\n",
       " 'target',\n",
       " 'subject',\n",
       " 'chest_ACC_x',\n",
       " 'chest_ACC_y',\n",
       " 'chest_ACC_z',\n",
       " 'chest_ECG',\n",
       " 'chest_EMG',\n",
       " 'chest_EDA',\n",
       " 'chest_Temp',\n",
       " 'chest_Resp']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2 µs, sys: 1 µs, total: 3 µs\n",
      "Wall time: 5.72 µs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# # df=df[df.target!=3]\n",
    "# train=train[train.target!=4]\n",
    "# test=test[test.target!=4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['chest_ACC_x',\n",
       " 'chest_ACC_y',\n",
       " 'chest_ACC_z',\n",
       " 'chest_ECG',\n",
       " 'chest_EMG',\n",
       " 'chest_EDA',\n",
       " 'chest_Temp',\n",
       " 'chest_Resp']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features=train.columns.tolist()\n",
    "features = features[3:]\n",
    "features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5040000 entries, 0 to 5039999\n",
      "Data columns (total 11 columns):\n",
      "Unnamed: 0     int64\n",
      "target         int64\n",
      "subject        int64\n",
      "chest_ACC_x    float64\n",
      "chest_ACC_y    float64\n",
      "chest_ACC_z    float64\n",
      "chest_ECG      float64\n",
      "chest_EMG      float64\n",
      "chest_EDA      float64\n",
      "chest_Temp     float64\n",
      "chest_Resp     float64\n",
      "dtypes: float64(8), int64(3)\n",
      "memory usage: 423.0 MB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 70 µs, sys: 15 µs, total: 85 µs\n",
      "Wall time: 90.1 µs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "et = ExtraTreesClassifier(n_estimators=20, n_jobs=10, verbose=2,random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "building tree 1 of 20building tree 2 of 20\n",
      "building tree 3 of 20\n",
      "building tree 4 of 20\n",
      "building tree 5 of 20\n",
      "building tree 6 of 20\n",
      "building tree 7 of 20\n",
      "building tree 8 of 20\n",
      "building tree 9 of 20\n",
      "\n",
      "building tree 10 of 20\n",
      "building tree 11 of 20\n",
      "building tree 12 of 20\n",
      "building tree 13 of 20\n",
      "building tree 14 of 20\n",
      "building tree 15 of 20\n",
      "building tree 16 of 20\n",
      "building tree 17 of 20\n",
      "building tree 18 of 20\n",
      "building tree 19 of 20\n",
      "building tree 20 of 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=10)]: Done  12 out of  20 | elapsed:   17.2s remaining:   11.5s\n",
      "[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:   20.4s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',\n",
       "                     max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "                     min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                     min_samples_leaf=1, min_samples_split=2,\n",
       "                     min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=10,\n",
       "                     oob_score=False, random_state=0, verbose=2,\n",
       "                     warm_start=False)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "et.fit(train[features],train['target'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.\n",
      "[Parallel(n_jobs=10)]: Done  12 out of  20 | elapsed:   23.8s remaining:   15.8s\n",
      "[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:   27.1s finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 26s, sys: 19 s, total: 1min 45s\n",
      "Wall time: 30.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "y_pred=et.predict(test[features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 3, 4])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.91      0.68      0.78  11067702\n",
      "           2       0.62      0.90      0.73   5716201\n",
      "           3       0.69      0.91      0.79   2642501\n",
      "           4       0.74      0.67      0.70   7004199\n",
      "\n",
      "    accuracy                           0.75  26430603\n",
      "   macro avg       0.74      0.79      0.75  26430603\n",
      "weighted avg       0.78      0.75      0.75  26430603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "clf = DecisionTreeClassifier()\n",
    "\n",
    "clf = clf.fit(train[features],train['target'])\n",
    "\n",
    "\n",
    "y_pred = clf.predict(test[features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.86      0.52      0.65  11067702\n",
      "           2       0.47      0.78      0.59   5716201\n",
      "           3       0.54      0.91      0.68   2642501\n",
      "           4       0.65      0.55      0.60   7004199\n",
      "\n",
      "    accuracy                           0.62  26430603\n",
      "   macro avg       0.63      0.69      0.63  26430603\n",
      "weighted avg       0.69      0.62      0.62  26430603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from sklearn.neighbors import KNeighborsClassifier\n",
    "# classifier = KNeighborsClassifier(n_neighbors=5)\n",
    "# classifier.fit(train[feature],train['target'])\n",
    "# y_pred = classifier.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting xgboost\n",
      "  Downloading https://files.pythonhosted.org/packages/c1/24/5fe7237b2eca13ee0cfb100bec8c23f4e69ce9df852a64b0493d49dae4e0/xgboost-0.90-py2.py3-none-manylinux1_x86_64.whl (142.8MB)\n",
      "\u001b[K    100% |████████████████████████████████| 142.8MB 4.6kB/s eta 0:00:01 1% |▋                               | 2.6MB 42.5MB/s eta 0:00:04    9% |███                             | 13.6MB 26.6MB/s eta 0:00:05    11% |███▊                            | 16.8MB 41.6MB/s eta 0:00:04    31% |██████████                      | 44.9MB 27.5MB/s eta 0:00:04    51% |████████████████▌               | 73.4MB 47.5MB/s eta 0:00:02    63% |████████████████████▍           | 90.8MB 45.1MB/s eta 0:00:02    68% |██████████████████████          | 98.1MB 48.6MB/s eta 0:00:01    85% |███████████████████████████▍    | 122.5MB 49.9MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting numpy (from xgboost)\n",
      "  Downloading https://files.pythonhosted.org/packages/75/92/57179ed45307ec6179e344231c47da7f3f3da9e2eee5c8ab506bd279ce4e/numpy-1.17.1-cp36-cp36m-manylinux1_x86_64.whl (20.4MB)\n",
      "\u001b[K    100% |████████████████████████████████| 20.4MB 34kB/s  eta 0:00:01    66% |█████████████████████▍          | 13.6MB 19.7MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting scipy (from xgboost)\n",
      "  Using cached https://files.pythonhosted.org/packages/29/50/a552a5aff252ae915f522e44642bb49a7b7b31677f9580cfd11bcc869976/scipy-1.3.1-cp36-cp36m-manylinux1_x86_64.whl\n",
      "Installing collected packages: numpy, scipy, xgboost\n",
      "Successfully installed numpy-1.17.1 scipy-1.3.1 xgboost-0.90\n"
     ]
    }
   ],
   "source": [
    "!pip3 install xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[12:31:40] WARNING: /workspace/src/learner.cc:686: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.\n",
      "CPU times: user 9min 34s, sys: 8.79 s, total: 9min 43s\n",
      "Wall time: 11min 16s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from xgboost import XGBClassifier\n",
    "import xgboost as xgb\n",
    "from sklearn.metrics import mean_squared_error\n",
    "xg_reg = xgb.XGBClassifier(n_estimators = 10)\n",
    "xg_reg.fit(train[features],train['target'])\n",
    "\n",
    "y_pred = xg_reg.predict(test[features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.82      0.64      0.72  11067702\n",
      "           2       0.62      0.69      0.65   5716201\n",
      "           3       0.29      0.36      0.32   2642501\n",
      "           4       0.62      0.72      0.67   7004199\n",
      "\n",
      "    accuracy                           0.64  26430603\n",
      "   macro avg       0.59      0.60      0.59  26430603\n",
      "weighted avg       0.67      0.64      0.65  26430603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sf/.local/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "/home/sf/.local/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
      "  \"this warning.\", FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "lg = LogisticRegression()\n",
    "\n",
    "lg.fit(train[features],train['target'])\n",
    "\n",
    "y_pred = lg.predict(test[features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 3, 4])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.61      0.20      0.31  11067702\n",
      "           2       0.35      0.47      0.40   5716201\n",
      "           3       0.17      0.39      0.23   2642501\n",
      "           4       0.43      0.54      0.48   7004199\n",
      "\n",
      "    accuracy                           0.37  26430603\n",
      "   macro avg       0.39      0.40      0.36  26430603\n",
      "weighted avg       0.46      0.37      0.37  26430603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "#Create a Gaussian Classifier\n",
    "clf=RandomForestClassifier(n_estimators=50)\n",
    "\n",
    "#Train the model using the training sets y_pred=clf.predict(X_test)\n",
    "clf.fit(train[features],train['target'])\n",
    "\n",
    "y_pred=clf.predict(test[features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.92      0.59      0.72  11067702\n",
      "           2       0.57      0.88      0.69   5716201\n",
      "           3       0.60      0.90      0.72   2642501\n",
      "           4       0.70      0.65      0.68   7004199\n",
      "\n",
      "    accuracy                           0.70  26430603\n",
      "   macro avg       0.70      0.76      0.70  26430603\n",
      "weighted avg       0.75      0.70      0.70  26430603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "#Create a Gaussian Classifier\n",
    "model = GaussianNB()\n",
    "\n",
    "# Train the model using the training sets\n",
    "model.fit(train[features],train['target'])\n",
    "\n",
    "#Predict Output\n",
    "y_pred= model.predict(test[features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.61      0.24      0.35  11067702\n",
      "           2       0.36      0.39      0.38   5716201\n",
      "           3       0.14      0.49      0.22   2642501\n",
      "           4       0.43      0.42      0.43   7004199\n",
      "\n",
      "    accuracy                           0.35  26430603\n",
      "   macro avg       0.39      0.39      0.34  26430603\n",
      "weighted avg       0.46      0.35      0.36  26430603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
       "                           learning_rate=0.1, loss='deviance', max_depth=3,\n",
       "                           max_features=None, max_leaf_nodes=None,\n",
       "                           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                           min_samples_leaf=1, min_samples_split=2,\n",
       "                           min_weight_fraction_leaf=0.0, n_estimators=10,\n",
       "                           n_iter_no_change=None, presort='auto',\n",
       "                           random_state=None, subsample=1.0, tol=0.0001,\n",
       "                           validation_fraction=0.1, verbose=0,\n",
       "                           warm_start=False)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "clf = GradientBoostingClassifier(n_estimators=10)\n",
    "# n_estimators = 100 (default)\n",
    "# loss function = deviance(default) used in Logistic Regression\n",
    "clf.fit(train[features],train['target'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred=clf.predict(test[features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.61      0.24      0.35  11067702\n",
      "           2       0.36      0.39      0.38   5716201\n",
      "           3       0.14      0.49      0.22   2642501\n",
      "           4       0.43      0.42      0.43   7004199\n",
      "\n",
      "    accuracy                           0.35  26430603\n",
      "   macro avg       0.39      0.39      0.34  26430603\n",
      "weighted avg       0.46      0.35      0.36  26430603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}