[d8f241]: / Window Size = 1 min, Analysis.ipynb

Download this file

646 lines (645 with data), 18.3 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.1 s, sys: 606 ms, total: 1.71 s\n",
      "Wall time: 4.9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 48.8 s, sys: 7.5 s, total: 56.3 s\n",
      "Wall time: 1min 55s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train = pd.read_csv(\"1_min_train.csv\")\n",
    "test = pd.read_csv(\"1_min_test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2520000, 11)\n",
      "(28950603, 11)\n"
     ]
    }
   ],
   "source": [
    "print(train.shape)\n",
    "print(test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2 µs, sys: 0 ns, total: 2 µs\n",
      "Wall time: 6.68 µs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# # df=df[df.target!=3]\n",
    "# train=train[train.target!=4]\n",
    "# test=test[test.target!=4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['chest_ACC_x',\n",
       " 'chest_ACC_y',\n",
       " 'chest_ACC_z',\n",
       " 'chest_ECG',\n",
       " 'chest_EMG',\n",
       " 'chest_EDA',\n",
       " 'chest_Temp',\n",
       " 'chest_Resp']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features=train.columns.tolist()\n",
    "features = features[3:]\n",
    "features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2520000 entries, 0 to 2519999\n",
      "Data columns (total 11 columns):\n",
      "Unnamed: 0     int64\n",
      "target         int64\n",
      "subject        int64\n",
      "chest_ACC_x    float64\n",
      "chest_ACC_y    float64\n",
      "chest_ACC_z    float64\n",
      "chest_ECG      float64\n",
      "chest_EMG      float64\n",
      "chest_EDA      float64\n",
      "chest_Temp     float64\n",
      "chest_Resp     float64\n",
      "dtypes: float64(8), int64(3)\n",
      "memory usage: 211.5 MB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 74 µs, sys: 10 µs, total: 84 µs\n",
      "Wall time: 88.7 µs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "et = ExtraTreesClassifier(n_estimators=20, n_jobs=10, verbose=2,random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "building tree 1 of 20\n",
      "building tree 2 of 20\n",
      "building tree 3 of 20\n",
      "building tree 4 of 20\n",
      "building tree 5 of 20\n",
      "building tree 6 of 20\n",
      "building tree 7 of 20\n",
      "building tree 8 of 20\n",
      "building tree 9 of 20\n",
      "building tree 10 of 20\n",
      "building tree 11 of 20\n",
      "building tree 12 of 20\n",
      "building tree 13 of 20\n",
      "building tree 14 of 20\n",
      "building tree 15 of 20\n",
      "building tree 16 of 20building tree 17 of 20\n",
      "building tree 18 of 20\n",
      "\n",
      "building tree 19 of 20\n",
      "building tree 20 of 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=10)]: Done  12 out of  20 | elapsed:    3.4s remaining:    2.3s\n",
      "[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:    4.1s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',\n",
       "                     max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "                     min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                     min_samples_leaf=1, min_samples_split=2,\n",
       "                     min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=10,\n",
       "                     oob_score=False, random_state=0, verbose=2,\n",
       "                     warm_start=False)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "et.fit(train[feature],train['target'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.\n",
      "[Parallel(n_jobs=10)]: Done  12 out of  20 | elapsed:   12.1s remaining:    8.0s\n",
      "[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:   14.5s finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 24s, sys: 11.4 s, total: 1min 35s\n",
      "Wall time: 16.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "y_pred=et.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.88      0.64      0.74  11697702\n",
      "           2       0.64      0.83      0.72   6346201\n",
      "           3       0.62      0.80      0.70   3272501\n",
      "           4       0.71      0.75      0.73   7634199\n",
      "\n",
      "    accuracy                           0.73  28950603\n",
      "   macro avg       0.71      0.75      0.72  28950603\n",
      "weighted avg       0.75      0.73      0.73  28950603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "clf = DecisionTreeClassifier()\n",
    "\n",
    "clf = clf.fit(train[feature],train['target'])\n",
    "\n",
    "\n",
    "y_pred = clf.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.93      0.68      0.78  11697702\n",
      "           2       0.69      0.85      0.76   6346201\n",
      "           3       0.57      0.86      0.69   3272501\n",
      "\n",
      "    accuracy                           0.76  21316404\n",
      "   macro avg       0.73      0.80      0.74  21316404\n",
      "weighted avg       0.80      0.76      0.76  21316404\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from sklearn.neighbors import KNeighborsClassifier\n",
    "# classifier = KNeighborsClassifier(n_neighbors=5)\n",
    "# classifier.fit(train[feature],train['target'])\n",
    "# y_pred = classifier.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting xgboost\n",
      "  Downloading https://files.pythonhosted.org/packages/c1/24/5fe7237b2eca13ee0cfb100bec8c23f4e69ce9df852a64b0493d49dae4e0/xgboost-0.90-py2.py3-none-manylinux1_x86_64.whl (142.8MB)\n",
      "\u001b[K    100% |████████████████████████████████| 142.8MB 4.6kB/s eta 0:00:01 1% |▋                               | 2.6MB 42.5MB/s eta 0:00:04    9% |███                             | 13.6MB 26.6MB/s eta 0:00:05    11% |███▊                            | 16.8MB 41.6MB/s eta 0:00:04    31% |██████████                      | 44.9MB 27.5MB/s eta 0:00:04    51% |████████████████▌               | 73.4MB 47.5MB/s eta 0:00:02    63% |████████████████████▍           | 90.8MB 45.1MB/s eta 0:00:02    68% |██████████████████████          | 98.1MB 48.6MB/s eta 0:00:01    85% |███████████████████████████▍    | 122.5MB 49.9MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting numpy (from xgboost)\n",
      "  Downloading https://files.pythonhosted.org/packages/75/92/57179ed45307ec6179e344231c47da7f3f3da9e2eee5c8ab506bd279ce4e/numpy-1.17.1-cp36-cp36m-manylinux1_x86_64.whl (20.4MB)\n",
      "\u001b[K    100% |████████████████████████████████| 20.4MB 34kB/s  eta 0:00:01    66% |█████████████████████▍          | 13.6MB 19.7MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting scipy (from xgboost)\n",
      "  Using cached https://files.pythonhosted.org/packages/29/50/a552a5aff252ae915f522e44642bb49a7b7b31677f9580cfd11bcc869976/scipy-1.3.1-cp36-cp36m-manylinux1_x86_64.whl\n",
      "Installing collected packages: numpy, scipy, xgboost\n",
      "Successfully installed numpy-1.17.1 scipy-1.3.1 xgboost-0.90\n"
     ]
    }
   ],
   "source": [
    "!pip3 install xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 1s, sys: 4.97 s, total: 1min 6s\n",
      "Wall time: 1min 6s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from xgboost import XGBClassifier\n",
    "import xgboost as xgb\n",
    "from sklearn.metrics import mean_squared_error\n",
    "xg_reg = xgb.XGBClassifier(n_estimators = 10)\n",
    "xg_reg.fit(train[feature],train['target'])\n",
    "\n",
    "y_pred = xg_reg.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.85      0.64      0.73  11697702\n",
      "           2       0.59      0.77      0.67   6346201\n",
      "           3       0.45      0.57      0.50   3272501\n",
      "\n",
      "    accuracy                           0.67  21316404\n",
      "   macro avg       0.63      0.66      0.63  21316404\n",
      "weighted avg       0.71      0.67      0.68  21316404\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sf/.local/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "/home/sf/.local/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
      "  \"this warning.\", FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "lg = LogisticRegression()\n",
    "\n",
    "lg.fit(train[feature],train['target'])\n",
    "\n",
    "y_pred = lg.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sf/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.00      0.00      0.00  11697702\n",
      "           2       0.00      0.00      0.00   6346201\n",
      "           3       0.00      0.00      0.00   3272501\n",
      "           4       0.26      1.00      0.42   7634199\n",
      "\n",
      "    accuracy                           0.26  28950603\n",
      "   macro avg       0.07      0.25      0.10  28950603\n",
      "weighted avg       0.07      0.26      0.11  28950603\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "#Create a Gaussian Classifier\n",
    "clf=RandomForestClassifier(n_estimators=10)\n",
    "\n",
    "#Train the model using the training sets y_pred=clf.predict(X_test)\n",
    "clf.fit(train[feature],train['target'])\n",
    "\n",
    "y_pred=clf.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.96      0.74      0.84  11697702\n",
      "           2       0.72      0.94      0.81   6346201\n",
      "           3       0.74      0.90      0.81   3272501\n",
      "\n",
      "    accuracy                           0.83  21316404\n",
      "   macro avg       0.81      0.86      0.82  21316404\n",
      "weighted avg       0.85      0.83      0.83  21316404\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "#Create a Gaussian Classifier\n",
    "model = GaussianNB()\n",
    "\n",
    "# Train the model using the training sets\n",
    "model.fit(train[feature],train['target'])\n",
    "\n",
    "#Predict Output\n",
    "y_pred= model.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.57      0.47      0.52  11697702\n",
      "           2       0.35      0.20      0.26   6346201\n",
      "           3       0.13      0.33      0.19   3272501\n",
      "\n",
      "    accuracy                           0.37  21316404\n",
      "   macro avg       0.35      0.33      0.32  21316404\n",
      "weighted avg       0.44      0.37      0.39  21316404\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "clf = GradientBoostingClassifier()\n",
    "# n_estimators = 100 (default)\n",
    "# loss function = deviance(default) used in Logistic Regression\n",
    "clf.fit(train[feature],train['target'])\n",
    "clf.predict(test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(test['target'], y_pred))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}