[dd6cc2]: / generating_data_for_classical_ml.ipynb

Download this file

2637 lines (2636 with data), 89.2 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import librosa\n",
    "import wave as wav\n",
    "import tensorflow as tf\n",
    "import scipy\n",
    "import matplotlib.pyplot as plt\n",
    "import librosa.display\n",
    "import IPython.display as ipd\n",
    "from sklearn import metrics\n",
    "from sklearn.model_selection import cross_validate\n",
    "import os\n",
    "import statistics\n",
    "import math\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import graphviz\n",
    "from sklearn import tree\n",
    "\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import confusion_matrix, classification_report\n",
    "import sys\n",
    "\n",
    "from sklearn.metrics import plot_confusion_matrix\n",
    "import seaborn as sns \n",
    "import matplotlib.pyplot as plt\n",
    "import operator as op\n",
    "from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict\n",
    "from entropy import *\n",
    "from random import shuffle\n",
    "\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: xgboost in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (1.0.2)\n",
      "Requirement already satisfied: scipy in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (from xgboost) (1.4.1)\n",
      "Requirement already satisfied: numpy in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (from xgboost) (1.18.1)\n",
      "\u001b[33mWARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.\n",
      "You should consider upgrading via the '/gpfs/hpc/home/rannilo/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install xgboost\n",
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "root = \"respiratory_sound_database/\"\n",
    "patient_diagnosis = pd.read_csv(root+\"patient_diagnosis.csv\", names=[\"patient\", \"diagnosis\"])\n",
    "demographic_info = pd.read_csv(root+\"demographic_info.txt\", delimiter=\" \", names=[\"patient\", \"age\", \"sex\", \"bmi\", \"weight\", \"height\"])\n",
    "\n",
    "train_patients = pd.read_csv(root + \"train_patients.csv\")\n",
    "test_patients = pd.read_csv(root + \"test_patients.csv\")\n",
    "\n",
    "train_patients.set_index(\"patient\", inplace=True)\n",
    "test_patients.set_index(\"patient\", inplace=True)\n",
    "patient_diagnosis.set_index(\"patient\", inplace=True)\n",
    "demographic_info.set_index(\"patient\", inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: [101, 102, 103, 104, 105]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_patients.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>185</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>186</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>187</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>188</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>189</th>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: [185, 186, 187, 188, 189]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_patients.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>Asthma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        diagnosis\n",
       "patient          \n",
       "101          URTI\n",
       "102       Healthy\n",
       "103        Asthma\n",
       "104          COPD\n",
       "105          URTI"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "patient_diagnosis.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>99.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.8</td>\n",
       "      <td>73.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>28.47</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>135.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           age sex    bmi  weight  height\n",
       "patient                                  \n",
       "101       3.00   F    NaN    19.0    99.0\n",
       "102       0.75   F    NaN     9.8    73.0\n",
       "103      70.00   F  33.00     NaN     NaN\n",
       "104      70.00   F  28.47     NaN     NaN\n",
       "105       7.00   F    NaN    32.0   135.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "demographic_info.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.8</td>\n",
       "      <td>73.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Asthma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>28.47</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>222</th>\n",
       "      <td>60.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>223</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>224</th>\n",
       "      <td>10.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.3</td>\n",
       "      <td>143.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>225</th>\n",
       "      <td>0.83</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.8</td>\n",
       "      <td>74.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>226</th>\n",
       "      <td>4.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.7</td>\n",
       "      <td>103.0</td>\n",
       "      <td>Pneumonia</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>126 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           age  sex    bmi  weight  height  diagnosis\n",
       "patient                                              \n",
       "101       3.00    F    NaN    19.0    99.0       URTI\n",
       "102       0.75    F    NaN     9.8    73.0    Healthy\n",
       "103      70.00    F  33.00     NaN     NaN     Asthma\n",
       "104      70.00    F  28.47     NaN     NaN       COPD\n",
       "105       7.00    F    NaN    32.0   135.0       URTI\n",
       "...        ...  ...    ...     ...     ...        ...\n",
       "222      60.00    M    NaN     NaN     NaN       COPD\n",
       "223        NaN  NaN    NaN     NaN     NaN       COPD\n",
       "224      10.00    F    NaN    32.3   143.0    Healthy\n",
       "225       0.83    M    NaN     7.8    74.0    Healthy\n",
       "226       4.00    M    NaN    16.7   103.0  Pneumonia\n",
       "\n",
       "[126 rows x 6 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.concat([demographic_info, patient_diagnosis], axis=1)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.00</td>\n",
       "      <td>99.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.80</td>\n",
       "      <td>73.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Asthma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>28.47</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.00</td>\n",
       "      <td>135.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>73.00</td>\n",
       "      <td>F</td>\n",
       "      <td>21.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>75.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.70</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>3.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LRTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>84.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.53</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>75.00</td>\n",
       "      <td>M</td>\n",
       "      <td>25.21</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>63.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Bronchiectasis</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>60.00</td>\n",
       "      <td>M</td>\n",
       "      <td>22.86</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>58.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.41</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>77.00</td>\n",
       "      <td>M</td>\n",
       "      <td>23.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>0.58</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.14</td>\n",
       "      <td>64.0</td>\n",
       "      <td>LRTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>56.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Bronchiectasis</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>68.00</td>\n",
       "      <td>M</td>\n",
       "      <td>24.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>81.00</td>\n",
       "      <td>M</td>\n",
       "      <td>36.76</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>2.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15.20</td>\n",
       "      <td>94.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>78.00</td>\n",
       "      <td>M</td>\n",
       "      <td>35.14</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>13.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.00</td>\n",
       "      <td>170.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>122</th>\n",
       "      <td>66.00</td>\n",
       "      <td>M</td>\n",
       "      <td>33.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Pneumonia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>5.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25.00</td>\n",
       "      <td>125.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>65.00</td>\n",
       "      <td>M</td>\n",
       "      <td>29.07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125</th>\n",
       "      <td>14.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.00</td>\n",
       "      <td>170.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>126</th>\n",
       "      <td>1.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10.18</td>\n",
       "      <td>80.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>2.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12.60</td>\n",
       "      <td>98.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>65.00</td>\n",
       "      <td>F</td>\n",
       "      <td>24.30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>129</th>\n",
       "      <td>6.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.00</td>\n",
       "      <td>119.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130</th>\n",
       "      <td>85.00</td>\n",
       "      <td>F</td>\n",
       "      <td>17.10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>3.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.00</td>\n",
       "      <td>97.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>132</th>\n",
       "      <td>71.00</td>\n",
       "      <td>M</td>\n",
       "      <td>34.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133</th>\n",
       "      <td>68.00</td>\n",
       "      <td>M</td>\n",
       "      <td>27.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>134</th>\n",
       "      <td>61.00</td>\n",
       "      <td>M</td>\n",
       "      <td>32.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>135</th>\n",
       "      <td>70.00</td>\n",
       "      <td>M</td>\n",
       "      <td>21.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Pneumonia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>136</th>\n",
       "      <td>5.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.20</td>\n",
       "      <td>110.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>137</th>\n",
       "      <td>4.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18.00</td>\n",
       "      <td>104.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>138</th>\n",
       "      <td>56.00</td>\n",
       "      <td>F</td>\n",
       "      <td>21.60</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>61.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.68</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>79.00</td>\n",
       "      <td>F</td>\n",
       "      <td>23.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Pneumonia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>66.00</td>\n",
       "      <td>M</td>\n",
       "      <td>22.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>78.00</td>\n",
       "      <td>M</td>\n",
       "      <td>26.10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>0.25</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.24</td>\n",
       "      <td>68.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>3.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.70</td>\n",
       "      <td>100.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>69.00</td>\n",
       "      <td>M</td>\n",
       "      <td>23.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>67.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>77.00</td>\n",
       "      <td>M</td>\n",
       "      <td>25.70</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>4.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.00</td>\n",
       "      <td>110.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>0.67</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.50</td>\n",
       "      <td>70.0</td>\n",
       "      <td>Bronchiolitis</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>0.67</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.12</td>\n",
       "      <td>74.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           age sex    bmi  weight  height       diagnosis\n",
       "patient                                                  \n",
       "101       3.00   F    NaN   19.00    99.0            URTI\n",
       "102       0.75   F    NaN    9.80    73.0         Healthy\n",
       "103      70.00   F  33.00     NaN     NaN          Asthma\n",
       "104      70.00   F  28.47     NaN     NaN            COPD\n",
       "105       7.00   F    NaN   32.00   135.0            URTI\n",
       "106      73.00   F  21.00     NaN     NaN            COPD\n",
       "107      75.00   F  33.70     NaN     NaN            COPD\n",
       "108       3.00   M    NaN     NaN     NaN            LRTI\n",
       "109      84.00   F  33.53     NaN     NaN            COPD\n",
       "110      75.00   M  25.21     NaN     NaN            COPD\n",
       "111      63.00   M  28.40     NaN     NaN  Bronchiectasis\n",
       "112      60.00   M  22.86     NaN     NaN            COPD\n",
       "113      58.00   M  28.41     NaN     NaN            COPD\n",
       "114      77.00   M  23.12     NaN     NaN            COPD\n",
       "115       0.58   M    NaN    7.14    64.0            LRTI\n",
       "116      56.00   M  28.58     NaN     NaN  Bronchiectasis\n",
       "117      68.00   M  24.40     NaN     NaN            COPD\n",
       "118      81.00   M  36.76     NaN     NaN            COPD\n",
       "119       2.00   F    NaN   15.20    94.0            URTI\n",
       "120      78.00   M  35.14     NaN     NaN            COPD\n",
       "121      13.00   F    NaN   65.00   170.0         Healthy\n",
       "122      66.00   M  33.00     NaN     NaN       Pneumonia\n",
       "123       5.00   M    NaN   25.00   125.0         Healthy\n",
       "124      65.00   M  29.07     NaN     NaN            COPD\n",
       "125      14.00   M    NaN   62.00   170.0         Healthy\n",
       "126       1.00   F    NaN   10.18    80.0         Healthy\n",
       "127       2.00   M    NaN   12.60    98.0         Healthy\n",
       "128      65.00   F  24.30     NaN     NaN            COPD\n",
       "129       6.00   M    NaN   23.00   119.0            URTI\n",
       "130      85.00   F  17.10     NaN     NaN            COPD\n",
       "131       3.00   M    NaN   14.00    97.0            URTI\n",
       "132      71.00   M  34.00     NaN     NaN            COPD\n",
       "133      68.00   M  27.40     NaN     NaN            COPD\n",
       "134      61.00   M  32.00     NaN     NaN            COPD\n",
       "135      70.00   M  21.00     NaN     NaN       Pneumonia\n",
       "136       5.00   M    NaN   16.20   110.0         Healthy\n",
       "137       4.00   M    NaN   18.00   104.0            URTI\n",
       "138      56.00   F  21.60     NaN     NaN            COPD\n",
       "139      61.00   M  28.68     NaN     NaN            COPD\n",
       "140      79.00   F  23.00     NaN     NaN       Pneumonia\n",
       "141      66.00   M  22.40     NaN     NaN            COPD\n",
       "142      78.00   M  26.10     NaN     NaN            COPD\n",
       "143       0.25   F    NaN    8.24    68.0         Healthy\n",
       "144       3.00   M    NaN   16.70   100.0         Healthy\n",
       "145      69.00   M  23.40     NaN     NaN            COPD\n",
       "146      67.00   M  28.00     NaN     NaN            COPD\n",
       "147      77.00   M  25.70     NaN     NaN            COPD\n",
       "148       4.00   M    NaN   33.00   110.0            URTI\n",
       "149       0.67   M    NaN    9.50    70.0   Bronchiolitis\n",
       "150       0.67   F    NaN    8.12    74.0            URTI"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "age           1\n",
      "sex           1\n",
      "bmi          51\n",
      "weight       82\n",
      "height       84\n",
      "diagnosis     0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "#Number of missing values in each column\n",
    "data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Dropping patients whose age or sex is missing (one patient in this dataset).\n",
    "#subset= names the columns explicitly, so the rule does not depend on how many\n",
    "#other fields of the row happen to be filled in.\n",
    "data.dropna(subset=[\"age\", \"sex\"], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "age           0\n",
      "sex           0\n",
      "bmi          50\n",
      "weight       81\n",
      "height       83\n",
      "diagnosis     0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "#Missing values per column after removing the patient without age and sex\n",
    "data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age          125\n",
       "sex          125\n",
       "bmi           75\n",
       "weight        44\n",
       "height        42\n",
       "diagnosis    125\n",
       "dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5, 1.0, \"Count of patients' diagnoses\")"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAF+CAYAAABauMmmAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de5RkZX3u8e8jA4IIImEkCMIQQQkGAR0NqDFGY6IxCjGK4g09KEmOFzyeeAJZHqPGKJpExUuyQkQlSozGgKAkHglKvIE6CMrViAhyZ0CQi4oCv/PH3g1F0z3TzFTV273n+1mrV9e+VNdvd8/UU+/e737fVBWSJGm67tO6AEmSNkQGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDRjA0hgl+YMklya5OcneE36tHfvX2WiSr7M+kjwpyWUjy+cmeVLDkqRFwwDWopTkBUlW9QFzZZL/SPKEKbxuJdllPX7E3wCvqqr7V9WZ46oLIMnFSX57Zrmqfti/zu1j+NmnJnn5Avf9SJKXrsvrVNUjqurUdXmuNDQGsBadJK8D3gO8DdgW2BH4O2C/lnUt0E7Aua2LkLT4GcBaVJI8AHgL8MqqOq6qbqmqX1TVZ6rq9f0+903yniRX9F/vSXLffttLk3xl1s+8s1Xbt94+kOSkJDcl+XqSh/bbvtQ/5dt9y/t5c9R3nyRvSHJJkmuS/FOSB/Q13Qxs1D//+/McXyV5TZKLklyb5K+T3Kff9tAkX0hyXb/t2CRb9ds+SvdB5DN9bf8nyYr+5y2b+d0lObo/Y3B5krfOnJ6e+b0k+Zsk1yf5QZKn99v+CvgN4P39z35/Ou/uj/HGJGcn+bUF/P0263/H1yc5D3jMrO13tuKTPDbJaUlu6Gt+f5JNRvb9nSTfTfLjJH+X5L9mWulrOp5++4OTnJjkR0kuTPKKkW2P7c+u3Jjk6iTvGtm2T5Kv9TV9e/R0ef+aF/X/bn6Q5IVr+31Ia1RVfvm1aL6ApwG3AcvWsM9bgNOBBwHLga8Bf9lveynwlVn7F7BL//gjwHXAY4FlwLHAv8y17zyv/T+AC4FfAe4PHAd89F48v4AvAlvTBep/Ay/vt+0CPBW4b39cXwLeM/Lci4HfHlle0f+8Zf3y8cA/AJv3v5tvAH808nv5BfAKug8JfwJcAaTffupMHf3y7wJnAFsBAX4V2G4Bf78jgC/3x/cQ4BzgsrmOAXg0sE//d1gBnA+8tt+2DXAj8Ox++6F9/S9f4PF8ie6syabAXsBq4Mn9ttOAF/eP7w/s0z/evv+38Xt0jZOn9svL+9/pjcDD+323Ax7R+v+LX0v7yxawFptfAq6tqtvWsM8LgbdU1TVVtRp4M/Die/Eax1fVN/rXOJbuDXqhXgi8q6ouqqqbgcOB58+0QhfoHVX1o6r6Id2p9gMBqurCqjq5qm7tj+tdwG8u5Acm2ZYuOF5b3VmDa4B3A88f2e2SqvrH6q4ZH0MXItvO8yN/AWwB7EYXaudX1ZULKOUA4K/647sUeO98O1bVGVV1elXdVlUX0314mDne3wPOre4syG39z7lq1o+Y83iSPAR4PPBnVfWzqjoL+CDwkpFj2yXJNlV1c1Wd3q9/EfDvVfXvVXVHVZ0MrOprAbgD+LUkm1XVlVXlpQatFwNYi811wDZrCbQHA5eMLF/Sr1uo0Tfyn9C1ghZqrtdexvxBNpdLZz3/wdCFaJJ/6U8f3wh8jK4luBA7ARsDV/anT2+gC7QHjexz53FX1U/6h3Mee1V9AXg/8AHgmiRHJdlyAXU8mHse35ySPCzJZ5Nc1R/v27jreO/2c6qqgMtm/Yj5jufBwI+q6qZZdWzfPz4YeBhwQZJvJvn9fv1OwHNnfn/97/AJdC3/W4DnAX9M9zs+Kclua/xNSGthAGuxOQ24Fdh/DftcQfdmOWPHfh3ALcD9ZjYk+eUx1zfXa98GXH0vfsZD
Zj1/pva30Z1S3qOqtqRrkWVk3zVNXXYp3e9tm6raqv/asqoescCa7vGzq+q9VfVoYHe6wHr9An7Oldzz+Obz98AFwK798f45dx3vlcAOMzsmyejyWlwBbJ1ki1l1XA5QVd+rqgPpPpy8A/hUks3pfocfHfn9bVVVm1fVEf3z/l9VPZWupX0B8I8LrEeakwGsRaWqfgy8EfhAkv2T3C/JxkmenuSd/W4fB96QZHmSbfr9P9Zv+zbwiCR7JdkUeNO9LOFquuu78/k48L+S7Jzk/nSh+Ym1nDKf7fVJHtifKj0U+ES/fgvgZuDHSbbnnoE3b2396eHPA3+bZMt0ncUemmRBp7Bn/+wkj0ny60k2pvtQ8zO6U7Br80ng8P74dgBevYZ9t6C7rnpz35r8k5FtJwF79P8GlgGvBBb0Yao/9f014O1JNk3ySLpW78f6Y3tRkuVVdQdwQ/+0O/rtz0zyu0k26p/7pCQ79Gcn9uuD+la6v9NCfh/SvAxgLTpV9bfA64A30HWeuRR4FfDpfpe30l2b+w5wNvCtfh1V9d90nbT+E/gecLce0QvwJuCY/hTkAXNs/xDwUbpOPj+gC6Y1hcxcTqDr4HQWXdAc3a9/M/Ao4Mf9+uNmPe/tdB88bkjyp3P83JcAmwDnAdcDn6JrrS3EkcBz+h7F7wW2pGvhXU93+vY64K8X8HPe3O//A7oPBB9dw75/CrwAuKl/rZkPIlTVtcBzgXf2r7073d/81gUez4F0HbuuoOuc9hdV9Z/9tqcB56brtX4k8Pyq+mkf3PvRtcRn/t29nu598j50/yavAH5Ed6169AODdK/N9BiUNAVJiu6U64Wta1lK0t2qdRnwwqr6Yut6pHGwBSxpUepPBW+V7h7vmevDp6/ladKSYQBLWqz2Bb4PXAs8E9i/qn7atiRpfDwFLUlSA7aAJUlqwACWJKmBezN83nrbZpttasWKFdN8SUmSmjnjjDOurarlc22bagCvWLGCVatWTfMlJUlqJsm8w7F6ClqSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqYKpjQa+rFYedNNXXu/iIZ0z19SRJGx5bwJIkNWAAS5LUgAEsSVIDBrAkSQ0YwJIkNWAAS5LUgAEsSVIDCwrgJFsl+VSSC5Kcn2TfJFsnOTnJ9/rvD5x0sZIkDcVCW8BHAp+rqt2APYHzgcOAU6pqV+CUflmSJC3AWgM4yQOAJwJHA1TVz6vqBmA/4Jh+t2OA/SdVpCRJQ7OQFvDOwGrgw0nOTPLBJJsD21bVlf0+VwHbTqpISZKGZiEBvAx4FPD3VbU3cAuzTjdXVQE115OTHJJkVZJVq1evXt96JUkahIUE8GXAZVX19X75U3SBfHWS7QD679fM9eSqOqqqVlbVyuXLl4+jZkmSlry1BnBVXQVcmuTh/aqnAOcBJwIH9esOAk6YSIWSJA3QQqcjfDVwbJJNgIuAl9GF9yeTHAxcAhwwmRIlSRqeBQVwVZ0FrJxj01PGW44kSRsGR8KSJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKmBZQvZKcnFwE3A7cBtVbUyydbAJ4AVwMXAAVV1/WTKlCRpWO5NC/i3qmqvqlrZLx8GnFJVuwKn9MuSJGkB1ucU9H7AMf3jY4D9178cSZI2DAsN4AI+n+SMJIf067atqiv7x1cB2871xCSHJFmVZNXq1avXs1xJkoZhQdeAgSdU1eVJHgScnOSC0Y1VVUlqridW1VHAUQArV66ccx9JkjY0C2oB
V9Xl/fdrgOOBxwJXJ9kOoP9+zaSKlCRpaNYawEk2T7LFzGPgd4BzgBOBg/rdDgJOmFSRkiQNzUJOQW8LHJ9kZv9/rqrPJfkm8MkkBwOXAAdMrkxJkoZlrQFcVRcBe86x/jrgKZMoSpKkoXMkLEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYWHMBJNkpyZpLP9ss7J/l6kguTfCLJJpMrU5KkYbk3LeBDgfNHlt8BvLuqdgGuBw4eZ2GSJA3ZggI4yQ7AM4AP9ssBngx8qt/lGGD/SRQoSdIQLbQF/B7g/wB39Mu/BNxQVbf1y5cB28/1xCSHJFmVZNXq1avXq1hJkoZirQGc5PeBa6rqjHV5gao6qqpWVtXK5cuXr8uPkCRpcJYtYJ/HA89K8nvApsCWwJHAVkmW9a3gHYDLJ1emJEnDstYWcFUdXlU7VNUK4PnAF6rqhcAXgef0ux0EnDCxKiVJGpj1uQ/4z4DXJbmQ7prw0eMpSZKk4VvIKeg7VdWpwKn944uAx46/JEmShs+RsCRJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhpYawAn2TTJN5J8O8m5Sd7cr985ydeTXJjkE0k2mXy5kiQNw0JawLcCT66qPYG9gKcl2Qd4B/DuqtoFuB44eHJlSpI0LGsN4Orc3C9u3H8V8GTgU/36Y4D9J1KhJEkDtKBrwEk2SnIWcA1wMvB94Iaquq3f5TJg+3mee0iSVUlWrV69ehw1S5K05C0ogKvq9qraC9gBeCyw20JfoKqOqqqVVbVy+fLl61imJEnDcq96QVfVDcAXgX2BrZIs6zftAFw+5tokSRqshfSCXp5kq/7xZsBTgfPpgvg5/W4HASdMqkhJkoZm2dp3YTvgmCQb0QX2J6vqs0nOA/4lyVuBM4GjJ1inJEmDstYArqrvAHvPsf4iuuvBkiTpXnIkLEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKmBhUzGoAlbcdhJU3uti494xtReS5I0P1vAkiQ1YABLktSAASxJUgMGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDaw1gJM8JMkXk5yX5Nwkh/brt05ycpLv9d8fOPlyJUkahoW0gG8D/ndV7Q7sA7wyye7AYcApVbUrcEq/LEmSFmCtAVxVV1bVt/rHNwHnA9sD+wHH9LsdA+w/qSIlSRqae3UNOMkKYG/g68C2VXVlv+kqYNt5nnNIklVJVq1evXo9SpUkaTgWHMBJ7g/8G/DaqrpxdFtVFVBzPa+qjqqqlVW1cvny5etVrCRJQ7GgAE6yMV34HltVx/Wrr06yXb99O+CayZQoSdLwLKQXdICjgfOr6l0jm04EDuofHwScMP7yJEkapmUL2OfxwIuBs5Oc1a/7c+AI4JNJDgYuAQ6YTImSJA3PWgO4qr4CZJ7NTxlvOZIkbRgcCUuSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYM
YEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqYFnrAjRsKw47aaqvd/ERz5jq60nSurIFLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDVgAEuS1MBaAzjJh5Jck+SckXVbJzk5yff67w+cbJmSJA3LQlrAHwGeNmvdYcApVbUrcEq/LEmSFmitAVxVXwJ+NGv1fsAx/eNjgP3HXJckSYO2rteAt62qK/vHVwHbjqkeSZI2COvdCauqCqj5tic5JMmqJKtWr169vi8nSdIgrGsAX51kO4D++zXz7VhVR1XVyqpauXz58nV8OUmShmVdA/hE4KD+8UHACeMpR5KkDcNCbkP6OHAa8PAklyU5GDgCeGqS7wG/3S9LkqQFWra2HarqwHk2PWXMtUiStMFwJCxJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpgbXeByxpfisOO2mqr3fxEc+Y6utJmhxbwJIkNWAAS5LUgAEsSVIDXgOWNK+hX+Me+vFpcbMFLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDXgfMCSNFDTnO/YuY7vPVvAkiQ1YABLktSAASxJUgMGsCRJDaxXJ6wkTwOOBDYCPlhVR4ylKkmS1mCaHcxgMp3M1rkFnGQj4APA04HdgQOT7D6uwiRJGrL1OQX9WODCqrqoqn4O/Auw33jKkiRp2NYngLcHLh1ZvqxfJ0mS1iJVtW5PTJ4DPK2qXt4vvxj49ap61az9DgEO6RcfDnx33cu917YBrp3i603bkI9vyMcGHt9S5/EtXdM+tp2qavlcG9anE9blwENGlnfo191NVR0FHLUer7POkqyqqpUtXnsahnx8Qz428PiWOo9v6VpMx7Y+p6C/CeyaZOckmwDPB04cT1mSJA3bOreAq+q2JK8C/h/dbUgfqqpzx1aZJEkDtl73AVfVvwP/PqZaJqHJqe8pGvLxDfnYwONb6jy+pWvRHNs6d8KSJEnrzqEoJUlqwACWJKkBA1iSpAbWqxPWYpLk4XQDfuzWrzof+MeqmubAHxOX5JnASVV1R+taxiXJo9a0vaq+Na1aJinJ44GzquqWJC8CHgUcWVWXNC5tLJJsDvy0qu5I8jC6/4v/UVW/aFya1iDJs9e0vaqOm1Ytk5LkgcCuwKYz66rqS+0q6gyiE1aSfYHjgH8AzgQC7A28Anh2VZ3esLyxSvIxYF/g3+hu/bqgcUnrLckX17C5qurJUytmgpJ8B9gTeCTwEeCDwAFV9Zst6xqXJGcAvwE8EPgq3VgBP6+qFzYtbEyS7AO8D/hVYBO62y9vqaotmxa2npJ8eA2bq6r+x9SKmYAkLwcOpRss6ixgH+C0xfC+MpQA/g/gHVV16qz1vwkcVlVPb1LYhCTZEjgQeBlQwIeBj1fVTU0LW0dJ9q2q01rXMWlJvlVVj0ryRuDyqjp6Zl3r2sZh5PheDWxWVe9MclZV7dW6tnFIsopuwKF/BVYCLwEeVlWHNy1sPSV59hBaufNJcjbwGOD0qtoryW7A26pqjS3/aRjKNeCHzg5fgKr6L+BXpl/OZFXVjcCn6Gag2g74A+Bb/RvfUvSB1gVMyU1JDgdeBJyU5D7Axo1rGqf0Z6NeCMxM1rpRw3rGrqouBDaqqtur6sPA01rXNAZvaF3AhP2sqn4GkOS+/VnDhzeuCRjONeA1tfxumVoVU5DkWXQt312AfwIeW1XXJLkfcB7dKbKlJq0LmJLnAS8ADq6qq5LsCPx145rG6bXA4cDxVXVukl8B1nR5Yan5ST/s7llJ3glcyXAaMUN2WZKtgE8DJye5HlgU/S6Gcgr6GrrW4D020V1j23bKJU1MkmOAo+fqQJDkKVV1
SoOy1kuSG4B5O0RU1bOmWI40pyQ7AdfQnbX4X8ADgL/rW8VLVpKfAHMdQ+iuAT9yyiVNTH9Z8gHA5/p57NvWM5AAPmhN26vqmGnVonsvyfeAl8+3vb+UsGQl+UpVPSHJTXTX7O/cRPcGt9Q78bynql6b5DPc/fgAP0AtdknOBX5vvu1D6KXf94J+CCNnfRfD3RWDCGCAJMuBnYALq+qG1vVMSn/LwDuAB9G9gS/5N/EhdUTaECV5dFWd0bcu7mEAH6A+WVUH9J155vqAsaRbiEnOrKq9W9cxKUn+EngpcBEwc/vmori7YhDXgPtu5m8Dvg/snOSQqhrq1IjvBJ5ZVee3LmSMLm5dwDQkeShwWVXdmuRJdLcj/dNS/8BYVWf0D/eqqiNHtyU5FFjSAUx3CwvA7zetYnK+2rqACTuArqNu81POsw2lA8FrgUdU1b7A4+g6ggzV1QMLX+a7HSDJU5OcPO16JujfgNuT7EI3I8tDgH9uW9JYzXUp6KXTLmLcqurK/vslc321rm99VdWrkmyUZJuZdUk2SXJIkiG815wDbNW6iLkMogVMd7P/aoCquijJfVsXNG4jo9WsSvIJuh59t85sX8r38SX5LbpBVB5Md1zvoLu3OcBfNSxt3O7o59H+A+B9VfW+JGe2Lmp9JTmQrnf3zklGzzxtAfyoTVXjN8TLPwBJnkf3gfCWvj/GXwEfohtIZQiDqLwdODPJOdz9PbN534ShBPAOSd4733JVvaZBTeP2zJHHPwF+Z2S56EYCW6reRTeM6GnA0/vvh1XV+5tWNX6/6MPqIO76ew7hPuCv0d2Ssw3wtyPrbwK+06SiyRji5R+A/ws8uqou7IeFPQ14TlV9pnFd43IM3Qens7nrGvCiMIhOWBtSL+gkj6+qr65t3VIyuxNWku9W1aK4UX6ckuwO/DHdMHgfT7Iz3W1y72hcmhYgyVer6vGt6xi3Of7/nVNVv9aypnFK8s2qekzrOuYyiAAeleT+AFV1c+taJmGuHsNLvRdxkouAPx1Z9Tejy0v59PqGYOi3Wc1IciTwywzo8g9AksvozkLNeN3oclW96x5PWkKSvIvu73Uid/+7eRvSuCT5E7rOV5v3q26mGx/679pVNT79EH+Po+tw9u6RTVsCf1BVezYpbAz6weBnv3HPWPKDwc9Isivd9ajdufusLIMbLnWI5pm0YMn/+0zyF2vYXFX1lqkVMwHzTPbibUjjkuQNdOH0pKq6qF/3K8CRSbauqrc2LXA8NgHuT/c322Jk/Y3Ac5pUND7nzFq+A7gW+EpV/aBBPZPyYeAv6D5A/RbdkKJL/k6EJFuvaXtVDaIjVlW9rHUNk1BVb55vW5LXTrOWSaiq32pdw3wG0QJO8l1gz5kBt0fWbwZ8u6oe1qay8Uuy0xBufRg1zyfwrYHfBd5UVXMNM7rkJDmjqh6d5Oyq2mN0Xeva1keSH9CdwZhrTO8aSgs/yQ50Y63PXAf+MnBoVV3WrqrJSvLDqtqxdR3ro78r5g+BFdx9JKzmLftBtIDp/pP/bI6VP02yqHq9ravRYf6Se77PLYYu9etqvk/gfcvqP5l7nO+l6NZ+BqTvJXkVcDndWY0lrap2bl3DlHyY7r7t5/bLL+rXPbVZRZM3hIlSTgB+DJzByDXgxWAoAXz5XBMRJHkK3e0RQ/A3rQuYtqr6Ueb6tLF0HQrcD3gN8Jd0p6Ff0rSiMetn63piv3hqVX22ZT1jtry6KQhnfGQIp2jXYumfIoUdqmpRThs5lAB+DXBCkq/QfcqBbsLsxwP7NatqjJb6eLrroh+g4/rWdYzRiqr6Jl0HwZcBJHku8PWmVY1JkiPoJj4/tl91aJLHVdWfNyxrnK5L8iLg4/3ygcB1DesZizl6r9+5CdhsyuVMwteS7FFVZ7cuZLahXAPehe72gIcBj+hXnwd8F7iyqr7fqrZxG2JP2nkGud8auAJ4SXUTaC95Q7yFbFSS79CNB31Hv7wRcOZSn6xgRj8d
4fuAfen+vX4NeE1V/bBpYZrTyPvKMmBXuskYbmURTbM4lBbwe4DDq+pDoyuT7NFve+acz1qahtiTdvYg9wVcV1W3tChm3JI8nW66t+1njdi2JXBbm6omZivuGn7yAS0LGbe+8+OS7WuxAVr0k2cMJYC3nev0QlWdnWTF9MuZqM2q6pQk6d8Q3pTkDOCNrQtbV0Pr1T2HK4BVdG/eZ4ysv4luYvehmBlz94t0rYwnAoe1LWl8+pHLXs09e9MayovQzPtKko9W1YtHtyX5KPDiOZ84RUMJ4DXNdDGEaxijBtmTdsiq6tvAt5McD9xSVbfDnadoBzNxSD+85ql014EB/qyqrmpY0rh9Gjga+AyLbExhrdEjRhf6/3eL4ta/pX7qcsaqJK+YvbKfJ/iMOfZfykZ70j6a7laINY6FrUXj89z9A+FmdLdZLWlJduu/PwrYDris/3pwv24oflZV762qL1bVf818tS5Kc0tyeN/B7JFJbuy/bgKuobs1qbmhdMLaFjge+Dl37wW9Cd0wjUP6FA5AkvtV1U9a16GFS3JWVe21tnVLTZKjquqQxTzk3zgkeQFdZ57Ps8jGFNb8kry9qhblHPGDOAVdVVcDj+tvW5mZxeOkqvpCw7Imoh8T+mi60847JtkT+KOq+p9tK9MC3JLkUTNv2EkeDfy0cU3rraoO6b8v2iH/xmQPuuuGT+auU9DVL2vx+mySzavqlv42skcBRy6GvieDaAFvSJJ8nW7s5xOrau9+3aCmDxuqJI+hG9XrCrpOSr8MPK+qBnOZJMnjuGcnpX9qVtAYJbkQ2L2qft66Fi1cf3vcnsAjgY8AH6SbBvQ3W9YFA2kBb2iq6tJZA0Td3qoWLVxVfbO/Xjoz1/F3q+oXLWsap75n6UOBs7jr32QBgwhguklDtqK7hqil47aqqiT7Ae+vqqOTHNy6KDCAl6JL+1ZGJdmYrlPW+Y1r0gIkuR/dXKs7VdUrkuya5OEDGq5xJV0Lcain1bYCLkjyTe5+DdjbkBa3m5IcTnf54Df6u0gWRfYtiiJ0r/wxcCSwPd0tSJ8HXtm0Ii3Uh+k6Ce7bL18O/CswlAA+h+60+lDGX59tTfPmavF6HvAC4GVVdVWSJ3LXvPFNeQ1YmpIkq8Vc35YAAAWYSURBVKpqZZIzR67ff7uq9mxd2/oYmalrC2Av4BvYQtQikmRvuhB+LvAD4Liqel/bqmwBLxlJ3scaZiapqtdMsRytm5/3c1TPTCv5UBbZ9GjraIOYqWvWpAWbABvTDayyZbuqNJ8kD6ObMONA4FrgE3SNzkXTW98AXjpWjTx+M54OW4r+Avgc8JAkx9LN1vXSphWNwcxgFP1QjVfOzM3df9jYtmVt41RVW8w87qfJ3A/Yp11FWosLgC8Dv19VFwIkWVRDv3oKegkaPYWppSXJL9G9aQc4vaqubVzS2CRZBTxu5jadJJsAX62qx6z5mUuX/xcXryT7A8+n+6D7ObpbAD9YVTs3LWyELeClyU9NS0iS3arqgpFhGWc6Ke2YZMcBjaS0bPQe2ar6eR/Cg5Dk2SOL96Hr9f2zRuVoLarq08Cnk2xOd7bitcCDkvw9cHxVfb5pgRjA0jS8DjgE+Ns5tg1pJKXVSZ5VVScC9PddDqaFz92nNb0NuJjujV2LWD+t6T8D/5zkgXQdsf6M7g6SpjwFvUTM6gByP2BmHOiZyaXtCKKm+k5lxwIPpvt3eSnwkpnrb5LuzgCWpmjIQzXOSHJ/gKq6uXUt49T3qv17uvnHfy3JI4FnVdVbG5emJcoAlqZkvqEah3ILWZL7An/IPT9gvKVVTeOU5L+A1wP/4DjsGgevAUvTM/ShGk8Afkw32tcQ7m+e7X5V9Y1Z47Df1qoYLX0GsDQ9Qx+qcYeqelrrIibo2v4698xAKs9huH9LTYEBLE3YrKEaz0sy1KEav5Zkj6o6u3UhE/JK4ChgtySX0w1p+KK2JWkp8xqwNGFJ1jjv6MxIUktdkvOAXeiC
6Vbu6qH/yKaFjVl/X+l9quqm1rVoaTOApSmZb6jGqrq4aWFjkmSnudZX1SXTrmUSht7JTNN3n9YFSBuQfwXuGFm+vV83CH3QbkU3YMUzga2GEr69E+gG3rgNuGXkS1onXgOWpmfoQzUeCrwCOK5f9bEkRy2Gad/GZOidzDRltoCl6Vmd5M4OVwMcqvFg4Ner6o1V9Ua6SSde0bimcfpakj1aF6HhsAUsTc8fA8cmeT8jQzW2LWmswl0DjNA/zjz7LkVPAF6W5CIG3MlM02MAS1NSVd8H9hnqUI3Ah4GvJzm+X94fOLphPeP29NYFaFjsBS1NyYbQi7afcvEJ/eKXq+rMlvWMQ5JN6c5e7AKcDRxdVY6ApfVmC1iansEO1ZhkI+DcqtoNGMr8xjOOAX4BfJmuFbw7cGjTijQIBrA0PYPtRVtVtyf5bpIdq+qHresZs92rag+AJEcD32hcjwbCAJamZ+hDNT4QOLcfavPO+2MHMNTmL2YeVNVtsyZjkNaZ14ClKRn6UI3zDbm51IfaTHI7d32gCLAZ8BPu+vtt2ao2LW0GsDQlQx+qcVSSbYDrBjz1orTeHIhDmpKhDtWYZJ8kpyY5LsneSc6hm3rx6iSDvOYtjYMBLE1JP1TjscCD+q+PJXl126rG4v3A24CPA18AXl5Vvww8EXh7y8KkxcxT0NKUJPkOsG9V3dIvbw6cttSvASc5q6r26h+fX1W/OrLtzKrau1110uJlC1ianqEO1Tg6w9NPZ23zE740D29DkqZnqEM17pnkRvoewv1j+uVN25UlLW6egpamaIhDNUpaNwawNAWzhmqUJK8BS9NQVbcD302yY+taJC0OXgOWpmeoQzVKWgcGsDQ9/7d1AZIWD68BSw04VKMkrwFLE+ZQjZLmYgtYmrAkq4A/Bx4AHAU8vapOT7Ib8HFHipI2TLaApclbVlWfr6p/Ba6qqtMBquqCxnVJasgAlibPoRol3YOnoKUJG5nQfXQyd/rlTatq41a1SWrHAJYkqQFPQUuS1IABLElSAwawJEkNGMCSJDVgAEuS1MD/B0rSOilnfIDIAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 576x360 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "#Bar chart of how many patients have each diagnosis.\n",
    "#Explicit fig/ax keeps the plot independent of pyplot's current-figure state;\n",
    "#the trailing semicolon hides the repr of the last call.\n",
    "fig, ax = plt.subplots(figsize=(8, 5))\n",
    "data[\"diagnosis\"].value_counts().plot(kind=\"bar\", ax=ax)\n",
    "ax.set_title(\"Count of patients' diagnoses\")\n",
    "ax.set_xlabel(\"Diagnosis\")\n",
    "ax.set_ylabel(\"Number of patients\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "COPD              63\n",
       "Healthy           26\n",
       "URTI              14\n",
       "Bronchiectasis     7\n",
       "Bronchiolitis      6\n",
       "Pneumonia          6\n",
       "LRTI               2\n",
       "Asthma             1\n",
       "Name: diagnosis, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Number of patients per diagnosis, most frequent first\n",
    "data[\"diagnosis\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Asthma and LRTI have too few patients to be useful, so those patients are removed\n",
    "rare_diagnoses = [\"Asthma\", \"LRTI\"]\n",
    "rare_patient_ids = data.index[data[\"diagnosis\"].isin(rare_diagnoses)]\n",
    "data = data.drop(index=rare_patient_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age          122\n",
       "sex          122\n",
       "bmi           74\n",
       "weight        43\n",
       "height        41\n",
       "diagnosis    122\n",
       "dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Integer-encode sex and diagnosis: codes follow the order of first appearance,\n",
    "#and each *_classes index maps a code back to its original label\n",
    "sex_categorical, sex_classes = data[\"sex\"].factorize()\n",
    "diagn_categorical, diagn_classes = data[\"diagnosis\"].factorize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['URTI', 'Healthy', 'COPD', 'Bronchiectasis', 'Pneumonia',\n",
       "       'Bronchiolitis'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diagn_classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Swap the text labels of diagnosis and sex for their integer factorize codes\n",
    "data = data.assign(diagnosis=diagn_categorical, sex=sex_categorical)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.8</td>\n",
       "      <td>73.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>0</td>\n",
       "      <td>28.47</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>73.00</td>\n",
       "      <td>0</td>\n",
       "      <td>21.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           age  sex    bmi  weight  height  diagnosis\n",
       "patient                                              \n",
       "101       3.00    0    NaN    19.0    99.0          0\n",
       "102       0.75    0    NaN     9.8    73.0          1\n",
       "104      70.00    0  28.47     NaN     NaN          2\n",
       "105       7.00    0    NaN    32.0   135.0          0\n",
       "106      73.00    0  21.00     NaN     NaN          2"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>0</td>\n",
       "      <td>19.385777</td>\n",
       "      <td>19.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>0</td>\n",
       "      <td>18.389942</td>\n",
       "      <td>9.8</td>\n",
       "      <td>73.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>0</td>\n",
       "      <td>28.470000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>0</td>\n",
       "      <td>17.558299</td>\n",
       "      <td>32.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>73.00</td>\n",
       "      <td>0</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>75.00</td>\n",
       "      <td>0</td>\n",
       "      <td>33.700000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>84.00</td>\n",
       "      <td>0</td>\n",
       "      <td>33.530000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>75.00</td>\n",
       "      <td>1</td>\n",
       "      <td>25.210000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>63.00</td>\n",
       "      <td>1</td>\n",
       "      <td>28.400000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>60.00</td>\n",
       "      <td>1</td>\n",
       "      <td>22.860000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>58.00</td>\n",
       "      <td>1</td>\n",
       "      <td>28.410000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>77.00</td>\n",
       "      <td>1</td>\n",
       "      <td>23.120000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>56.00</td>\n",
       "      <td>1</td>\n",
       "      <td>28.580000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>68.00</td>\n",
       "      <td>1</td>\n",
       "      <td>24.400000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>81.00</td>\n",
       "      <td>1</td>\n",
       "      <td>36.760000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>2.00</td>\n",
       "      <td>0</td>\n",
       "      <td>17.202354</td>\n",
       "      <td>15.2</td>\n",
       "      <td>94.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>78.00</td>\n",
       "      <td>1</td>\n",
       "      <td>35.140000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>13.00</td>\n",
       "      <td>0</td>\n",
       "      <td>22.491349</td>\n",
       "      <td>65.0</td>\n",
       "      <td>170.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>122</th>\n",
       "      <td>66.00</td>\n",
       "      <td>1</td>\n",
       "      <td>33.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>5.00</td>\n",
       "      <td>1</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>25.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           age  sex        bmi  weight  height  diagnosis\n",
       "patient                                                  \n",
       "101       3.00    0  19.385777    19.0    99.0          0\n",
       "102       0.75    0  18.389942     9.8    73.0          1\n",
       "104      70.00    0  28.470000     NaN     NaN          2\n",
       "105       7.00    0  17.558299    32.0   135.0          0\n",
       "106      73.00    0  21.000000     NaN     NaN          2\n",
       "107      75.00    0  33.700000     NaN     NaN          2\n",
       "109      84.00    0  33.530000     NaN     NaN          2\n",
       "110      75.00    1  25.210000     NaN     NaN          2\n",
       "111      63.00    1  28.400000     NaN     NaN          3\n",
       "112      60.00    1  22.860000     NaN     NaN          2\n",
       "113      58.00    1  28.410000     NaN     NaN          2\n",
       "114      77.00    1  23.120000     NaN     NaN          2\n",
       "116      56.00    1  28.580000     NaN     NaN          3\n",
       "117      68.00    1  24.400000     NaN     NaN          2\n",
       "118      81.00    1  36.760000     NaN     NaN          2\n",
       "119       2.00    0  17.202354    15.2    94.0          0\n",
       "120      78.00    1  35.140000     NaN     NaN          2\n",
       "121      13.00    0  22.491349    65.0   170.0          1\n",
       "122      66.00    1  33.000000     NaN     NaN          4\n",
       "123       5.00    1  16.000000    25.0   125.0          1"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Fill missing BMI values from weight and height where both were recorded.\n",
    "#BMI = weight / height^2 * 10000 (the factor implies weight in kg and height in cm).\n",
    "#Columns are addressed by name, so the result does not depend on column order,\n",
    "#and the computation is vectorised instead of applied row by row.\n",
    "bmi_from_measurements = data[\"weight\"] / data[\"height\"] ** 2 * 10000\n",
    "data[\"bmi\"] = data[\"bmi\"].combine_first(bmi_from_measurements)\n",
    "data.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age           0\n",
       "sex           0\n",
       "bmi           7\n",
       "weight       79\n",
       "height       81\n",
       "diagnosis     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Replacing missing BMI information by using similar data, discarding the rest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [age, sex, bmi, weight, height, diagnosis]\n",
       "Index: []"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "missing_data_indexes = [i for i, val in enumerate(data[\"bmi\"].isnull()) if val == True]\n",
    "missing_data = data.iloc[missing_data_indexes]\n",
    "missing_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "for idx, row in missing_data.iterrows():\n",
    "    age = row[0]\n",
    "    sex = row[1]\n",
    "    bmi = row[2]\n",
    "    diagnosis = row[5]\n",
    "    \n",
    "    similar_patients = data[(data['sex'] == sex)\n",
    "                           & (data['diagnosis'] == diagnosis)\n",
    "                           & (age - 5 <= data['age']) & (data['age'] <= age + 5) \n",
    "                            & (data['bmi'].isnull()==False) ]\n",
    "    \n",
    "    if (len(similar_patients) > 2):\n",
    "        print(\"Found a similar BMI match for index\", idx)\n",
    "        data.at[idx, \"bmi\"] = similar_patients.bmi.mean()\n",
    "    else:\n",
    "        print(\"Dropping index\", idx)\n",
    "        data = data.drop(idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age           0\n",
       "sex           0\n",
       "bmi           0\n",
       "weight       78\n",
       "height       80\n",
       "diagnosis     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Dropping weight and height columns, because they have a lot of missing data\n",
    "data = data.drop(columns=[\"weight\", \"height\"])\n",
    "data = data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age          0\n",
       "sex          0\n",
       "bmi          0\n",
       "diagnosis    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>121.000000</td>\n",
       "      <td>121.000000</td>\n",
       "      <td>121.000000</td>\n",
       "      <td>121.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>43.673554</td>\n",
       "      <td>0.636364</td>\n",
       "      <td>23.451756</td>\n",
       "      <td>1.876033</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>32.110260</td>\n",
       "      <td>0.483046</td>\n",
       "      <td>6.553994</td>\n",
       "      <td>1.158809</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>13.119534</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>5.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>17.485027</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>61.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>23.120000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>71.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>28.340000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>93.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>53.500000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              age         sex         bmi   diagnosis\n",
       "count  121.000000  121.000000  121.000000  121.000000\n",
       "mean    43.673554    0.636364   23.451756    1.876033\n",
       "std     32.110260    0.483046    6.553994    1.158809\n",
       "min      0.250000    0.000000   13.119534    0.000000\n",
       "25%      5.000000    0.000000   17.485027    1.000000\n",
       "50%     61.000000    1.000000   23.120000    2.000000\n",
       "75%     71.000000    1.000000   28.340000    2.000000\n",
       "max     93.000000    1.000000   53.500000    5.000000"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_diagnosis_data = data[\"diagnosis\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Adding crackles and wheezes count\n",
    "#1. Load in training and test sound files\n",
    "#2. Put them in one dataframe\n",
    "#3. Iterate through them. \n",
    "#4. If the patient number is not one found in the \"data\" dataframe, discard it\n",
    "#5. Put the crackles per second/wheezes per second information into a python array:\n",
    "#[[patient, crackles, wheezes, ...], ...]\n",
    "#6. Make that into a numpy array, that into a pandas dataframe\n",
    "#7. Group by patient number by taking the mean\n",
    "#8. Sort by patient number\n",
    "#9. Put into the \"data\" dataframe.\n",
    "#Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_soundfiles = pd.read_csv(root + \"test_soundfiles.csv\")\n",
    "train_soundfiles = pd.read_csv(root + \"train_soundfiles.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "soundfiles = pd.concat([train_soundfiles, test_soundfiles])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101_1b1_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101_1b1_Pr_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102_1b1_Ar_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>104_1b1_Al_sc_Litt3200.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>104_1b1_Ar_sc_Litt3200.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>224_1b2_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>225_1b1_Pl_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>226_1b1_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>226_1b1_Ll_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>226_1b1_Pl_sc_LittC2SE.wav</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>917 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       filename\n",
       "0    101_1b1_Al_sc_Meditron.wav\n",
       "1    101_1b1_Pr_sc_Meditron.wav\n",
       "2    102_1b1_Ar_sc_Meditron.wav\n",
       "3    104_1b1_Al_sc_Litt3200.wav\n",
       "4    104_1b1_Ar_sc_Litt3200.wav\n",
       "..                          ...\n",
       "267  224_1b2_Al_sc_Meditron.wav\n",
       "268  225_1b1_Pl_sc_Meditron.wav\n",
       "269  226_1b1_Al_sc_Meditron.wav\n",
       "270  226_1b1_Ll_sc_Meditron.wav\n",
       "271  226_1b1_Pl_sc_LittC2SE.wav\n",
       "\n",
       "[917 rows x 1 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soundfiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[3,\n",
       " 5,\n",
       " 3,\n",
       " 4,\n",
       " 2,\n",
       " -447.0964,\n",
       " -487.81705,\n",
       " 60.012154,\n",
       " 147.43799,\n",
       " 98.916214,\n",
       " 94.1953,\n",
       " 61.320885,\n",
       " 47.155403,\n",
       " 53.77741,\n",
       " 19.767086,\n",
       " 24.540216,\n",
       " 3.4809492,\n",
       " 23.446045,\n",
       " -3.386144,\n",
       " 9.486736,\n",
       " 1.7393734]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "array = [3,5,3,4,2]\n",
    "SAMPLE_RATE = 16000\n",
    "filename = \"226_1b1_Pl_sc_LittC2SE.wav\"\n",
    "raw, sr = librosa.load(root + \"audio_and_txt_files/\" + filename, sr = SAMPLE_RATE, duration=20)\n",
    "mfccs = librosa.feature.mfcc(raw, hop_length=20*16000, n_mfcc=8)\n",
    "array.extend(mfccs.flatten())\n",
    "array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "40000.0"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(20*16000)/8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101_1b1_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101_1b1_Pr_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102_1b1_Ar_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>104_1b1_Al_sc_Litt3200.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>104_1b1_Ar_sc_Litt3200.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>224_1b2_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>225_1b1_Pl_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>226_1b1_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>226_1b1_Ll_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>226_1b1_Pl_sc_LittC2SE.wav</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>917 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       filename\n",
       "0    101_1b1_Al_sc_Meditron.wav\n",
       "1    101_1b1_Pr_sc_Meditron.wav\n",
       "2    102_1b1_Ar_sc_Meditron.wav\n",
       "3    104_1b1_Al_sc_Litt3200.wav\n",
       "4    104_1b1_Ar_sc_Litt3200.wav\n",
       "..                          ...\n",
       "267  224_1b2_Al_sc_Meditron.wav\n",
       "268  225_1b1_Pl_sc_Meditron.wav\n",
       "269  226_1b1_Al_sc_Meditron.wav\n",
       "270  226_1b1_Ll_sc_Meditron.wav\n",
       "271  226_1b1_Pl_sc_LittC2SE.wav\n",
       "\n",
       "[917 rows x 1 columns]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soundfiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#This might take a while\n",
    "#Array structure : [[patient, cracles per soundfile, wheezes per soundfile]]\n",
    "breathing_data_array = []\n",
    "print(\"START!\")\n",
    "for i, row in soundfiles.iterrows():\n",
    "    print(\"Index \" + str(i))\n",
    "    print(row[0])\n",
    "    \n",
    "    filename = row[0]\n",
    "    patient = int(row[0].split(\"_\")[0])\n",
    "    try: \n",
    "        data.loc[patient]\n",
    "    except KeyError:\n",
    "        continue\n",
    "    \n",
    "    txt_filename = filename[:-4] + \".txt\"\n",
    "    annotations = pd.read_csv(root + \"audio_and_txt_files/\" + txt_filename, names=[\"start\", \"stop\", \"crackle\", \"wheeze\"], sep=\"\\t\")\n",
    "    total_crackles = annotations.crackle.sum()\n",
    "    total_wheeze = annotations.wheeze.sum()\n",
    "    total_time = annotations.iloc[-1, 1] - annotations.iloc[0, 0]\n",
    "    crackle_per_sec = round(total_crackles/total_time, 4)\n",
    "    wheeze_per_sec = round(total_wheeze/total_time, 4)\n",
    "        \n",
    "    #Extracting sound features\n",
    "    SAMPLE_RATE = 16000\n",
    "    raw, sr = librosa.load(root + \"audio_and_txt_files/\" + filename, sr = SAMPLE_RATE)\n",
    "    zcr = librosa.core.zero_crossings(raw).sum() / len(raw)\n",
    "    sc = librosa.feature.spectral_centroid(raw)[0]\n",
    "    rms = librosa.feature.rms(raw)[0]\n",
    "    s_rf = librosa.feature.spectral_rolloff(raw, roll_percent=0.85)[0]\n",
    "    s_rf_75 = librosa.feature.spectral_rolloff(raw, roll_percent=0.75)[0]\n",
    "    sf = librosa.feature.spectral_flatness(raw)[0]\n",
    "    se = entropy.spectral_entropy(x = raw, sf = sr, method='fft')\n",
    "    mfccs = librosa.feature.mfcc(raw, hop_length=len(raw), n_mfcc=8)\n",
    "    mfccs = mfccs.flatten()\n",
    "    \n",
    "    add_to_array=[patient, \n",
    "                 crackle_per_sec, \n",
    "                 wheeze_per_sec,\n",
    "                 zcr,\n",
    "                 sc.mean(),\n",
    "                 np.median(sc),\n",
    "                 sc.std(),\n",
    "                 rms.mean(),\n",
    "                 np.median(rms),\n",
    "                 rms.std(), \n",
    "                 s_rf.mean(), \n",
    "                 np.median(s_rf),\n",
    "                 s_rf.std(),\n",
    "                 s_rf_75.mean(), \n",
    "                 np.median(s_rf_75),\n",
    "                 s_rf_75.std(),\n",
    "                sf.mean(),\n",
    "                np.median(sf),\n",
    "                sf.std(),\n",
    "                se]\n",
    "    add_to_array.extend(mfccs)\n",
    "    \n",
    "    breathing_data_array.append(add_to_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_columns = [\"patient\", \n",
    "                \"crackles\", \n",
    "                \"wheezes\", \n",
    "                \"zero_crossing_rate\", \n",
    "                \"spectral_centroid_mean\", \n",
    "                \"spectral_centroid_median\",\n",
    "                \"spectral_centroid_std\", \n",
    "                \"root_mean_square_mean\", \n",
    "                \"root_mean_square_median\", \n",
    "                \"root_mean_square_std\", \n",
    "                \"spectral_rolloff_85_mean\", \n",
    "                \"spectral_rolloff_85_median\", \n",
    "                \"spectral_rolloff_85_std\",\n",
    "               \"spectral_rolloff_75_mean\", \n",
    "                \"spectral_rolloff_75_median\", \n",
    "                \"spectral_rolloff_75_std\",\n",
    "               \"spectral_flatness_mean\",\n",
    "               \"spectral_flatness_median\",\n",
    "               \"spectral_flatness_std\",\n",
    "               \"spectral_entropy\",\n",
    "               \"mfcc1\",\n",
    "               \"mfcc2\",\n",
    "               \"mfcc3\",\n",
    "               \"mfcc4\",\n",
    "               \"mfcc5\",\n",
    "               \"mfcc6\",\n",
    "               \"mfcc7\",\n",
    "               \"mfcc8\",\n",
    "               \"mfcc9\",\n",
    "               \"mfcc10\",\n",
    "               \"mfcc11\",\n",
    "               \"mfcc12\",\n",
    "               \"mfcc13\",\n",
    "               \"mfcc14\",\n",
    "               \"mfcc15\",\n",
    "               \"mfcc16\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np_breathing_data_array = np.array(breathing_data_array)\n",
    "np_breathing_data_array.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "breathing_data_df = pd.DataFrame(np_breathing_data_array, columns=data_columns)\n",
    "breathing_data_df.set_index(\"patient\", inplace=True)\n",
    "breathing_data_df = breathing_data_df.groupby(by=\"patient\").mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.merge(left = data, right = breathing_data_df, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = pd.merge(left = test_patients, right = data, left_index=True, right_index=True)\n",
    "train_data = pd.merge(left = train_patients, right = data, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train = train_data.pop(\"diagnosis\")\n",
    "y_test = test_data.pop(\"diagnosis\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#NORMALIZATION\n",
    "\n",
    "norm_train_data = (train_data - train_data.min()) / (train_data.max() - train_data.min())\n",
    "norm_test_data = (test_data - train_data.min()) / (train_data.max() - train_data.min())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "norm_train_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Uncomment to save the training and test data for later use here\n",
    "\n",
    "#norm_train_data.to_csv(root + \"dataframes/norm_train_data_sound_features_03_31.csv\")\n",
    "#norm_test_data.to_csv(root + \"dataframes/norm_test_data_sound_features_03_31.csv\")\n",
    "\n",
    "#train_data.to_csv(root + \"dataframes/train_data_sound_features_03_31.csv\")\n",
    "#test_data.to_csv(root + \"dataframes/test_data_sound_features_03_31.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Uncomment to check if saving was successful\n",
    "\n",
    "#train_data = pd.read_csv(root + \"dataframes/train_data_sound_features_03_31.csv\")\n",
    "#test_data = pd.read_csv(root + \"dataframes/test_data_sound_features_03_31.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (venv)",
   "language": "python",
   "name": "venv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}