[dd6cc2]: / generating_data_for_classical_ml.ipynb

Download this file

2637 lines (2636 with data), 89.2 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import librosa\n",
    "import wave as wav\n",
    "import tensorflow as tf\n",
    "import scipy\n",
    "import matplotlib.pyplot as plt\n",
    "import librosa.display\n",
    "import IPython.display as ipd\n",
    "from sklearn import metrics\n",
    "from sklearn.model_selection import cross_validate\n",
    "import os\n",
    "import statistics\n",
    "import math\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import graphviz\n",
    "from sklearn import tree\n",
    "\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import confusion_matrix, classification_report\n",
    "import sys\n",
    "\n",
    "from sklearn.metrics import plot_confusion_matrix\n",
    "import seaborn as sns \n",
    "import matplotlib.pyplot as plt\n",
    "import operator as op\n",
    "from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict\n",
    "from entropy import *\n",
    "from random import shuffle\n",
    "\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: xgboost in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (1.0.2)\n",
      "Requirement already satisfied: scipy in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (from xgboost) (1.4.1)\n",
      "Requirement already satisfied: numpy in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (from xgboost) (1.18.1)\n",
      "\u001b[33mWARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.\n",
      "You should consider upgrading via the '/gpfs/hpc/home/rannilo/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install xgboost\n",
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the database files, relative to the notebook.\n",
    "root = \"respiratory_sound_database/\"\n",
    "\n",
    "# Column names for these two tables are supplied explicitly via `names`;\n",
    "# each table is indexed by its `patient` column right after it is read.\n",
    "patient_diagnosis = pd.read_csv(\n",
    "    root+\"patient_diagnosis.csv\",\n",
    "    names=[\"patient\", \"diagnosis\"],\n",
    ").set_index(\"patient\")\n",
    "demographic_info = pd.read_csv(\n",
    "    root+\"demographic_info.txt\",\n",
    "    delimiter=\" \",\n",
    "    names=[\"patient\", \"age\", \"sex\", \"bmi\", \"weight\", \"height\"],\n",
    ").set_index(\"patient\")\n",
    "\n",
    "# The train/test split files are read with their own header row and are\n",
    "# likewise indexed by `patient`.\n",
    "train_patients = pd.read_csv(root + \"train_patients.csv\").set_index(\"patient\")\n",
    "test_patients = pd.read_csv(root + \"test_patients.csv\").set_index(\"patient\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: [101, 102, 103, 104, 105]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_patients.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>185</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>186</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>187</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>188</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>189</th>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: [185, 186, 187, 188, 189]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_patients.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>Asthma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        diagnosis\n",
       "patient          \n",
       "101          URTI\n",
       "102       Healthy\n",
       "103        Asthma\n",
       "104          COPD\n",
       "105          URTI"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "patient_diagnosis.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>99.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.8</td>\n",
       "      <td>73.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>28.47</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>135.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           age sex    bmi  weight  height\n",
       "patient                                  \n",
       "101       3.00   F    NaN    19.0    99.0\n",
       "102       0.75   F    NaN     9.8    73.0\n",
       "103      70.00   F  33.00     NaN     NaN\n",
       "104      70.00   F  28.47     NaN     NaN\n",
       "105       7.00   F    NaN    32.0   135.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "demographic_info.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.8</td>\n",
       "      <td>73.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Asthma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>28.47</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>222</th>\n",
       "      <td>60.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>223</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>224</th>\n",
       "      <td>10.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.3</td>\n",
       "      <td>143.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>225</th>\n",
       "      <td>0.83</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.8</td>\n",
       "      <td>74.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>226</th>\n",
       "      <td>4.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.7</td>\n",
       "      <td>103.0</td>\n",
       "      <td>Pneumonia</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>126 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           age  sex    bmi  weight  height  diagnosis\n",
       "patient                                              \n",
       "101       3.00    F    NaN    19.0    99.0       URTI\n",
       "102       0.75    F    NaN     9.8    73.0    Healthy\n",
       "103      70.00    F  33.00     NaN     NaN     Asthma\n",
       "104      70.00    F  28.47     NaN     NaN       COPD\n",
       "105       7.00    F    NaN    32.0   135.0       URTI\n",
       "...        ...  ...    ...     ...     ...        ...\n",
       "222      60.00    M    NaN     NaN     NaN       COPD\n",
       "223        NaN  NaN    NaN     NaN     NaN       COPD\n",
       "224      10.00    F    NaN    32.3   143.0    Healthy\n",
       "225       0.83    M    NaN     7.8    74.0    Healthy\n",
       "226       4.00    M    NaN    16.7   103.0  Pneumonia\n",
       "\n",
       "[126 rows x 6 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.concat([demographic_info, patient_diagnosis], axis=1)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.00</td>\n",
       "      <td>99.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.80</td>\n",
       "      <td>73.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Asthma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>F</td>\n",
       "      <td>28.47</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.00</td>\n",
       "      <td>135.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>73.00</td>\n",
       "      <td>F</td>\n",
       "      <td>21.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>75.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.70</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>3.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LRTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>84.00</td>\n",
       "      <td>F</td>\n",
       "      <td>33.53</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>75.00</td>\n",
       "      <td>M</td>\n",
       "      <td>25.21</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>63.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Bronchiectasis</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>60.00</td>\n",
       "      <td>M</td>\n",
       "      <td>22.86</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>58.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.41</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>77.00</td>\n",
       "      <td>M</td>\n",
       "      <td>23.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>0.58</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.14</td>\n",
       "      <td>64.0</td>\n",
       "      <td>LRTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>56.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Bronchiectasis</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>68.00</td>\n",
       "      <td>M</td>\n",
       "      <td>24.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>81.00</td>\n",
       "      <td>M</td>\n",
       "      <td>36.76</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>2.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15.20</td>\n",
       "      <td>94.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>78.00</td>\n",
       "      <td>M</td>\n",
       "      <td>35.14</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>13.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.00</td>\n",
       "      <td>170.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>122</th>\n",
       "      <td>66.00</td>\n",
       "      <td>M</td>\n",
       "      <td>33.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Pneumonia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>5.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25.00</td>\n",
       "      <td>125.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>65.00</td>\n",
       "      <td>M</td>\n",
       "      <td>29.07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125</th>\n",
       "      <td>14.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.00</td>\n",
       "      <td>170.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>126</th>\n",
       "      <td>1.00</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10.18</td>\n",
       "      <td>80.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>2.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12.60</td>\n",
       "      <td>98.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>65.00</td>\n",
       "      <td>F</td>\n",
       "      <td>24.30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>129</th>\n",
       "      <td>6.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.00</td>\n",
       "      <td>119.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130</th>\n",
       "      <td>85.00</td>\n",
       "      <td>F</td>\n",
       "      <td>17.10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>3.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.00</td>\n",
       "      <td>97.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>132</th>\n",
       "      <td>71.00</td>\n",
       "      <td>M</td>\n",
       "      <td>34.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133</th>\n",
       "      <td>68.00</td>\n",
       "      <td>M</td>\n",
       "      <td>27.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>134</th>\n",
       "      <td>61.00</td>\n",
       "      <td>M</td>\n",
       "      <td>32.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>135</th>\n",
       "      <td>70.00</td>\n",
       "      <td>M</td>\n",
       "      <td>21.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Pneumonia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>136</th>\n",
       "      <td>5.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.20</td>\n",
       "      <td>110.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>137</th>\n",
       "      <td>4.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18.00</td>\n",
       "      <td>104.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>138</th>\n",
       "      <td>56.00</td>\n",
       "      <td>F</td>\n",
       "      <td>21.60</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>61.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.68</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>79.00</td>\n",
       "      <td>F</td>\n",
       "      <td>23.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Pneumonia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>66.00</td>\n",
       "      <td>M</td>\n",
       "      <td>22.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>78.00</td>\n",
       "      <td>M</td>\n",
       "      <td>26.10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>0.25</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.24</td>\n",
       "      <td>68.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>3.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.70</td>\n",
       "      <td>100.0</td>\n",
       "      <td>Healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>69.00</td>\n",
       "      <td>M</td>\n",
       "      <td>23.40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>67.00</td>\n",
       "      <td>M</td>\n",
       "      <td>28.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>77.00</td>\n",
       "      <td>M</td>\n",
       "      <td>25.70</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>COPD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>4.00</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.00</td>\n",
       "      <td>110.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>0.67</td>\n",
       "      <td>M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.50</td>\n",
       "      <td>70.0</td>\n",
       "      <td>Bronchiolitis</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>0.67</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.12</td>\n",
       "      <td>74.0</td>\n",
       "      <td>URTI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           age sex    bmi  weight  height       diagnosis\n",
       "patient                                                  \n",
       "101       3.00   F    NaN   19.00    99.0            URTI\n",
       "102       0.75   F    NaN    9.80    73.0         Healthy\n",
       "103      70.00   F  33.00     NaN     NaN          Asthma\n",
       "104      70.00   F  28.47     NaN     NaN            COPD\n",
       "105       7.00   F    NaN   32.00   135.0            URTI\n",
       "106      73.00   F  21.00     NaN     NaN            COPD\n",
       "107      75.00   F  33.70     NaN     NaN            COPD\n",
       "108       3.00   M    NaN     NaN     NaN            LRTI\n",
       "109      84.00   F  33.53     NaN     NaN            COPD\n",
       "110      75.00   M  25.21     NaN     NaN            COPD\n",
       "111      63.00   M  28.40     NaN     NaN  Bronchiectasis\n",
       "112      60.00   M  22.86     NaN     NaN            COPD\n",
       "113      58.00   M  28.41     NaN     NaN            COPD\n",
       "114      77.00   M  23.12     NaN     NaN            COPD\n",
       "115       0.58   M    NaN    7.14    64.0            LRTI\n",
       "116      56.00   M  28.58     NaN     NaN  Bronchiectasis\n",
       "117      68.00   M  24.40     NaN     NaN            COPD\n",
       "118      81.00   M  36.76     NaN     NaN            COPD\n",
       "119       2.00   F    NaN   15.20    94.0            URTI\n",
       "120      78.00   M  35.14     NaN     NaN            COPD\n",
       "121      13.00   F    NaN   65.00   170.0         Healthy\n",
       "122      66.00   M  33.00     NaN     NaN       Pneumonia\n",
       "123       5.00   M    NaN   25.00   125.0         Healthy\n",
       "124      65.00   M  29.07     NaN     NaN            COPD\n",
       "125      14.00   M    NaN   62.00   170.0         Healthy\n",
       "126       1.00   F    NaN   10.18    80.0         Healthy\n",
       "127       2.00   M    NaN   12.60    98.0         Healthy\n",
       "128      65.00   F  24.30     NaN     NaN            COPD\n",
       "129       6.00   M    NaN   23.00   119.0            URTI\n",
       "130      85.00   F  17.10     NaN     NaN            COPD\n",
       "131       3.00   M    NaN   14.00    97.0            URTI\n",
       "132      71.00   M  34.00     NaN     NaN            COPD\n",
       "133      68.00   M  27.40     NaN     NaN            COPD\n",
       "134      61.00   M  32.00     NaN     NaN            COPD\n",
       "135      70.00   M  21.00     NaN     NaN       Pneumonia\n",
       "136       5.00   M    NaN   16.20   110.0         Healthy\n",
       "137       4.00   M    NaN   18.00   104.0            URTI\n",
       "138      56.00   F  21.60     NaN     NaN            COPD\n",
       "139      61.00   M  28.68     NaN     NaN            COPD\n",
       "140      79.00   F  23.00     NaN     NaN       Pneumonia\n",
       "141      66.00   M  22.40     NaN     NaN            COPD\n",
       "142      78.00   M  26.10     NaN     NaN            COPD\n",
       "143       0.25   F    NaN    8.24    68.0         Healthy\n",
       "144       3.00   M    NaN   16.70   100.0         Healthy\n",
       "145      69.00   M  23.40     NaN     NaN            COPD\n",
       "146      67.00   M  28.00     NaN     NaN            COPD\n",
       "147      77.00   M  25.70     NaN     NaN            COPD\n",
       "148       4.00   M    NaN   33.00   110.0            URTI\n",
       "149       0.67   M    NaN    9.50    70.0   Bronchiolitis\n",
       "150       0.67   F    NaN    8.12    74.0            URTI"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "age           1\n",
      "sex           1\n",
      "bmi          51\n",
      "weight       82\n",
      "height       84\n",
      "diagnosis     0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(data.isna().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Dropping the one patient with NA age and sex\n",
    "#The rows are selected by column name rather than by a count of non-null cells,\n",
    "#so patients that only lack bmi/weight/height are never removed by this step\n",
    "data = data.dropna(subset=[\"age\", \"sex\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "age           0\n",
      "sex           0\n",
      "bmi          50\n",
      "weight       81\n",
      "height       83\n",
      "diagnosis     0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(data.isna().sum())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age          125\n",
       "sex          125\n",
       "bmi           75\n",
       "weight        44\n",
       "height        42\n",
       "diagnosis    125\n",
       "dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5, 1.0, \"Count of patients' diagnoses\")"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 576x360 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "#Bar chart of how many patients carry each diagnosis\n",
    "fig, ax = plt.subplots(figsize=(8,5))\n",
    "data.diagnosis.value_counts().plot(kind=\"bar\", ax=ax)\n",
    "ax.set_title(\"Count of patients' diagnoses\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "COPD              63\n",
       "Healthy           26\n",
       "URTI              14\n",
       "Bronchiectasis     7\n",
       "Bronchiolitis      6\n",
       "Pneumonia          6\n",
       "LRTI               2\n",
       "Asthma             1\n",
       "Name: diagnosis, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.diagnosis.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Asthma and LRTI have too few patients, so their rows are removed\n",
    "rare_diagnosis = data[\"diagnosis\"].isin([\"Asthma\", \"LRTI\"])\n",
    "data = data.drop(index=data.index[rare_diagnosis])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age          122\n",
       "sex          122\n",
       "bmi           74\n",
       "weight        43\n",
       "height        41\n",
       "diagnosis    122\n",
       "dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Encode sex and diagnosis as integer codes; the *_classes indexes hold the original labels\n",
    "sex_categorical, sex_classes = data[\"sex\"].factorize()\n",
    "diagn_categorical, diagn_classes = data[\"diagnosis\"].factorize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['URTI', 'Healthy', 'COPD', 'Bronchiectasis', 'Pneumonia',\n",
       "       'Bronchiolitis'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diagn_classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Overwrite the label columns with their integer codes (column order is unchanged)\n",
    "data = data.assign(diagnosis=diagn_categorical, sex=sex_categorical)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.8</td>\n",
       "      <td>73.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>0</td>\n",
       "      <td>28.47</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>73.00</td>\n",
       "      <td>0</td>\n",
       "      <td>21.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           age  sex    bmi  weight  height  diagnosis\n",
       "patient                                              \n",
       "101       3.00    0    NaN    19.0    99.0          0\n",
       "102       0.75    0    NaN     9.8    73.0          1\n",
       "104      70.00    0  28.47     NaN     NaN          2\n",
       "105       7.00    0    NaN    32.0   135.0          0\n",
       "106      73.00    0  21.00     NaN     NaN          2"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>3.00</td>\n",
       "      <td>0</td>\n",
       "      <td>19.385777</td>\n",
       "      <td>19.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>0.75</td>\n",
       "      <td>0</td>\n",
       "      <td>18.389942</td>\n",
       "      <td>9.8</td>\n",
       "      <td>73.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>70.00</td>\n",
       "      <td>0</td>\n",
       "      <td>28.470000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>7.00</td>\n",
       "      <td>0</td>\n",
       "      <td>17.558299</td>\n",
       "      <td>32.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>73.00</td>\n",
       "      <td>0</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>75.00</td>\n",
       "      <td>0</td>\n",
       "      <td>33.700000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>84.00</td>\n",
       "      <td>0</td>\n",
       "      <td>33.530000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>75.00</td>\n",
       "      <td>1</td>\n",
       "      <td>25.210000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>63.00</td>\n",
       "      <td>1</td>\n",
       "      <td>28.400000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>60.00</td>\n",
       "      <td>1</td>\n",
       "      <td>22.860000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>58.00</td>\n",
       "      <td>1</td>\n",
       "      <td>28.410000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>77.00</td>\n",
       "      <td>1</td>\n",
       "      <td>23.120000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>56.00</td>\n",
       "      <td>1</td>\n",
       "      <td>28.580000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>68.00</td>\n",
       "      <td>1</td>\n",
       "      <td>24.400000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>81.00</td>\n",
       "      <td>1</td>\n",
       "      <td>36.760000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>2.00</td>\n",
       "      <td>0</td>\n",
       "      <td>17.202354</td>\n",
       "      <td>15.2</td>\n",
       "      <td>94.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>78.00</td>\n",
       "      <td>1</td>\n",
       "      <td>35.140000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>13.00</td>\n",
       "      <td>0</td>\n",
       "      <td>22.491349</td>\n",
       "      <td>65.0</td>\n",
       "      <td>170.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>122</th>\n",
       "      <td>66.00</td>\n",
       "      <td>1</td>\n",
       "      <td>33.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>5.00</td>\n",
       "      <td>1</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>25.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           age  sex        bmi  weight  height  diagnosis\n",
       "patient                                                  \n",
       "101       3.00    0  19.385777    19.0    99.0          0\n",
       "102       0.75    0  18.389942     9.8    73.0          1\n",
       "104      70.00    0  28.470000     NaN     NaN          2\n",
       "105       7.00    0  17.558299    32.0   135.0          0\n",
       "106      73.00    0  21.000000     NaN     NaN          2\n",
       "107      75.00    0  33.700000     NaN     NaN          2\n",
       "109      84.00    0  33.530000     NaN     NaN          2\n",
       "110      75.00    1  25.210000     NaN     NaN          2\n",
       "111      63.00    1  28.400000     NaN     NaN          3\n",
       "112      60.00    1  22.860000     NaN     NaN          2\n",
       "113      58.00    1  28.410000     NaN     NaN          2\n",
       "114      77.00    1  23.120000     NaN     NaN          2\n",
       "116      56.00    1  28.580000     NaN     NaN          3\n",
       "117      68.00    1  24.400000     NaN     NaN          2\n",
       "118      81.00    1  36.760000     NaN     NaN          2\n",
       "119       2.00    0  17.202354    15.2    94.0          0\n",
       "120      78.00    1  35.140000     NaN     NaN          2\n",
       "121      13.00    0  22.491349    65.0   170.0          1\n",
       "122      66.00    1  33.000000     NaN     NaN          4\n",
       "123       5.00    1  16.000000    25.0   125.0          1"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Replace missing BMI values by using existing weight and height data\n",
    "#BMI = weight / height^2; the factor 10000 suggests height is stored in cm (TODO confirm units)\n",
    "#Computed column-wise by name, so it does not depend on the column order\n",
    "bmi_from_measurements = data[\"weight\"] / data[\"height\"] ** 2 * 10000\n",
    "#Only rows whose bmi is NaN are filled; existing bmi values are kept as they are\n",
    "data[\"bmi\"] = data[\"bmi\"].fillna(bmi_from_measurements)\n",
    "data.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age           0\n",
       "sex           0\n",
       "bmi           7\n",
       "weight       79\n",
       "height       81\n",
       "diagnosis     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Replacing the remaining missing BMI values with the mean BMI of similar patients, and discarding the patients for whom too few similar patients exist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>patient</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [age, sex, bmi, weight, height, diagnosis]\n",
       "Index: []"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Positions (not labels) of the rows that still have no BMI value\n",
    "missing_data_indexes = np.flatnonzero(data[\"bmi\"].isnull().to_numpy()).tolist()\n",
    "missing_data = data.iloc[missing_data_indexes]\n",
    "missing_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "#For every patient without a BMI, look for patients of the same sex and diagnosis\n",
    "#whose age is within 5 years and whose BMI is known\n",
    "for idx, row in missing_data.iterrows():\n",
    "    #Values are read by column name so the loop does not depend on the column order\n",
    "    age = row[\"age\"]\n",
    "    sex = row[\"sex\"]\n",
    "    diagnosis = row[\"diagnosis\"]\n",
    "    \n",
    "    #BMI values filled in earlier iterations count as known for later patients\n",
    "    similar_patients = data[(data['sex'] == sex)\n",
    "                           & (data['diagnosis'] == diagnosis)\n",
    "                           & data['age'].between(age - 5, age + 5)\n",
    "                           & data['bmi'].notnull()]\n",
    "    \n",
    "    #At least 3 similar patients are required to use their mean BMI; otherwise the patient is dropped\n",
    "    if len(similar_patients) > 2:\n",
    "        print(\"Found a similar BMI match for index\", idx)\n",
    "        data.at[idx, \"bmi\"] = similar_patients.bmi.mean()\n",
    "    else:\n",
    "        print(\"Dropping index\", idx)\n",
    "        data = data.drop(idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age           0\n",
       "sex           0\n",
       "bmi           0\n",
       "weight       78\n",
       "height       80\n",
       "diagnosis     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Weight and height have a lot of missing data, so both columns are removed;\n",
    "#any row that still contains a NaN afterwards is removed as well\n",
    "data = data.drop([\"weight\", \"height\"], axis=1).dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age          0\n",
       "sex          0\n",
       "bmi          0\n",
       "diagnosis    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>diagnosis</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>121.000000</td>\n",
       "      <td>121.000000</td>\n",
       "      <td>121.000000</td>\n",
       "      <td>121.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>43.673554</td>\n",
       "      <td>0.636364</td>\n",
       "      <td>23.451756</td>\n",
       "      <td>1.876033</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>32.110260</td>\n",
       "      <td>0.483046</td>\n",
       "      <td>6.553994</td>\n",
       "      <td>1.158809</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>13.119534</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>5.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>17.485027</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>61.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>23.120000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>71.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>28.340000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>93.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>53.500000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              age         sex         bmi   diagnosis\n",
       "count  121.000000  121.000000  121.000000  121.000000\n",
       "mean    43.673554    0.636364   23.451756    1.876033\n",
       "std     32.110260    0.483046    6.553994    1.158809\n",
       "min      0.250000    0.000000   13.119534    0.000000\n",
       "25%      5.000000    0.000000   17.485027    1.000000\n",
       "50%     61.000000    1.000000   23.120000    2.000000\n",
       "75%     71.000000    1.000000   28.340000    2.000000\n",
       "max     93.000000    1.000000   53.500000    5.000000"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_diagnosis_data = data.diagnosis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Adding crackles and wheezes count\n",
    "#1. Load in training and test sound files\n",
    "#2. Put them in one dataframe\n",
    "#3. Iterate through them. \n",
    "#4. If the patient number is not one found in the \"data\" dataframe, discard it\n",
    "#5. Put the wheezes per soundfile/crackles per soundfile information into a python array:\n",
    "#[[patient, wheezes, crackles], ...]\n",
    "#6. Convert that list into a numpy array, and then into a pandas dataframe\n",
    "#7. Group by patient number by taking the mean\n",
    "#8. Sort by patient number\n",
    "#9. Put into the \"data\" dataframe.\n",
    "#Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Read the test and train soundfile lists (one CSV each) from the data root\n",
    "test_soundfiles, train_soundfiles = [\n",
    "    pd.read_csv(root + csv_name)\n",
    "    for csv_name in (\"test_soundfiles.csv\", \"train_soundfiles.csv\")\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Stack train rows on top of test rows; each part keeps its own row labels\n",
    "soundfiles = pd.concat(objs=[train_soundfiles, test_soundfiles], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101_1b1_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101_1b1_Pr_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102_1b1_Ar_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>104_1b1_Al_sc_Litt3200.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>104_1b1_Ar_sc_Litt3200.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>224_1b2_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>225_1b1_Pl_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>226_1b1_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>226_1b1_Ll_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>226_1b1_Pl_sc_LittC2SE.wav</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>917 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       filename\n",
       "0    101_1b1_Al_sc_Meditron.wav\n",
       "1    101_1b1_Pr_sc_Meditron.wav\n",
       "2    102_1b1_Ar_sc_Meditron.wav\n",
       "3    104_1b1_Al_sc_Litt3200.wav\n",
       "4    104_1b1_Ar_sc_Litt3200.wav\n",
       "..                          ...\n",
       "267  224_1b2_Al_sc_Meditron.wav\n",
       "268  225_1b1_Pl_sc_Meditron.wav\n",
       "269  226_1b1_Al_sc_Meditron.wav\n",
       "270  226_1b1_Ll_sc_Meditron.wav\n",
       "271  226_1b1_Pl_sc_LittC2SE.wav\n",
       "\n",
       "[917 rows x 1 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soundfiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[3,\n",
       " 5,\n",
       " 3,\n",
       " 4,\n",
       " 2,\n",
       " -447.0964,\n",
       " -487.81705,\n",
       " 60.012154,\n",
       " 147.43799,\n",
       " 98.916214,\n",
       " 94.1953,\n",
       " 61.320885,\n",
       " 47.155403,\n",
       " 53.77741,\n",
       " 19.767086,\n",
       " 24.540216,\n",
       " 3.4809492,\n",
       " 23.446045,\n",
       " -3.386144,\n",
       " 9.486736,\n",
       " 1.7393734]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Scratch check: append the flattened MFCC matrix of a single recording to a\n",
    "# small placeholder list to see how many values come back for n_mfcc=8.\n",
    "array = [3,5,3,4,2]\n",
    "SAMPLE_RATE = 16000\n",
    "filename = \"226_1b1_Pl_sc_LittC2SE.wav\"\n",
    "# Load at most the first 20 s of the file, resampled to SAMPLE_RATE.\n",
    "raw, sr = librosa.load(root + \"audio_and_txt_files/\" + filename, sr = SAMPLE_RATE, duration=20)\n",
    "# hop_length is the sample count of a 20 s window at 16 kHz (20*16000).\n",
    "# NOTE(review): sr is not passed to mfcc, so librosa presumably falls back to its\n",
    "# default sampling rate instead of SAMPLE_RATE - confirm whether that is intended.\n",
    "mfccs = librosa.feature.mfcc(raw, hop_length=20*16000, n_mfcc=8)\n",
    "# flatten() turns the MFCC matrix into a 1-D sequence before it is appended.\n",
    "array.extend(mfccs.flatten())\n",
    "array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "40000.0"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(20*16000)/8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101_1b1_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101_1b1_Pr_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102_1b1_Ar_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>104_1b1_Al_sc_Litt3200.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>104_1b1_Ar_sc_Litt3200.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>224_1b2_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>225_1b1_Pl_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>226_1b1_Al_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>226_1b1_Ll_sc_Meditron.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>226_1b1_Pl_sc_LittC2SE.wav</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>917 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       filename\n",
       "0    101_1b1_Al_sc_Meditron.wav\n",
       "1    101_1b1_Pr_sc_Meditron.wav\n",
       "2    102_1b1_Ar_sc_Meditron.wav\n",
       "3    104_1b1_Al_sc_Litt3200.wav\n",
       "4    104_1b1_Ar_sc_Litt3200.wav\n",
       "..                          ...\n",
       "267  224_1b2_Al_sc_Meditron.wav\n",
       "268  225_1b1_Pl_sc_Meditron.wav\n",
       "269  226_1b1_Al_sc_Meditron.wav\n",
       "270  226_1b1_Ll_sc_Meditron.wav\n",
       "271  226_1b1_Pl_sc_LittC2SE.wav\n",
       "\n",
       "[917 rows x 1 columns]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soundfiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# This might take a while.\n",
    "# One row per recording is appended to breathing_data_array:\n",
    "# [patient, crackles per second, wheezes per second, zero-crossing rate,\n",
    "#  mean/median/std of spectral centroid, RMS, 85% roll-off, 75% roll-off and\n",
    "#  spectral flatness, spectral entropy, flattened MFCC values]\n",
    "SAMPLE_RATE = 16000\n",
    "audio_dir = root + \"audio_and_txt_files/\"\n",
    "breathing_data_array = []\n",
    "print(\"START!\")\n",
    "for i, row in soundfiles.iterrows():\n",
    "    # Read the column by label; positional row[0] on a label-indexed Series is\n",
    "    # deprecated in recent pandas releases.\n",
    "    filename = row[\"filename\"]\n",
    "    print(\"Index \" + str(i))\n",
    "    print(filename)\n",
    "\n",
    "    # The patient id is the first \"_\"-separated token of the file name.\n",
    "    patient = int(filename.split(\"_\")[0])\n",
    "    # Skip recordings of patients that have no row in `data`.\n",
    "    if patient not in data.index:\n",
    "        continue\n",
    "\n",
    "    # The annotation file has the recording's name with .txt instead of .wav.\n",
    "    txt_filename = filename[:-4] + \".txt\"\n",
    "    annotations = pd.read_csv(audio_dir + txt_filename, names=[\"start\", \"stop\", \"crackle\", \"wheeze\"], sep=\"\\t\")\n",
    "    total_crackles = annotations.crackle.sum()\n",
    "    total_wheeze = annotations.wheeze.sum()\n",
    "    # Annotated time span: last row's \"stop\" minus first row's \"start\".\n",
    "    total_time = annotations.iloc[-1, 1] - annotations.iloc[0, 0]\n",
    "    crackle_per_sec = round(total_crackles/total_time, 4)\n",
    "    wheeze_per_sec = round(total_wheeze/total_time, 4)\n",
    "\n",
    "    # Extracting sound features\n",
    "    raw, sr = librosa.load(audio_dir + filename, sr=SAMPLE_RATE)\n",
    "    zcr = librosa.core.zero_crossings(raw).sum() / len(raw)\n",
    "    # The audio is passed by keyword and the actual sampling rate is handed to\n",
    "    # every sr-dependent feature; without sr=sr librosa would use its default\n",
    "    # rate instead of the SAMPLE_RATE the signal was resampled to.\n",
    "    sc = librosa.feature.spectral_centroid(y=raw, sr=sr)[0]\n",
    "    rms = librosa.feature.rms(y=raw)[0]\n",
    "    s_rf = librosa.feature.spectral_rolloff(y=raw, sr=sr, roll_percent=0.85)[0]\n",
    "    s_rf_75 = librosa.feature.spectral_rolloff(y=raw, sr=sr, roll_percent=0.75)[0]\n",
    "    sf = librosa.feature.spectral_flatness(y=raw)[0]\n",
    "    se = entropy.spectral_entropy(x = raw, sf = sr, method='fft')\n",
    "    # The hop is as long as the whole recording, so only very few frames come out.\n",
    "    # NOTE(review): data_columns reserves 16 MFCC labels for n_mfcc=8, presumably\n",
    "    # 8 coefficients x 2 frames after flatten() - confirm.\n",
    "    mfccs = librosa.feature.mfcc(y=raw, sr=sr, hop_length=len(raw), n_mfcc=8).flatten()\n",
    "\n",
    "    features = [patient, crackle_per_sec, wheeze_per_sec, zcr]\n",
    "    # Summarise every frame-wise feature by mean, median and standard deviation,\n",
    "    # in the same order as the corresponding labels in data_columns.\n",
    "    for series in (sc, rms, s_rf, s_rf_75, sf):\n",
    "        features.extend([series.mean(), np.median(series), series.std()])\n",
    "    features.append(se)\n",
    "    features.extend(mfccs)\n",
    "\n",
    "    breathing_data_array.append(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_columns = [\"patient\", \n",
    "                \"crackles\", \n",
    "                \"wheezes\", \n",
    "                \"zero_crossing_rate\", \n",
    "                \"spectral_centroid_mean\", \n",
    "                \"spectral_centroid_median\",\n",
    "                \"spectral_centroid_std\", \n",
    "                \"root_mean_square_mean\", \n",
    "                \"root_mean_square_median\", \n",
    "                \"root_mean_square_std\", \n",
    "                \"spectral_rolloff_85_mean\", \n",
    "                \"spectral_rolloff_85_median\", \n",
    "                \"spectral_rolloff_85_std\",\n",
    "               \"spectral_rolloff_75_mean\", \n",
    "                \"spectral_rolloff_75_median\", \n",
    "                \"spectral_rolloff_75_std\",\n",
    "               \"spectral_flatness_mean\",\n",
    "               \"spectral_flatness_median\",\n",
    "               \"spectral_flatness_std\",\n",
    "               \"spectral_entropy\",\n",
    "               \"mfcc1\",\n",
    "               \"mfcc2\",\n",
    "               \"mfcc3\",\n",
    "               \"mfcc4\",\n",
    "               \"mfcc5\",\n",
    "               \"mfcc6\",\n",
    "               \"mfcc7\",\n",
    "               \"mfcc8\",\n",
    "               \"mfcc9\",\n",
    "               \"mfcc10\",\n",
    "               \"mfcc11\",\n",
    "               \"mfcc12\",\n",
    "               \"mfcc13\",\n",
    "               \"mfcc14\",\n",
    "               \"mfcc15\",\n",
    "               \"mfcc16\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the collected feature rows into one NumPy array and display its shape.\n",
    "np_breathing_data_array = np.asarray(breathing_data_array)\n",
    "np_breathing_data_array.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label the feature matrix, index it by patient and average the rows that share\n",
    "# a patient id, leaving one row per patient.\n",
    "breathing_data_df = (\n",
    "    pd.DataFrame(np_breathing_data_array, columns=data_columns)\n",
    "    .set_index(\"patient\")\n",
    "    .groupby(by=\"patient\")\n",
    "    .mean()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join the per-patient sound features onto `data` by index.\n",
    "# `data` is overwritten with the result, so re-running this cell merges the\n",
    "# feature columns onto an already merged frame.\n",
    "data = data.merge(breathing_data_df, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the merged per-patient rows of `data` to the train and test patient\n",
    "# frames by index.\n",
    "train_data = train_patients.merge(data, left_index=True, right_index=True)\n",
    "test_data = test_patients.merge(data, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Split the target off the feature frames. pop() removes the column from\n",
    "# train_data / test_data in place, so a second run of this cell raises KeyError.\n",
    "target_column = \"diagnosis\"\n",
    "y_train, y_test = train_data.pop(target_column), test_data.pop(target_column)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#NORMALIZATION\n",
    "# Min-max scaling. The column minimum and range are taken from the training set\n",
    "# only and reused for the test set.\n",
    "train_min = train_data.min()\n",
    "train_range = train_data.max() - train_min\n",
    "\n",
    "norm_train_data = (train_data - train_min) / train_range\n",
    "norm_test_data = (test_data - train_min) / train_range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "norm_train_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Uncomment to save the training and test data for later use here\n",
    "\n",
    "#norm_train_data.to_csv(root + \"dataframes/norm_train_data_sound_features_03_31.csv\")\n",
    "#norm_test_data.to_csv(root + \"dataframes/norm_test_data_sound_features_03_31.csv\")\n",
    "\n",
    "#train_data.to_csv(root + \"dataframes/train_data_sound_features_03_31.csv\")\n",
    "#test_data.to_csv(root + \"dataframes/test_data_sound_features_03_31.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Uncomment to check if saving was successful\n",
    "\n",
    "#train_data = pd.read_csv(root + \"dataframes/train_data_sound_features_03_31.csv\")\n",
    "#test_data = pd.read_csv(root + \"dataframes/test_data_sound_features_03_31.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (venv)",
   "language": "python",
   "name": "venv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}