2637 lines (2636 with data), 89.2 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import librosa\n",
"import wave as wav\n",
"import tensorflow as tf\n",
"import scipy\n",
"import matplotlib.pyplot as plt\n",
"import librosa.display\n",
"import IPython.display as ipd\n",
"from sklearn import metrics\n",
"from sklearn.model_selection import cross_validate\n",
"import os\n",
"import statistics\n",
"import math\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"import graphviz\n",
"from sklearn import tree\n",
"\n",
"from sklearn import metrics\n",
"from sklearn.metrics import confusion_matrix, classification_report\n",
"import sys\n",
"\n",
"from sklearn.metrics import plot_confusion_matrix\n",
"import seaborn as sns \n",
"import matplotlib.pyplot as plt\n",
"import operator as op\n",
"from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict\n",
"from entropy import *\n",
"from random import shuffle\n",
"\n",
"from sklearn.svm import SVC\n",
"from sklearn.decomposition import PCA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: xgboost in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (1.0.2)\n",
"Requirement already satisfied: scipy in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (from xgboost) (1.4.1)\n",
"Requirement already satisfied: numpy in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (from xgboost) (1.18.1)\n",
"\u001b[33mWARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.\n",
"You should consider upgrading via the '/gpfs/hpc/home/rannilo/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
]
}
],
"source": [
"!pip install xgboost\n",
"import xgboost as xgb"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"root = \"respiratory_sound_database/\"\n",
"patient_diagnosis = pd.read_csv(root+\"patient_diagnosis.csv\", names=[\"patient\", \"diagnosis\"])\n",
"demographic_info = pd.read_csv(root+\"demographic_info.txt\", delimiter=\" \", names=[\"patient\", \"age\", \"sex\", \"bmi\", \"weight\", \"height\"])\n",
"\n",
"train_patients = pd.read_csv(root + \"train_patients.csv\")\n",
"test_patients = pd.read_csv(root + \"test_patients.csv\")\n",
"\n",
"train_patients.set_index(\"patient\", inplace=True)\n",
"test_patients.set_index(\"patient\", inplace=True)\n",
"patient_diagnosis.set_index(\"patient\", inplace=True)\n",
"demographic_info.set_index(\"patient\", inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: [101, 102, 103, 104, 105]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_patients.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>185</th>\n",
" </tr>\n",
" <tr>\n",
" <th>186</th>\n",
" </tr>\n",
" <tr>\n",
" <th>187</th>\n",
" </tr>\n",
" <tr>\n",
" <th>188</th>\n",
" </tr>\n",
" <tr>\n",
" <th>189</th>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: [185, 186, 187, 188, 189]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_patients.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>Asthma</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>URTI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" diagnosis\n",
"patient \n",
"101 URTI\n",
"102 Healthy\n",
"103 Asthma\n",
"104 COPD\n",
"105 URTI"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patient_diagnosis.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>19.0</td>\n",
" <td>99.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>9.8</td>\n",
" <td>73.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>33.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>28.47</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>32.0</td>\n",
" <td>135.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height\n",
"patient \n",
"101 3.00 F NaN 19.0 99.0\n",
"102 0.75 F NaN 9.8 73.0\n",
"103 70.00 F 33.00 NaN NaN\n",
"104 70.00 F 28.47 NaN NaN\n",
"105 7.00 F NaN 32.0 135.0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"demographic_info.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>19.0</td>\n",
" <td>99.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>9.8</td>\n",
" <td>73.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>33.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Asthma</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>28.47</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>32.0</td>\n",
" <td>135.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>222</th>\n",
" <td>60.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>223</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224</th>\n",
" <td>10.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>32.3</td>\n",
" <td>143.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>225</th>\n",
" <td>0.83</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>7.8</td>\n",
" <td>74.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>226</th>\n",
" <td>4.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>16.7</td>\n",
" <td>103.0</td>\n",
" <td>Pneumonia</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>126 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height diagnosis\n",
"patient \n",
"101 3.00 F NaN 19.0 99.0 URTI\n",
"102 0.75 F NaN 9.8 73.0 Healthy\n",
"103 70.00 F 33.00 NaN NaN Asthma\n",
"104 70.00 F 28.47 NaN NaN COPD\n",
"105 7.00 F NaN 32.0 135.0 URTI\n",
"... ... ... ... ... ... ...\n",
"222 60.00 M NaN NaN NaN COPD\n",
"223 NaN NaN NaN NaN NaN COPD\n",
"224 10.00 F NaN 32.3 143.0 Healthy\n",
"225 0.83 M NaN 7.8 74.0 Healthy\n",
"226 4.00 M NaN 16.7 103.0 Pneumonia\n",
"\n",
"[126 rows x 6 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.concat([demographic_info, patient_diagnosis], axis=1)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>19.00</td>\n",
" <td>99.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>9.80</td>\n",
" <td>73.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>33.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Asthma</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>28.47</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>32.00</td>\n",
" <td>135.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>73.00</td>\n",
" <td>F</td>\n",
" <td>21.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>75.00</td>\n",
" <td>F</td>\n",
" <td>33.70</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108</th>\n",
" <td>3.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LRTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>84.00</td>\n",
" <td>F</td>\n",
" <td>33.53</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>75.00</td>\n",
" <td>M</td>\n",
" <td>25.21</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>63.00</td>\n",
" <td>M</td>\n",
" <td>28.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Bronchiectasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>60.00</td>\n",
" <td>M</td>\n",
" <td>22.86</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>58.00</td>\n",
" <td>M</td>\n",
" <td>28.41</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>77.00</td>\n",
" <td>M</td>\n",
" <td>23.12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115</th>\n",
" <td>0.58</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>7.14</td>\n",
" <td>64.0</td>\n",
" <td>LRTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>56.00</td>\n",
" <td>M</td>\n",
" <td>28.58</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Bronchiectasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>68.00</td>\n",
" <td>M</td>\n",
" <td>24.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>81.00</td>\n",
" <td>M</td>\n",
" <td>36.76</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>2.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>15.20</td>\n",
" <td>94.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>120</th>\n",
" <td>78.00</td>\n",
" <td>M</td>\n",
" <td>35.14</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>121</th>\n",
" <td>13.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>65.00</td>\n",
" <td>170.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>122</th>\n",
" <td>66.00</td>\n",
" <td>M</td>\n",
" <td>33.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Pneumonia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>123</th>\n",
" <td>5.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>25.00</td>\n",
" <td>125.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124</th>\n",
" <td>65.00</td>\n",
" <td>M</td>\n",
" <td>29.07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>125</th>\n",
" <td>14.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>62.00</td>\n",
" <td>170.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>1.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>10.18</td>\n",
" <td>80.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127</th>\n",
" <td>2.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>12.60</td>\n",
" <td>98.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>128</th>\n",
" <td>65.00</td>\n",
" <td>F</td>\n",
" <td>24.30</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>129</th>\n",
" <td>6.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>23.00</td>\n",
" <td>119.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>130</th>\n",
" <td>85.00</td>\n",
" <td>F</td>\n",
" <td>17.10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>3.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>14.00</td>\n",
" <td>97.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>132</th>\n",
" <td>71.00</td>\n",
" <td>M</td>\n",
" <td>34.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>133</th>\n",
" <td>68.00</td>\n",
" <td>M</td>\n",
" <td>27.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>134</th>\n",
" <td>61.00</td>\n",
" <td>M</td>\n",
" <td>32.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>135</th>\n",
" <td>70.00</td>\n",
" <td>M</td>\n",
" <td>21.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Pneumonia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>136</th>\n",
" <td>5.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>16.20</td>\n",
" <td>110.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>137</th>\n",
" <td>4.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>18.00</td>\n",
" <td>104.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>138</th>\n",
" <td>56.00</td>\n",
" <td>F</td>\n",
" <td>21.60</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>139</th>\n",
" <td>61.00</td>\n",
" <td>M</td>\n",
" <td>28.68</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>79.00</td>\n",
" <td>F</td>\n",
" <td>23.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Pneumonia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>141</th>\n",
" <td>66.00</td>\n",
" <td>M</td>\n",
" <td>22.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>78.00</td>\n",
" <td>M</td>\n",
" <td>26.10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>143</th>\n",
" <td>0.25</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>8.24</td>\n",
" <td>68.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>3.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>16.70</td>\n",
" <td>100.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>69.00</td>\n",
" <td>M</td>\n",
" <td>23.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>67.00</td>\n",
" <td>M</td>\n",
" <td>28.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>77.00</td>\n",
" <td>M</td>\n",
" <td>25.70</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>148</th>\n",
" <td>4.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>33.00</td>\n",
" <td>110.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>149</th>\n",
" <td>0.67</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>9.50</td>\n",
" <td>70.0</td>\n",
" <td>Bronchiolitis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150</th>\n",
" <td>0.67</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>8.12</td>\n",
" <td>74.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height diagnosis\n",
"patient \n",
"101 3.00 F NaN 19.00 99.0 URTI\n",
"102 0.75 F NaN 9.80 73.0 Healthy\n",
"103 70.00 F 33.00 NaN NaN Asthma\n",
"104 70.00 F 28.47 NaN NaN COPD\n",
"105 7.00 F NaN 32.00 135.0 URTI\n",
"106 73.00 F 21.00 NaN NaN COPD\n",
"107 75.00 F 33.70 NaN NaN COPD\n",
"108 3.00 M NaN NaN NaN LRTI\n",
"109 84.00 F 33.53 NaN NaN COPD\n",
"110 75.00 M 25.21 NaN NaN COPD\n",
"111 63.00 M 28.40 NaN NaN Bronchiectasis\n",
"112 60.00 M 22.86 NaN NaN COPD\n",
"113 58.00 M 28.41 NaN NaN COPD\n",
"114 77.00 M 23.12 NaN NaN COPD\n",
"115 0.58 M NaN 7.14 64.0 LRTI\n",
"116 56.00 M 28.58 NaN NaN Bronchiectasis\n",
"117 68.00 M 24.40 NaN NaN COPD\n",
"118 81.00 M 36.76 NaN NaN COPD\n",
"119 2.00 F NaN 15.20 94.0 URTI\n",
"120 78.00 M 35.14 NaN NaN COPD\n",
"121 13.00 F NaN 65.00 170.0 Healthy\n",
"122 66.00 M 33.00 NaN NaN Pneumonia\n",
"123 5.00 M NaN 25.00 125.0 Healthy\n",
"124 65.00 M 29.07 NaN NaN COPD\n",
"125 14.00 M NaN 62.00 170.0 Healthy\n",
"126 1.00 F NaN 10.18 80.0 Healthy\n",
"127 2.00 M NaN 12.60 98.0 Healthy\n",
"128 65.00 F 24.30 NaN NaN COPD\n",
"129 6.00 M NaN 23.00 119.0 URTI\n",
"130 85.00 F 17.10 NaN NaN COPD\n",
"131 3.00 M NaN 14.00 97.0 URTI\n",
"132 71.00 M 34.00 NaN NaN COPD\n",
"133 68.00 M 27.40 NaN NaN COPD\n",
"134 61.00 M 32.00 NaN NaN COPD\n",
"135 70.00 M 21.00 NaN NaN Pneumonia\n",
"136 5.00 M NaN 16.20 110.0 Healthy\n",
"137 4.00 M NaN 18.00 104.0 URTI\n",
"138 56.00 F 21.60 NaN NaN COPD\n",
"139 61.00 M 28.68 NaN NaN COPD\n",
"140 79.00 F 23.00 NaN NaN Pneumonia\n",
"141 66.00 M 22.40 NaN NaN COPD\n",
"142 78.00 M 26.10 NaN NaN COPD\n",
"143 0.25 F NaN 8.24 68.0 Healthy\n",
"144 3.00 M NaN 16.70 100.0 Healthy\n",
"145 69.00 M 23.40 NaN NaN COPD\n",
"146 67.00 M 28.00 NaN NaN COPD\n",
"147 77.00 M 25.70 NaN NaN COPD\n",
"148 4.00 M NaN 33.00 110.0 URTI\n",
"149 0.67 M NaN 9.50 70.0 Bronchiolitis\n",
"150 0.67 F NaN 8.12 74.0 URTI"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(50)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"age 1\n",
"sex 1\n",
"bmi 51\n",
"weight 82\n",
"height 84\n",
"diagnosis 0\n",
"dtype: int64\n"
]
}
],
"source": [
"print(data.isna().sum())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#Dropping one patient with NA age and sex\n",
"data.dropna(thresh=2, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"age 0\n",
"sex 0\n",
"bmi 50\n",
"weight 81\n",
"height 83\n",
"diagnosis 0\n",
"dtype: int64\n"
]
}
],
"source": [
"print(data.isna().sum())\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 125\n",
"sex 125\n",
"bmi 75\n",
"weight 44\n",
"height 42\n",
"diagnosis 125\n",
"dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.count()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Count of patients' diagnoses\")"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAF+CAYAAABauMmmAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de5RkZX3u8e8jA4IIImEkCMIQQQkGAR0NqDFGY6IxCjGK4g09KEmOFzyeeAJZHqPGKJpExUuyQkQlSozGgKAkHglKvIE6CMrViAhyZ0CQi4oCv/PH3g1F0z3TzFTV273n+1mrV9e+VNdvd8/UU+/e737fVBWSJGm67tO6AEmSNkQGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDRjA0hgl+YMklya5OcneE36tHfvX2WiSr7M+kjwpyWUjy+cmeVLDkqRFwwDWopTkBUlW9QFzZZL/SPKEKbxuJdllPX7E3wCvqqr7V9WZ46oLIMnFSX57Zrmqfti/zu1j+NmnJnn5Avf9SJKXrsvrVNUjqurUdXmuNDQGsBadJK8D3gO8DdgW2BH4O2C/lnUt0E7Aua2LkLT4GcBaVJI8AHgL8MqqOq6qbqmqX1TVZ6rq9f0+903yniRX9F/vSXLffttLk3xl1s+8s1Xbt94+kOSkJDcl+XqSh/bbvtQ/5dt9y/t5c9R3nyRvSHJJkmuS/FOSB/Q13Qxs1D//+/McXyV5TZKLklyb5K+T3Kff9tAkX0hyXb/t2CRb9ds+SvdB5DN9bf8nyYr+5y2b+d0lObo/Y3B5krfOnJ6e+b0k+Zsk1yf5QZKn99v+CvgN4P39z35/Ou/uj/HGJGcn+bUF/P0263/H1yc5D3jMrO13tuKTPDbJaUlu6Gt+f5JNRvb9nSTfTfLjJH+X5L9mWulrOp5++4OTnJjkR0kuTPKKkW2P7c+u3Jjk6iTvGtm2T5Kv9TV9e/R0ef+aF/X/bn6Q5IVr+31Ia1RVfvm1aL6ApwG3AcvWsM9bgNOBBwHLga8Bf9lveynwlVn7F7BL//gjwHXAY4FlwLHAv8y17zyv/T+AC4FfAe4PHAd89F48v4AvAlvTBep/Ay/vt+0CPBW4b39cXwLeM/Lci4HfHlle0f+8Zf3y8cA/AJv3v5tvAH808nv5BfAKug8JfwJcAaTffupMHf3y7wJnAFsBAX4V2G4Bf78jgC/3x/cQ4BzgsrmOAXg0sE//d1gBnA+8tt+2DXAj8Ox++6F9/S9f4PF8ie6syabAXsBq4Mn9ttOAF/eP7w/s0z/evv+38Xt0jZOn9svL+9/pjcDD+323Ax7R+v+LX0v7yxawFptfAq6tqtvWsM8LgbdU1TVVtRp4M/Die/Eax1fVN/rXOJbuDXqhXgi8q6ouqqqbgcOB58+0QhfoHVX1o6r6Id2p9gMBqurCqjq5qm7tj+tdwG8u5Acm2ZYuOF5b3VmDa4B3A88f2e2SqvrH6q4ZH0MXItvO8yN/AWwB7EYXaudX1ZULKOUA4K/647sUeO98O1bVGVV1elXdVlUX0314mDne3wPOre4syG39z7lq1o+Y83iSPAR4PPBnVfWzqjoL+CDwkpFj2yXJNlV1c1Wd3q9/EfDvVfXvVXVHVZ0MrOprAbgD+LUkm1XVlVXlpQatFwNYi811wDZrCbQHA5eMLF/Sr1uo0Tfyn9C1ghZqrtdexvxBNpdLZz3/wdCFaJJ/6U8f3wh8jK4luBA7ARsDV/anT2+gC7QHjexz53FX1U/6h3Mee1V9AXg/8AHgmiRHJdlyAXU8mHse35ySPCzJZ5Nc1R/v27jreO/2c6qqgMtm/Yj5jufBwI+q6qZZdWzfPz4YeBhwQZJvJvn9fv1OwHNnfn/97/AJdC3/W4DnAX9M9zs+Kclua/xNSGthAGuxOQ24Fdh/DftcQfdmOWPHfh3ALcD9ZjYk+eUx1zfXa98GXH0vfsZDZj1/pva30Z1S3qOqtqRrkWVk3zVNXXYp3e9tm6raqv/asqoescCa7vGzq+q9VfVoYHe6wHr9An7Oldzz+Obz98AFwK798f45dx3vlcAOMzsmyejyWlwBbJ1ki1l1XA5QVd+rqgPpPpy8A/hUks3pfocfHfn9bVVVm1fVEf3z/l9VPZWupX0B8I8LrEeakwGsRaWqfgy8EfhAkv2T3C/JxkmenuSd/W4fB96QZHmSbfr9P9Zv+zbwiCR7JdkUeNO9LOFquuu78/k48L+S7Jzk/nSh+Ym1nDKf7fVJHtifKj0U+ES/fgvgZuDHSbbnnoE3b2396eHPA3+bZMt0ncUemmRBp7Bn/+wkj0ny60k2pvtQ8zO6U7Br80ng8P74dgBevYZ9t6C7rnpz35r8k5FtJwF79P8GlgGvBBb0Yao/9f014O1JNk3ySLpW78f6Y3tRkuVVdQdwQ/+0O/rtz0zyu0k26p/7pCQ79Gcn9uuD+la6v9NCfh/SvAxgLTpV9bfA64A30HWeuRR4FfDpfpe30l2b+w5wNvCtfh1V9d90nbT+E/gecLce0QvwJuCY/hTkAXNs/xDwUbpOPj+gC6Y1hcxcTqDr4HQWXdAc3a9/M/Ao4Mf9+uNmPe/tdB88bkjyp3P83JcAmwDnAdcDn6JrrS3EkcBz+h7F7wW2pGvhXU93+vY64K8X8HPe3O//A7oPBB9dw75/CrwAuKl/rZkPIlTVtcBzgXf2r7073d/81gUez4F0HbuuoOuc9hdV9Z/9tqcB56brtX4k8Pyq+mkf3PvRtcRn/t29nu598j50/yavAH5Ed6169AODdK/N9BiUNAVJiu6U64Wta1lK0t2qdRnwwqr6Yut6pHGwBSxpUepPBW+V7h7vmevDp6/ladKSYQBLWqz2Bb4PXAs8E9i/qn7atiRpfDwFLUlSA7aAJUlqwACWJKmBezN83nrbZpttasWKFdN8SUmSmjnjjDOurarlc22bagCvWLGCVatWTfMlJUlqJsm8w7F6ClqSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqYKpjQa+rFYedNNXXu/iIZ0z19SRJGx5bwJIkNWAAS5LUgAEsSVIDBrAkSQ0YwJIkNWAAS5LUgAEsSVIDCwrgJFsl+VSSC5Kcn2TfJFsnOTnJ9/rvD5x0sZIkDcVCW8BHAp+rqt2APYHzgcOAU6pqV+CUflmSJC3AWgM4yQOAJwJHA1TVz6vqBmA/4Jh+t2OA/SdVpCRJQ7OQFvDOwGrgw0nOTPLBJJsD21bVlf0+VwHbTqpISZKGZiEBvAx4FPD3VbU3cAuzTjdXVQE115OTHJJkVZJVq1evXt96JUkahIUE8GXAZVX19X75U3SBfHWS7QD679fM9eSqOqqqVlbVyuXLl4+jZkmSlry1BnBVXQVcmuTh/aqnAOcBJwIH9esOAk6YSIWSJA3QQqcjfDVwbJJNgIuAl9GF9yeTHAxcAhwwmRIlSRqeBQVwVZ0FrJxj01PGW44kSRsGR8KSJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKmBZQvZKcnFwE3A7cBtVbUyydbAJ4AVwMXAAVV1/WTKlCRpWO5NC/i3qmqvqlrZLx8GnFJVuwKn9MuSJGkB1ucU9H7AMf3jY4D9178cSZI2DAsN4AI+n+SMJIf067atqiv7x1cB2871xCSHJFmVZNXq1avXs1xJkoZhQdeAgSdU1eVJHgScnOSC0Y1VVUlqridW1VHAUQArV66ccx9JkjY0C2oBV9Xl/fdrgOOBxwJXJ9kOoP9+zaSKlCRpaNYawEk2T7LFzGPgd4BzgBOBg/rdDgJOmFSRkiQNzUJOQW8LHJ9kZv9/rqrPJfkm8MkkBwOXAAdMrkxJkoZlrQFcVRcBe86x/jrgKZMoSpKkoXMkLEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYWHMBJNkpyZpLP9ss7J/l6kguTfCLJJpMrU5KkYbk3LeBDgfNHlt8BvLuqdgGuBw4eZ2GSJA3ZggI4yQ7AM4AP9ssBngx8qt/lGGD/SRQoSdIQLbQF/B7g/wB39Mu/BNxQVbf1y5cB28/1xCSHJFmVZNXq1avXq1hJkoZirQGc5PeBa6rqjHV5gao6qqpWVtXK5cuXr8uPkCRpcJYtYJ/HA89K8nvApsCWwJHAVkmW9a3gHYDLJ1emJEnDstYWcFUdXlU7VNUK4PnAF6rqhcAXgef0ux0EnDCxKiVJGpj1uQ/4z4DXJbmQ7prw0eMpSZKk4VvIKeg7VdWpwKn944uAx46/JEmShs+RsCRJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpAQNYkqQGDGBJkhpYawAn2TTJN5J8O8m5Sd7cr985ydeTXJjkE0k2mXy5kiQNw0JawLcCT66qPYG9gKcl2Qd4B/DuqtoFuB44eHJlSpI0LGsN4Orc3C9u3H8V8GTgU/36Y4D9J1KhJEkDtKBrwEk2SnIWcA1wMvB94Iaquq3f5TJg+3mee0iSVUlWrV69ehw1S5K05C0ogKvq9qraC9gBeCyw20JfoKqOqqqVVbVy+fLl61imJEnDcq96QVfVDcAXgX2BrZIs6zftAFw+5tokSRqshfSCXp5kq/7xZsBTgfPpgvg5/W4HASdMqkhJkoZm2dp3YTvgmCQb0QX2J6vqs0nOA/4lyVuBM4GjJ1inJEmDstYArqrvAHvPsf4iuuvBkiTpXnIkLEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKmBhUzGoAlbcdhJU3uti494xtReS5I0P1vAkiQ1YABLktSAASxJUgMGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDRjAkiQ1YABLktSAASxJUgMGsCRJDaw1gJM8JMkXk5yX5Nwkh/brt05ycpLv9d8fOPlyJUkahoW0gG8D/ndV7Q7sA7wyye7AYcApVbUrcEq/LEmSFmCtAVxVV1bVt/rHNwHnA9sD+wHH9LsdA+w/qSIlSRqae3UNOMkKYG/g68C2VXVlv+kqYNt5nnNIklVJVq1evXo9SpUkaTgWHMBJ7g/8G/DaqrpxdFtVFVBzPa+qjqqqlVW1cvny5etVrCRJQ7GgAE6yMV34HltVx/Wrr06yXb99O+CayZQoSdLwLKQXdICjgfOr6l0jm04EDuofHwScMP7yJEkapmUL2OfxwIuBs5Oc1a/7c+AI4JNJDgYuAQ6YTImSJA3PWgO4qr4CZJ7NTxlvOZIkbRgcCUuSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqwACWJKkBA1iSpAYMYEmSGjCAJUlqYFnrAjRsKw47aaqvd/ERz5jq60nSurIFLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDVgAEuS1MBaAzjJh5Jck+SckXVbJzk5yff67w+cbJmSJA3LQlrAHwGeNmvdYcApVbUrcEq/LEmSFmitAVxVXwJ+NGv1fsAx/eNjgP3HXJckSYO2rteAt62qK/vHVwHbjqkeSZI2COvdCauqCqj5tic5JMmqJKtWr169vi8nSdIgrGsAX51kO4D++zXz7VhVR1XVyqpauXz58nV8OUmShmVdA/hE4KD+8UHACeMpR5KkDcNCbkP6OHAa8PAklyU5GDgCeGqS7wG/3S9LkqQFWra2HarqwHk2PWXMtUiStMFwJCxJkhowgCVJasAAliSpAQNYkqQGDGBJkhowgCVJasAAliSpgbXeByxpfisOO2mqr3fxEc+Y6utJmhxbwJIkNWAAS5LUgAEsSVIDXgOWNK+hX+Me+vFpcbMFLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDVgAEuS1IABLElSAwawJEkNGMCSJDXgfMCSNFDTnO/YuY7vPVvAkiQ1YABLktSAASxJUgMGsCRJDaxXJ6wkTwOOBDYCPlhVR4ylKkmS1mCaHcxgMp3M1rkFnGQj4APA04HdgQOT7D6uwiRJGrL1OQX9WODCqrqoqn4O/Auw33jKkiRp2NYngLcHLh1ZvqxfJ0mS1iJVtW5PTJ4DPK2qXt4vvxj49ap61az9DgEO6RcfDnx33cu917YBrp3i603bkI9vyMcGHt9S5/EtXdM+tp2qavlcG9anE9blwENGlnfo191NVR0FHLUer7POkqyqqpUtXnsahnx8Qz428PiWOo9v6VpMx7Y+p6C/CeyaZOckmwDPB04cT1mSJA3bOreAq+q2JK8C/h/dbUgfqqpzx1aZJEkDtl73AVfVvwP/PqZaJqHJqe8pGvLxDfnYwONb6jy+pWvRHNs6d8KSJEnrzqEoJUlqwACWJKkBA1iSpAbWqxPWYpLk4XQDfuzWrzof+MeqmubAHxOX5JnASVV1R+taxiXJo9a0vaq+Na1aJinJ44GzquqWJC8CHgUcWVWXNC5tLJJsDvy0qu5I8jC6/4v/UVW/aFya1iDJs9e0vaqOm1Ytk5LkgcCuwKYz66rqS+0q6gyiE1aSfYHjgH8AzgQC7A28Anh2VZ3esLyxSvIxYF/g3+hu/bqgcUnrLckX17C5qurJUytmgpJ8B9gTeCTwEeCDwAFV9Zst6xqXJGcAvwE8EPgq3VgBP6+qFzYtbEyS7AO8D/hVYBO62y9vqaotmxa2npJ8eA2bq6r+x9SKmYAkLwcOpRss6ixgH+C0xfC+MpQA/g/gHVV16qz1vwkcVlVPb1LYhCTZEjgQeBlQwIeBj1fVTU0LW0dJ9q2q01rXMWlJvlVVj0ryRuDyqjp6Zl3r2sZh5PheDWxWVe9MclZV7dW6tnFIsopuwKF/BVYCLwEeVlWHNy1sPSV59hBaufNJcjbwGOD0qtoryW7A26pqjS3/aRjKNeCHzg5fgKr6L+BXpl/OZFXVjcCn6Gag2g74A+Bb/RvfUvSB1gVMyU1JDgdeBJyU5D7Axo1rGqf0Z6NeCMxM1rpRw3rGrqouBDaqqtur6sPA01rXNAZvaF3AhP2sqn4GkOS+/VnDhzeuCRjONeA1tfxumVoVU5DkWXQt312AfwIeW1XXJLkfcB7dKbKlJq0LmJLnAS8ADq6qq5LsCPx145rG6bXA4cDxVXVukl8B1nR5Yan5ST/s7llJ3glcyXAaMUN2WZKtgE8DJye5HlgU/S6Gcgr6GrrW4D020V1j23bKJU1MkmOAo+fqQJDkKVV1SoOy1kuSG4B5O0RU1bOmWI40pyQ7AdfQnbX4X8ADgL/rW8VLVpKfAHMdQ+iuAT9yyiVNTH9Z8gHA5/p57NvWM5AAPmhN26vqmGnVonsvyfeAl8+3vb+UsGQl+UpVPSHJTXTX7O/cRPcGt9Q78bynql6b5DPc/fgAP0AtdknOBX5vvu1D6KXf94J+CCNnfRfD3RWDCGCAJMuBnYALq+qG1vVMSn/LwDuAB9G9gS/5N/EhdUTaECV5dFWd0bcu7mEAH6A+WVUH9J155vqAsaRbiEnOrKq9W9cxKUn+EngpcBEwc/vmori7YhDXgPtu5m8Dvg/snOSQqhrq1IjvBJ5ZVee3LmSMLm5dwDQkeShwWVXdmuRJdLcj/dNS/8BYVWf0D/eqqiNHtyU5FFjSAUx3CwvA7zetYnK+2rqACTuArqNu81POsw2lA8FrgUdU1b7A4+g6ggzV1QMLX+a7HSDJU5OcPO16JujfgNuT7EI3I8tDgH9uW9JYzXUp6KXTLmLcqurK/vslc321rm99VdWrkmyUZJuZdUk2SXJIkiG815wDbNW6iLkMogVMd7P/aoCquijJfVsXNG4jo9WsSvIJuh59t85sX8r38SX5LbpBVB5Md1zvoLu3OcBfNSxt3O7o59H+A+B9VfW+JGe2Lmp9JTmQrnf3zklGzzxtAfyoTVXjN8TLPwBJnkf3gfCWvj/GXwEfohtIZQiDqLwdODPJOdz9PbN534ShBPAOSd4733JVvaZBTeP2zJHHPwF+Z2S56EYCW6reRTeM6GnA0/vvh1XV+5tWNX6/6MPqIO76ew7hPuCv0d2Ssw3wtyPrbwK+06SiyRji5R+A/ws8uqou7IeFPQ14TlV9pnFd43IM3Qens7nrGvCiMIhOWBtSL+gkj6+qr65t3VIyuxNWku9W1aK4UX6ckuwO/DHdMHgfT7Iz3W1y72hcmhYgyVer6vGt6xi3Of7/nVNVv9aypnFK8s2qekzrOuYyiAAeleT+AFV1c+taJmGuHsNLvRdxkouAPx1Z9Tejy0v59PqGYOi3Wc1IciTwywzo8g9AksvozkLNeN3oclW96x5PWkKSvIvu73Uid/+7eRvSuCT5E7rOV5v3q26mGx/679pVNT79EH+Po+tw9u6RTVsCf1BVezYpbAz6weBnv3HPWPKDwc9Isivd9ajdufusLIMbLnWI5pm0YMn/+0zyF2vYXFX1lqkVMwHzTPbibUjjkuQNdOH0pKq6qF/3K8CRSbauqrc2LXA8NgHuT/c322Jk/Y3Ac5pUND7nzFq+A7gW+EpV/aBBPZPyYeAv6D5A/RbdkKJL/k6EJFuvaXtVDaIjVlW9rHUNk1BVb55vW5LXTrOWSaiq32pdw3wG0QJO8l1gz5kBt0fWbwZ8u6oe1qay8Uuy0xBufRg1zyfwrYHfBd5UVXMNM7rkJDmjqh6d5Oyq2mN0Xeva1keSH9CdwZhrTO8aSgs/yQ50Y63PXAf+MnBoVV3WrqrJSvLDqtqxdR3ro78r5g+BFdx9JKzmLftBtIDp/pP/bI6VP02yqHq9ravRYf6Se77PLYYu9etqvk/gfcvqP5l7nO+l6NZ+BqTvJXkVcDndWY0lrap2bl3DlHyY7r7t5/bLL+rXPbVZRZM3hIlSTgB+DJzByDXgxWAoAXz5XBMRJHkK3e0RQ/A3rQuYtqr6Ueb6tLF0HQrcD3gN8Jd0p6Ff0rSiMetn63piv3hqVX22ZT1jtry6KQhnfGQIp2jXYumfIoUdqmpRThs5lAB+DXBCkq/QfcqBbsLsxwP7NatqjJb6eLrroh+g4/rWdYzRiqr6Jl0HwZcBJHku8PWmVY1JkiPoJj4/tl91aJLHVdWfNyxrnK5L8iLg4/3ygcB1DesZizl6r9+5CdhsyuVMwteS7FFVZ7cuZLahXAPehe72gIcBj+hXnwd8F7iyqr7fqrZxG2JP2nkGud8auAJ4SXUTaC95Q7yFbFSS79CNB31Hv7wRcOZSn6xgRj8d4fuAfen+vX4NeE1V/bBpYZrTyPvKMmBXuskYbmURTbM4lBbwe4DDq+pDoyuT7NFve+acz1qahtiTdvYg9wVcV1W3tChm3JI8nW66t+1njdi2JXBbm6omZivuGn7yAS0LGbe+8+OS7WuxAVr0k2cMJYC3nev0QlWdnWTF9MuZqM2q6pQk6d8Q3pTkDOCNrQtbV0Pr1T2HK4BVdG/eZ4ysv4luYvehmBlz94t0rYwnAoe1LWl8+pHLXs09e9MayovQzPtKko9W1YtHtyX5KPDiOZ84RUMJ4DXNdDGEaxijBtmTdsiq6tvAt5McD9xSVbfDnadoBzNxSD+85ql014EB/qyqrmpY0rh9Gjga+AyLbExhrdEjRhf6/3eL4ta/pX7qcsaqJK+YvbKfJ/iMOfZfykZ70j6a7laINY6FrUXj89z9A+FmdLdZLWlJduu/PwrYDris/3pwv24oflZV762qL1bVf818tS5Kc0tyeN/B7JFJbuy/bgKuobs1qbmhdMLaFjge+Dl37wW9Cd0wjUP6FA5AkvtV1U9a16GFS3JWVe21tnVLTZKjquqQxTzk3zgkeQFdZ57Ps8jGFNb8kry9qhblHPGDOAVdVVcDj+tvW5mZxeOkqvpCw7Imoh8T+mi60847JtkT+KOq+p9tK9MC3JLkUTNv2EkeDfy0cU3rraoO6b8v2iH/xmQPuuuGT+auU9DVL2vx+mySzavqlv42skcBRy6GvieDaAFvSJJ8nW7s5xOrau9+3aCmDxuqJI+hG9XrCrpOSr8MPK+qBnOZJMnjuGcnpX9qVtAYJbkQ2L2qft66Fi1cf3vcnsAjgY8AH6SbBvQ3W9YFA2kBb2iq6tJZA0Td3qoWLVxVfbO/Xjoz1/F3q+oXLWsap75n6UOBs7jr32QBgwhguklDtqK7hqil47aqqiT7Ae+vqqOTHNy6KDCAl6JL+1ZGJdmYrlPW+Y1r0gIkuR/dXKs7VdUrkuya5OEDGq5xJV0Lcain1bYCLkjyTe5+DdjbkBa3m5IcTnf54Df6u0gWRfYtiiJ0r/wxcCSwPd0tSJ8HXtm0Ii3Uh+k6Ce7bL18O/CswlAA+h+60+lDGX59tTfPmavF6HvAC4GVVdVWSJ3LXvPFNeQ1YmpIkq8Vc35YAAAWYSURBVKpqZZIzR67ff7uq9mxd2/oYmalrC2Av4BvYQtQikmRvuhB+LvAD4Liqel/bqmwBLxlJ3scaZiapqtdMsRytm5/3c1TPTCv5UBbZ9GjraIOYqWvWpAWbABvTDayyZbuqNJ8kD6ObMONA4FrgE3SNzkXTW98AXjpWjTx+M54OW4r+Avgc8JAkx9LN1vXSphWNwcxgFP1QjVfOzM3df9jYtmVt41RVW8w87qfJ3A/Yp11FWosLgC8Dv19VFwIkWVRDv3oKegkaPYWppSXJL9G9aQc4vaqubVzS2CRZBTxu5jadJJsAX62qx6z5mUuX/xcXryT7A8+n+6D7ObpbAD9YVTs3LWyELeClyU9NS0iS3arqgpFhGWc6Ke2YZMcBjaS0bPQe2ar6eR/Cg5Dk2SOL96Hr9f2zRuVoLarq08Cnk2xOd7bitcCDkvw9cHxVfb5pgRjA0jS8DjgE+Ns5tg1pJKXVSZ5VVScC9PddDqaFz92nNb0NuJjujV2LWD+t6T8D/5zkgXQdsf6M7g6SpjwFvUTM6gByP2BmHOiZyaXtCKKm+k5lxwIPpvt3eSnwkpnrb5LuzgCWpmjIQzXOSHJ/gKq6uXUt49T3qv17uvnHfy3JI4FnVdVbG5emJcoAlqZkvqEah3ILWZL7An/IPT9gvKVVTeOU5L+A1wP/4DjsGgevAUvTM/ShGk8Afkw32tcQ7m+e7X5V9Y1Z47Df1qoYLX0GsDQ9Qx+qcYeqelrrIibo2v4698xAKs9huH9LTYEBLE3YrKEaz0sy1KEav5Zkj6o6u3UhE/JK4ChgtySX0w1p+KK2JWkp8xqwNGFJ1jjv6MxIUktdkvOAXeiC6Vbu6qH/yKaFjVl/X+l9quqm1rVoaTOApSmZb6jGqrq4aWFjkmSnudZX1SXTrmUSht7JTNN3n9YFSBuQfwXuGFm+vV83CH3QbkU3YMUzga2GEr69E+gG3rgNuGXkS1onXgOWpmfoQzUeCrwCOK5f9bEkRy2Gad/GZOidzDRltoCl6Vmd5M4OVwMcqvFg4Ner6o1V9Ua6SSde0bimcfpakj1aF6HhsAUsTc8fA8cmeT8jQzW2LWmswl0DjNA/zjz7LkVPAF6W5CIG3MlM02MAS1NSVd8H9hnqUI3Ah4GvJzm+X94fOLphPeP29NYFaFjsBS1NyYbQi7afcvEJ/eKXq+rMlvWMQ5JN6c5e7AKcDRxdVY6ApfVmC1iansEO1ZhkI+DcqtoNGMr8xjOOAX4BfJmuFbw7cGjTijQIBrA0PYPtRVtVtyf5bpIdq+qHresZs92rag+AJEcD32hcjwbCAJamZ+hDNT4QOLcfavPO+2MHMNTmL2YeVNVtsyZjkNaZ14ClKRn6UI3zDbm51IfaTHI7d32gCLAZ8BPu+vtt2ao2LW0GsDQlQx+qcVSSbYDrBjz1orTeHIhDmpKhDtWYZJ8kpyY5LsneSc6hm3rx6iSDvOYtjYMBLE1JP1TjscCD+q+PJXl126rG4v3A24CPA18AXl5Vvww8EXh7y8KkxcxT0NKUJPkOsG9V3dIvbw6cttSvASc5q6r26h+fX1W/OrLtzKrau1110uJlC1ianqEO1Tg6w9NPZ23zE740D29DkqZnqEM17pnkRvoewv1j+uVN25UlLW6egpamaIhDNUpaNwawNAWzhmqUJK8BS9NQVbcD302yY+taJC0OXgOWpmeoQzVKWgcGsDQ9/7d1AZIWD68BSw04VKMkrwFLE+ZQjZLmYgtYmrAkq4A/Bx4AHAU8vapOT7Ib8HFHipI2TLaApclbVlWfr6p/Ba6qqtMBquqCxnVJasgAlibPoRol3YOnoKUJG5nQfXQyd/rlTatq41a1SWrHAJYkqQFPQUuS1IABLElSAwawJEkNGMCSJDVgAEuS1MD/B0rSOilnfIDIAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 576x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(8,5))\n",
"data.diagnosis.value_counts().plot(kind=\"bar\")\n",
"plt.title(\"Count of patients' diagnoses\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"COPD 63\n",
"Healthy 26\n",
"URTI 14\n",
"Bronchiectasis 7\n",
"Bronchiolitis 6\n",
"Pneumonia 6\n",
"LRTI 2\n",
"Asthma 1\n",
"Name: diagnosis, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.diagnosis.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"#Dropping Asthma and LRTI patients because there are too few of them\n",
"data = data.drop(data[(data.diagnosis=='Asthma') | (data.diagnosis == 'LRTI')].index)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 122\n",
"sex 122\n",
"bmi 74\n",
"weight 43\n",
"height 41\n",
"diagnosis 122\n",
"dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.count()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"#Sex and diagnosis to numerical variables\n",
"sex_categorical, sex_classes = pd.factorize(data[\"sex\"])\n",
"diagn_categorical, diagn_classes = pd.factorize(data[\"diagnosis\"])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['URTI', 'Healthy', 'COPD', 'Bronchiectasis', 'Pneumonia',\n",
" 'Bronchiolitis'],\n",
" dtype='object')"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diagn_classes"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"data[\"diagnosis\"] = diagn_categorical\n",
"data[\"sex\"] = sex_categorical"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>19.0</td>\n",
" <td>99.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>9.8</td>\n",
" <td>73.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>0</td>\n",
" <td>28.47</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>32.0</td>\n",
" <td>135.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>73.00</td>\n",
" <td>0</td>\n",
" <td>21.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height diagnosis\n",
"patient \n",
"101 3.00 0 NaN 19.0 99.0 0\n",
"102 0.75 0 NaN 9.8 73.0 1\n",
"104 70.00 0 28.47 NaN NaN 2\n",
"105 7.00 0 NaN 32.0 135.0 0\n",
"106 73.00 0 21.00 NaN NaN 2"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>0</td>\n",
" <td>19.385777</td>\n",
" <td>19.0</td>\n",
" <td>99.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>0</td>\n",
" <td>18.389942</td>\n",
" <td>9.8</td>\n",
" <td>73.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>0</td>\n",
" <td>28.470000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>0</td>\n",
" <td>17.558299</td>\n",
" <td>32.0</td>\n",
" <td>135.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>73.00</td>\n",
" <td>0</td>\n",
" <td>21.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>75.00</td>\n",
" <td>0</td>\n",
" <td>33.700000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>84.00</td>\n",
" <td>0</td>\n",
" <td>33.530000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>75.00</td>\n",
" <td>1</td>\n",
" <td>25.210000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>63.00</td>\n",
" <td>1</td>\n",
" <td>28.400000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>60.00</td>\n",
" <td>1</td>\n",
" <td>22.860000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>58.00</td>\n",
" <td>1</td>\n",
" <td>28.410000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>77.00</td>\n",
" <td>1</td>\n",
" <td>23.120000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>56.00</td>\n",
" <td>1</td>\n",
" <td>28.580000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>68.00</td>\n",
" <td>1</td>\n",
" <td>24.400000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>81.00</td>\n",
" <td>1</td>\n",
" <td>36.760000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>2.00</td>\n",
" <td>0</td>\n",
" <td>17.202354</td>\n",
" <td>15.2</td>\n",
" <td>94.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>120</th>\n",
" <td>78.00</td>\n",
" <td>1</td>\n",
" <td>35.140000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>121</th>\n",
" <td>13.00</td>\n",
" <td>0</td>\n",
" <td>22.491349</td>\n",
" <td>65.0</td>\n",
" <td>170.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>122</th>\n",
" <td>66.00</td>\n",
" <td>1</td>\n",
" <td>33.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>123</th>\n",
" <td>5.00</td>\n",
" <td>1</td>\n",
" <td>16.000000</td>\n",
" <td>25.0</td>\n",
" <td>125.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height diagnosis\n",
"patient \n",
"101 3.00 0 19.385777 19.0 99.0 0\n",
"102 0.75 0 18.389942 9.8 73.0 1\n",
"104 70.00 0 28.470000 NaN NaN 2\n",
"105 7.00 0 17.558299 32.0 135.0 0\n",
"106 73.00 0 21.000000 NaN NaN 2\n",
"107 75.00 0 33.700000 NaN NaN 2\n",
"109 84.00 0 33.530000 NaN NaN 2\n",
"110 75.00 1 25.210000 NaN NaN 2\n",
"111 63.00 1 28.400000 NaN NaN 3\n",
"112 60.00 1 22.860000 NaN NaN 2\n",
"113 58.00 1 28.410000 NaN NaN 2\n",
"114 77.00 1 23.120000 NaN NaN 2\n",
"116 56.00 1 28.580000 NaN NaN 3\n",
"117 68.00 1 24.400000 NaN NaN 2\n",
"118 81.00 1 36.760000 NaN NaN 2\n",
"119 2.00 0 17.202354 15.2 94.0 0\n",
"120 78.00 1 35.140000 NaN NaN 2\n",
"121 13.00 0 22.491349 65.0 170.0 1\n",
"122 66.00 1 33.000000 NaN NaN 4\n",
"123 5.00 1 16.000000 25.0 125.0 1"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Replace missing BMI values by using existing weight and height data\n",
"data[\"bmi_2\"] = data.apply(lambda row: (row[3]/(row[4])**2)*10000, axis=1)\n",
"data[\"bmi\"] = data[\"bmi\"].combine_first(data[\"bmi_2\"])\n",
"data.drop(axis=1, columns=[\"bmi_2\"], inplace=True)\n",
"data.head(20)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 0\n",
"sex 0\n",
"bmi 7\n",
"weight 79\n",
"height 81\n",
"diagnosis 0\n",
"dtype: int64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#Replacing missing BMI information by using similar data, discarding the rest"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [age, sex, bmi, weight, height, diagnosis]\n",
"Index: []"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"missing_data_indexes = [i for i, val in enumerate(data[\"bmi\"].isnull()) if val == True]\n",
"missing_data = data.iloc[missing_data_indexes]\n",
"missing_data"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"for idx, row in missing_data.iterrows():\n",
" age = row[0]\n",
" sex = row[1]\n",
" bmi = row[2]\n",
" diagnosis = row[5]\n",
" \n",
" similar_patients = data[(data['sex'] == sex)\n",
" & (data['diagnosis'] == diagnosis)\n",
" & (age - 5 <= data['age']) & (data['age'] <= age + 5) \n",
" & (data['bmi'].isnull()==False) ]\n",
" \n",
" if (len(similar_patients) > 2):\n",
" print(\"Found a similar BMI match for index\", idx)\n",
" data.at[idx, \"bmi\"] = similar_patients.bmi.mean()\n",
" else:\n",
" print(\"Dropping index\", idx)\n",
" data = data.drop(idx)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 0\n",
"sex 0\n",
"bmi 0\n",
"weight 78\n",
"height 80\n",
"diagnosis 0\n",
"dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"#Dropping weight and height columns, because they have a lot of missing data\n",
"data = data.drop(columns=[\"weight\", \"height\"])\n",
"data = data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 0\n",
"sex 0\n",
"bmi 0\n",
"diagnosis 0\n",
"dtype: int64"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>121.000000</td>\n",
" <td>121.000000</td>\n",
" <td>121.000000</td>\n",
" <td>121.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>43.673554</td>\n",
" <td>0.636364</td>\n",
" <td>23.451756</td>\n",
" <td>1.876033</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>32.110260</td>\n",
" <td>0.483046</td>\n",
" <td>6.553994</td>\n",
" <td>1.158809</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.250000</td>\n",
" <td>0.000000</td>\n",
" <td>13.119534</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.000000</td>\n",
" <td>0.000000</td>\n",
" <td>17.485027</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>61.000000</td>\n",
" <td>1.000000</td>\n",
" <td>23.120000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>71.000000</td>\n",
" <td>1.000000</td>\n",
" <td>28.340000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>93.000000</td>\n",
" <td>1.000000</td>\n",
" <td>53.500000</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi diagnosis\n",
"count 121.000000 121.000000 121.000000 121.000000\n",
"mean 43.673554 0.636364 23.451756 1.876033\n",
"std 32.110260 0.483046 6.553994 1.158809\n",
"min 0.250000 0.000000 13.119534 0.000000\n",
"25% 5.000000 0.000000 17.485027 1.000000\n",
"50% 61.000000 1.000000 23.120000 2.000000\n",
"75% 71.000000 1.000000 28.340000 2.000000\n",
"max 93.000000 1.000000 53.500000 5.000000"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"all_diagnosis_data = data[\"diagnosis\"]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"#Adding crackles and wheezes count\n",
"#1. Load in training and test sound files\n",
"#2. Put them in one dataframe\n",
"#3. Iterate through them. \n",
"#4. If the patient number is not one found in the \"data\" dataframe, discard it\n",
"#5. Put the wheezes per soundfile/crackles per soundfile information into a python array:\n",
"#[[patient, wheezes, crackles], ...]\n",
"#6. Make that into a numpy array, that into a pandas dataframe\n",
"#7. Group by patient number by taking the mean\n",
"#8. Sort by patient number\n",
"#9. Put into the \"data\" dataframe.\n",
"#Train"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"test_soundfiles = pd.read_csv(root + \"test_soundfiles.csv\")\n",
"train_soundfiles = pd.read_csv(root + \"train_soundfiles.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"soundfiles = pd.concat([train_soundfiles, test_soundfiles])"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101_1b1_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101_1b1_Pr_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>102_1b1_Ar_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>104_1b1_Al_sc_Litt3200.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>104_1b1_Ar_sc_Litt3200.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>224_1b2_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>225_1b1_Pl_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>226_1b1_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>226_1b1_Ll_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>226_1b1_Pl_sc_LittC2SE.wav</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>917 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" filename\n",
"0 101_1b1_Al_sc_Meditron.wav\n",
"1 101_1b1_Pr_sc_Meditron.wav\n",
"2 102_1b1_Ar_sc_Meditron.wav\n",
"3 104_1b1_Al_sc_Litt3200.wav\n",
"4 104_1b1_Ar_sc_Litt3200.wav\n",
".. ...\n",
"267 224_1b2_Al_sc_Meditron.wav\n",
"268 225_1b1_Pl_sc_Meditron.wav\n",
"269 226_1b1_Al_sc_Meditron.wav\n",
"270 226_1b1_Ll_sc_Meditron.wav\n",
"271 226_1b1_Pl_sc_LittC2SE.wav\n",
"\n",
"[917 rows x 1 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soundfiles"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[3,\n",
" 5,\n",
" 3,\n",
" 4,\n",
" 2,\n",
" -447.0964,\n",
" -487.81705,\n",
" 60.012154,\n",
" 147.43799,\n",
" 98.916214,\n",
" 94.1953,\n",
" 61.320885,\n",
" 47.155403,\n",
" 53.77741,\n",
" 19.767086,\n",
" 24.540216,\n",
" 3.4809492,\n",
" 23.446045,\n",
" -3.386144,\n",
" 9.486736,\n",
" 1.7393734]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"array = [3,5,3,4,2]\n",
"SAMPLE_RATE = 16000\n",
"filename = \"226_1b1_Pl_sc_LittC2SE.wav\"\n",
"raw, sr = librosa.load(root + \"audio_and_txt_files/\" + filename, sr = SAMPLE_RATE, duration=20)\n",
"mfccs = librosa.feature.mfcc(raw, hop_length=20*16000, n_mfcc=8)\n",
"array.extend(mfccs.flatten())\n",
"array"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40000.0"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(20*16000)/8"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101_1b1_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101_1b1_Pr_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>102_1b1_Ar_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>104_1b1_Al_sc_Litt3200.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>104_1b1_Ar_sc_Litt3200.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>224_1b2_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>225_1b1_Pl_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>226_1b1_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>226_1b1_Ll_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>226_1b1_Pl_sc_LittC2SE.wav</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>917 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" filename\n",
"0 101_1b1_Al_sc_Meditron.wav\n",
"1 101_1b1_Pr_sc_Meditron.wav\n",
"2 102_1b1_Ar_sc_Meditron.wav\n",
"3 104_1b1_Al_sc_Litt3200.wav\n",
"4 104_1b1_Ar_sc_Litt3200.wav\n",
".. ...\n",
"267 224_1b2_Al_sc_Meditron.wav\n",
"268 225_1b1_Pl_sc_Meditron.wav\n",
"269 226_1b1_Al_sc_Meditron.wav\n",
"270 226_1b1_Ll_sc_Meditron.wav\n",
"271 226_1b1_Pl_sc_LittC2SE.wav\n",
"\n",
"[917 rows x 1 columns]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soundfiles"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#This might take a while\n",
"#Array structure : [[patient, cracles per soundfile, wheezes per soundfile]]\n",
"breathing_data_array = []\n",
"print(\"START!\")\n",
"for i, row in soundfiles.iterrows():\n",
" print(\"Index \" + str(i))\n",
" print(row[0])\n",
" \n",
" filename = row[0]\n",
" patient = int(row[0].split(\"_\")[0])\n",
" try: \n",
" data.loc[patient]\n",
" except KeyError:\n",
" continue\n",
" \n",
" txt_filename = filename[:-4] + \".txt\"\n",
" annotations = pd.read_csv(root + \"audio_and_txt_files/\" + txt_filename, names=[\"start\", \"stop\", \"crackle\", \"wheeze\"], sep=\"\\t\")\n",
" total_crackles = annotations.crackle.sum()\n",
" total_wheeze = annotations.wheeze.sum()\n",
" total_time = annotations.iloc[-1, 1] - annotations.iloc[0, 0]\n",
" crackle_per_sec = round(total_crackles/total_time, 4)\n",
" wheeze_per_sec = round(total_wheeze/total_time, 4)\n",
" \n",
" #Extracting sound features\n",
" SAMPLE_RATE = 16000\n",
" raw, sr = librosa.load(root + \"audio_and_txt_files/\" + filename, sr = SAMPLE_RATE)\n",
" zcr = librosa.core.zero_crossings(raw).sum() / len(raw)\n",
" sc = librosa.feature.spectral_centroid(raw)[0]\n",
" rms = librosa.feature.rms(raw)[0]\n",
" s_rf = librosa.feature.spectral_rolloff(raw, roll_percent=0.85)[0]\n",
" s_rf_75 = librosa.feature.spectral_rolloff(raw, roll_percent=0.75)[0]\n",
" sf = librosa.feature.spectral_flatness(raw)[0]\n",
" se = entropy.spectral_entropy(x = raw, sf = sr, method='fft')\n",
" mfccs = librosa.feature.mfcc(raw, hop_length=len(raw), n_mfcc=8)\n",
" mfccs = mfccs.flatten()\n",
" \n",
" add_to_array=[patient, \n",
" crackle_per_sec, \n",
" wheeze_per_sec,\n",
" zcr,\n",
" sc.mean(),\n",
" np.median(sc),\n",
" sc.std(),\n",
" rms.mean(),\n",
" np.median(rms),\n",
" rms.std(), \n",
" s_rf.mean(), \n",
" np.median(s_rf),\n",
" s_rf.std(),\n",
" s_rf_75.mean(), \n",
" np.median(s_rf_75),\n",
" s_rf_75.std(),\n",
" sf.mean(),\n",
" np.median(sf),\n",
" sf.std(),\n",
" se]\n",
" add_to_array.extend(mfccs)\n",
" \n",
" breathing_data_array.append(add_to_array)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"data_columns = [\"patient\", \n",
" \"crackles\", \n",
" \"wheezes\", \n",
" \"zero_crossing_rate\", \n",
" \"spectral_centroid_mean\", \n",
" \"spectral_centroid_median\",\n",
" \"spectral_centroid_std\", \n",
" \"root_mean_square_mean\", \n",
" \"root_mean_square_median\", \n",
" \"root_mean_square_std\", \n",
" \"spectral_rolloff_85_mean\", \n",
" \"spectral_rolloff_85_median\", \n",
" \"spectral_rolloff_85_std\",\n",
" \"spectral_rolloff_75_mean\", \n",
" \"spectral_rolloff_75_median\", \n",
" \"spectral_rolloff_75_std\",\n",
" \"spectral_flatness_mean\",\n",
" \"spectral_flatness_median\",\n",
" \"spectral_flatness_std\",\n",
" \"spectral_entropy\",\n",
" \"mfcc1\",\n",
" \"mfcc2\",\n",
" \"mfcc3\",\n",
" \"mfcc4\",\n",
" \"mfcc5\",\n",
" \"mfcc6\",\n",
" \"mfcc7\",\n",
" \"mfcc8\",\n",
" \"mfcc9\",\n",
" \"mfcc10\",\n",
" \"mfcc11\",\n",
" \"mfcc12\",\n",
" \"mfcc13\",\n",
" \"mfcc14\",\n",
" \"mfcc15\",\n",
" \"mfcc16\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np_breathing_data_array = np.array(breathing_data_array)\n",
"np_breathing_data_array.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"breathing_data_df = pd.DataFrame(np_breathing_data_array, columns=data_columns)\n",
"breathing_data_df.set_index(\"patient\", inplace=True)\n",
"breathing_data_df = breathing_data_df.groupby(by=\"patient\").mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = pd.merge(left = data, right = breathing_data_df, left_index=True, right_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_data = pd.merge(left = test_patients, right = data, left_index=True, right_index=True)\n",
"train_data = pd.merge(left = train_patients, right = data, left_index=True, right_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_train = train_data.pop(\"diagnosis\")\n",
"y_test = test_data.pop(\"diagnosis\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#NORMALIZATION\n",
"\n",
"norm_train_data = (train_data - train_data.min()) / (train_data.max() - train_data.min())\n",
"norm_test_data = (test_data - train_data.min()) / (train_data.max() - train_data.min())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"norm_train_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Uncomment to save the training and test data for later use here\n",
"\n",
"#norm_train_data.to_csv(root + \"dataframes/norm_train_data_sound_features_03_31.csv\")\n",
"#norm_test_data.to_csv(root + \"dataframes/norm_test_data_sound_features_03_31.csv\")\n",
"\n",
"#train_data.to_csv(root + \"dataframes/train_data_sound_features_03_31.csv\")\n",
"#test_data.to_csv(root + \"dataframes/test_data_sound_features_03_31.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#Uncomment to check if saving was successful\n",
"\n",
"#train_data = pd.read_csv(root + \"dataframes/train_data_sound_features_03_31.csv\")\n",
"#test_data = pd.read_csv(root + \"dataframes/test_data_sound_features_03_31.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (venv)",
"language": "python",
"name": "venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}