2637 lines (2636 with data), 89.2 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import librosa\n",
"import wave as wav\n",
"import tensorflow as tf\n",
"import scipy\n",
"import matplotlib.pyplot as plt\n",
"import librosa.display\n",
"import IPython.display as ipd\n",
"from sklearn import metrics\n",
"from sklearn.model_selection import cross_validate\n",
"import os\n",
"import statistics\n",
"import math\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"import graphviz\n",
"from sklearn import tree\n",
"\n",
"from sklearn import metrics\n",
"from sklearn.metrics import confusion_matrix, classification_report\n",
"import sys\n",
"\n",
"from sklearn.metrics import plot_confusion_matrix\n",
"import seaborn as sns \n",
"import matplotlib.pyplot as plt\n",
"import operator as op\n",
"from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict\n",
"from entropy import *\n",
"from random import shuffle\n",
"\n",
"from sklearn.svm import SVC\n",
"from sklearn.decomposition import PCA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: xgboost in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (1.0.2)\n",
"Requirement already satisfied: scipy in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (from xgboost) (1.4.1)\n",
"Requirement already satisfied: numpy in /gpfs/hpc/home/rannilo/venv/lib/python3.6/site-packages (from xgboost) (1.18.1)\n",
"\u001b[33mWARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.\n",
"You should consider upgrading via the '/gpfs/hpc/home/rannilo/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
]
}
],
"source": [
"!pip install xgboost\n",
"import xgboost as xgb"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Folder that holds the CSV/TXT tables read below.\n",
"root = \"respiratory_sound_database/\"\n",
"\n",
"# Diagnosis and demographic tables are read with explicit column names;\n",
"# each table is indexed by the patient column as soon as it is read.\n",
"patient_diagnosis = pd.read_csv(\n",
"    root+\"patient_diagnosis.csv\", names=[\"patient\", \"diagnosis\"]\n",
").set_index(\"patient\")\n",
"demographic_info = pd.read_csv(\n",
"    root+\"demographic_info.txt\",\n",
"    delimiter=\" \",\n",
"    names=[\"patient\", \"age\", \"sex\", \"bmi\", \"weight\", \"height\"],\n",
").set_index(\"patient\")\n",
"\n",
"# Patient lists for the train and test splits (read with pandas' default header handling).\n",
"train_patients = pd.read_csv(root + \"train_patients.csv\").set_index(\"patient\")\n",
"test_patients = pd.read_csv(root + \"test_patients.csv\").set_index(\"patient\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: [101, 102, 103, 104, 105]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_patients.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>185</th>\n",
" </tr>\n",
" <tr>\n",
" <th>186</th>\n",
" </tr>\n",
" <tr>\n",
" <th>187</th>\n",
" </tr>\n",
" <tr>\n",
" <th>188</th>\n",
" </tr>\n",
" <tr>\n",
" <th>189</th>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: [185, 186, 187, 188, 189]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_patients.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>Asthma</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>URTI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" diagnosis\n",
"patient \n",
"101 URTI\n",
"102 Healthy\n",
"103 Asthma\n",
"104 COPD\n",
"105 URTI"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patient_diagnosis.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>19.0</td>\n",
" <td>99.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>9.8</td>\n",
" <td>73.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>33.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>28.47</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>32.0</td>\n",
" <td>135.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height\n",
"patient \n",
"101 3.00 F NaN 19.0 99.0\n",
"102 0.75 F NaN 9.8 73.0\n",
"103 70.00 F 33.00 NaN NaN\n",
"104 70.00 F 28.47 NaN NaN\n",
"105 7.00 F NaN 32.0 135.0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"demographic_info.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>19.0</td>\n",
" <td>99.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>9.8</td>\n",
" <td>73.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>33.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Asthma</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>28.47</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>32.0</td>\n",
" <td>135.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>222</th>\n",
" <td>60.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>223</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224</th>\n",
" <td>10.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>32.3</td>\n",
" <td>143.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>225</th>\n",
" <td>0.83</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>7.8</td>\n",
" <td>74.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>226</th>\n",
" <td>4.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>16.7</td>\n",
" <td>103.0</td>\n",
" <td>Pneumonia</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>126 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height diagnosis\n",
"patient \n",
"101 3.00 F NaN 19.0 99.0 URTI\n",
"102 0.75 F NaN 9.8 73.0 Healthy\n",
"103 70.00 F 33.00 NaN NaN Asthma\n",
"104 70.00 F 28.47 NaN NaN COPD\n",
"105 7.00 F NaN 32.0 135.0 URTI\n",
"... ... ... ... ... ... ...\n",
"222 60.00 M NaN NaN NaN COPD\n",
"223 NaN NaN NaN NaN NaN COPD\n",
"224 10.00 F NaN 32.3 143.0 Healthy\n",
"225 0.83 M NaN 7.8 74.0 Healthy\n",
"226 4.00 M NaN 16.7 103.0 Pneumonia\n",
"\n",
"[126 rows x 6 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.concat([demographic_info, patient_diagnosis], axis=1)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>19.00</td>\n",
" <td>99.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>9.80</td>\n",
" <td>73.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>33.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Asthma</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>F</td>\n",
" <td>28.47</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>32.00</td>\n",
" <td>135.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>73.00</td>\n",
" <td>F</td>\n",
" <td>21.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>75.00</td>\n",
" <td>F</td>\n",
" <td>33.70</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108</th>\n",
" <td>3.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LRTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>84.00</td>\n",
" <td>F</td>\n",
" <td>33.53</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>75.00</td>\n",
" <td>M</td>\n",
" <td>25.21</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>63.00</td>\n",
" <td>M</td>\n",
" <td>28.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Bronchiectasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>60.00</td>\n",
" <td>M</td>\n",
" <td>22.86</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>58.00</td>\n",
" <td>M</td>\n",
" <td>28.41</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>77.00</td>\n",
" <td>M</td>\n",
" <td>23.12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115</th>\n",
" <td>0.58</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>7.14</td>\n",
" <td>64.0</td>\n",
" <td>LRTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>56.00</td>\n",
" <td>M</td>\n",
" <td>28.58</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Bronchiectasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>68.00</td>\n",
" <td>M</td>\n",
" <td>24.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>81.00</td>\n",
" <td>M</td>\n",
" <td>36.76</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>2.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>15.20</td>\n",
" <td>94.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>120</th>\n",
" <td>78.00</td>\n",
" <td>M</td>\n",
" <td>35.14</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>121</th>\n",
" <td>13.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>65.00</td>\n",
" <td>170.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>122</th>\n",
" <td>66.00</td>\n",
" <td>M</td>\n",
" <td>33.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Pneumonia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>123</th>\n",
" <td>5.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>25.00</td>\n",
" <td>125.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124</th>\n",
" <td>65.00</td>\n",
" <td>M</td>\n",
" <td>29.07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>125</th>\n",
" <td>14.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>62.00</td>\n",
" <td>170.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>1.00</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>10.18</td>\n",
" <td>80.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127</th>\n",
" <td>2.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>12.60</td>\n",
" <td>98.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>128</th>\n",
" <td>65.00</td>\n",
" <td>F</td>\n",
" <td>24.30</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>129</th>\n",
" <td>6.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>23.00</td>\n",
" <td>119.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>130</th>\n",
" <td>85.00</td>\n",
" <td>F</td>\n",
" <td>17.10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>3.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>14.00</td>\n",
" <td>97.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>132</th>\n",
" <td>71.00</td>\n",
" <td>M</td>\n",
" <td>34.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>133</th>\n",
" <td>68.00</td>\n",
" <td>M</td>\n",
" <td>27.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>134</th>\n",
" <td>61.00</td>\n",
" <td>M</td>\n",
" <td>32.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>135</th>\n",
" <td>70.00</td>\n",
" <td>M</td>\n",
" <td>21.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Pneumonia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>136</th>\n",
" <td>5.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>16.20</td>\n",
" <td>110.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>137</th>\n",
" <td>4.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>18.00</td>\n",
" <td>104.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>138</th>\n",
" <td>56.00</td>\n",
" <td>F</td>\n",
" <td>21.60</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>139</th>\n",
" <td>61.00</td>\n",
" <td>M</td>\n",
" <td>28.68</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>79.00</td>\n",
" <td>F</td>\n",
" <td>23.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Pneumonia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>141</th>\n",
" <td>66.00</td>\n",
" <td>M</td>\n",
" <td>22.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>78.00</td>\n",
" <td>M</td>\n",
" <td>26.10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>143</th>\n",
" <td>0.25</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>8.24</td>\n",
" <td>68.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>3.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>16.70</td>\n",
" <td>100.0</td>\n",
" <td>Healthy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>69.00</td>\n",
" <td>M</td>\n",
" <td>23.40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>67.00</td>\n",
" <td>M</td>\n",
" <td>28.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>77.00</td>\n",
" <td>M</td>\n",
" <td>25.70</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>COPD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>148</th>\n",
" <td>4.00</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>33.00</td>\n",
" <td>110.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>149</th>\n",
" <td>0.67</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>9.50</td>\n",
" <td>70.0</td>\n",
" <td>Bronchiolitis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150</th>\n",
" <td>0.67</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>8.12</td>\n",
" <td>74.0</td>\n",
" <td>URTI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height diagnosis\n",
"patient \n",
"101 3.00 F NaN 19.00 99.0 URTI\n",
"102 0.75 F NaN 9.80 73.0 Healthy\n",
"103 70.00 F 33.00 NaN NaN Asthma\n",
"104 70.00 F 28.47 NaN NaN COPD\n",
"105 7.00 F NaN 32.00 135.0 URTI\n",
"106 73.00 F 21.00 NaN NaN COPD\n",
"107 75.00 F 33.70 NaN NaN COPD\n",
"108 3.00 M NaN NaN NaN LRTI\n",
"109 84.00 F 33.53 NaN NaN COPD\n",
"110 75.00 M 25.21 NaN NaN COPD\n",
"111 63.00 M 28.40 NaN NaN Bronchiectasis\n",
"112 60.00 M 22.86 NaN NaN COPD\n",
"113 58.00 M 28.41 NaN NaN COPD\n",
"114 77.00 M 23.12 NaN NaN COPD\n",
"115 0.58 M NaN 7.14 64.0 LRTI\n",
"116 56.00 M 28.58 NaN NaN Bronchiectasis\n",
"117 68.00 M 24.40 NaN NaN COPD\n",
"118 81.00 M 36.76 NaN NaN COPD\n",
"119 2.00 F NaN 15.20 94.0 URTI\n",
"120 78.00 M 35.14 NaN NaN COPD\n",
"121 13.00 F NaN 65.00 170.0 Healthy\n",
"122 66.00 M 33.00 NaN NaN Pneumonia\n",
"123 5.00 M NaN 25.00 125.0 Healthy\n",
"124 65.00 M 29.07 NaN NaN COPD\n",
"125 14.00 M NaN 62.00 170.0 Healthy\n",
"126 1.00 F NaN 10.18 80.0 Healthy\n",
"127 2.00 M NaN 12.60 98.0 Healthy\n",
"128 65.00 F 24.30 NaN NaN COPD\n",
"129 6.00 M NaN 23.00 119.0 URTI\n",
"130 85.00 F 17.10 NaN NaN COPD\n",
"131 3.00 M NaN 14.00 97.0 URTI\n",
"132 71.00 M 34.00 NaN NaN COPD\n",
"133 68.00 M 27.40 NaN NaN COPD\n",
"134 61.00 M 32.00 NaN NaN COPD\n",
"135 70.00 M 21.00 NaN NaN Pneumonia\n",
"136 5.00 M NaN 16.20 110.0 Healthy\n",
"137 4.00 M NaN 18.00 104.0 URTI\n",
"138 56.00 F 21.60 NaN NaN COPD\n",
"139 61.00 M 28.68 NaN NaN COPD\n",
"140 79.00 F 23.00 NaN NaN Pneumonia\n",
"141 66.00 M 22.40 NaN NaN COPD\n",
"142 78.00 M 26.10 NaN NaN COPD\n",
"143 0.25 F NaN 8.24 68.0 Healthy\n",
"144 3.00 M NaN 16.70 100.0 Healthy\n",
"145 69.00 M 23.40 NaN NaN COPD\n",
"146 67.00 M 28.00 NaN NaN COPD\n",
"147 77.00 M 25.70 NaN NaN COPD\n",
"148 4.00 M NaN 33.00 110.0 URTI\n",
"149 0.67 M NaN 9.50 70.0 Bronchiolitis\n",
"150 0.67 F NaN 8.12 74.0 URTI"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(50)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"age 1\n",
"sex 1\n",
"bmi 51\n",
"weight 82\n",
"height 84\n",
"diagnosis 0\n",
"dtype: int64\n"
]
}
],
"source": [
"print(data.isna().sum())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Drop the patient whose age and sex are missing.\n",
"# The columns are named explicitly: the previous `thresh=2` rule (keep rows with at least\n",
"# two non-missing values) only removed that patient because the row happened to contain\n",
"# nothing but a diagnosis, and it would keep a row that lacks just one of age/sex.\n",
"# Re-binding `data` instead of mutating in place keeps the cell safe to re-run.\n",
"data = data.dropna(subset=[\"age\", \"sex\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"age 0\n",
"sex 0\n",
"bmi 50\n",
"weight 81\n",
"height 83\n",
"diagnosis 0\n",
"dtype: int64\n"
]
}
],
"source": [
"print(data.isna().sum())\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 125\n",
"sex 125\n",
"bmi 75\n",
"weight 44\n",
"height 42\n",
"diagnosis 125\n",
"dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.count()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Count of patients' diagnoses\")"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 576x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(8,5))\n",
"data.diagnosis.value_counts().plot(kind=\"bar\")\n",
"plt.title(\"Count of patients' diagnoses\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"COPD 63\n",
"Healthy 26\n",
"URTI 14\n",
"Bronchiectasis 7\n",
"Bronchiolitis 6\n",
"Pneumonia 6\n",
"LRTI 2\n",
"Asthma 1\n",
"Name: diagnosis, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.diagnosis.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Asthma and LRTI are removed because too few patients have them.\n",
"# Rows are dropped by their patient index label.\n",
"data = data.drop(\n",
"    index=data.loc[data[\"diagnosis\"].isin([\"Asthma\", \"LRTI\"])].index\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 122\n",
"sex 122\n",
"bmi 74\n",
"weight 43\n",
"height 41\n",
"diagnosis 122\n",
"dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.count()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Encode sex and diagnosis as integer codes.\n",
"# Each *_classes index holds the original labels; a label's position is its code.\n",
"sex_categorical, sex_classes = data[\"sex\"].factorize()\n",
"diagn_categorical, diagn_classes = data[\"diagnosis\"].factorize()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['URTI', 'Healthy', 'COPD', 'Bronchiectasis', 'Pneumonia',\n",
" 'Bronchiolitis'],\n",
" dtype='object')"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Diagnosis labels returned by pd.factorize; a label's position in this index is its integer code.\n",
"diagn_classes"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Replace the text columns with their integer codes (column order is unchanged).\n",
"data = data.assign(diagnosis=diagn_categorical, sex=sex_categorical)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>19.0</td>\n",
" <td>99.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>9.8</td>\n",
" <td>73.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>0</td>\n",
" <td>28.47</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>32.0</td>\n",
" <td>135.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>73.00</td>\n",
" <td>0</td>\n",
" <td>21.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height diagnosis\n",
"patient \n",
"101 3.00 0 NaN 19.0 99.0 0\n",
"102 0.75 0 NaN 9.8 73.0 1\n",
"104 70.00 0 28.47 NaN NaN 2\n",
"105 7.00 0 NaN 32.0 135.0 0\n",
"106 73.00 0 21.00 NaN NaN 2"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>3.00</td>\n",
" <td>0</td>\n",
" <td>19.385777</td>\n",
" <td>19.0</td>\n",
" <td>99.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>0.75</td>\n",
" <td>0</td>\n",
" <td>18.389942</td>\n",
" <td>9.8</td>\n",
" <td>73.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>70.00</td>\n",
" <td>0</td>\n",
" <td>28.470000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>7.00</td>\n",
" <td>0</td>\n",
" <td>17.558299</td>\n",
" <td>32.0</td>\n",
" <td>135.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>73.00</td>\n",
" <td>0</td>\n",
" <td>21.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>75.00</td>\n",
" <td>0</td>\n",
" <td>33.700000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>84.00</td>\n",
" <td>0</td>\n",
" <td>33.530000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>75.00</td>\n",
" <td>1</td>\n",
" <td>25.210000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>63.00</td>\n",
" <td>1</td>\n",
" <td>28.400000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>60.00</td>\n",
" <td>1</td>\n",
" <td>22.860000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>58.00</td>\n",
" <td>1</td>\n",
" <td>28.410000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>77.00</td>\n",
" <td>1</td>\n",
" <td>23.120000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>56.00</td>\n",
" <td>1</td>\n",
" <td>28.580000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>68.00</td>\n",
" <td>1</td>\n",
" <td>24.400000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>81.00</td>\n",
" <td>1</td>\n",
" <td>36.760000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>2.00</td>\n",
" <td>0</td>\n",
" <td>17.202354</td>\n",
" <td>15.2</td>\n",
" <td>94.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>120</th>\n",
" <td>78.00</td>\n",
" <td>1</td>\n",
" <td>35.140000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>121</th>\n",
" <td>13.00</td>\n",
" <td>0</td>\n",
" <td>22.491349</td>\n",
" <td>65.0</td>\n",
" <td>170.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>122</th>\n",
" <td>66.00</td>\n",
" <td>1</td>\n",
" <td>33.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>123</th>\n",
" <td>5.00</td>\n",
" <td>1</td>\n",
" <td>16.000000</td>\n",
" <td>25.0</td>\n",
" <td>125.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi weight height diagnosis\n",
"patient \n",
"101 3.00 0 19.385777 19.0 99.0 0\n",
"102 0.75 0 18.389942 9.8 73.0 1\n",
"104 70.00 0 28.470000 NaN NaN 2\n",
"105 7.00 0 17.558299 32.0 135.0 0\n",
"106 73.00 0 21.000000 NaN NaN 2\n",
"107 75.00 0 33.700000 NaN NaN 2\n",
"109 84.00 0 33.530000 NaN NaN 2\n",
"110 75.00 1 25.210000 NaN NaN 2\n",
"111 63.00 1 28.400000 NaN NaN 3\n",
"112 60.00 1 22.860000 NaN NaN 2\n",
"113 58.00 1 28.410000 NaN NaN 2\n",
"114 77.00 1 23.120000 NaN NaN 2\n",
"116 56.00 1 28.580000 NaN NaN 3\n",
"117 68.00 1 24.400000 NaN NaN 2\n",
"118 81.00 1 36.760000 NaN NaN 2\n",
"119 2.00 0 17.202354 15.2 94.0 0\n",
"120 78.00 1 35.140000 NaN NaN 2\n",
"121 13.00 0 22.491349 65.0 170.0 1\n",
"122 66.00 1 33.000000 NaN NaN 4\n",
"123 5.00 1 16.000000 25.0 125.0 1"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fill missing BMI values from weight and height where both are recorded.\n",
"# The formula is weight / height**2 * 10000; the 10000 factor presumably converts\n",
"# a height given in centimetres (TODO confirm the units of the source data).\n",
"# Columns are addressed by name so the result does not depend on column order.\n",
"bmi_from_measurements = (data[\"weight\"] / data[\"height\"] ** 2) * 10000\n",
"# combine_first keeps every BMI that is already present and only fills the gaps.\n",
"data[\"bmi\"] = data[\"bmi\"].combine_first(bmi_from_measurements)\n",
"data.head(20)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 0\n",
"sex 0\n",
"bmi 7\n",
"weight 79\n",
"height 81\n",
"diagnosis 0\n",
"dtype: int64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#Replacing missing BMI information by using similar data, discarding the rest"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" <tr>\n",
" <th>patient</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [age, sex, bmi, weight, height, diagnosis]\n",
"Index: []"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 0-based row positions of the patients whose BMI is still missing\n",
"missing_data_indexes = np.flatnonzero(data[\"bmi\"].isnull().values).tolist()\n",
"missing_data = data.iloc[missing_data_indexes]\n",
"missing_data"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Impute each remaining missing BMI with the mean BMI of similar patients:\n",
"# same sex, same diagnosis and an age within +/- 5 of the patient's age.\n",
"# A row is imputed only when more than 2 such patients with a known BMI exist;\n",
"# otherwise the row is dropped from data.\n",
"# NOTE: data is updated inside the loop, so a BMI imputed for one row counts as\n",
"# a known BMI when later rows look for similar patients.\n",
"for idx, row in missing_data.iterrows():\n",
"    # Read the fields by column name rather than by position, so the lookup\n",
"    # does not silently break if the column order changes.\n",
"    age = row[\"age\"]\n",
"    sex = row[\"sex\"]\n",
"    diagnosis = row[\"diagnosis\"]\n",
"\n",
"    similar_patients = data[(data[\"sex\"] == sex)\n",
"                            & (data[\"diagnosis\"] == diagnosis)\n",
"                            & data[\"age\"].between(age - 5, age + 5)\n",
"                            & data[\"bmi\"].notnull()]\n",
"\n",
"    if len(similar_patients) > 2:\n",
"        print(\"Found a similar BMI match for index\", idx)\n",
"        data.at[idx, \"bmi\"] = similar_patients[\"bmi\"].mean()\n",
"    else:\n",
"        print(\"Dropping index\", idx)\n",
"        data = data.drop(idx)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 0\n",
"sex 0\n",
"bmi 0\n",
"weight 78\n",
"height 80\n",
"diagnosis 0\n",
"dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# weight and height have a lot of missing values, so remove both columns and\n",
"# then discard any row that still contains a missing value\n",
"data = data.drop(columns=[\"weight\", \"height\"]).dropna()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 0\n",
"sex 0\n",
"bmi 0\n",
"diagnosis 0\n",
"dtype: int64"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>bmi</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>121.000000</td>\n",
" <td>121.000000</td>\n",
" <td>121.000000</td>\n",
" <td>121.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>43.673554</td>\n",
" <td>0.636364</td>\n",
" <td>23.451756</td>\n",
" <td>1.876033</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>32.110260</td>\n",
" <td>0.483046</td>\n",
" <td>6.553994</td>\n",
" <td>1.158809</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.250000</td>\n",
" <td>0.000000</td>\n",
" <td>13.119534</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.000000</td>\n",
" <td>0.000000</td>\n",
" <td>17.485027</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>61.000000</td>\n",
" <td>1.000000</td>\n",
" <td>23.120000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>71.000000</td>\n",
" <td>1.000000</td>\n",
" <td>28.340000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>93.000000</td>\n",
" <td>1.000000</td>\n",
" <td>53.500000</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex bmi diagnosis\n",
"count 121.000000 121.000000 121.000000 121.000000\n",
"mean 43.673554 0.636364 23.451756 1.876033\n",
"std 32.110260 0.483046 6.553994 1.158809\n",
"min 0.250000 0.000000 13.119534 0.000000\n",
"25% 5.000000 0.000000 17.485027 1.000000\n",
"50% 61.000000 1.000000 23.120000 2.000000\n",
"75% 71.000000 1.000000 28.340000 2.000000\n",
"max 93.000000 1.000000 53.500000 5.000000"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"all_diagnosis_data = data[\"diagnosis\"]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"#Adding crackles and wheezes count\n",
"#1. Load in training and test sound files\n",
"#2. Put them in one dataframe\n",
"#3. Iterate through them. \n",
"#4. If the patient number is not one found in the \"data\" dataframe, discard it\n",
"#5. Put the crackle rate and wheeze rate of each soundfile (count divided by annotated time) into a python list:\n",
"#[[patient, crackles, wheezes, ...], ...]\n",
"#6. Make that into a numpy array, that into a pandas dataframe\n",
"#7. Group by patient number by taking the mean\n",
"#8. Sort by patient number\n",
"#9. Put into the \"data\" dataframe.\n",
"#Train"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"test_soundfiles = pd.read_csv(root + \"test_soundfiles.csv\")\n",
"train_soundfiles = pd.read_csv(root + \"train_soundfiles.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"soundfiles = pd.concat([train_soundfiles, test_soundfiles])"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101_1b1_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101_1b1_Pr_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>102_1b1_Ar_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>104_1b1_Al_sc_Litt3200.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>104_1b1_Ar_sc_Litt3200.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>224_1b2_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>225_1b1_Pl_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>226_1b1_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>226_1b1_Ll_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>226_1b1_Pl_sc_LittC2SE.wav</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>917 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" filename\n",
"0 101_1b1_Al_sc_Meditron.wav\n",
"1 101_1b1_Pr_sc_Meditron.wav\n",
"2 102_1b1_Ar_sc_Meditron.wav\n",
"3 104_1b1_Al_sc_Litt3200.wav\n",
"4 104_1b1_Ar_sc_Litt3200.wav\n",
".. ...\n",
"267 224_1b2_Al_sc_Meditron.wav\n",
"268 225_1b1_Pl_sc_Meditron.wav\n",
"269 226_1b1_Al_sc_Meditron.wav\n",
"270 226_1b1_Ll_sc_Meditron.wav\n",
"271 226_1b1_Pl_sc_LittC2SE.wav\n",
"\n",
"[917 rows x 1 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soundfiles"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[3,\n",
" 5,\n",
" 3,\n",
" 4,\n",
" 2,\n",
" -447.0964,\n",
" -487.81705,\n",
" 60.012154,\n",
" 147.43799,\n",
" 98.916214,\n",
" 94.1953,\n",
" 61.320885,\n",
" 47.155403,\n",
" 53.77741,\n",
" 19.767086,\n",
" 24.540216,\n",
" 3.4809492,\n",
" 23.446045,\n",
" -3.386144,\n",
" 9.486736,\n",
" 1.7393734]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Scratch check: what MFCC values look like once flattened and appended to a plain list\n",
"array = [3, 5, 3, 4, 2]\n",
"SAMPLE_RATE = 16000\n",
"filename = \"226_1b1_Pl_sc_LittC2SE.wav\"\n",
"raw, sr = librosa.load(root + \"audio_and_txt_files/\" + filename, sr=SAMPLE_RATE, duration=20)\n",
"# Pass the sample rate the audio was loaded with: mfcc() otherwise falls back to\n",
"# its own default rate, which does not match audio resampled to SAMPLE_RATE.\n",
"# hop_length covers the whole 20 s that were requested, so each coefficient\n",
"# yields only a few values.\n",
"mfccs = librosa.feature.mfcc(y=raw, sr=sr, hop_length=20 * SAMPLE_RATE, n_mfcc=8)\n",
"array.extend(mfccs.flatten())\n",
"array"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40000.0"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(20*16000)/8"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101_1b1_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101_1b1_Pr_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>102_1b1_Ar_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>104_1b1_Al_sc_Litt3200.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>104_1b1_Ar_sc_Litt3200.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>224_1b2_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>225_1b1_Pl_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>226_1b1_Al_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>226_1b1_Ll_sc_Meditron.wav</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>226_1b1_Pl_sc_LittC2SE.wav</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>917 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" filename\n",
"0 101_1b1_Al_sc_Meditron.wav\n",
"1 101_1b1_Pr_sc_Meditron.wav\n",
"2 102_1b1_Ar_sc_Meditron.wav\n",
"3 104_1b1_Al_sc_Litt3200.wav\n",
"4 104_1b1_Ar_sc_Litt3200.wav\n",
".. ...\n",
"267 224_1b2_Al_sc_Meditron.wav\n",
"268 225_1b1_Pl_sc_Meditron.wav\n",
"269 226_1b1_Al_sc_Meditron.wav\n",
"270 226_1b1_Ll_sc_Meditron.wav\n",
"271 226_1b1_Pl_sc_LittC2SE.wav\n",
"\n",
"[917 rows x 1 columns]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soundfiles"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#This might take a while\n",
"#Row structure: [patient, crackle rate, wheeze rate, zero crossing rate,\n",
"#                mean/median/std of each spectral feature, spectral entropy, mfccs...]\n",
"SAMPLE_RATE = 16000\n",
"\n",
"def _mean_median_std(values):\n",
"    \"\"\"Return [mean, median, standard deviation] of a 1-D feature track.\"\"\"\n",
"    return [values.mean(), np.median(values), values.std()]\n",
"\n",
"breathing_data_array = []\n",
"print(\"START!\")\n",
"for i, row in soundfiles.iterrows():\n",
"    filename = row[\"filename\"]\n",
"    print(\"Index \" + str(i))\n",
"    print(filename)\n",
"\n",
"    # The patient number is the part of the file name before the first underscore\n",
"    patient = int(filename.split(\"_\")[0])\n",
"    # Skip recordings of patients that are not in the \"data\" dataframe\n",
"    if patient not in data.index:\n",
"        continue\n",
"\n",
"    # Tab-separated annotation file that shares the recording's base name\n",
"    txt_filename = filename[:-4] + \".txt\"\n",
"    annotations = pd.read_csv(root + \"audio_and_txt_files/\" + txt_filename,\n",
"                              names=[\"start\", \"stop\", \"crackle\", \"wheeze\"], sep=\"\\t\")\n",
"    # Annotated time span: last stop minus first start\n",
"    total_time = annotations[\"stop\"].iloc[-1] - annotations[\"start\"].iloc[0]\n",
"    crackle_per_sec = round(annotations[\"crackle\"].sum() / total_time, 4)\n",
"    wheeze_per_sec = round(annotations[\"wheeze\"].sum() / total_time, 4)\n",
"\n",
"    #Extracting sound features\n",
"    raw, sr = librosa.load(root + \"audio_and_txt_files/\" + filename, sr=SAMPLE_RATE)\n",
"    zcr = librosa.core.zero_crossings(raw).sum() / len(raw)\n",
"    # sr is passed explicitly wherever librosa accepts it: otherwise librosa falls\n",
"    # back to its own default rate instead of the rate the audio was loaded with.\n",
"    sc = librosa.feature.spectral_centroid(y=raw, sr=sr)[0]\n",
"    rms = librosa.feature.rms(y=raw)[0]\n",
"    s_rf = librosa.feature.spectral_rolloff(y=raw, sr=sr, roll_percent=0.85)[0]\n",
"    s_rf_75 = librosa.feature.spectral_rolloff(y=raw, sr=sr, roll_percent=0.75)[0]\n",
"    sf = librosa.feature.spectral_flatness(y=raw)[0]\n",
"    se = entropy.spectral_entropy(x=raw, sf=sr, method='fft')\n",
"    # hop_length=len(raw) makes the hop as long as the recording itself, so the\n",
"    # number of MFCC values does not grow with the recording's duration.\n",
"    mfccs = librosa.feature.mfcc(y=raw, sr=sr, hop_length=len(raw), n_mfcc=8).flatten()\n",
"\n",
"    # Assemble the row in the same order as the names in data_columns\n",
"    features = [patient, crackle_per_sec, wheeze_per_sec, zcr]\n",
"    for track in (sc, rms, s_rf, s_rf_75, sf):\n",
"        features.extend(_mean_median_std(track))\n",
"    features.append(se)\n",
"    features.extend(mfccs)\n",
"\n",
"    breathing_data_array.append(features)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"data_columns = [\"patient\", \n",
" \"crackles\", \n",
" \"wheezes\", \n",
" \"zero_crossing_rate\", \n",
" \"spectral_centroid_mean\", \n",
" \"spectral_centroid_median\",\n",
" \"spectral_centroid_std\", \n",
" \"root_mean_square_mean\", \n",
" \"root_mean_square_median\", \n",
" \"root_mean_square_std\", \n",
" \"spectral_rolloff_85_mean\", \n",
" \"spectral_rolloff_85_median\", \n",
" \"spectral_rolloff_85_std\",\n",
" \"spectral_rolloff_75_mean\", \n",
" \"spectral_rolloff_75_median\", \n",
" \"spectral_rolloff_75_std\",\n",
" \"spectral_flatness_mean\",\n",
" \"spectral_flatness_median\",\n",
" \"spectral_flatness_std\",\n",
" \"spectral_entropy\",\n",
" \"mfcc1\",\n",
" \"mfcc2\",\n",
" \"mfcc3\",\n",
" \"mfcc4\",\n",
" \"mfcc5\",\n",
" \"mfcc6\",\n",
" \"mfcc7\",\n",
" \"mfcc8\",\n",
" \"mfcc9\",\n",
" \"mfcc10\",\n",
" \"mfcc11\",\n",
" \"mfcc12\",\n",
" \"mfcc13\",\n",
" \"mfcc14\",\n",
" \"mfcc15\",\n",
" \"mfcc16\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np_breathing_data_array = np.array(breathing_data_array)\n",
"np_breathing_data_array.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One row per recording -> one row per patient: index the rows by patient number\n",
"# and average every feature over that patient's recordings\n",
"breathing_data_df = (\n",
"    pd.DataFrame(np_breathing_data_array, columns=data_columns)\n",
"    .set_index(\"patient\")\n",
"    .groupby(by=\"patient\")\n",
"    .mean()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Join the per-patient sound features onto the patient table by index; the default\n",
"# inner join keeps only patients that appear in both frames.\n",
"# NOTE: data is overwritten, so re-running this cell merges the features a second time.\n",
"data = data.merge(breathing_data_df, left_index=True, right_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_data = pd.merge(left = test_patients, right = data, left_index=True, right_index=True)\n",
"train_data = pd.merge(left = train_patients, right = data, left_index=True, right_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_train = train_data.pop(\"diagnosis\")\n",
"y_test = test_data.pop(\"diagnosis\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#NORMALIZATION\n",
"# Min-max scaling. The minimum and range come from the training set only and are\n",
"# reused for the test set, so test values can fall outside [0, 1].\n",
"train_min = train_data.min()\n",
"# A feature that is constant in the training set has a range of 0; dividing by 1\n",
"# instead maps it to 0 rather than producing NaN/inf.\n",
"train_range = (train_data.max() - train_min).replace(0, 1)\n",
"\n",
"norm_train_data = (train_data - train_min) / train_range\n",
"norm_test_data = (test_data - train_min) / train_range"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"norm_train_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Uncomment to save the training and test data for later use here\n",
"\n",
"#norm_train_data.to_csv(root + \"dataframes/norm_train_data_sound_features_03_31.csv\")\n",
"#norm_test_data.to_csv(root + \"dataframes/norm_test_data_sound_features_03_31.csv\")\n",
"\n",
"#train_data.to_csv(root + \"dataframes/train_data_sound_features_03_31.csv\")\n",
"#test_data.to_csv(root + \"dataframes/test_data_sound_features_03_31.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#Uncomment to check if saving was successful\n",
"\n",
"#train_data = pd.read_csv(root + \"dataframes/train_data_sound_features_03_31.csv\")\n",
"#test_data = pd.read_csv(root + \"dataframes/test_data_sound_features_03_31.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (venv)",
"language": "python",
"name": "venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}