1315 lines (1314 with data), 76.5 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Exploratory data analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from matplotlib import pyplot as plt\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"from tabulate import tabulate"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ptnum</th>\n",
" <th>label</th>\n",
" <th>scc</th>\n",
" <th>C-103579009</th>\n",
" <th>C-125680007</th>\n",
" <th>C-186034007</th>\n",
" <th>C-263495000</th>\n",
" <th>C-398070004</th>\n",
" <th>C-424144002</th>\n",
" <th>C-72514-3</th>\n",
" <th>...</th>\n",
" <th>C-92140-3</th>\n",
" <th>C-92141-1</th>\n",
" <th>C-92142-9</th>\n",
" <th>C-94040-3</th>\n",
" <th>C-94531-1</th>\n",
" <th>C-979092</th>\n",
" <th>C-993452</th>\n",
" <th>C-997501</th>\n",
" <th>C-999998</th>\n",
" <th>C-999999</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>p17767.2</td>\n",
" <td>0</td>\n",
" <td>101</td>\n",
" <td>white</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>abnormal</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>p3832.1</td>\n",
" <td>0</td>\n",
" <td>110</td>\n",
" <td>white</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>normal</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>p10784.2</td>\n",
" <td>0</td>\n",
" <td>127</td>\n",
" <td>black</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>abnormal</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>p17630</td>\n",
" <td>0</td>\n",
" <td>129</td>\n",
" <td>white</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>abnormal</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>p17983</td>\n",
" <td>1</td>\n",
" <td>69</td>\n",
" <td>white</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>abnormal</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 785 columns</p>\n",
"</div>"
],
"text/plain": [
" ptnum label scc C-103579009 C-125680007 C-186034007 C-263495000 \\\n",
"0 p17767.2 0 101 white m nonhispanic m \n",
"1 p3832.1 0 110 white m nonhispanic m \n",
"2 p10784.2 0 127 black m nonhispanic m \n",
"3 p17630 0 129 white m nonhispanic m \n",
"4 p17983 1 69 white m nonhispanic m \n",
"\n",
" C-398070004 C-424144002 C-72514-3 ... C-92140-3 C-92141-1 C-92142-9 \\\n",
"0 massachusetts 50t70 abnormal ... NaN NaN NaN \n",
"1 massachusetts 50t70 normal ... NaN NaN NaN \n",
"2 massachusetts 50t70 abnormal ... NaN NaN NaN \n",
"3 massachusetts 50t70 abnormal ... NaN NaN NaN \n",
"4 massachusetts 50t70 abnormal ... NaN NaN NaN \n",
"\n",
" C-94040-3 C-94531-1 C-979092 C-993452 C-997501 C-999998 C-999999 \n",
"0 NaN NaN NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN NaN NaN \n",
"\n",
"[5 rows x 785 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('../data/learning_data.csv')\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We will predict the value in the 'label' column (1 - patient diagnosed with lung cancer, 0 - patient not diagnosed) using the other columns as features.**"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 9048 entries, 0 to 9047\n",
"Columns: 785 entries, ptnum to C-999999\n",
"dtypes: float64(59), int64(2), object(724)\n",
"memory usage: 54.2+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>scc</th>\n",
" <th>C-75443-2</th>\n",
" <th>C-84215-3</th>\n",
" <th>C-26453-1</th>\n",
" <th>C-26464-8</th>\n",
" <th>C-26515-7</th>\n",
" <th>C-30385-9</th>\n",
" <th>C-30428-7</th>\n",
" <th>C-33037-3</th>\n",
" <th>...</th>\n",
" <th>C-713-8</th>\n",
" <th>C-727711</th>\n",
" <th>C-731-0</th>\n",
" <th>C-736-9</th>\n",
" <th>C-742-7</th>\n",
" <th>C-751-8</th>\n",
" <th>C-770-8</th>\n",
" <th>C-82078001</th>\n",
" <th>C-86849004</th>\n",
" <th>C-86964003</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>9048.000000</td>\n",
" <td>9048.000000</td>\n",
" <td>16.0</td>\n",
" <td>16.0</td>\n",
" <td>151.000000</td>\n",
" <td>151.000000</td>\n",
" <td>151.000000</td>\n",
" <td>151.000000</td>\n",
" <td>151.000000</td>\n",
" <td>151.000000</td>\n",
" <td>...</td>\n",
" <td>256.000000</td>\n",
" <td>0.0</td>\n",
" <td>256.000000</td>\n",
" <td>256.00000</td>\n",
" <td>256.000000</td>\n",
" <td>256.000000</td>\n",
" <td>256.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.252874</td>\n",
" <td>103.895999</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>5.125166</td>\n",
" <td>13.175497</td>\n",
" <td>308.200662</td>\n",
" <td>13.006623</td>\n",
" <td>87.692715</td>\n",
" <td>8.339073</td>\n",
" <td>...</td>\n",
" <td>4.513672</td>\n",
" <td>NaN</td>\n",
" <td>0.945664</td>\n",
" <td>15.17957</td>\n",
" <td>0.948047</td>\n",
" <td>2.679688</td>\n",
" <td>27.573555</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.434683</td>\n",
" <td>21.988655</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.379863</td>\n",
" <td>1.223817</td>\n",
" <td>88.495507</td>\n",
" <td>0.899605</td>\n",
" <td>4.636940</td>\n",
" <td>3.508351</td>\n",
" <td>...</td>\n",
" <td>0.255184</td>\n",
" <td>NaN</td>\n",
" <td>0.154276</td>\n",
" <td>1.67533</td>\n",
" <td>0.075522</td>\n",
" <td>0.225392</td>\n",
" <td>3.014690</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" <td>9.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>4.500000</td>\n",
" <td>11.100000</td>\n",
" <td>155.300000</td>\n",
" <td>11.600000</td>\n",
" <td>80.000000</td>\n",
" <td>2.000000</td>\n",
" <td>...</td>\n",
" <td>3.740000</td>\n",
" <td>NaN</td>\n",
" <td>0.500000</td>\n",
" <td>10.32000</td>\n",
" <td>0.750000</td>\n",
" <td>2.080000</td>\n",
" <td>18.610000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000</td>\n",
" <td>100.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>4.800000</td>\n",
" <td>12.000000</td>\n",
" <td>232.950000</td>\n",
" <td>12.150000</td>\n",
" <td>83.600000</td>\n",
" <td>5.500000</td>\n",
" <td>...</td>\n",
" <td>4.370000</td>\n",
" <td>NaN</td>\n",
" <td>0.980000</td>\n",
" <td>14.08250</td>\n",
" <td>0.900000</td>\n",
" <td>2.540000</td>\n",
" <td>25.457500</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.000000</td>\n",
" <td>107.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>5.100000</td>\n",
" <td>13.400000</td>\n",
" <td>304.100000</td>\n",
" <td>13.100000</td>\n",
" <td>88.100000</td>\n",
" <td>8.200000</td>\n",
" <td>...</td>\n",
" <td>4.500000</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>15.19000</td>\n",
" <td>0.950000</td>\n",
" <td>2.690000</td>\n",
" <td>27.840000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.000000</td>\n",
" <td>116.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>5.400000</td>\n",
" <td>14.300000</td>\n",
" <td>386.550000</td>\n",
" <td>13.800000</td>\n",
" <td>91.400000</td>\n",
" <td>11.350000</td>\n",
" <td>...</td>\n",
" <td>4.680000</td>\n",
" <td>NaN</td>\n",
" <td>1.020000</td>\n",
" <td>16.38500</td>\n",
" <td>1.000000</td>\n",
" <td>2.830000</td>\n",
" <td>29.502500</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.000000</td>\n",
" <td>190.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>5.900000</td>\n",
" <td>15.000000</td>\n",
" <td>449.900000</td>\n",
" <td>14.600000</td>\n",
" <td>96.000000</td>\n",
" <td>14.900000</td>\n",
" <td>...</td>\n",
" <td>5.240000</td>\n",
" <td>NaN</td>\n",
" <td>1.090000</td>\n",
" <td>20.15000</td>\n",
" <td>1.160000</td>\n",
" <td>3.260000</td>\n",
" <td>34.850000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 61 columns</p>\n",
"</div>"
],
"text/plain": [
" label scc C-75443-2 C-84215-3 C-26453-1 C-26464-8 \\\n",
"count 9048.000000 9048.000000 16.0 16.0 151.000000 151.000000 \n",
"mean 0.252874 103.895999 1.0 1.0 5.125166 13.175497 \n",
"std 0.434683 21.988655 0.0 0.0 0.379863 1.223817 \n",
"min 0.000000 9.000000 1.0 1.0 4.500000 11.100000 \n",
"25% 0.000000 100.000000 1.0 1.0 4.800000 12.000000 \n",
"50% 0.000000 107.000000 1.0 1.0 5.100000 13.400000 \n",
"75% 1.000000 116.000000 1.0 1.0 5.400000 14.300000 \n",
"max 1.000000 190.000000 1.0 1.0 5.900000 15.000000 \n",
"\n",
" C-26515-7 C-30385-9 C-30428-7 C-33037-3 ... C-713-8 \\\n",
"count 151.000000 151.000000 151.000000 151.000000 ... 256.000000 \n",
"mean 308.200662 13.006623 87.692715 8.339073 ... 4.513672 \n",
"std 88.495507 0.899605 4.636940 3.508351 ... 0.255184 \n",
"min 155.300000 11.600000 80.000000 2.000000 ... 3.740000 \n",
"25% 232.950000 12.150000 83.600000 5.500000 ... 4.370000 \n",
"50% 304.100000 13.100000 88.100000 8.200000 ... 4.500000 \n",
"75% 386.550000 13.800000 91.400000 11.350000 ... 4.680000 \n",
"max 449.900000 14.600000 96.000000 14.900000 ... 5.240000 \n",
"\n",
" C-727711 C-731-0 C-736-9 C-742-7 C-751-8 C-770-8 \\\n",
"count 0.0 256.000000 256.00000 256.000000 256.000000 256.000000 \n",
"mean NaN 0.945664 15.17957 0.948047 2.679688 27.573555 \n",
"std NaN 0.154276 1.67533 0.075522 0.225392 3.014690 \n",
"min NaN 0.500000 10.32000 0.750000 2.080000 18.610000 \n",
"25% NaN 0.980000 14.08250 0.900000 2.540000 25.457500 \n",
"50% NaN 1.000000 15.19000 0.950000 2.690000 27.840000 \n",
"75% NaN 1.020000 16.38500 1.000000 2.830000 29.502500 \n",
"max NaN 1.090000 20.15000 1.160000 3.260000 34.850000 \n",
"\n",
" C-82078001 C-86849004 C-86964003 \n",
"count 0.0 0.0 0.0 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
"min NaN NaN NaN \n",
"25% NaN NaN NaN \n",
"50% NaN NaN NaN \n",
"75% NaN NaN NaN \n",
"max NaN NaN NaN \n",
"\n",
"[8 rows x 61 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Nulls"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"First 10 columns with highest percentage of nulls:\n",
" Column Nulls\n",
"--- ----------- -------\n",
"715 C-48387007 1\n",
"733 C-698423002 1\n",
"753 C-82078001 1\n",
"613 C-13569004 1\n",
"615 C-1373463 1\n",
"742 C-727711 1\n",
"616 C-14152002 1\n",
"689 C-313572 1\n",
"620 C-161621004 1\n",
"610 C-113076002 1\n",
"738 C-707418001 1\n",
"765 C-86849004 1\n",
"766 C-86964003 1\n",
"699 C-406602003 1\n",
"681 C-288328004 1\n",
"700 C-408512008 1\n",
"702 C-427089005 1\n",
"652 C-232657004 1\n",
"707 C-444260001 1\n",
"708 C-448417001 1\n",
"647 C-205532 1\n",
"709 C-448813005 1\n",
"717 C-52734007 1\n",
"640 C-198767 1\n",
"633 C-190905008 1\n"
]
}
],
"source": [
"# Fraction (0-1) of missing values per column, sorted from most to least missing.\n",
"# The frame keeps each column's position in `df.columns` as its index.\n",
"top_n = 25  # how many of the most-incomplete columns to show\n",
"null_percentages = (\n",
"    df.isnull()\n",
"    .mean()\n",
"    .rename_axis('Column')\n",
"    .reset_index(name='Nulls')\n",
"    .sort_values(by='Nulls', ascending=False)\n",
")\n",
"# Keep the headline in sync with the number of rows actually displayed\n",
"print(f'Top {top_n} columns with the highest fraction of nulls:')\n",
"print(tabulate(null_percentages.head(top_n), headers='keys', tablefmt='simple'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In feature engineering we will drop the columns with only null values. Other columns with null values will be either imputed or dropped based on the percentage of null values."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Another unnecessary column is `ptnum`, which is just a patient number, so we drop it."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df = df.drop('ptnum', axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Decoding column names"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>scc</th>\n",
" <th>race</th>\n",
" <th>marital</th>\n",
" <th>ethnic</th>\n",
" <th>gender</th>\n",
" <th>state</th>\n",
" <th>age</th>\n",
" <th>Pain severity - 0-10 verbal numeric rating [Score] - Reported</th>\n",
" <th>Influenza seasonal injectable preservative free</th>\n",
" <th>...</th>\n",
" <th>Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection</th>\n",
" <th>Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection</th>\n",
" <th>Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection</th>\n",
" <th>Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection</th>\n",
" <th>SARS-CoV-2 RNA Pnl Resp NAA+probe</th>\n",
" <th>Hydroxychloroquine Sulfate 200 MG Oral Tablet</th>\n",
" <th>1 ML denosumab 60 MG/ML Prefilled Syringe</th>\n",
" <th>Fexofenadine hydrochloride 60 MG Oral Tablet</th>\n",
" <th>Leronlimab 700 MG Injection</th>\n",
" <th>Lenzilumab 200 MG IV</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>101</td>\n",
" <td>white</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>abnormal</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>110</td>\n",
" <td>white</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>normal</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>127</td>\n",
" <td>black</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>abnormal</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>129</td>\n",
" <td>white</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>abnormal</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>69</td>\n",
" <td>white</td>\n",
" <td>m</td>\n",
" <td>nonhispanic</td>\n",
" <td>m</td>\n",
" <td>massachusetts</td>\n",
" <td>50t70</td>\n",
" <td>abnormal</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 784 columns</p>\n",
"</div>"
],
"text/plain": [
" label scc race marital ethnic gender state age \\\n",
"0 0 101 white m nonhispanic m massachusetts 50t70 \n",
"1 0 110 white m nonhispanic m massachusetts 50t70 \n",
"2 0 127 black m nonhispanic m massachusetts 50t70 \n",
"3 0 129 white m nonhispanic m massachusetts 50t70 \n",
"4 1 69 white m nonhispanic m massachusetts 50t70 \n",
"\n",
" Pain severity - 0-10 verbal numeric rating [Score] - Reported \\\n",
"0 abnormal \n",
"1 normal \n",
"2 abnormal \n",
"3 abnormal \n",
"4 abnormal \n",
"\n",
" Influenza seasonal injectable preservative free ... \\\n",
"0 True ... \n",
"1 True ... \n",
"2 True ... \n",
"3 True ... \n",
"4 True ... \n",
"\n",
" Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" SARS-CoV-2 RNA Pnl Resp NAA+probe \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" Hydroxychloroquine Sulfate 200 MG Oral Tablet \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" 1 ML denosumab 60 MG/ML Prefilled Syringe \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" Fexofenadine hydrochloride 60 MG Oral Tablet Leronlimab 700 MG Injection \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"\n",
" Lenzilumab 200 MG IV \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
"[5 rows x 784 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace the coded column labels with the names listed in the codes lookup file.\n",
"df_codes = pd.read_csv('../data/codes.csv')\n",
"code_to_name = dict(zip(df_codes['code'], df_codes['name']))\n",
"df = df.rename(code_to_name, axis='columns')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['label', 'scc', 'race', 'marital', 'ethnic', 'gender', 'state', 'age',\n",
" 'Pain severity - 0-10 verbal numeric rating [Score] - Reported',\n",
" 'Influenza seasonal injectable preservative free',\n",
" ...\n",
" 'Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
" 'Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
" 'Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
" 'Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
" 'SARS-CoV-2 RNA Pnl Resp NAA+probe',\n",
" 'Hydroxychloroquine Sulfate 200 MG Oral Tablet',\n",
" '1 ML denosumab 60 MG/ML Prefilled Syringe',\n",
" 'Fexofenadine hydrochloride 60 MG Oral Tablet',\n",
" 'Leronlimab 700 MG Injection', 'Lenzilumab 200 MG IV'],\n",
" dtype='object', length=784)\n"
]
}
],
"source": [
"column_names = df.columns\n",
"print(column_names)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Distribution of categorical values"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"categorical_columns = df.select_dtypes(include = ['object'])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(9048, 784)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"DataFrame after combining columns with the same name:\n",
" Glucose Throat culture (procedure) \\\n",
"0 normal NaN \n",
"1 normal NaN \n",
"2 normal NaN \n",
"3 normal NaN \n",
"4 normal NaN \n",
"... ... ... \n",
"9043 normal NaN \n",
"9044 normal True \n",
"9045 normal NaN \n",
"9046 normal NaN \n",
"9047 normal True \n",
"\n",
" Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma \\\n",
"0 NaN \n",
"1 abnormal \n",
"2 abnormal \n",
"3 abnormal \n",
"4 NaN \n",
"... ... \n",
"9043 abnormal \n",
"9044 abnormal \n",
"9045 abnormal \n",
"9046 NaN \n",
"9047 abnormal \n",
"\n",
" lisinopril 10 MG Oral Tablet History of appendectomy \\\n",
"0 True NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 True NaN \n",
"4 True NaN \n",
"... ... ... \n",
"9043 NaN NaN \n",
"9044 NaN NaN \n",
"9045 NaN NaN \n",
"9046 True NaN \n",
"9047 NaN NaN \n",
"\n",
" clonazePAM 0.25 MG Oral Tablet \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"9043 NaN \n",
"9044 NaN \n",
"9045 NaN \n",
"9046 NaN \n",
"9047 NaN \n",
"\n",
" Speech and language therapy regime (regime/therapy \\\n",
"0 True \n",
"1 NaN \n",
"2 True \n",
"3 True \n",
"4 NaN \n",
"... ... \n",
"9043 NaN \n",
"9044 NaN \n",
"9045 NaN \n",
"9046 NaN \n",
"9047 NaN \n",
"\n",
" remdesivir 100 MG Injection losartan potassium 25 MG Oral Tablet \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"... ... ... \n",
"9043 NaN NaN \n",
"9044 NaN NaN \n",
"9045 NaN NaN \n",
"9046 NaN NaN \n",
"9047 NaN NaN \n",
"\n",
" Headache (finding) ... Specific gravity of Urine by Test strip \\\n",
"0 NaN ... NaN \n",
"1 NaN ... NaN \n",
"2 NaN ... NaN \n",
"3 NaN ... NaN \n",
"4 NaN ... NaN \n",
"... ... ... ... \n",
"9043 NaN ... NaN \n",
"9044 NaN ... NaN \n",
"9045 NaN ... NaN \n",
"9046 NaN ... NaN \n",
"9047 NaN ... NaN \n",
"\n",
" Iron binding capacity [Mass/volume] in Serum or Plasma \\\n",
"0 NaN \n",
"1 normal \n",
"2 normal \n",
"3 normal \n",
"4 NaN \n",
"... ... \n",
"9043 NaN \n",
"9044 normal \n",
"9045 normal \n",
"9046 NaN \n",
"9047 normal \n",
"\n",
" sacubitril 97 MG / valsartan 103 MG Oral Tablet \\\n",
"0 NaN \n",
"1 NaN \n",
"2 True \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"9043 NaN \n",
"9044 NaN \n",
"9045 True \n",
"9046 NaN \n",
"9047 NaN \n",
"\n",
" Catheter ablation of tissue of heart Bilateral tubal ligation \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"... ... ... \n",
"9043 NaN NaN \n",
"9044 NaN NaN \n",
"9045 NaN NaN \n",
"9046 NaN NaN \n",
"9047 NaN NaN \n",
"\n",
" History of amputation of foot (situation) \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"9043 NaN \n",
"9044 NaN \n",
"9045 NaN \n",
"9046 NaN \n",
"9047 NaN \n",
"\n",
" Are you covered by health insurance or some other kind of health care plan [PhenX] \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"9043 NaN \n",
"9044 NaN \n",
"9045 NaN \n",
"9046 NaN \n",
"9047 NaN \n",
"\n",
" White oak IgE Ab in Serum Ferritin [Mass/volume] in Serum or Plasma \\\n",
"0 NaN NaN \n",
"1 NaN normal \n",
"2 NaN normal \n",
"3 NaN normal \n",
"4 NaN NaN \n",
"... ... ... \n",
"9043 NaN NaN \n",
"9044 NaN normal \n",
"9045 NaN normal \n",
"9046 NaN NaN \n",
"9047 NaN normal \n",
"\n",
" Brain damage - traumatic \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"9043 NaN \n",
"9044 NaN \n",
"9045 NaN \n",
"9046 NaN \n",
"9047 NaN \n",
"\n",
"[9048 rows x 779 columns]\n"
]
}
],
"source": [
"# Decoding mapped several codes to the same name, so `df` now has repeated column\n",
"# labels. Collapse every group of same-named columns into a single column that\n",
"# holds, per row, the first non-null value found scanning the group left to right.\n",
"combined = {}\n",
"for col in dict.fromkeys(df.columns):  # unique labels, original column order kept\n",
"    same_named = df.loc[:, df.columns == col]\n",
"    if same_named.shape[1] > 1:\n",
"        # bfill along each row pulls the first non-null value into the leftmost column\n",
"        combined[col] = same_named.bfill(axis=1).iloc[:, 0]\n",
"    else:\n",
"        combined[col] = same_named.iloc[:, 0]\n",
"\n",
"# Build the frame in one step rather than inserting hundreds of columns one by one\n",
"df_combined = pd.DataFrame(combined)\n",
"assert df_combined.columns.is_unique\n",
"\n",
"print(\"\\nDataFrame after combining columns with the same name:\")\n",
"print(df_combined.shape)\n",
"df_combined.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df = df_combined.copy()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"df.drop(columns=['race', 'ethnic'], inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Distribution of label"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjMAAAHACAYAAABXvOnoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAABHDklEQVR4nO3deVxU9f7H8fcoMKAihsoqKiJuoeaeWO7gflNLr5K5ZT+9uC/XMrtXMoOiMitTs8wlNdvUunZLMZVyyXDLNZdSxIVwBdxA4Pz+6MHcJlwYAoax1/PxmMej8z3f+c5nBmjefs/3nGMyDMMQAACAgypl7wIAAAD+DMIMAABwaIQZAADg0AgzAADAoRFmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODTCDO4ZixYtkslkkqurqxITE/Psb9u2rUJCQuxQmbRp0yaZTCZ9+umndnl9W504cULdunWTp6enTCaTxo0bd9u+1atXV/fu3Yusljlz5mjRokVFNr4ju3jxovr16ycvLy+ZTCb17Nnztn0L8/c/929tx44dhTLe78c8ceJEoY2Jvw4nexcAFLaMjAw999xz+uCDD+xdisMaP368tm/frvfff18+Pj7y9fW1Wy1z5sxRpUqVNHjwYLvVUFK98MILWrVqld5//30FBQXJ09PT3iUBdkGYwT2nc+fOWr58uSZNmqSGDRvau5xidf36dbm6uspkMv2pcfbv36/mzZvf8V/6sL/9+/crKChIjz/+uL1LAeyKw0y450yePFkVK1bU008/fcd+J06ckMlkuuUhDJPJpKioKMt2VFSUTCaT9u7dqz59+sjDw0Oenp6aMGGCsrKydPjwYXXu3Fnu7u6qXr26YmNjb/maN27c0IQJE+Tj4yM3Nze1adNGu3fvztNvx44d+tvf/iZPT0+5urqqUaNG+vjjj6365E7Lr1u3TkOHDlXlypVVpkwZZWRk3PY9nzx5UgMGDJCXl5fMZrPq1q2r1157TTk5OZL+dzjs2LFj+uqrr2QymWye+s/9XF999VXNnDlTgYGBKleunFq2bKnvv//equ8vv/yifv36yc/PT2azWd7e3urQoYP27Nkj6bdDWAcOHFB8fLyllurVq1s+y4kTJ+qBBx6w/Dxatmypzz//PE9NJpNJo0aN0gcffKC6deuqTJkyatiwodasWZOn708//aT+/fvL29tbZrNZVatW1cCBA60+1+TkZA0fPlxVqlSRi4uLAgMD9fzzzysrK8tqrLlz56phw4YqV66c3N3dVadOHT377LN3/QwvXryoyMhI+fv7y8XFRTVq1NDUqVMtNeR+xuvXr9ehQ4csn82mTZvuOvad7NixQ/369VP16tXl5uam6tWrq3///rc8bCtJly5d0pAhQ+Tp6amyZcuqR48e+uWXX/L0W79+vTp06KDy5curTJkyatWqlb755pu71rN79251797d8vvq5+enbt266dSpU3/qfeLew8wM7jnu7u567rnnNHbsWG3YsEHt27cvtLH79u2rAQMGaPjw4YqLi1NsbKxu3ryp9evXKzIyUpMmTdLy5cv19NNPq2bNmurdu7fV85999lk1btxY7733nlJTUxUVFaW2bdtq9+7dqlGjhiRp48aN6ty5s1q0aKF58+bJw8NDK1as0N///nddu3Ytz+GWoUOHqlu3bvrggw909epVOTs737L2c+fOKTQ0VJmZmXrhhRdUvXp1rVmzRpMmTdLPP/+sOXPmqHHjxtq2bZt69eqloKAgvfrqq5JUoMNMb7/9turUqaNZs2ZJkv71r3+pa9euOn78uDw8PCRJXbt2VXZ2tmJjY1W1alWdP39eW7du1eXLlyVJq1at0mOPPSYPDw/NmTNHkmQ2myX9djjx4sWLmjRpkvz9/ZWZman169erd+/eWrhwoQYOHGhVz5dffqmEhARNnz5d5cqVU2xsrHr16qXDhw9bPvsff/xRDz30kCpVqqTp06
crODhYZ8+e1RdffKHMzEyZzWYlJyerefPmKlWqlP79738rKChI27Zt04wZM3TixAktXLhQkrRixQpFRkZq9OjRevXVV1WqVCkdO3ZMBw8evOPnduPGDbVr104///yznn/+eTVo0EDfffedYmJitGfPHn355Zfy9fXVtm3bFBkZqdTUVC1btkySVK9ePZt/Tr934sQJ1a5dW/369ZOnp6fOnj2ruXPnqlmzZjp48KAqVapk1f/JJ59UWFiYli9frqSkJD333HNq27at9u7dqwoVKkiSli5dqoEDB+qRRx7R4sWL5ezsrHfeeUedOnXS2rVr1aFDh1vWcvXqVYWFhSkwMFBvv/22vL29lZycrI0bNyo9Pf1PvU/cgwzgHrFw4UJDkpGQkGBkZGQYNWrUMJo2bWrk5OQYhmEYbdq0Me6//35L/+PHjxuSjIULF+YZS5Ixbdo0y/a0adMMScZrr71m1e+BBx4wJBkrV660tN28edOoXLmy0bt3b0vbxo0bDUlG48aNLfUYhmGcOHHCcHZ2NoYNG2Zpq1OnjtGoUSPj5s2bVq/VvXt3w9fX18jOzrZ6vwMHDszX5/PMM88Ykozt27dbtf/jH/8wTCaTcfjwYUtbtWrVjG7duuVr3D/2zf1c69evb2RlZVnaf/jhB0OS8eGHHxqGYRjnz583JBmzZs264/j333+/0aZNm7vWkZWVZdy8edN48sknjUaNGlntk2R4e3sbaWlplrbk5GSjVKlSRkxMjKWtffv2RoUKFYyUlJTbvs7w4cONcuXKGYmJiVbtr776qiHJOHDggGEYhjFq1CijQoUKd637j+bNm2dIMj7++GOr9pdfftmQZKxbt87S9sff6TuxpW+urKws48qVK0bZsmWNN954w9Ke+7vXq1cvq/5btmwxJBkzZswwDMMwrl69anh6eho9evSw6pednW00bNjQaN68eZ4xjx8/bhiGYezYscOQZKxevdqmmvHXxGEm3JNcXFw0Y8YM7dixI8/hmT/jj2ft1K1bVyaTSV26dLG0OTk5qWbNmrecmo+IiLBaz1KtWjWFhoZq48aNkqRjx47pp59+sqyByMrKsjy6du2qs2fP6vDhw1ZjPvroo/mqfcOGDapXr56aN29u1T548GAZhqENGzbka5z86tatm0qXLm3ZbtCggSRZPhdPT08FBQXplVde0cyZM7V7927L4a78+uSTT9SqVSuVK1dOTk5OcnZ21oIFC3To0KE8fdu1ayd3d3fLtre3t7y8vCz1XLt2TfHx8erbt68qV65829dcs2aN2rVrJz8/P6ufT+7vQHx8vCSpefPmunz5svr376/PP/9c58+fz9d72rBhg8qWLavHHnvMqj13Ri4/h2cK6sqVK5ZZRScnJzk5OalcuXK6evXqLT/TP67VCQ0NVbVq1Sy/z1u3btXFixc1aNAgq88qJydHnTt3VkJCgq5evXrLWmrWrKn77rtPTz/9tObNm3fXGS38tRFmcM/q16+fGjdurKlTp+rmzZuFMuYfzxZxcXFRmTJl5Orqmqf9xo0beZ7v4+Nzy7YLFy5Ikn799VdJ0qRJk+Ts7Gz1iIyMlKQ8X4r5PQR04cKFW/b18/Oz7C9MFStWtNrOPTx0/fp1Sb+tY/nmm2/UqVMnxcbGqnHjxqpcubLGjBmTr8MIK1euVN++feXv76+lS5dq27ZtSkhI0NChQ2/52f+xntyacuu5dOmSsrOzVaVKlTu+7q+//qr//Oc/eX4+999/v6T//XyeeOIJvf/++0pMTNSjjz4qLy8vtWjRQnFxcXcc/8KFC/Lx8cmziNvLy0tOTk6F/nP6vYiICM2ePVvDhg3T2rVr9cMPPyghIUGVK1e2fE6/l9/f58ceeyzP5/Xyyy/LMAxdvHjxlrV4eHgoPj5eDzzwgJ599lndf//98vPz07Rp0wrt7xn3DtbM4J5lMpn08ssvKywsTPPnz8+zPzeA/HHBbFF+WSQnJ9+yLfeLNndNwpQpU/
Kst8lVu3Ztq+38nrlUsWJFnT17Nk/7mTNnrF67OFWrVk0LFiyQJB05ckQff/yxoqKilJmZqXnz5t3xuUuXLlVgYKA++ugjq8/gTgug78TT01OlS5e+6+LSSpUqqUGDBnrxxRdvuT83HErSkCFDNGTIEF29elXffvutpk2bpu7du+vIkSOqVq3aLZ9fsWJFbd++XYZhWL2vlJQUZWVlFdnPKTU1VWvWrNG0adP0zDPPWNpz1ybdyu1+n2vWrCnpf79Tb731lh588MFbjuHt7X3bmurXr68VK1bIMAzt3btXixYt0vTp0+Xm5mZVI8DMDO5pHTt2VFhYmKZPn64rV65Y7fP29parq6v27t1r1X6rs2EKy4cffijDMCzbiYmJ2rp1q9q2bSvpt6ASHBysH3/8UU2bNr3l4/eHSmzRoUMHHTx4ULt27bJqX7JkiUwmk9q1a1fg91UYatWqpeeee07169e3qvH3sye/ZzKZ5OLiYvWFn5ycXOCfX+7ZZZ988skdDwl1797dckr0rX4+vw8zucqWLasuXbpo6tSpyszM1IEDB247focOHXTlyhWtXr3aqn3JkiWW/UXBZDLJMAzLDFqu9957T9nZ2bd8Tu7C41xbt25VYmKi5fe5VatWqlChgg4ePHjb32cXF5d81dawYUO9/vrrqlChQp7fYYCZGdzzXn75ZTVp0kQpKSmWQwHSb/+DHDBggOWCYw0bNtQPP/yg5cuXF1ktKSkp6tWrl5566imlpqZq2rRpcnV11ZQpUyx93nnnHXXp0kWdOnXS4MGD5e/vr4sXL+rQoUPatWuXPvnkkwK99vjx47VkyRJ169ZN06dPV7Vq1fTll19qzpw5+sc//qFatWoV1tvMl71792rUqFHq06ePgoOD5eLiog0bNmjv3r1W/+rO/df5Rx99pBo1asjV1VX169dX9+7dtXLlSkVGRuqxxx5TUlKSXnjhBfn6+uro0aMFqmnmzJl66KGH1KJFCz3zzDOqWbOmfv31V33xxRd655135O7urunTpysuLk6hoaEaM2aMateurRs3bujEiRP673//q3nz5qlKlSp66qmn5ObmplatWsnX11fJycmKiYmRh4eHmjVrdtsaBg4cqLfffluDBg3SiRMnVL9+fW3evFnR0dHq2rWrOnbsWKD3JklpaWm3vAp15cqV1aZNG7Vu3VqvvPKKKlWqpOrVqys+Pl4LFiywnJn0Rzt27NCwYcPUp08fJSUlaerUqfL397ccEi1XrpzeeustDRo0SBcvXtRjjz0mLy8vnTt3Tj/++KPOnTunuXPn3nLsNWvWaM6cOerZs6dq1KghwzC0cuVKXb58WWFhYQX+DHCPsuPiY6BQ/f5spj+KiIgwJOU5myM1NdUYNmyY4e3tbZQtW9bo0aOHceLEiduezXTu3Dmr5w8aNMgoW7Zsntf745kjuWczffDBB8aYMWOMypUrG2az2Xj44YeNHTt25Hn+jz/+aPTt29fw8vIynJ2dDR8fH6N9+/bGvHnz8vV+bycxMdGIiIgwKlasaDg7Oxu1a9c2XnnlFcsZUrkK42ymV155JU/f33+uv/76qzF48GCjTp06RtmyZY1y5coZDRo0MF5//XWrs6BOnDhhhIeHG+7u7oYko1q1apZ9L730klG9enXDbDYbdevWNd59913Lz+qPrzty5Mhb1j5o0CCrtoMHDxp9+vQxKlasaLi4uBhVq1Y1Bg8ebNy4ccPS59y5c8aYMWOMwMBAw9nZ2fD09DSaNGliTJ061bhy5YphGIaxePFio127doa3t7fh4uJi+Pn5GX379jX27t1718/0woULxogRIwxfX1/DycnJqFatmjFlyhSrGgzD9rOZJN3ykXu22KlTp4xHH33UuO+++wx3d3ejc+fOxv79+/N8Trm/e+vWrTOeeOIJo0KFCoabm5vRtWtX4+jRo3leOz4+3ujWrZvh6elpODs7G/7+/ka3bt2MTz75JM+YuWcz/fTTT0b//v
2NoKAgw83NzfDw8DCaN29uLFq0KF/vF38tJsP43Zw3AACAg2HNDAAAcGiEGQAA4NAIMwAAwKERZgAAgEMjzAAAAIdGmAEAAA7tnr9oXk5Ojs6cOSN3d/d8X/YdAADYl2EYSk9Pl5+fn0qVuvPcyz0fZs6cOaOAgAB7lwEAAAogKSnprjeAvefDTO59bJKSklS+fHk7V4P69evr5MmTedqHDRum1157zapt7NixWrRokWJiYiyXR7+Vbt26afPmzXnaw8PDLZf+37Jli958803t2bNHycnJWrZsmbp3727V/80339Sbb74p6bdL/48cOdKyb8eOHZowYYI2btyo0qVL5/8NAwAKJC0tTQEBAfm6H909H2ZyDy2VL1+eMFMC7Nixw+qmdfv371dYWJgef/xxq5/P6tWrtXv3bvn5+cnV1fWOP7vPP/9cmZmZlu0LFy6oYcOG6t+/v9XzmjRpoqeeekqPPvqoypQpY7Vv3759io6O1po1a2QYhrp3764ePXooJCREN2/e1MSJE/Xuu+/qvvvuK6yPAgCQD/lZInLPhxmULJUrV7bafumllxQUFKQ2bdpY2k6fPq1Ro0Zp7dq16tat213H9PT0tNpesWKFypQpoz59+ljaunTpoi5dutx2jEOHDqlBgwZq3769JKlBgwY6dOiQQkJC9Morr6h169Z3vDkgAMB+CDOwm8zMTC1dulQTJkywJO+cnBw98cQT+uc//2l1h2tbLFiwQP369VPZsmXz/Zz69evryJEjOnnypAzD0JEjRxQSEqJjx45p0aJF2rlzZ4FqAQAUPU7Nht2sXr1aly9f1uDBgy1tL7/8spycnDRmzJgCjfnDDz9o//79GjZsmE3Pq1u3rqKjoxUWFqbw8HDFxMSobt26GjFihGJjY7V27VqFhISoUaNG+vbbbwtUGwCgaDAzA7tZsGCBunTpIj8/P0nSzp079cYbb2jXrl0FPo1+wYIFCgkJUfPmzW1+7ogRIzRixAjL9qJFi+Tu7q6WLVuqdu3aSkhI0KlTp9SvXz8dP35cZrO5QDUCAAoXMzOwi8TERK1fv95qBuW7775TSkqKqlatKicnJzk5OSkxMVETJ05U9erV7zrmtWvXtGLFCptnZW7l/Pnzmj59ut566y1t375dtWrVUnBwsNq1a6ebN2/qyJEjf/o1AACFg5kZ2MXChQvl5eVltcD3iSeeUMeOHa36derUSU888YSGDBly1zE//vhjZWRkaMCAAX+6vnHjxmn8+PGqUqWKEhISdPPmTcu+rKwsqzOyAAD2RZhBscvJydHChQs1aNAgOTn971ewYsWKqlixolVfZ2dn+fj4qHbt2pa2gQMHyt/fXzExMVZ9FyxYoJ49e+YZQ5KuXLmiY8eOWbaPHz+uPXv2yNPTU1WrVrXqGxcXp6NHj2rJkiWSpObNm+unn37SV199paSkJJUuXdqqHgCAfRFmUOzWr1+vkydPaujQoQV6/smTJ/Nc2vrIkSPavHmz1q1bd8vn7NixQ+3atbNsT5gwQZI0aNAgLVq0yNJ+/fp1jRo1Sh999JHlNfz9/fXWW29pyJAhMpvNWrx4sdzc3ApUOwCg8JkMwzDsXURRSktLk4eHh1JTU7loHgAADsKW728WAAMAAIdGmAEAAA6NMAMAABwaYQYAADg0wgwAAHBohBkAAODQuM7MPeyTn1PtXQKKUZ8gD3uXAAB2wcwMAABwaIQZAADg0AgzAADAoRFmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODTCDAAAcGiEGQAA4NAIMwAAwKERZgAAgEMjzAAAAIdGmAEAAA6NMAMAABwaYQYAADg0wgwAAHBodg0z1atXl8lkyvMYOXKkJMkwDEVFRcnPz09ubm5q27atDhw4YM+SAQBACWPXMJOQkKCzZ89aHnFxcZKkPn36SJJiY2M1c+ZMzZ49WwkJCfLx8VFYWJjS09PtWTYAAChB7BpmKleuLB8fH8tjzZo1Cg
oKUps2bWQYhmbNmqWpU6eqd+/eCgkJ0eLFi3Xt2jUtX77cnmUDAIASpMSsmcnMzNTSpUs1dOhQmUwmHT9+XMnJyQoPD7f0MZvNatOmjbZu3XrbcTIyMpSWlmb1AAAA964SE2ZWr16ty5cva/DgwZKk5ORkSZK3t7dVP29vb8u+W4mJiZGHh4flERAQUGQ1AwAA+ysxYWbBggXq0qWL/Pz8rNpNJpPVtmEYedp+b8qUKUpNTbU8kpKSiqReAABQMjjZuwBJSkxM1Pr167Vy5UpLm4+Pj6TfZmh8fX0t7SkpKXlma37PbDbLbDYXXbEAAKBEKREzMwsXLpSXl5e6detmaQsMDJSPj4/lDCfpt3U18fHxCg0NtUeZAACgBLL7zExOTo4WLlyoQYMGycnpf+WYTCaNGzdO0dHRCg4OVnBwsKKjo1WmTBlFRETYsWIAAFCS2D3MrF+/XidPntTQoUPz7Js8ebKuX7+uyMhIXbp0SS1atNC6devk7u5uh0oBAEBJZDIMw7B3EUUpLS1NHh4eSk1NVfny5e1dTrH65OdUe5eAYtQnyMPeJQBAobHl+7tErJkBAAAoKMIMAABwaIQZAADg0AgzAADAoRFmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODTCDAAAcGiEGQAA4NAIMwAAwKERZgAAgEMjzAAAAIdGmAEAAA6NMAMAABwaYQYAADg0wgwAAHBohBkAAODQCDMAAMChEWYAAIBDI8wAAACHRpgBAAAOjTADAAAcGmEGAAA4NMIMAABwaIQZAADg0AgzAADAoRFmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODTCDAAAcGiEGQAA4NAIMwAAwKHZPcycPn1aAwYMUMWKFVWmTBk98MAD2rlzp2W/YRiKioqSn5+f3Nzc1LZtWx04cMCOFQMAgJLErmHm0qVLatWqlZydnfXVV1/p4MGDeu2111ShQgVLn9jYWM2cOVOzZ89WQkKCfHx8FBYWpvT0dPsVDgAASgwne774yy+/rICAAC1cuNDSVr16dct/G4ahWbNmaerUqerdu7ckafHixfL29tby5cs1fPjw4i4ZAACUMHadmfniiy/UtGlT9enTR15eXmrUqJHeffddy/7jx48rOTlZ4eHhljaz2aw2bdpo69attxwzIyNDaWlpVg8AAHDvsmuY+eWXXzR37lwFBwdr7dq1GjFihMaMGaMlS5ZIkpKTkyVJ3t7eVs/z9va27PujmJgYeXh4WB4BAQFF+yYAAIBd2TXM5OTkqHHjxoqOjlajRo00fPhwPfXUU5o7d65VP5PJZLVtGEaetlxTpkxRamqq5ZGUlFRk9QMAAPuza5jx9fVVvXr1rNrq1q2rkydPSpJ8fHwkKc8sTEpKSp7Zmlxms1nly5e3egAAgHuXXcNMq1atdPjwYau2I0eOqFq1apKkwMBA+fj4KC4uzrI/MzNT8fHxCg0NLdZaAQBAyWTXs5nGjx+v0NBQRUdHq2/fvvrhhx80f/58zZ8/X9Jvh5fGjRun6OhoBQcHKzg4WNHR0SpTpowiIiLsWToAACgh7BpmmjVrplWrVmnKlCmaPn26AgMDNWvWLD3++OOWPpMnT9b169cVGRmpS5cuqUWLFlq3bp3c3d3tWDkAACgpTIZhGPYuoiilpaXJw8NDqampf7n1M5/8nGrvElCM+gR52LsEACg0tnx/2/12BgAAAH8GYQYAADg0wgwAAHBohBkAAODQCDMAAMChEWYAAIBDI8wAAACHRpgBAAAOjTADAAAcms1hZvHixfryyy8t25MnT1aFChUUGhqqxMTEQi0OAADgbmwOM9HR0XJzc5Mkbdu2TbNnz1ZsbKwqVaqk8ePHF3qBAAAAd2LzjSaTkpJUs2ZNSdLq1av12GOP6f/+7//UqlUrtW3btrDrAwAAuCObZ2bKlSunCxcuSJLWrVunjh07SpJcXV11/fr1wq0OAADgLmyemQkLC9OwYc
PUqFEjHTlyRN26dZMkHThwQNWrVy/s+gAAAO7I5pmZt99+Wy1bttS5c+f02WefqWLFipKknTt3qn///oVeIAAAwJ2YDMMw7F1EUUpLS5OHh4dSU1NVvnx5e5dTrD75OdXeJaAY9QnysHcJAFBobPn+LtB1Zr777jsNGDBAoaGhOn36tCTpgw8+0ObNmwsyHAAAQIHZHGY+++wzderUSW5ubtq1a5cyMjIkSenp6YqOji70AgEAAO7E5jAzY8YMzZs3T++++66cnZ0t7aGhodq1a1ehFgcAAHA3NoeZw4cPq3Xr1nnay5cvr8uXLxdGTQAAAPlmc5jx9fXVsWPH8rRv3rxZNWrUKJSiAAAA8svmMDN8+HCNHTtW27dvl8lk0pkzZ7Rs2TJNmjRJkZGRRVEjAADAbdl80bzJkycrNTVV7dq1040bN9S6dWuZzWZNmjRJo0aNKooaAQAAbqvA15m5du2aDh48qJycHNWrV0/lypUr7NoKBdeZwV8F15kBcC+x5fvb5pmZ1NRUZWdny9PTU02bNrW0X7x4UU5OTn+5wAAAAOzL5jUz/fr104oVK/K0f/zxx+rXr1+hFAUAAJBfNoeZ7du3q127dnna27Ztq+3btxdKUQAAAPllc5jJyMhQVlZWnvabN2/q+vXrhVIUAABAftkcZpo1a6b58+fnaZ83b56aNGlSKEUBAADkl80LgF988UV17NhRP/74ozp06CBJ+uabb5SQkKB169YVeoEAAAB3YvPMTKtWrbRt2zYFBATo448/1n/+8x/VrFlTe/fu1cMPP1wUNQIAANyWzTMzkvTAAw9o2bJlhV0LAACAzQoUZnJycnTs2DGlpKQoJyfHat+tbkIJAABQVGwOM99//70iIiKUmJioP1482GQyKTs7u9CKAwAAuBubw8yIESPUtGlTffnll/L19ZXJZCqKugAAAPLF5jBz9OhRffrpp6pZs2ZR1AMAAGATm89matGihY4dO1YoLx4VFSWTyWT18PHxsew3DENRUVHy8/OTm5ub2rZtqwMHDhTKawMAgHuDzTMzo0eP1sSJE5WcnKz69evL2dnZan+DBg1sGu/+++/X+vXrLdulS5e2/HdsbKxmzpypRYsWqVatWpoxY4bCwsJ0+PBhubu721o6AAC4B9kcZh599FFJ0tChQy1tJpNJhmEUaAGwk5OT1WxMLsMwNGvWLE2dOlW9e/eWJC1evFje3t5avny5hg8fbmvpAADgHmRzmDl+/HihFnD06FH5+fnJbDarRYsWio6OVo0aNXT8+HElJycrPDzc0tdsNqtNmzbaunXrbcNMRkaGMjIyLNtpaWmFWi8AAChZbA4z1apVK7QXb9GihZYsWaJatWrp119/1YwZMxQaGqoDBw4oOTlZkuTt7W31HG9vbyUmJt52zJiYGD3//POFViMAACjZCnTRPEk6ePCgTp48qczMTKv2v/3tb/keo0uXLpb/rl+/vlq2bKmgoCAtXrxYDz74oCTlOfU793DW7UyZMkUTJkywbKelpSkgICDfNQEAAMdic5j55Zdf1KtXL+3bt8+yVkb6X+j4MxfNK1u2rOrXr6+jR4+qZ8+ekqTk5GT5+vpa+qSkpOSZrfk9s9kss9lc4BoAAIBjsfnU7LFjxyowMFC//vqrypQpowMHDujbb79V06ZNtWnTpj9VTEZGhg4dOiRfX18FBgbKx8dHcXFxlv2ZmZmKj49XaGjon3odAABw77B5Zmbbtm3asGGDKleurFKlSqlUqVJ66KGHFBMTozFjxmj37t35HmvSpEnq0aOHqlatqpSUFM2YMUNpaWkaNGiQTCaTxo0bp+joaAUHBys4OFjR0dEqU6aMIiIibC0bAADco2wOM9nZ2SpXrpwkqVKlSjpz5oxq166tatWq6fDhwzaNderUKfXv31/nz59X5cqV9eCDD+r777+3LDKePHmyrl+/rsjISF26dEktWrTQunXruMYMAACwsDnMhISEaO/evapRo4
ZatGih2NhYubi4aP78+apRo4ZNY61YseKO+00mk6KiohQVFWVrmQAA4C/C5jDz3HPP6erVq5KkGTNmqHv37nr44YdVsWLFu4YTAACAwmZzmOnUqZPlv2vUqKGDBw/q4sWLuu+++7iDNgAAKHY2n800dOhQpaenW7V5enrq2rVrVrc4AAAAKA42h5nFixfr+vXredqvX7+uJUuWFEpRAAAA+ZXvw0xpaWkyDEOGYSg9PV2urq6WfdnZ2frvf/8rLy+vIikSAADgdvIdZipUqCCTySSTyaRatWrl2W8ymbgnEgAAKHb5DjMbN26UYRhq3769PvvsM3l6elr2ubi4qFq1avLz8yuSIgEAAG4n32GmTZs2kqTjx4+ratWqnLkEAABKBJsXAB86dEhbtmyxbL/99tt64IEHFBERoUuXLhVqcQAAAHdjc5j55z//qbS0NEnSvn37NGHCBHXt2lW//PKLJkyYUOgFAgAA3InNF807fvy46tWrJ0n67LPP1KNHD0VHR2vXrl3q2rVroRcIAABwJzbPzLi4uOjatWuSpPXr1ys8PFzSbxfOy52xAQAAKC42z8w89NBDmjBhglq1aqUffvhBH330kSTpyJEjqlKlSqEXCAAAcCc2z8zMnj1bTk5O+vTTTzV37lz5+/tLkr766it17ty50AsEAAC4E5NhGIa9iyhKaWlp8vDwUGpqqsqXL2/vcorVJz+n2rsEFKM+QR72LgEACo0t3982H2aSpJycHB07dkwpKSnKycmx2te6deuCDAkAAFAgNoeZ77//XhEREUpMTNQfJ3VMJpOys7MLrTgAAIC7sTnMjBgxQk2bNtWXX34pX19frgQMAADsyuYwc/ToUX366aeqWbNmUdQDAABgE5vPZmrRooWOHTtWFLUAAADYzOaZmdGjR2vixIlKTk5W/fr15ezsbLW/QYMGhVYcAADA3dgcZh599FFJ0tChQy1tJpNJhmGwABgAABS7At2bCQAAoKSwOcxUq1atKOoAAAAokHyHmS+++CJf/f72t78VuBgAAABb5TvM9OzZ8659WDMDAACKW77DzB9vWwAAAFAS2HydGQAAgJKEMAMAABwaYQYAADg0wgwAAHBo+Qozb775pm7cuCFJOnnypAzDKNKiAAAA8itfYWbChAlKS0uTJAUGBurcuXNFWhQAAEB+5evUbD8/P3322Wfq2rWrDMPQqVOnLDM1f1S1atVCLRAAAOBO8hVmnnvuOY0ePVqjRo2SyWRSs2bN8vThRpMAAMAe8hVm/u///k/9+/dXYmKiGjRooPXr16tixYpFXRsAAMBd5ftsJnd3d4WEhGjhwoVq1aqVGjZseMtHQcXExMhkMmncuHGWNsMwFBUVJT8/P7m5ualt27Y6cOBAgV8DAADce2w+NXvQoEEym83auXOnli5dqmXLlmnXrl1/qoiEhATNnz9fDRo0sGqPjY3VzJkzNXv2bCUkJMjHx0dhYWFKT0//U68HAADuHTaHmZSUFLVv317NmjXTmDFjNGrUKDVt2lQdOnQo0FlOV65c0eOPP653331X9913n6XdMAzNmjVLU6dOVe/evRUSEqLFixfr2rVrWr58uc2vAwAA7k02h5nRo0crLS1NBw4c0MWLF3Xp0iXt379faWlpGjNmjM0FjBw5Ut26dVPHjh2t2o8fP67k5GSFh4db2sxms9q0aaOtW7fedryMjAylpaVZPQAAwL0r33fNzvX1119r/fr1qlu3rqWtXr16evvtt62CR36sWLFCu3btUkJCQp59ycnJkiRvb2+rdm9vbyUmJt52zJiYGD3//PM21QEAAByXzTMzOTk5cnZ2ztPu7OysnJycfI+TlJSksWPHaunSpXJ1db1tP5PJZLWdewr47UyZMkWpqamWR1JSUr5rAgAAjsfmMNO+fXuNHTtWZ86csbSdPn1a48ePV4cOHfI9zs6dO5WSkqImTZrIyclJTk5Oio+P15tvviknJyfLjEzuDE2ulJSUPL
M1v2c2m1W+fHmrBwAAuHfZHGZmz56t9PR0Va9eXUFBQapZs6YCAwOVnp6ut956K9/jdOjQQfv27dOePXssj6ZNm+rxxx/Xnj17VKNGDfn4+CguLs7ynMzMTMXHxys0NNTWsgEAwD3K5jUzAQEB2rVrl+Li4vTTTz/JMAzVq1cvzwLeu8m9bs3vlS1bVhUrVrS0jxs3TtHR0QoODlZwcLCio6NVpkwZRURE2Fo2AAC4R9kcZnKFhYUpLCysMGvJY/Lkybp+/boiIyN16dIltWjRQuvWrZO7u3uRvi4AAHAcJsMwDHsXUZTS0tLk4eGh1NTUv9z6mU9+TrV3CShGfYI87F0CABQaW76/bV4zAwAAUJIQZgAAgEMjzAAAAIdWoDDz888/67nnnlP//v2VkpIi6bcrA3NHawAAUNxsDjPx8fGqX7++tm/frpUrV+rKlSuSpL1792ratGmFXiAAAMCd2BxmnnnmGc2YMUNxcXFycXGxtLdr107btm0r1OIAAADuxuYws2/fPvXq1StPe+XKlXXhwoVCKQoAACC/bA4zFSpU0NmzZ/O07969W/7+/oVSFAAAQH7ZHGYiIiL09NNPKzk5WSaTSTk5OdqyZYsmTZqkgQMHFkWNAAAAt2VzmHnxxRdVtWpV+fv768qVK6pXr55at26t0NBQPffcc0VRIwAAwG3ZfG8mZ2dnLVu2TNOnT9fu3buVk5OjRo0aKTg4uCjqAwAAuKMC32gyKChIQUFBhVkLAACAzWwOMxMmTLhlu8lkkqurq2rWrKlHHnlEnp6ef7o4AACAu7E5zOzevVu7du1Sdna2ateuLcMwdPToUZUuXVp16tTRnDlzNHHiRG3evFn16tUripoBAAAsbF4A/Mgjj6hjx446c+aMdu7cqV27dun06dMKCwtT//79dfr0abVu3Vrjx48vinoBAACsmAzDMGx5gr+/v+Li4vLMuhw4cEDh4eE6ffq0du3apfDwcJ0/f75Qiy2ItLQ0eXh4KDU1VeXLl7d3OcXqk59T7V0CilGfIA97lwAAhcaW72+bZ2ZSU1MtN5f8vXPnziktLU3SbxfWy8zMtHVoAAAAmxXoMNPQoUO1atUqnTp1SqdPn9aqVav05JNPqmfPnpKkH374QbVq1SrsWgEAAPKweQHwO++8o/Hjx6tfv37Kysr6bRAnJw0aNEivv/66JKlOnTp67733CrdSAACAW7B5zUyuK1eu6JdffpFhGAoKClK5cuUKu7ZCwZoZ/FWwZgbAvcSW7+8CXzSvXLlyatCgQUGfDgAAUCgKFGYSEhL0ySef6OTJk3kW+q5cubJQCgMAAMgPmxcAr1ixQq1atdLBgwe1atUq3bx5UwcPHtSGDRvk4cE0NwAAKF42h5no6Gi9/vrrWrNmjVxcXPTGG2/o0KFD6tu3r6pWrVoUNQIAANyWzWHm559/Vrdu3SRJZrNZV69elclk0vjx4zV//vxCLxAAAOBObA4znp6eSk9Pl/Tb1YD3798vSbp8+bKuXbtWuNUBAADchc0LgB9++GHFxcWpfv366tu3r8aOHasNGzYoLi5OHTp0KIoaAQAAbsvmMDN79mzduHFDkjRlyhQ5Oztr8+bN6t27t/71r38VeoEAAAB3UuCL5jkKLpqHvwoumgfgXlKkN5osXbr0LW80eeHCBZUuXdrW4QAAAP4Um8PM7SZyMjIy5OLi8qcLAgAAsEW+18y8+eabkiSTyaT33nvP6l5M2dnZ+vbbb1WnTp3CrxAAAOAO8h1mcu+IbRiG5s2bZ3VIycXFRdWrV9e8efMKv0IAAIA7yHeYOX78uCSpXbt2Wrlype67774iKwoAACC/bD41e+PGjUVRBwAAQIHYHGays7O1aNEiffPNN0pJSVFOTo7V/g0bNhRacQAAAHdj89lMY8eO1dixY5Wdna2QkBA1bNjQ6mGLuXPnqkGDBipfvrzKly+vli1b6quvvrLsNwxDUVFR8vPzk5ubm9q2basDBw7YWj
IAALiH2Twzs2LFCn388cfq2rXrn37xKlWq6KWXXlLNmjUlSYsXL9Yjjzyi3bt36/7771dsbKxmzpypRYsWqVatWpoxY4bCwsJ0+PBhubu7/+nXBwAAjs/mmRkXFxdL+PizevTooa5du6pWrVqqVauWXnzxRZUrV07ff/+9DMPQrFmzNHXqVPXu3VshISFavHixrl27puXLlxfK6wMAAMdnc5iZOHGi3njjjdtePK+gsrOztWLFCl29elUtW7bU8ePHlZycrPDwcEsfs9msNm3aaOvWrYX62gAAwHHZfJhp8+bN2rhxo7766ivdf//9cnZ2ttq/cuVKm8bbt2+fWrZsqRs3bqhcuXJatWqV6tWrZwks3t7eVv29vb2VmJh42/EyMjKUkZFh2U5LS7OpHgAA4FhsDjMVKlRQr169Cq2A2rVra8+ePbp8+bI+++wzDRo0SPHx8Zb9JpPJqr9hGHnafi8mJkbPP/98odUHAABKthJ31+yOHTsqKChITz/9tIKCgrRr1y41atTIsv+RRx5RhQoVtHjx4ls+/1YzMwEBAdw1G/c87poN4F5SpHfNlqSsrCytX79e77zzjtLT0yVJZ86c0ZUrVwoynBXDMJSRkaHAwED5+PgoLi7Osi8zM1Px8fEKDQ297fPNZrPlVO/cBwAAuHfZfJgpMTFRnTt31smTJ5WRkaGwsDC5u7srNjZWN27csOn+TM8++6y6dOmigIAApaena8WKFdq0aZO+/vprmUwmjRs3TtHR0QoODlZwcLCio6NVpkwZRURE2Fo2AAC4R9kcZsaOHaumTZvqxx9/VMWKFS3tvXr10rBhw2wa69dff9UTTzyhs2fPysPDQw0aNNDXX3+tsLAwSdLkyZN1/fp1RUZG6tKlS2rRooXWrVvHNWYAAICFzWtmKlWqpC1btqh27dpyd3fXjz/+qBo1aujEiROqV6+erl27VlS1Fogtx9zuNayZ+WthzQyAe0mRrpnJyclRdnZ2nvZTp04xYwIAAIqdzWEmLCxMs2bNsmybTCZduXJF06ZNK5RbHAAAANjC5jUzr7/+utq1a6d69erpxo0bioiI0NGjR1WpUiV9+OGHRVEjAADAbdk8M+Pn56c9e/bon//8p4YPH65GjRrppZde0u7du+Xl5VUUNQIAHEBMTIyaNWsmd3d3eXl5qWfPnjp8+LBVn8GDB8tkMlk9HnzwwTuO++677+rhhx/Wfffdp/vuu08dO3bUDz/8YNUnPT1d48aNU7Vq1eTm5qbQ0FAlJCRY9Xn11Vfl7e0tb29vvf7661b7tm/friZNmtxyGQVKPptnZiTJzc1NQ4YM0ZAhQwq7HgCAg4qPj9fIkSPVrFkzZWVlaerUqQoPD9fBgwdVtmxZS7/OnTtr4cKFlm0XF5c7jrtp0yb1799foaGhcnV1VWxsrMLDw3XgwAH5+/tLkoYNG6b9+/frgw8+kJ+fn5YuXaqOHTvq4MGD8vf31759+/Tvf/9ba9askWEY6t69u8LCwhQSEqKbN29qxIgRmj9/vkqXLl00Hw6KlM1hJiYmRt7e3ho6dKhV+/vvv69z587p6aefLrTiAACO4+uvv7baXrhwoby8vLRz5061bt3a0m42m+Xj45PvcZctW2a1/e677+rTTz/VN998o4EDB+r69ev67LPP9Pnnn1teJyoqSqtXr9bcuXM1Y8YMHTp0SA0aNFD79u0lSQ0aNNChQ4cUEhKiV155Ra1bt1azZs0K+tZhZzYfZnrnnXdUp06dPO3333+/TRfMAwDc21JTf7s8hKenp1X7pk2b5OXlpVq1aumpp55SSkqKTeNeu3ZNN2/etIyblZWl7Oxsubq6WvVzc3PT5s2bJUn169fXkSNHdPLkSSUmJurIkSMKCQnRsWPHtGjRIs2YMaOgbxMlgM1hJjk5Wb6+vnnaK1eurLNnzxZKUQAAx2YYhiZMmKCHHnpIISEhlvYuXbpo2bJl2rBhg1577TUlJCSoffv2VvfUu5
tnnnlG/v7+6tixoyTJ3d1dLVu21AsvvKAzZ84oOztbS5cu1fbt2y3fS3Xr1lV0dLTCwsIUHh6umJgY1a1bVyNGjFBsbKzWrl2rkJAQNWrUSN9++23hfhgocjYfZgoICNCWLVsUGBho1b5lyxb5+fkVWmEAAMc1atQo7d271zIzkuvvf/+75b9DQkLUtGlTVatWTV9++aV69+5913FjY2P14YcfatOmTVYzMR988IGGDh0qf39/lS5dWo0bN1ZERIR27dpl6TNixAiNGDHCsr1o0SJLEKpdu7YSEhJ06tQp9evXT8ePH5fZbP4zHwGKkc1hZtiwYRo3bpxu3rxpOfb4zTffaPLkyZo4cWKhFwgAcCyjR4/WF198oW+//VZVqlS5Y19fX19Vq1ZNR48eveu4r776qqKjo7V+/Xo1aNDAal9QUJDi4+N19epVpaWlydfXV3//+9/z/MM71/nz5zV9+nR9++232r59u2rVqmW5D+DNmzd15MgR1a9fP/9vGnZlc5iZPHmyLl68qMjISGVmZkqSXF1d9fTTT2vKlCmFXiAAwDEYhqHRo0dr1apV2rRp022DxO9duHBBSUlJt1y+8HuvvPKKZsyYobVr16pp06a37Ve2bFmVLVtWly5d0tq1axUbG3vLfuPGjdP48eNVpUoVJSQk6ObNm5Z9uWtw4DhsCjPZ2dnavHmznn76af3rX//SoUOH5ObmpuDgYKbjAOAvbuTIkVq+fLk+//xzubu7Kzk5WZLk4eEhNzc3XblyRVFRUXr00Ufl6+urEydO6Nlnn1WlSpXUq1cvyzgDBw6Uv7+/YmJiJP12aOlf//qXli9frurVq1vGLVeunMqVKydJWrt2rQzDUO3atXXs2DH985//VO3atW95CZG4uDgdPXpUS5YskSQ1b95cP/30k7766islJSWpdOnSql27dpF+VihcNoWZ0qVLq1OnTjp06JACAwM5jQ0AYDF37lxJUtu2ba3aFy5cqMGDB6t06dLat2+flixZosuXL8vX11ft2rXTRx99ZHVvv5MnT6pUqf+dnzJnzhxlZmbqsccesxp32rRpioqKkvTbmVNTpkzRqVOn5OnpqUcffVQvvviinJ2drZ5z/fp1jRo1Sh999JHlNfz9/fXWW29pyJAhMpvNWrx4sdzc3ArrY0ExsPmu2c2aNdNLL72kDh06FFVNhYq7ZuOvgrtmA7iXFOlds1988UVNmjRJa9as0dmzZ5WWlmb1AAAAKE42LwDu3LmzJOlvf/ubTCaTpd0wDJlMJhZNAQCAYmVzmNm4cWNR1AEAsAGHkf9aOIx8ZzaHmTZt2hRFHQAAAAVi85oZSfruu+80YMAAhYaG6vTp05J+u/riH6/0CAAAUNRsDjOfffaZOnXqJDc3N+3atctyP4309HRFR0cXeoEAAAB3YnOYmTFjhubNm6d3333X6vz90NBQq3tgAAAAFAebw8zhw4fVunXrPO3ly5fX5cuXC6MmAACAfLM5zPj6+urYsWN52jdv3qwaNWoUSlEAAAD5ZXOYGT58uMaOHavt27fLZDLpzJkzWrZsmSZNmqTIyMiiqBEAAOC2CnTX7NTUVLVr1043btxQ69atZTabNWnSJI0aNaooagQAALgtm8OM9NstDaZOnaqDBw8qJydH9erVs9y5FAAAoDjl+zDTtWvXNHLkSPn7+8vLy0vDhg1T9erV1bx5c4IMAACwm3yHmWnTpmnRokXq1q2b+vXrp7i4OP3jH/8oytoAAADuKt+HmVauXKkFCxaoX79+kqQBAwaoVatWys7OVunSpYusQAAAgDvJ98xMUlKSHn74Yct28+bN5eTkpDNnzhRJYQAAAPmR7zCTnZ0tFxcXqzYnJydlZWUVelEAAAD5le/DTIZhaPDgwTKbzZa2GzduaMSIESpbtqylbeXKlYVbIQAAwB3kO8wMGjQoT9uAAQMKtRgAAABb5TvMLFy4sCjrAAAAKBCbb2cAAABQkhBmAACAQ7NrmI
mJiVGzZs3k7u4uLy8v9ezZU4cPH7bqYxiGoqKi5OfnJzc3N7Vt21YHDhywU8UAAKCksWuYiY+P18iRI/X9998rLi5OWVlZCg8P19WrVy19YmNjNXPmTM2ePVsJCQny8fFRWFiY0tPT7Vg5AAAoKQp0o8nC8vXXX1ttL1y4UF5eXtq5c6dat24twzA0a9YsTZ06Vb1795YkLV68WN7e3lq+fLmGDx9uj7IBAEAJUqLWzKSmpkqSPD09JUnHjx9XcnKywsPDLX3MZrPatGmjrVu32qVGAABQsth1Zub3DMPQhAkT9NBDDykkJESSlJycLEny9va26uvt7a3ExMRbjpORkaGMjAzLdlpaWhFVDAAASoISMzMzatQo7d27Vx9++GGefSaTyWrbMIw8bbliYmLk4eFheQQEBBRJvQAAoGQoEWFm9OjR+uKLL7Rx40ZVqVLF0u7j4yPpfzM0uVJSUvLM1uSaMmWKUlNTLY+kpKSiKxwAANidXcOMYRgaNWqUVq5cqQ0bNigwMNBqf2BgoHx8fBQXF2dpy8zMVHx8vEJDQ285ptlsVvny5a0eAADg3mXXNTMjR47U8uXL9fnnn8vd3d0yA+Ph4SE3NzeZTCaNGzdO0dHRCg4OVnBwsKKjo1WmTBlFRETYs3QAAFBC2DXMzJ07V5LUtm1bq/aFCxdq8ODBkqTJkyfr+vXrioyM1KVLl9SiRQutW7dO7u7uxVwtAAAoiewaZgzDuGsfk8mkqKgoRUVFFX1BAADA4ZSIBcAAAAAFRZgBAAAOjTADAAAcGmEGAAA4NMIMAABwaIQZAADg0AgzAADAoRFmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODTCDAAAcGiEGQAA4NAIMwAAwKERZgAAgEMjzAAAAIdGmAEAAA6NMAMAABwaYQYAADg0wgwAAHBohBkAAODQCDMAAMChEWYAAIBDI8wAAACHRpgBAAAOjTADAAAcGmEGAAA4NMIMAABwaIQZAADg0AgzAADAoRFmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODS7hplvv/1WPXr0kJ+fn0wmk1avXm213zAMRUVFyc/PT25ubmrbtq0OHDhgn2IBAECJZNcwc/XqVTVs2FCzZ8++5f7Y2FjNnDlTs2fPVkJCgnx8fBQWFqb09PRirhQAAJRUTvZ88S5duqhLly633GcYhmbNmqWpU6eqd+/ekqTFixfL29tby5cv1/Dhw4uzVAAAUEKV2DUzx48fV3JyssLDwy1tZrNZbdq00datW+1YGQAAKEnsOjNzJ8nJyZIkb29vq3Zvb28lJibe9nkZGRnKyMiwbKelpRVNgQAAoEQosTMzuUwmk9W2YRh52n4vJiZGHh4elkdAQEBRlwgAAOyoxIYZHx8fSf+bocmVkpKSZ7bm96ZMmaLU1FTLIykpqUjrBAAA9lViw0xgYKB8fHwUFxdnacvMzFR8fLxCQ0Nv+zyz2azy5ctbPQAAwL3Lrmtmrly5omPHjlm2jx8/rj179sjT01NVq1bVuHHjFB0dreDgYAUHBys6OlplypRRRESEHasGAAAliV3DzI4dO9SuXTvL9oQJEyRJgwYN0qJFizR58mRdv35dkZGRunTpklq0aKF169bJ3d3dXiUDAIASxmQYhmHvIopSWlqaPDw8lJqa+pc75PTJz6n2LgHFqE+Qh71LQDHi7/uv5a/4923L93eJXTMDAACQH4QZAADg0AgzAADAoRFmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODTCDAAAcGiEGQAA4NAIMwAAwKERZgAAgEMjzAAAAIdGmAEAAA6NMAMAABwaYQYAADg0wgwAAHBohBkAAODQCDMAAMChEWYAAIBDI8wAAACHRpgBAAAOjTADAAAcGmEGAAA4NMIMAABwaIQZAADg0AgzAADAoRFmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODTCDAAAcGiEGQ
AA4NAIMwAAwKERZgAAgENziDAzZ84cBQYGytXVVU2aNNF3331n75IAAEAJUeLDzEcffaRx48Zp6tSp2r17tx5++GF16dJFJ0+etHdpAACgBCjxYWbmzJl68sknNWzYMNWtW1ezZs1SQECA5s6da+/SAABACVCiw0xmZqZ27typ8PBwq/bw8HBt3brVTlUBAICSxMneBdzJ+fPnlZ2dLW9vb6t2b29vJScn3/I5GRkZysjIsGynpqZKktLS0oqu0BLqWvpf7z3/laWlmexdAooRf99/LX/Fv+/c723DMO7at0SHmVwmk/UP0TCMPG25YmJi9Pzzz+dpDwgIKJLagJJisL0LAFBkBtu7ADtKT0+Xh4fHHfuU6DBTqVIllS5dOs8sTEpKSp7ZmlxTpkzRhAkTLNs5OTm6ePGiKlaseNsAhHtHWlqaAgIClJSUpPLly9u7HACFiL/vvxbDMJSeni4/P7+79i3RYcbFxUVNmjRRXFycevXqZWmPi4vTI488csvnmM1mmc1mq7YKFSoUZZkogcqXL8//7IB7FH/ffx13m5HJVaLDjCRNmDBBTzzxhJo2baqWLVtq/vz5OnnypEaMGGHv0gAAQAlQ4sPM3//+d124cEHTp0/X2bNnFRISov/+97+qVq2avUsDAAAlQIkPM5IUGRmpyMhIe5cBB2A2mzVt2rQ8hxoBOD7+vnE7JiM/5zwBAACUUCX6onkAAAB3Q5gBAAAOjTADAAAcGmEGAAA4NMIMAABwaA5xajZwO6dOndLcuXO1detWJScny2QyydvbW6GhoRoxYgT35AKAvwBOzYbD2rx5s7p06aKAgACFh4fL29tbhmEoJSVFcXFxSkpK0ldffaVWrVrZu1QARSApKUnTpk3T+++/b+9SYGeEGTisZs2a6aGHHtLrr79+y/3jx4/X5s2blZCQUMyVASgOP/74oxo3bqzs7Gx7lwI7I8zAYbm5uWnPnj2qXbv2Lff/9NNPatSoka5fv17MlQEoDF988cUd9//yyy+aOHEiYQasmYHj8vX11datW28bZrZt2yZfX99irgpAYenZs6dMJpPu9G9uk8lUjBWhpCLMwGFNmjRJI0aM0M6dOxUWFiZvb2+ZTCYlJycrLi5O7733nmbNmmXvMgEUkK+vr95++2317Nnzlvv37NmjJk2aFG9RKJEIM3BYkZGRqlixol5//XW98847lqnm0qVLq0mTJlqyZIn69u1r5yoBFFSTJk20a9eu24aZu83a4K+DNTO4J9y8eVPnz5+XJFWqVEnOzs52rgjAn/Xdd9/p6tWr6ty58y33X716VTt27FCbNm2KuTKUNIQZAADg0LgCMAAAcGiEGQAA4NAIMwAAwKERZgA4pEWLFqlChQp/ehyTyaTVq1f/6XEA2A9hBoDdDB48+Lan3QJAfhFmAACAQyPMACiRZs6cqfr166ts2bIKCAhQZGSkrly5kqff6tWrVatWLbm6uiosLExJSUlW+//zn/+oSZMmcnV1VY0aNfT8888rKyuruN4GgGJAmAFQIpUqVUpvvvmm9u/fr8WLF2vDhg2aPHmyVZ9r167pxRdf1OLFi7VlyxalpaWpX79+lv1r167VgAEDNGbMGB08eFDvvPOOFi1apBdffLG43w6AIsRF8wDYzeDBg3X58uV8LcD95JNP9I9//MNypedFixZpyJAh+v7779WiRQtJv90pvW7dutq+fbuaN2+u1q1bq0uXLpoyZYplnKVLl2ry5Mk6c+aMpN8WAK9atYq1O4AD495MAEqkjRs3Kjo6WgcPHlRaWpqysrJ048YNXb16VWXLlpUkOTk5qWnTppbn1KlTRxUqVNChQ4fUvHlz7dy5UwkJCVYzMdnZ2bpx44auXbumMmXKFPv7AlD4CDMASpzExER17dpVI0aM0AsvvCBPT09t3rxZTz75pG7evGnV12Qy5Xl+bltOTo6ef/559e7dO08fV1fXoikeQLEjzAAocXbs2KGsrCy99tprKl
Xqt6V9H3/8cZ5+WVlZ2rFjh5o3by5JOnz4sC5fvqw6depIkho3bqzDhw+rZs2axVc8gGJHmAFgV6mpqdqzZ49VW+XKlZWVlaW33npLPXr00JYtWzRv3rw8z3V2dtbo0aP15ptvytnZWaNGjdKDDz5oCTf//ve/1b17dwUEBKhPnz4qVaqU9u7dq3379mnGjBnF8fYAFAPOZgJgV5s2bVKjRo2sHu+//75mzpypl19+WSEhIVq2bJliYmLyPLdMmTJ6+umnFRERoZYtW8rNzU0rVqyw7O/UqZPWrFmjuLg4NWvWTA8++KBmzpypatWqFedbBFDEOJsJAAA4NGZmAACAQyPMAAAAh0aYAQAADo0wAwAAHBphBgAAODTCDAAAcGiEGQAA4NAIMwAAwKERZgAAgEMjzAAAAIdGmAEAAA6NMAMAABza/wM9H2zW8NCGLgAAAABJRU5ErkJggg==",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Share of each label value, expressed in percent.\n",
"percentages = df['label'].value_counts(normalize=True) * 100\n",
"\n",
"ax = percentages.plot(kind='bar', color='skyblue', alpha=0.7)\n",
"ax.set_xlabel('Label')\n",
"ax.set_ylabel('Percentage of Instances')\n",
"# The bars show percentages, not raw counts, so the title says so.\n",
"ax.set_title('Percentage of Instances per Label')\n",
"\n",
"# Write each bar's percentage slightly above its top edge.\n",
"for bar in ax.patches:\n",
"    height = bar.get_height()\n",
"    x_center = bar.get_x() + bar.get_width() / 2\n",
"    ax.annotate(f'{height:.2f}%', (x_center, bar.get_y() + height * 1.02), ha='center')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Conclusions\n",
"- 24.47% of patients have a label of 1\n",
"- during feature engineering we will have to:\n",
"  - drop columns with only null values,\n",
"  - reduce the number of columns,\n",
"  - potentially impute null values in some columns,\n",
"  - potentially create new features"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}