[c9d045]: / notebooks / feature_engineering.ipynb

Download this file

5796 lines (5795 with data), 411.6 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **FEATURE ENGINEERING**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have too many columns whose values are True/NaN, so we will try to group them into categories. \n",
    "<br/> <br/>\n",
    " **Diagnoses**\n",
    "- Respiratory Disorders\n",
    "- Heart and Cardiovascular Diseases\n",
    "- Metabolic and Endocrine Disorders\n",
    "- Neurological Disorders\n",
    "- Orthopedic Injuries\n",
    "- Mental Health Conditions\n",
    "- Reproductive and Pregnancy-related\n",
    "\n",
    "**Medications**\n",
    "- Pain Relievers and Analgesics\n",
    "- Cardiovascular and Blood Pressure Medications\n",
    "- Infection Medications\n",
    "- Oral Medications\n",
    "- Other Medications\n",
    "\n",
    "**Treatments and Care**\n",
    "- Therapies and Regimes\n",
    "- Diagnostic Procedures\n",
    "- Surgical Interventions\n",
    "- Patient Care Management\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "from tabulate import tabulate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>scc</th>\n",
       "      <th>race</th>\n",
       "      <th>marital</th>\n",
       "      <th>ethnic</th>\n",
       "      <th>gender</th>\n",
       "      <th>state</th>\n",
       "      <th>age</th>\n",
       "      <th>Pain severity - 0-10 verbal numeric rating [Score] - Reported</th>\n",
       "      <th>Influenza  seasonal  injectable  preservative free</th>\n",
       "      <th>...</th>\n",
       "      <th>Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection</th>\n",
       "      <th>Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection</th>\n",
       "      <th>Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection</th>\n",
       "      <th>Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection</th>\n",
       "      <th>SARS-CoV-2 RNA Pnl Resp NAA+probe</th>\n",
       "      <th>Hydroxychloroquine Sulfate 200 MG Oral Tablet</th>\n",
       "      <th>1 ML denosumab 60 MG/ML Prefilled Syringe</th>\n",
       "      <th>Fexofenadine hydrochloride 60 MG Oral Tablet</th>\n",
       "      <th>Leronlimab 700 MG Injection</th>\n",
       "      <th>Lenzilumab 200 MG IV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>101</td>\n",
       "      <td>white</td>\n",
       "      <td>m</td>\n",
       "      <td>nonhispanic</td>\n",
       "      <td>m</td>\n",
       "      <td>massachusetts</td>\n",
       "      <td>50t70</td>\n",
       "      <td>abnormal</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>110</td>\n",
       "      <td>white</td>\n",
       "      <td>m</td>\n",
       "      <td>nonhispanic</td>\n",
       "      <td>m</td>\n",
       "      <td>massachusetts</td>\n",
       "      <td>50t70</td>\n",
       "      <td>normal</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>127</td>\n",
       "      <td>black</td>\n",
       "      <td>m</td>\n",
       "      <td>nonhispanic</td>\n",
       "      <td>m</td>\n",
       "      <td>massachusetts</td>\n",
       "      <td>50t70</td>\n",
       "      <td>abnormal</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>129</td>\n",
       "      <td>white</td>\n",
       "      <td>m</td>\n",
       "      <td>nonhispanic</td>\n",
       "      <td>m</td>\n",
       "      <td>massachusetts</td>\n",
       "      <td>50t70</td>\n",
       "      <td>abnormal</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>69</td>\n",
       "      <td>white</td>\n",
       "      <td>m</td>\n",
       "      <td>nonhispanic</td>\n",
       "      <td>m</td>\n",
       "      <td>massachusetts</td>\n",
       "      <td>50t70</td>\n",
       "      <td>abnormal</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 783 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   label  scc   race marital       ethnic gender          state    age  \\\n",
       "0      0  101  white       m  nonhispanic      m  massachusetts  50t70   \n",
       "1      0  110  white       m  nonhispanic      m  massachusetts  50t70   \n",
       "2      0  127  black       m  nonhispanic      m  massachusetts  50t70   \n",
       "3      0  129  white       m  nonhispanic      m  massachusetts  50t70   \n",
       "4      1   69  white       m  nonhispanic      m  massachusetts  50t70   \n",
       "\n",
       "  Pain severity - 0-10 verbal numeric rating [Score] - Reported  \\\n",
       "0                                           abnormal              \n",
       "1                                             normal              \n",
       "2                                           abnormal              \n",
       "3                                           abnormal              \n",
       "4                                           abnormal              \n",
       "\n",
       "  Influenza  seasonal  injectable  preservative free  ...  \\\n",
       "0                                               True  ...   \n",
       "1                                               True  ...   \n",
       "2                                               True  ...   \n",
       "3                                               True  ...   \n",
       "4                                               True  ...   \n",
       "\n",
       "  Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection  \\\n",
       "0                                                NaN                                         \n",
       "1                                                NaN                                         \n",
       "2                                                NaN                                         \n",
       "3                                                NaN                                         \n",
       "4                                                NaN                                         \n",
       "\n",
       "  Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection  \\\n",
       "0                                                NaN                                     \n",
       "1                                                NaN                                     \n",
       "2                                                NaN                                     \n",
       "3                                                NaN                                     \n",
       "4                                                NaN                                     \n",
       "\n",
       "  Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection  \\\n",
       "0                                                NaN                                     \n",
       "1                                                NaN                                     \n",
       "2                                                NaN                                     \n",
       "3                                                NaN                                     \n",
       "4                                                NaN                                     \n",
       "\n",
       "  Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection  \\\n",
       "0                                                NaN                                        \n",
       "1                                                NaN                                        \n",
       "2                                                NaN                                        \n",
       "3                                                NaN                                        \n",
       "4                                                NaN                                        \n",
       "\n",
       "  SARS-CoV-2 RNA Pnl Resp NAA+probe  \\\n",
       "0                               NaN   \n",
       "1                               NaN   \n",
       "2                               NaN   \n",
       "3                               NaN   \n",
       "4                               NaN   \n",
       "\n",
       "  Hydroxychloroquine Sulfate 200 MG Oral Tablet  \\\n",
       "0                                           NaN   \n",
       "1                                           NaN   \n",
       "2                                           NaN   \n",
       "3                                           NaN   \n",
       "4                                           NaN   \n",
       "\n",
       "  1 ML denosumab 60 MG/ML Prefilled Syringe  \\\n",
       "0                                       NaN   \n",
       "1                                       NaN   \n",
       "2                                       NaN   \n",
       "3                                       NaN   \n",
       "4                                       NaN   \n",
       "\n",
       "  Fexofenadine hydrochloride 60 MG Oral Tablet Leronlimab 700 MG Injection  \\\n",
       "0                                          NaN                         NaN   \n",
       "1                                          NaN                         NaN   \n",
       "2                                          NaN                         NaN   \n",
       "3                                          NaN                         NaN   \n",
       "4                                          NaN                         NaN   \n",
       "\n",
       "  Lenzilumab 200 MG IV  \n",
       "0                  NaN  \n",
       "1                  NaN  \n",
       "2                  NaN  \n",
       "3                  NaN  \n",
       "4                  NaN  \n",
       "\n",
       "[5 rows x 783 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the learning data and discard the two columns that are not used here.\n",
    "df = pd.read_csv('../data/learning_data.csv').drop(columns=['ptnum', 'C-20565-8'])\n",
    "\n",
    "# Build a code -> name lookup from the codes table and relabel the columns with it.\n",
    "# Different codes may share a name, so the renamed frame can contain repeated labels.\n",
    "df_codes = pd.read_csv('../data/codes.csv')\n",
    "code_to_name = df_codes.set_index('code')['name'].to_dict()\n",
    "df = df.rename(columns=code_to_name)\n",
    "\n",
    "# Independent working copy of the renamed frame.\n",
    "df2 = df.copy()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "DataFrame after combining columns with the same name:\n",
      "     Facial laceration Norinyl 1+50 28 Day Pack Intubation  \\\n",
      "0                  NaN                      NaN        NaN   \n",
      "1                  NaN                      NaN        NaN   \n",
      "2                  NaN                      NaN        NaN   \n",
      "3                  NaN                      NaN        NaN   \n",
      "4                  NaN                      NaN        NaN   \n",
      "...                ...                      ...        ...   \n",
      "9043               NaN                      NaN        NaN   \n",
      "9044               NaN                      NaN        NaN   \n",
      "9045               NaN                      NaN        NaN   \n",
      "9046               NaN                      NaN        NaN   \n",
      "9047               NaN                      NaN        NaN   \n",
      "\n",
      "     Amlodipine 5 MG Oral Tablet anastrozole 1 MG Oral Tablet  \\\n",
      "0                            NaN                          NaN   \n",
      "1                            NaN                          NaN   \n",
      "2                            NaN                          NaN   \n",
      "3                            NaN                          NaN   \n",
      "4                            NaN                          NaN   \n",
      "...                          ...                          ...   \n",
      "9043                         NaN                          NaN   \n",
      "9044                         NaN                          NaN   \n",
      "9045                        True                          NaN   \n",
      "9046                         NaN                          NaN   \n",
      "9047                         NaN                          NaN   \n",
      "\n",
      "      Take blood sample Lack of access to transportation (finding)  \\\n",
      "0                   NaN                                        NaN   \n",
      "1                   NaN                                        NaN   \n",
      "2                   NaN                                        NaN   \n",
      "3                   NaN                                        NaN   \n",
      "4                   NaN                                        NaN   \n",
      "...                 ...                                        ...   \n",
      "9043                NaN                                        NaN   \n",
      "9044                NaN                                        NaN   \n",
      "9045                NaN                                        NaN   \n",
      "9046                NaN                                        NaN   \n",
      "9047                NaN                                        NaN   \n",
      "\n",
      "      Incision of trachea (procedure) Alteplase 100 MG Injection  \\\n",
      "0                                 NaN                        NaN   \n",
      "1                                 NaN                        NaN   \n",
      "2                                 NaN                        NaN   \n",
      "3                                 NaN                        NaN   \n",
      "4                                 NaN                        NaN   \n",
      "...                               ...                        ...   \n",
      "9043                              NaN                        NaN   \n",
      "9044                              NaN                        NaN   \n",
      "9045                              NaN                        NaN   \n",
      "9046                              NaN                       True   \n",
      "9047                              NaN                        NaN   \n",
      "\n",
      "     Referral to hypertension clinic  ... Camila 28 Day Pack  \\\n",
      "0                                NaN  ...                NaN   \n",
      "1                                NaN  ...                NaN   \n",
      "2                                NaN  ...                NaN   \n",
      "3                                NaN  ...                NaN   \n",
      "4                                NaN  ...                NaN   \n",
      "...                              ...  ...                ...   \n",
      "9043                             NaN  ...                NaN   \n",
      "9044                             NaN  ...                NaN   \n",
      "9045                             NaN  ...                NaN   \n",
      "9046                             NaN  ...                NaN   \n",
      "9047                             NaN  ...                NaN   \n",
      "\n",
      "     History of appendectomy remifentanil 2 MG Injection  \\\n",
      "0                        NaN                         NaN   \n",
      "1                        NaN                         NaN   \n",
      "2                        NaN                         NaN   \n",
      "3                        NaN                         NaN   \n",
      "4                        NaN                         NaN   \n",
      "...                      ...                         ...   \n",
      "9043                     NaN                         NaN   \n",
      "9044                     NaN                         NaN   \n",
      "9045                     NaN                         NaN   \n",
      "9046                     NaN                         NaN   \n",
      "9047                     NaN                         NaN   \n",
      "\n",
      "     Chlorpheniramine Maleate 4 MG Oral Tablet Fetal anatomy study  \\\n",
      "0                                          NaN                 NaN   \n",
      "1                                          NaN                 NaN   \n",
      "2                                          NaN                 NaN   \n",
      "3                                          NaN                 NaN   \n",
      "4                                          NaN                 NaN   \n",
      "...                                        ...                 ...   \n",
      "9043                                       NaN                 NaN   \n",
      "9044                                       NaN                 NaN   \n",
      "9045                                       NaN                 NaN   \n",
      "9046                                       NaN                 NaN   \n",
      "9047                                       NaN                 NaN   \n",
      "\n",
      "     Bicarbonate [Moles/volume] in Arterial blood  \\\n",
      "0                                             NaN   \n",
      "1                                             NaN   \n",
      "2                                             NaN   \n",
      "3                                             NaN   \n",
      "4                                             NaN   \n",
      "...                                           ...   \n",
      "9043                                          NaN   \n",
      "9044                                          NaN   \n",
      "9045                                          NaN   \n",
      "9046                                          NaN   \n",
      "9047                                          NaN   \n",
      "\n",
      "     Screening mammography (procedure) Sertraline 100 MG Oral Tablet  \\\n",
      "0                                  NaN                           NaN   \n",
      "1                                  NaN                           NaN   \n",
      "2                                  NaN                           NaN   \n",
      "3                                  NaN                           NaN   \n",
      "4                                  NaN                           NaN   \n",
      "...                                ...                           ...   \n",
      "9043                               NaN                           NaN   \n",
      "9044                               NaN                           NaN   \n",
      "9045                               NaN                           NaN   \n",
      "9046                               NaN                           NaN   \n",
      "9047                               NaN                           NaN   \n",
      "\n",
      "     Transport problems (finding) Sulfamethoxazole / Trimethoprim  \n",
      "0                             NaN                             NaN  \n",
      "1                             NaN                             NaN  \n",
      "2                             NaN                             NaN  \n",
      "3                             NaN                             NaN  \n",
      "4                             NaN                             NaN  \n",
      "...                           ...                             ...  \n",
      "9043                          NaN                             NaN  \n",
      "9044                          NaN                             NaN  \n",
      "9045                          NaN                             NaN  \n",
      "9046                          NaN                             NaN  \n",
      "9047                          NaN                             NaN  \n",
      "\n",
      "[9048 rows x 779 columns]\n"
     ]
    }
   ],
   "source": [
    "# After renaming, several columns can carry the same label. Collapse each\n",
    "# repeated label into a single column that holds, per row, the first non-null\n",
    "# value found across its duplicates.\n",
    "columns = df.columns\n",
    "\n",
    "# dict.fromkeys keeps first-seen order, so the resulting column order is the same\n",
    "# on every run (iterating a set of strings is not order-stable across kernels).\n",
    "unique_columns = list(dict.fromkeys(columns))\n",
    "\n",
    "# Labels that occur more than once, computed once instead of counting per column.\n",
    "duplicated_names = set(columns[columns.duplicated()])\n",
    "\n",
    "combined = {}\n",
    "for col in unique_columns:\n",
    "    if col in duplicated_names:\n",
    "        # df[[col]] selects every column with this label; back-filling along the\n",
    "        # row moves the first non-null value into the leftmost column.\n",
    "        combined[col] = df[[col]].bfill(axis=1).iloc[:, 0]\n",
    "    else:\n",
    "        combined[col] = df[col]  # Unique label: copy the column as is\n",
    "\n",
    "# Build the frame in one step rather than inserting hundreds of columns one by one.\n",
    "df_combined = pd.DataFrame(combined, index=df.index)\n",
    "\n",
    "print(\"\\nDataFrame after combining columns with the same name:\")\n",
    "print(df_combined)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on an independent (deep) copy so df_combined itself is not modified below.\n",
    "df2 = df_combined.copy(deep=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Creating new features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source columns aggregated into the 'Respiratory Disorders' feature.\n",
    "# NOTE(review): 'Total knee replacement' is also listed under the orthopedic\n",
    "# injuries group in this notebook - confirm it is meant to be counted here.\n",
    "respiratory_disorders_columns = [\n",
    "    'Acute bacterial sinusitis (disorder)',\n",
    "    'Acute bronchitis (disorder)',\n",
    "    'Acute respiratory distress syndrome (disorder)',\n",
    "    'Acute viral pharyngitis (disorder)',\n",
    "    'Asthma',\n",
    "    'Asthma screening',\n",
    "    'Bacterial infectious disease (disorder)',\n",
    "    'COVID-19',\n",
    "    'Chronic obstructive bronchitis (disorder)',\n",
    "    'Cough (finding)',\n",
    "    'Dyspnea (finding)',\n",
    "    'Fever (finding)',\n",
    "    'Hemoptysis (finding)',\n",
    "    'Hypoxemia (disorder)',\n",
    "    'Measurement of respiratory function (procedure)',\n",
    "    'NDA020503 200 ACTUAT Albuterol 0.09 MG/ACTUAT Metered Dose Inhaler',\n",
    "    'Nasal congestion (finding)',\n",
    "    'Nasal sinus endoscopy (procedure)',\n",
    "    'Otitis media',\n",
    "    'Oxygen Therapy',\n",
    "    'Pneumococcal conjugate PCV 13',\n",
    "    'Pneumonia (disorder)',\n",
    "    'Pulmonary emphysema (disorder)',\n",
    "    'Respiratory distress (finding)',\n",
    "    'Sore throat symptom (finding)',\n",
    "    'Sputum examination (procedure)',\n",
    "    'Sputum finding (finding)',\n",
    "    'Streptococcal sore throat (disorder)',\n",
    "    'Streptococcus pneumoniae group B antigen test',\n",
    "    'Throat culture (procedure)',\n",
    "    'Total knee replacement',\n",
    "    'Transfer to stepdown unit (procedure)',\n",
    "    'Transplant of lung (procedure)',\n",
    "    'Viral sinusitis (disorder)',\n",
    "    'House dust mite (organism)',\n",
    "    'Mold (organism)',\n",
    "    'Grass pollen (substance)',\n",
    "    'Tree pollen (substance)',\n",
    "    'Animal dander (substance)',\n",
    "    'Wheezing (finding)'\n",
    "]\n",
    "\n",
    "# Row-wise sum over the listed columns; with True/NaN flags this is presumably\n",
    "# the number of flags set for each row.\n",
    "df2['Respiratory Disorders'] = df2[respiratory_disorders_columns].sum(axis='columns')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source columns aggregated into the 'Heart and Cardiovascular Diseases' feature.\n",
    "# NOTE(review): 'Hypertension', 'Hyperlipidemia' and 'Hypertriglyceridemia (disorder)'\n",
    "# are also listed in the metabolic/endocrine group, so they contribute to both\n",
    "# features - confirm the overlap is intended.\n",
    "heart_and_cardiovascular_diseases_columns = [\n",
    "    'Acute deep venous thrombosis (disorder)',\n",
    "    'Acute pulmonary embolism (disorder)',\n",
    "    'Atrial Fibrillation',\n",
    "    'Blindness due to type 2 diabetes mellitus (disorder)',\n",
    "    'Chronic congestive heart failure (disorder)',\n",
    "    'Coronary Heart Disease',\n",
    "    'Diabetes',\n",
    "    'Diabetic renal disease (disorder)',\n",
    "    'Diabetic retinopathy associated with type II diabetes mellitus (disorder)',\n",
    "    'Hyperglycemia (disorder)',\n",
    "    'Hypertension',\n",
    "    'Hyperlipidemia',\n",
    "    'Hypertriglyceridemia (disorder)',\n",
    "    'Myocardial Infarction',\n",
    "    'Stroke',\n",
    "    'Heart failure (disorder)'\n",
    "]\n",
    "\n",
    "# Row-wise sum over the listed columns; with True/NaN flags this is presumably\n",
    "# the number of flags set for each row.\n",
    "df2['Heart and Cardiovascular Diseases'] = df2[heart_and_cardiovascular_diseases_columns].sum(axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source columns aggregated into the 'Metabolic and Endocrine Disorders' feature.\n",
    "# NOTE(review): several entries are shared with other groups in this notebook\n",
    "# (e.g. 'Hypertension', 'Hyperlipidemia', 'Hypoxemia (disorder)',\n",
    "# 'Whiplash injury to neck') - confirm the overlap is intended.\n",
    "metabolic_and_endocrine_disorders_columns = [\n",
    "    'Alcoholism',\n",
    "    'Anemia (disorder)',\n",
    "    'Body mass index 30+ - obesity (finding)',\n",
    "    'Chronic kidney disease stage 1 (disorder)',\n",
    "    'Chronic kidney disease stage 2 (disorder)',\n",
    "    'Chronic kidney disease stage 3 (disorder)',\n",
    "    'Fatigue (finding)',\n",
    "    'Gout',\n",
    "    'Hyperlipidemia',\n",
    "    'Hypertension',\n",
    "    'Hypertriglyceridemia (disorder)',\n",
    "    'Hypoxemia (disorder)',\n",
    "    'Idiopathic atrophic hypothyroidism',\n",
    "    'Loss of taste (finding)',\n",
    "    'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)',\n",
    "    'Metabolic syndrome X (disorder)',\n",
    "    'Microalbuminuria due to type 2 diabetes mellitus (disorder)',\n",
    "    'Neuropathy due to type 2 diabetes mellitus (disorder)',\n",
    "    'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)',\n",
    "    'Osteoarthritis of hip',\n",
    "    'Osteoarthritis of knee',\n",
    "    'Osteoporosis (disorder)',\n",
    "    'Pathological fracture due to osteoporosis (disorder)',\n",
    "    'Prediabetes',\n",
    "    'Proteinuria due to type 2 diabetes mellitus (disorder)',\n",
    "    'Protracted diarrhea',\n",
    "    'Rheumatoid arthritis',\n",
    "    'Secondary malignant neoplasm of colon',\n",
    "    'Stress (finding)',\n",
    "    'Unhealthy alcohol drinking behavior (finding)',\n",
    "    'Whiplash injury to neck',\n",
    "    'Proliferative diabetic retinopathy due to type II diabetes mellitus (disorder)'\n",
    "]\n",
    "\n",
    "# Row-wise sum over the listed columns; with True/NaN flags this is presumably\n",
    "# the number of flags set for each row.\n",
    "df2['Metabolic and Endocrine Disorders'] = df2[metabolic_and_endocrine_disorders_columns].sum(axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source columns aggregated into the 'Neurological Disorders' feature.\n",
    "# NOTE(review): 'Streptococcal sore throat (disorder)' and 'Viral sinusitis (disorder)'\n",
    "# are also listed in the respiratory group, and 'Brain damage - traumatic' in the\n",
    "# orthopedic group - confirm the overlap is intended.\n",
    "neurological_disorders_columns = [\n",
    "    'Alzheimer\\'s disease (disorder)',\n",
    "    'Brain damage - traumatic',\n",
    "    'Chronic intractable migraine without aura',\n",
    "    'Chronic low back pain (finding)',\n",
    "    'Chronic neck pain (finding)',\n",
    "    'Chronic pain',\n",
    "    'Chronic paralysis due to lesion of spinal cord',\n",
    "    'Chronic sinusitis (disorder)',\n",
    "    'Epilepsy',\n",
    "    'Familial Alzheimer\\'s disease of early onset (disorder)',\n",
    "    'Fibromyalgia (disorder)',\n",
    "    'Lupus erythematosus',\n",
    "    'Male infertility due to cystic fibrosis (disorder)',\n",
    "    'Malignant neoplasm of breast (disorder)',\n",
    "    'Malignant tumor of colon',\n",
    "    'Seizure disorder',\n",
    "    'Sepsis (disorder)',\n",
    "    'Sepsis caused by virus (disorder)',\n",
    "    'Septic shock (disorder)',\n",
    "    'Shock (disorder)',\n",
    "    'Sinusitis (disorder)',\n",
    "    'Streptococcal sore throat (disorder)',\n",
    "    'Suspected COVID-19',\n",
    "    'Transformed migraine (disorder)',\n",
    "    'Victim of intimate partner abuse (finding)',\n",
    "    'Viral sinusitis (disorder)',\n",
    "    'Seizure Count Cerebral Cortex Electroencephalogram (EEG)',\n",
    "    'Headache (finding)',\n",
    "    'Primary fibromyalgia syndrome'\n",
    "]\n",
    "\n",
    "# Row-wise sum over the listed columns; with True/NaN flags this is presumably\n",
    "# the number of flags set for each row.\n",
    "df2['Neurological Disorders'] = df2[neurological_disorders_columns].sum(axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source columns aggregated into the 'Orthopedic Injuries' feature.\n",
    "# NOTE(review): 'Chronic kidney disease stage 1 (disorder)' and\n",
    "# 'Hypertriglyceridemia (disorder)' are also listed in the metabolic/endocrine\n",
    "# group - confirm they are meant to be counted as orthopedic injuries.\n",
    "orthopedic_injuries_columns = [\n",
    "    'Admission to orthopedic department',\n",
    "    'Ankle X-ray',\n",
    "    'Bone immobilization',\n",
    "    'Brain damage - traumatic',\n",
    "    'Bullet wound',\n",
    "    'Burn injury(morphologic abnormality)',\n",
    "    'Chronic kidney disease stage 1 (disorder)',\n",
    "    'Clavicle X-ray',\n",
    "    'Closed fracture of hip',\n",
    "    'Concussion injury of brain',\n",
    "    'Concussion with loss of consciousness',\n",
    "    'Concussion with no loss of consciousness',\n",
    "    'Facial laceration',\n",
    "    'Fracture of ankle',\n",
    "    'Fracture of clavicle',\n",
    "    'Fracture of forearm',\n",
    "    'Fracture of rib',\n",
    "    'Fracture of the vertebral column with spinal cord injury',\n",
    "    'Fracture of vertebral column without spinal cord injury',\n",
    "    'Fracture subluxation of wrist',\n",
    "    'History of amputation of foot (situation)',\n",
    "    'History of lower limb amputation (situation)',\n",
    "    'Hypertriglyceridemia (disorder)',\n",
    "    'Impacted molars',\n",
    "    'Injury of anterior cruciate ligament',\n",
    "    'Injury of heart (disorder)',\n",
    "    'Injury of kidney (disorder)',\n",
    "    'Injury of medial collateral ligament of knee',\n",
    "    'Injury of tendon of the rotator cuff of shoulder',\n",
    "    'Joint pain (finding)',\n",
    "    'Laceration of foot',\n",
    "    'Laceration of hand',\n",
    "    'Laceration of thigh',\n",
    "    'Localized  primary osteoarthritis of the hand',\n",
    "    'Muscle pain (finding)',\n",
    "    'Rupture of appendix',\n",
    "    'Rupture of patellar tendon',\n",
    "    'Second degree burn',\n",
    "    'Sprain of ankle',\n",
    "    'Sprain of wrist',\n",
    "    'Surgical manipulation of joint of knee',\n",
    "    'Surgical manipulation of shoulder joint',\n",
    "    'Tear of meniscus of knee',\n",
    "    'Total knee replacement',\n",
    "    'Whiplash injury to neck',\n",
    "    'Third degree burn'\n",
    "]\n",
    "\n",
    "# Row-wise sum over the listed columns; with True/NaN flags this is presumably\n",
    "# the number of flags set for each row.\n",
    "df2[\"Orthopedic Injuries\"] = df2[orthopedic_injuries_columns].sum(axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Mental Health' feature.\n",
    "# NOTE(review): 'Severe anxiety (panic) (finding' has no closing parenthesis;\n",
    "# it presumably mirrors the actual df2 column label - verify before editing it.\n",
    "mental_health_columns = [\n",
    "    \"Alcoholism\",\n",
    "    \"At risk for suicide (finding)\",\n",
    "    \"Brief general examination (procedure)\",\n",
    "    \"Care regimes assessment (procedure)\",\n",
    "    \"Comprehensive interview and evaluation (procedure)\",\n",
    "    \"Concussion with no loss of consciousness\",\n",
    "    \"Depression screening (procedure)\",\n",
    "    \"Depression screening using Patient Health Questionnaire Nine Item score (procedure)\",\n",
    "    \"Depression screening using Patient Health Questionnaire Two-Item score (procedure)\",\n",
    "    \"Died in hospice (finding)\",\n",
    "    \"Drug overdose\",\n",
    "    \"Evaluation of psychiatric state of patient\",\n",
    "    \"Has a criminal record (finding)\",\n",
    "    \"Homeless (finding)\",\n",
    "    \"Limited social contact (finding)\",\n",
    "    \"Mental health Outpatient Note\",\n",
    "    \"Mental health Telehealth Note\",\n",
    "    \"Mental health screening (procedure)\",\n",
    "    \"Misuses drugs (finding)\",\n",
    "    \"Nausea (finding)\",\n",
    "    \"Not in labor force (finding)\",\n",
    "    \"Part-time employment (finding)\",\n",
    "    \"Passive conjunctival congestion (finding)\",\n",
    "    \"Posttraumatic stress disorder\",\n",
    "    \"Psychiatric follow-up\",\n",
    "    \"Refugee (person)\",\n",
    "    \"Reports of violence in the environment (finding)\",\n",
    "    \"Severe anxiety (panic) (finding\",\n",
    "    \"Social isolation (finding)\",\n",
    "    \"Stress (finding)\",\n",
    "    \"Suicide risk assessment (procedure)\",\n",
    "    \"Victim of intimate partner abuse (finding)\",\n",
    "    \"Attempted suicide - cut/stab\",\n",
    "    \"Attempted suicide - suffocation\",\n",
    "    \"Opioid abuse (disorder)\",\n",
    "    \"Assessment of anxiety (procedure)\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Mental Health\"] = df2[mental_health_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Reproductive and Pregnancy' feature.\n",
    "# Labels must match the df2 column names exactly (including doubled spaces).\n",
    "reproductive_and_pregancy_columns = [\n",
    "    \"1 ML medroxyPROGESTERone acetate 150 MG/ML Injection\",\n",
    "    \"168 HR Ethinyl Estradiol 0.00146 MG/HR / norelgestromin 0.00625 MG/HR Transdermal System\",\n",
    "    \"Antenatal RhD antibody screening\",\n",
    "    \"Antepartum eclampsia\",\n",
    "    \"Augmentation of labor\",\n",
    "    \"Auscultation of the fetal heart\",\n",
    "    \"Bilateral tubal ligation\",\n",
    "    \"Blighted ovum\",\n",
    "    \"Camila 28 Day Pack\",\n",
    "    \"Counseling for termination of pregnancy\",\n",
    "    \"Cytopathology procedure  preparation of smear  genital source\",\n",
    "    \"Episiotomy\",\n",
    "    \"Errin 28 Day Pack\",\n",
    "    \"Estrostep Fe 28 Day Pack\",\n",
    "    \"Etonogestrel 68 MG Drug Implant\",\n",
    "    \"Evaluation of uterine fundal height\",\n",
    "    \"Excision of fallopian tube and surgical removal of ectopic pregnancy\",\n",
    "    \"Fetal anatomy study\",\n",
    "    \"Fetus with unknown complication\",\n",
    "    \"Gonorrhea infection test\",\n",
    "    \"Hyperlipidemia\",\n",
    "    \"Induced termination of pregnancy\",\n",
    "    \"Insertion of intrauterine contraceptive device\",\n",
    "    \"Insertion of subcutaneous contraceptive\",\n",
    "    \"Instrumental delivery\",\n",
    "    \"Jolivette 28 Day Pack\",\n",
    "    \"Kyleena 19.5 MG Intrauterine System\",\n",
    "    \"Leronlimab 700 MG Injection\",\n",
    "    \"Levonorgestrel 0.00354 MG/HR Drug Implant\",\n",
    "    \"Levora 0.15/30 28 Day Pack\",\n",
    "    \"Liletta 52 MG Intrauterine System\",\n",
    "    \"Medical induction of labor\",\n",
    "    \"Mestranol / Norethynodrel [Enovid]\",\n",
    "    \"Methotrexate injection into tubal pregnancy\",\n",
    "    \"Mirena 52 MG Intrauterine System\",\n",
    "    \"Miscarriage in first trimester\",\n",
    "    \"Natazia 28 Day Pack\",\n",
    "    \"Norinyl 1+50 28 Day Pack\",\n",
    "    \"NuvaRing 0.12/0.015 MG per 24HR 21 Day Vaginal Ring\",\n",
    "    \"Ortho Tri-Cyclen 28 Day Pack\",\n",
    "    \"Preeclampsia\",\n",
    "    \"Pregnancy termination care\",\n",
    "    \"Premature birth of newborn\",\n",
    "    \"Removal of intrauterine device\",\n",
    "    \"Removal of subcutaneous contraceptive\",\n",
    "    \"Replacement of contraceptive intrauterine device\",\n",
    "    \"RhD passive immunization\",\n",
    "    \"Screening for chromosomal aneuploidy in prenatal amniotic fluid\",\n",
    "    \"Spontaneous breech delivery\",\n",
    "    \"Standard pregnancy test\",\n",
    "    \"Tubal pregnancy\",\n",
    "    \"Ultrasonography of abdomen  right upper quadrant and epigastrium\",\n",
    "    \"Ultrasonography of bilateral breasts (procedure)\",\n",
    "    \"Ultrasound scan for fetal viability\",\n",
    "    \"Vaccination for diphtheria  pertussis  and tetanus\",\n",
    "    \"Vasectomy\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Reproductive and Pregnancy\"] = df2[reproductive_and_pregancy_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Pain Relievers and Analesics' feature.\n",
    "# NOTE(review): 'Analesics' (sic) is the label reused by the `categories` list,\n",
    "# so the spelling is kept; rename everywhere at once if it is ever corrected.\n",
    "pain_relievers_and_analesics_columns = [\n",
    "    \"10 ML Alfentanil 0.5 MG/ML Injection\",\n",
    "    \"10 ML Fentanyl 0.05 MG/ML Injection\",\n",
    "    \"12 HR Hydrocodone Bitartrate 10 MG Extended Release Oral Capsule\",\n",
    "    \"5 ML SUFentanil 0.05 MG/ML Injection\",\n",
    "    \"72 HR Fentanyl 0.025 MG/HR Transdermal System\",\n",
    "    \"Abuse-Deterrent 12 HR Oxycodone Hydrochloride 10 MG Extended Release Oral Tablet [Oxycontin]\",\n",
    "    \"Abuse-Deterrent 12 HR Oxycodone Hydrochloride 15 MG Extended Release Oral Tablet\",\n",
    "    \"Acetaminophen 325 MG / oxyCODONE Hydrochloride 2.5 MG Oral Tablet\",\n",
    "    \"Acetaminophen 325 MG / oxyCODONE Hydrochloride 5 MG Oral Tablet\",\n",
    "    \"Acetaminophen 500 MG Oral Tablet\",\n",
    "    \"Acetaminophen 300 MG / Hydrocodone Bitartrate 5 MG Oral Tablet\",\n",
    "    \"Acetaminophen 325 MG / HYDROcodone Bitartrate 7.5 MG Oral Tablet\",\n",
    "    \"Acetaminophen 325 MG / Oxycodone Hydrochloride 10 MG Oral Tablet [Percocet]\",\n",
    "    \"Acetaminophen 325 MG Oral Tablet\",\n",
    "    \"Acetaminophen/Hydrocodone\",\n",
    "    \"Aspirin\",\n",
    "    \"Aspirin 81 MG Oral Tablet\",\n",
    "    \"Carbamazepine[Tegretol]\",\n",
    "    \"Chlorpheniramine Maleate 4 MG Oral Tablet\",\n",
    "    \"Clopidogrel 75 MG Oral Tablet\",\n",
    "    \"Colchicine 0.6 MG Oral Tablet\",\n",
    "    \"Cyclophosphamide 1000 MG Injection\",\n",
    "    \"Diazepam 5 MG Oral Tablet\",\n",
    "    \"Diazepam 5 MG/ML Injectable Solution\",\n",
    "    \"Doxycycline Monohydrate 100 MG Oral Tablet\",\n",
    "    \"Ibuprofen\",\n",
    "    \"Ibuprofen 200 MG Oral Tablet\",\n",
    "    \"Ibuprofen 400 MG Oral Tablet [Ibu]\",\n",
    "    \"Lorazepam 2 MG/ML Injectable Solution\",\n",
    "    \"Meperidine Hydrochloride 50 MG Oral Tablet\",\n",
    "    \"Muscle pain (finding)\",\n",
    "    \"Naproxen 500 MG Oral Tablet\",\n",
    "    \"Naproxen sodium 220 MG Oral Tablet\",\n",
    "    \"Nitroglycerin 0.4 MG/ACTUAT Mucosal Spray\",\n",
    "    \"Phenazopyridine hydrochloride 100 MG Oral Tablet\",\n",
    "    \"Sulfamethoxazole / Trimethoprim\",\n",
    "    \"buprenorphine 2 MG / naloxone 0.5 MG Sublingual Tablet\",\n",
    "    \"diphenhydrAMINE Hydrochloride 25 MG Oral Tablet\",\n",
    "    \"duloxetine 20 MG Delayed Release Oral Capsule\",\n",
    "    \"methadone hydrochloride 10 MG Oral Tablet\",\n",
    "    \"tramadol hydrochloride 50 MG Oral Tablet\",\n",
    "    \"clonazePAM 0.25 MG Oral Tablet\",\n",
    "    \"Midazolam 1 MG/ML Injectable Solution\",\n",
    "    \"Rocuronium bromide 10 MG/ML Injectable Solution\",\n",
    "    \"Epidural anesthesia\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Pain Relievers and Analesics\"] = df2[pain_relievers_and_analesics_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Cardiovascular and Blood Pressure Medications' feature.\n",
    "# Labels must match the df2 column names exactly (including doubled spaces and source spellings).\n",
    "cardiovascular_and_blood_pressure_medications_columns = [\n",
    "    \"0.3 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe\",\n",
    "    \"0.4 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe\",\n",
    "    \"1 ML Enoxaparin sodium 150 MG/ML Prefilled Syringe\",\n",
    "    \"1 ML heparin sodium  porcine 5000 UNT/ML Injection\",\n",
    "    \"3 ML Amiodarone hydrocholoride 50 MG/ML Prefilled Syringe\",\n",
    "    \"4 ML norepinephrine 1 MG/ML Injection\",\n",
    "    \"Acute deep venous thrombosis (disorder)\",\n",
    "    \"Acute pulmonary embolism (disorder)\",\n",
    "    \"Alteplase 100 MG Injection\",\n",
    "    \"Assessment using New York Heart Association Classification (procedure)\",\n",
    "    \"Atorvastatin 80 MG Oral Tablet\",\n",
    "    \"Atropine Sulfate 1 MG/ML Injectable Solution\",\n",
    "    \"Captopril 25 MG Oral Tablet\",\n",
    "    \"Cardiac Arrest\",\n",
    "    \"Cardiovascular stress testing (procedure)\",\n",
    "    \"Catheter ablation of tissue of heart\",\n",
    "    \"Coronary artery bypass grafting\",\n",
    "    \"Digoxin 0.125 MG Oral Tablet\",\n",
    "    \"Echocardiography (procedure)\",\n",
    "    \"Electrical cardioversion\",\n",
    "    \"Electrocardiographic procedure\",\n",
    "    \"Furosemide 40 MG Oral Tablet\",\n",
    "    \"History of myocardial infarction (situation)\",\n",
    "    \"Hydrochlorothiazide 25 MG Oral Tablet\",\n",
    "    \"Implantation of left ventricular assist device (procedure)\",\n",
    "    \"Insertion of biventricular implantable cardioverter defibrillator\",\n",
    "    \"Lisinopril\",\n",
    "    \"Myocardial Infarction\",\n",
    "    \"Nitrofurantoin 5 MG/ML Oral Suspension\",\n",
    "    \"Peripheral blood smear interpretation\",\n",
    "    \"Referral to hypertension clinic\",\n",
    "    \"Shock (disorder)\",\n",
    "    \"Stroke\",\n",
    "    \"Transplantation of heart (procedure)\",\n",
    "    \"Verapamil Hydrochloride 40 MG\",\n",
    "    \"Warfarin Sodium 5 MG Oral Tablet\",\n",
    "    \"carvedilol 25 MG Oral Tablet\",\n",
    "    \"lisinopril 10 MG Oral Tablet\",\n",
    "    \"lisinopril 20 MG Oral Tablet\",\n",
    "    \"losartan potassium 25 MG Oral Tablet\",\n",
    "    \"losartan potassium 50 MG Oral Tablet\",\n",
    "    \"sacubitril 97 MG / valsartan 103 MG Oral Tablet\",\n",
    "    \"remifentanil 2 MG Injection\",\n",
    "    \"pregabalin 100 MG Oral Capsule\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Cardiovascular and Blood Pressure Medications\"] = df2[cardiovascular_and_blood_pressure_medications_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Injection Medications' feature.\n",
    "# NOTE(review): 'cefdinir' and 'Cefdinir' are listed as two separate labels;\n",
    "# presumably both exist as distinct df2 columns - confirm against the data.\n",
    "injection_medications_columns = [\n",
    "    \"10 ML Doxorubicin Hydrochloride 2 MG/ML Injection\",\n",
    "    \"10 ML oxaliplatin 5 MG/ML Injection\",\n",
    "    \"1 ML DOCEtaxel 20 MG/ML Injection\",\n",
    "    \"1 ML Epinephrine 1 MG/ML Injection\",\n",
    "    \"1 ML Morphine Sulfate 5 MG/ML Injection\",\n",
    "    \"1 ML Vasopressin (USP) 20 UNT/ML Injection\",\n",
    "    \"10 ML Fluorouracil 50 MG/ML Injection\",\n",
    "    \"10 ML Pamidronate Disodium 3 MG/ML Injection\",\n",
    "    \"100 ML Epirubicin Hydrochloride 2 MG/ML Injection\",\n",
    "    \"100 ML Propofol 10 MG/ML Injection\",\n",
    "    \"100 ML zoledronic acid 0.04 MG/ML Injection\",\n",
    "    \"150 ML vancomycin 5 MG/ML Injection\",\n",
    "    \"2 ML Ondansetron 2 MG/ML Injection\",\n",
    "    \"20 ML tocilizumab 20 MG/ML Injection\",\n",
    "    \"5 ML hyaluronidase-oysk 2000 UNT/ML / trastuzumab 120 MG/ML Injection\",\n",
    "    \"pneumococcal polysaccharide vaccine  23 valent\",\n",
    "    \"remdesivir 100 MG Injection\",\n",
    "    \"zoster\",\n",
    "    \"Aztreonam 2000 MG Injection\",\n",
    "    \"cefdinir\",\n",
    "    \"Cefdinir\",\n",
    "    \"doxycycline hyclate 100 MG\",\n",
    "    \"Ampicillin 100 MG/ML Injectable Solution\",\n",
    "    \"Penicillin G 375 MG/ML Injectable Solution\",\n",
    "    \"Penicillin V\",\n",
    "    \"Paclitaxel 100 MG Injection\",\n",
    "    \"Piperacillin 4000 MG / tazobactam 500 MG Injection\",\n",
    "    \"Leucovorin 100 MG Injection\",\n",
    "    \"Influenza  seasonal  injectable  preservative free\",\n",
    "    \"Syphilis infection test\",\n",
    "    \"Skin test for tuberculosis\",\n",
    "    \"Urine culture\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Injection Medications\"] = df2[injection_medications_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Oral Medications' feature.\n",
    "# Labels must match the df2 column names exactly (including doubled spaces and casing).\n",
    "oral_medications_columns = [\n",
    "    \"12 HR Cefaclor 500 MG Extended Release Oral Tablet\",\n",
    "    \"24 HR Donepezil hydrochloride 10 MG / Memantine hydrochloride 28 MG Extended Release Oral Capsule\",\n",
    "    \"24 HR Metformin hydrochloride 500 MG Extended Release Oral Tablet\",\n",
    "    \"Acetaminophen 300 MG / Codeine Phosphate 15 MG Oral Tablet\",\n",
    "    \"Acetaminophen 325 MG Oral Tablet [Tylenol]\",\n",
    "    \"Alendronic acid 10 MG Oral Tablet\",\n",
    "    \"Allopurinol 100 MG Oral Tablet\",\n",
    "    \"Amlodipine 5 MG Oral Tablet\",\n",
    "    \"Amoxicillin 250 MG / Clavulanate 125 MG Oral Tablet\",\n",
    "    \"Amoxicillin 250 MG Oral Capsule\",\n",
    "    \"Aspirin 81 MG Oral Tablet\",\n",
    "    \"Astemizole 10 MG Oral Tablet\",\n",
    "    \"Atorvastatin 80 MG Oral Tablet\",\n",
    "    \"Azithromycin 250 MG Oral Tablet\",\n",
    "    \"Azithromycin 250mg\",\n",
    "    \"Cefuroxime 250 MG Oral Tablet\",\n",
    "    \"Chlorpheniramine Maleate 4 MG Oral Tablet\",\n",
    "    \"Clopidogrel 75 MG Oral Tablet\",\n",
    "    \"Colchicine 0.6 MG Oral Tablet\",\n",
    "    \"Diazepam 5 MG Oral Tablet\",\n",
    "    \"Digoxin 0.125 MG Oral Tablet\",\n",
    "    \"Donepezil hydrochloride 10 MG Oral Tablet\",\n",
    "    \"Donepezil hydrochloride 23 MG Oral Tablet\",\n",
    "    \"Doxycycline Monohydrate 100 MG Oral Tablet\",\n",
    "    \"Fexofenadine hydrochloride 60 MG Oral Tablet\",\n",
    "    \"Furosemide 40 MG Oral Tablet\",\n",
    "    \"Galantamine 4 MG Oral Tablet\",\n",
    "    \"Hydrochlorothiazide 25 MG Oral Tablet\",\n",
    "    \"Levothyroxine Sodium 0.075 MG Oral Tablet\",\n",
    "    \"Loratadine 10 MG Oral Tablet\",\n",
    "    \"Methotrexate 2.5 MG Oral Tablet\",\n",
    "    \"Milnacipran hydrochloride 100 MG Oral Tablet\",\n",
    "    \"Nitrofurantoin 5 MG/ML Oral Suspension\",\n",
    "    \"Penicillin V Potassium 500 MG Oral Tablet\",\n",
    "    \"Phenazopyridine hydrochloride 100 MG Oral Tablet\",\n",
    "    \"Sertraline 100 MG Oral Tablet\",\n",
    "    \"Simvastatin 10 MG Oral Tablet\",\n",
    "    \"Simvastatin 20 MG Oral Tablet\",\n",
    "    \"Tacrine 10 MG Oral Capsule\",\n",
    "    \"Tamoxifen 10 MG Oral Tablet\",\n",
    "    \"Terfenadine 60 MG Oral Tablet\",\n",
    "    \"Verapamil Hydrochloride 40 MG\",\n",
    "    \"Verzenio 100 MG Oral Tablet\",\n",
    "    \"Warfarin Sodium 5 MG Oral Tablet\",\n",
    "    \"palbociclib 100 MG Oral Capsule\",\n",
    "    \"predniSONE 20 MG Oral Tablet\",\n",
    "    \"ribociclib 200 MG Oral Tablet\",\n",
    "    \"neratinib 40 MG Oral Tablet\",\n",
    "    \"Hydroxychloroquine Sulfate 200 MG Oral Tablet\",\n",
    "    \"chloroquine phosphate 500 MG Oral Tablet\",\n",
    "    \"Naltrexone hydrochloride 50 MG Oral Tablet\",\n",
    "    \"lapatinib 250 MG Oral Tablet\",\n",
    "    \"cetirizine hydrochloride 10 MG Oral Tablet\",\n",
    "    \"cycloSPORINE  modified 100 MG Oral Capsule\",\n",
    "    \"letrozole 2.5 MG Oral Tablet\",\n",
    "    \"exemestane 25 MG Oral Tablet\",\n",
    "    \"ferrous sulfate 325 MG Oral Tablet\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Oral Medications\"] = df2[oral_medications_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Other Medications' feature.\n",
    "# Labels must match the df2 column names exactly (including doubled spaces and casing).\n",
    "other_medications_columns = [\n",
    "    \"0.25 ML Leuprolide Acetate 30 MG/ML Prefilled Syringe\",\n",
    "    \"1 ML Epoetin Alfa 4000 UNT/ML Injection [Epogen]\",\n",
    "    \"1 ML denosumab 60 MG/ML Prefilled Syringe\",\n",
    "    \"10 ML Furosemide 10 MG/ML Injection\",\n",
    "    \"3 ML liraglutide 6 MG/ML Pen Injector\",\n",
    "    \"5 ML fulvestrant 50 MG/ML Prefilled Syringe\",\n",
    "    \"12 HR Cefaclor 500 MG Extended Release Oral Tablet\",\n",
    "    \"120 ACTUAT Fluticasone propionate 0.044 MG/ACTUAT Metered Dose Inhaler\",\n",
    "    \"60 ACTUAT Fluticasone propionate 0.25 MG/ACTUAT / salmeterol 0.05 MG/ACTUAT Dry Powder Inhaler\",\n",
    "    \"Acetaminophen 21.7 MG/ML / Dextromethorphan Hydrobromide 1 MG/ML / doxylamine succinate 0.417 MG/ML Oral Solution\",\n",
    "    \"Chlamydia antigen test\",\n",
    "    \"Hepatitis B Surface Antigen Measurement\",\n",
    "    \"Hepatitis C antibody test\",\n",
    "    \"Human immunodeficiency virus antigen test\",\n",
    "    \"Insulin Lispro 100 UNT/ML Injectable Solution [Humalog]\",\n",
    "    \"Jolivette 28 Day Pack\",\n",
    "    \"Measurement of Varicella-zoster virus antibody\",\n",
    "    \"Memantine hydrochloride 2 MG/ML Oral Solution\",\n",
    "    \"NDA020503 200 ACTUAT Albuterol 0.09 MG/ACTUAT Metered Dose Inhaler\",\n",
    "    \"NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Injector\",\n",
    "    \"NITROFURANTOIN  MACROCRYSTALS 50 MG Oral Capsule\",\n",
    "    \"Ortho Tri-Cyclen 28 Day Pack\",\n",
    "    \"Pneumococcal conjugate PCV 13\",\n",
    "    \"Pulmozyme (Dornase Alfa)\",\n",
    "    \"SARS-COV-2 (COVID-19) vaccine  mRNA  spike protein  LNP  preservative free  100 mcg/0.5mL dose\",\n",
    "    \"SARS-COV-2 (COVID-19) vaccine  mRNA  spike protein  LNP  preservative free  30 mcg/0.3mL dose\",\n",
    "    \"SARS-COV-2 (COVID-19) vaccine  vector non-replicating  recombinant spike protein-Ad26  preservative free  0.5 mL\",\n",
    "    \"Seasonique 91 Day Pack\",\n",
    "    \"Td (adult) preservative free\",\n",
    "    \"Trinessa 28 Day Pack\",\n",
    "    \"Vaccination for diphtheria  pertussis  and tetanus\",\n",
    "    \"Vitamin B 12 5 MG/ML Injectable Solution\",\n",
    "    \"Vomiting symptom (finding)\",\n",
    "    \"Yaz 28 Day Pack\",\n",
    "    \"ado-trastuzumab emtansine 100 MG Injection\",\n",
    "    \"albuterol 5 MG/ML Inhalation Solution\",\n",
    "    \"amLODIPine 2.5 MG Oral Tablet\",\n",
    "    \"anastrozole 1 MG Oral Tablet\",\n",
    "    \"baricitinib 2 MG Oral Tablet\",\n",
    "    \"buprenorphine 2 MG / naloxone 0.5 MG Sublingual Tablet\",\n",
    "    \"carvedilol 25 MG Oral Tablet\",\n",
    "    \"sevoflurane 1000 MG/ML Inhalant Solution\",\n",
    "    \"Latex (substance)\",\n",
    "    \"desflurane 1000 MG/ML Inhalation Solution\",\n",
    "    \"Isoflurane 999 MG/ML Inhalant Solution\",\n",
    "    \"Hydrocortisone 10 MG/ML Topical Cream\",\n",
    "    \"Lenzilumab 200 MG IV\",\n",
    "    \"remifentanil 2 MG Injection\",\n",
    "    \"Carboplatin 10 MG/ML Injectable Solution\",\n",
    "    \"chloroquine phosphate 500 MG Oral Tablet\",\n",
    "    \"Naltrexone hydrochloride 50 MG Oral Tablet\",\n",
    "    \"lapatinib 250 MG Oral Tablet\",\n",
    "    \"Sodium Chloride 9 MG/ML Injectable Solution\",\n",
    "    \"insulin human  isophane 70 UNT/ML / Regular Insulin  Human 30 UNT/ML Injectable Suspension [Humulin]\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Other Medications\"] = df2[other_medications_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Therapies and Regimes' feature.\n",
    "# NOTE(review): 'Speech and language therapy regime (regime/therapy' has no closing\n",
    "# parenthesis; it presumably mirrors the actual df2 column label - verify before editing it.\n",
    "therapies_and_regimes_columns = [\n",
    "    \"24hr nicotine transdermal patch\",\n",
    "    \"Assessment of health and social care needs (procedure)\",\n",
    "    \"Assessment of substance use (procedure)\",\n",
    "    \"Assessment using Alcohol Use Disorders Identification Test - Consumption (procedure)\",\n",
    "    \"Assessment using Morse Fall Scale (procedure)\",\n",
    "    \"Cognitive and behavioral therapy (regime/therapy)\",\n",
    "    \"Combined chemotherapy and radiation therapy (procedure)\",\n",
    "    \"Comprehensive interview and evaluation (procedure)\",\n",
    "    \"Construction of diverting colostomy\",\n",
    "    \"Controlled ventilation procedure and therapy  initiation and management (procedure)\",\n",
    "    \"Coordination of care plan (procedure)\",\n",
    "    \"Home health aide service (regime/therapy)\",\n",
    "    \"Hospice care (regime/therapy)\",\n",
    "    \"Interstitial brachytherapy (procedure)\",\n",
    "    \"Intracavitary brachytherapy (procedure)\",\n",
    "    \"Monitoring of patient (regime/therapy)\",\n",
    "    \"Movement therapy (regime/therapy)\",\n",
    "    \"Nursing care/supplementary surveillance (regime/therapy)\",\n",
    "    \"Occupational therapy (regime/therapy)\",\n",
    "    \"Physical examination\",\n",
    "    \"Physical therapy procedure (regime/therapy)\",\n",
    "    \"Professional / ancillary services care (regime/therapy)\",\n",
    "    \"Psychosocial care (regime/therapy)\",\n",
    "    \"Pulmonary rehabilitation (regime/therapy)\",\n",
    "    \"Radiation oncology AND/OR radiotherapy (procedure)\",\n",
    "    \"Radiation therapy care (regime/therapy)\",\n",
    "    \"Referral to home health care service (procedure)\",\n",
    "    \"Referral to hypertension clinic\",\n",
    "    \"Social case work (regime/therapy)\",\n",
    "    \"Speech and language therapy regime (regime/therapy\",\n",
    "    \"Subcutaneous immunotherapy\",\n",
    "    \"Teleradiotherapy procedure (procedure)\",\n",
    "    \"Transplant of lung (procedure)\",\n",
    "    \"Transplantation of heart (procedure)\",\n",
    "    \"Weaning from mechanically assisted ventilation (procedure)\",\n",
    "    \"Microbial culture (procedure)\",\n",
    "    \"Chemotherapy (procedure)\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Therapies and Regimes\"] = df2[therapies_and_regimes_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Diagnostic Procedures' feature.\n",
    "# Labels must match the df2 column names exactly (including doubled spaces and source spellings).\n",
    "diagnostic_procedures_columns = [\n",
    "    \"Biopsy of breast (procedure)\",\n",
    "    \"Biopsy of colon\",\n",
    "    \"Biopsy of prostate\",\n",
    "    \"Blood typing  RH typing\",\n",
    "    \"Bone density scan (procedure)\",\n",
    "    \"Chlamydia antigen test\",\n",
    "    \"Clavicle X-ray\",\n",
    "    \"Colonoscopy\",\n",
    "    \"Digital examination of rectum\",\n",
    "    \"Echocardiography (procedure)\",\n",
    "    \"Electrocardiographic procedure\",\n",
    "    \"Human epidermal growth factor receptor 2 gene detection by fluorescence in situ hybridization (procedure)\",\n",
    "    \"Human epidermal growth factor receptor 2 gene detection by immunohistochemistry (procedure)\",\n",
    "    \"Human immunodeficiency virus antigen test\",\n",
    "    \"Initial patient assessment (procedure)\",\n",
    "    \"Knee X-ray\",\n",
    "    \"Magnetic resonance imaging of breast (procedure)\",\n",
    "    \"Mammogram - symptomatic (procedure)\",\n",
    "    \"Mammography (procedure)\",\n",
    "    \"Manual pelvic examination (procedure)\",\n",
    "    \"Measurement of Varicella-zoster virus antibody\",\n",
    "    \"Measurement of respiratory function (procedure)\",\n",
    "    \"Nasal sinus endoscopy (procedure)\",\n",
    "    \"Pelvis X-ray\",\n",
    "    \"Plain chest X-ray (procedure)\",\n",
    "    \"Screening for domestic abuse (procedure)\",\n",
    "    \"Screening for drug abuse (procedure)\",\n",
    "    \"Screening for occult blood in feces (procedure)\",\n",
    "    \"Screening mammography (procedure)\",\n",
    "    \"Sentinel lymph node biopsy (procedure)\",\n",
    "    \"Spirometry (procedure)\",\n",
    "    \"Throat culture (procedure)\",\n",
    "    \"Upper arm X-ray\",\n",
    "    \"X-ray or wrist\",\n",
    "    \"Seizure Count Cerebral Cortex Electroencephalogram (EEG)\",\n",
    "    \"Allergy screening test\",\n",
    "    \"Alpha-fetoprotein test\",\n",
    "    \"Bilirubin.total [Presence] in Urine by Test strip\",\n",
    "    \"Urine screening for glucose\",\n",
    "    \"Urine screening test for diabetes\",\n",
    "    \"Urine protein test\",\n",
    "    \"Hemoglobin / Hematocrit / Platelet count\",\n",
    "    \"Assessment of anxiety (procedure)\",\n",
    "    \"Urine culture\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Diagnostic Procedures\"] = df2[diagnostic_procedures_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Surgical Interventions' feature.\n",
    "# NOTE(review): the variable name is spelled 'surgerical' (sic); it is left as-is\n",
    "# because other cells may reference it.\n",
    "surgerical_interventions_columns = [\n",
    "    \"Admission to burn unit\",\n",
    "    \"Admission to long stay hospital\",\n",
    "    \"Admission to neurosurgical department\",\n",
    "    \"Admission to trauma surgery department\",\n",
    "    \"Admit to ICU (procedure)\",\n",
    "    \"Amputation of right foot\",\n",
    "    \"Amputation of right leg\",\n",
    "    \"Appendectomy\",\n",
    "    \"Artificial respiration (procedure)\",\n",
    "    \"Brachytherapy of breast (procedure)\",\n",
    "    \"Catheter ablation of tissue of heart\",\n",
    "    \"Coronary artery bypass grafting\",\n",
    "    \"Excision of axillary lymph node (procedure)\",\n",
    "    \"Excision of breast tissue (procedure)\",\n",
    "    \"Excision of sentinel lymph node (procedure)\",\n",
    "    \"Extraction of wisdom tooth\",\n",
    "    \"Implantation of left ventricular assist device (procedure)\",\n",
    "    \"Induced termination of pregnancy\",\n",
    "    \"Insertion of biventricular implantable cardioverter defibrillator\",\n",
    "    \"Insertion of endotracheal tube (procedure)\",\n",
    "    \"Instrumental delivery\",\n",
    "    \"Intubation\",\n",
    "    \"Laparoscopic Removal of Gall Bladder\",\n",
    "    \"Lumpectomy of breast (procedure)\",\n",
    "    \"Lung volume reduction surgery (procedure)\",\n",
    "    \"Open Removal of Gall Bladder\",\n",
    "    \"Partial resection of colon\",\n",
    "    \"Percutaneous coronary intervention\",\n",
    "    \"Percutaneous mechanical thrombectomy of portal vein using fluoroscopic guidance\",\n",
    "    \"Prostatectomy\",\n",
    "    \"Rectal polypectomy\",\n",
    "    \"Removal of endotracheal tube (procedure)\",\n",
    "    \"Removal of subcutaneous contraceptive\",\n",
    "    \"Surgical manipulation of joint of knee\",\n",
    "    \"Surgical manipulation of shoulder joint\",\n",
    "    \"Suture open wound\",\n",
    "    \"Tear of meniscus of knee\",\n",
    "    \"Transplantation of heart (procedure)\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Surgical Interventions\"] = df2[surgerical_interventions_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag columns rolled up into the 'Patient Care Management' feature.\n",
    "# Labels must match the df2 column names exactly (including doubled spaces and casing).\n",
    "patient_care_management_columns = [\n",
    "    \"Assessment of health and social care needs (procedure)\",\n",
    "    \"Bleeding from anus\",\n",
    "    \"Bullet wound\",\n",
    "    \"Chronic low back pain (finding)\",\n",
    "    \"Contact dermatitis\",\n",
    "    \"Cystitis\",\n",
    "    \"Diarrhea symptom (finding)\",\n",
    "    \"Escherichia coli urinary tract infection\",\n",
    "    \"Evaluation of psychiatric state of patient\",\n",
    "    \"Facial laceration\",\n",
    "    \"First degree burn\",\n",
    "    \"Heart failure education (procedure)\",\n",
    "    \"Hemodialysis (procedure)\",\n",
    "    \"Hep A  adult\",\n",
    "    \"History AND physical examination (procedure)\",\n",
    "    \"History of amputation of foot (situation)\",\n",
    "    \"History of appendectomy\",\n",
    "    \"History of cardiac arrest (situation)\",\n",
    "    \"History of lower limb amputation (situation)\",\n",
    "    \"History of single seizure (situation)\",\n",
    "    \"Hospital admission  short-term  24 hours\",\n",
    "    \"Information gathering (procedure)\",\n",
    "    \"Injection of tetanus antitoxin\",\n",
    "    \"Intramuscular injection\",\n",
    "    \"Intravenous blood transfusion of packed cells (procedure)\",\n",
    "    \"Intravenous injection (procedure)\",\n",
    "    \"Laceration of foot\",\n",
    "    \"Laceration of forearm\",\n",
    "    \"Laceration of hand\",\n",
    "    \"Laceration of thigh\",\n",
    "    \"Medication Reconciliation (procedure)\",\n",
    "    \"Movement therapy (regime/therapy)\",\n",
    "    \"Notifications (procedure)\",\n",
    "    \"Oxygen administration by mask (procedure)\",\n",
    "    \"Patient discharge (procedure)\",\n",
    "    \"Placing subject in prone position (procedure)\",\n",
    "    \"Postoperative procedure education (procedure)\",\n",
    "    \"Pre-discharge assessment (procedure)\",\n",
    "    \"Radiation oncology AND/OR radiotherapy (procedure)\",\n",
    "    \"Referral to home health care service (procedure)\",\n",
    "    \"Referral to hypertension clinic\",\n",
    "    \"Renal dialysis (procedure)\",\n",
    "    \"Resuscitation using intravenous fluid (procedure)\",\n",
    "    \"Review of systems (procedure)\",\n",
    "    \"Rubella screening\",\n",
    "    \"Screening mammography (procedure)\",\n",
    "    \"Subcutaneous immunotherapy\",\n",
    "    \"Teleradiotherapy procedure (procedure)\",\n",
    "    \"Transfer to stepdown unit (procedure)\",\n",
    "    \"Transfusion of plasma (procedure)\",\n",
    "    \"Weaning from mechanically assisted ventilation (procedure)\",\n",
    "    \"piperacillin 2000 MG / tazobactam 250 MG Injection\",\n",
    "    \"vancomycin 1000 MG Injection\",\n",
    "    \"Recurrent urinary tract infection\",\n",
    "    \"Acquired coagulation disorder (disorder)\",\n",
    "    \"Primary malignant neoplasm of colon\",\n",
    "    \"Pyelonephritis\",\n",
    "    \"Chill (finding)\",\n",
    "    \"Acute Cholecystitis\",\n",
    "    \"Cholelithiasis\",\n",
    "    \"Appendicitis\",\n",
    "    \"Metastasis from malignant tumor of prostate (disorder)\",\n",
    "    \"Overlapping malignant neoplasm of colon\",\n",
    "    \"Polyp of colon\",\n",
    "    \"Neoplasm of prostate\",\n",
    "    \"Carcinoma in situ of prostate (disorder)\",\n",
    "    \"Recurrent rectal polyp\",\n",
    "    \"Discharge from skilled nursing facility (procedure)\",\n",
    "    \"Certification procedure (procedure)\",\n",
    "    \"Development of individualized plan of care (procedure)\",\n",
    "]\n",
    "\n",
    "# Row-wise total over the listed columns (pandas' sum skips NaN by default).\n",
    "df2[\"Patient Care Management\"] = df2[patient_care_management_columns].sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labels of the aggregated category columns; spellings (including 'Analesics')\n",
    "# must stay identical to the df2 column names assigned in the grouping cells.\n",
    "categories = [\n",
    "    \"Respiratory Disorders\",\n",
    "    \"Heart and Cardiovascular Diseases\",\n",
    "    \"Metabolic and Endocrine Disorders\",\n",
    "    \"Neurological Disorders\",\n",
    "    \"Orthopedic Injuries\",\n",
    "    \"Mental Health\",\n",
    "    \"Reproductive and Pregnancy\",\n",
    "    \"Pain Relievers and Analesics\",\n",
    "    \"Cardiovascular and Blood Pressure Medications\",\n",
    "    \"Injection Medications\",\n",
    "    \"Oral Medications\",\n",
    "    \"Other Medications\",\n",
    "    \"Therapies and Regimes\",\n",
    "    \"Diagnostic Procedures\",\n",
    "    \"Surgical Interventions\",\n",
    "    \"Patient Care Management\",\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The dtype of these columns is `object`, so we will convert them to `int`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column: Facial laceration\n",
      "Data Type: object\n",
      "Column: Norinyl 1+50 28 Day Pack\n",
      "Data Type: object\n",
      "Column: Intubation\n",
      "Data Type: object\n",
      "Column: Amlodipine 5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: anastrozole 1 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Take blood sample\n",
      "Data Type: float64\n",
      "Column: Lack of access to transportation (finding)\n",
      "Data Type: object\n",
      "Column: Incision of trachea (procedure)\n",
      "Data Type: float64\n",
      "Column: Alteplase 100 MG Injection\n",
      "Data Type: object\n",
      "Column: Referral to hypertension clinic\n",
      "Data Type: object\n",
      "Column: 168 HR Ethinyl Estradiol 0.00146 MG/HR / norelgestromin 0.00625 MG/HR Transdermal System\n",
      "Data Type: object\n",
      "Column: ferrous sulfate 325 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Certification procedure (procedure)\n",
      "Data Type: object\n",
      "Column: Pulmonary rehabilitation (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Aspirin\n",
      "Data Type: object\n",
      "Column: History of amputation of foot (situation)\n",
      "Data Type: object\n",
      "Column: Limited social contact (finding)\n",
      "Data Type: object\n",
      "Column: Azithromycin 250mg\n",
      "Data Type: object\n",
      "Column: Antepartum eclampsia\n",
      "Data Type: object\n",
      "Column: Hepatitis B Surface Antigen Measurement\n",
      "Data Type: object\n",
      "Column: Jolivette 28 Day Pack\n",
      "Data Type: object\n",
      "Column: Asthma\n",
      "Data Type: object\n",
      "Column: Died in hospice (finding)\n",
      "Data Type: object\n",
      "Column: Levonorgestrel 0.00354 MG/HR Drug Implant\n",
      "Data Type: object\n",
      "Column: Appearance of Urine\n",
      "Data Type: object\n",
      "Column: Rupture of patellar tendon\n",
      "Data Type: object\n",
      "Column: Cesarean section\n",
      "Data Type: object\n",
      "Column: Cystitis\n",
      "Data Type: object\n",
      "Column: Rupture of appendix\n",
      "Data Type: object\n",
      "Column: Skin test for tuberculosis\n",
      "Data Type: object\n",
      "Column: pregabalin 100 MG Oral Capsule\n",
      "Data Type: object\n",
      "Column: Fracture of rib\n",
      "Data Type: object\n",
      "Column: Bleeding from anus\n",
      "Data Type: object\n",
      "Column: cefdinir\n",
      "Data Type: object\n",
      "Column: Egg white IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Chronic kidney disease stage 1 (disorder)\n",
      "Data Type: object\n",
      "Column: Shock (disorder)\n",
      "Data Type: object\n",
      "Column: Epidural anesthesia\n",
      "Data Type: object\n",
      "Column: RhD passive immunization\n",
      "Data Type: object\n",
      "Column: Urine culture\n",
      "Data Type: object\n",
      "Column: Stress (finding)\n",
      "Data Type: object\n",
      "Column: COVID-19\n",
      "Data Type: object\n",
      "Column: Bone density scan (procedure)\n",
      "Data Type: object\n",
      "Column: Alpha-fetoprotein test\n",
      "Data Type: object\n",
      "Column: ribociclib 200 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Coordination of care plan (procedure)\n",
      "Data Type: object\n",
      "Column: Pneumococcal conjugate PCV 13\n",
      "Data Type: object\n",
      "Column: Face mask (physical object)\n",
      "Data Type: object\n",
      "Column: Prostatectomy\n",
      "Data Type: object\n",
      "Column: Information gathering (procedure)\n",
      "Data Type: object\n",
      "Column: Microalbumin Creatinine Ratio\n",
      "Data Type: object\n",
      "Column: Acetaminophen 325 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Protracted diarrhea\n",
      "Data Type: object\n",
      "Column: Functional capacity NYHA\n",
      "Data Type: object\n",
      "Column: Vaccination for diphtheria  pertussis  and tetanus\n",
      "Data Type: object\n",
      "Column: Methotrexate injection into tubal pregnancy\n",
      "Data Type: object\n",
      "Column: Hydrocortisone 10 MG/ML Topical Cream\n",
      "Data Type: object\n",
      "Column: doxycycline hyclate 100 MG\n",
      "Data Type: object\n",
      "Column: White Blood Cell (Elevated)\n",
      "Data Type: float64\n",
      "Column: Furosemide 40 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Human epidermal growth factor receptor 2 gene detection by immunohistochemistry (procedure)\n",
      "Data Type: object\n",
      "Column: Lorazepam 2 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: Intravenous antibiotic therapy\n",
      "Data Type: float64\n",
      "Column: Color of Urine\n",
      "Data Type: object\n",
      "Column: lisinopril 20 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Osteoarthritis of knee\n",
      "Data Type: object\n",
      "Column: Alcoholism\n",
      "Data Type: object\n",
      "Column: Intravenous blood transfusion of packed cells (procedure)\n",
      "Data Type: object\n",
      "Column: Teleradiotherapy procedure (procedure)\n",
      "Data Type: object\n",
      "Column: Triglycerides\n",
      "Data Type: object\n",
      "Column: lisinopril 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Hospital admission  short-term  24 hours\n",
      "Data Type: object\n",
      "Column: Fracture of vertebral column without spinal cord injury\n",
      "Data Type: object\n",
      "Column: Body Mass Index\n",
      "Data Type: object\n",
      "Column: Objective assessment of cardiovascular disease NYHA\n",
      "Data Type: object\n",
      "Column: Honey bee IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Diarrhea symptom (finding)\n",
      "Data Type: object\n",
      "Column: Cardiac Arrest\n",
      "Data Type: object\n",
      "Column: Laceration of foot\n",
      "Data Type: object\n",
      "Column: Admission to long stay hospital\n",
      "Data Type: object\n",
      "Column: Biopsy of prostate\n",
      "Data Type: object\n",
      "Column: Lumpectomy of breast (procedure)\n",
      "Data Type: object\n",
      "Column: Assessment of substance use (procedure)\n",
      "Data Type: object\n",
      "Column: Grass pollen (substance)\n",
      "Data Type: object\n",
      "Column: Glucose [Mass/volume] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Sputum Culture\n",
      "Data Type: float64\n",
      "Column: Spirometry (procedure)\n",
      "Data Type: object\n",
      "Column: Pregnancy termination care\n",
      "Data Type: object\n",
      "Column: Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Subcutaneous immunotherapy\n",
      "Data Type: object\n",
      "Column: History of myocardial infarction (situation)\n",
      "Data Type: object\n",
      "Column: Controlled ventilation procedure and therapy  initiation and management (procedure)\n",
      "Data Type: object\n",
      "Column: Platelets [#/volume] in Blood by Automated count\n",
      "Data Type: object\n",
      "Column: X-ray or wrist\n",
      "Data Type: object\n",
      "Column: Bee venom (substance)\n",
      "Data Type: float64\n",
      "Column: Urine screening for glucose\n",
      "Data Type: object\n",
      "Column: History of single seizure (situation)\n",
      "Data Type: object\n",
      "Column: Burn injury(morphologic abnormality)\n",
      "Data Type: object\n",
      "Column: Nasal congestion (finding)\n",
      "Data Type: object\n",
      "Column: Sinusitis (disorder)\n",
      "Data Type: object\n",
      "Column: Abuse Status [OMAHA]\n",
      "Data Type: object\n",
      "Column: Alzheimer's disease (disorder)\n",
      "Data Type: object\n",
      "Column: Ultrasound scan for fetal viability\n",
      "Data Type: object\n",
      "Column: Diazepam 5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Animal dander (substance)\n",
      "Data Type: object\n",
      "Column: Laceration of thigh\n",
      "Data Type: object\n",
      "Column: Cyclophosphamide 1000 MG Injection\n",
      "Data Type: object\n",
      "Column: Cladosporium herbarum IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Blighted ovum\n",
      "Data Type: object\n",
      "Column: Etonogestrel 68 MG Drug Implant\n",
      "Data Type: object\n",
      "Column: Osteoporosis (disorder)\n",
      "Data Type: object\n",
      "Column: Patient discharge (procedure)\n",
      "Data Type: object\n",
      "Column: Sprain of ankle\n",
      "Data Type: object\n",
      "Column: sevoflurane 1000 MG/ML Inhalant Solution\n",
      "Data Type: object\n",
      "Column: sacubitril 97 MG / valsartan 103 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: NuvaRing 0.12/0.015 MG per 24HR 21 Day Vaginal Ring\n",
      "Data Type: object\n",
      "Column: Chronic neck pain (finding)\n",
      "Data Type: object\n",
      "Column: Ibuprofen\n",
      "Data Type: object\n",
      "Column: Viral sinusitis (disorder)\n",
      "Data Type: object\n",
      "Column: Placing subject in prone position (procedure)\n",
      "Data Type: object\n",
      "Column: 1 ML Vasopressin (USP) 20 UNT/ML Injection\n",
      "Data Type: object\n",
      "Column: Cefdinir\n",
      "Data Type: object\n",
      "Column: Naproxen 500 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Respiratory distress (finding)\n",
      "Data Type: object\n",
      "Column: Stage group.clinical Cancer\n",
      "Data Type: object\n",
      "Column: Contact dermatitis\n",
      "Data Type: object\n",
      "Column: 72 HR Fentanyl 0.025 MG/HR Transdermal System\n",
      "Data Type: object\n",
      "Column: Brain damage - traumatic\n",
      "Data Type: object\n",
      "Column: Kyleena 19.5 MG Intrauterine System\n",
      "Data Type: object\n",
      "Column: Common Ragweed IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Fracture subluxation of wrist\n",
      "Data Type: object\n",
      "Column: Iron binding capacity [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Measurement of respiratory function (procedure)\n",
      "Data Type: object\n",
      "Column: Simvastatin 20 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Injection of tetanus antitoxin\n",
      "Data Type: object\n",
      "Column: High Density Lipoprotein Cholesterol\n",
      "Data Type: object\n",
      "Column: Housing status\n",
      "Data Type: object\n",
      "Column: 0.3 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe\n",
      "Data Type: object\n",
      "Column: Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)\n",
      "Data Type: object\n",
      "Column: pH of Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Erythrocyte distribution width [Ratio] by Automated count\n",
      "Data Type: object\n",
      "Column: losartan potassium 50 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Tear of meniscus of knee\n",
      "Data Type: object\n",
      "Column: SARS-COV-2 (COVID-19) vaccine  vector non-replicating  recombinant spike protein-Ad26  preservative free  0.5 mL\n",
      "Data Type: object\n",
      "Column: Excision of breast tissue (procedure)\n",
      "Data Type: object\n",
      "Column: Unhealthy alcohol drinking behavior (finding)\n",
      "Data Type: object\n",
      "Column: Clavicle X-ray\n",
      "Data Type: object\n",
      "Column: Capillary refill [Time] of Nail bed\n",
      "Data Type: object\n",
      "Column: Calcium\n",
      "Data Type: object\n",
      "Column: Admission to trauma surgery department\n",
      "Data Type: object\n",
      "Column: Primary fibromyalgia syndrome\n",
      "Data Type: object\n",
      "Column: Human epidermal growth factor receptor 2 gene detection by fluorescence in situ hybridization (procedure)\n",
      "Data Type: object\n",
      "Column: Nitrofurantoin 5 MG/ML Oral Suspension\n",
      "Data Type: object\n",
      "Column: Chronic intractable migraine without aura\n",
      "Data Type: object\n",
      "Column: Refugee (person)\n",
      "Data Type: object\n",
      "Column: Joint pain (finding)\n",
      "Data Type: object\n",
      "Column: Chronic obstructive bronchitis (disorder)\n",
      "Data Type: object\n",
      "Column: Interstitial brachytherapy (procedure)\n",
      "Data Type: object\n",
      "Column: carvedilol 25 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Insertion of biventricular implantable cardioverter defibrillator\n",
      "Data Type: object\n",
      "Column: Standard pregnancy test\n",
      "Data Type: object\n",
      "Column: Smokes tobacco daily\n",
      "Data Type: object\n",
      "Column: Bone immobilization\n",
      "Data Type: object\n",
      "Column: Hypertension\n",
      "Data Type: object\n",
      "Column: palbociclib 100 MG Oral Capsule\n",
      "Data Type: object\n",
      "Column: Appendicitis\n",
      "Data Type: object\n",
      "Column: Digital examination of rectum\n",
      "Data Type: object\n",
      "Column: Localized  primary osteoarthritis of the hand\n",
      "Data Type: object\n",
      "Column: Seizure Count Cerebral Cortex Electroencephalogram (EEG)\n",
      "Data Type: object\n",
      "Column: Speech and language therapy regime (regime/therapy\n",
      "Data Type: object\n",
      "Column: Levothyroxine Sodium 0.075 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Platelet Count\n",
      "Data Type: float64\n",
      "Column: Polyp size greatest dimension by CAP cancer protocols\n",
      "Data Type: object\n",
      "Column: Azithromycin 250 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: state\n",
      "Data Type: object\n",
      "Column: Nitrite [Presence] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Closed fracture of hip\n",
      "Data Type: object\n",
      "Column: 3 ML Amiodarone hydrocholoride 50 MG/ML Prefilled Syringe\n",
      "Data Type: object\n",
      "Column: Not in labor force (finding)\n",
      "Data Type: object\n",
      "Column: Errin 28 Day Pack\n",
      "Data Type: object\n",
      "Column: Total Cholesterol\n",
      "Data Type: object\n",
      "Column: Amputation of right foot\n",
      "Data Type: object\n",
      "Column: 24 HR Donepezil hydrochloride 10 MG / Memantine hydrochloride 28 MG Extended Release Oral Capsule\n",
      "Data Type: object\n",
      "Column: Lenzilumab 200 MG IV\n",
      "Data Type: object\n",
      "Column: Magnesium [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Amoxicillin 250 MG Oral Capsule\n",
      "Data Type: object\n",
      "Column: Chloride\n",
      "Data Type: object\n",
      "Column: Cholelithiasis\n",
      "Data Type: object\n",
      "Column: Transformed migraine (disorder)\n",
      "Data Type: object\n",
      "Column: Creatinine\n",
      "Data Type: object\n",
      "Column: Knee X-ray\n",
      "Data Type: object\n",
      "Column: Ankle X-ray\n",
      "Data Type: object\n",
      "Column: Progesterone receptor Ag [Presence] in Breast cancer specimen by Immune stain\n",
      "Data Type: object\n",
      "Column: methadone hydrochloride 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Anion Gap\n",
      "Data Type: float64\n",
      "Column: Whiplash injury to neck\n",
      "Data Type: object\n",
      "Column: Spontaneous breech delivery\n",
      "Data Type: object\n",
      "Column: Influenza virus B Ag [Presence] in Nasopharynx by Rapid immunoassay\n",
      "Data Type: object\n",
      "Column: Implantation of left ventricular assist device (procedure)\n",
      "Data Type: object\n",
      "Column: Infection caused by Staphylococcus aureus\n",
      "Data Type: float64\n",
      "Column: Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)\n",
      "Data Type: object\n",
      "Column: Posttraumatic stress disorder\n",
      "Data Type: object\n",
      "Column: Amoxicillin 250 MG / Clavulanate 125 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Review of systems (procedure)\n",
      "Data Type: object\n",
      "Column: Hemoglobin [Mass/volume] in Blood\n",
      "Data Type: object\n",
      "Column: 120 ACTUAT Fluticasone propionate 0.044 MG/ACTUAT Metered Dose Inhaler\n",
      "Data Type: object\n",
      "Column: Iron [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Latex (substance)\n",
      "Data Type: object\n",
      "Column: Milnacipran hydrochloride 100 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: 10 ML Fluorouracil 50 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: 1 ML Epinephrine 1 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Meperidine Hydrochloride 50 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Fracture of forearm\n",
      "Data Type: object\n",
      "Column: Methotrexate 2.5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: 5 ML SUFentanil 0.05 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Drug overdose\n",
      "Data Type: object\n",
      "Column: Acute bronchitis (disorder)\n",
      "Data Type: object\n",
      "Column: C reactive protein [Mass/volume] in Serum or Plasma\n",
      "Data Type: float64\n",
      "Column: Chronic paralysis due to lesion of spinal cord\n",
      "Data Type: object\n",
      "Column: Parainfluenza virus 2 RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Catheter ablation of tissue of heart\n",
      "Data Type: object\n",
      "Column: Screening for domestic abuse (procedure)\n",
      "Data Type: object\n",
      "Column: Attempted suicide - suffocation\n",
      "Data Type: object\n",
      "Column: Fibrin D-dimer FEU [Mass/volume] in Platelet poor plasma\n",
      "Data Type: float64\n",
      "Column: Nausea (finding)\n",
      "Data Type: object\n",
      "Column: History of cardiac arrest (situation)\n",
      "Data Type: object\n",
      "Column: Excision of fallopian tube and surgical removal of ectopic pregnancy\n",
      "Data Type: object\n",
      "Column: Intravenous infusion (procedure)\n",
      "Data Type: float64\n",
      "Column: Levora 0.15/30 28 Day Pack\n",
      "Data Type: object\n",
      "Column: Erythrocyte distribution width [Entitic volume] by Automated count\n",
      "Data Type: object\n",
      "Column: Screening for occult blood in feces (procedure)\n",
      "Data Type: object\n",
      "Column: Left ventricular Ejection fraction\n",
      "Data Type: object\n",
      "Column: 100 ML Propofol 10 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: History of upper limb amputation (situation)\n",
      "Data Type: float64\n",
      "Column: Captopril 25 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Ultrasonography of abdomen  right upper quadrant and epigastrium\n",
      "Data Type: object\n",
      "Column: Extraction of wisdom tooth\n",
      "Data Type: object\n",
      "Column: NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Injector\n",
      "Data Type: object\n",
      "Column: Peripheral blood smear interpretation\n",
      "Data Type: object\n",
      "Column: Insertion of endotracheal tube (procedure)\n",
      "Data Type: object\n",
      "Column: Urine screening test for diabetes\n",
      "Data Type: object\n",
      "Column: History of lower limb amputation (situation)\n",
      "Data Type: object\n",
      "Column: Stroke\n",
      "Data Type: object\n",
      "Column: Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Screening for chromosomal aneuploidy in prenatal amniotic fluid\n",
      "Data Type: object\n",
      "Column: Proliferative diabetic retinopathy due to type II diabetes mellitus (disorder)\n",
      "Data Type: object\n",
      "Column: ethnic\n",
      "Data Type: object\n",
      "Column: Male infertility due to cystic fibrosis (disorder)\n",
      "Data Type: float64\n",
      "Column: exemestane 25 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Evaluation of psychiatric state of patient\n",
      "Data Type: object\n",
      "Column: Tubal pregnancy\n",
      "Data Type: object\n",
      "Column: label\n",
      "Data Type: int64\n",
      "Column: Episiotomy\n",
      "Data Type: object\n",
      "Column: 150 ML vancomycin 5 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Naltrexone hydrochloride 50 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Transfusion of plasma (procedure)\n",
      "Data Type: float64\n",
      "Column: Oxygen/Inspired gas setting [Volume Fraction] Ventilator\n",
      "Data Type: float64\n",
      "Column: Loratadine 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: 3 ML liraglutide 6 MG/ML Pen Injector\n",
      "Data Type: object\n",
      "Column: Cough (finding)\n",
      "Data Type: object\n",
      "Column: Alendronic acid 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: 4 ML norepinephrine 1 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Bacterial infectious disease (disorder)\n",
      "Data Type: object\n",
      "Column: Chronic pain\n",
      "Data Type: object\n",
      "Column: Surgical manipulation of shoulder joint\n",
      "Data Type: object\n",
      "Column: Hemodialysis (procedure)\n",
      "Data Type: object\n",
      "Column: Impacted molars\n",
      "Data Type: object\n",
      "Column: Lisinopril\n",
      "Data Type: object\n",
      "Column: Body temperature\n",
      "Data Type: object\n",
      "Column: Cystic Fibrosis\n",
      "Data Type: float64\n",
      "Column: Assessment of health and social care needs (procedure)\n",
      "Data Type: object\n",
      "Column: NDA020503 200 ACTUAT Albuterol 0.09 MG/ACTUAT Metered Dose Inhaler\n",
      "Data Type: object\n",
      "Column: HER2 [Presence] in Breast cancer specimen by Immune stain\n",
      "Data Type: object\n",
      "Column: Abuse-Deterrent 12 HR Oxycodone Hydrochloride 10 MG Extended Release Oral Tablet [Oxycontin]\n",
      "Data Type: object\n",
      "Column: Streptococcal sore throat (disorder)\n",
      "Data Type: object\n",
      "Column: Coronary artery bypass grafting\n",
      "Data Type: object\n",
      "Column: American house dust mite IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Penicillin V\n",
      "Data Type: object\n",
      "Column: Seizure disorder\n",
      "Data Type: object\n",
      "Column: Oxygen saturation in Arterial blood\n",
      "Data Type: object\n",
      "Column: Social isolation (finding)\n",
      "Data Type: object\n",
      "Column: Physical examination\n",
      "Data Type: object\n",
      "Column: Acquired coagulation disorder (disorder)\n",
      "Data Type: object\n",
      "Column: Glucose [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Suture open wound\n",
      "Data Type: object\n",
      "Column: Lupus erythematosus\n",
      "Data Type: object\n",
      "Column: Rectal polypectomy\n",
      "Data Type: object\n",
      "Column: Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Brachytherapy of breast (procedure)\n",
      "Data Type: object\n",
      "Column: Recurrent urinary tract infection\n",
      "Data Type: object\n",
      "Column: Memantine hydrochloride 2 MG/ML Oral Solution\n",
      "Data Type: object\n",
      "Column: Terfenadine 60 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Construction of diverting colostomy\n",
      "Data Type: object\n",
      "Column: Cytopathology procedure  preparation of smear  genital source\n",
      "Data Type: object\n",
      "Column: Notifications (procedure)\n",
      "Data Type: object\n",
      "Column: Coronary Heart Disease\n",
      "Data Type: object\n",
      "Column: Excision of sentinel lymph node (procedure)\n",
      "Data Type: object\n",
      "Column: Microbial culture (procedure)\n",
      "Data Type: object\n",
      "Column: Systolic Blood Pressure\n",
      "Data Type: object\n",
      "Column: Lactate [Mass/volume] in Blood\n",
      "Data Type: object\n",
      "Column: Secondary malignant neoplasm of colon\n",
      "Data Type: object\n",
      "Column: tramadol hydrochloride 50 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Body mass index 40+ - severely obese (finding)\n",
      "Data Type: float64\n",
      "Column: Latex IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Walnut IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Plain chest X-ray (procedure)\n",
      "Data Type: object\n",
      "Column: Percutaneous coronary intervention\n",
      "Data Type: object\n",
      "Column: Septic shock (disorder)\n",
      "Data Type: object\n",
      "Column: Fracture of clavicle\n",
      "Data Type: object\n",
      "Column: INR in Platelet poor plasma by Coagulation assay\n",
      "Data Type: float64\n",
      "Column: Nursing care/supplementary surveillance (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Thyroxine (T4) free [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Insertion of subcutaneous contraceptive\n",
      "Data Type: object\n",
      "Column: Protein [Presence] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Hydrochlorothiazide 25 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Social migrant (finding)\n",
      "Data Type: object\n",
      "Column: Clopidogrel 75 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Depression screening using Patient Health Questionnaire Two-Item score (procedure)\n",
      "Data Type: object\n",
      "Column: Injury of heart (disorder)\n",
      "Data Type: object\n",
      "Column: pneumococcal polysaccharide vaccine  23 valent\n",
      "Data Type: object\n",
      "Column: Simvastatin 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Natazia 28 Day Pack\n",
      "Data Type: object\n",
      "Column: Weaning from mechanically assisted ventilation (procedure)\n",
      "Data Type: object\n",
      "Column: Screening for drug abuse (procedure)\n",
      "Data Type: object\n",
      "Column: Bilirubin.total [Presence] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Mental health screening (procedure)\n",
      "Data Type: object\n",
      "Column: Colchicine 0.6 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Homeless (finding)\n",
      "Data Type: object\n",
      "Column: MCV\n",
      "Data Type: float64\n",
      "Column: Chill (finding)\n",
      "Data Type: object\n",
      "Column: Pyelonephritis\n",
      "Data Type: object\n",
      "Column: Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Acute respiratory distress syndrome (disorder)\n",
      "Data Type: object\n",
      "Column: Acetaminophen 500 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: MCHC [Mass/volume] by Automated count\n",
      "Data Type: object\n",
      "Column: Evaluation of uterine fundal height\n",
      "Data Type: object\n",
      "Column: Acute viral pharyngitis (disorder)\n",
      "Data Type: object\n",
      "Column: HIV status\n",
      "Data Type: object\n",
      "Column: Estrogen receptor Ag [Presence] in Breast cancer specimen by Immune stain\n",
      "Data Type: object\n",
      "Column: Assessment using Morse Fall Scale (procedure)\n",
      "Data Type: object\n",
      "Column: Combined chemotherapy and radiation therapy (procedure)\n",
      "Data Type: object\n",
      "Column: Depression screening (procedure)\n",
      "Data Type: object\n",
      "Column: Mestranol / Norethynodrel [Enovid]\n",
      "Data Type: object\n",
      "Column: Manual pelvic examination (procedure)\n",
      "Data Type: object\n",
      "Column: Physical therapy procedure (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Hemoglobin A1c/Hemoglobin.total in Blood\n",
      "Data Type: object\n",
      "Column: Third degree burn\n",
      "Data Type: object\n",
      "Column: Midazolam 1 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: 10 ML Fentanyl 0.05 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Ferritin [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Albumin\n",
      "Data Type: object\n",
      "Column: clonazePAM 0.25 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Colonoscopy\n",
      "Data Type: object\n",
      "Column: Hemoglobin.gastrointestinal [Presence] in Stool by Immunologic method\n",
      "Data Type: object\n",
      "Column: Basophils [#/volume] in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Rubella screening\n",
      "Data Type: object\n",
      "Column: Psychiatric follow-up\n",
      "Data Type: object\n",
      "Column: Respiratory syncytial virus RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Diabetic renal disease (disorder)\n",
      "Data Type: object\n",
      "Column: Cardiovascular stress testing (procedure)\n",
      "Data Type: object\n",
      "Column: Acetaminophen 300 MG / Codeine Phosphate 15 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Chronic kidney disease stage 2 (disorder)\n",
      "Data Type: object\n",
      "Column: Hyperglycemia (disorder)\n",
      "Data Type: object\n",
      "Column: Fever (finding)\n",
      "Data Type: object\n",
      "Column: Parainfluenza virus 3 RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Normal pregnancy\n",
      "Data Type: object\n",
      "Column: Clarity of Urine\n",
      "Data Type: object\n",
      "Column: cycloSPORINE  modified 100 MG Oral Capsule\n",
      "Data Type: object\n",
      "Column: Induced termination of pregnancy\n",
      "Data Type: object\n",
      "Column: 5 ML hyaluronidase-oysk 2000 UNT/ML / trastuzumab 120 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Severe anxiety (panic) (finding\n",
      "Data Type: object\n",
      "Column: Reports of violence in the environment (finding)\n",
      "Data Type: object\n",
      "Column: Cat dander IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Monitoring of patient (regime/therapy)\n",
      "Data Type: object\n",
      "Column: buprenorphine 2 MG / naloxone 0.5 MG Sublingual Tablet\n",
      "Data Type: object\n",
      "Column: Rheumatoid arthritis\n",
      "Data Type: object\n",
      "Column: Suspected COVID-19\n",
      "Data Type: object\n",
      "Column: letrozole 2.5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Insertion of intrauterine contraceptive device\n",
      "Data Type: object\n",
      "Column: Sweat Test\n",
      "Data Type: float64\n",
      "Column: Instrumental delivery\n",
      "Data Type: object\n",
      "Column: Antenatal RhD antibody screening\n",
      "Data Type: object\n",
      "Column: Glucose\n",
      "Data Type: object\n",
      "Column: Percutaneous mechanical thrombectomy of portal vein using fluoroscopic guidance\n",
      "Data Type: object\n",
      "Column: Replacement of contraceptive intrauterine device\n",
      "Data Type: object\n",
      "Column: Low Density Lipoprotein Cholesterol\n",
      "Data Type: object\n",
      "Column: Seasonique 91 Day Pack\n",
      "Data Type: object\n",
      "Column: Aspirin 81 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Headache (finding)\n",
      "Data Type: object\n",
      "Column: Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Tacrine 10 MG Oral Capsule\n",
      "Data Type: object\n",
      "Column: Medical induction of labor\n",
      "Data Type: object\n",
      "Column: Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method\n",
      "Data Type: object\n",
      "Column: Isoflurane 999 MG/ML Inhalant Solution\n",
      "Data Type: object\n",
      "Column: Upper arm X-ray\n",
      "Data Type: object\n",
      "Column: Heart failure education (procedure)\n",
      "Data Type: object\n",
      "Column: Wheezing (finding)\n",
      "Data Type: object\n",
      "Column: duloxetine 20 MG Delayed Release Oral Capsule\n",
      "Data Type: object\n",
      "Column: Response to cancer treatment\n",
      "Data Type: object\n",
      "Column: Ketones [Mass/volume] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: 20 ML tocilizumab 20 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Verapamil Hydrochloride 40 MG\n",
      "Data Type: object\n",
      "Column: Human metapneumovirus RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Liletta 52 MG Intrauterine System\n",
      "Data Type: object\n",
      "Column: race\n",
      "Data Type: object\n",
      "Column: Penicillin V Potassium 500 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Vomiting symptom (finding)\n",
      "Data Type: object\n",
      "Column: 0.67 ML anakinra 149 MG/ML Prefilled Syringe\n",
      "Data Type: float64\n",
      "Column: Escherichia coli urinary tract infection\n",
      "Data Type: object\n",
      "Column: Chemotherapy (procedure)\n",
      "Data Type: object\n",
      "Column: Radiation oncology AND/OR radiotherapy (procedure)\n",
      "Data Type: object\n",
      "Column: Procalcitonin [Mass/volume] in Serum or Plasma\n",
      "Data Type: float64\n",
      "Column: Doxycycline Monohydrate 100 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Ketones [Presence] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Passive conjunctival congestion (finding)\n",
      "Data Type: object\n",
      "Column: Oxygen Therapy\n",
      "Data Type: object\n",
      "Column: Estrostep Fe 28 Day Pack\n",
      "Data Type: object\n",
      "Column: Blood typing  RH typing\n",
      "Data Type: object\n",
      "Column: Anemia (disorder)\n",
      "Data Type: object\n",
      "Column: Throat culture (procedure)\n",
      "Data Type: object\n",
      "Column: Admission to burn unit\n",
      "Data Type: object\n",
      "Column: Pelvis X-ray\n",
      "Data Type: object\n",
      "Column: Carbon Dioxide\n",
      "Data Type: object\n",
      "Column: Creatine kinase [Enzymatic activity/volume] in Serum or Plasma\n",
      "Data Type: float64\n",
      "Column: canagliflozin 100 MG Oral Tablet\n",
      "Data Type: float64\n",
      "Column: Admission to neurosurgical department\n",
      "Data Type: object\n",
      "Column: Iron saturation [Mass Fraction] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Syphilis infection test\n",
      "Data Type: object\n",
      "Column: HER2 [Presence] in Breast cancer specimen by FISH\n",
      "Data Type: object\n",
      "Column: 100 ML zoledronic acid 0.04 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Prediabetes\n",
      "Data Type: object\n",
      "Column: Cow milk IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: albuterol 5 MG/ML Inhalation Solution\n",
      "Data Type: object\n",
      "Column: Respiratory rate\n",
      "Data Type: object\n",
      "Column: Intramuscular injection\n",
      "Data Type: object\n",
      "Column: Piperacillin 4000 MG / tazobactam 500 MG Injection\n",
      "Data Type: object\n",
      "Column: Polyp of colon\n",
      "Data Type: object\n",
      "Column: Appendectomy\n",
      "Data Type: object\n",
      "Column: Treatment status Cancer\n",
      "Data Type: object\n",
      "Column: Concussion with no loss of consciousness\n",
      "Data Type: object\n",
      "Column: Aztreonam 2000 MG Injection\n",
      "Data Type: object\n",
      "Column: Acute deep venous thrombosis (disorder)\n",
      "Data Type: object\n",
      "Column: Hep A  adult\n",
      "Data Type: object\n",
      "Column: Pulmonary emphysema (disorder)\n",
      "Data Type: object\n",
      "Column: Idiopathic atrophic hypothyroidism\n",
      "Data Type: object\n",
      "Column: diphenhydrAMINE Hydrochloride 25 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Hematocrit [Volume Fraction] of Blood by Automated count\n",
      "Data Type: object\n",
      "Column: Hospice care (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Brief general examination (procedure)\n",
      "Data Type: object\n",
      "Column: Mammography (procedure)\n",
      "Data Type: object\n",
      "Column: Total Bilirubin (Elevated)\n",
      "Data Type: float64\n",
      "Column: 10 ML Pamidronate Disodium 3 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: History of disarticulation at wrist (situation)\n",
      "Data Type: float64\n",
      "Column: DXA [T-score] Bone density\n",
      "Data Type: object\n",
      "Column: Loss of taste (finding)\n",
      "Data Type: object\n",
      "Column: Carboplatin 10 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: Fexofenadine hydrochloride 60 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Laparoscopic Removal of Gall Bladder\n",
      "Data Type: object\n",
      "Column: FEV1/FVC\n",
      "Data Type: object\n",
      "Column: Are you covered by health insurance or some other kind of health care plan [PhenX]\n",
      "Data Type: object\n",
      "Column: Acetaminophen 300 MG / Hydrocodone Bitartrate 5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Blindness due to type 2 diabetes mellitus (disorder)\n",
      "Data Type: object\n",
      "Column: Childbirth\n",
      "Data Type: object\n",
      "Column: Prostate specific Ag [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Allergy screening test\n",
      "Data Type: object\n",
      "Column: Sputum finding (finding)\n",
      "Data Type: object\n",
      "Column: Wheat IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Carbamazepine[Tegretol]\n",
      "Data Type: object\n",
      "Column: Pathological fracture due to osteoporosis (disorder)\n",
      "Data Type: object\n",
      "Column: 12 HR Hydrocodone Bitartrate 10 MG Extended Release Oral Capsule\n",
      "Data Type: object\n",
      "Column: Admit to ICU (procedure)\n",
      "Data Type: object\n",
      "Column: Chronic low back pain (finding)\n",
      "Data Type: object\n",
      "Column: Hypertriglyceridemia (disorder)\n",
      "Data Type: object\n",
      "Column: Donepezil hydrochloride 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Proteinuria due to type 2 diabetes mellitus (disorder)\n",
      "Data Type: object\n",
      "Column: Prothrombin time (PT)\n",
      "Data Type: float64\n",
      "Column: Removal of subcutaneous contraceptive\n",
      "Data Type: object\n",
      "Column: NITROFURANTOIN  MACROCRYSTALS 50 MG Oral Capsule\n",
      "Data Type: object\n",
      "Column: Monocytes/100 leukocytes in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Suicide risk assessment (procedure)\n",
      "Data Type: object\n",
      "Column: Rhinovirus RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Concussion injury of brain\n",
      "Data Type: object\n",
      "Column: Diabetes from Cystic Fibrosis\n",
      "Data Type: float64\n",
      "Column: Leukocyte esterase [Presence] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Urine protein test\n",
      "Data Type: object\n",
      "Column: cetirizine hydrochloride 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Injury of kidney (disorder)\n",
      "Data Type: object\n",
      "Column: marital\n",
      "Data Type: object\n",
      "Column: 0.25 ML Leuprolide Acetate 30 MG/ML Prefilled Syringe\n",
      "Data Type: object\n",
      "Column: Sepsis caused by Pseudomonas (disorder)\n",
      "Data Type: float64\n",
      "Column: Peanut IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Urea nitrogen [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Gonorrhea infection test\n",
      "Data Type: object\n",
      "Column: 24hr nicotine transdermal patch\n",
      "Data Type: object\n",
      "Column: Oxygen administration by mask (procedure)\n",
      "Data Type: object\n",
      "Column: Fibromyalgia (disorder)\n",
      "Data Type: object\n",
      "Column: MCV [Entitic volume] by Automated count\n",
      "Data Type: object\n",
      "Column: Part-time employment (finding)\n",
      "Data Type: object\n",
      "Column: Primary malignant neoplasm of colon\n",
      "Data Type: object\n",
      "Column: Calcium [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: 24 HR Metformin hydrochloride 500 MG Extended Release Oral Tablet\n",
      "Data Type: object\n",
      "Column: Intravenous injection (procedure)\n",
      "Data Type: object\n",
      "Column: Acetaminophen 21.7 MG/ML / Dextromethorphan Hydrobromide 1 MG/ML / doxylamine succinate 0.417 MG/ML Oral Solution\n",
      "Data Type: object\n",
      "Column: Heart rate\n",
      "Data Type: object\n",
      "Column: 100 ML Epirubicin Hydrochloride 2 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Monocytes [#/volume] in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: scc\n",
      "Data Type: int64\n",
      "Column: gender\n",
      "Data Type: object\n",
      "Column: Atorvastatin 80 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Cefuroxime 250 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Tamoxifen 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: At risk for suicide (finding)\n",
      "Data Type: object\n",
      "Column: Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "Data Type: object\n",
      "Column: Atrial Fibrillation\n",
      "Data Type: object\n",
      "Column: Fatigue (finding)\n",
      "Data Type: object\n",
      "Column: Intracavitary brachytherapy (procedure)\n",
      "Data Type: object\n",
      "Column: Acetaminophen 325 MG / HYDROcodone Bitartrate 7.5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Neutrophils/100 leukocytes in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Preeclampsia\n",
      "Data Type: object\n",
      "Column: desflurane 1000 MG/ML Inhalation Solution\n",
      "Data Type: object\n",
      "Column: Assessment using Alcohol Use Disorders Identification Test - Consumption (procedure)\n",
      "Data Type: object\n",
      "Column: Depression screening using Patient Health Questionnaire Nine Item score (procedure)\n",
      "Data Type: object\n",
      "Column: Electrical cardioversion\n",
      "Data Type: object\n",
      "Column: Diabetic retinopathy associated with type II diabetes mellitus (disorder)\n",
      "Data Type: object\n",
      "Column: Lymphocytes/100 leukocytes in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Total knee replacement\n",
      "Data Type: object\n",
      "Column: Erythrocytes [#/volume] in Blood by Automated count\n",
      "Data Type: object\n",
      "Column: Red Blood Cell\n",
      "Data Type: float64\n",
      "Column: 10 ML Furosemide 10 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Rocuronium bromide 10 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: Shrimp IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Penicillin G 375 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: Epilepsy\n",
      "Data Type: object\n",
      "Column: First degree burn\n",
      "Data Type: object\n",
      "Column: Laceration of hand\n",
      "Data Type: object\n",
      "Column: Mental health Telehealth Note\n",
      "Data Type: float64\n",
      "Column: Victim of intimate partner abuse (finding)\n",
      "Data Type: object\n",
      "Column: Pneumonia (disorder)\n",
      "Data Type: object\n",
      "Column: Urea Nitrogen\n",
      "Data Type: object\n",
      "Column: Transfer to stepdown unit (procedure)\n",
      "Data Type: object\n",
      "Column: Osteoarthritis of hip\n",
      "Data Type: object\n",
      "Column: MCH [Entitic mass] by Automated count\n",
      "Data Type: object\n",
      "Column: Atropine Sulfate 1 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: Chlamydia antigen test\n",
      "Data Type: object\n",
      "Column: Acute bacterial sinusitis (disorder)\n",
      "Data Type: object\n",
      "Column: chloroquine phosphate 500 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Miscarriage in first trimester\n",
      "Data Type: object\n",
      "Column: Potassium\n",
      "Data Type: object\n",
      "Column: Microalbuminuria due to type 2 diabetes mellitus (disorder)\n",
      "Data Type: object\n",
      "Column: Platelet mean volume [Entitic volume] in Blood by Automated count\n",
      "Data Type: object\n",
      "Column: Pre-discharge assessment (procedure)\n",
      "Data Type: object\n",
      "Column: Biopsy of breast (procedure)\n",
      "Data Type: object\n",
      "Column: Acute Cholecystitis\n",
      "Data Type: object\n",
      "Column: predniSONE 20 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Eosinophils [#/volume] in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Tobacco smoking status NHIS\n",
      "Data Type: object\n",
      "Column: Acetaminophen 325 MG / oxyCODONE Hydrochloride 2.5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Acetaminophen 325 MG / oxyCODONE Hydrochloride 5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Hypoxemia (disorder)\n",
      "Data Type: object\n",
      "Column: Sepsis caused by virus (disorder)\n",
      "Data Type: object\n",
      "Column: 10 ML Doxorubicin Hydrochloride 2 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Tree pollen (substance)\n",
      "Data Type: object\n",
      "Column: Sodium\n",
      "Data Type: object\n",
      "Column: NT-proBNP\n",
      "Data Type: object\n",
      "Column: Streptococcus pneumoniae group B antigen test\n",
      "Data Type: object\n",
      "Column: Suicidal deliberate poisoning\n",
      "Data Type: float64\n",
      "Column: RBC Distribution Width\n",
      "Data Type: float64\n",
      "Column: Diabetes\n",
      "Data Type: object\n",
      "Column: Hepatitis C antibody test\n",
      "Data Type: object\n",
      "Column: Basophils/100 leukocytes in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Surgical manipulation of joint of knee\n",
      "Data Type: object\n",
      "Column: Cognitive and behavioral therapy (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Hemoglobin / Hematocrit / Platelet count\n",
      "Data Type: object\n",
      "Column: Human immunodeficiency virus antigen test\n",
      "Data Type: object\n",
      "Column: 1 ML medroxyPROGESTERone acetate 150 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: 1 ML Morphine Sulfate 5 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Assessment of anxiety (procedure)\n",
      "Data Type: object\n",
      "Column: Pancreatin 600 MG Oral Tablet\n",
      "Data Type: float64\n",
      "Column: Concussion with loss of consciousness\n",
      "Data Type: object\n",
      "Column: Sodium Chloride 9 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: Hemoptysis (finding)\n",
      "Data Type: object\n",
      "Column: Sore throat symptom (finding)\n",
      "Data Type: object\n",
      "Column: Body mass index 30+ - obesity (finding)\n",
      "Data Type: object\n",
      "Column: Acute pulmonary embolism (disorder)\n",
      "Data Type: object\n",
      "Column: Movement therapy (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Resuscitation using intravenous fluid (procedure)\n",
      "Data Type: object\n",
      "Column: Vasectomy\n",
      "Data Type: object\n",
      "Column: Allopurinol 100 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Protein [Mass/volume] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Leukocytes [#/volume] in Blood by Automated count\n",
      "Data Type: object\n",
      "Column: Measurement of Varicella-zoster virus antibody\n",
      "Data Type: object\n",
      "Column: Augmentation of labor\n",
      "Data Type: object\n",
      "Column: Abuse-Deterrent 12 HR Oxycodone Hydrochloride 15 MG Extended Release Oral Tablet\n",
      "Data Type: object\n",
      "Column: Diastolic Blood Pressure\n",
      "Data Type: object\n",
      "Column: Transplant of lung (procedure)\n",
      "Data Type: object\n",
      "Column: 20 Gene mutation test\n",
      "Data Type: float64\n",
      "Column: Biopsy of colon\n",
      "Data Type: object\n",
      "Column: Misuses drugs (finding)\n",
      "Data Type: object\n",
      "Column: zoster\n",
      "Data Type: object\n",
      "Column: Recurrent rectal polyp\n",
      "Data Type: object\n",
      "Column: Gram positive blood culture panel by Probe in Positive blood culture\n",
      "Data Type: object\n",
      "Column: Asthma screening\n",
      "Data Type: object\n",
      "Column: Dyspnea (finding)\n",
      "Data Type: object\n",
      "Column: Oxygen [Partial pressure] in Arterial blood\n",
      "Data Type: float64\n",
      "Column: Metastasis from malignant tumor of prostate (disorder)\n",
      "Data Type: object\n",
      "Column: Influenza virus A Ag [Presence] in Nasopharynx by Rapid immunoassay\n",
      "Data Type: object\n",
      "Column: pH of Arterial blood\n",
      "Data Type: float64\n",
      "Column: Home health aide service (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Specific gravity of Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Ibuprofen 200 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Oxygen Saturation\n",
      "Data Type: object\n",
      "Column: Injury of anterior cruciate ligament\n",
      "Data Type: object\n",
      "Column: ado-trastuzumab emtansine 100 MG Injection\n",
      "Data Type: object\n",
      "Column: Mold (organism)\n",
      "Data Type: object\n",
      "Column: Professional / ancillary services care (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Renal dialysis (procedure)\n",
      "Data Type: object\n",
      "Column: Psychosocial care (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Initial patient assessment (procedure)\n",
      "Data Type: object\n",
      "Column: Sepsis (disorder)\n",
      "Data Type: object\n",
      "Column: House dust mite (organism)\n",
      "Data Type: object\n",
      "Column: SARS-COV-2 (COVID-19) vaccine  mRNA  spike protein  LNP  preservative free  30 mcg/0.3mL dose\n",
      "Data Type: object\n",
      "Column: Hydroxychloroquine Sulfate 200 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Injury of medial collateral ligament of knee\n",
      "Data Type: object\n",
      "Column: Pulmozyme (Dornase Alfa)\n",
      "Data Type: float64\n",
      "Column: 12 HR Cefaclor 500 MG Extended Release Oral Tablet\n",
      "Data Type: object\n",
      "Column: Amputation of right leg\n",
      "Data Type: object\n",
      "Column: Comprehensive interview and evaluation (procedure)\n",
      "Data Type: object\n",
      "Column: Discharge from skilled nursing facility (procedure)\n",
      "Data Type: object\n",
      "Column: Chronic kidney disease stage 3 (disorder)\n",
      "Data Type: object\n",
      "Column: Referral to home health care service (procedure)\n",
      "Data Type: object\n",
      "Column: baricitinib 2 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Total replacement of hip\n",
      "Data Type: float64\n",
      "Column: Radiation therapy care (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Lung Transplant\n",
      "Data Type: float64\n",
      "Column: Neutrophils [#/volume] in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Nitroglycerin 0.4 MG/ACTUAT Mucosal Spray\n",
      "Data Type: object\n",
      "Column: Occupational therapy (regime/therapy)\n",
      "Data Type: object\n",
      "Column: insulin human  isophane 70 UNT/ML / Regular Insulin  Human 30 UNT/ML Injectable Suspension [Humulin]\n",
      "Data Type: object\n",
      "Column: Overlapping malignant neoplasm of colon\n",
      "Data Type: object\n",
      "Column: Familial Alzheimer's disease of early onset (disorder)\n",
      "Data Type: object\n",
      "Column: Open Removal of Gall Bladder\n",
      "Data Type: object\n",
      "Column: Paclitaxel 100 MG Injection\n",
      "Data Type: object\n",
      "Column: 1 ML denosumab 60 MG/ML Prefilled Syringe\n",
      "Data Type: object\n",
      "Column: Premature birth of newborn\n",
      "Data Type: object\n",
      "Column: Vitamin B 12 5 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: Postoperative procedure education (procedure)\n",
      "Data Type: object\n",
      "Column: age\n",
      "Data Type: object\n",
      "Column: Lymphocytes [#/volume] in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Bilirubin.total [Mass/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Hyperlipidemia\n",
      "Data Type: object\n",
      "Column: Mental health Outpatient Note\n",
      "Data Type: float64\n",
      "Column: Has a criminal record (finding)\n",
      "Data Type: object\n",
      "Column: Malignant tumor of colon\n",
      "Data Type: object\n",
      "Column: Assessment using New York Heart Association Classification (procedure)\n",
      "Data Type: object\n",
      "Column: Nasal sinus endoscopy (procedure)\n",
      "Data Type: object\n",
      "Column: Lung volume reduction surgery (procedure)\n",
      "Data Type: object\n",
      "Column: Auscultation of the fetal heart\n",
      "Data Type: object\n",
      "Column: Development of individualized plan of care (procedure)\n",
      "Data Type: object\n",
      "Column: Galantamine 4 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Insulin Lispro 100 UNT/ML Injectable Solution [Humalog]\n",
      "Data Type: object\n",
      "Column: Malignant neoplasm of breast (disorder)\n",
      "Data Type: object\n",
      "Column: Glomerular filtration rate/1.73 sq M.predicted\n",
      "Data Type: object\n",
      "Column: Leucovorin 100 MG Injection\n",
      "Data Type: object\n",
      "Column: Ibuprofen 400 MG Oral Tablet [Ibu]\n",
      "Data Type: object\n",
      "Column: Acetaminophen 325 MG Oral Tablet [Tylenol]\n",
      "Data Type: object\n",
      "Column: Full-time employment (finding)\n",
      "Data Type: object\n",
      "Column: Verzenio 100 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Soybean IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: neratinib 40 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Diazepam 5 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: Mean blood pressure\n",
      "Data Type: object\n",
      "Column: Mammogram - symptomatic (procedure)\n",
      "Data Type: object\n",
      "Column: Mirena 52 MG Intrauterine System\n",
      "Data Type: object\n",
      "Column: Thyrotropin [Units/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Carcinoma in situ of prostate (disorder)\n",
      "Data Type: object\n",
      "Column: 60 ACTUAT Fluticasone propionate 0.25 MG/ACTUAT / salmeterol 0.05 MG/ACTUAT Dry Powder Inhaler\n",
      "Data Type: object\n",
      "Column: Ortho Tri-Cyclen 28 Day Pack\n",
      "Data Type: object\n",
      "Column: Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction\n",
      "Data Type: float64\n",
      "Column: Bilateral tubal ligation\n",
      "Data Type: object\n",
      "Column: Medication Reconciliation (procedure)\n",
      "Data Type: object\n",
      "Column: Second degree burn\n",
      "Data Type: object\n",
      "Column: Globulin [Mass/volume] in Serum by calculation\n",
      "Data Type: object\n",
      "Column: Artificial respiration (procedure)\n",
      "Data Type: object\n",
      "Column: Chronic congestive heart failure (disorder)\n",
      "Data Type: object\n",
      "Column: Removal of endotracheal tube (procedure)\n",
      "Data Type: object\n",
      "Column: 10 ML Alfentanil 0.5 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Excision of axillary lymph node (procedure)\n",
      "Data Type: object\n",
      "Column: Warfarin Sodium 5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: SARS-COV-2 (COVID-19) vaccine  mRNA  spike protein  LNP  preservative free  100 mcg/0.5mL dose\n",
      "Data Type: object\n",
      "Column: 1 ML DOCEtaxel 20 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Phenazopyridine hydrochloride 100 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Codfish IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Chronic sinusitis (disorder)\n",
      "Data Type: object\n",
      "Column: Care regimes assessment (procedure)\n",
      "Data Type: object\n",
      "Column: Ampicillin 100 MG/ML Injectable Solution\n",
      "Data Type: object\n",
      "Column: lapatinib 250 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: losartan potassium 25 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Attempted suicide - cut/stab\n",
      "Data Type: object\n",
      "Column: Influenza  seasonal  injectable  preservative free\n",
      "Data Type: object\n",
      "Column: Drugs of abuse 5 panel - Urine by Screen method\n",
      "Data Type: object\n",
      "Column: Electrocardiographic procedure\n",
      "Data Type: object\n",
      "Column: Counseling for termination of pregnancy\n",
      "Data Type: object\n",
      "Column: Heart failure (disorder)\n",
      "Data Type: object\n",
      "Column: White oak IgE Ab in Serum\n",
      "Data Type: object\n",
      "Column: Echocardiography (procedure)\n",
      "Data Type: object\n",
      "Column: 10 ML oxaliplatin 5 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Injury of tendon of the rotator cuff of shoulder\n",
      "Data Type: object\n",
      "Column: Partial resection of colon\n",
      "Data Type: object\n",
      "Column: Non-low risk pregnancy\n",
      "Data Type: object\n",
      "Column: Ultrasonography of bilateral breasts (procedure)\n",
      "Data Type: object\n",
      "Column: Acetaminophen/Hydrocodone\n",
      "Data Type: object\n",
      "Column: Yaz 28 Day Pack\n",
      "Data Type: object\n",
      "Column: Admission to orthopedic department\n",
      "Data Type: object\n",
      "Column: History AND physical examination (procedure)\n",
      "Data Type: object\n",
      "Column: Bullet wound\n",
      "Data Type: object\n",
      "Column: Td (adult) preservative free\n",
      "Data Type: object\n",
      "Column: Chloride [Moles/volume] in Serum or Plasma\n",
      "Data Type: object\n",
      "Column: Interleukin 6 [Mass/volume] in Serum or Plasma\n",
      "Data Type: float64\n",
      "Column: Muscle pain (finding)\n",
      "Data Type: object\n",
      "Column: Tumor marker Cancer\n",
      "Data Type: object\n",
      "Column: Laceration of forearm\n",
      "Data Type: object\n",
      "Column: Sepsis caused by Staphylococcus aureus\n",
      "Data Type: float64\n",
      "Column: Myocardial Infarction\n",
      "Data Type: object\n",
      "Column: Sputum examination (procedure)\n",
      "Data Type: object\n",
      "Column: Otitis media\n",
      "Data Type: object\n",
      "Column: Bilirubin.total [Mass/volume] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Pain severity - 0-10 verbal numeric rating [Score] - Reported\n",
      "Data Type: object\n",
      "Column: Neoplasm of prostate\n",
      "Data Type: object\n",
      "Column: Gout\n",
      "Data Type: object\n",
      "Column: Vancomycin 50 MG/ML Injectable Solution\n",
      "Data Type: float64\n",
      "Column: Naproxen sodium 220 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Glucose [Presence] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Magnetic resonance imaging of breast (procedure)\n",
      "Data Type: object\n",
      "Column: 2 ML Ondansetron 2 MG/ML Injection\n",
      "Data Type: object\n",
      "Column: Social case work (regime/therapy)\n",
      "Data Type: object\n",
      "Column: Carbon dioxide [Partial pressure] in Arterial blood\n",
      "Data Type: float64\n",
      "Column: Fetus with unknown complication\n",
      "Data Type: object\n",
      "Column: US Guidance for biopsy of Prostate\n",
      "Data Type: object\n",
      "Column: Fracture of the vertebral column with spinal cord injury\n",
      "Data Type: object\n",
      "Column: Oral Glucose Tolerance Test\n",
      "Data Type: float64\n",
      "Column: Digoxin 0.125 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Metabolic syndrome X (disorder)\n",
      "Data Type: object\n",
      "Column: Neuropathy due to type 2 diabetes mellitus (disorder)\n",
      "Data Type: object\n",
      "Column: Sprain of wrist\n",
      "Data Type: object\n",
      "Column: 0.4 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe\n",
      "Data Type: object\n",
      "Column: 1 ML Enoxaparin sodium 150 MG/ML Prefilled Syringe\n",
      "Data Type: object\n",
      "Column: Fracture of ankle\n",
      "Data Type: object\n",
      "Column: Donepezil hydrochloride 23 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Removal of intrauterine device\n",
      "Data Type: object\n",
      "Column: Eosinophils/100 leukocytes in Blood by Automated count\n",
      "Data Type: float64\n",
      "Column: Transplantation of heart (procedure)\n",
      "Data Type: object\n",
      "Column: vancomycin 1000 MG Injection\n",
      "Data Type: object\n",
      "Column: piperacillin 2000 MG / tazobactam 250 MG Injection\n",
      "Data Type: object\n",
      "Column: Opioid abuse (disorder)\n",
      "Data Type: object\n",
      "Column: amLODIPine 2.5 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Leronlimab 700 MG Injection\n",
      "Data Type: object\n",
      "Column: Estrogen+Progesterone receptor Ag [Presence] in Tissue by Immune stain\n",
      "Data Type: object\n",
      "Column: SARS-CoV-2 RNA Pnl Resp NAA+probe\n",
      "Data Type: object\n",
      "Column: remdesivir 100 MG Injection\n",
      "Data Type: object\n",
      "Column: 1 ML Epoetin Alfa 4000 UNT/ML Injection [Epogen]\n",
      "Data Type: object\n",
      "Column: Trinessa 28 Day Pack\n",
      "Data Type: object\n",
      "Column: 1 ML heparin sodium  porcine 5000 UNT/ML Injection\n",
      "Data Type: object\n",
      "Column: Hemoglobin [Presence] in Urine by Test strip\n",
      "Data Type: object\n",
      "Column: Astemizole 10 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Sentinel lymph node biopsy (procedure)\n",
      "Data Type: object\n",
      "Column: 5 ML fulvestrant 50 MG/ML Prefilled Syringe\n",
      "Data Type: object\n",
      "Column: Hematocrit [Volume Fraction] of Blood\n",
      "Data Type: object\n",
      "Column: Acetaminophen 325 MG / Oxycodone Hydrochloride 10 MG Oral Tablet [Percocet]\n",
      "Data Type: object\n",
      "Column: Camila 28 Day Pack\n",
      "Data Type: object\n",
      "Column: History of appendectomy\n",
      "Data Type: object\n",
      "Column: remifentanil 2 MG Injection\n",
      "Data Type: object\n",
      "Column: Chlorpheniramine Maleate 4 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Fetal anatomy study\n",
      "Data Type: object\n",
      "Column: Bicarbonate [Moles/volume] in Arterial blood\n",
      "Data Type: float64\n",
      "Column: Screening mammography (procedure)\n",
      "Data Type: object\n",
      "Column: Sertraline 100 MG Oral Tablet\n",
      "Data Type: object\n",
      "Column: Transport problems (finding)\n",
      "Data Type: object\n",
      "Column: Sulfamethoxazole / Trimethoprim\n",
      "Data Type: object\n",
      "Column: Respiratory Disorders\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Heart and Cardiovascular Diseases\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Metabolic and Endocrine Disorders\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Neurological Disorders\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Orthopedic Injuries\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Mental Health\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Reproductive and Pregnancy\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Pain Relievers and Analesics\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Cardiovascular and Blood Pressure Medications\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Injection Medications\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Oral Medications\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Other Medications\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Therapies and Regimes\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Diagnostic Procedures\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Surgical Interventions\n",
      "Data Type: object\n",
      "Data Type: int64\n",
      "Column: Patient Care Management\n",
      "Data Type: object\n",
      "Data Type: int64\n"
     ]
    }
   ],
   "source": [
    "for col in df2.columns:\n",
    "    # print data type of column\n",
    "    print(f\"Column: {col}\") \n",
    "    print(f\"Data Type: {df2[col].dtype}\")\n",
    "    if col in categories:\n",
    "        # change data type to numeric\n",
    "        df2[col] = pd.to_numeric(df2[col], errors='coerce')\n",
    "        print(f\"Data Type: {df2[col].dtype}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dropping unnecessary columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Not important columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "unimportant_columns = ['race', 'ethnic', 'state', 'Social migrant (finding)', 'Lack of access to transportation (finding)', 'Transport problems (finding)', 'Full-time employment (finding)', 'Face mask (physical object)', 'Childbirth', 'Cesarean section', 'Normal pregnancy', 'Non-low risk pregnancy']\n",
    "df2.drop(columns=unimportant_columns, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Columns used in new features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "used_columns = respiratory_disorders_columns + heart_and_cardiovascular_diseases_columns + metabolic_and_endocrine_disorders_columns + neurological_disorders_columns + orthopedic_injuries_columns + mental_health_columns + reproductive_and_pregancy_columns + pain_relievers_and_analesics_columns + cardiovascular_and_blood_pressure_medications_columns + injection_medications_columns + oral_medications_columns + other_medications_columns + therapies_and_regimes_columns + diagnostic_procedures_columns + surgerical_interventions_columns + patient_care_management_columns\n",
    "df2.drop(columns=used_columns, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Columns with only null values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column                                                                                                  Null Values Percentage\n",
      "----------------------------------------------------------------------------------------------------  ------------------------\n",
      "Take blood sample                                                                                                  100\n",
      "Infection caused by Staphylococcus aureus                                                                          100\n",
      "History of upper limb amputation (situation)                                                                       100\n",
      "Cystic Fibrosis                                                                                                    100\n",
      "Body mass index 40+ - severely obese (finding)                                                                     100\n",
      "Sweat Test                                                                                                         100\n",
      "Incision of trachea (procedure)                                                                                    100\n",
      "canagliflozin 100 MG Oral Tablet                                                                                   100\n",
      "History of disarticulation at wrist (situation)                                                                    100\n",
      "Diabetes from Cystic Fibrosis                                                                                      100\n",
      "Sepsis caused by Pseudomonas (disorder)                                                                            100\n",
      "Suicidal deliberate poisoning                                                                                      100\n",
      "Pancreatin 600 MG Oral Tablet                                                                                      100\n",
      "20 Gene mutation test                                                                                              100\n",
      "Total replacement of hip                                                                                           100\n",
      "Lung Transplant                                                                                                    100\n",
      "Sepsis caused by Staphylococcus aureus                                                                             100\n",
      "Vancomycin 50 MG/ML Injectable Solution                                                                            100\n",
      "Oral Glucose Tolerance Test                                                                                        100\n",
      "Intravenous infusion (procedure)                                                                                   100\n",
      "0.67 ML anakinra 149 MG/ML Prefilled Syringe                                                                       100\n",
      "Bee venom (substance)                                                                                              100\n",
      "Intravenous antibiotic therapy                                                                                     100\n",
      "Sputum Culture                                                                                                     100\n",
      "Interleukin 6 [Mass/volume] in Serum or Plasma                                                                      99.9889\n",
      "Tumor marker Cancer                                                                                                 99.7679\n",
      "Thyroxine (T4) free [Mass/volume] in Serum or Plasma                                                                99.4695\n",
      "Thyrotropin [Units/volume] in Serum or Plasma                                                                       99.4695\n",
      "Bicarbonate [Moles/volume] in Arterial blood                                                                        98.6848\n",
      "Oxygen/Inspired gas setting [Volume Fraction] Ventilator                                                            98.6848\n",
      "Oxygen [Partial pressure] in Arterial blood                                                                         98.6848\n",
      "pH of Arterial blood                                                                                                98.6848\n",
      "Carbon dioxide [Partial pressure] in Arterial blood                                                                 98.6848\n",
      "Abuse Status [OMAHA]                                                                                                98.5301\n",
      "Housing status                                                                                                      98.5301\n",
      "HIV status                                                                                                          98.5301\n",
      "Are you covered by health insurance or some other kind of health care plan [PhenX]                                  98.5301\n",
      "Total Bilirubin (Elevated)                                                                                          98.3311\n",
      "Red Blood Cell                                                                                                      98.3311\n",
      "RBC Distribution Width                                                                                              98.3311\n",
      "MCV                                                                                                                 98.3311\n",
      "Platelet Count                                                                                                      98.3311\n",
      "Anion Gap                                                                                                           98.3311\n",
      "White Blood Cell (Elevated)                                                                                         98.3311\n",
      "Influenza virus B Ag [Presence] in Nasopharynx by Rapid immunoassay                                                 98.3201\n",
      "Influenza virus A Ag [Presence] in Nasopharynx by Rapid immunoassay                                                 98.3201\n",
      "Oxygen Saturation                                                                                                   98.2427\n",
      "Gram positive blood culture panel by Probe in Positive blood culture                                                98.2427\n",
      "Mean blood pressure                                                                                                 98.2427\n",
      "Lactate [Mass/volume] in Blood                                                                                      98.2427\n",
      "Capillary refill [Time] of Nail bed                                                                                 98.2427\n",
      "Estrogen+Progesterone receptor Ag [Presence] in Tissue by Immune stain                                              97.8117\n",
      "Prothrombin time (PT)                                                                                               97.1706\n",
      "Eosinophils/100 leukocytes in Blood by Automated count                                                              97.1706\n",
      "Procalcitonin [Mass/volume] in Serum or Plasma                                                                      97.1706\n",
      "Creatine kinase [Enzymatic activity/volume] in Serum or Plasma                                                      97.1706\n",
      "Basophils [#/volume] in Blood by Automated count                                                                    97.1706\n",
      "INR in Platelet poor plasma by Coagulation assay                                                                    97.1706\n",
      "Neutrophils/100 leukocytes in Blood by Automated count                                                              97.1706\n",
      "Monocytes/100 leukocytes in Blood by Automated count                                                                97.1706\n",
      "Monocytes [#/volume] in Blood by Automated count                                                                    97.1706\n",
      "Lymphocytes/100 leukocytes in Blood by Automated count                                                              97.1706\n",
      "Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction                97.1706\n",
      "Lymphocytes [#/volume] in Blood by Automated count                                                                  97.1706\n",
      "Eosinophils [#/volume] in Blood by Automated count                                                                  97.1706\n",
      "Neutrophils [#/volume] in Blood by Automated count                                                                  97.1706\n",
      "Basophils/100 leukocytes in Blood by Automated count                                                                97.1706\n",
      "C reactive protein [Mass/volume] in Serum or Plasma                                                                 97.1706\n",
      "Fibrin D-dimer FEU [Mass/volume] in Platelet poor plasma                                                            97.1706\n",
      "Treatment status Cancer                                                                                             96.7949\n",
      "Parainfluenza virus 2 RNA [Presence] in Respiratory specimen by NAA with probe detection                            96.7396\n",
      "Respiratory syncytial virus RNA [Presence] in Respiratory specimen by NAA with probe detection                      96.7396\n",
      "Human metapneumovirus RNA [Presence] in Respiratory specimen by NAA with probe detection                            96.7396\n",
      "Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection                             96.7396\n",
      "Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection                                96.7396\n",
      "Rhinovirus RNA [Presence] in Respiratory specimen by NAA with probe detection                                       96.7396\n",
      "Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection                            96.7396\n",
      "Parainfluenza virus 3 RNA [Presence] in Respiratory specimen by NAA with probe detection                            96.7396\n",
      "Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection                                96.7396\n",
      "American house dust mite IgE Ab in Serum                                                                            96.5517\n",
      "Latex IgE Ab in Serum                                                                                               96.5517\n",
      "Peanut IgE Ab in Serum                                                                                              96.5517\n",
      "White oak IgE Ab in Serum                                                                                           96.5517\n",
      "Egg white IgE Ab in Serum                                                                                           96.5517\n",
      "Shrimp IgE Ab in Serum                                                                                              96.5517\n",
      "Codfish IgE Ab in Serum                                                                                             96.5517\n",
      "Wheat IgE Ab in Serum                                                                                               96.5517\n",
      "Soybean IgE Ab in Serum                                                                                             96.5517\n",
      "Honey bee IgE Ab in Serum                                                                                           96.5517\n",
      "Cladosporium herbarum IgE Ab in Serum                                                                               96.5517\n",
      "Cow milk IgE Ab in Serum                                                                                            96.5517\n",
      "Common Ragweed IgE Ab in Serum                                                                                      96.5517\n",
      "Cat dander IgE Ab in Serum                                                                                          96.5517\n",
      "Walnut IgE Ab in Serum                                                                                              96.5517\n",
      "Stage group.clinical Cancer                                                                                         96.397\n",
      "HER2 [Presence] in Breast cancer specimen by Immune stain                                                           96.397\n",
      "Progesterone receptor Ag [Presence] in Breast cancer specimen by Immune stain                                       96.397\n",
      "Estrogen receptor Ag [Presence] in Breast cancer specimen by Immune stain                                           96.397\n",
      "HER2 [Presence] in Breast cancer specimen by FISH                                                                   96.397\n",
      "Response to cancer treatment                                                                                        96.2312\n",
      "Smokes tobacco daily                                                                                                96.1096\n",
      "SARS-CoV-2 RNA Pnl Resp NAA+probe                                                                                   95.0597\n",
      "FEV1/FVC                                                                                                            93.3245\n",
      "Drugs of abuse 5 panel - Urine by Screen method                                                                     89.2352\n",
      "DXA [T-score] Bone density                                                                                          88.7821\n",
      "Hematocrit [Volume Fraction] of Blood                                                                               87.2679\n",
      "Hemoglobin.gastrointestinal [Presence] in Stool by Immunologic method                                               85.1569\n",
      "Polyp size greatest dimension by CAP cancer protocols                                                               85.1569\n",
      "US Guidance for biopsy of Prostate                                                                                  81.1561\n",
      "Clarity of Urine                                                                                                    81.0234\n",
      "Bilirubin.total [Mass/volume] in Urine by Test strip                                                                81.0234\n",
      "Protein [Presence] in Urine by Test strip                                                                           81.0234\n",
      "Ketones [Presence] in Urine by Test strip                                                                           81.0234\n",
      "Color of Urine                                                                                                      81.0234\n",
      "Leukocyte esterase [Presence] in Urine by Test strip                                                                81.0234\n",
      "Nitrite [Presence] in Urine by Test strip                                                                           81.0234\n",
      "pH of Urine by Test strip                                                                                           81.0234\n",
      "Protein [Mass/volume] in Urine by Test strip                                                                        81.0234\n",
      "Ketones [Mass/volume] in Urine by Test strip                                                                        81.0234\n",
      "Glucose [Presence] in Urine by Test strip                                                                           81.0234\n",
      "Specific gravity of Urine by Test strip                                                                             81.0234\n",
      "Glucose [Mass/volume] in Urine by Test strip                                                                        81.0234\n",
      "Appearance of Urine                                                                                                 81.0234\n",
      "Hemoglobin [Presence] in Urine by Test strip                                                                        81.0234\n",
      "Microalbumin Creatinine Ratio                                                                                       77.542\n",
      "Prostate specific Ag [Mass/volume] in Serum or Plasma                                                               75.0663\n",
      "Functional capacity NYHA                                                                                            73.9943\n",
      "Objective assessment of cardiovascular disease NYHA                                                                 73.9943\n",
      "Left ventricular Ejection fraction                                                                                  73.519\n",
      "Iron saturation [Mass Fraction] in Serum or Plasma                                                                  73.4637\n",
      "Iron binding capacity [Mass/volume] in Serum or Plasma                                                              73.4637\n",
      "Magnesium [Mass/volume] in Serum or Plasma                                                                          73.4637\n",
      "NT-proBNP                                                                                                           73.4637\n",
      "Iron [Mass/volume] in Serum or Plasma                                                                               73.4637\n",
      "Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method                                      70.6897\n",
      "Ferritin [Mass/volume] in Serum or Plasma                                                                           70.6897\n",
      "Erythrocyte distribution width [Ratio] by Automated count                                                           70.6897\n",
      "Oxygen saturation in Arterial blood                                                                                 70.2697\n",
      "Chloride [Moles/volume] in Serum or Plasma                                                                          69.1866\n",
      "Glucose [Mass/volume] in Serum or Plasma                                                                            69.1866\n",
      "Carbon Dioxide                                                                                                      69.1866\n",
      "Urea nitrogen [Mass/volume] in Serum or Plasma                                                                      69.1866\n",
      "Calcium [Mass/volume] in Serum or Plasma                                                                            69.1866\n",
      "Globulin [Mass/volume] in Serum by calculation                                                                      61.0522\n",
      "Body temperature                                                                                                    48.4195\n",
      "Bilirubin.total [Mass/volume] in Serum or Plasma                                                                    39.0141\n",
      "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma                                                 38.141\n",
      "Albumin                                                                                                             38.141\n",
      "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma                                           38.141\n",
      "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma                                             38.141\n",
      "Glomerular filtration rate/1.73 sq M.predicted                                                                      29.1888\n",
      "Hemoglobin A1c/Hemoglobin.total in Blood                                                                            23.1101\n",
      "Chloride                                                                                                            13.2294\n",
      "Glucose                                                                                                             13.2294\n",
      "Urea Nitrogen                                                                                                       13.2294\n",
      "Calcium                                                                                                             13.2294\n",
      "Sodium                                                                                                              12.1905\n",
      "Creatinine                                                                                                          12.1905\n",
      "Potassium                                                                                                           12.1905\n",
      "Erythrocyte distribution width [Entitic volume] by Automated count                                                   0.939434\n",
      "Platelet mean volume [Entitic volume] in Blood by Automated count                                                    0.939434\n",
      "Hematocrit [Volume Fraction] of Blood by Automated count                                                             0.862069\n",
      "Platelets [#/volume] in Blood by Automated count                                                                     0.851017\n",
      "MCV [Entitic volume] by Automated count                                                                              0.851017\n",
      "MCHC [Mass/volume] by Automated count                                                                                0.851017\n",
      "Erythrocytes [#/volume] in Blood by Automated count                                                                  0.851017\n",
      "MCH [Entitic mass] by Automated count                                                                                0.851017\n",
      "Leukocytes [#/volume] in Blood by Automated count                                                                    0.851017\n",
      "Hemoglobin [Mass/volume] in Blood                                                                                    0.806808\n",
      "Heart rate                                                                                                           0.0552608\n",
      "Diastolic Blood Pressure                                                                                             0.0552608\n",
      "Body Mass Index                                                                                                      0.0552608\n",
      "Systolic Blood Pressure                                                                                              0.0552608\n",
      "Respiratory rate                                                                                                     0.0552608\n",
      "Total Cholesterol                                                                                                    0.0552608\n",
      "Low Density Lipoprotein Cholesterol                                                                                  0.0552608\n",
      "High Density Lipoprotein Cholesterol                                                                                 0.0552608\n",
      "Tobacco smoking status NHIS                                                                                          0.0552608\n",
      "Triglycerides                                                                                                        0.0552608\n",
      "Pain severity - 0-10 verbal numeric rating [Score] - Reported                                                        0.0442087\n"
     ]
    }
   ],
   "source": [
    "null_values = df2.isnull().sum()\n",
    "null_values = null_values[null_values > 0]\n",
    "null_values = 100 * null_values / len(df2)\n",
    "null_values = null_values.sort_values(ascending=False)\n",
    "null_values_df = null_values.reset_index()\n",
    "null_values_df.columns = ['Column', 'Null Values Percentage']\n",
    "print(tabulate(null_values_df, headers='keys', tablefmt='simple', showindex=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Take blood sample', 'Incision of trachea (procedure)',\n",
      "       'Intravenous antibiotic therapy', 'Sputum Culture',\n",
      "       'Bee venom (substance)', 'Infection caused by Staphylococcus aureus',\n",
      "       'Intravenous infusion (procedure)',\n",
      "       'History of upper limb amputation (situation)', 'Cystic Fibrosis',\n",
      "       'Body mass index 40+ - severely obese (finding)', 'Sweat Test',\n",
      "       '0.67 ML anakinra 149 MG/ML Prefilled Syringe',\n",
      "       'canagliflozin 100 MG Oral Tablet',\n",
      "       'History of disarticulation at wrist (situation)',\n",
      "       'Diabetes from Cystic Fibrosis',\n",
      "       'Sepsis caused by Pseudomonas (disorder)',\n",
      "       'Suicidal deliberate poisoning', 'Pancreatin 600 MG Oral Tablet',\n",
      "       '20 Gene mutation test', 'Total replacement of hip', 'Lung Transplant',\n",
      "       'Sepsis caused by Staphylococcus aureus',\n",
      "       'Vancomycin 50 MG/ML Injectable Solution',\n",
      "       'Oral Glucose Tolerance Test'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# drop columns with only null values\n",
    "null_columns = df2.columns[df2.isnull().all()]\n",
    "print(null_columns)\n",
    "df2.dropna(axis=1, how='all', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "177"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df2.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Age "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-hot encode the categorical 'age' column.\n",
    "# dtype=int turns every generated dummy column into a 0/1 integer indicator,\n",
    "# whichever age categories are present, instead of casting a hard-coded list\n",
    "# of column names (which raises KeyError when a category is absent and leaves\n",
    "# any additional category as bool).\n",
    "df_one_hot = pd.get_dummies(df2['age'], prefix='age', dtype=int)\n",
    "# Replace the original 'age' column with its indicator columns, which are\n",
    "# appended after the existing columns.\n",
    "df2 = pd.concat([df2.drop(columns=['age']), df_one_hot], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Columns with normal/abnormal values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma\n",
      "Albumin\n",
      "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma\n",
      "American house dust mite IgE Ab in Serum\n",
      "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma\n",
      "Bilirubin.total [Mass/volume] in Serum or Plasma\n",
      "Bilirubin.total [Mass/volume] in Urine by Test strip\n",
      "Body Mass Index\n",
      "Body temperature\n",
      "Calcium\n",
      "Calcium [Mass/volume] in Serum or Plasma\n",
      "Carbon Dioxide\n",
      "Cat dander IgE Ab in Serum\n",
      "Chloride\n",
      "Chloride [Moles/volume] in Serum or Plasma\n",
      "Cladosporium herbarum IgE Ab in Serum\n",
      "Codfish IgE Ab in Serum\n",
      "Common Ragweed IgE Ab in Serum\n",
      "Cow milk IgE Ab in Serum\n",
      "Creatinine\n",
      "DXA [T-score] Bone density\n",
      "Diastolic Blood Pressure\n",
      "Egg white IgE Ab in Serum\n",
      "Erythrocyte distribution width [Entitic volume] by Automated count\n",
      "Erythrocyte distribution width [Ratio] by Automated count\n",
      "Erythrocytes [#/volume] in Blood by Automated count\n",
      "FEV1/FVC\n",
      "Ferritin [Mass/volume] in Serum or Plasma\n",
      "Globulin [Mass/volume] in Serum by calculation\n",
      "Glomerular filtration rate/1.73 sq M.predicted\n",
      "Glucose\n",
      "Glucose [Mass/volume] in Serum or Plasma\n",
      "Glucose [Mass/volume] in Urine by Test strip\n",
      "Heart rate\n",
      "Hematocrit [Volume Fraction] of Blood\n",
      "Hematocrit [Volume Fraction] of Blood by Automated count\n",
      "Hemoglobin A1c/Hemoglobin.total in Blood\n",
      "Hemoglobin [Mass/volume] in Blood\n",
      "Hemoglobin.gastrointestinal [Presence] in Stool by Immunologic method\n",
      "High Density Lipoprotein Cholesterol\n",
      "Honey bee IgE Ab in Serum\n",
      "Iron [Mass/volume] in Serum or Plasma\n",
      "Iron binding capacity [Mass/volume] in Serum or Plasma\n",
      "Iron saturation [Mass Fraction] in Serum or Plasma\n",
      "Lactate [Mass/volume] in Blood\n",
      "Latex IgE Ab in Serum\n",
      "Left ventricular Ejection fraction\n",
      "Leukocytes [#/volume] in Blood by Automated count\n",
      "Low Density Lipoprotein Cholesterol\n",
      "MCH [Entitic mass] by Automated count\n",
      "MCHC [Mass/volume] by Automated count\n",
      "MCV [Entitic volume] by Automated count\n",
      "Magnesium [Mass/volume] in Serum or Plasma\n",
      "Mean blood pressure\n",
      "Microalbumin Creatinine Ratio\n",
      "NT-proBNP\n",
      "Oxygen Saturation\n",
      "Oxygen saturation in Arterial blood\n",
      "Pain severity - 0-10 verbal numeric rating [Score] - Reported\n",
      "Peanut IgE Ab in Serum\n",
      "Platelet mean volume [Entitic volume] in Blood by Automated count\n",
      "Platelets [#/volume] in Blood by Automated count\n",
      "Polyp size greatest dimension by CAP cancer protocols\n",
      "Potassium\n",
      "Prostate specific Ag [Mass/volume] in Serum or Plasma\n",
      "Protein [Mass/volume] in Urine by Test strip\n",
      "Respiratory rate\n",
      "Shrimp IgE Ab in Serum\n",
      "Sodium\n",
      "Soybean IgE Ab in Serum\n",
      "Specific gravity of Urine by Test strip\n",
      "Systolic Blood Pressure\n",
      "Thyrotropin [Units/volume] in Serum or Plasma\n",
      "Thyroxine (T4) free [Mass/volume] in Serum or Plasma\n",
      "Total Cholesterol\n",
      "Triglycerides\n",
      "Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method\n",
      "US Guidance for biopsy of Prostate\n",
      "Urea Nitrogen\n",
      "Urea nitrogen [Mass/volume] in Serum or Plasma\n",
      "Walnut IgE Ab in Serum\n",
      "Wheat IgE Ab in Serum\n",
      "White oak IgE Ab in Serum\n",
      "pH of Urine by Test strip\n"
     ]
    }
   ],
   "source": [
    "# Gather, in alphabetical order, every column containing at least one\n",
    "# 'normal' or 'abnormal' entry, then list the names one per line.\n",
    "n_abn_columns = sorted(\n",
    "    name for name in df2.columns\n",
    "    if df2[name].isin(['normal', 'abnormal']).any()\n",
    ")\n",
    "for col in n_abn_columns:\n",
    "    print(col)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many null values are in these columns?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column                                                                            Null Values Percentage\n",
      "------------------------------------------------------------------------------  ------------------------\n",
      "Thyroxine (T4) free [Mass/volume] in Serum or Plasma                                          99.4695\n",
      "Thyrotropin [Units/volume] in Serum or Plasma                                                 99.4695\n",
      "Mean blood pressure                                                                           98.2427\n",
      "Oxygen Saturation                                                                             98.2427\n",
      "Lactate [Mass/volume] in Blood                                                                98.2427\n",
      "Honey bee IgE Ab in Serum                                                                     96.5517\n",
      "Latex IgE Ab in Serum                                                                         96.5517\n",
      "Peanut IgE Ab in Serum                                                                        96.5517\n",
      "Egg white IgE Ab in Serum                                                                     96.5517\n",
      "Cow milk IgE Ab in Serum                                                                      96.5517\n",
      "Common Ragweed IgE Ab in Serum                                                                96.5517\n",
      "Codfish IgE Ab in Serum                                                                       96.5517\n",
      "Cladosporium herbarum IgE Ab in Serum                                                         96.5517\n",
      "Shrimp IgE Ab in Serum                                                                        96.5517\n",
      "Cat dander IgE Ab in Serum                                                                    96.5517\n",
      "Soybean IgE Ab in Serum                                                                       96.5517\n",
      "Walnut IgE Ab in Serum                                                                        96.5517\n",
      "Wheat IgE Ab in Serum                                                                         96.5517\n",
      "American house dust mite IgE Ab in Serum                                                      96.5517\n",
      "White oak IgE Ab in Serum                                                                     96.5517\n",
      "FEV1/FVC                                                                                      93.3245\n",
      "DXA [T-score] Bone density                                                                    88.7821\n",
      "Hematocrit [Volume Fraction] of Blood                                                         87.2679\n",
      "Hemoglobin.gastrointestinal [Presence] in Stool by Immunologic method                         85.1569\n",
      "Polyp size greatest dimension by CAP cancer protocols                                         85.1569\n",
      "US Guidance for biopsy of Prostate                                                            81.1561\n",
      "Glucose [Mass/volume] in Urine by Test strip                                                  81.0234\n",
      "pH of Urine by Test strip                                                                     81.0234\n",
      "Protein [Mass/volume] in Urine by Test strip                                                  81.0234\n",
      "Specific gravity of Urine by Test strip                                                       81.0234\n",
      "Bilirubin.total [Mass/volume] in Urine by Test strip                                          81.0234\n",
      "Microalbumin Creatinine Ratio                                                                 77.542\n",
      "Prostate specific Ag [Mass/volume] in Serum or Plasma                                         75.0663\n",
      "Left ventricular Ejection fraction                                                            73.519\n",
      "Magnesium [Mass/volume] in Serum or Plasma                                                    73.4637\n",
      "NT-proBNP                                                                                     73.4637\n",
      "Iron saturation [Mass Fraction] in Serum or Plasma                                            73.4637\n",
      "Iron [Mass/volume] in Serum or Plasma                                                         73.4637\n",
      "Iron binding capacity [Mass/volume] in Serum or Plasma                                        73.4637\n",
      "Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method                70.6897\n",
      "Erythrocyte distribution width [Ratio] by Automated count                                     70.6897\n",
      "Ferritin [Mass/volume] in Serum or Plasma                                                     70.6897\n",
      "Oxygen saturation in Arterial blood                                                           70.2697\n",
      "Chloride [Moles/volume] in Serum or Plasma                                                    69.1866\n",
      "Carbon Dioxide                                                                                69.1866\n",
      "Calcium [Mass/volume] in Serum or Plasma                                                      69.1866\n",
      "Glucose [Mass/volume] in Serum or Plasma                                                      69.1866\n",
      "Urea nitrogen [Mass/volume] in Serum or Plasma                                                69.1866\n",
      "Globulin [Mass/volume] in Serum by calculation                                                61.0522\n",
      "Body temperature                                                                              48.4195\n",
      "Bilirubin.total [Mass/volume] in Serum or Plasma                                              39.0141\n",
      "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma                       38.141\n",
      "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma                     38.141\n",
      "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma                           38.141\n",
      "Albumin                                                                                       38.141\n",
      "Glomerular filtration rate/1.73 sq M.predicted                                                29.1888\n",
      "Hemoglobin A1c/Hemoglobin.total in Blood                                                      23.1101\n",
      "Chloride                                                                                      13.2294\n",
      "Urea Nitrogen                                                                                 13.2294\n",
      "Glucose                                                                                       13.2294\n",
      "Calcium                                                                                       13.2294\n",
      "Potassium                                                                                     12.1905\n",
      "Creatinine                                                                                    12.1905\n",
      "Sodium                                                                                        12.1905\n",
      "Erythrocyte distribution width [Entitic volume] by Automated count                             0.939434\n",
      "Platelet mean volume [Entitic volume] in Blood by Automated count                              0.939434\n",
      "Hematocrit [Volume Fraction] of Blood by Automated count                                       0.862069\n",
      "Platelets [#/volume] in Blood by Automated count                                               0.851017\n",
      "Erythrocytes [#/volume] in Blood by Automated count                                            0.851017\n",
      "MCV [Entitic volume] by Automated count                                                        0.851017\n",
      "MCHC [Mass/volume] by Automated count                                                          0.851017\n",
      "MCH [Entitic mass] by Automated count                                                          0.851017\n",
      "Leukocytes [#/volume] in Blood by Automated count                                              0.851017\n",
      "Hemoglobin [Mass/volume] in Blood                                                              0.806808\n",
      "Respiratory rate                                                                               0.0552608\n",
      "Diastolic Blood Pressure                                                                       0.0552608\n",
      "Systolic Blood Pressure                                                                        0.0552608\n",
      "Total Cholesterol                                                                              0.0552608\n",
      "Triglycerides                                                                                  0.0552608\n",
      "Body Mass Index                                                                                0.0552608\n",
      "Low Density Lipoprotein Cholesterol                                                            0.0552608\n",
      "Heart rate                                                                                     0.0552608\n",
      "High Density Lipoprotein Cholesterol                                                           0.0552608\n",
      "Pain severity - 0-10 verbal numeric rating [Score] - Reported                                  0.0442087\n"
     ]
    }
   ],
   "source": [
    "# Percentage of missing entries for each normal/abnormal column that has at\n",
    "# least one missing value, shown from the highest percentage to the lowest.\n",
    "n_abn_df = df2[n_abn_columns]\n",
    "missing_counts = n_abn_df.isnull().sum()\n",
    "null_values = (\n",
    "    100 * missing_counts[missing_counts > 0] / len(n_abn_df)\n",
    ").sort_values(ascending=False)\n",
    "null_values_df = (\n",
    "    null_values\n",
    "    .rename_axis('Column')\n",
    "    .reset_index(name='Null Values Percentage')\n",
    ")\n",
    "print(tabulate(null_values_df, headers='keys', tablefmt='simple', showindex=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Many of these columns have more than 90% null values. We will drop the columns above that threshold."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of columns to drop: 21\n"
     ]
    }
   ],
   "source": [
    "# Select the normal/abnormal columns whose fraction of missing entries is\n",
    "# above 0.9, report how many there are, and remove them from df2 in place.\n",
    "n_rows = len(df2)\n",
    "null_values_cols = [\n",
    "    name for name in n_abn_columns\n",
    "    if df2[name].isnull().sum() / n_rows > 0.9\n",
    "]\n",
    "print('Number of columns to drop:', len(null_values_cols))\n",
    "df2.drop(columns=null_values_cols, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['American house dust mite IgE Ab in Serum',\n",
       " 'Cat dander IgE Ab in Serum',\n",
       " 'Cladosporium herbarum IgE Ab in Serum',\n",
       " 'Codfish IgE Ab in Serum',\n",
       " 'Common Ragweed IgE Ab in Serum',\n",
       " 'Cow milk IgE Ab in Serum',\n",
       " 'Egg white IgE Ab in Serum',\n",
       " 'FEV1/FVC',\n",
       " 'Honey bee IgE Ab in Serum',\n",
       " 'Lactate [Mass/volume] in Blood',\n",
       " 'Latex IgE Ab in Serum',\n",
       " 'Mean blood pressure',\n",
       " 'Oxygen Saturation',\n",
       " 'Peanut IgE Ab in Serum',\n",
       " 'Shrimp IgE Ab in Serum',\n",
       " 'Soybean IgE Ab in Serum',\n",
       " 'Thyrotropin [Units/volume] in Serum or Plasma',\n",
       " 'Thyroxine (T4) free [Mass/volume] in Serum or Plasma',\n",
       " 'Walnut IgE Ab in Serum',\n",
       " 'Wheat IgE Ab in Serum',\n",
       " 'White oak IgE Ab in Serum']"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "null_values_cols"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The remaining normal/abnormal columns will be encoded using `get_dummies`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the normal/abnormal columns that were not dropped for having too\n",
    "# many nulls, then expand each one into integer indicator columns.\n",
    "dropped_cols = set(null_values_cols)\n",
    "n_abn_columns = [name for name in n_abn_columns if name not in dropped_cols]\n",
    "df2 = pd.get_dummies(data=df2, columns=n_abn_columns, dtype=int, prefix_sep='_')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Other non-numeric columns will be encoded using get_dummies."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Appearance of Urine\n",
      "[nan 'cloudy']\n",
      "Functional capacity NYHA\n",
      "[nan 'classii' 'classiii' 'classi' 'classiv']\n",
      "White Blood Cell (Elevated)\n",
      "[ nan 11.6 12.9 13.3 14.1 14.5 12.3 11.4 14.  13.7 13.9 14.2 12.4 14.3\n",
      " 12.  12.8 11.7 12.1 13.5 14.4 14.6 13.2 14.8 13.6 11.9 11.8 11.5 12.2\n",
      " 13.8 13.4 11.3 15.  12.7 11.1 14.7 12.6 13.  11.2 14.9 12.5]\n",
      "Color of Urine\n",
      "[nan 'reddish' 'brown']\n",
      "Objective assessment of cardiovascular disease NYHA\n",
      "[nan 'minimal' 'severe' 'mod-severe']\n",
      "Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative']\n",
      "Abuse Status [OMAHA]\n",
      "[nan 'no' 'severe']\n",
      "Stage group.clinical Cancer\n",
      "[nan 'earlystage' 'latestage']\n",
      "Housing status\n",
      "[nan 'homeless']\n",
      "Capillary refill [Time] of Nail bed\n",
      "[nan 'increased']\n",
      "Smokes tobacco daily\n",
      "[nan True]\n",
      "Platelet Count\n",
      "[  nan 162.1 291.7 444.  411.  368.2 250.  217.9 178.7 292.5 428.9 254.8\n",
      " 423.7 427.7 306.4 421.5 340.9 227.8 322.3 192.7 323.4 440.8 349.8 429.9\n",
      " 328.2 374.3 228.3 204.6 308.6 200.3 429.3 174.3 155.7 329.5 173.6 204.9\n",
      " 260.9 420.7 356.7 449.9 387.3 171.6 440.4 384.8 417.1 186.  396.8 307.1\n",
      " 239.1 175.1 234.4 268.  331.7 326.5 324.6 391.3 336.6 210.8 348.6 407.5\n",
      " 417.6 198.2 226.9 231.5 243.1 378.9 325.2 446.3 266.9 244.4 371.  442.4\n",
      " 432.1 258.9 214.5 264.5 335.  338.2 444.9 278.5 211.3 385.1 350.5 290.1\n",
      " 248.  421.3 448.4 304.1 265.8 184.5 281.8 434.9 291.9 297.8 205.6 191.5\n",
      " 293.7 287.  327.  249.9 412.8 346.9 422.3 320.4 155.4 173.3 339.4 402.8\n",
      " 394.8 182.9 185.9 299.4 252.4 265.1 155.3 299.9 212.9 344.8 428.8 405.8\n",
      " 298.4 385.8 257.1 439.4 210.5 254.2 303.7 283.6 329.8 162.3 317.  365.6\n",
      " 273.4 447.5 209.6 196.1 415.3 197.6 295.  357.9 342.9 266.  224.8 181.3\n",
      " 424.9 265.2 303.6 162.8 367.1]\n",
      "Nitrite [Presence] in Urine by Test strip\n",
      "[nan 'negative']\n",
      "Progesterone receptor Ag [Presence] in Breast cancer specimen by Immune stain\n",
      "[nan 'positive' 'negative']\n",
      "Anion Gap\n",
      "[ nan  9.4  7.2  9.5 13.2  3.5 11.8  3.4  2.4  8.1  4.9  2.3 12.1  4.6\n",
      "  7.6  7.7  5.6 10.7  4.1 10.2 11.4 12.2  7.5 13.9  6.7  6.4  6.9 13.6\n",
      " 10.1  3.1  3.9 13.3  5.3  4.5 12.   6.2 11.3  8.9 10.5 10.6 11.9  5.8\n",
      " 12.6 11.6 14.9  8.7 11.1 12.5  4.3 10.   2.1  5.5 14.  11.  14.7  8.5\n",
      " 12.3 14.1  5.9  6.1  2.   7.1  3.7  5.4 13.8  8.8 10.3  2.7 11.7  2.8\n",
      "  5.  10.8  9.   9.9 14.3 12.7 14.8  7.   9.1  9.8  8.2  5.2  8.3  4.8\n",
      " 12.8  6.3  8.4  7.3 11.2  7.9 13.4  4.   6.   3.3  3.6]\n",
      "Influenza virus B Ag [Presence] in Nasopharynx by Rapid immunoassay\n",
      "[nan False True]\n",
      "C reactive protein [Mass/volume] in Serum or Plasma\n",
      "[  nan 10.27  9.87 10.57 10.13 10.18  9.84 10.23 10.08 10.73 10.42 10.32\n",
      " 12.84  9.42 10.14  9.77 10.01  9.89 14.09  9.98 13.15 10.7   9.9  13.43\n",
      " 10.21 13.54  9.78  9.66 10.64 12.89 10.31 10.4   9.7  13.09 10.05 10.45\n",
      "  9.94 10.24 10.38 10.06 10.26 10.37 10.17 13.56 13.03  9.6  10.6   9.51\n",
      "  9.32 13.57 10.9  10.8  10.36 10.78  9.4   9.48 12.45  9.14 10.3  10.1\n",
      "  9.86  9.88 13.16 10.44 13.64 10.02  9.93  9.68 10.04  9.8   9.76 10.11\n",
      " 10.69 10.22  9.3  13.39  9.96  8.91 10.03  9.83 11.2  13.74  9.08  9.63\n",
      " 10.49  9.52 10.07 10.58 12.49  9.91 10.43 10.34  9.65 10.52  9.62  9.54\n",
      " 13.82 10.96  9.43 10.83 10.59  9.5  10.09 10.2   9.58 12.9   9.1  11.\n",
      " 10.76 10.16  9.69  9.64 12.21  9.46 12.94  9.67 12.46 11.28 10.84  9.06\n",
      " 10.85 10.72  9.82 10.63 15.4   9.56  9.27  9.47  9.72  9.11 10.91  9.24\n",
      " 10.54 10.95 10.47  9.92 13.67  9.44 12.6   9.04 13.07  9.13  9.19  9.15\n",
      "  9.81  9.73  9.97 13.1  12.3  13.14 10.39 11.14 14.45 10.12 13.11 12.82\n",
      " 10.28  9.57 12.09]\n",
      "Parainfluenza virus 2 RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative']\n",
      "Fibrin D-dimer FEU [Mass/volume] in Platelet poor plasma\n",
      "[  nan  0.42  0.47  0.43  0.38  0.45  0.4   0.34 11.01  0.56  5.2  11.35\n",
      "  0.44  7.23 11.38  0.46  8.89 11.76  0.36  0.51  2.12  7.67  7.83  6.51\n",
      "  0.49  6.52  0.53 10.17  0.41 10.02  0.39  0.37 13.81 13.34  0.5   7.94\n",
      "  0.54  0.35 10.14  8.09  1.96  0.3   6.45 10.19 12.61  8.73  1.43  4.95\n",
      "  1.73  1.37  0.32  9.84  8.56  1.5   9.88 11.7  14.05  5.89  0.55  7.09]\n",
      "label\n",
      "[0 1]\n",
      "Oxygen/Inspired gas setting [Volume Fraction] Ventilator\n",
      "[  nan 69.41 65.75 60.01 59.06 77.46 67.87 73.69 63.76 57.75 56.91 71.23\n",
      " 68.25 63.79 61.4  60.19 69.44 65.89 61.81 70.96 68.06 61.94 62.55 60.05\n",
      " 61.73 59.5  62.46 66.76 71.5  66.16 68.96 64.94 65.71 71.08 64.1  59.46\n",
      " 70.17 60.64 55.96 63.64 59.38 61.76 66.34 64.53 60.16 61.69 71.42 71.07\n",
      " 54.98 65.87 59.32 66.7  72.12 63.82 68.18 63.61 60.77 70.26 59.45 64.42\n",
      " 59.19 65.42 59.85 57.18 71.11 64.75 66.46 56.26 55.83 64.61 69.65 58.36\n",
      " 68.37 68.27 75.45 58.97 62.9  64.13 66.98 61.06 62.16 69.7  70.94 69.67\n",
      " 64.85 70.65 54.01 67.29 62.26 54.76 61.66 60.95 70.62 61.7  61.24 56.48\n",
      " 65.92 70.51 56.17 72.   68.82 75.88 66.49 54.43 64.93 62.64 63.55 68.23\n",
      " 66.55 66.3  68.28 73.22 65.82 61.49]\n",
      "HER2 [Presence] in Breast cancer specimen by Immune stain\n",
      "[nan 'positive' 'negative']\n",
      "Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative']\n",
      "INR in Platelet poor plasma by Coagulation assay\n",
      "[ nan 2.05 1.95 2.17 2.12 1.82 1.77 2.25 1.39 3.9  1.97 1.84 2.38 2.63\n",
      " 1.76 3.99 4.08 2.08 1.79 3.8  2.01 3.98 2.3  1.64 4.   1.94 2.04 2.\n",
      " 4.2  1.62 2.23 2.1  2.02 1.88 3.82 1.34 2.24 1.65 4.1  4.04 1.81 1.89\n",
      " 2.16 3.81 2.22 2.54 2.37 1.86 2.11 4.14 2.21 2.41 1.83 2.14 4.27 2.07\n",
      " 3.91 2.18 1.87 1.63 1.72 3.97 2.2  1.61 1.7  1.55 2.06 1.59 3.73 1.68\n",
      " 1.78 1.92 2.13 2.26 1.56 1.93 1.8  2.28 4.03 1.58 2.44 1.85 3.78 1.98\n",
      " 1.66 1.96 3.89 2.27 1.71 3.94 1.73 1.9  1.6  3.83 2.15 2.32 2.36 2.33\n",
      " 2.6  4.36 1.67 3.85 2.43 2.58 2.48 2.29 4.09 4.07 3.95 1.91 3.86 3.75\n",
      " 4.16 3.79]\n",
      "Protein [Presence] in Urine by Test strip\n",
      "[nan '3+' '1+' '2+']\n",
      "MCV\n",
      "[ nan 92.6 94.3 93.5 82.1 91.4 90.9 88.4 88.2 89.2 92.7 91.5 89.1 83.1\n",
      " 85.9 85.3 89.5 90.4 87.5 91.3 90.5 83.6 81.7 82.5 85.5 88.  93.4 81.5\n",
      " 81.8 92.2 91.1 88.6 80.6 94.  87.1 82.2 88.5 82.8 85.2 81.3 81.2 83.7\n",
      " 95.2 82.3 95.9 80.1 95.6 80.2 84.2 92.4 90.1 87.4 84.3 90.7 84.9 84.4\n",
      " 85.4 91.7 94.1 91.9 80.7 83.8 88.8 94.5 91.8 89.6 84.5 95.8 95.3 91.2\n",
      " 86.8 93.1 90.2 87.2 93.9 81.4 82.4 89.4 87.  93.3 82.  80.9 90.3 93.2\n",
      " 88.9 86.5 87.8 96.  83.  87.6 93.8 89.  84.7 88.1 80.  85.1 86.2 86.3]\n",
      "HIV status\n",
      "[nan 'negative' 'positive']\n",
      "Estrogen receptor Ag [Presence] in Breast cancer specimen by Immune stain\n",
      "[nan 'positive' 'negative']\n",
      "Basophils [#/volume] in Blood by Automated count\n",
      "[ nan 0.32 0.3  0.33 0.31 0.28 0.29 0.27 0.34 0.26 0.25 0.24 0.35]\n",
      "Respiratory syncytial virus RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative']\n",
      "Parainfluenza virus 3 RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative']\n",
      "Clarity of Urine\n",
      "[nan 'cloudy' 'translucent']\n",
      "Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative' 'positive']\n",
      "Response to cancer treatment\n",
      "[nan 'improving' 'worsening']\n",
      "Ketones [Mass/volume] in Urine by Test strip\n",
      "[nan 'low' 'medium']\n",
      "Human metapneumovirus RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative']\n",
      "Procalcitonin [Mass/volume] in Serum or Plasma\n",
      "[ nan 0.08 0.07 0.13 0.14 0.1  0.12 0.24 0.09 0.2  0.32 0.19 0.22 0.16\n",
      " 0.26 0.17 0.06 0.28 0.04 0.3  0.21 0.11 0.05 0.25 0.15 0.02 0.27 0.29\n",
      " 0.37 0.23 0.18]\n",
      "Ketones [Presence] in Urine by Test strip\n",
      "[nan '3+' 'trace' '1+' '2+']\n",
      "Creatine kinase [Enzymatic activity/volume] in Serum or Plasma\n",
      "[   nan  27.88  38.47  38.32  38.02  36.5   36.83  25.42  36.82  38.51\n",
      "  35.18  35.33 104.54  32.64  32.87  23.1   31.18 123.83  21.   124.27\n",
      "  27.    35.97  26.14 161.34  31.8   99.11  33.87  38.3   32.81 123.23\n",
      "  29.99  41.63  34.07  34.51  28.3  120.23  35.    38.64  26.73  25.94\n",
      "  26.63  32.18 108.32  24.08  42.39  37.18  31.35 115.12 116.18  43.\n",
      "  33.08  32.89  19.72  32.1   53.88  38.34  40.    34.94  27.25  31.74\n",
      "  34.04  27.6   32.73  93.33  31.36  34.87  29.13  26.27  32.3  107.05\n",
      "  30.26 131.25  24.93  29.87  37.72  32.58  28.88  31.94  41.44  32.22\n",
      "  39.07  38.54  30.7   36.6   28.14 111.28  39.2   30.39 109.73  25.8\n",
      "  35.36  28.46  31.52  29.41  22.06  32.42 125.7   40.54  34.06  30.24\n",
      "  31.66  37.37  35.38  27.78  35.93 148.09  28.72  28.93  34.83  30.29\n",
      "  27.61  30.13  33.99  27.91  33.94  25.72  31.28  28.32  40.28  35.77\n",
      "  29.05  33.63 125.73  26.94  20.26  32.96  32.5   42.57  37.21  30.76\n",
      "  30.47  26.12  36.66 135.96  33.88  20.98  30.44  32.98  23.98  38.96\n",
      "  41.26  29.7   33.14  36.42 118.72  33.06  33.32  43.54  22.91  29.56\n",
      " 105.54  31.08  21.3   40.77  36.    37.04  38.74  33.37 128.96 118.76\n",
      "  33.79  25.24  36.73  39.36  34.95  27.52  35.9   35.99  31.84  26.67\n",
      " 105.27  34.86  36.09  29.28  38.12  30.55  30.3   98.22  34.67  38.36\n",
      "  34.24  36.51  29.45  37.57  36.23  27.63  26.39  33.21  39.1   30.64\n",
      " 145.97  28.58  37.44 132.17  27.64  28.96  38.53  27.86 113.06  29.4\n",
      "  31.34  31.23  40.85  35.51  37.92  20.72 138.17  30.72  30.33  26.78\n",
      "  30.58  33.49  34.44  27.31  35.85  30.2   25.44  37.77  32.47  38.22\n",
      "  36.2   36.94 104.26 129.87  92.5   31.72  32.08  33.23 107.88  31.62\n",
      "  97.99 102.82  35.46  22.35  40.66  35.7  113.65  41.33  35.91 114.33]\n",
      "HER2 [Presence] in Breast cancer specimen by FISH\n",
      "[nan 'positive' 'negative']\n",
      "Treatment status Cancer\n",
      "[nan 'changed']\n",
      "Total Bilirubin (Elevated)\n",
      "[nan 2.4 1.3 1.9 2.2 1.6 2.3 1.7 1.2 1.4 1.5 2.5 2.1 2.  1.8]\n",
      "Are you covered by health insurance or some other kind of health care plan [PhenX]\n",
      "[nan 'yes' 'no']\n",
      "Prothrombin time (PT)\n",
      "[  nan 11.93 11.5  11.85 11.88 12.02 11.32 11.68 11.54 11.74 11.07 12.48\n",
      " 11.27 11.78 11.86 11.57 12.46 12.39 11.14 10.93 11.61 12.83 10.87 12.65\n",
      " 11.47 11.73 12.38 12.29 11.43 11.7  12.44 11.83 12.24 11.52 11.82 12.34\n",
      " 12.12 13.21 11.94 11.49 10.83 11.44 12.71 12.1  11.34 12.06 11.9  11.17\n",
      " 11.84 11.04 11.63 11.23 11.77 10.96 12.87 11.59 12.57 11.4  11.42 12.26\n",
      " 10.84 11.48 11.37 12.6  12.59 11.81 11.99 12.07 11.26 11.66 12.28 11.21\n",
      " 11.53 11.22 10.6  11.8  11.87 11.33 11.39 11.38 11.18 12.   11.55 11.65\n",
      " 11.96 12.78 11.3  11.92 10.97 11.28 11.24 11.46 12.13 11.62 11.06 11.76\n",
      " 11.36 12.7  11.2  11.6  12.73 11.98 10.86 11.29 11.67 12.18 12.55 11.79\n",
      " 12.05 11.71 11.19 12.4  11.75 12.79 11.02 11.09 11.41 13.   11.13 12.8\n",
      " 11.89 10.98 11.16 10.99 12.04 12.89 12.2  12.98 11.05 10.92 12.45 11.12\n",
      " 12.19]\n",
      "Monocytes/100 leukocytes in Blood by Automated count\n",
      "[  nan 10.06  9.52  9.73 10.04 10.92 11.15  9.18 11.04 10.54  7.97  9.82\n",
      "  9.13  9.89  8.93 10.32 10.97  8.59 10.18 10.11 10.26  9.35  8.73 11.22\n",
      " 10.56 10.52 10.3   9.56 10.76 10.07 10.12 10.79 10.36  9.42 10.13 10.29\n",
      " 11.27 10.5   9.87  9.34  9.07 11.43  9.41 10.82 10.34  9.75 10.03 11.16\n",
      " 10.55 10.64  8.96 10.17 10.94  9.39  9.8  10.2   9.78 10.45  8.36  9.1\n",
      "  9.74 11.3  11.34  9.9   9.99  9.94  9.48 10.91 11.39 10.42 10.58 10.37\n",
      " 11.73 10.68  9.58  9.86  9.79  7.91 10.02 10.22  9.71 10.81  8.79 10.41\n",
      "  9.62 11.91 10.75  9.43  9.36  9.61  9.84 11.44 10.47 10.08  9.98 10.1\n",
      "  9.01 10.71 11.25  9.29  9.88  9.7   9.32 10.31  8.92  9.4   9.95  9.93\n",
      " 10.83 11.29  9.3  11.2   8.97  9.02  9.66 10.27 10.46 11.08 10.72 10.51\n",
      "  9.44 11.13  9.91  9.22  9.72  9.92 10.39  9.83  9.24 11.17  9.64  9.12\n",
      "  8.94  8.55 10.62  8.06 10.44  9.31 10.14 11.    8.81  8.71  9.27  9.26\n",
      "  8.12 11.72 11.28 12.27  9.76  8.89  8.9   9.2  10.7  10.73  9.67 10.\n",
      " 10.25  9.25 11.1  10.43  9.6   9.14 10.78  9.53 10.74 10.57]\n",
      "Rhinovirus RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative']\n",
      "Leukocyte esterase [Presence] in Urine by Test strip\n",
      "[nan 'negative']\n",
      "marital\n",
      "['m' 's']\n",
      "Monocytes [#/volume] in Blood by Automated count\n",
      "[ nan 0.96 0.75 0.9  0.94 1.02 0.98 0.99 1.   0.8  0.93 1.03 0.87 1.05\n",
      " 0.82 0.89 1.06 0.81 0.95 0.88 1.01 0.91 0.97 0.86 0.92 1.16 1.11 1.04\n",
      " 1.08 0.85 1.09 0.76 0.84 1.07 0.78 1.1  1.12 0.83]\n",
      "scc\n",
      "[101 110 127 129  69 111  76 105 106 119 103  63  55 107 112  59  32  90\n",
      "  83 123  66 117 116  46 141  86 100 113 102 108 115 124 109 104  99 150\n",
      " 126  52  98 139 120  89 118  61 114  65  62 145 136 137  58  74  71  97\n",
      "  68  96  87 122 128  54 130  60 133  73 121 132 138  53 149  72  51  57\n",
      "  47 134 140 143  82  91 135  75  80 146 151 131  64  67 125  50  48  34\n",
      "  93  43 142 153 156  70  78  77 160 170  49  88  81 174 158  84  95  79\n",
      "  56 169  92 148 161 175 172  44  85  19  41 144  45  40 152 157 147  94\n",
      "  35 165  20 177  37 154  21 155 167 166 181 184  42  39 164 190  38  14\n",
      " 168 171   9  29 159  28]\n",
      "gender\n",
      "['m' 'f']\n",
      "Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection\n",
      "[nan 'negative' 'positive']\n",
      "Neutrophils/100 leukocytes in Blood by Automated count\n",
      "[  nan 27.85 26.78 30.78 27.69 28.91 27.15 25.12 28.56 26.64 28.4  28.97\n",
      " 25.21 31.12 29.06 25.02 28.81 27.83 27.08 31.5  31.13 31.79 24.54 28.34\n",
      " 26.81 25.62 30.44 27.99 27.51 26.36 28.83 28.12 30.79 30.1  31.76 23.98\n",
      " 28.86 25.46 29.87 30.72 24.72 22.8  24.62 29.57 27.12 25.91 28.03 29.72\n",
      " 27.49 24.13 27.52 34.85 18.61 29.81 26.97 32.32 25.65 27.28 28.28 24.21\n",
      " 28.29 28.32 31.44 29.19 24.99 29.66 27.18 27.94 23.81 18.64 26.61 25.34\n",
      " 33.41 26.1  28.01 29.07 31.42 32.06 31.99 26.06 29.39 23.64 29.5  24.53\n",
      " 28.39 28.02 26.49 29.86 23.38 22.51 33.83 29.04 23.37 32.04 27.07 30.24\n",
      " 31.2  28.47 20.61 23.27 22.17 29.63 32.88 26.32 28.93 23.96 27.75 31.59\n",
      " 22.96 28.16 30.71 22.11 28.61 23.6  27.45 27.79 25.17 27.71 29.1  24.7\n",
      " 27.66 29.49 23.86 28.92 25.22 31.06 29.65 29.89 25.99 23.01 25.45 33.02\n",
      " 32.29 26.09 24.16 20.5  26.77 24.66 28.2  30.37 30.21 30.19 22.86 27.56\n",
      " 30.51 25.53 28.49 33.92 23.08 28.74 22.85 27.39 29.51 26.52 26.47 31.7\n",
      " 28.69 25.33 26.14 29.16 25.01 31.38 22.66 31.86 25.69 26.08 26.93 28.05\n",
      " 26.86 28.53 33.1  24.6  24.46 32.92 29.25 25.18 25.8  28.89 30.32 25.86\n",
      " 26.79 27.93 28.68 21.16 33.05 27.47 21.27 26.31 27.2  23.66 25.49 34.77\n",
      " 23.54 30.39 29.11 24.69 28.15 21.11 28.43 25.75 26.9  26.34 29.95 29.71\n",
      " 22.26 24.79 24.77 24.55 27.14 23.67 25.82 24.47 30.47 29.38 29.94 29.43\n",
      " 27.31 28.44 34.19 31.33 28.06 23.83 33.23 31.74 33.8  27.16 25.48]\n",
      "Lymphocytes/100 leukocytes in Blood by Automated count\n",
      "[  nan 15.09 14.29 15.67 15.57 15.17 16.88 16.07 12.23 14.82 13.94 13.37\n",
      " 15.79 15.31 12.33 14.09 16.49 14.18 15.02 13.84 12.7  12.13 14.1  14.7\n",
      " 18.52 17.23 17.09 15.84 13.36 15.46 15.12 14.21 17.22 18.47 16.57 16.59\n",
      " 15.94 14.58 16.55 13.47 13.24 13.02 17.35 13.64 17.03 14.04 14.06 13.2\n",
      " 13.61 16.52 16.46 15.4  14.67 19.64 11.5  15.62 13.91 14.4  14.31 14.03\n",
      " 16.13 11.42 15.66 14.25 14.87 12.94 16.65 13.86 13.09 15.58 15.87 13.99\n",
      " 16.34 16.38 16.18 15.22 14.26 16.44 14.78 11.04 12.66 17.01 15.95 13.74\n",
      " 15.49 17.38 15.33 14.28 18.22 13.69 12.91 19.07 15.14 13.5  16.12 13.45\n",
      " 15.47 13.92 15.68 15.63 16.22 15.89 14.46 14.72 13.41 18.5  13.22 13.62\n",
      " 17.59 15.54 14.61 18.78 14.41 15.51 14.48 14.91 11.97 17.37 13.54 10.5\n",
      " 17.25 16.99 16.48 15.21 16.54 14.53 14.49 15.37 16.79 16.2  17.07 16.43\n",
      " 15.64 14.45 14.05 17.12 14.73 15.39 17.36 17.2  15.29 17.08 14.83 14.76\n",
      " 14.92 14.44 15.76 10.61 17.75 14.23 14.02 16.98 15.81 18.05 13.89 18.39\n",
      " 15.16 20.15 15.41 11.   12.74 14.47 13.55 17.24 15.52 15.98 17.27 17.18\n",
      " 15.59 16.32 18.28 16.76 13.49 15.56 10.33 14.5  16.58 15.82 14.66 10.32\n",
      " 14.15 16.81 13.52 15.75 14.6  12.36 14.65 12.35 14.63 15.13 14.3  14.74\n",
      " 17.05 15.72 15.88 12.77 13.98 14.95 13.38 12.89 13.93 14.77 16.83 17.11\n",
      " 17.63 16.7  16.4  16.37 14.89 14.93 13.72 14.24]\n",
      "Red Blood Cell\n",
      "[nan 5.5 5.4 4.9 4.8 4.7 5.3 5.  5.7 4.6 5.8 5.2 5.9 4.5 5.6 5.1]\n",
      "Eosinophils [#/volume] in Blood by Automated count\n",
      "[ nan 0.41 0.36 0.4  0.38 0.42 0.32 0.47 0.37 0.43 0.39 0.46 0.34 0.44\n",
      " 0.45 0.48 0.35 0.33 0.3 ]\n",
      "Tobacco smoking status NHIS\n",
      "['former' 'never' nan]\n",
      "RBC Distribution Width\n",
      "[ nan 12.4 14.4 13.6 12.1 13.5 11.8 13.2 14.5 14.3 12.2 14.6 14.2 12.8\n",
      " 13.4 12.  13.9 13.8 12.3 11.6 11.9 12.6 11.7 13.3 13.1 12.9 14.  12.5\n",
      " 14.1 12.7 13.7 13. ]\n",
      "Basophils/100 leukocytes in Blood by Automated count\n",
      "[ nan 3.2  3.05 3.01 2.9  2.95 3.09 2.89 3.16 2.99 3.11 2.57 2.84 2.96\n",
      " 3.12 2.85 2.87 2.64 2.77 3.06 3.02 3.04 2.75 2.92 2.66 3.27 2.93 3.28\n",
      " 3.13 3.1  3.22 3.14 2.94 3.21 3.19 2.83 3.43 2.76 3.3  3.23 3.   2.91\n",
      " 2.86 3.46 3.15 2.98 3.07 3.17 2.79 3.08 2.81 2.88 3.03 2.8  2.78 3.18\n",
      " 2.63 3.25 2.68 2.97 2.56 2.74 3.49 2.73 3.34 3.26 3.38 2.65 3.24 2.82\n",
      " 3.32 3.31 2.71 3.33 3.29 3.36]\n",
      "Gram positive blood culture panel by Probe in Positive blood culture\n",
      "[nan 'positive']\n",
      "Oxygen [Partial pressure] in Arterial blood\n",
      "[  nan 51.12 47.66 46.48 46.73 50.86 46.94 48.78 45.2  49.18 51.67 45.34\n",
      " 49.25 45.85 45.67 50.47 48.48 50.48 47.68 49.42 50.88 49.85 45.5  52.16\n",
      " 49.49 52.44 47.32 51.72 49.15 50.62 49.89 50.76 48.15 47.25 47.62 52.65\n",
      " 50.46 49.28 45.59 50.74 46.21 51.1  46.07 47.76 47.8  48.39 51.85 47.72\n",
      " 49.47 48.27 51.13 48.64 47.26 49.61 48.47 44.14 47.54 48.61 50.63 44.49\n",
      " 46.43 50.29 46.88 49.34 47.93 50.36 49.38 50.79 49.99 46.85 50.82 48.83\n",
      " 47.18 50.11 50.18 48.5  46.03 49.96 48.82 49.55 48.25 47.63 51.36 48.94\n",
      " 46.91 48.05 49.56 49.94 48.97 48.12 52.19 47.19 49.6  45.09 47.38 51.84\n",
      " 47.11 47.75 49.19 50.25 48.7  48.72 49.37 48.16 48.43 45.03 47.79 50.33\n",
      " 47.89 49.95]\n",
      "Influenza virus A Ag [Presence] in Nasopharynx by Rapid immunoassay\n",
      "[nan False True]\n",
      "pH of Arterial blood\n",
      "[ nan 7.02 7.03 7.14 7.09 6.99 7.1  7.05 7.11 7.04 7.07 7.06 6.98 7.12\n",
      " 7.15 7.01 7.   7.08 7.13 6.96 7.17 6.97]\n",
      "Neutrophils [#/volume] in Blood by Automated count\n",
      "[ nan 2.33 2.55 2.78 2.88 2.48 2.34 2.86 2.53 2.68 2.58 3.06 2.64 2.66\n",
      " 3.26 2.72 2.61 2.47 2.75 2.31 2.69 2.98 2.49 2.74 2.83 2.62 2.08 2.59\n",
      " 2.87 2.7  2.73 2.99 2.71 3.04 2.95 2.77 2.52 2.91 2.79 2.56 3.1  2.85\n",
      " 2.63 2.18 2.38 2.43 3.08 2.4  2.67 2.93 2.54 2.76 2.89 2.82 2.28 2.25\n",
      " 2.5  2.3  3.09 2.46 2.97 2.94 2.81 2.65 2.6  2.92 2.27 2.45 2.96 2.57\n",
      " 2.36 2.32 2.26 2.22 2.8  2.84 2.24 2.2  2.39 2.9  2.14 2.51 3.01 3.16\n",
      " 3.   3.23 3.24]\n",
      "Lymphocytes [#/volume] in Blood by Automated count\n",
      "[ nan 1.   1.02 0.97 0.98 1.06 1.03 1.01 0.99 0.56 1.07 0.64 0.59 1.05\n",
      " 0.61 1.09 0.58 0.63 0.96 1.04 1.08 0.52 0.6  0.57 0.53 0.65 0.55 0.5\n",
      " 0.66 0.62 0.54]\n",
      "Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction\n",
      "[   nan 237.9  230.62 254.82 234.23 247.3  249.74 235.6  233.1  257.67\n",
      " 226.88 243.15 362.44 239.37 232.82 239.5  247.96 261.87 339.76 235.82\n",
      " 363.56 230.18 252.47 246.86 353.84 263.44 363.18 247.23 246.81 248.03\n",
      " 359.35 246.96 251.64 242.28 244.4  240.56 366.12 236.07 250.07 248.62\n",
      " 226.72 250.32 232.44 322.02 237.02 245.5  249.02 232.87 352.6  350.98\n",
      " 238.16 251.   227.18 225.2  349.26 221.36 233.9  246.57 235.15 253.14\n",
      " 236.02 257.38 348.04 247.74 243.54 261.2  246.94 225.9  363.01 238.34\n",
      " 364.52 242.95 251.86 248.17 233.46 233.22 230.48 236.14 228.48 254.43\n",
      " 246.79 246.3  255.97 232.2  378.97 222.32 381.57 245.2  238.56 230.25\n",
      " 242.71 222.08 353.06 261.29 220.96 254.63 261.48 243.97 244.35 240.64\n",
      " 237.2  256.73 366.77 229.57 254.61 263.59 249.15 264.43 240.76 242.73\n",
      " 251.7  244.44 227.32 230.44 225.92 241.63 243.82 243.45 230.32 247.4\n",
      " 249.13 232.36 251.4  360.48 227.62 220.56 233.34 259.76 248.08 249.24\n",
      " 250.79 247.64 248.93 231.22 264.06 325.2  231.02 230.3  232.22 228.58\n",
      " 233.58 234.74 228.52 234.22 256.61 227.64 347.1  251.53 222.22 260.76\n",
      " 245.52 255.27 254.34 363.27 238.83 223.2  259.7  227.2  219.12 239.31\n",
      " 240.7  367.89 358.92 257.89 224.98 240.84 251.8  226.   258.06 253.47\n",
      " 243.6  248.66 228.74 337.47 228.38 251.06 228.26 241.62 231.58 242.02\n",
      " 229.   336.5  259.14 240.35 249.7  227.76 246.19 239.67 243.5  251.51\n",
      " 242.78 250.9  249.6  257.47 234.64 323.35 228.92 235.44 333.8  244.36\n",
      " 233.8  227.74 230.2  236.54 365.58 227.92 246.91 243.27 249.59 226.14\n",
      " 242.37 360.11 219.46 251.34 231.   232.9  262.3  230.24 256.   250.25\n",
      " 240.4  230.72 248.15 245.18 254.64 224.48 364.41 324.23 359.18 226.58\n",
      " 235.23 366.9  221.02 231.66 380.42 342.6  231.46 244.15 229.06 372.57\n",
      " 244.73 255.46 347.77]\n",
      "Drugs of abuse 5 panel - Urine by Screen method\n",
      "[nan 'negative' 'positive']\n",
      "Interleukin 6 [Mass/volume] in Serum or Plasma\n",
      "[ nan 5.33]\n",
      "Tumor marker Cancer\n",
      "[nan 'negative']\n",
      "Glucose [Presence] in Urine by Test strip\n",
      "[nan '2+']\n",
      "Carbon dioxide [Partial pressure] in Arterial blood\n",
      "[  nan 40.45 41.06 39.46 40.8  38.71 40.52 41.19 40.59 38.75 39.44 40.38\n",
      " 41.44 40.08 40.79 39.95 39.98 40.49 39.58 40.63 42.71 39.61 37.45 38.77\n",
      " 40.23 40.87 39.   40.41 38.6  39.26 39.23 40.77 40.55 40.01 39.83 40.68\n",
      " 40.7  39.09 40.11 41.26 41.61 40.34 41.56 41.38 41.49 39.51 39.77 40.26\n",
      " 40.74 39.02 40.64 39.15 38.81 40.14 40.56 40.19 40.51 39.38 39.5  41.22\n",
      " 40.07 39.57 38.36 39.85 40.48 40.54 40.44 39.14 38.56 40.31 42.62 40.46\n",
      " 38.83 42.07 39.74 38.98 42.94 39.89 41.8  39.88 40.36 38.96 41.69 40.04\n",
      " 38.23 40.06 40.88 39.08 39.59 40.18 40.47 40.53 40.22 40.96 40.16 39.91\n",
      " 39.49 37.46 40.24 39.06 39.79 39.41 40.82 39.78]\n",
      "Eosinophils/100 leukocytes in Blood by Automated count\n",
      "[ nan 4.45 4.4  4.59 3.74 4.35 4.61 4.46 4.51 5.05 4.73 4.72 4.65 4.15\n",
      " 4.22 4.81 4.63 4.34 4.37 4.16 4.23 4.04 3.98 4.86 4.66 4.56 4.68 4.12\n",
      " 4.83 4.25 4.92 4.75 4.32 4.58 4.11 4.71 4.41 4.21 4.17 4.89 4.19 4.43\n",
      " 4.57 4.39 4.5  4.42 4.54 3.91 4.7  4.28 4.31 4.69 4.38 4.55 4.76 4.6\n",
      " 4.78 4.29 4.74 4.98 4.93 4.88 4.33 4.94 4.24 4.49 4.48 4.36 4.47 5.08\n",
      " 4.03 4.87 4.52 4.62 4.44 4.8  4.64 5.19 4.06 3.95 4.97 4.96 4.53 4.85\n",
      " 4.08 3.87 3.92 4.18 4.84 3.84 4.27 3.99 5.06 5.24 4.82 4.79]\n",
      "Estrogen+Progesterone receptor Ag [Presence] in Tissue by Immune stain\n",
      "[nan 'positive' 'negative']\n",
      "SARS-CoV-2 RNA Pnl Resp NAA+probe\n",
      "[nan False True]\n",
      "Hemoglobin [Presence] in Urine by Test strip\n",
      "[nan 'positive' 'negative']\n",
      "Bicarbonate [Moles/volume] in Arterial blood\n",
      "[  nan 24.1  25.27 24.3  24.34 24.36 25.05 24.41 24.58 24.82 24.48 24.25\n",
      " 24.22 23.65 24.33 24.69 23.9  24.62 24.89 24.71 24.7  24.09 24.56 24.08\n",
      " 24.47 24.55 24.21 23.95 24.72 24.53 23.87 23.8  24.75 24.38 24.57 23.81\n",
      " 24.06 24.93 25.07 24.64 23.67 24.23 23.94 24.28 24.44 24.17 24.45 24.37\n",
      " 24.54 24.46 24.52 24.49 24.43 24.94 23.68 24.02 25.06 24.19 24.29 24.\n",
      " 23.78 24.84 23.85 24.67 24.91 24.63 25.14 25.37 23.5  24.13 24.61 24.81\n",
      " 24.15 24.32 24.85 23.55 25.21 23.18 24.76 24.07 24.92 25.15 24.86 24.12\n",
      " 23.7  23.71 23.82]\n",
      "Respiratory Disorders\n",
      "[ 5  1  2  6  0  4  3 13 12  8  7 11 17 16 10  9 15 14]\n",
      "Heart and Cardiovascular Diseases\n",
      "[0 3 2 5 1 4 6 7 9 8]\n",
      "Metabolic and Endocrine Disorders\n",
      "[ 4  3  6  2  1  5  9  7  8  0 10 11 12]\n",
      "Neurological Disorders\n",
      "[2 0 3 4 1 6 5 8 7 9]\n",
      "Orthopedic Injuries\n",
      "[4 0 2 3 5 1 6 7]\n",
      "Mental Health\n",
      "[11  8 10 12  7  9  6  5 13 15 14  4  3 16  0  2 17  1]\n",
      "Reproductive and Pregnancy\n",
      "[ 0  1 15 18 14 23 19 12  2 13  5 16 20 21 24 11 17  8  9 10  3  4 22  6\n",
      "  7 25 26]\n",
      "Pain Relievers and Analesics\n",
      "[3 2 1 4 0 5 7 6 8 9]\n",
      "Cardiovascular and Blood Pressure Medications\n",
      "[ 2  4  7  5  1  0  8  6  3 11  9 10 12 16 13 15 14 20]\n",
      "Injection Medications\n",
      "[ 3  1  2  4  6  7  5  8  0 10  9]\n",
      "Oral Medications\n",
      "[ 2  3  1  8  0  4  5  6  7  9 10 12]\n",
      "Other Medications\n",
      "[ 5  1  3  2  7  8  0  4 12  9 10  6 11 13 14 15]\n",
      "Therapies and Regimes\n",
      "[ 9  2  8  7  3  4  6  5 10 11 13  1 12 14 15  0]\n",
      "Diagnostic Procedures\n",
      "[ 8  7  9  5 10 14 16  1  3  6 15 13  4 19 12 18  2 11 17 20 24 21 25 23\n",
      " 22  0]\n",
      "Surgical Interventions\n",
      "[1 2 0 3 4 5 6 7]\n",
      "Patient Care Management\n",
      "[ 8 10 13 12  3  6  2  5  9  7  4 11 14 15  1 16 19  0 17]\n",
      "age_30t50\n",
      "[0 1]\n",
      "age_50t70\n",
      "[1 0]\n",
      "age_gt70\n",
      "[0 1]\n",
      "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Albumin_abnormal\n",
      "[0 1]\n",
      "Albumin_normal\n",
      "[0 1]\n",
      "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Bilirubin.total [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Bilirubin.total [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Bilirubin.total [Mass/volume] in Urine by Test strip_abnormal\n",
      "[0 1]\n",
      "Bilirubin.total [Mass/volume] in Urine by Test strip_normal\n",
      "[0 1]\n",
      "Body Mass Index_abnormal\n",
      "[1 0]\n",
      "Body Mass Index_normal\n",
      "[0 1]\n",
      "Body temperature_abnormal\n",
      "[0 1]\n",
      "Body temperature_normal\n",
      "[1 0]\n",
      "Calcium_normal\n",
      "[1 0]\n",
      "Calcium [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Carbon Dioxide_abnormal\n",
      "[0 1]\n",
      "Carbon Dioxide_normal\n",
      "[0 1]\n",
      "Chloride_abnormal\n",
      "[0 1]\n",
      "Chloride_normal\n",
      "[1 0]\n",
      "Chloride [Moles/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Chloride [Moles/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Creatinine_abnormal\n",
      "[0 1]\n",
      "Creatinine_normal\n",
      "[1 0]\n",
      "DXA [T-score] Bone density_abnormal\n",
      "[0 1]\n",
      "DXA [T-score] Bone density_normal\n",
      "[1 0]\n",
      "Diastolic Blood Pressure_abnormal\n",
      "[1 0]\n",
      "Diastolic Blood Pressure_normal\n",
      "[0 1]\n",
      "Erythrocyte distribution width [Entitic volume] by Automated count_abnormal\n",
      "[0 1]\n",
      "Erythrocyte distribution width [Entitic volume] by Automated count_normal\n",
      "[1 0]\n",
      "Erythrocyte distribution width [Ratio] by Automated count_abnormal\n",
      "[0 1]\n",
      "Erythrocyte distribution width [Ratio] by Automated count_normal\n",
      "[0 1]\n",
      "Erythrocytes [#/volume] in Blood by Automated count_abnormal\n",
      "[0 1]\n",
      "Erythrocytes [#/volume] in Blood by Automated count_normal\n",
      "[1 0]\n",
      "Ferritin [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Ferritin [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Globulin [Mass/volume] in Serum by calculation_abnormal\n",
      "[0 1]\n",
      "Globulin [Mass/volume] in Serum by calculation_normal\n",
      "[0 1]\n",
      "Glomerular filtration rate/1.73 sq M.predicted_abnormal\n",
      "[0 1]\n",
      "Glomerular filtration rate/1.73 sq M.predicted_normal\n",
      "[0 1]\n",
      "Glucose_abnormal\n",
      "[0 1]\n",
      "Glucose_normal\n",
      "[1 0]\n",
      "Glucose [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Glucose [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Glucose [Mass/volume] in Urine by Test strip_normal\n",
      "[0 1]\n",
      "Heart rate_abnormal\n",
      "[0 1]\n",
      "Heart rate_normal\n",
      "[1 0]\n",
      "Hematocrit [Volume Fraction] of Blood_abnormal\n",
      "[0 1]\n",
      "Hematocrit [Volume Fraction] of Blood_normal\n",
      "[0 1]\n",
      "Hematocrit [Volume Fraction] of Blood by Automated count_abnormal\n",
      "[0 1]\n",
      "Hematocrit [Volume Fraction] of Blood by Automated count_normal\n",
      "[1 0]\n",
      "Hemoglobin A1c/Hemoglobin.total in Blood_abnormal\n",
      "[1 0]\n",
      "Hemoglobin A1c/Hemoglobin.total in Blood_normal\n",
      "[0 1]\n",
      "Hemoglobin [Mass/volume] in Blood_abnormal\n",
      "[0 1]\n",
      "Hemoglobin [Mass/volume] in Blood_normal\n",
      "[1 0]\n",
      "Hemoglobin.gastrointestinal [Presence] in Stool by Immunologic method_abnormal\n",
      "[0 1]\n",
      "Hemoglobin.gastrointestinal [Presence] in Stool by Immunologic method_normal\n",
      "[0 1]\n",
      "High Density Lipoprotein Cholesterol_abnormal\n",
      "[1 0]\n",
      "High Density Lipoprotein Cholesterol_normal\n",
      "[0 1]\n",
      "Iron [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Iron [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Iron binding capacity [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Iron binding capacity [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Iron saturation [Mass Fraction] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Iron saturation [Mass Fraction] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Left ventricular Ejection fraction_abnormal\n",
      "[0 1]\n",
      "Left ventricular Ejection fraction_normal\n",
      "[0 1]\n",
      "Leukocytes [#/volume] in Blood by Automated count_abnormal\n",
      "[0 1]\n",
      "Leukocytes [#/volume] in Blood by Automated count_normal\n",
      "[1 0]\n",
      "Low Density Lipoprotein Cholesterol_abnormal\n",
      "[0 1]\n",
      "Low Density Lipoprotein Cholesterol_normal\n",
      "[1 0]\n",
      "MCH [Entitic mass] by Automated count_abnormal\n",
      "[0 1]\n",
      "MCH [Entitic mass] by Automated count_normal\n",
      "[1 0]\n",
      "MCHC [Mass/volume] by Automated count_normal\n",
      "[1 0]\n",
      "MCV [Entitic volume] by Automated count_normal\n",
      "[1 0]\n",
      "Magnesium [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Magnesium [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Microalbumin Creatinine Ratio_abnormal\n",
      "[0 1]\n",
      "Microalbumin Creatinine Ratio_normal\n",
      "[0 1]\n",
      "NT-proBNP_abnormal\n",
      "[0 1]\n",
      "Oxygen saturation in Arterial blood_abnormal\n",
      "[0 1]\n",
      "Pain severity - 0-10 verbal numeric rating [Score] - Reported_abnormal\n",
      "[1 0]\n",
      "Pain severity - 0-10 verbal numeric rating [Score] - Reported_normal\n",
      "[0 1]\n",
      "Platelet mean volume [Entitic volume] in Blood by Automated count_normal\n",
      "[1 0]\n",
      "Platelets [#/volume] in Blood by Automated count_abnormal\n",
      "[0 1]\n",
      "Platelets [#/volume] in Blood by Automated count_normal\n",
      "[1 0]\n",
      "Polyp size greatest dimension by CAP cancer protocols_abnormal\n",
      "[0 1]\n",
      "Polyp size greatest dimension by CAP cancer protocols_normal\n",
      "[0 1]\n",
      "Potassium_normal\n",
      "[1 0]\n",
      "Prostate specific Ag [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Prostate specific Ag [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Protein [Mass/volume] in Urine by Test strip_abnormal\n",
      "[0 1]\n",
      "Protein [Mass/volume] in Urine by Test strip_normal\n",
      "[0 1]\n",
      "Respiratory rate_abnormal\n",
      "[0 1]\n",
      "Respiratory rate_normal\n",
      "[1 0]\n",
      "Sodium_normal\n",
      "[1 0]\n",
      "Specific gravity of Urine by Test strip_abnormal\n",
      "[0 1]\n",
      "Systolic Blood Pressure_abnormal\n",
      "[1 0]\n",
      "Systolic Blood Pressure_normal\n",
      "[0 1]\n",
      "Total Cholesterol_abnormal\n",
      "[0 1]\n",
      "Total Cholesterol_normal\n",
      "[1 0]\n",
      "Triglycerides_abnormal\n",
      "[0 1]\n",
      "Triglycerides_normal\n",
      "[1 0]\n",
      "Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method_abnormal\n",
      "[0 1]\n",
      "Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method_normal\n",
      "[0 1]\n",
      "US Guidance for biopsy of Prostate_abnormal\n",
      "[0 1]\n",
      "US Guidance for biopsy of Prostate_normal\n",
      "[0 1]\n",
      "Urea Nitrogen_normal\n",
      "[1 0]\n",
      "Urea nitrogen [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "pH of Urine by Test strip_normal\n",
      "[0 1]\n"
     ]
    }
   ],
   "source": [
    "# what columns are still not numeric?\n",
    "for col in df2.columns:\n",
    "    print(col)\n",
    "    print(df2[col].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_to_encode = [\n",
    "    'HER2 [Presence] in Breast cancer specimen by FISH',\n",
    "    'Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'Estrogen+Progesterone receptor Ag [Presence] in Tissue by Immune stain',\n",
    "    'Ketones [Mass/volume] in Urine by Test strip',\n",
    "    'marital',\n",
    "    'Abuse Status [OMAHA]',\n",
    "    'Interleukin 6 [Mass/volume] in Serum or Plasma',\n",
    "    'Progesterone receptor Ag [Presence] in Breast cancer specimen by Immune stain',\n",
    "    'Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'Parainfluenza virus 2 RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'Response to cancer treatment',\n",
    "    'Estrogen receptor Ag [Presence] in Breast cancer specimen by Immune stain',\n",
    "    'Hemoglobin [Presence] in Urine by Test strip',\n",
    "    'Housing status',\n",
    "    'Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'Objective assessment of cardiovascular disease NYHA',\n",
    "    'Drugs of abuse 5 panel - Urine by Screen method',\n",
    "    'Leukocyte esterase [Presence] in Urine by Test strip',\n",
    "    'Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'Protein [Presence] in Urine by Test strip',\n",
    "    'Appearance of Urine',\n",
    "    'Capillary refill [Time] of Nail bed',\n",
    "    'Treatment status Cancer',\n",
    "    'Gram positive blood culture panel by Probe in Positive blood culture',\n",
    "    'Glucose [Presence] in Urine by Test strip',\n",
    "    'Respiratory syncytial virus RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'Functional capacity NYHA',\n",
    "    'Color of Urine',\n",
    "    'Nitrite [Presence] in Urine by Test strip',\n",
    "    'Tumor marker Cancer',\n",
    "    'Tobacco smoking status NHIS',\n",
    "    'gender',\n",
    "    'HIV status',\n",
    "    'Are you covered by health insurance or some other kind of health care plan [PhenX]',\n",
    "    'Human metapneumovirus RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'Ketones [Presence] in Urine by Test strip',\n",
    "    'Clarity of Urine',\n",
    "    'Stage group.clinical Cancer',\n",
    "    'Rhinovirus RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'HER2 [Presence] in Breast cancer specimen by Immune stain',\n",
    "    'Smokes tobacco daily',\n",
    "    'Parainfluenza virus 3 RNA [Presence] in Respiratory specimen by NAA with probe detection',\n",
    "    'SARS-CoV-2 RNA Pnl Resp NAA+probe',\n",
    "    'Influenza virus A Ag [Presence] in Nasopharynx by Rapid immunoassay',\n",
    "    'Influenza virus B Ag [Presence] in Nasopharynx by Rapid immunoassay'\n",
    "]\n",
    "\n",
    "df2 = pd.get_dummies(df2, columns=columns_to_encode, prefix_sep='_', dtype=int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's see one more time if we have any null values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column                                                                                                  Null Values Percentage\n",
      "----------------------------------------------------------------------------------------------------  ------------------------\n",
      "Bicarbonate [Moles/volume] in Arterial blood                                                                           98.6848\n",
      "Carbon dioxide [Partial pressure] in Arterial blood                                                                    98.6848\n",
      "pH of Arterial blood                                                                                                   98.6848\n",
      "Oxygen [Partial pressure] in Arterial blood                                                                            98.6848\n",
      "Oxygen/Inspired gas setting [Volume Fraction] Ventilator                                                               98.6848\n",
      "Total Bilirubin (Elevated)                                                                                             98.3311\n",
      "RBC Distribution Width                                                                                                 98.3311\n",
      "Red Blood Cell                                                                                                         98.3311\n",
      "Platelet Count                                                                                                         98.3311\n",
      "White Blood Cell (Elevated)                                                                                            98.3311\n",
      "Anion Gap                                                                                                              98.3311\n",
      "MCV                                                                                                                    98.3311\n",
      "Creatine kinase [Enzymatic activity/volume] in Serum or Plasma                                                         97.1706\n",
      "Basophils/100 leukocytes in Blood by Automated count                                                                   97.1706\n",
      "Eosinophils/100 leukocytes in Blood by Automated count                                                                 97.1706\n",
      "Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction                   97.1706\n",
      "Lymphocytes [#/volume] in Blood by Automated count                                                                     97.1706\n",
      "Neutrophils [#/volume] in Blood by Automated count                                                                     97.1706\n",
      "C reactive protein [Mass/volume] in Serum or Plasma                                                                    97.1706\n",
      "Fibrin D-dimer FEU [Mass/volume] in Platelet poor plasma                                                               97.1706\n",
      "INR in Platelet poor plasma by Coagulation assay                                                                       97.1706\n",
      "Procalcitonin [Mass/volume] in Serum or Plasma                                                                         97.1706\n",
      "Eosinophils [#/volume] in Blood by Automated count                                                                     97.1706\n",
      "Lymphocytes/100 leukocytes in Blood by Automated count                                                                 97.1706\n",
      "Neutrophils/100 leukocytes in Blood by Automated count                                                                 97.1706\n",
      "Basophils [#/volume] in Blood by Automated count                                                                       97.1706\n",
      "Monocytes/100 leukocytes in Blood by Automated count                                                                   97.1706\n",
      "Prothrombin time (PT)                                                                                                  97.1706\n",
      "Monocytes [#/volume] in Blood by Automated count                                                                       97.1706\n"
     ]
    }
   ],
   "source": [
    "null_values = df2.isnull().sum()\n",
    "null_values = null_values[null_values > 0]\n",
    "null_values = 100 * null_values / len(df2)\n",
    "null_values = null_values.sort_values(ascending=False)\n",
    "null_values_df = null_values.reset_index()\n",
    "null_values_df.columns = ['Column', 'Null Values Percentage']\n",
    "print(tabulate(null_values_df, headers='keys', tablefmt='simple', showindex=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "White Blood Cell (Elevated)\n",
      "[ nan 11.6 12.9 13.3 14.1 14.5 12.3 11.4 14.  13.7 13.9 14.2 12.4 14.3\n",
      " 12.  12.8 11.7 12.1 13.5 14.4 14.6 13.2 14.8 13.6 11.9 11.8 11.5 12.2\n",
      " 13.8 13.4 11.3 15.  12.7 11.1 14.7 12.6 13.  11.2 14.9 12.5]\n",
      "Platelet Count\n",
      "[  nan 162.1 291.7 444.  411.  368.2 250.  217.9 178.7 292.5 428.9 254.8\n",
      " 423.7 427.7 306.4 421.5 340.9 227.8 322.3 192.7 323.4 440.8 349.8 429.9\n",
      " 328.2 374.3 228.3 204.6 308.6 200.3 429.3 174.3 155.7 329.5 173.6 204.9\n",
      " 260.9 420.7 356.7 449.9 387.3 171.6 440.4 384.8 417.1 186.  396.8 307.1\n",
      " 239.1 175.1 234.4 268.  331.7 326.5 324.6 391.3 336.6 210.8 348.6 407.5\n",
      " 417.6 198.2 226.9 231.5 243.1 378.9 325.2 446.3 266.9 244.4 371.  442.4\n",
      " 432.1 258.9 214.5 264.5 335.  338.2 444.9 278.5 211.3 385.1 350.5 290.1\n",
      " 248.  421.3 448.4 304.1 265.8 184.5 281.8 434.9 291.9 297.8 205.6 191.5\n",
      " 293.7 287.  327.  249.9 412.8 346.9 422.3 320.4 155.4 173.3 339.4 402.8\n",
      " 394.8 182.9 185.9 299.4 252.4 265.1 155.3 299.9 212.9 344.8 428.8 405.8\n",
      " 298.4 385.8 257.1 439.4 210.5 254.2 303.7 283.6 329.8 162.3 317.  365.6\n",
      " 273.4 447.5 209.6 196.1 415.3 197.6 295.  357.9 342.9 266.  224.8 181.3\n",
      " 424.9 265.2 303.6 162.8 367.1]\n",
      "Anion Gap\n",
      "[ nan  9.4  7.2  9.5 13.2  3.5 11.8  3.4  2.4  8.1  4.9  2.3 12.1  4.6\n",
      "  7.6  7.7  5.6 10.7  4.1 10.2 11.4 12.2  7.5 13.9  6.7  6.4  6.9 13.6\n",
      " 10.1  3.1  3.9 13.3  5.3  4.5 12.   6.2 11.3  8.9 10.5 10.6 11.9  5.8\n",
      " 12.6 11.6 14.9  8.7 11.1 12.5  4.3 10.   2.1  5.5 14.  11.  14.7  8.5\n",
      " 12.3 14.1  5.9  6.1  2.   7.1  3.7  5.4 13.8  8.8 10.3  2.7 11.7  2.8\n",
      "  5.  10.8  9.   9.9 14.3 12.7 14.8  7.   9.1  9.8  8.2  5.2  8.3  4.8\n",
      " 12.8  6.3  8.4  7.3 11.2  7.9 13.4  4.   6.   3.3  3.6]\n",
      "C reactive protein [Mass/volume] in Serum or Plasma\n",
      "[  nan 10.27  9.87 10.57 10.13 10.18  9.84 10.23 10.08 10.73 10.42 10.32\n",
      " 12.84  9.42 10.14  9.77 10.01  9.89 14.09  9.98 13.15 10.7   9.9  13.43\n",
      " 10.21 13.54  9.78  9.66 10.64 12.89 10.31 10.4   9.7  13.09 10.05 10.45\n",
      "  9.94 10.24 10.38 10.06 10.26 10.37 10.17 13.56 13.03  9.6  10.6   9.51\n",
      "  9.32 13.57 10.9  10.8  10.36 10.78  9.4   9.48 12.45  9.14 10.3  10.1\n",
      "  9.86  9.88 13.16 10.44 13.64 10.02  9.93  9.68 10.04  9.8   9.76 10.11\n",
      " 10.69 10.22  9.3  13.39  9.96  8.91 10.03  9.83 11.2  13.74  9.08  9.63\n",
      " 10.49  9.52 10.07 10.58 12.49  9.91 10.43 10.34  9.65 10.52  9.62  9.54\n",
      " 13.82 10.96  9.43 10.83 10.59  9.5  10.09 10.2   9.58 12.9   9.1  11.\n",
      " 10.76 10.16  9.69  9.64 12.21  9.46 12.94  9.67 12.46 11.28 10.84  9.06\n",
      " 10.85 10.72  9.82 10.63 15.4   9.56  9.27  9.47  9.72  9.11 10.91  9.24\n",
      " 10.54 10.95 10.47  9.92 13.67  9.44 12.6   9.04 13.07  9.13  9.19  9.15\n",
      "  9.81  9.73  9.97 13.1  12.3  13.14 10.39 11.14 14.45 10.12 13.11 12.82\n",
      " 10.28  9.57 12.09]\n",
      "Fibrin D-dimer FEU [Mass/volume] in Platelet poor plasma\n",
      "[  nan  0.42  0.47  0.43  0.38  0.45  0.4   0.34 11.01  0.56  5.2  11.35\n",
      "  0.44  7.23 11.38  0.46  8.89 11.76  0.36  0.51  2.12  7.67  7.83  6.51\n",
      "  0.49  6.52  0.53 10.17  0.41 10.02  0.39  0.37 13.81 13.34  0.5   7.94\n",
      "  0.54  0.35 10.14  8.09  1.96  0.3   6.45 10.19 12.61  8.73  1.43  4.95\n",
      "  1.73  1.37  0.32  9.84  8.56  1.5   9.88 11.7  14.05  5.89  0.55  7.09]\n",
      "label\n",
      "[0 1]\n",
      "Oxygen/Inspired gas setting [Volume Fraction] Ventilator\n",
      "[  nan 69.41 65.75 60.01 59.06 77.46 67.87 73.69 63.76 57.75 56.91 71.23\n",
      " 68.25 63.79 61.4  60.19 69.44 65.89 61.81 70.96 68.06 61.94 62.55 60.05\n",
      " 61.73 59.5  62.46 66.76 71.5  66.16 68.96 64.94 65.71 71.08 64.1  59.46\n",
      " 70.17 60.64 55.96 63.64 59.38 61.76 66.34 64.53 60.16 61.69 71.42 71.07\n",
      " 54.98 65.87 59.32 66.7  72.12 63.82 68.18 63.61 60.77 70.26 59.45 64.42\n",
      " 59.19 65.42 59.85 57.18 71.11 64.75 66.46 56.26 55.83 64.61 69.65 58.36\n",
      " 68.37 68.27 75.45 58.97 62.9  64.13 66.98 61.06 62.16 69.7  70.94 69.67\n",
      " 64.85 70.65 54.01 67.29 62.26 54.76 61.66 60.95 70.62 61.7  61.24 56.48\n",
      " 65.92 70.51 56.17 72.   68.82 75.88 66.49 54.43 64.93 62.64 63.55 68.23\n",
      " 66.55 66.3  68.28 73.22 65.82 61.49]\n",
      "INR in Platelet poor plasma by Coagulation assay\n",
      "[ nan 2.05 1.95 2.17 2.12 1.82 1.77 2.25 1.39 3.9  1.97 1.84 2.38 2.63\n",
      " 1.76 3.99 4.08 2.08 1.79 3.8  2.01 3.98 2.3  1.64 4.   1.94 2.04 2.\n",
      " 4.2  1.62 2.23 2.1  2.02 1.88 3.82 1.34 2.24 1.65 4.1  4.04 1.81 1.89\n",
      " 2.16 3.81 2.22 2.54 2.37 1.86 2.11 4.14 2.21 2.41 1.83 2.14 4.27 2.07\n",
      " 3.91 2.18 1.87 1.63 1.72 3.97 2.2  1.61 1.7  1.55 2.06 1.59 3.73 1.68\n",
      " 1.78 1.92 2.13 2.26 1.56 1.93 1.8  2.28 4.03 1.58 2.44 1.85 3.78 1.98\n",
      " 1.66 1.96 3.89 2.27 1.71 3.94 1.73 1.9  1.6  3.83 2.15 2.32 2.36 2.33\n",
      " 2.6  4.36 1.67 3.85 2.43 2.58 2.48 2.29 4.09 4.07 3.95 1.91 3.86 3.75\n",
      " 4.16 3.79]\n",
      "MCV\n",
      "[ nan 92.6 94.3 93.5 82.1 91.4 90.9 88.4 88.2 89.2 92.7 91.5 89.1 83.1\n",
      " 85.9 85.3 89.5 90.4 87.5 91.3 90.5 83.6 81.7 82.5 85.5 88.  93.4 81.5\n",
      " 81.8 92.2 91.1 88.6 80.6 94.  87.1 82.2 88.5 82.8 85.2 81.3 81.2 83.7\n",
      " 95.2 82.3 95.9 80.1 95.6 80.2 84.2 92.4 90.1 87.4 84.3 90.7 84.9 84.4\n",
      " 85.4 91.7 94.1 91.9 80.7 83.8 88.8 94.5 91.8 89.6 84.5 95.8 95.3 91.2\n",
      " 86.8 93.1 90.2 87.2 93.9 81.4 82.4 89.4 87.  93.3 82.  80.9 90.3 93.2\n",
      " 88.9 86.5 87.8 96.  83.  87.6 93.8 89.  84.7 88.1 80.  85.1 86.2 86.3]\n",
      "Basophils [#/volume] in Blood by Automated count\n",
      "[ nan 0.32 0.3  0.33 0.31 0.28 0.29 0.27 0.34 0.26 0.25 0.24 0.35]\n",
      "Procalcitonin [Mass/volume] in Serum or Plasma\n",
      "[ nan 0.08 0.07 0.13 0.14 0.1  0.12 0.24 0.09 0.2  0.32 0.19 0.22 0.16\n",
      " 0.26 0.17 0.06 0.28 0.04 0.3  0.21 0.11 0.05 0.25 0.15 0.02 0.27 0.29\n",
      " 0.37 0.23 0.18]\n",
      "Creatine kinase [Enzymatic activity/volume] in Serum or Plasma\n",
      "[   nan  27.88  38.47  38.32  38.02  36.5   36.83  25.42  36.82  38.51\n",
      "  35.18  35.33 104.54  32.64  32.87  23.1   31.18 123.83  21.   124.27\n",
      "  27.    35.97  26.14 161.34  31.8   99.11  33.87  38.3   32.81 123.23\n",
      "  29.99  41.63  34.07  34.51  28.3  120.23  35.    38.64  26.73  25.94\n",
      "  26.63  32.18 108.32  24.08  42.39  37.18  31.35 115.12 116.18  43.\n",
      "  33.08  32.89  19.72  32.1   53.88  38.34  40.    34.94  27.25  31.74\n",
      "  34.04  27.6   32.73  93.33  31.36  34.87  29.13  26.27  32.3  107.05\n",
      "  30.26 131.25  24.93  29.87  37.72  32.58  28.88  31.94  41.44  32.22\n",
      "  39.07  38.54  30.7   36.6   28.14 111.28  39.2   30.39 109.73  25.8\n",
      "  35.36  28.46  31.52  29.41  22.06  32.42 125.7   40.54  34.06  30.24\n",
      "  31.66  37.37  35.38  27.78  35.93 148.09  28.72  28.93  34.83  30.29\n",
      "  27.61  30.13  33.99  27.91  33.94  25.72  31.28  28.32  40.28  35.77\n",
      "  29.05  33.63 125.73  26.94  20.26  32.96  32.5   42.57  37.21  30.76\n",
      "  30.47  26.12  36.66 135.96  33.88  20.98  30.44  32.98  23.98  38.96\n",
      "  41.26  29.7   33.14  36.42 118.72  33.06  33.32  43.54  22.91  29.56\n",
      " 105.54  31.08  21.3   40.77  36.    37.04  38.74  33.37 128.96 118.76\n",
      "  33.79  25.24  36.73  39.36  34.95  27.52  35.9   35.99  31.84  26.67\n",
      " 105.27  34.86  36.09  29.28  38.12  30.55  30.3   98.22  34.67  38.36\n",
      "  34.24  36.51  29.45  37.57  36.23  27.63  26.39  33.21  39.1   30.64\n",
      " 145.97  28.58  37.44 132.17  27.64  28.96  38.53  27.86 113.06  29.4\n",
      "  31.34  31.23  40.85  35.51  37.92  20.72 138.17  30.72  30.33  26.78\n",
      "  30.58  33.49  34.44  27.31  35.85  30.2   25.44  37.77  32.47  38.22\n",
      "  36.2   36.94 104.26 129.87  92.5   31.72  32.08  33.23 107.88  31.62\n",
      "  97.99 102.82  35.46  22.35  40.66  35.7  113.65  41.33  35.91 114.33]\n",
      "Total Bilirubin (Elevated)\n",
      "[nan 2.4 1.3 1.9 2.2 1.6 2.3 1.7 1.2 1.4 1.5 2.5 2.1 2.  1.8]\n",
      "Prothrombin time (PT)\n",
      "[  nan 11.93 11.5  11.85 11.88 12.02 11.32 11.68 11.54 11.74 11.07 12.48\n",
      " 11.27 11.78 11.86 11.57 12.46 12.39 11.14 10.93 11.61 12.83 10.87 12.65\n",
      " 11.47 11.73 12.38 12.29 11.43 11.7  12.44 11.83 12.24 11.52 11.82 12.34\n",
      " 12.12 13.21 11.94 11.49 10.83 11.44 12.71 12.1  11.34 12.06 11.9  11.17\n",
      " 11.84 11.04 11.63 11.23 11.77 10.96 12.87 11.59 12.57 11.4  11.42 12.26\n",
      " 10.84 11.48 11.37 12.6  12.59 11.81 11.99 12.07 11.26 11.66 12.28 11.21\n",
      " 11.53 11.22 10.6  11.8  11.87 11.33 11.39 11.38 11.18 12.   11.55 11.65\n",
      " 11.96 12.78 11.3  11.92 10.97 11.28 11.24 11.46 12.13 11.62 11.06 11.76\n",
      " 11.36 12.7  11.2  11.6  12.73 11.98 10.86 11.29 11.67 12.18 12.55 11.79\n",
      " 12.05 11.71 11.19 12.4  11.75 12.79 11.02 11.09 11.41 13.   11.13 12.8\n",
      " 11.89 10.98 11.16 10.99 12.04 12.89 12.2  12.98 11.05 10.92 12.45 11.12\n",
      " 12.19]\n",
      "Monocytes/100 leukocytes in Blood by Automated count\n",
      "[  nan 10.06  9.52  9.73 10.04 10.92 11.15  9.18 11.04 10.54  7.97  9.82\n",
      "  9.13  9.89  8.93 10.32 10.97  8.59 10.18 10.11 10.26  9.35  8.73 11.22\n",
      " 10.56 10.52 10.3   9.56 10.76 10.07 10.12 10.79 10.36  9.42 10.13 10.29\n",
      " 11.27 10.5   9.87  9.34  9.07 11.43  9.41 10.82 10.34  9.75 10.03 11.16\n",
      " 10.55 10.64  8.96 10.17 10.94  9.39  9.8  10.2   9.78 10.45  8.36  9.1\n",
      "  9.74 11.3  11.34  9.9   9.99  9.94  9.48 10.91 11.39 10.42 10.58 10.37\n",
      " 11.73 10.68  9.58  9.86  9.79  7.91 10.02 10.22  9.71 10.81  8.79 10.41\n",
      "  9.62 11.91 10.75  9.43  9.36  9.61  9.84 11.44 10.47 10.08  9.98 10.1\n",
      "  9.01 10.71 11.25  9.29  9.88  9.7   9.32 10.31  8.92  9.4   9.95  9.93\n",
      " 10.83 11.29  9.3  11.2   8.97  9.02  9.66 10.27 10.46 11.08 10.72 10.51\n",
      "  9.44 11.13  9.91  9.22  9.72  9.92 10.39  9.83  9.24 11.17  9.64  9.12\n",
      "  8.94  8.55 10.62  8.06 10.44  9.31 10.14 11.    8.81  8.71  9.27  9.26\n",
      "  8.12 11.72 11.28 12.27  9.76  8.89  8.9   9.2  10.7  10.73  9.67 10.\n",
      " 10.25  9.25 11.1  10.43  9.6   9.14 10.78  9.53 10.74 10.57]\n",
      "Monocytes [#/volume] in Blood by Automated count\n",
      "[ nan 0.96 0.75 0.9  0.94 1.02 0.98 0.99 1.   0.8  0.93 1.03 0.87 1.05\n",
      " 0.82 0.89 1.06 0.81 0.95 0.88 1.01 0.91 0.97 0.86 0.92 1.16 1.11 1.04\n",
      " 1.08 0.85 1.09 0.76 0.84 1.07 0.78 1.1  1.12 0.83]\n",
      "scc\n",
      "[101 110 127 129  69 111  76 105 106 119 103  63  55 107 112  59  32  90\n",
      "  83 123  66 117 116  46 141  86 100 113 102 108 115 124 109 104  99 150\n",
      " 126  52  98 139 120  89 118  61 114  65  62 145 136 137  58  74  71  97\n",
      "  68  96  87 122 128  54 130  60 133  73 121 132 138  53 149  72  51  57\n",
      "  47 134 140 143  82  91 135  75  80 146 151 131  64  67 125  50  48  34\n",
      "  93  43 142 153 156  70  78  77 160 170  49  88  81 174 158  84  95  79\n",
      "  56 169  92 148 161 175 172  44  85  19  41 144  45  40 152 157 147  94\n",
      "  35 165  20 177  37 154  21 155 167 166 181 184  42  39 164 190  38  14\n",
      " 168 171   9  29 159  28]\n",
      "Neutrophils/100 leukocytes in Blood by Automated count\n",
      "[  nan 27.85 26.78 30.78 27.69 28.91 27.15 25.12 28.56 26.64 28.4  28.97\n",
      " 25.21 31.12 29.06 25.02 28.81 27.83 27.08 31.5  31.13 31.79 24.54 28.34\n",
      " 26.81 25.62 30.44 27.99 27.51 26.36 28.83 28.12 30.79 30.1  31.76 23.98\n",
      " 28.86 25.46 29.87 30.72 24.72 22.8  24.62 29.57 27.12 25.91 28.03 29.72\n",
      " 27.49 24.13 27.52 34.85 18.61 29.81 26.97 32.32 25.65 27.28 28.28 24.21\n",
      " 28.29 28.32 31.44 29.19 24.99 29.66 27.18 27.94 23.81 18.64 26.61 25.34\n",
      " 33.41 26.1  28.01 29.07 31.42 32.06 31.99 26.06 29.39 23.64 29.5  24.53\n",
      " 28.39 28.02 26.49 29.86 23.38 22.51 33.83 29.04 23.37 32.04 27.07 30.24\n",
      " 31.2  28.47 20.61 23.27 22.17 29.63 32.88 26.32 28.93 23.96 27.75 31.59\n",
      " 22.96 28.16 30.71 22.11 28.61 23.6  27.45 27.79 25.17 27.71 29.1  24.7\n",
      " 27.66 29.49 23.86 28.92 25.22 31.06 29.65 29.89 25.99 23.01 25.45 33.02\n",
      " 32.29 26.09 24.16 20.5  26.77 24.66 28.2  30.37 30.21 30.19 22.86 27.56\n",
      " 30.51 25.53 28.49 33.92 23.08 28.74 22.85 27.39 29.51 26.52 26.47 31.7\n",
      " 28.69 25.33 26.14 29.16 25.01 31.38 22.66 31.86 25.69 26.08 26.93 28.05\n",
      " 26.86 28.53 33.1  24.6  24.46 32.92 29.25 25.18 25.8  28.89 30.32 25.86\n",
      " 26.79 27.93 28.68 21.16 33.05 27.47 21.27 26.31 27.2  23.66 25.49 34.77\n",
      " 23.54 30.39 29.11 24.69 28.15 21.11 28.43 25.75 26.9  26.34 29.95 29.71\n",
      " 22.26 24.79 24.77 24.55 27.14 23.67 25.82 24.47 30.47 29.38 29.94 29.43\n",
      " 27.31 28.44 34.19 31.33 28.06 23.83 33.23 31.74 33.8  27.16 25.48]\n",
      "Lymphocytes/100 leukocytes in Blood by Automated count\n",
      "[  nan 15.09 14.29 15.67 15.57 15.17 16.88 16.07 12.23 14.82 13.94 13.37\n",
      " 15.79 15.31 12.33 14.09 16.49 14.18 15.02 13.84 12.7  12.13 14.1  14.7\n",
      " 18.52 17.23 17.09 15.84 13.36 15.46 15.12 14.21 17.22 18.47 16.57 16.59\n",
      " 15.94 14.58 16.55 13.47 13.24 13.02 17.35 13.64 17.03 14.04 14.06 13.2\n",
      " 13.61 16.52 16.46 15.4  14.67 19.64 11.5  15.62 13.91 14.4  14.31 14.03\n",
      " 16.13 11.42 15.66 14.25 14.87 12.94 16.65 13.86 13.09 15.58 15.87 13.99\n",
      " 16.34 16.38 16.18 15.22 14.26 16.44 14.78 11.04 12.66 17.01 15.95 13.74\n",
      " 15.49 17.38 15.33 14.28 18.22 13.69 12.91 19.07 15.14 13.5  16.12 13.45\n",
      " 15.47 13.92 15.68 15.63 16.22 15.89 14.46 14.72 13.41 18.5  13.22 13.62\n",
      " 17.59 15.54 14.61 18.78 14.41 15.51 14.48 14.91 11.97 17.37 13.54 10.5\n",
      " 17.25 16.99 16.48 15.21 16.54 14.53 14.49 15.37 16.79 16.2  17.07 16.43\n",
      " 15.64 14.45 14.05 17.12 14.73 15.39 17.36 17.2  15.29 17.08 14.83 14.76\n",
      " 14.92 14.44 15.76 10.61 17.75 14.23 14.02 16.98 15.81 18.05 13.89 18.39\n",
      " 15.16 20.15 15.41 11.   12.74 14.47 13.55 17.24 15.52 15.98 17.27 17.18\n",
      " 15.59 16.32 18.28 16.76 13.49 15.56 10.33 14.5  16.58 15.82 14.66 10.32\n",
      " 14.15 16.81 13.52 15.75 14.6  12.36 14.65 12.35 14.63 15.13 14.3  14.74\n",
      " 17.05 15.72 15.88 12.77 13.98 14.95 13.38 12.89 13.93 14.77 16.83 17.11\n",
      " 17.63 16.7  16.4  16.37 14.89 14.93 13.72 14.24]\n",
      "Red Blood Cell\n",
      "[nan 5.5 5.4 4.9 4.8 4.7 5.3 5.  5.7 4.6 5.8 5.2 5.9 4.5 5.6 5.1]\n",
      "Eosinophils [#/volume] in Blood by Automated count\n",
      "[ nan 0.41 0.36 0.4  0.38 0.42 0.32 0.47 0.37 0.43 0.39 0.46 0.34 0.44\n",
      " 0.45 0.48 0.35 0.33 0.3 ]\n",
      "RBC Distribution Width\n",
      "[ nan 12.4 14.4 13.6 12.1 13.5 11.8 13.2 14.5 14.3 12.2 14.6 14.2 12.8\n",
      " 13.4 12.  13.9 13.8 12.3 11.6 11.9 12.6 11.7 13.3 13.1 12.9 14.  12.5\n",
      " 14.1 12.7 13.7 13. ]\n",
      "Basophils/100 leukocytes in Blood by Automated count\n",
      "[ nan 3.2  3.05 3.01 2.9  2.95 3.09 2.89 3.16 2.99 3.11 2.57 2.84 2.96\n",
      " 3.12 2.85 2.87 2.64 2.77 3.06 3.02 3.04 2.75 2.92 2.66 3.27 2.93 3.28\n",
      " 3.13 3.1  3.22 3.14 2.94 3.21 3.19 2.83 3.43 2.76 3.3  3.23 3.   2.91\n",
      " 2.86 3.46 3.15 2.98 3.07 3.17 2.79 3.08 2.81 2.88 3.03 2.8  2.78 3.18\n",
      " 2.63 3.25 2.68 2.97 2.56 2.74 3.49 2.73 3.34 3.26 3.38 2.65 3.24 2.82\n",
      " 3.32 3.31 2.71 3.33 3.29 3.36]\n",
      "Oxygen [Partial pressure] in Arterial blood\n",
      "[  nan 51.12 47.66 46.48 46.73 50.86 46.94 48.78 45.2  49.18 51.67 45.34\n",
      " 49.25 45.85 45.67 50.47 48.48 50.48 47.68 49.42 50.88 49.85 45.5  52.16\n",
      " 49.49 52.44 47.32 51.72 49.15 50.62 49.89 50.76 48.15 47.25 47.62 52.65\n",
      " 50.46 49.28 45.59 50.74 46.21 51.1  46.07 47.76 47.8  48.39 51.85 47.72\n",
      " 49.47 48.27 51.13 48.64 47.26 49.61 48.47 44.14 47.54 48.61 50.63 44.49\n",
      " 46.43 50.29 46.88 49.34 47.93 50.36 49.38 50.79 49.99 46.85 50.82 48.83\n",
      " 47.18 50.11 50.18 48.5  46.03 49.96 48.82 49.55 48.25 47.63 51.36 48.94\n",
      " 46.91 48.05 49.56 49.94 48.97 48.12 52.19 47.19 49.6  45.09 47.38 51.84\n",
      " 47.11 47.75 49.19 50.25 48.7  48.72 49.37 48.16 48.43 45.03 47.79 50.33\n",
      " 47.89 49.95]\n",
      "pH of Arterial blood\n",
      "[ nan 7.02 7.03 7.14 7.09 6.99 7.1  7.05 7.11 7.04 7.07 7.06 6.98 7.12\n",
      " 7.15 7.01 7.   7.08 7.13 6.96 7.17 6.97]\n",
      "Neutrophils [#/volume] in Blood by Automated count\n",
      "[ nan 2.33 2.55 2.78 2.88 2.48 2.34 2.86 2.53 2.68 2.58 3.06 2.64 2.66\n",
      " 3.26 2.72 2.61 2.47 2.75 2.31 2.69 2.98 2.49 2.74 2.83 2.62 2.08 2.59\n",
      " 2.87 2.7  2.73 2.99 2.71 3.04 2.95 2.77 2.52 2.91 2.79 2.56 3.1  2.85\n",
      " 2.63 2.18 2.38 2.43 3.08 2.4  2.67 2.93 2.54 2.76 2.89 2.82 2.28 2.25\n",
      " 2.5  2.3  3.09 2.46 2.97 2.94 2.81 2.65 2.6  2.92 2.27 2.45 2.96 2.57\n",
      " 2.36 2.32 2.26 2.22 2.8  2.84 2.24 2.2  2.39 2.9  2.14 2.51 3.01 3.16\n",
      " 3.   3.23 3.24]\n",
      "Lymphocytes [#/volume] in Blood by Automated count\n",
      "[ nan 1.   1.02 0.97 0.98 1.06 1.03 1.01 0.99 0.56 1.07 0.64 0.59 1.05\n",
      " 0.61 1.09 0.58 0.63 0.96 1.04 1.08 0.52 0.6  0.57 0.53 0.65 0.55 0.5\n",
      " 0.66 0.62 0.54]\n",
      "Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction\n",
      "[   nan 237.9  230.62 254.82 234.23 247.3  249.74 235.6  233.1  257.67\n",
      " 226.88 243.15 362.44 239.37 232.82 239.5  247.96 261.87 339.76 235.82\n",
      " 363.56 230.18 252.47 246.86 353.84 263.44 363.18 247.23 246.81 248.03\n",
      " 359.35 246.96 251.64 242.28 244.4  240.56 366.12 236.07 250.07 248.62\n",
      " 226.72 250.32 232.44 322.02 237.02 245.5  249.02 232.87 352.6  350.98\n",
      " 238.16 251.   227.18 225.2  349.26 221.36 233.9  246.57 235.15 253.14\n",
      " 236.02 257.38 348.04 247.74 243.54 261.2  246.94 225.9  363.01 238.34\n",
      " 364.52 242.95 251.86 248.17 233.46 233.22 230.48 236.14 228.48 254.43\n",
      " 246.79 246.3  255.97 232.2  378.97 222.32 381.57 245.2  238.56 230.25\n",
      " 242.71 222.08 353.06 261.29 220.96 254.63 261.48 243.97 244.35 240.64\n",
      " 237.2  256.73 366.77 229.57 254.61 263.59 249.15 264.43 240.76 242.73\n",
      " 251.7  244.44 227.32 230.44 225.92 241.63 243.82 243.45 230.32 247.4\n",
      " 249.13 232.36 251.4  360.48 227.62 220.56 233.34 259.76 248.08 249.24\n",
      " 250.79 247.64 248.93 231.22 264.06 325.2  231.02 230.3  232.22 228.58\n",
      " 233.58 234.74 228.52 234.22 256.61 227.64 347.1  251.53 222.22 260.76\n",
      " 245.52 255.27 254.34 363.27 238.83 223.2  259.7  227.2  219.12 239.31\n",
      " 240.7  367.89 358.92 257.89 224.98 240.84 251.8  226.   258.06 253.47\n",
      " 243.6  248.66 228.74 337.47 228.38 251.06 228.26 241.62 231.58 242.02\n",
      " 229.   336.5  259.14 240.35 249.7  227.76 246.19 239.67 243.5  251.51\n",
      " 242.78 250.9  249.6  257.47 234.64 323.35 228.92 235.44 333.8  244.36\n",
      " 233.8  227.74 230.2  236.54 365.58 227.92 246.91 243.27 249.59 226.14\n",
      " 242.37 360.11 219.46 251.34 231.   232.9  262.3  230.24 256.   250.25\n",
      " 240.4  230.72 248.15 245.18 254.64 224.48 364.41 324.23 359.18 226.58\n",
      " 235.23 366.9  221.02 231.66 380.42 342.6  231.46 244.15 229.06 372.57\n",
      " 244.73 255.46 347.77]\n",
      "Carbon dioxide [Partial pressure] in Arterial blood\n",
      "[  nan 40.45 41.06 39.46 40.8  38.71 40.52 41.19 40.59 38.75 39.44 40.38\n",
      " 41.44 40.08 40.79 39.95 39.98 40.49 39.58 40.63 42.71 39.61 37.45 38.77\n",
      " 40.23 40.87 39.   40.41 38.6  39.26 39.23 40.77 40.55 40.01 39.83 40.68\n",
      " 40.7  39.09 40.11 41.26 41.61 40.34 41.56 41.38 41.49 39.51 39.77 40.26\n",
      " 40.74 39.02 40.64 39.15 38.81 40.14 40.56 40.19 40.51 39.38 39.5  41.22\n",
      " 40.07 39.57 38.36 39.85 40.48 40.54 40.44 39.14 38.56 40.31 42.62 40.46\n",
      " 38.83 42.07 39.74 38.98 42.94 39.89 41.8  39.88 40.36 38.96 41.69 40.04\n",
      " 38.23 40.06 40.88 39.08 39.59 40.18 40.47 40.53 40.22 40.96 40.16 39.91\n",
      " 39.49 37.46 40.24 39.06 39.79 39.41 40.82 39.78]\n",
      "Eosinophils/100 leukocytes in Blood by Automated count\n",
      "[ nan 4.45 4.4  4.59 3.74 4.35 4.61 4.46 4.51 5.05 4.73 4.72 4.65 4.15\n",
      " 4.22 4.81 4.63 4.34 4.37 4.16 4.23 4.04 3.98 4.86 4.66 4.56 4.68 4.12\n",
      " 4.83 4.25 4.92 4.75 4.32 4.58 4.11 4.71 4.41 4.21 4.17 4.89 4.19 4.43\n",
      " 4.57 4.39 4.5  4.42 4.54 3.91 4.7  4.28 4.31 4.69 4.38 4.55 4.76 4.6\n",
      " 4.78 4.29 4.74 4.98 4.93 4.88 4.33 4.94 4.24 4.49 4.48 4.36 4.47 5.08\n",
      " 4.03 4.87 4.52 4.62 4.44 4.8  4.64 5.19 4.06 3.95 4.97 4.96 4.53 4.85\n",
      " 4.08 3.87 3.92 4.18 4.84 3.84 4.27 3.99 5.06 5.24 4.82 4.79]\n",
      "Bicarbonate [Moles/volume] in Arterial blood\n",
      "[  nan 24.1  25.27 24.3  24.34 24.36 25.05 24.41 24.58 24.82 24.48 24.25\n",
      " 24.22 23.65 24.33 24.69 23.9  24.62 24.89 24.71 24.7  24.09 24.56 24.08\n",
      " 24.47 24.55 24.21 23.95 24.72 24.53 23.87 23.8  24.75 24.38 24.57 23.81\n",
      " 24.06 24.93 25.07 24.64 23.67 24.23 23.94 24.28 24.44 24.17 24.45 24.37\n",
      " 24.54 24.46 24.52 24.49 24.43 24.94 23.68 24.02 25.06 24.19 24.29 24.\n",
      " 23.78 24.84 23.85 24.67 24.91 24.63 25.14 25.37 23.5  24.13 24.61 24.81\n",
      " 24.15 24.32 24.85 23.55 25.21 23.18 24.76 24.07 24.92 25.15 24.86 24.12\n",
      " 23.7  23.71 23.82]\n",
      "Respiratory Disorders\n",
      "[ 5  1  2  6  0  4  3 13 12  8  7 11 17 16 10  9 15 14]\n",
      "Heart and Cardiovascular Diseases\n",
      "[0 3 2 5 1 4 6 7 9 8]\n",
      "Metabolic and Endocrine Disorders\n",
      "[ 4  3  6  2  1  5  9  7  8  0 10 11 12]\n",
      "Neurological Disorders\n",
      "[2 0 3 4 1 6 5 8 7 9]\n",
      "Orthopedic Injuries\n",
      "[4 0 2 3 5 1 6 7]\n",
      "Mental Health\n",
      "[11  8 10 12  7  9  6  5 13 15 14  4  3 16  0  2 17  1]\n",
      "Reproductive and Pregnancy\n",
      "[ 0  1 15 18 14 23 19 12  2 13  5 16 20 21 24 11 17  8  9 10  3  4 22  6\n",
      "  7 25 26]\n",
      "Pain Relievers and Analesics\n",
      "[3 2 1 4 0 5 7 6 8 9]\n",
      "Cardiovascular and Blood Pressure Medications\n",
      "[ 2  4  7  5  1  0  8  6  3 11  9 10 12 16 13 15 14 20]\n",
      "Injection Medications\n",
      "[ 3  1  2  4  6  7  5  8  0 10  9]\n",
      "Oral Medications\n",
      "[ 2  3  1  8  0  4  5  6  7  9 10 12]\n",
      "Other Medications\n",
      "[ 5  1  3  2  7  8  0  4 12  9 10  6 11 13 14 15]\n",
      "Therapies and Regimes\n",
      "[ 9  2  8  7  3  4  6  5 10 11 13  1 12 14 15  0]\n",
      "Diagnostic Procedures\n",
      "[ 8  7  9  5 10 14 16  1  3  6 15 13  4 19 12 18  2 11 17 20 24 21 25 23\n",
      " 22  0]\n",
      "Surgical Interventions\n",
      "[1 2 0 3 4 5 6 7]\n",
      "Patient Care Management\n",
      "[ 8 10 13 12  3  6  2  5  9  7  4 11 14 15  1 16 19  0 17]\n",
      "age_30t50\n",
      "[0 1]\n",
      "age_50t70\n",
      "[1 0]\n",
      "age_gt70\n",
      "[0 1]\n",
      "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Albumin_abnormal\n",
      "[0 1]\n",
      "Albumin_normal\n",
      "[0 1]\n",
      "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Bilirubin.total [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Bilirubin.total [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Bilirubin.total [Mass/volume] in Urine by Test strip_abnormal\n",
      "[0 1]\n",
      "Bilirubin.total [Mass/volume] in Urine by Test strip_normal\n",
      "[0 1]\n",
      "Body Mass Index_abnormal\n",
      "[1 0]\n",
      "Body Mass Index_normal\n",
      "[0 1]\n",
      "Body temperature_abnormal\n",
      "[0 1]\n",
      "Body temperature_normal\n",
      "[1 0]\n",
      "Calcium_normal\n",
      "[1 0]\n",
      "Calcium [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Carbon Dioxide_abnormal\n",
      "[0 1]\n",
      "Carbon Dioxide_normal\n",
      "[0 1]\n",
      "Chloride_abnormal\n",
      "[0 1]\n",
      "Chloride_normal\n",
      "[1 0]\n",
      "Chloride [Moles/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Chloride [Moles/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Creatinine_abnormal\n",
      "[0 1]\n",
      "Creatinine_normal\n",
      "[1 0]\n",
      "DXA [T-score] Bone density_abnormal\n",
      "[0 1]\n",
      "DXA [T-score] Bone density_normal\n",
      "[1 0]\n",
      "Diastolic Blood Pressure_abnormal\n",
      "[1 0]\n",
      "Diastolic Blood Pressure_normal\n",
      "[0 1]\n",
      "Erythrocyte distribution width [Entitic volume] by Automated count_abnormal\n",
      "[0 1]\n",
      "Erythrocyte distribution width [Entitic volume] by Automated count_normal\n",
      "[1 0]\n",
      "Erythrocyte distribution width [Ratio] by Automated count_abnormal\n",
      "[0 1]\n",
      "Erythrocyte distribution width [Ratio] by Automated count_normal\n",
      "[0 1]\n",
      "Erythrocytes [#/volume] in Blood by Automated count_abnormal\n",
      "[0 1]\n",
      "Erythrocytes [#/volume] in Blood by Automated count_normal\n",
      "[1 0]\n",
      "Ferritin [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Ferritin [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Globulin [Mass/volume] in Serum by calculation_abnormal\n",
      "[0 1]\n",
      "Globulin [Mass/volume] in Serum by calculation_normal\n",
      "[0 1]\n",
      "Glomerular filtration rate/1.73 sq M.predicted_abnormal\n",
      "[0 1]\n",
      "Glomerular filtration rate/1.73 sq M.predicted_normal\n",
      "[0 1]\n",
      "Glucose_abnormal\n",
      "[0 1]\n",
      "Glucose_normal\n",
      "[1 0]\n",
      "Glucose [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Glucose [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Glucose [Mass/volume] in Urine by Test strip_normal\n",
      "[0 1]\n",
      "Heart rate_abnormal\n",
      "[0 1]\n",
      "Heart rate_normal\n",
      "[1 0]\n",
      "Hematocrit [Volume Fraction] of Blood_abnormal\n",
      "[0 1]\n",
      "Hematocrit [Volume Fraction] of Blood_normal\n",
      "[0 1]\n",
      "Hematocrit [Volume Fraction] of Blood by Automated count_abnormal\n",
      "[0 1]\n",
      "Hematocrit [Volume Fraction] of Blood by Automated count_normal\n",
      "[1 0]\n",
      "Hemoglobin A1c/Hemoglobin.total in Blood_abnormal\n",
      "[1 0]\n",
      "Hemoglobin A1c/Hemoglobin.total in Blood_normal\n",
      "[0 1]\n",
      "Hemoglobin [Mass/volume] in Blood_abnormal\n",
      "[0 1]\n",
      "Hemoglobin [Mass/volume] in Blood_normal\n",
      "[1 0]\n",
      "Hemoglobin.gastrointestinal [Presence] in Stool by Immunologic method_abnormal\n",
      "[0 1]\n",
      "Hemoglobin.gastrointestinal [Presence] in Stool by Immunologic method_normal\n",
      "[0 1]\n",
      "High Density Lipoprotein Cholesterol_abnormal\n",
      "[1 0]\n",
      "High Density Lipoprotein Cholesterol_normal\n",
      "[0 1]\n",
      "Iron [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Iron [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Iron binding capacity [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Iron binding capacity [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Iron saturation [Mass Fraction] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Iron saturation [Mass Fraction] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Left ventricular Ejection fraction_abnormal\n",
      "[0 1]\n",
      "Left ventricular Ejection fraction_normal\n",
      "[0 1]\n",
      "Leukocytes [#/volume] in Blood by Automated count_abnormal\n",
      "[0 1]\n",
      "Leukocytes [#/volume] in Blood by Automated count_normal\n",
      "[1 0]\n",
      "Low Density Lipoprotein Cholesterol_abnormal\n",
      "[0 1]\n",
      "Low Density Lipoprotein Cholesterol_normal\n",
      "[1 0]\n",
      "MCH [Entitic mass] by Automated count_abnormal\n",
      "[0 1]\n",
      "MCH [Entitic mass] by Automated count_normal\n",
      "[1 0]\n",
      "MCHC [Mass/volume] by Automated count_normal\n",
      "[1 0]\n",
      "MCV [Entitic volume] by Automated count_normal\n",
      "[1 0]\n",
      "Magnesium [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Magnesium [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Microalbumin Creatinine Ratio_abnormal\n",
      "[0 1]\n",
      "Microalbumin Creatinine Ratio_normal\n",
      "[0 1]\n",
      "NT-proBNP_abnormal\n",
      "[0 1]\n",
      "Oxygen saturation in Arterial blood_abnormal\n",
      "[0 1]\n",
      "Pain severity - 0-10 verbal numeric rating [Score] - Reported_abnormal\n",
      "[1 0]\n",
      "Pain severity - 0-10 verbal numeric rating [Score] - Reported_normal\n",
      "[0 1]\n",
      "Platelet mean volume [Entitic volume] in Blood by Automated count_normal\n",
      "[1 0]\n",
      "Platelets [#/volume] in Blood by Automated count_abnormal\n",
      "[0 1]\n",
      "Platelets [#/volume] in Blood by Automated count_normal\n",
      "[1 0]\n",
      "Polyp size greatest dimension by CAP cancer protocols_abnormal\n",
      "[0 1]\n",
      "Polyp size greatest dimension by CAP cancer protocols_normal\n",
      "[0 1]\n",
      "Potassium_normal\n",
      "[1 0]\n",
      "Prostate specific Ag [Mass/volume] in Serum or Plasma_abnormal\n",
      "[0 1]\n",
      "Prostate specific Ag [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "Protein [Mass/volume] in Urine by Test strip_abnormal\n",
      "[0 1]\n",
      "Protein [Mass/volume] in Urine by Test strip_normal\n",
      "[0 1]\n",
      "Respiratory rate_abnormal\n",
      "[0 1]\n",
      "Respiratory rate_normal\n",
      "[1 0]\n",
      "Sodium_normal\n",
      "[1 0]\n",
      "Specific gravity of Urine by Test strip_abnormal\n",
      "[0 1]\n",
      "Systolic Blood Pressure_abnormal\n",
      "[1 0]\n",
      "Systolic Blood Pressure_normal\n",
      "[0 1]\n",
      "Total Cholesterol_abnormal\n",
      "[0 1]\n",
      "Total Cholesterol_normal\n",
      "[1 0]\n",
      "Triglycerides_abnormal\n",
      "[0 1]\n",
      "Triglycerides_normal\n",
      "[1 0]\n",
      "Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method_abnormal\n",
      "[0 1]\n",
      "Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method_normal\n",
      "[0 1]\n",
      "US Guidance for biopsy of Prostate_abnormal\n",
      "[0 1]\n",
      "US Guidance for biopsy of Prostate_normal\n",
      "[0 1]\n",
      "Urea Nitrogen_normal\n",
      "[1 0]\n",
      "Urea nitrogen [Mass/volume] in Serum or Plasma_normal\n",
      "[0 1]\n",
      "pH of Urine by Test strip_normal\n",
      "[0 1]\n",
      "HER2 [Presence] in Breast cancer specimen by FISH_negative\n",
      "[0 1]\n",
      "HER2 [Presence] in Breast cancer specimen by FISH_positive\n",
      "[0 1]\n",
      "Parainfluenza virus 1 RNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "Estrogen+Progesterone receptor Ag [Presence] in Tissue by Immune stain_negative\n",
      "[0 1]\n",
      "Estrogen+Progesterone receptor Ag [Presence] in Tissue by Immune stain_positive\n",
      "[0 1]\n",
      "Ketones [Mass/volume] in Urine by Test strip_low\n",
      "[0 1]\n",
      "Ketones [Mass/volume] in Urine by Test strip_medium\n",
      "[0 1]\n",
      "marital_m\n",
      "[1 0]\n",
      "marital_s\n",
      "[0 1]\n",
      "Abuse Status [OMAHA]_no\n",
      "[0 1]\n",
      "Abuse Status [OMAHA]_severe\n",
      "[0 1]\n",
      "Interleukin 6 [Mass/volume] in Serum or Plasma_5.33\n",
      "[0 1]\n",
      "Progesterone receptor Ag [Presence] in Breast cancer specimen by Immune stain_negative\n",
      "[0 1]\n",
      "Progesterone receptor Ag [Presence] in Breast cancer specimen by Immune stain_positive\n",
      "[0 1]\n",
      "Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "Influenza virus A RNA [Presence] in Respiratory specimen by NAA with probe detection_positive\n",
      "[0 1]\n",
      "Parainfluenza virus 2 RNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "Response to cancer treatment_improving\n",
      "[0 1]\n",
      "Response to cancer treatment_worsening\n",
      "[0 1]\n",
      "Estrogen receptor Ag [Presence] in Breast cancer specimen by Immune stain_negative\n",
      "[0 1]\n",
      "Estrogen receptor Ag [Presence] in Breast cancer specimen by Immune stain_positive\n",
      "[0 1]\n",
      "Hemoglobin [Presence] in Urine by Test strip_negative\n",
      "[0 1]\n",
      "Hemoglobin [Presence] in Urine by Test strip_positive\n",
      "[0 1]\n",
      "Housing status_homeless\n",
      "[0 1]\n",
      "Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "Influenza virus B RNA [Presence] in Respiratory specimen by NAA with probe detection_positive\n",
      "[0 1]\n",
      "Objective assessment of cardiovascular disease NYHA_minimal\n",
      "[0 1]\n",
      "Objective assessment of cardiovascular disease NYHA_mod-severe\n",
      "[0 1]\n",
      "Objective assessment of cardiovascular disease NYHA_severe\n",
      "[0 1]\n",
      "Drugs of abuse 5 panel - Urine by Screen method_negative\n",
      "[0 1]\n",
      "Drugs of abuse 5 panel - Urine by Screen method_positive\n",
      "[0 1]\n",
      "Leukocyte esterase [Presence] in Urine by Test strip_negative\n",
      "[0 1]\n",
      "Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "Protein [Presence] in Urine by Test strip_1+\n",
      "[0 1]\n",
      "Protein [Presence] in Urine by Test strip_2+\n",
      "[0 1]\n",
      "Protein [Presence] in Urine by Test strip_3+\n",
      "[0 1]\n",
      "Appearance of Urine_cloudy\n",
      "[0 1]\n",
      "Capillary refill [Time] of Nail bed_increased\n",
      "[0 1]\n",
      "Treatment status Cancer_changed\n",
      "[0 1]\n",
      "Gram positive blood culture panel by Probe in Positive blood culture_positive\n",
      "[0 1]\n",
      "Glucose [Presence] in Urine by Test strip_2+\n",
      "[0 1]\n",
      "Respiratory syncytial virus RNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "Functional capacity NYHA_classi\n",
      "[0 1]\n",
      "Functional capacity NYHA_classii\n",
      "[0 1]\n",
      "Functional capacity NYHA_classiii\n",
      "[0 1]\n",
      "Functional capacity NYHA_classiv\n",
      "[0 1]\n",
      "Color of Urine_brown\n",
      "[0 1]\n",
      "Color of Urine_reddish\n",
      "[0 1]\n",
      "Nitrite [Presence] in Urine by Test strip_negative\n",
      "[0 1]\n",
      "Tumor marker Cancer_negative\n",
      "[0 1]\n",
      "Tobacco smoking status NHIS_former\n",
      "[1 0]\n",
      "Tobacco smoking status NHIS_never\n",
      "[0 1]\n",
      "gender_f\n",
      "[0 1]\n",
      "gender_m\n",
      "[1 0]\n",
      "HIV status_negative\n",
      "[0 1]\n",
      "HIV status_positive\n",
      "[0 1]\n",
      "Are you covered by health insurance or some other kind of health care plan [PhenX]_no\n",
      "[0 1]\n",
      "Are you covered by health insurance or some other kind of health care plan [PhenX]_yes\n",
      "[0 1]\n",
      "Human metapneumovirus RNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "Ketones [Presence] in Urine by Test strip_1+\n",
      "[0 1]\n",
      "Ketones [Presence] in Urine by Test strip_2+\n",
      "[0 1]\n",
      "Ketones [Presence] in Urine by Test strip_3+\n",
      "[0 1]\n",
      "Ketones [Presence] in Urine by Test strip_trace\n",
      "[0 1]\n",
      "Clarity of Urine_cloudy\n",
      "[0 1]\n",
      "Clarity of Urine_translucent\n",
      "[0 1]\n",
      "Stage group.clinical Cancer_earlystage\n",
      "[0 1]\n",
      "Stage group.clinical Cancer_latestage\n",
      "[0 1]\n",
      "Rhinovirus RNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "HER2 [Presence] in Breast cancer specimen by Immune stain_negative\n",
      "[0 1]\n",
      "HER2 [Presence] in Breast cancer specimen by Immune stain_positive\n",
      "[0 1]\n",
      "Smokes tobacco daily_True\n",
      "[0 1]\n",
      "Parainfluenza virus 3 RNA [Presence] in Respiratory specimen by NAA with probe detection_negative\n",
      "[0 1]\n",
      "SARS-CoV-2 RNA Pnl Resp NAA+probe_False\n",
      "[0 1]\n",
      "SARS-CoV-2 RNA Pnl Resp NAA+probe_True\n",
      "[0 1]\n",
      "Influenza virus A Ag [Presence] in Nasopharynx by Rapid immunoassay_False\n",
      "[0 1]\n",
      "Influenza virus A Ag [Presence] in Nasopharynx by Rapid immunoassay_True\n",
      "[0 1]\n",
      "Influenza virus B Ag [Presence] in Nasopharynx by Rapid immunoassay_False\n",
      "[0 1]\n",
      "Influenza virus B Ag [Presence] in Nasopharynx by Rapid immunoassay_True\n",
      "[0 1]\n"
     ]
    }
   ],
   "source": [
    "# Show every column name followed by the distinct values it holds.\n",
    "for column_name in df2.columns:\n",
    "    print(column_name)\n",
    "    distinct_values = df2[column_name].unique()\n",
    "    print(distinct_values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Other NaN values will be filled with median values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill the remaining gaps in each column with that column's own median.\n",
    "column_medians = df2.median()\n",
    "df2 = df2.fillna(column_medians)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The medians are saved further below, once the final feature set is fixed, so that validation data can be filled with the same values."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Correlations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate the target column from the predictors.\n",
    "y_train = df2['label']\n",
    "X_train = df2.drop(columns='label')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Correlations between features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Absolute pairwise correlations between all predictors.\n",
    "X_corr = X_train.corr().abs()\n",
    "\n",
    "high_corr = []   # (first column, second column, |correlation|) for every flagged pair\n",
    "to_drop = set()  # second column of every flagged pair\n",
    "\n",
    "corr_values = X_corr.to_numpy()\n",
    "column_names = X_corr.columns\n",
    "n_columns = len(column_names)\n",
    "\n",
    "# Visit each unordered pair once (upper triangle, diagonal excluded).\n",
    "for row in range(n_columns):\n",
    "    for col in range(row + 1, n_columns):\n",
    "        pair_corr = corr_values[row, col]\n",
    "        if pair_corr > .98:\n",
    "            high_corr.append((column_names[row], column_names[col], pair_corr))\n",
    "            to_drop.add(column_names[col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Columns with correlation > 0.98:\n",
      "Bilirubin.total [Mass/volume] in Serum or Plasma_abnormal <-> Ferritin [Mass/volume] in Serum or Plasma_abnormal : 0.9959728153860173\n",
      "Body Mass Index_abnormal <-> Body Mass Index_normal : 0.9839742527296425\n",
      "Calcium_normal <-> Urea Nitrogen_normal : 1.0\n",
      "Calcium [Mass/volume] in Serum or Plasma_normal <-> Urea nitrogen [Mass/volume] in Serum or Plasma_normal : 1.0\n",
      "Diastolic Blood Pressure_abnormal <-> Diastolic Blood Pressure_normal : 0.9975674506121094\n",
      "Diastolic Blood Pressure_abnormal <-> Systolic Blood Pressure_abnormal : 0.9926660729264255\n",
      "Diastolic Blood Pressure_abnormal <-> Systolic Blood Pressure_normal : 0.9902281695048288\n",
      "Diastolic Blood Pressure_normal <-> Systolic Blood Pressure_abnormal : 0.9902495007996511\n",
      "Diastolic Blood Pressure_normal <-> Systolic Blood Pressure_normal : 0.9926925140325418\n",
      "Ferritin [Mass/volume] in Serum or Plasma_normal <-> Iron [Mass/volume] in Serum or Plasma_normal : 0.9889332332718642\n"
     ]
    }
   ],
   "source": [
    "print(\"Columns with correlation > 0.98:\")\n",
    "# Show at most the first ten pairs; slicing avoids an IndexError when fewer were found.\n",
    "for col_1, col_2, corr in high_corr[:10]:\n",
    "    print(col_1, \"<->\", col_2, \":\", corr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "47"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct columns flagged for removal.\n",
    "len(to_drop)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will drop one of the features if they are highly correlated."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reassign instead of dropping in place so the cell can be re-run safely;\n",
    "# errors='ignore' skips columns that an earlier run already removed.\n",
    "X_train = X_train.drop(columns=list(to_drop), errors='ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Correlations between previously created features and target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_sorted_correlations(X, cols, target):\n",
    "    \"\"\"Rank columns by the strength of their correlation with the target.\n",
    "\n",
    "    Args:\n",
    "        X: DataFrame holding the candidate columns; it is never modified.\n",
    "        cols: Iterable of column names to evaluate. Names missing from ``X``\n",
    "            are reported with a printed message and skipped.\n",
    "        target: Series to correlate each column against.\n",
    "\n",
    "    Returns:\n",
    "        List of ``(column, correlation)`` tuples, rounded to two decimals and\n",
    "        sorted by absolute correlation in descending order. Columns with at\n",
    "        most one distinct non-null value are left out.\n",
    "    \"\"\"\n",
    "    target_values = target.dropna()\n",
    "    correlations = {}\n",
    "    for col in cols:\n",
    "        if col not in X.columns:\n",
    "            print(f\"Column '{col}' not found in the DataFrame\")\n",
    "            continue\n",
    "        values = X[col]\n",
    "        if values.dropna().nunique() > 1:\n",
    "            if values.dtype == 'object':\n",
    "                # Encode into a new Series; assigning the codes back to X[col]\n",
    "                # would silently rewrite the caller's DataFrame.\n",
    "                values = values.astype('category').cat.codes\n",
    "            correlation = target_values.corr(values.dropna())\n",
    "            correlations[col] = round(correlation, 2)\n",
    "\n",
    "    # Strongest relationships first, regardless of sign.\n",
    "    return sorted(correlations.items(), key=lambda item: abs(item[1]), reverse=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column                                           Correlation with Target\n",
      "---------------------------------------------  -------------------------\n",
      "Diagnostic Procedures                                              -0.52\n",
      "Patient Care Management                                            -0.5\n",
      "Metabolic and Endocrine Disorders                                  -0.44\n",
      "Other Medications                                                  -0.44\n",
      "Injection Medications                                              -0.36\n",
      "Cardiovascular and Blood Pressure Medications                      -0.34\n",
      "Heart and Cardiovascular Diseases                                  -0.33\n",
      "Neurological Disorders                                             -0.33\n",
      "Reproductive and Pregnancy                                         -0.33\n",
      "Respiratory Disorders                                              -0.32\n",
      "Mental Health                                                      -0.28\n",
      "Pain Relievers and Analesics                                       -0.28\n",
      "Therapies and Regimes                                              -0.23\n",
      "Surgical Interventions                                             -0.23\n",
      "Orthopedic Injuries                                                -0.2\n",
      "Oral Medications                                                   -0.18\n"
     ]
    }
   ],
   "source": [
    "# Correlation with the label for every column listed in `categories`, strongest first.\n",
    "categories_correlations = get_sorted_correlations(X_train, categories, y_train)\n",
    "\n",
    "table_headers = [\"Column\", \"Correlation with Target\"]\n",
    "print(tabulate(categories_correlations, headers=table_headers, tablefmt=\"simple\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Saving medians to fill validation data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Compute the medians once and persist them so validation data can be filled with the same values.\n",
    "train_medians = X_train.median().to_dict()\n",
    "with open('medians.json', 'w') as f:\n",
    "    json.dump(train_medians, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### First modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the training features and labels as CSV files (row index omitted).\n",
    "csv_outputs = {\n",
    "    '../transformed_data/X_train.csv': X_train,\n",
    "    '../transformed_data/y_train.csv': y_train,\n",
    "}\n",
    "for csv_path, data in csv_outputs.items():\n",
    "    data.to_csv(csv_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.naive_bayes import GaussianNB, BernoulliNB\n",
    "from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier\n",
    "from sklearn.naive_bayes import GaussianNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tabulate import tabulate\n",
    "from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score\n",
    "def calculate_metrics(model, X_train, y_train, X_val, y_val):\n",
    "    \"\"\"Fit ``model`` on the training split and score it on the validation split.\n",
    "\n",
    "    The estimator is fitted in place. Returns the tuple\n",
    "    ``(accuracy, recall, precision, f1, auc, gini)`` where ``gini = 2 * auc - 1``.\n",
    "    \"\"\"\n",
    "    model.fit(X_train, y_train)\n",
    "    predictions = model.predict(X_val)\n",
    "    accuracy = accuracy_score(y_val, predictions)\n",
    "    recall = recall_score(y_val, predictions)\n",
    "    precision = precision_score(y_val, predictions)\n",
    "    f1 = f1_score(y_val, predictions)\n",
    "    # AUC needs continuous scores. Prefer positive-class probabilities and fall back\n",
    "    # to decision_function for estimators that do not expose predict_proba.\n",
    "    if hasattr(model, 'predict_proba'):\n",
    "        val_scores = model.predict_proba(X_val)[:, 1]\n",
    "    else:\n",
    "        val_scores = model.decision_function(X_val)\n",
    "    auc = roc_auc_score(y_val, val_scores)\n",
    "    gini = 2 * auc - 1\n",
    "    return accuracy, recall, precision, f1, auc, gini\n",
    "\n",
    "def print_metrics(models, X_train, y_train, X_val, y_val):\n",
    "    \"\"\"Fit every model, evaluate it on the validation split and print one table row per model.\n",
    "\n",
    "    Each row holds the estimator's class name followed by the values returned by\n",
    "    ``calculate_metrics``. Nothing is returned; the table is printed with ``tabulate``.\n",
    "    \"\"\"\n",
    "    columns = ['Model', 'Accuracy', 'Recall', 'Precision', 'F1 Score', 'AUC', 'Gini']\n",
    "    # Collect the rows first and build the DataFrame once instead of growing it row by row.\n",
    "    rows = []\n",
    "    for model in models:\n",
    "        model_scores = calculate_metrics(model, X_train, y_train, X_val, y_val)\n",
    "        rows.append((model.__class__.__name__, *model_scores))\n",
    "    results = pd.DataFrame(rows, columns=columns)\n",
    "    print(tabulate(results, headers='keys', tablefmt='simple'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate classifiers, all with default hyper-parameters except SVC.\n",
    "models = [\n",
    "    DecisionTreeClassifier(),\n",
    "    MLPClassifier(),\n",
    "    GaussianNB(),\n",
    "    GradientBoostingClassifier(),\n",
    "    AdaBoostClassifier(),\n",
    "    RandomForestClassifier(),\n",
    "    LogisticRegression(),\n",
    "    # probability=True makes SVC expose predict_proba, which the AUC/ROC helpers call.\n",
    "    SVC(probability=True),\n",
    "    KNeighborsClassifier(),\n",
    "    ExtraTreesClassifier(),\n",
    "    BaggingClassifier(),\n",
    "    BernoulliNB(),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data\n",
      "    Model                         Accuracy    Recall    Precision    F1 Score       AUC      Gini\n",
      "--  --------------------------  ----------  --------  -----------  ----------  --------  --------\n",
      " 0  DecisionTreeClassifier        1         1            1           1         1         1\n",
      " 1  MLPClassifier                 0.984527  0.940122     0.998607    0.968483  0.999251  0.998502\n",
      " 2  GaussianNB                    0.61649   0.983392     0.395987    0.564617  0.745811  0.491622\n",
      " 3  GradientBoostingClassifier    0.994584  0.983392     0.995135    0.989228  0.999602  0.999205\n",
      " 4  AdaBoostClassifier            0.992153  0.978147     0.990704    0.984385  0.999619  0.999238\n",
      " 5  RandomForestClassifier        1         1            1           1         1         1\n",
      " 6  LogisticRegression            0.982317  0.95542      0.974153    0.964695  0.996748  0.993495\n",
      " 7  SVC                           0.946729  0.789336     1           0.882267  0.963216  0.926432\n",
      " 8  KNeighborsClassifier          0.979443  0.923077     0.995287    0.957823  0.998893  0.997786\n",
      " 9  ExtraTreesClassifier          1         1            1           1         1         1\n",
      "10  BaggingClassifier             0.999005  0.996503     0.999562    0.99803   0.999998  0.999995\n",
      "11  BernoulliNB                   0.79233   0.956731     0.551524    0.699696  0.964519  0.929039\n"
     ]
    }
   ],
   "source": [
    "# The validation split passed here is the training split itself, so these are in-sample scores.\n",
    "print('Training data')\n",
    "print_metrics(models, X_train=X_train, y_train=y_train, X_val=X_train, y_val=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "from sklearn import metrics\n",
    "def roc_curve_plot(models, X_train, y_train, X_val, y_val,\n",
    "                   title=\"ROC Curve for Different Models (Training Data)\"):\n",
    "    \"\"\"Fit each model and draw its ROC curve on one shared set of axes.\n",
    "\n",
    "    Args:\n",
    "        models: Estimators exposing ``fit`` and ``predict_proba``; each is fitted in place.\n",
    "        X_train, y_train: Data used to fit every model.\n",
    "        X_val, y_val: Data the ROC curve and AUC are computed on.\n",
    "        title: Figure title; the default matches the previously hard-coded text.\n",
    "    \"\"\"\n",
    "    # A single explicit figure; the former plt.figure(0).clf() left an extra empty figure behind.\n",
    "    fig, ax = plt.subplots(figsize=(8, 8))\n",
    "    ax.set_title(title)\n",
    "    for model in models:\n",
    "        model.fit(X_train, y_train)\n",
    "        y_score = model.predict_proba(X_val)[:, 1]\n",
    "        fpr, tpr, _ = metrics.roc_curve(y_val, y_score)\n",
    "        auc = round(metrics.roc_auc_score(y_val, y_score), 4)\n",
    "        model_name = model.__class__.__name__\n",
    "        if model_name == 'Pipeline':\n",
    "            # Label a pipeline by the estimator stored in its 'model' step.\n",
    "            model_name = model['model'].__class__.__name__\n",
    "        ax.plot(fpr, tpr, label=f\"{model_name}, AUC={auc}\")\n",
    "    ax.set_xlabel(\"False Positive Rate\")\n",
    "    ax.set_ylabel(\"True Positive Rate\")\n",
    "    ax.legend(framealpha=0.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Figure size 432x288 with 0 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 576x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ROC curves evaluated on the same data the models are fitted on.\n",
    "roc_curve_plot(models, X_train=X_train, y_train=y_train, X_val=X_train, y_val=y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cross Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score, StratifiedKFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cross_validate_calculate_scores(models, X, y, scoring):\n",
    "    \"\"\"Run stratified 5-fold cross-validation for every model and metric.\n",
    "\n",
    "    Args:\n",
    "        models: Estimators to evaluate.\n",
    "        X, y: Features and labels passed to the cross-validation.\n",
    "        scoring: Dict mapping a scikit-learn scorer to the name used for it in the result.\n",
    "\n",
    "    Returns:\n",
    "        Dict keyed by ``(model class name, score name)`` holding the array of per-fold scores.\n",
    "    \"\"\"\n",
    "    from sklearn.model_selection import cross_validate\n",
    "\n",
    "    results = {}\n",
    "    # `scoring` is scorer -> name, whereas cross_validate expects name -> scorer.\n",
    "    scorers = {score_name: score for score, score_name in scoring.items()}\n",
    "    if not scorers:\n",
    "        return results\n",
    "    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
    "    for model in models:\n",
    "        model_name = model.__class__.__name__\n",
    "        # One pass over the folds scores every metric on the same fitted models,\n",
    "        # instead of re-running the whole cross-validation once per metric.\n",
    "        cv_output = cross_validate(model, X, y, cv=skf, scoring=scorers)\n",
    "        for score_name in scorers:\n",
    "            results[(model_name, score_name)] = cv_output[f'test_{score_name}']\n",
    "\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each scikit-learn scorer name doubles as the label used for it in the results.\n",
    "metric_names = ('roc_auc', 'f1', 'accuracy', 'precision', 'recall')\n",
    "scoring = {name: name for name in metric_names}\n",
    "cross_val_scores = cross_validate_calculate_scores(models, X_train, y_train, scoring)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "def plot_cross_validate_scores(cross_val_scores, scoring):\n",
    "    \"\"\"Draw one box plot per metric showing the per-fold scores of every model.\n",
    "\n",
    "    Args:\n",
    "        cross_val_scores: Dict keyed by ``(model name, metric name)`` with per-fold scores.\n",
    "        scoring: Collection of metrics; only its length is used, to size the subplot grid.\n",
    "\n",
    "    Returns:\n",
    "        Nested dict ``{metric name: {model name: scores}}`` built from ``cross_val_scores``.\n",
    "    \"\"\"\n",
    "    sns.set_palette(sns.color_palette('hls', 8))\n",
    "    num_metrics = len(scoring)\n",
    "    # squeeze=False keeps `axes` an array even for a single metric, so axes[i] always works.\n",
    "    fig, axes = plt.subplots(num_metrics, 1, figsize=(10, 3 * num_metrics),\n",
    "                             sharex=True, sharey=False, squeeze=False)\n",
    "    axes = axes[:, 0]\n",
    "\n",
    "    # Regroup the flat (model, metric) mapping by metric.\n",
    "    scores_by_metric = {}\n",
    "    for (model_name, metric_name), scores in cross_val_scores.items():\n",
    "        scores_by_metric.setdefault(metric_name, {})[model_name] = scores\n",
    "\n",
    "    for i, (metric_name, model_scores) in enumerate(scores_by_metric.items()):\n",
    "        min_score = min(min(fold_scores) for fold_scores in model_scores.values())\n",
    "        max_score = max(max(fold_scores) for fold_scores in model_scores.values())\n",
    "        sns.boxplot(data=list(model_scores.values()), ax=axes[i])\n",
    "        # Pad the y-range slightly so the extreme boxes are not clipped.\n",
    "        axes[i].set_ylim(min_score - 0.01, max_score + 0.01)\n",
    "        axes[i].set_xticklabels(list(model_scores.keys()), rotation=45, fontsize=10)\n",
    "        axes[i].set_title(metric_name)\n",
    "        axes[i].set_ylabel('Score')\n",
    "    fig.suptitle('Cross-validation scores')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "    return scores_by_metric\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 720x1080 with 5 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the fold scores and keep them regrouped by metric for the table below.\n",
    "scores_by_metric = plot_cross_validate_scores(cross_val_scores=cross_val_scores, scoring=scoring)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model                       Metric       Fold 1    Fold 2    Fold 3    Fold 4    Fold 5      Mean\n",
      "--------------------------  ---------  --------  --------  --------  --------  --------  --------\n",
      "DecisionTreeClassifier      roc_auc    0.979873  0.981687  0.971879  0.982433  0.977671  0.978709\n",
      "MLPClassifier               roc_auc    0.998912  0.99887   0.998763  0.998587  0.997632  0.998553\n",
      "GaussianNB                  roc_auc    0.752191  0.737028  0.743017  0.748565  0.743148  0.74479\n",
      "GradientBoostingClassifier  roc_auc    0.997837  0.99876   0.997455  0.996957  0.997898  0.997781\n",
      "AdaBoostClassifier          roc_auc    0.995736  0.998194  0.997287  0.997755  0.997959  0.997386\n",
      "RandomForestClassifier      roc_auc    0.998537  0.999064  0.998236  0.997711  0.996173  0.997944\n",
      "LogisticRegression          roc_auc    0.99704   0.996559  0.996226  0.995583  0.995554  0.996192\n",
      "SVC                         roc_auc    0.969975  0.963131  0.959823  0.960533  0.958628  0.962418\n",
      "KNeighborsClassifier        roc_auc    0.982652  0.978629  0.97595   0.980412  0.982138  0.979957\n",
      "ExtraTreesClassifier        roc_auc    0.997683  0.998605  0.997287  0.995945  0.9968    0.997264\n",
      "BaggingClassifier           roc_auc    0.992643  0.996265  0.993824  0.992671  0.992852  0.993651\n",
      "BernoulliNB                 roc_auc    0.964625  0.971294  0.959676  0.958243  0.9649    0.963748\n",
      "DecisionTreeClassifier      f1         0.962882  0.970936  0.965746  0.976898  0.970492  0.969391\n",
      "MLPClassifier               f1         0.967379  0.973568  0.963293  0.982301  0.969365  0.971181\n",
      "GaussianNB                  f1         0.570521  0.560549  0.563046  0.566709  0.56391   0.564947\n",
      "GradientBoostingClassifier  f1         0.976948  0.980349  0.971047  0.982301  0.981257  0.97838\n",
      "AdaBoostClassifier          f1         0.972497  0.984615  0.976796  0.983389  0.983498  0.980159\n",
      "RandomForestClassifier      f1         0.976693  0.982379  0.965129  0.978818  0.972004  0.975005\n",
      "LogisticRegression          f1         0.963374  0.95747   0.963696  0.956522  0.960265  0.960265\n",
      "SVC                         f1         0.877451  0.891041  0.878825  0.882641  0.866005  0.879193\n",
      "KNeighborsClassifier        f1         0.952273  0.937571  0.938073  0.946163  0.941176  0.943051\n",
      "ExtraTreesClassifier        f1         0.95838   0.969967  0.959459  0.96614   0.972004  0.96519\n",
      "BaggingClassifier           f1         0.971491  0.979235  0.975501  0.976744  0.984513  0.977497\n",
      "BernoulliNB                 f1         0.702962  0.707965  0.688958  0.693484  0.702746  0.699223\n",
      "DecisionTreeClassifier      accuracy   0.98011   0.982873  0.983425  0.986733  0.983969  0.983422\n",
      "MLPClassifier               accuracy   0.962431  0.988398  0.979558  0.99005   0.983416  0.980771\n",
      "GaussianNB                  accuracy   0.626519  0.61105   0.61326   0.622996  0.615257  0.617816\n",
      "GradientBoostingClassifier  accuracy   0.988398  0.990608  0.985635  0.99005   0.990603  0.989059\n",
      "AdaBoostClassifier          accuracy   0.986188  0.992265  0.988398  0.991708  0.991708  0.990053\n",
      "RandomForestClassifier      accuracy   0.986188  0.991713  0.98232   0.987839  0.987839  0.98718\n",
      "LogisticRegression          accuracy   0.981768  0.978453  0.981768  0.978441  0.9801    0.980106\n",
      "SVC                         accuracy   0.944751  0.950276  0.945304  0.946932  0.940299  0.945512\n",
      "KNeighborsClassifier        accuracy   0.976796  0.969613  0.970166  0.974019  0.971808  0.97248\n",
      "ExtraTreesClassifier        accuracy   0.981215  0.98674   0.98232   0.981758  0.98618   0.983643\n",
      "BaggingClassifier           accuracy   0.98453   0.990608  0.98011   0.988944  0.989497  0.986738\n",
      "BernoulliNB                 accuracy   0.795028  0.799448  0.779006  0.789386  0.796573  0.791888\n",
      "DecisionTreeClassifier      precision  0.958874  0.951579  0.971366  0.971678  0.977974  0.966294\n",
      "MLPClassifier               precision  0.995413  0.94926   0.969631  0.993258  0.990991  0.979711\n",
      "GaussianNB                  precision  0.40233   0.392483  0.394231  0.399284  0.395083  0.396682\n",
      "GradientBoostingClassifier  precision  0.98234   0.980349  0.990909  0.993289  0.988889  0.987155\n",
      "AdaBoostClassifier          precision  0.980044  0.99115   0.988814  0.995516  0.988938  0.988893\n",
      "RandomForestClassifier      precision  0.997727  0.993333  0.997685  0.997727  0.997706  0.996836\n",
      "LogisticRegression          precision  0.979684  0.956427  0.971175  0.975     0.96882   0.970221\n",
      "SVC                         precision  1         1         1         1         1         1\n",
      "KNeighborsClassifier        precision  0.992891  0.976359  0.987923  0.992788  0.995122  0.989017\n",
      "ExtraTreesClassifier        precision  0.995338  0.988662  0.993039  0.995349  0.997696  0.994017\n",
      "BaggingClassifier           precision  0.98234   0.973856  0.986456  0.98441   0.997758  0.984964\n",
      "BernoulliNB                 precision  0.554994  0.56051   0.535024  0.548346  0.556978  0.55117\n",
      "DecisionTreeClassifier      recall     0.960699  0.980349  0.960699  0.971554  0.962801  0.96722\n",
      "MLPClassifier               recall     0.962882  0.975983  0.973799  0.934354  0.969365  0.963277\n",
      "GaussianNB                  recall     0.980349  0.980349  0.984716  0.97593   0.984683  0.981206\n",
      "GradientBoostingClassifier  recall     0.971616  0.980349  0.951965  0.971554  0.973742  0.969845\n",
      "AdaBoostClassifier          recall     0.965066  0.978166  0.965066  0.971554  0.978118  0.971594\n",
      "RandomForestClassifier      recall     0.956332  0.969432  0.941048  0.960613  0.954048  0.956295\n",
      "LogisticRegression          recall     0.947598  0.958515  0.956332  0.938731  0.95186   0.950607\n",
      "SVC                         recall     0.781659  0.803493  0.783843  0.789934  0.763676  0.784521\n",
      "KNeighborsClassifier        recall     0.914847  0.901747  0.893013  0.90372   0.892779  0.901221\n",
      "ExtraTreesClassifier        recall     0.932314  0.958515  0.934498  0.936543  0.947484  0.941871\n",
      "BaggingClassifier           recall     0.965066  0.971616  0.947598  0.960613  0.978118  0.964602\n",
      "BernoulliNB                 recall     0.958515  0.960699  0.967249  0.943107  0.95186   0.956286\n"
     ]
    }
   ],
   "source": [
    "headers = [\"Model\", \"Metric\", \"Fold 1\", \"Fold 2\", \"Fold 3\", \"Fold 4\", \"Fold 5\", \"Mean\"]\n",
    "\n",
    "# One row per (metric, model): the individual fold scores followed by their mean.\n",
    "table_data = [\n",
    "    [classifier, metric, *scores, sum(scores) / len(scores)]\n",
    "    for metric, classifiers in scores_by_metric.items()\n",
    "    for classifier, scores in classifiers.items()\n",
    "]\n",
    "\n",
    "print(tabulate(table_data, headers=headers))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}