[90c205]: / final notebook.ipynb

Download this file

2326 lines (2325 with data), 75.6 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "52a93d80",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5578d52a",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df=pd.read_csv('hemolysis_index_results.csv')\n",
    "cost=pd.read_csv(\"redraw_costs.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cedb75cb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>REDRAW_COUNT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>COLLECTOR_101</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>COLLECTOR_1013</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>COLLECTOR_1028</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>COLLECTOR_1029</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>COLLECTOR_1031</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1234</th>\n",
       "      <td>COLLECTOR_981</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1235</th>\n",
       "      <td>COLLECTOR_984</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1236</th>\n",
       "      <td>COLLECTOR_989</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1237</th>\n",
       "      <td>COLLECTOR_991</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1238</th>\n",
       "      <td>COLLECTOR_992</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1239 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        COLLECTOR_ID  REDRAW_COUNT\n",
       "0      COLLECTOR_101             1\n",
       "1     COLLECTOR_1013             2\n",
       "2     COLLECTOR_1028             3\n",
       "3     COLLECTOR_1029             1\n",
       "4     COLLECTOR_1031             1\n",
       "...              ...           ...\n",
       "1234   COLLECTOR_981             1\n",
       "1235   COLLECTOR_984             3\n",
       "1236   COLLECTOR_989             2\n",
       "1237   COLLECTOR_991             1\n",
       "1238   COLLECTOR_992             2\n",
       "\n",
       "[1239 rows x 2 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "redraw = df.query(\"HEMOLYSIS_INDEX > 75\")\n",
    "df3 = redraw[['COLLECTOR_ID', 'SPECIMEN_ID', 'HEMOLYSIS_INDEX']].copy()\n",
    "df3 = df3.groupby('COLLECTOR_ID')['SPECIMEN_ID'].nunique().reset_index()\n",
    "df3 = df3.rename(columns={'SPECIMEN_ID' : 'REDRAW_COUNT'})\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "634fe1c8",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ORDER_ID</th>\n",
       "      <th>ORDERABLE</th>\n",
       "      <th>PATIENT_ID</th>\n",
       "      <th>PATIENT_AGE</th>\n",
       "      <th>PATIENT_SEX</th>\n",
       "      <th>PATIENT_RACE</th>\n",
       "      <th>NURSING_UNIT</th>\n",
       "      <th>ENCOUNTER_TYPE</th>\n",
       "      <th>MEDICAL_SERVICE</th>\n",
       "      <th>SPECIMEN_ID</th>\n",
       "      <th>SPECIMEN_TYPE</th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>DRAW_SITE</th>\n",
       "      <th>COLLECTION_DAY</th>\n",
       "      <th>COLLECTION_TIME</th>\n",
       "      <th>HEMOLYSIS_INDEX</th>\n",
       "      <th>REDRAW_COST</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ORDER_1179508</td>\n",
       "      <td>Vancomycin Tr</td>\n",
       "      <td>PATIENT_131440</td>\n",
       "      <td>44</td>\n",
       "      <td>Female</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_10</td>\n",
       "      <td>Intensive Care</td>\n",
       "      <td>Medical</td>\n",
       "      <td>SPECIMEN_543979</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_2732</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6</td>\n",
       "      <td>23:21:00</td>\n",
       "      <td>6.0</td>\n",
       "      <td>162.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ORDER_1181190</td>\n",
       "      <td>Comp Met Plas</td>\n",
       "      <td>PATIENT_58856</td>\n",
       "      <td>75</td>\n",
       "      <td>Male</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_49</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>Bone Marrow Transplant</td>\n",
       "      <td>SPECIMEN_544650</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_522</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6</td>\n",
       "      <td>23:16:00</td>\n",
       "      <td>8.0</td>\n",
       "      <td>357.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ORDER_1181164</td>\n",
       "      <td>Phos Plas</td>\n",
       "      <td>PATIENT_58856</td>\n",
       "      <td>75</td>\n",
       "      <td>Male</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_49</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>Bone Marrow Transplant</td>\n",
       "      <td>SPECIMEN_544650</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_522</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6</td>\n",
       "      <td>23:16:00</td>\n",
       "      <td>8.0</td>\n",
       "      <td>357.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ORDER_1181191</td>\n",
       "      <td>Magnesium</td>\n",
       "      <td>PATIENT_58856</td>\n",
       "      <td>75</td>\n",
       "      <td>Male</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_49</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>Bone Marrow Transplant</td>\n",
       "      <td>SPECIMEN_544650</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_522</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6</td>\n",
       "      <td>23:16:00</td>\n",
       "      <td>8.0</td>\n",
       "      <td>357.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ORDER_1180687</td>\n",
       "      <td>Basic Met Plas</td>\n",
       "      <td>PATIENT_99436</td>\n",
       "      <td>40</td>\n",
       "      <td>Female</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_161</td>\n",
       "      <td>Intensive Care</td>\n",
       "      <td>Cardiology</td>\n",
       "      <td>SPECIMEN_544632</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_201</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6</td>\n",
       "      <td>23:18:00</td>\n",
       "      <td>7.0</td>\n",
       "      <td>162.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101169</th>\n",
       "      <td>ORDER_1253155</td>\n",
       "      <td>Phos Plas</td>\n",
       "      <td>PATIENT_74490</td>\n",
       "      <td>80</td>\n",
       "      <td>Male</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_45</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>Bone Marrow Transplant</td>\n",
       "      <td>SPECIMEN_577685</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_5636</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27</td>\n",
       "      <td>23:10:00</td>\n",
       "      <td>27.0</td>\n",
       "      <td>357.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101170</th>\n",
       "      <td>ORDER_1253154</td>\n",
       "      <td>Magnesium</td>\n",
       "      <td>PATIENT_74490</td>\n",
       "      <td>80</td>\n",
       "      <td>Male</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_45</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>Bone Marrow Transplant</td>\n",
       "      <td>SPECIMEN_577685</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_5636</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27</td>\n",
       "      <td>23:10:00</td>\n",
       "      <td>27.0</td>\n",
       "      <td>357.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101171</th>\n",
       "      <td>ORDER_1253157</td>\n",
       "      <td>Uric Acid</td>\n",
       "      <td>PATIENT_74490</td>\n",
       "      <td>80</td>\n",
       "      <td>Male</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_45</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>Bone Marrow Transplant</td>\n",
       "      <td>SPECIMEN_577685</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_5636</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27</td>\n",
       "      <td>23:10:00</td>\n",
       "      <td>27.0</td>\n",
       "      <td>357.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101172</th>\n",
       "      <td>ORDER_1252360</td>\n",
       "      <td>Comp Met Plas</td>\n",
       "      <td>PATIENT_132340</td>\n",
       "      <td>58</td>\n",
       "      <td>Male</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_145</td>\n",
       "      <td>Emergency</td>\n",
       "      <td>Neuro Medicine</td>\n",
       "      <td>SPECIMEN_577345</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_2490</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27</td>\n",
       "      <td>22:55:00</td>\n",
       "      <td>16.0</td>\n",
       "      <td>600.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101173</th>\n",
       "      <td>ORDER_1252361</td>\n",
       "      <td>Thyroid Cascade</td>\n",
       "      <td>PATIENT_132340</td>\n",
       "      <td>58</td>\n",
       "      <td>Male</td>\n",
       "      <td>White</td>\n",
       "      <td>UNIT_145</td>\n",
       "      <td>Emergency</td>\n",
       "      <td>Neuro Medicine</td>\n",
       "      <td>SPECIMEN_577345</td>\n",
       "      <td>Blood</td>\n",
       "      <td>COLLECTOR_2490</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27</td>\n",
       "      <td>22:55:00</td>\n",
       "      <td>16.0</td>\n",
       "      <td>600.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>101174 rows × 17 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             ORDER_ID        ORDERABLE      PATIENT_ID PATIENT_AGE  \\\n",
       "0       ORDER_1179508    Vancomycin Tr  PATIENT_131440          44   \n",
       "1       ORDER_1181190    Comp Met Plas   PATIENT_58856          75   \n",
       "2       ORDER_1181164        Phos Plas   PATIENT_58856          75   \n",
       "3       ORDER_1181191        Magnesium   PATIENT_58856          75   \n",
       "4       ORDER_1180687   Basic Met Plas   PATIENT_99436          40   \n",
       "...               ...              ...             ...         ...   \n",
       "101169  ORDER_1253155        Phos Plas   PATIENT_74490          80   \n",
       "101170  ORDER_1253154        Magnesium   PATIENT_74490          80   \n",
       "101171  ORDER_1253157        Uric Acid   PATIENT_74490          80   \n",
       "101172  ORDER_1252360    Comp Met Plas  PATIENT_132340          58   \n",
       "101173  ORDER_1252361  Thyroid Cascade  PATIENT_132340          58   \n",
       "\n",
       "       PATIENT_SEX PATIENT_RACE NURSING_UNIT  ENCOUNTER_TYPE  \\\n",
       "0           Female        White      UNIT_10  Intensive Care   \n",
       "1             Male        White      UNIT_49       Inpatient   \n",
       "2             Male        White      UNIT_49       Inpatient   \n",
       "3             Male        White      UNIT_49       Inpatient   \n",
       "4           Female        White     UNIT_161  Intensive Care   \n",
       "...            ...          ...          ...             ...   \n",
       "101169        Male        White      UNIT_45       Inpatient   \n",
       "101170        Male        White      UNIT_45       Inpatient   \n",
       "101171        Male        White      UNIT_45       Inpatient   \n",
       "101172        Male        White     UNIT_145       Emergency   \n",
       "101173        Male        White     UNIT_145       Emergency   \n",
       "\n",
       "               MEDICAL_SERVICE      SPECIMEN_ID SPECIMEN_TYPE    COLLECTOR_ID  \\\n",
       "0                      Medical  SPECIMEN_543979         Blood  COLLECTOR_2732   \n",
       "1       Bone Marrow Transplant  SPECIMEN_544650         Blood   COLLECTOR_522   \n",
       "2       Bone Marrow Transplant  SPECIMEN_544650         Blood   COLLECTOR_522   \n",
       "3       Bone Marrow Transplant  SPECIMEN_544650         Blood   COLLECTOR_522   \n",
       "4                   Cardiology  SPECIMEN_544632         Blood   COLLECTOR_201   \n",
       "...                        ...              ...           ...             ...   \n",
       "101169  Bone Marrow Transplant  SPECIMEN_577685         Blood  COLLECTOR_5636   \n",
       "101170  Bone Marrow Transplant  SPECIMEN_577685         Blood  COLLECTOR_5636   \n",
       "101171  Bone Marrow Transplant  SPECIMEN_577685         Blood  COLLECTOR_5636   \n",
       "101172          Neuro Medicine  SPECIMEN_577345         Blood  COLLECTOR_2490   \n",
       "101173          Neuro Medicine  SPECIMEN_577345         Blood  COLLECTOR_2490   \n",
       "\n",
       "       DRAW_SITE  COLLECTION_DAY COLLECTION_TIME  HEMOLYSIS_INDEX  REDRAW_COST  \n",
       "0            NaN               6        23:21:00              6.0       162.18  \n",
       "1            NaN               6        23:16:00              8.0       357.15  \n",
       "2            NaN               6        23:16:00              8.0       357.15  \n",
       "3            NaN               6        23:16:00              8.0       357.15  \n",
       "4            NaN               6        23:18:00              7.0       162.18  \n",
       "...          ...             ...             ...              ...          ...  \n",
       "101169       NaN              27        23:10:00             27.0       357.15  \n",
       "101170       NaN              27        23:10:00             27.0       357.15  \n",
       "101171       NaN              27        23:10:00             27.0       357.15  \n",
       "101172       NaN              27        22:55:00             16.0       600.00  \n",
       "101173       NaN              27        22:55:00             16.0       600.00  \n",
       "\n",
       "[101174 rows x 17 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df=df.merge(cost,how=\"left\",on=\"ENCOUNTER_TYPE\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7b1e6c42",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>SPECIMEN_COUNT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>COLLECTOR_101</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>COLLECTOR_1013</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>COLLECTOR_1028</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>COLLECTOR_1029</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>COLLECTOR_1031</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2223</th>\n",
       "      <td>COLLECTOR_991</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2224</th>\n",
       "      <td>COLLECTOR_992</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2225</th>\n",
       "      <td>COLLECTOR_993</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2226</th>\n",
       "      <td>COLLECTOR_995</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2227</th>\n",
       "      <td>COLLECTOR_999</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2228 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        COLLECTOR_ID  SPECIMEN_COUNT\n",
       "0      COLLECTOR_101               7\n",
       "1     COLLECTOR_1013              35\n",
       "2     COLLECTOR_1028              23\n",
       "3     COLLECTOR_1029              17\n",
       "4     COLLECTOR_1031              30\n",
       "...              ...             ...\n",
       "2223   COLLECTOR_991               6\n",
       "2224   COLLECTOR_992              43\n",
       "2225   COLLECTOR_993              23\n",
       "2226   COLLECTOR_995               2\n",
       "2227   COLLECTOR_999              41\n",
       "\n",
       "[2228 rows x 2 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#calculate number of specimen per collector\n",
    "df2 = df[['COLLECTOR_ID', 'SPECIMEN_ID', 'HEMOLYSIS_INDEX']].copy()\n",
    "df2 = df2.groupby('COLLECTOR_ID')['SPECIMEN_ID'].nunique().reset_index()\n",
    "df2 = df2.rename(columns={'SPECIMEN_ID' : 'SPECIMEN_COUNT'})\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "55aa964e",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>median_hi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>COLLECTOR_101</td>\n",
       "      <td>9.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>COLLECTOR_1013</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>COLLECTOR_1028</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>COLLECTOR_1029</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>COLLECTOR_1031</td>\n",
       "      <td>15.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2223</th>\n",
       "      <td>COLLECTOR_991</td>\n",
       "      <td>16.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2224</th>\n",
       "      <td>COLLECTOR_992</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2225</th>\n",
       "      <td>COLLECTOR_993</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2226</th>\n",
       "      <td>COLLECTOR_995</td>\n",
       "      <td>7.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2227</th>\n",
       "      <td>COLLECTOR_999</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2228 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        COLLECTOR_ID  median_hi\n",
       "0      COLLECTOR_101        9.0\n",
       "1     COLLECTOR_1013       12.0\n",
       "2     COLLECTOR_1028        8.0\n",
       "3     COLLECTOR_1029        5.0\n",
       "4     COLLECTOR_1031       15.5\n",
       "...              ...        ...\n",
       "2223   COLLECTOR_991       16.5\n",
       "2224   COLLECTOR_992       16.0\n",
       "2225   COLLECTOR_993       11.0\n",
       "2226   COLLECTOR_995        7.5\n",
       "2227   COLLECTOR_999       12.0\n",
       "\n",
       "[2228 rows x 2 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#%% compute median hemolysis index per collector\n",
    "def compute_median(X:pd.DataFrame)->float:\n",
    "    '''Return median hemolsysis index across a collectors samples'''\n",
    "    median=X.drop_duplicates(subset='SPECIMEN_ID')['HEMOLYSIS_INDEX'].median()\n",
    "    return(median)\n",
    "\n",
    "median_hi=df.groupby('COLLECTOR_ID').apply(lambda X: compute_median(X))\n",
    "median_hi=median_hi.reset_index().rename(columns={0:'median_hi'})\n",
    "median_hi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "adb51e58",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>ENCOUNTER_TYPE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>COLLECTOR_2732</td>\n",
       "      <td>Intensive Care</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>COLLECTOR_522</td>\n",
       "      <td>Inpatient</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>COLLECTOR_201</td>\n",
       "      <td>Intensive Care</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>COLLECTOR_3469</td>\n",
       "      <td>Emergency</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>COLLECTOR_2720</td>\n",
       "      <td>Intensive Care</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2224</th>\n",
       "      <td>COLLECTOR_4046</td>\n",
       "      <td>Inpatient</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2225</th>\n",
       "      <td>COLLECTOR_3746</td>\n",
       "      <td>Outpatient</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2226</th>\n",
       "      <td>COLLECTOR_1858</td>\n",
       "      <td>Emergency</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2227</th>\n",
       "      <td>COLLECTOR_4625</td>\n",
       "      <td>Inpatient</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2228</th>\n",
       "      <td>COLLECTOR_693</td>\n",
       "      <td>Inpatient</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2229 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        COLLECTOR_ID  ENCOUNTER_TYPE\n",
       "0     COLLECTOR_2732  Intensive Care\n",
       "1      COLLECTOR_522       Inpatient\n",
       "2      COLLECTOR_201  Intensive Care\n",
       "3     COLLECTOR_3469       Emergency\n",
       "4     COLLECTOR_2720  Intensive Care\n",
       "...              ...             ...\n",
       "2224  COLLECTOR_4046       Inpatient\n",
       "2225  COLLECTOR_3746      Outpatient\n",
       "2226  COLLECTOR_1858       Emergency\n",
       "2227  COLLECTOR_4625       Inpatient\n",
       "2228   COLLECTOR_693       Inpatient\n",
       "\n",
       "[2229 rows x 2 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#determine specialized encoutner type for each collector\n",
    "df4 = df[['COLLECTOR_ID', 'ENCOUNTER_TYPE']].drop_duplicates(subset='COLLECTOR_ID').reset_index(drop=True).copy()\n",
    "df4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "16712b4e",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SPLIT</th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>RELEVANCE</th>\n",
       "      <th>median_hi</th>\n",
       "      <th>SPECIMEN_COUNT</th>\n",
       "      <th>REDRAW_COUNT</th>\n",
       "      <th>ENCOUNTER_TYPE</th>\n",
       "      <th>REDRAW_COST</th>\n",
       "      <th>REDRAW_PERCENT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_13</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>29.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_2184</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>18.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.153846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_798</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Intensive Care</td>\n",
       "      <td>162.18</td>\n",
       "      <td>0.051282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_492</td>\n",
       "      <td>0.32436</td>\n",
       "      <td>21.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Intensive Care</td>\n",
       "      <td>162.18</td>\n",
       "      <td>0.100000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_2658</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>10.5</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1338</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_3798</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>33.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.111111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1339</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_3397</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>12.5</td>\n",
       "      <td>46.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.065217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1340</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_33</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.046512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1341</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_3001</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1342</th>\n",
       "      <td>TRAIN</td>\n",
       "      <td>COLLECTOR_4667</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1343 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      SPLIT    COLLECTOR_ID  RELEVANCE  median_hi  SPECIMEN_COUNT  \\\n",
       "0     TRAIN    COLLECTOR_13    0.00000       29.0             2.0   \n",
       "1     TRAIN  COLLECTOR_2184    0.00000       18.0            13.0   \n",
       "2     TRAIN   COLLECTOR_798    0.00000       13.0            39.0   \n",
       "3     TRAIN   COLLECTOR_492    0.32436       21.0            20.0   \n",
       "4     TRAIN  COLLECTOR_2658    0.00000       10.5            26.0   \n",
       "...     ...             ...        ...        ...             ...   \n",
       "1338  TRAIN  COLLECTOR_3798    0.00000       33.0             9.0   \n",
       "1339  TRAIN  COLLECTOR_3397    0.00000       12.5            46.0   \n",
       "1340  TRAIN    COLLECTOR_33    0.00000       13.0            43.0   \n",
       "1341  TRAIN  COLLECTOR_3001    0.00000       13.0             3.0   \n",
       "1342  TRAIN  COLLECTOR_4667    0.00000       13.0             6.0   \n",
       "\n",
       "      REDRAW_COUNT  ENCOUNTER_TYPE  REDRAW_COST  REDRAW_PERCENT  \n",
       "0              0.0       Inpatient       357.15        0.000000  \n",
       "1              2.0       Inpatient       357.15        0.153846  \n",
       "2              2.0  Intensive Care       162.18        0.051282  \n",
       "3              2.0  Intensive Care       162.18        0.100000  \n",
       "4              0.0       Inpatient       357.15        0.000000  \n",
       "...            ...             ...          ...             ...  \n",
       "1338           1.0       Inpatient       357.15        0.111111  \n",
       "1339           3.0       Inpatient       357.15        0.065217  \n",
       "1340           2.0       Inpatient       357.15        0.046512  \n",
       "1341           0.0       Inpatient       357.15        0.000000  \n",
       "1342           0.0       Inpatient       357.15        0.000000  \n",
       "\n",
       "[1343 rows x 9 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#join features with train collector ids in sample solution \n",
    "train=pd.read_csv('train.csv')\n",
    "train=train.merge(median_hi,how='left',on='COLLECTOR_ID')\n",
    "train=train.merge(df2,how='left',on='COLLECTOR_ID')\n",
    "train=train.merge(df3,how='left',on='COLLECTOR_ID')\n",
    "train=train.merge(df4,how='left',on='COLLECTOR_ID')\n",
    "train=train.merge(cost,how='left',on='ENCOUNTER_TYPE')\n",
    "train['REDRAW_COUNT'] = train['REDRAW_COUNT'].fillna(0)\n",
    "train['REDRAW_PERCENT'] = train['REDRAW_COUNT'] / train['SPECIMEN_COUNT']\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "775a5158",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RELEVANCE</th>\n",
       "      <th>median_hi</th>\n",
       "      <th>REDRAW_COUNT</th>\n",
       "      <th>ENCOUNTER_TYPE</th>\n",
       "      <th>REDRAW_COST</th>\n",
       "      <th>REDRAW_PERCENT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.153846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Intensive Care</td>\n",
       "      <td>162.18</td>\n",
       "      <td>0.051282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.32436</td>\n",
       "      <td>21.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Intensive Care</td>\n",
       "      <td>162.18</td>\n",
       "      <td>0.100000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>10.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1338</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.111111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1339</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>12.5</td>\n",
       "      <td>3.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.065217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1340</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.046512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1341</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1342</th>\n",
       "      <td>0.00000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1343 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      RELEVANCE  median_hi  REDRAW_COUNT  ENCOUNTER_TYPE  REDRAW_COST  \\\n",
       "0       0.00000       29.0           0.0       Inpatient       357.15   \n",
       "1       0.00000       18.0           2.0       Inpatient       357.15   \n",
       "2       0.00000       13.0           2.0  Intensive Care       162.18   \n",
       "3       0.32436       21.0           2.0  Intensive Care       162.18   \n",
       "4       0.00000       10.5           0.0       Inpatient       357.15   \n",
       "...         ...        ...           ...             ...          ...   \n",
       "1338    0.00000       33.0           1.0       Inpatient       357.15   \n",
       "1339    0.00000       12.5           3.0       Inpatient       357.15   \n",
       "1340    0.00000       13.0           2.0       Inpatient       357.15   \n",
       "1341    0.00000       13.0           0.0       Inpatient       357.15   \n",
       "1342    0.00000       13.0           0.0       Inpatient       357.15   \n",
       "\n",
       "      REDRAW_PERCENT  \n",
       "0           0.000000  \n",
       "1           0.153846  \n",
       "2           0.051282  \n",
       "3           0.100000  \n",
       "4           0.000000  \n",
       "...              ...  \n",
       "1338        0.111111  \n",
       "1339        0.065217  \n",
       "1340        0.046512  \n",
       "1341        0.000000  \n",
       "1342        0.000000  \n",
       "\n",
       "[1343 rows x 6 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#remove redundant columns\n",
    "train = train.drop(['SPLIT', 'SPECIMEN_COUNT', 'COLLECTOR_ID'], axis = 1)\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "90827e7b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RELEVANCE</th>\n",
       "      <th>median_hi</th>\n",
       "      <th>REDRAW_COUNT</th>\n",
       "      <th>REDRAW_COST</th>\n",
       "      <th>REDRAW_PERCENT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.153846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>10.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.912717</td>\n",
       "      <td>10.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.045455</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1338</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.111111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1339</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>12.5</td>\n",
       "      <td>3.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.065217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1340</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.046512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1341</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1342</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>13.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>937 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      RELEVANCE  median_hi  REDRAW_COUNT  REDRAW_COST  REDRAW_PERCENT\n",
       "0      0.000000       29.0           0.0       357.15        0.000000\n",
       "1      0.000000       18.0           2.0       357.15        0.153846\n",
       "4      0.000000       10.5           0.0       357.15        0.000000\n",
       "7      0.912717       10.0           1.0       357.15        0.045455\n",
       "8      0.000000       10.0           0.0       357.15        0.000000\n",
       "...         ...        ...           ...          ...             ...\n",
       "1338   0.000000       33.0           1.0       357.15        0.111111\n",
       "1339   0.000000       12.5           3.0       357.15        0.065217\n",
       "1340   0.000000       13.0           2.0       357.15        0.046512\n",
       "1341   0.000000       13.0           0.0       357.15        0.000000\n",
       "1342   0.000000       13.0           0.0       357.15        0.000000\n",
       "\n",
       "[937 rows x 5 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#filter for only Emergency and Inpatient collectors\n",
    "emergency = train.loc[(train['ENCOUNTER_TYPE'] == \"Emergency\") | (train['ENCOUNTER_TYPE'] == \"Inpatient\")]\n",
    "emergency = emergency.drop('ENCOUNTER_TYPE', axis=1)\n",
    "emergency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "613d0944",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9e8eb69c",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = emergency.drop('RELEVANCE', axis=1)\n",
    "y = emergency['RELEVANCE']\n",
    "\n",
    "#Select only nonzero relevance scores to train on\n",
    "X=X[y>0]\n",
    "y=y[y>0]\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e3153ef9",
   "metadata": {},
   "outputs": [],
   "source": [
    "dtrain = xgb.DMatrix(X_train, label=y_train)\n",
    "dtest = xgb.DMatrix(X_test, label=y_test)\n",
    "params = {\n",
    "    \"max_depth\": 15,\n",
    "    \"eta\": 0.3,\n",
    "    \"subsample\": 1.0,\n",
    "    \"colsample_bytree\": 1.0,\n",
    "    \"learning_rate\": 0.35,\n",
    "    \"objective\": \"reg:squarederror\",\n",
    "    \"eval_metric\": \"rmse\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d951fb6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_round = 100\n",
    "bst = xgb.train(params, dtrain, num_round)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9a096d9f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 3.329161021152597\n"
     ]
    }
   ],
   "source": [
    "preds = bst.predict(dtest)\n",
    "rmse = mean_squared_error(y_test, preds, squared=False)\n",
    "print(\"RMSE:\", rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "d0f696f3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SPLIT</th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>median_hi</th>\n",
       "      <th>SPECIMEN_COUNT</th>\n",
       "      <th>REDRAW_COUNT</th>\n",
       "      <th>ENCOUNTER_TYPE</th>\n",
       "      <th>REDRAW_COST</th>\n",
       "      <th>REDRAW_PERCENT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_2011</td>\n",
       "      <td>32.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Emergency</td>\n",
       "      <td>600.00</td>\n",
       "      <td>0.235294</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_1550</td>\n",
       "      <td>50.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_4570</td>\n",
       "      <td>8.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Intensive Care</td>\n",
       "      <td>162.18</td>\n",
       "      <td>0.062500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_3531</td>\n",
       "      <td>9.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_5020</td>\n",
       "      <td>31.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>Emergency</td>\n",
       "      <td>600.00</td>\n",
       "      <td>0.111111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_2848</td>\n",
       "      <td>14.0</td>\n",
       "      <td>31.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.032258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>891</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_1501</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>892</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_860</td>\n",
       "      <td>72.0</td>\n",
       "      <td>48.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.479167</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>893</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_3200</td>\n",
       "      <td>30.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Inpatient</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>894</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_4788</td>\n",
       "      <td>12.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Intensive Care</td>\n",
       "      <td>162.18</td>\n",
       "      <td>0.250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>895 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    SPLIT    COLLECTOR_ID  median_hi  SPECIMEN_COUNT  REDRAW_COUNT  \\\n",
       "0    TEST  COLLECTOR_2011       32.0            17.0           4.0   \n",
       "1    TEST  COLLECTOR_1550       50.0             9.0           3.0   \n",
       "2    TEST  COLLECTOR_4570        8.0            32.0           2.0   \n",
       "3    TEST  COLLECTOR_3531        9.0             7.0           0.0   \n",
       "4    TEST  COLLECTOR_5020       31.0            18.0           2.0   \n",
       "..    ...             ...        ...             ...           ...   \n",
       "890  TEST  COLLECTOR_2848       14.0            31.0           1.0   \n",
       "891  TEST  COLLECTOR_1501        1.0             1.0           0.0   \n",
       "892  TEST   COLLECTOR_860       72.0            48.0          23.0   \n",
       "893  TEST  COLLECTOR_3200       30.0             3.0           1.0   \n",
       "894  TEST  COLLECTOR_4788       12.0             4.0           1.0   \n",
       "\n",
       "     ENCOUNTER_TYPE  REDRAW_COST  REDRAW_PERCENT  \n",
       "0         Emergency       600.00        0.235294  \n",
       "1         Inpatient       357.15        0.333333  \n",
       "2    Intensive Care       162.18        0.062500  \n",
       "3         Inpatient       357.15        0.000000  \n",
       "4         Emergency       600.00        0.111111  \n",
       "..              ...          ...             ...  \n",
       "890       Inpatient       357.15        0.032258  \n",
       "891       Inpatient       357.15        0.000000  \n",
       "892       Inpatient       357.15        0.479167  \n",
       "893       Inpatient       357.15        0.333333  \n",
       "894  Intensive Care       162.18        0.250000  \n",
       "\n",
       "[895 rows x 8 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#calculate features for test set\n",
    "test=pd.read_csv('test.csv')\n",
    "test=test.merge(median_hi,how='left',on='COLLECTOR_ID')\n",
    "test=test.merge(df2,how='left',on='COLLECTOR_ID')\n",
    "test=test.merge(df3,how='left',on='COLLECTOR_ID')\n",
    "test=test.merge(df4,how='left',on='COLLECTOR_ID')\n",
    "test=test.merge(cost,how='left',on='ENCOUNTER_TYPE')\n",
    "test['REDRAW_COUNT'] = test['REDRAW_COUNT'].fillna(0)\n",
    "test['REDRAW_PERCENT'] = test['REDRAW_COUNT'] / test['SPECIMEN_COUNT']\n",
    "test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "61f0a30f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>median_hi</th>\n",
       "      <th>REDRAW_COUNT</th>\n",
       "      <th>REDRAW_COST</th>\n",
       "      <th>REDRAW_PERCENT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>32.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>600.00</td>\n",
       "      <td>0.235294</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>31.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>600.00</td>\n",
       "      <td>0.111111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>12.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.019231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>14.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.032258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>891</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>892</th>\n",
       "      <td>72.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.479167</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>893</th>\n",
       "      <td>30.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.333333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>635 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     median_hi  REDRAW_COUNT  REDRAW_COST  REDRAW_PERCENT\n",
       "0         32.0           4.0       600.00        0.235294\n",
       "1         50.0           3.0       357.15        0.333333\n",
       "3          9.0           0.0       357.15        0.000000\n",
       "4         31.0           2.0       600.00        0.111111\n",
       "6         16.0           0.0       357.15        0.000000\n",
       "..         ...           ...          ...             ...\n",
       "888       12.5           1.0       357.15        0.019231\n",
       "890       14.0           1.0       357.15        0.032258\n",
       "891        1.0           0.0       357.15        0.000000\n",
       "892       72.0          23.0       357.15        0.479167\n",
       "893       30.0           1.0       357.15        0.333333\n",
       "\n",
       "[635 rows x 4 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = test.drop(['SPLIT', 'SPECIMEN_COUNT', 'COLLECTOR_ID'], axis = 1)\n",
    "test = test.loc[(test['ENCOUNTER_TYPE'] == \"Emergency\") | (test['ENCOUNTER_TYPE'] == \"Inpatient\")]\n",
    "test = test.drop('ENCOUNTER_TYPE', axis=1)\n",
    "test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "572def32",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>median_hi</th>\n",
       "      <th>REDRAW_COUNT</th>\n",
       "      <th>REDRAW_COST</th>\n",
       "      <th>REDRAW_PERCENT</th>\n",
       "      <th>pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>32.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>600.00</td>\n",
       "      <td>0.235294</td>\n",
       "      <td>1.953370</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>2.510889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.304316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>31.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>600.00</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>1.311873</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.332751</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>12.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.019231</td>\n",
       "      <td>1.501695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>14.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.032258</td>\n",
       "      <td>1.442110</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>891</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.356631</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>892</th>\n",
       "      <td>72.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.479167</td>\n",
       "      <td>0.846387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>893</th>\n",
       "      <td>30.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>357.15</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.144415</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>635 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     median_hi  REDRAW_COUNT  REDRAW_COST  REDRAW_PERCENT      pred\n",
       "0         32.0           4.0       600.00        0.235294  1.953370\n",
       "1         50.0           3.0       357.15        0.333333  2.510889\n",
       "3          9.0           0.0       357.15        0.000000  0.304316\n",
       "4         31.0           2.0       600.00        0.111111  1.311873\n",
       "6         16.0           0.0       357.15        0.000000  0.332751\n",
       "..         ...           ...          ...             ...       ...\n",
       "888       12.5           1.0       357.15        0.019231  1.501695\n",
       "890       14.0           1.0       357.15        0.032258  1.442110\n",
       "891        1.0           0.0       357.15        0.000000  0.356631\n",
       "892       72.0          23.0       357.15        0.479167  0.846387\n",
       "893       30.0           1.0       357.15        0.333333  1.144415\n",
       "\n",
       "[635 rows x 5 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dtestset = xgb.DMatrix(test)\n",
    "predictions = bst.predict(dtestset)\n",
    "test[\"pred\"]=predictions\n",
    "test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fe3e4e03",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SPLIT</th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_2011</td>\n",
       "      <td>1.953370</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_1550</td>\n",
       "      <td>2.510889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_4570</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_3531</td>\n",
       "      <td>0.304316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_5020</td>\n",
       "      <td>1.311873</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_2848</td>\n",
       "      <td>1.442110</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>891</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_1501</td>\n",
       "      <td>0.356631</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>892</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_860</td>\n",
       "      <td>0.846387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>893</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_3200</td>\n",
       "      <td>1.144415</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>894</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_4788</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>895 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    SPLIT    COLLECTOR_ID      pred\n",
       "0    TEST  COLLECTOR_2011  1.953370\n",
       "1    TEST  COLLECTOR_1550  2.510889\n",
       "2    TEST  COLLECTOR_4570       NaN\n",
       "3    TEST  COLLECTOR_3531  0.304316\n",
       "4    TEST  COLLECTOR_5020  1.311873\n",
       "..    ...             ...       ...\n",
       "890  TEST  COLLECTOR_2848  1.442110\n",
       "891  TEST  COLLECTOR_1501  0.356631\n",
       "892  TEST   COLLECTOR_860  0.846387\n",
       "893  TEST  COLLECTOR_3200  1.144415\n",
       "894  TEST  COLLECTOR_4788       NaN\n",
       "\n",
       "[895 rows x 3 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub=pd.read_csv('sample_solution.csv')\n",
    "sub = sub.merge(test['pred'], how='left', left_index=True, right_index=True)\n",
    "sub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "bed71081",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SPLIT</th>\n",
       "      <th>COLLECTOR_ID</th>\n",
       "      <th>pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>183</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_5162</td>\n",
       "      <td>36.953739</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>163</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_2973</td>\n",
       "      <td>36.892879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>850</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_2028</td>\n",
       "      <td>13.891359</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_1491</td>\n",
       "      <td>11.083419</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>786</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_1460</td>\n",
       "      <td>6.416948</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>882</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_3097</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>883</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_310</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>884</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_5283</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_418</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>894</th>\n",
       "      <td>TEST</td>\n",
       "      <td>COLLECTOR_4788</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>895 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    SPLIT    COLLECTOR_ID       pred\n",
       "183  TEST  COLLECTOR_5162  36.953739\n",
       "163  TEST  COLLECTOR_2973  36.892879\n",
       "850  TEST  COLLECTOR_2028  13.891359\n",
       "76   TEST  COLLECTOR_1491  11.083419\n",
       "786  TEST  COLLECTOR_1460   6.416948\n",
       "..    ...             ...        ...\n",
       "882  TEST  COLLECTOR_3097        NaN\n",
       "883  TEST   COLLECTOR_310        NaN\n",
       "884  TEST  COLLECTOR_5283        NaN\n",
       "889  TEST   COLLECTOR_418        NaN\n",
       "894  TEST  COLLECTOR_4788        NaN\n",
       "\n",
       "[895 rows x 3 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub=sub.sort_values(by=\"pred\",ascending=False)\n",
    "sub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "05ea00b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "sub[['SPLIT','COLLECTOR_ID']].to_csv('mlsub5.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34c106ff",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}