1278 lines (1277 with data), 46.7 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#my stuff\n",
"import icu_data_defs\n",
"import transformers\n",
"import utils\n",
"import features\n",
"from constants import column_names,variable_type,clinical_source\n",
"import units\n",
"import mimic\n",
"import logger\n",
"\n",
"#other stuff\n",
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit\n",
"from sklearn.linear_model import LinearRegression,ElasticNet\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"#make pretty pictures\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#HELPER FUNCTIONS\n",
"\n",
"def run_crossval(pipeline,X,y,random_state=42):\n",
"    \"\"\"Print cross-validated R^2 and RMSE for `pipeline` under two splitting schemes.\n",
"\n",
"    Each metric is printed as 'mean, std' over 10 rounds: first with cv=10, then\n",
"    with a 10-iteration ShuffleSplit that holds out 10% of the rows per iteration.\n",
"\n",
"    pipeline     : estimator handed to cross_val_score\n",
"    X, y         : features and target handed to cross_val_score\n",
"    random_state : seed for the ShuffleSplit. With an integer seed the R^2 pass and\n",
"                   the RMSE pass score identical partitions and reruns reproduce;\n",
"                   pass None to get unseeded splits.\n",
"\n",
"    Returns None; the scores are only printed.\n",
"    \"\"\"\n",
"    def report(title,cv):\n",
"        #both metrics are scored against the same cv specification\n",
"        scores_r2 = cross_val_score(pipeline,X,y, scoring='r2',cv=cv)\n",
"        scores_nmse = cross_val_score(pipeline,X,y, scoring='neg_mean_squared_error',cv=cv)\n",
"        #the scorer reports negated MSE, so flip the sign before taking the root\n",
"        rmse = np.sqrt(-1.0*scores_nmse)\n",
"\n",
"        #single-argument print(...) behaves the same under Python 2 and 3\n",
"        print(title)\n",
"        print('R^2: {}, {}'.format(scores_r2.mean(),scores_r2.std()))\n",
"        print('RMSE: {}, {}'.format(rmse.mean(),rmse.std()))\n",
"\n",
"    report('Cross Validation, K-Fold',10)\n",
"\n",
"    cv_shuffle = ShuffleSplit(n_splits=10,test_size=0.1,random_state=random_state)\n",
"    report('\\nCross Validation, ShuffleSplit',cv_shuffle)\n",
"\n",
"\"\"\"\n",
"Visualize data\n",
"\"\"\"\n",
"#Visualize\n",
"def viz_per_feature(df_features,df_labels):\n",
"    \"\"\"Summarize and plot every feature column against every label column.\n",
"\n",
"    For each column of df_features this prints its name and position, displays\n",
"    describe(), prints the modal value with its count and share of rows, then draws\n",
"    one figure: a histogram of the values within 3 standard deviations of the mean,\n",
"    followed by one regression plot per label column.\n",
"\n",
"    df_features : DataFrame of features; column names are indexed positionally\n",
"                  ([0], [1], [2:], [-2]) to build titles and axis labels\n",
"    df_labels   : DataFrame of labels; rows with a missing label are skipped per label\n",
"\n",
"    Returns a DataFrame (index = feature columns, columns = label columns) holding\n",
"    the Pearson correlation coefficient of each pair.\n",
"    \"\"\"\n",
"    plot_cnt = len(df_labels.columns)+1\n",
"    feature_cnt = df_features.shape[1]\n",
"\n",
"    df_corr = pd.DataFrame(index=df_features.columns,columns=df_labels.columns)\n",
"\n",
"    for feat_idx,col_name in enumerate(df_features.columns):\n",
"        #1-based progress counter so the last column reads N/N\n",
"        print('{} {}/{}'.format(col_name,feat_idx+1,feature_cnt))\n",
"        col = df_features.loc[:,col_name]\n",
"        display(col.describe().apply(lambda x: '%.4f' % x).to_frame())\n",
"\n",
"        #most common value, how often it occurs and its share of all rows\n",
"        modes = col.mode()\n",
"        if len(modes) > 0: #an all-NaN column has no mode\n",
"            mode = modes.iloc[0]\n",
"            mode_count = (col == mode).sum()\n",
"            print('MODE: {}'.format(mode))\n",
"            print(mode_count)\n",
"            print(mode_count/float(col.shape[0]))\n",
"\n",
"        #one row of axes: histogram first, then one panel per label.\n",
"        #squeeze=False keeps axarr two-dimensional even when there are no labels\n",
"        fig, axarr = plt.subplots(1,plot_cnt,figsize=(5*(plot_cnt), 5),squeeze=False)\n",
"\n",
"        # plot histogram of column, trimmed to mean +/- 3 std.\n",
"        # Bounds are inclusive so a constant column (std == 0) is not emptied.\n",
"        ax = axarr[0,0]\n",
"        std = col.std()\n",
"        mean = col.mean()\n",
"        col.loc[(col <= (mean + 3.0*std)) & (col >= (mean - 3.0*std))].hist(ax=ax)\n",
"        ax.set_title('{}_{}\\n{}'.format(col_name[0],col_name[1],col_name[2:]))\n",
"        ax.set_xlabel(col_name[-2])\n",
"        ax.set_ylabel('COUNT')\n",
"\n",
"        #plot this column vs. each label\n",
"        for label_idx,label_name in enumerate(df_labels.columns):\n",
"            y = df_labels.loc[:,label_name].dropna()\n",
"            x = col.loc[y.index]\n",
"\n",
"            #drop pairs whose feature value is missing; a single NaN would\n",
"            #otherwise turn the correlation coefficient into NaN\n",
"            keep = x.notnull().values\n",
"            x = x[keep]\n",
"            y = y[keep]\n",
"\n",
"            ax = axarr[0,1+label_idx]\n",
"            sns.regplot(x=x, y=y, ax=ax)\n",
"            corr = np.corrcoef(x, y)[0][1]\n",
"            ax.set_title('{}_{} vs. {} \\n PCC (r) = {}'.format(col_name[0],col_name[1],label_name[0],corr))\n",
"            df_corr.loc[col_name,label_name]=corr\n",
"            ax.set_xlabel(col_name[-2])\n",
"            ax.set_ylabel(label_name)\n",
"\n",
"        plt.tight_layout()\n",
"        plt.show()\n",
"        #release the figure; one is created per feature column\n",
"        plt.close(fig)\n",
"\n",
"    return df_corr\n",
" \n",
"\"\"\"\n",
"Test/train/validate split\n",
"\"\"\"\n",
"\n",
"def test_train_val_split(all_ids=None,test_size=0.1,random_state=42,print_ids=False):\n",
"    \"\"\"Split IDs into train, validate and test partitions.\n",
"\n",
"    The test partition is split off first; the validate partition is then taken\n",
"    from the remainder with a rescaled fraction so that validate and test are the\n",
"    same share (test_size) of all IDs.\n",
"\n",
"    all_ids      : IDs to split; when None they come from mimic.get_all_hadm_ids()\n",
"    test_size    : fraction of all IDs used for test (and likewise for validate)\n",
"    random_state : seed passed to both train_test_split calls\n",
"    print_ids    : when True, print each partition's percentage, size and first 5 IDs\n",
"\n",
"    Returns (train_ids, validate_ids, test_ids).\n",
"    \"\"\"\n",
"    if all_ids is None:\n",
"        all_ids = mimic.get_all_hadm_ids()\n",
"\n",
"    #validate is carved out of what is left after removing test, so its fraction\n",
"    #is expressed relative to that remainder\n",
"    validate_size = test_size/(1-test_size)\n",
"    train_share = (1-test_size)*(1-validate_size)\n",
"    validate_share = (1-test_size)*validate_size\n",
"\n",
"    #these test IDs will never be touched again. They are sacred\n",
"    train_val_ids,test_ids = train_test_split(all_ids,test_size=test_size,random_state=random_state)\n",
"    train_ids,validate_ids = train_test_split(train_val_ids,test_size=validate_size,random_state=random_state)\n",
"\n",
"    if print_ids:\n",
"        #each partition is reported with its own share of all IDs; rounding avoids\n",
"        #float truncation such as 79.999 -> 79\n",
"        summary = '{} {}: {} > {} ...'\n",
"        for name,share,ids in [('Train',train_share,train_ids),\n",
"                               ('Validate',validate_share,validate_ids),\n",
"                               ('Test',test_size,test_ids)]:\n",
"            print(summary.format(name,int(round(share*100)),len(ids),ids[:5]))\n",
"    return train_ids,validate_ids,test_ids"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Set up"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ETL"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>component</th>\n",
" <th>units</th>\n",
" <th>variable_type</th>\n",
" <th>clinical_source</th>\n",
" <th>lower</th>\n",
" <th>upper</th>\n",
" <th>list_id</th>\n",
" </tr>\n",
" <tr>\n",
" <th>def_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>heart rate</td>\n",
" <td>beats/min</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>500.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>blood pressure systolic</td>\n",
" <td>mmHg</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>500.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>blood pressure diastolic</td>\n",
" <td>mmHg</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>500.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>blood pressure mean</td>\n",
" <td>mmHg</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>500.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>respiratory rate</td>\n",
" <td>insp/min</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>150.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>temperature body</td>\n",
" <td>degF</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>150.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>oxygen saturation pulse oximetry</td>\n",
" <td>percent</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>weight body</td>\n",
" <td>kg</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>700.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>output urine</td>\n",
" <td>mL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>30000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>output urine</td>\n",
" <td>mL/hr</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>5000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>output urine</td>\n",
" <td>mL/kg/hr</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>glasgow coma scale motor</td>\n",
" <td>no_units</td>\n",
" <td>ord</td>\n",
" <td>observation</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>glasgow coma scale eye opening</td>\n",
" <td>no_units</td>\n",
" <td>ord</td>\n",
" <td>observation</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>glasgow coma scale verbal</td>\n",
" <td>no_units</td>\n",
" <td>ord</td>\n",
" <td>observation</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>normal saline</td>\n",
" <td>mL</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>30000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>normal saline</td>\n",
" <td>mL/hr</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>10000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>lactated ringers</td>\n",
" <td>mL</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>30000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>lactated ringers</td>\n",
" <td>mL/hr</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>10000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>norepinephrine</td>\n",
" <td>mcg</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>100000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>norepinephrine</td>\n",
" <td>mcg/min</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>norepinephrine</td>\n",
" <td>mcg/kg/min</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>10.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>vasopressin</td>\n",
" <td>units</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>300.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>vasopressin</td>\n",
" <td>units/min</td>\n",
" <td>qn</td>\n",
" <td>intervention</td>\n",
" <td>0.0</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>lactate</td>\n",
" <td>mmol/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>50.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>lactate</td>\n",
" <td>mg/dL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>50.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>hemoglobin</td>\n",
" <td>g/dL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>white blood cell count</td>\n",
" <td>x10e3/uL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>red blood cell count</td>\n",
" <td>x10e6/uL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>hematocrit</td>\n",
" <td>percent</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>mean corpuscular volume</td>\n",
" <td>fL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>200.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>glucose serum</td>\n",
" <td>mmol/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>500.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>glucose serum</td>\n",
" <td>mg/dL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>10000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>glucose fingerstick</td>\n",
" <td>mmol/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>500.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>glucose fingerstick</td>\n",
" <td>mg/dL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>10000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>calcium total serum</td>\n",
" <td>mmol/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>25.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>calcium total serum</td>\n",
" <td>mg/dL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50</th>\n",
" <td>calcium ionized serum</td>\n",
" <td>mmol/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>25.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>calcium ionized serum</td>\n",
" <td>mg/dL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52</th>\n",
" <td>magnesium serum</td>\n",
" <td>mg/dL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>phosphorous serum</td>\n",
" <td>mg/dL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>prothrombin time</td>\n",
" <td>seconds</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>partial thromboplastin time</td>\n",
" <td>seconds</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>international normalized ratio</td>\n",
" <td>no_units</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>partial pressure of oxygen arterial</td>\n",
" <td>mmHg</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>partial pressure of carbon dioxide arterial</td>\n",
" <td>mmHg</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>oxygen saturation arterial</td>\n",
" <td>percent</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>pH arterial</td>\n",
" <td>no_units</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>14.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>pH other</td>\n",
" <td>no_units</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>14.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>bicarbonate arterial</td>\n",
" <td>mEq/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>200.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>bicarbonate other</td>\n",
" <td>mEq/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>200.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>alanine aminotransferase serum</td>\n",
" <td>U/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>aspartate aminotransferase serum</td>\n",
" <td>U/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>alkaline phosphatase serum</td>\n",
" <td>IU/L</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>10000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>fraction of inspired oxygen</td>\n",
" <td>percent</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>fraction of inspired oxygen</td>\n",
" <td>no_units</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>positive end expiratory pressure</td>\n",
" <td>cmH2O</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>tidal volume</td>\n",
" <td>mL</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>10000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>central venous pressure</td>\n",
" <td>mm/Hg</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>500.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>central venous oxygen saturation</td>\n",
" <td>percent</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>end tidal cardon dioxide</td>\n",
" <td>mmHg</td>\n",
" <td>qn</td>\n",
" <td>observation</td>\n",
" <td>0.0</td>\n",
" <td>1000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>74 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" component units variable_type \\\n",
"def_id \n",
"0 heart rate beats/min qn \n",
"1 blood pressure systolic mmHg qn \n",
"2 blood pressure diastolic mmHg qn \n",
"3 blood pressure mean mmHg qn \n",
"4 respiratory rate insp/min qn \n",
"5 temperature body degF qn \n",
"6 oxygen saturation pulse oximetry percent qn \n",
"7 weight body kg qn \n",
"8 output urine mL qn \n",
"9 output urine mL/hr qn \n",
"10 output urine mL/kg/hr qn \n",
"11 glasgow coma scale motor no_units ord \n",
"12 glasgow coma scale eye opening no_units ord \n",
"13 glasgow coma scale verbal no_units ord \n",
"14 normal saline mL qn \n",
"15 normal saline mL/hr qn \n",
"16 lactated ringers mL qn \n",
"17 lactated ringers mL/hr qn \n",
"18 norepinephrine mcg qn \n",
"19 norepinephrine mcg/min qn \n",
"20 norepinephrine mcg/kg/min qn \n",
"21 vasopressin units qn \n",
"22 vasopressin units/min qn \n",
"23 lactate mmol/L qn \n",
"24 lactate mg/dL qn \n",
"25 hemoglobin g/dL qn \n",
"26 white blood cell count x10e3/uL qn \n",
"27 red blood cell count x10e6/uL qn \n",
"28 hematocrit percent qn \n",
"29 mean corpuscular volume fL qn \n",
"... ... ... ... \n",
"44 glucose serum mmol/L qn \n",
"45 glucose serum mg/dL qn \n",
"46 glucose fingerstick mmol/L qn \n",
"47 glucose fingerstick mg/dL qn \n",
"48 calcium total serum mmol/L qn \n",
"49 calcium total serum mg/dL qn \n",
"50 calcium ionized serum mmol/L qn \n",
"51 calcium ionized serum mg/dL qn \n",
"52 magnesium serum mg/dL qn \n",
"53 phosphorous serum mg/dL qn \n",
"54 prothrombin time seconds qn \n",
"55 partial thromboplastin time seconds qn \n",
"56 international normalized ratio no_units qn \n",
"57 partial pressure of oxygen arterial mmHg qn \n",
"58 partial pressure of carbon dioxide arterial mmHg qn \n",
"59 oxygen saturation arterial percent qn \n",
"60 pH arterial no_units qn \n",
"61 pH other no_units qn \n",
"62 bicarbonate arterial mEq/L qn \n",
"63 bicarbonate other mEq/L qn \n",
"64 alanine aminotransferase serum U/L qn \n",
"65 aspartate aminotransferase serum U/L qn \n",
"66 alkaline phosphatase serum IU/L qn \n",
"67 fraction of inspired oxygen percent qn \n",
"68 fraction of inspired oxygen no_units qn \n",
"69 positive end expiratory pressure cmH2O qn \n",
"70 tidal volume mL qn \n",
"71 central venous pressure mm/Hg qn \n",
"72 central venous oxygen saturation percent qn \n",
"73 end tidal cardon dioxide mmHg qn \n",
"\n",
" clinical_source lower upper list_id \n",
"def_id \n",
"0 observation 0.0 500.0 NaN \n",
"1 observation 0.0 500.0 NaN \n",
"2 observation 0.0 500.0 NaN \n",
"3 observation 0.0 500.0 NaN \n",
"4 observation 0.0 150.0 NaN \n",
"5 observation 0.0 150.0 NaN \n",
"6 observation 0.0 100.0 NaN \n",
"7 observation 0.0 700.0 NaN \n",
"8 observation 0.0 30000.0 NaN \n",
"9 observation 0.0 5000.0 NaN \n",
"10 observation 0.0 100.0 NaN \n",
"11 observation NaN NaN 0.0 \n",
"12 observation NaN NaN 2.0 \n",
"13 observation NaN NaN 1.0 \n",
"14 intervention 0.0 30000.0 NaN \n",
"15 intervention 0.0 10000.0 NaN \n",
"16 intervention 0.0 30000.0 NaN \n",
"17 intervention 0.0 10000.0 NaN \n",
"18 intervention 0.0 100000.0 NaN \n",
"19 intervention 0.0 100.0 NaN \n",
"20 intervention 0.0 10.0 NaN \n",
"21 intervention 0.0 300.0 NaN \n",
"22 intervention 0.0 5.0 NaN \n",
"23 observation 0.0 50.0 NaN \n",
"24 observation 0.0 50.0 NaN \n",
"25 observation 0.0 100.0 NaN \n",
"26 observation 0.0 1000.0 NaN \n",
"27 observation 0.0 1000.0 NaN \n",
"28 observation 0.0 100.0 NaN \n",
"29 observation 0.0 200.0 NaN \n",
"... ... ... ... ... \n",
"44 observation 0.0 500.0 NaN \n",
"45 observation 0.0 10000.0 NaN \n",
"46 observation 0.0 500.0 NaN \n",
"47 observation 0.0 10000.0 NaN \n",
"48 observation 0.0 25.0 NaN \n",
"49 observation 0.0 100.0 NaN \n",
"50 observation 0.0 25.0 NaN \n",
"51 observation 0.0 100.0 NaN \n",
"52 observation 0.0 100.0 NaN \n",
"53 observation 0.0 100.0 NaN \n",
"54 observation 0.0 1000.0 NaN \n",
"55 observation 0.0 1000.0 NaN \n",
"56 observation 0.0 100.0 NaN \n",
"57 observation 0.0 1000.0 NaN \n",
"58 observation 0.0 1000.0 NaN \n",
"59 observation 0.0 100.0 NaN \n",
"60 observation 0.0 14.0 NaN \n",
"61 observation 0.0 14.0 NaN \n",
"62 observation 0.0 200.0 NaN \n",
"63 observation 0.0 200.0 NaN \n",
"64 observation 0.0 100000.0 NaN \n",
"65 observation 0.0 100000.0 NaN \n",
"66 observation 0.0 10000.0 NaN \n",
"67 observation 0.0 100.0 NaN \n",
"68 observation 0.0 1.0 NaN \n",
"69 observation 0.0 1000.0 NaN \n",
"70 observation 0.0 10000.0 NaN \n",
"71 observation 0.0 500.0 NaN \n",
"72 observation 0.0 100.0 NaN \n",
"73 observation 0.0 1000.0 NaN \n",
"\n",
"[74 rows x 7 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load Our Data Dict\n",
"data_dict = icu_data_defs.data_dictionary('config/data_definitions.xlsx')\n",
"display(data_dict.get_defs())\n",
"\n",
"#init ETL Manager => mimic_extract data\n",
"etl_fname = 'data/mimic_extract.h5'\n",
"etl_manager = mimic.MimicETLManager(etl_fname,'config/mimic_item_map.csv',data_dict)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"etl_manager.etl(components=data_dict.get_components(),save_steps=True) #all components in data dictionary"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Generation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train 80: 47180 > [139698, 127590, 178959, 139276, 196600] ...\n",
"Validate 80: 5898 > [112338, 107467, 158733, 144544, 115417] ...\n",
"Test 10: 5898 > [167957, 164747, 124147, 184424, 136508] ...\n"
]
}
],
"source": [
"#seed reused by the split below and by later sampling\n",
"random_state = 42\n",
"\n",
"#partition the IDs and print a summary of each partition\n",
"train_ids,validate_ids,test_ids = test_train_val_split(\n",
"    print_ids=True,\n",
"    random_state=random_state\n",
")\n",
"\n",
"#unit registry consulted by is_summable\n",
"m_ureg = units.MedicalUreg()\n",
"\n",
"def is_summable(x):\n",
"    \"\"\"Return whether str(x) is a volume or a mass according to m_ureg.\"\"\"\n",
"    unit = str(x)\n",
"    return m_ureg.is_volume(unit) or m_ureg.is_mass(unit)\n",
"\n",
"\n",
"\"\"\"\n",
"Data Specs\n",
"\"\"\"\n",
"summable = {\n",
" column_names.VAR_TYPE : variable_type.QUANTITATIVE,\n",
" column_names.COMPONENT : lambda comp: comp not in [data_dict.components.WEIGHT_BODY],\n",
" column_names.UNITS: is_summable\n",
"}\n",
"\n",
"ordinal = {\n",
" column_names.VAR_TYPE : variable_type.ORDINAL\n",
"}\n",
"\n",
"quantitative = {\n",
" column_names.VAR_TYPE : variable_type.QUANTITATIVE\n",
"}\n",
"\n",
"nominal = {\n",
" column_names.VAR_TYPE : variable_type.NOMINAL\n",
"}\n",
"\n",
"#FEATURES\n",
"#Every featurizer below is built with resample_freq=None and names its aggregation,\n",
"#the data specs it applies to, and the transformer used to fill missing values.\n",
"#Each one gets its own Pipeline instance; none are shared between featurizers.\n",
"\n",
"#'mean' over quantitative columns; fill = GroupbyAndFFill by ID, then FillerMean\n",
"F_mean_qn = features.DataSpecsFeaturizer(\n",
"    'mean',\n",
"    resample_freq=None,\n",
"    data_specs=[quantitative],\n",
"    fillna_transformer=Pipeline([\n",
"        ('ffill',transformers.GroupbyAndFFill(level=column_names.ID)),\n",
"        ('fill_mean',transformers.FillerMean())\n",
"    ])\n",
")\n",
"\n",
"#'mean' over ordinal columns; fill = GroupbyAndFFill by ID, then FillerMode.\n",
"#The second step is labelled 'fill_mode' to match the transformer it wraps.\n",
"F_mean_ord = features.DataSpecsFeaturizer(\n",
"    'mean',\n",
"    resample_freq=None,\n",
"    data_specs=[ordinal],\n",
"    fillna_transformer=Pipeline([\n",
"        ('ffill',transformers.GroupbyAndFFill(level=column_names.ID)),\n",
"        ('fill_mode',transformers.FillerMode())\n",
"    ])\n",
")\n",
"\n",
"#'last' over ordinal and quantitative columns; fill = GroupbyAndFFill by ID, then FillerMean\n",
"#NOTE(review): ordinal columns are mean-filled here but mode-filled in F_mean_ord - confirm intended\n",
"F_last = features.DataSpecsFeaturizer(\n",
"    agg_func='last',\n",
"    resample_freq=None,\n",
"    data_specs=[ordinal,quantitative],\n",
"    fillna_transformer=Pipeline([\n",
"        ('ffill',transformers.GroupbyAndFFill(level=column_names.ID)),\n",
"        ('fill_mean',transformers.FillerMean())\n",
"    ])\n",
")\n",
"\n",
"#'std' over ordinal and quantitative columns; fill = FillerZero\n",
"F_std = features.DataSpecsFeaturizer(\n",
"    'std',\n",
"    resample_freq=None,\n",
"    data_specs=[ordinal,quantitative],\n",
"    fillna_transformer=transformers.FillerZero()\n",
")\n",
"\n",
"#'sum' over summable columns; fill = FillerZero\n",
"F_sum = features.DataSpecsFeaturizer(\n",
"    'sum',\n",
"    resample_freq=None,\n",
"    data_specs=[summable],\n",
"    fillna_transformer=transformers.FillerZero()\n",
")\n",
"\n",
"#'count' over ordinal and quantitative columns; fill = FillerZero\n",
"#NOTE(review): post_processor swaps 0 for NaN while fillna_transformer fills with zero;\n",
"#the result depends on the order DataSpecsFeaturizer applies them - confirm\n",
"F_count = features.DataSpecsFeaturizer(\n",
"    'count',\n",
"    resample_freq=None,\n",
"    data_specs=[ordinal,quantitative],\n",
"    post_processor = transformers.Replacer(0,np.nan),\n",
"    fillna_transformer=transformers.FillerZero()\n",
")\n",
"\n",
"#'sum' over nominal columns; fill = FillerZero\n",
"#NOTE(review): named as a count but aggregates with 'sum' - presumably nominal columns are indicators; confirm\n",
"F_count_nom = features.DataSpecsFeaturizer(\n",
"    'sum',\n",
"    resample_freq=None,\n",
"    data_specs=[nominal],\n",
"    fillna_transformer=transformers.FillerZero()\n",
")\n",
"\n",
"\"\"\"\n",
"LABELS\n",
"\"\"\"\n",
"qn_lactate_only={\n",
" column_names.COMPONENT : data_dict.components.LACTATE,\n",
" column_names.VAR_TYPE : variable_type.QUANTITATIVE\n",
"}\n",
"L_next_lac = features.DataSpecsFeaturizer(\n",
" agg_func='first',\n",
" resample_freq=None,\n",
" data_specs=qn_lactate_only,\n",
" post_processor=transformers.TimeShifter(column_names.DATETIME,shift='infer',n=-1)\n",
")\n",
"\n",
"L_delta_lac = features.DataSpecsFeaturizer(\n",
" agg_func='last',\n",
" resample_freq=None,\n",
" data_specs=qn_lactate_only,\n",
" post_processor=Pipeline([\n",
" ('group_by_id',transformers.ToGroupby(level=column_names.ID)),\n",
" ('delta',transformers.Delta())\n",
" ])\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Smaller Data Set"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[100014L, 100029L, 100039L, 100046L, 100052L] 9436\n"
]
}
],
"source": [
"reload(logger)\n",
"\n",
"#draw a seeded 20% sample of the training IDs and keep it as a sorted list\n",
"train_id_series = pd.Series(train_ids)\n",
"sampled_ids = train_id_series.sample(frac=0.2, random_state=random_state)\n",
"train_subset = sampled_ids.sort_values().tolist()\n",
"\n",
"print train_subset[:5], len(train_subset)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"reload(features)\n",
"#with more memory/a better processor, might not need these first 2 cleaning steps until post-processing\n",
"combine_like = Pipeline([\n",
" ('drop_small_columns',transformers.remove_small_columns(threshold=1000)),\n",
" ('drop_low_id_count',transformers.record_threshold(threshold=100)),\n",
" ('combine_like_columns',transformers.combine_like_cols())\n",
" ])\n",
"\n",
"drop_low_counts = Pipeline([\n",
" ('row_threshold',transformers.DropNaN(thresh=20)), #this threshold MAY not apply to a larger feature set.\n",
" ('drop_small_columns',transformers.remove_small_columns(threshold=1000)),\n",
" ('drop_low_id_count',transformers.record_threshold(threshold=100)) \n",
" ])\n",
"\n",
"dsf_labels = features.DataSetFactory(\n",
" featurizers=[\n",
" ('NEXT_LACTATE',L_next_lac),\n",
" ('DELTA_LACTATE',L_delta_lac)\n",
" ],\n",
" resample_freq='2H',\n",
" components=[data_dict.components.LACTATE],\n",
" etl_manager = etl_manager,\n",
" pre_processor = combine_like,\n",
" post_processor = transformers.DropNaN(thresh=1) #drop any rows that have NO labels\n",
")\n",
"\n",
"dsf_features = features.DataSetFactory(\n",
" featurizers=[\n",
" ('MEAN_QN',F_mean_qn),\n",
" ('MEAN_ORD',F_mean_ord),\n",
" ('LAST',F_last),\n",
" ('STD',F_std),\n",
" ('SUM',F_sum),\n",
" ('COUNT',F_count),\n",
" ('COUNT_NOMINAL',F_count_nom),\n",
" ],\n",
" resample_freq='2H',\n",
" components=data_dict.get_components(panel_id=12), # simple data\n",
" etl_manager = etl_manager,\n",
" pre_processor = combine_like,\n",
" post_processor = drop_low_counts\n",
"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2017-08-17 12:57:36) Make Feature Set. id_count=9436, #features=2\n",
"(2017-08-17 12:57:37)>> *fit* Filter columns (remove_small_columns) (28278, 63)\n",
"(2017-08-17 12:57:37)<< --- (0.0s)\n",
"(2017-08-17 12:57:37)>> *transform* Filter columns (remove_small_columns) (28278, 63)\n",
"(2017-08-17 12:57:37)<< --- (0.0s)\n",
"(2017-08-17 12:57:37)>> *fit* Filter columns (record_threshold) (28278, 4)\n",
"(2017-08-17 12:57:37)<< --- (0.0s)\n",
"(2017-08-17 12:57:37)>> *transform* Filter columns (record_threshold) (28278, 4)\n",
"(2017-08-17 12:57:37)<< --- (0.0s)\n",
"(2017-08-17 12:57:37)>> FIT Combine like columns (28278, 4)\n",
"(2017-08-17 12:57:37)>>>> ('lactate', 'known', 'qn', 'mmol/L')\n",
"(2017-08-17 12:57:37)<<<< --- (0.0s)\n",
"(2017-08-17 12:57:37)<< --- (0.0s)\n",
"(2017-08-17 12:57:37)>> TRANSFORM Combine like columns (28278, 4)\n",
"(2017-08-17 12:57:37)>>>> ('lactate', 'known', 'qn', 'mmol/L')\n",
"(2017-08-17 12:57:38)<<<< --- (1.0s)\n",
"(2017-08-17 12:57:38)<< --- (1.0s)\n",
"(2017-08-17 12:57:38)>> *fit* Filter columns (DataSpecFilter) (28278, 1)\n",
"(2017-08-17 12:57:38)<< --- (0.0s)\n",
"(2017-08-17 12:57:38)>> *transform* Filter columns (DataSpecFilter) (28278, 1)\n",
"(2017-08-17 12:57:38)<< --- (0.0s)\n",
"(2017-08-17 12:57:49)>> *fit* Filter columns (DataSpecFilter) (28278, 1)\n",
"(2017-08-17 12:57:49)<< --- (0.0s)\n",
"(2017-08-17 12:57:49)>> *transform* Filter columns (DataSpecFilter) (28278, 1)\n",
"(2017-08-17 12:57:49)<< --- (0.0s)\n",
"(2017-08-17 12:58:05) --- (29.0s)\n"
]
}
],
"source": [
"df_labels = dsf_labels.fit_transform(train_subset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_features = dsf_features.fit_transform(train_subset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}