EHRKit-2022 / Git / [2d4573] /mimic_icd9_coding/utils/mimic_data

Models:
philipB/
EHRKit-2022
Downloads: 1
[2d4573]: / mimic_icd9_coding / utils / mimic_data_preparation.py
History
Download this file
167 lines (129 with data), 6.8 kB

# %% [markdown]
# # Data preparation for automated text classification for ICD 9 diagnosis code assignment from MIMIC Database
#%%
# Data prep code from: https://github.com/IM-APHP/mimic_icd9_code_assignment
# %%
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# %% [markdown]
# Load MIMIC tables NOTEEVENTS and DIAGNOSES_ICD

def run_mimic_prep(train_split=False, mimic_data_path='../data/mimic_data', output_folder ='../data', output_name='mimic_full.csv', code_mincount=1000, return_df=False, verbose=False):
    if code_mincount != 1000:
        rewrite=True
    # %%
    NOTEEVENTS=pd.read_csv(mimic_data_path + 'NOTEEVENTS.csv',dtype={'ROW_ID':np.int32, 'SUBJECT_ID': np.int32,'HADM_ID': np.float64,
                                        'CHARTDATE':str,'STORETIME':str,'CHARTTIME':str,
                                        'STORETIME': str,'CATEGORY': str,'DESCRIPTION':str,'CGID':str,'ISERROR':str,
                                            'TEXT':str}, parse_dates=['CHARTDATE'])
    DIAGNOSES_ICD=pd.read_csv(mimic_data_path + 'DIAGNOSES_ICD.csv',dtype={'ROW_ID':np.int32, 'SUBJECT_ID': np.int32,'HADM_ID': np.int32,
                                                'SEQ_NUM': np.float64, 'ICD9_CODE':str})

    # %% [markdown]
    # ## Explore NOTEEVENTS
    # NOTEEVENTS.groupby('CATEGORY').count()

    # %% [markdown]
    # ## Explore DIAGNOSES_ICD

    # %%
    DIAGNOSES_ICD['ICD9_CODE']=DIAGNOSES_ICD['ICD9_CODE'].str.pad(4,'left','0')
    DIAGNOSES_ICD['ICD9_CHAP']=DIAGNOSES_ICD['ICD9_CODE'].str.slice(0,3)
    # DIAGNOSES_ICD.count()

    ##Explore by the first character
    # DIAGNOSES_ICD.groupby(DIAGNOSES_ICD['ICD9_CODE'].str.slice(0,1))['HADM_ID'].count()

    # %% [markdown]
    # Codes from V,E,U,8,9 can be exclude in a purpose of facturation as they are not take in count for the calculus of hospitalization fees.

    # %%
    # DIAGNOSES_ICD=DIAGNOSES_ICD[~DIAGNOSES_ICD['ICD9_CODE'].str.slice(0,1).isin(['V','E','U','8','9'])]

    # %% [markdown]
    # ## Exploration of diagnoses to choose the perfect y
    # ### Selection of the most frequent codes

    # %%
    a=DIAGNOSES_ICD.groupby('ICD9_CODE')['HADM_ID'].count().sort_values(ascending=False)
    # %% [markdown]
    # Due to dispersion of the distribution and the low frequency of some code, for the machine learning task a selection of the code and their chapter will be made

    # %%
    if verbose:
        print('Nb codes > 1000 occurences= '+str(len(a[a>1000]))+
            '  \nNb codes 1000-100 occurences = ' +str(len(a[(a<1000)&(a>100)]))
            +'  \nNb codes <100 occurences = ' +str(len(a[a<100])) )


    # %%
    a=DIAGNOSES_ICD.groupby('ICD9_CODE')[ 'HADM_ID'].count()
    DIAGNOSES_ICD_freq=DIAGNOSES_ICD[DIAGNOSES_ICD['ICD9_CODE'].isin(a[a>code_mincount].keys())]
    df=DIAGNOSES_ICD_freq.groupby('HADM_ID')['ICD9_CODE'].apply(lambda x: "['%s']" %"','".join(x))
    df=df.apply(lambda x : eval(x))
    DIAGNOSES_ICD_freq=pd.DataFrame(df)
    DIAGNOSES_ICD_freq['HADM_ID']=df.keys()
    # DIAGNOSES_ICD_freq.head()

    # %% [markdown]
    # Select the visits with the most frequent codes

    # %%
    if verbose:
        print('Nb of stays selected = '+str(DIAGNOSES_ICD_freq['HADM_ID'].nunique())+
            '  \nNb of different codes = ' +str(len(a[a>code_mincount])))

    # %% [markdown]
    # ### Selection of the most frequent chapters

    # %%
    a=DIAGNOSES_ICD.groupby('ICD9_CHAP')[ 'HADM_ID'].count().sort_values(ascending=False)


    # %%
    if verbose:
        print('Nb chapter codes > {} occurences= '.format(code_mincount)+str(len(a[a>code_mincount]))+
            '  \nNb chapter codes 1000-100 occurences = ' +str(len(a[(a<1000)&(a>100)]))
            +'  \nNb chapter codes <100 occurences = ' +str(len(a[a<100])) )


    # %%
    #a=DIAGNOSES_ICD[~DIAGNOSES_ICD['ICD9_CODE'].isin(DIAGNOSES_ICD_freq['ICD9_CODE'])].groupby('ICD9_CHAP')[ 'HADM_ID'].count()
    a=DIAGNOSES_ICD.groupby('ICD9_CHAP')['HADM_ID'].count()
    DIAGNOSES_ICD_chap_freq=DIAGNOSES_ICD[DIAGNOSES_ICD['ICD9_CHAP'].isin(a[a>code_mincount].keys())]

    # %% [markdown]
    # Selection of stays with the most frequent codes

    # %%
    if verbose:
        print('Number of stays selected = '+str(DIAGNOSES_ICD_chap_freq['HADM_ID'].nunique())+
            '  \nNb of different chapters = ' +str(DIAGNOSES_ICD_chap_freq['ICD9_CHAP'].nunique()))


    # %%
    # DIAGNOSES_ICD_chap_freq.groupby('ICD9_CHAP')['HADM_ID'].count().sort_values(ascending=False).plot(kind='bar',figsize= (17, 10))


    # %%
    df=DIAGNOSES_ICD_chap_freq.groupby('HADM_ID')['ICD9_CHAP'].apply(lambda x: "['%s']" %"','".join(x))
    df=df.apply(lambda x : eval(x))
    DIAGNOSES_ICD_chap_freq=pd.DataFrame(df)
    DIAGNOSES_ICD_chap_freq['HADM_ID']=df.keys()
    # DIAGNOSES_ICD_chap_freq.head()

    # %% [markdown]
    # ### Conclusion : we will focus on ICD chapters.
    # %% [markdown]
    # ## Build the final X and y
    # %% [markdown]
    # Merge of the most important notes to make a single text by hospitalisation.
    # 
    # In the first place we will focus on discharge summaries.

    # %%
    selected_doc=['Discharge summary']
    df=NOTEEVENTS[NOTEEVENTS['CATEGORY'].isin(selected_doc)].groupby('HADM_ID')['TEXT'].apply(lambda x: "{%s}" % ', '.join(x))
    df2=pd.DataFrame(df)
    df2.index.names = ['HADM_ID_INDEX']
    df2['HADM_ID']=df.keys()


    # %%
    DIAGNOSES_ICD_chap_freq.index.names = ['HADM_ID_INDEX']
    DIAGNOSES_ICD_chap_freq.head()

    # %% [markdown]
    # Create one dataframe for selected diagnoses with merge with texts on HADM_ID, and the other one for selected chapters, and concatenate them to have the final dataframe that will be use for prediction

    # %%
    NOTE_DIAGNOSES=pd.merge(df2,DIAGNOSES_ICD_chap_freq[['HADM_ID','ICD9_CHAP']],on='HADM_ID')
    # NOTE_DIAGNOSES=pd.merge(df2,DIAGNOSES_ICD_chap_freq[['HADM_ID','ICD9_CHAP']], left_index=True, right_on='HADM_ID')
    # new = pd.join(df2)

    # %%
    from sklearn import model_selection
    import os
    NOTE_DIAGNOSES.rename(columns={"ICD9_CHAP": "TARGET"}, inplace=True)
    # if not os.path.exists(output_folder):
        # os.mkdir(output_folder)
    if train_split:
        train, test = model_selection.train_test_split(NOTE_DIAGNOSES[['TEXT','TARGET', 'HADM_ID']],test_size=0.2, random_state=123)
        print('Size of train: '+ str(train.shape[0])+' \nSize of test: '+str(test.shape[0]) )
        train.to_csv(output_folder + 'train.csv',index=False)
        test.to_csv(output_folder + 'test.csv',index=False)
        if return_df:
            return train, test
    else:
        out_df = NOTE_DIAGNOSES[['TEXT', 'TARGET', 'HADM_ID']]

        out_df.to_csv(output_folder + output_name,index=False)
        if return_df:
            return out_df