"""Load and tidy the predictor/outcome sheets of a study workbook.

Originally added as ``ispy1/clean_data.py``.
"""
import pandas as pd

# Ignore chained-assignment (copy) warnings in pandas.  Kept so that importing
# this module has the same global effect it always had; the code below no
# longer relies on chained assignment itself.
pd.options.mode.chained_assignment = None


def _organize_data(dataframe):
    """Return a cleaned, relabelled copy of the merged data.

    The input frame is not modified.  The steps are:

    1. Drop every row that contains a missing value.
    2. Rename the raw columns to shorter display names.
    3. Replace numeric codes with 'Yes' / 'No' (and 'Lost') labels.
    4. Remove rows whose ``Alive`` value is 'Lost'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Merged predictors and outcomes.  It must contain the raw columns
        ``ERpos``, ``PgRpos``, ``HR Pos``, ``BilateralCa``, ``PCR``,
        ``sstat``, ``Laterality`` and ``race_id``; the other renamed columns
        are optional.

    Returns
    -------
    pandas.DataFrame
        A new frame with renamed columns and string labels.
    """
    # dropna() returns a new frame, so the caller's data is left untouched.
    df = dataframe.dropna(axis=0)

    df = df.rename(columns={
        'ERpos': 'ER+',
        'PgRpos': 'PR+',
        'HR Pos': 'HR+',
        'BilateralCa': 'Bilateral',
        'sstat': 'Alive',
        'MRI LD Baseline': 'MRI_LD_Baseline',
        'MRI LD 1-3dAC': 'MRI_LD_1_3dAC',
        'MRI LD InterReg': 'MRI_LD_Int_Reg',
        'MRI LD PreSurg': 'MRI_LD_PreSurg',
        'survDtD2 (tx)': 'Survival_length',
        'rfs_ind': 'RFS_code',
        'RCBClass': 'RCB',
        'Laterality': 'Right_Breast',
        'race_id': 'White',
    })

    # Relabel the 0/1 clinical outcomes and predictors to make them easier
    # to display.
    yes_no = {1: 'Yes', 0: 'No'}
    for column in ('ER+', 'PR+', 'HR+', 'Bilateral', 'PCR'):
        df[column] = df[column].replace(yes_no)

    # Relabel the remaining coded predictors and outcomes.
    df['Alive'] = df['Alive'].replace({7: 'Yes', 8: 'No', 9: 'Lost'})
    df['Right_Breast'] = df['Right_Breast'].replace({1: 'No', 2: 'Yes'})

    # Code 1 becomes 'Yes'; every other code is collapsed to 'No'.  This is
    # done as one column assignment instead of chained indexing, which does
    # not write through to the frame under pandas copy-on-write.
    df['White'] = df['White'].eq(1).map({True: 'Yes', False: 'No'})

    # Remove patients lost to follow up.
    df = df.loc[df['Alive'] != 'Lost', :]

    return df


def clean_my_data(file):
    """Read the 'predictors' and 'outcomes' sheets and return cleaned data.

    Parameters
    ----------
    file : str, path object or file-like object
        Excel workbook, passed straight to ``pandas.read_excel``.  It must
        contain sheets named 'predictors' and 'outcomes', each with a
        ``SUBJECTID`` column.

    Returns
    -------
    pandas.DataFrame
        Predictors joined to outcomes on ``SUBJECTID`` and then cleaned by
        ``_organize_data``.
    """
    # Load the predictors, index them by subject, and drop unused columns.
    # ``sheet_name`` replaces the ``sheetname`` keyword removed in pandas 1.0.
    predictors = pd.read_excel(file, sheet_name='predictors')
    predictors = predictors.set_index('SUBJECTID')
    predictors = predictors.drop(
        columns=['DataExtractDt', 'Her2MostPos',
                 'HR_HER2_CATEGORY', 'HR_HER2_STATUS'],
    )

    # Load the outcomes, drop unused columns, and index them by subject.
    outcomes_df = pd.read_excel(file, sheet_name='outcomes')
    outcomes_df = outcomes_df.drop(columns=['DataExtractDt'])
    outcomes_df = outcomes_df.set_index('SUBJECTID')

    # Merge outcomes onto predictors using the subject ID index.
    ISPY = predictors.join(outcomes_df)

    return _organize_data(ISPY)