--- a +++ b/prescreen/simbad/parse_radio.py @@ -0,0 +1,118 @@ +""" +script to parse radiology reports from cr_sfditep-2012.xslsx + +They are denoted as CR + + +""" +import pandas as pd + +from clintk.text_parser.parser import ReportsParser +from clintk.utils.connection import get_engine, sql2df + +import argparse + + +def parse_cr(path, engine, targets, n_reports): + PATH = path + + # fetching targets + df_targets = sql2df(engine, targets) + + # fetching reports + # PATH = 'data/cr_sfditep_2012.xlsx' + df = pd.read_excel(PATH) + + df = df[df['CR NAT'] == 'CR'] + df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True) + + # keep only date in 'date columns' + df['date'] = df.loc[:, 'date'].dt.date + df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date + + # remove useless reports + df = df[~(df['value'].str.match('Examen du', na=False))] + + # filter by date + # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))] + + # normalize nip + df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC'] + df['nip'] = df.loc[:, 'nip'] \ + .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:]) + + df.drop(['N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE', + 'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'], axis=1, inplace=True) + + df.dropna(inplace=True) + df.drop_duplicates(subset=['value'], inplace=True) + + # taking only the first reports + group_dict = {'date': 'first', 'DATE_SIGN_OK': 'last', + 'value': lambda g: ' '.join(g[:n_reports])} + df = df.groupby('nip', as_index=False).agg(group_dict) + + df = df.merge(df_targets, on='nip') + + # removing useless tags (blocks parsing) + df['value'] = df.loc[:, 'value'].apply(lambda s: \ + str(s).replace('<u>', '').replace('</u>', '')) + + sections = ('resultats', 'resultat', 'evaluation des cibles', 'conclusion', + 'lesions non cibles', 'nouvelles lesions', + 'lesion s non cible s', 'nouvelle s lesion s', + 'resultats a l etage thoracique', 'en fenetre osseuse', + 'a l etage abdomino plevien', 'conclusion :', '') + parser = ReportsParser(headers='b', is_html=False, col_name='value', + sections=sections, + n_jobs=-1) + + df['value'] = parser.transform(df) + + # dropping empty rows + df = df[df['value'] != ''] + + df['feature'] = ['rad'] * df.shape[0] + + df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']] + + return df + + +def main_parse_cr(): + description = 'Folding radiology reports from Simbad' + parser = argparse.ArgumentParser(description=description) + + parser.add_argument('-p', '--path', + help='path to file that contains the reports') + parser.add_argument('--id', '-I', + help='id to connect to sql server') + parser.add_argument('--ip', '-a', + help='ip adress of the sql server') + parser.add_argument('--db', '-d', + help='name of the database on the sql server') + parser.add_argument('--targets', '-t', + help='name of the table containing targets on the db') + parser.add_argument('--output', '-o', + help='output path to write the folded result') + parser.add_argument('-n', '--nb', + help='number of reports to fetch', type=int) + + args = parser.parse_args() + + engine = get_engine(args.id, args.ip, args.db) + + # getting variables from args + df = parse_cr(args.path,engine, args.targets, args.nb) + + # output = '/home/v_charvet/workspace/data/features/simbad/radiology_v{}.csv' + output = args.output + df.to_csv(output, sep=';', encoding='utf-8') + + print('done') + + return 0 + + +if __name__ == "__main__": + main_parse_cr()