Switch to side-by-side view

--- a
+++ b/prescreen/vcare/radiology.py
@@ -0,0 +1,126 @@
+"""
+fetching and processing radiology reports
+
+Two data inputs needed: xls file that contains the contents of the reports and
+the table containing additionnal patients information from SQL server
+"""
+import pandas as pd
+
+from clintk.utils.connection import get_engine, sql2df
+from clintk.utils.fold import Folder
+from clintk.text_parser.parser import ReportsParser
+
+import argparse
+
+
+def fetch_and_fold(path, engine, targets, n_reports):
+    """ function to fetch radiology reports from vcare database
+
+       Parameters
+       ----------
+       For definition of parameters, see arguments in `main_fetch_and_fold`
+    """
+    key1, key2, date = 'patient_id', 'nip', 'date'
+
+    # fetching targets table
+    df_targets = sql2df(engine, targets).loc[:, [key2, 'id', 'C1J1']]
+    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
+                                               format='%Y-%m-%d',
+                                               unit='D')
+
+    df_rad = pd.read_excel(path, sep=';', usecols=7, parse_dates=[1, 2, 5])
+
+    # filter SESSION and keep prescreen values
+    mask = df_rad['SESSION'].isin(['SCA ', 'IRM '])
+
+    # mask_prescreen = df_rad['OBSERVATION DATE'] == 'Before Date'
+    df_rad = df_rad[mask]  # [mask_prescreen]
+
+    df_rad['CR_DATE'] = pd.to_datetime(df_rad['CR_DATE'], format='%Y%m%d')
+
+    # remove useless rows
+    df_rad = df_rad[~(df_rad['CONTENU_CR'].str.match('Examen du', na=False))]
+
+    df_rad.rename({'CR_DATE': date}, axis=1, inplace=True)
+
+    df_rad = df_rad.merge(df_targets, on=None, left_on='Nip',
+                          right_on='nip').drop('Nip', axis=1)
+
+    df_rad['patient_id'] = df_rad['id']
+    df_rad.drop('id', axis=1, inplace=True)
+
+    folder = Folder(key1, key2, ['CONTENU_CR'], date, n_jobs=-1)
+
+    rad_folded = folder.fold(df_rad)
+
+    rad_folded.dropna(inplace=True)
+    rad_folded.drop_duplicates(subset=['value'], inplace=True)
+    # concatenating n_reports
+    group_dict = {key2: 'first', 'feature': 'first', date: 'last',
+                  'value': lambda g: ' '.join(g[:n_reports])}
+
+    rad_folded = rad_folded.groupby(key1, as_index=False).agg(group_dict)
+
+    # removing useless tags (blocks parsing)
+    rad_folded['value'] = rad_folded.loc[:, 'value'].apply(lambda s: \
+        str(s).replace('<u>', '').replace('</u>', ''))
+
+    sections = ('resultats', 'resultat', 'evaluation des cibles', 'conclusion',
+                'lesion(s) non cible(s)', 'nouvelles(s) lesion(s)',
+                'resultats a l etage thoracique', 'en fenetre osseuse',
+                'a l etage abdomino plevien', 'conclusion :')
+
+    parser = ReportsParser(headers='b', is_html=False, col_name='value',
+                           sections=sections,
+                           n_jobs=-1)
+
+    rad_folded['value'] = parser.transform(rad_folded)
+
+    rad_folded = rad_folded[rad_folded['value'] != '']
+    rad_folded['feature'] = ['rad'] * rad_folded.shape[0]
+
+
+    return rad_folded
+
+
+def main_fetch_and_fold():
+    description = 'Folding radiology reports from Ventura Care'
+    parser = argparse.ArgumentParser(description=description)
+
+    parser.add_argument('-p', '--path',
+                        help='path to file that contains the reports')
+    parser.add_argument('--id', '-I',
+                        help='id to connect to sql server')
+    parser.add_argument('--ip', '-a',
+                        help='ip adress of the sql server')
+    parser.add_argument('--db', '-d',
+                        help='name of the database on the sql server')
+    parser.add_argument('--targets', '-t',
+                        help='name of the table containing targets on the db')
+    parser.add_argument('--output', '-o',
+                        help='output path to write the folded result')
+    parser.add_argument('-n', '--nb',
+                        help='number of reports to fetch', type=int)
+    # PATH = "/home/v_charvet/workspace/data/cr/cr_rad_tronc.xlsx"
+
+    args = parser.parse_args()
+
+    engine = get_engine(args.id, args.ip, args.db)
+
+    rad_folded = fetch_and_fold(args.path, engine, args.targets, args.nb)
+
+    output = args.output
+    rad_folded.to_csv(output, encoding='utf-8', sep=';')
+
+    print('done')
+
+    return rad_folded
+
+
+
+
+if __name__ == "__main__":
+    main_fetch_and_fold()
+
+
+