a b/prescreen/vcare/reports.py
1
""""
2
fetching and processing electronic medical reports
3
"""
4
import pandas as pd
5
6
from clintk.utils.connection import get_engine, sql2df
7
from clintk.utils.fold import Folder
8
from clintk.text_parser.parser import ReportsParser
9
10
import argparse
11
12
13
def fetch_and_fold(table, engine, targets, n_reports):
14
    """ function to fetch reports from vcare database
15
16
    Parameters
17
    ----------
18
    For definition of parameters, see arguments in `main_fetch_and_fold`
19
    """
20
    key1, key2, date = 'patient_id', 'nip', 'date'
21
22
    # data used to train the model
23
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
24
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
25
                                               format='%Y-%m-%d',
26
                                               unit='D')
27
28
    df_reports = sql2df(engine, table)\
29
        .loc[:, ['original_date', 'patient_id', 'report']]
30
31
    mask = [report is not None for report in df_reports['report']]
32
33
    df_reports.rename(columns={'original_date': 'date'}, inplace=True)
34
    df_reports = df_reports.loc[mask]
35
36
    # joining features df with complete patient informations
37
    df_reports = df_reports.merge(df_targets, on=None, left_on='patient_id',
38
                                  right_on='id').drop('id', axis=1)
39
    # df_reports = df_reports[df_reports[date] <= df_reports['C1J1']]
40
41
    # folding frames so that they have the same columns
42
    folder = Folder(key1, key2, ['report'], date, n_jobs=-1)
43
    reports_folded = folder.fold(df_reports)
44
45
    reports_folded.dropna(inplace=True)
46
    reports_folded.drop_duplicates(subset=['value'], inplace=True)
47
48
    # taking only first `n_reports` reports
49
    group_dict = {key2: 'first', 'feature': 'first', date: 'last',
50
                  'value': lambda g: ' '.join(g[:n_reports])}
51
    reports_folded = reports_folded.groupby(key1, as_index=False)\
52
        .agg(group_dict)
53
54
    # parsing and vectorising text reports
55
    sections = ['examens complementaire', 'hopital de jour',
56
                'examen du patient']
57
58
    parser = ReportsParser(sections=None, n_jobs=-1, norm=False,
59
                           col_name='value')
60
61
    reports_folded['value'] = parser.transform(reports_folded)
62
63
    return reports_folded
64
65
66
def main_fetch_and_fold():
67
    description = 'Folding text reports from Ventura Care'
68
    parser = argparse.ArgumentParser(description=description)
69
70
    parser.add_argument('--reports', '-r',
71
                        help='name of the table contains the reports')
72
    parser.add_argument('--id', '-I',
73
                        help='id to connect to sql server')
74
    parser.add_argument('--ip', '-a',
75
                        help='ip address of the sql server')
76
    parser.add_argument('--db', '-d',
77
                        help='name of the database on the sql server')
78
    parser.add_argument('--targets', '-t',
79
                        help='name of the table containing targets on the db')
80
    parser.add_argument('--output', '-o',
81
                        help='output path to write the folded result')
82
    parser.add_argument('-n', '--nb',
83
                        help='number of reports to fetch', type=int)
84
    args = parser.parse_args()
85
86
    # getting variables from args
87
    engine = get_engine(args.id, args.ip, args.db)
88
89
    reports_folded = fetch_and_fold(args.reports, engine, args.targets, args.nb)
90
91
    output = args.output
92
    reports_folded.to_csv(output, encoding='utf-8', sep=';')
93
    print('done')
94
95
    return reports_folded
96
97
98
99
if __name__ == "__main__":
100
    main_fetch_and_fold()