|
a |
|
b/prescreen/vcare/reports.py |
|
|
1 |
"""" |
|
|
2 |
fetching and processing electronic medical reports |
|
|
3 |
""" |
|
|
4 |
import pandas as pd |
|
|
5 |
|
|
|
6 |
from clintk.utils.connection import get_engine, sql2df |
|
|
7 |
from clintk.utils.fold import Folder |
|
|
8 |
from clintk.text_parser.parser import ReportsParser |
|
|
9 |
|
|
|
10 |
import argparse |
|
|
11 |
|
|
|
12 |
|
|
|
13 |
def fetch_and_fold(table, engine, targets, n_reports):
    """Fetch text reports from the vcare database and fold them.

    Parameters
    ----------
    table : str
        Name of the SQL table containing the text reports.
    engine : sqlalchemy engine
        Connection to the SQL server (as returned by `get_engine`).
    targets : str
        Name of the SQL table containing the target patients.
    n_reports : int
        Maximum number of reports kept (and concatenated) per patient.

    Returns
    -------
    pandas.DataFrame
        One row per patient, with the first `n_reports` parsed reports
        concatenated into the 'value' column.
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # data used to train the model
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    # NOTE: the previous call also passed unit='D'; `unit` only applies to
    # numeric input and conflicts with `format` in modern pandas, so it is
    # dropped here.
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d')

    df_reports = sql2df(engine, table)\
        .loc[:, ['original_date', 'patient_id', 'report']]

    df_reports.rename(columns={'original_date': 'date'}, inplace=True)
    # drop rows with no report text; notna() also filters NaN cells, which
    # the previous `is not None` list comprehension let through and which
    # would break the ' '.join aggregation below
    df_reports = df_reports.loc[df_reports['report'].notna()]

    # joining features df with complete patient informations
    df_reports = df_reports.merge(df_targets, on=None, left_on='patient_id',
                                  right_on='id').drop('id', axis=1)
    # df_reports = df_reports[df_reports[date] <= df_reports['C1J1']]

    # folding frames so that they have the same columns
    folder = Folder(key1, key2, ['report'], date, n_jobs=-1)
    reports_folded = folder.fold(df_reports)

    reports_folded.dropna(inplace=True)
    reports_folded.drop_duplicates(subset=['value'], inplace=True)

    # taking only first `n_reports` reports for each patient
    group_dict = {key2: 'first', 'feature': 'first', date: 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    reports_folded = reports_folded.groupby(key1, as_index=False)\
        .agg(group_dict)

    # parsing and vectorising text reports; sections=None keeps the whole
    # report (a hard-coded section list used to be built here but was
    # never passed to the parser, so it has been removed)
    parser = ReportsParser(sections=None, n_jobs=-1, norm=False,
                           col_name='value')

    reports_folded['value'] = parser.transform(reports_folded)

    return reports_folded
|
|
64 |
|
|
|
65 |
|
|
|
66 |
def main_fetch_and_fold():
    """Command-line entry point.

    Parses the connection/output arguments, fetches and folds the vcare
    text reports, then writes them to a csv file.

    Returns
    -------
    pandas.DataFrame
        The folded reports that were written to disk.
    """
    arg_parser = argparse.ArgumentParser(
        description='Folding text reports from Ventura Care')

    # data-selection and connection options
    arg_parser.add_argument('--reports', '-r',
                            help='name of the table contains the reports')
    arg_parser.add_argument('--id', '-I',
                            help='id to connect to sql server')
    arg_parser.add_argument('--ip', '-a',
                            help='ip address of the sql server')
    arg_parser.add_argument('--db', '-d',
                            help='name of the database on the sql server')
    arg_parser.add_argument('--targets', '-t',
                            help='name of the table containing targets on the db')
    arg_parser.add_argument('--output', '-o',
                            help='output path to write the folded result')
    arg_parser.add_argument('-n', '--nb',
                            help='number of reports to fetch', type=int)
    cli = arg_parser.parse_args()

    # open the connection and run the fetch/fold pipeline
    db_engine = get_engine(cli.id, cli.ip, cli.db)
    reports_folded = fetch_and_fold(cli.reports, db_engine, cli.targets,
                                    cli.nb)

    # persist the folded frame
    reports_folded.to_csv(cli.output, encoding='utf-8', sep=';')
    print('done')

    return reports_folded
|
|
96 |
|
|
|
97 |
|
|
|
98 |
|
|
|
99 |
if __name__ == "__main__": |
|
|
100 |
main_fetch_and_fold() |