a b/prescreen/simbad/biology_2.py
"""
Fetches biology results for the SIMBAD study and folds them into a long
(one measurement per row) format.
"""
import argparse

import pandas as pd
import requests

from bs4 import BeautifulSoup
from clintk.utils.connection import get_engine, sql2df
from datetime import timedelta
from io import StringIO

def fetch(url, header_path, id, ip, dbase, targets_table):
    """
    Fetch the biology results for each target patient and concatenate all
    the extracted tables so they can be folded afterwards.

    Parameters
    ----------
    url : str
        url to the location of the biology files

    header_path : str
        path to the csv file containing the header

    id : str
        login to the sql database

    ip : str
        ip address of the sql server

    dbase : str
        name of the database on the given server

    targets_table : str
        name of the table containing targets information

    @TODO no need to fetch targets_table from sql since it is already loaded
    @TODO by the main function

    Returns
    -------
    pandas.DataFrame
        one row per fetched measurement, holding the patient NIP, the
        analysis name, the result and the sampling date
    """
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'

    # header_path = '~/workspace/data/biology/header.csv'

    # constant names specific to our database
    KEY1 = 'id'
    KEY2 = 'NIP'

    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns

    engine = get_engine(id, ip, dbase)

    df_ids = sql2df(engine, targets_table)
    df_ids.rename({'nip': KEY2}, axis=1, inplace=True)
    df_ids['patient_id'] = df_ids[KEY1]

    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
    df_res = pd.DataFrame(data=None, columns=cols)

    for index, row in df_ids.iterrows():
        # strip the spaces from the NIP to build the query
        nip = row[KEY2].replace(' ', '')
        # fetch the results between the prescreen date and 4 weeks later
        start_date = row['prescreen']
        end_date = start_date + timedelta(weeks=4)

        start = str(start_date).replace('-', '')
        stop = str(end_date).replace('-', '')

        req = requests.get(url.format(nip, start, stop))
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
                             index_col=False, names=header)
        new_df = new_df.loc[:, cols + ['LC']]

        # restore the original NIP so it matches the targets table
        new_df[KEY2] = row[KEY2]

        new_df.drop('LC', axis=1, inplace=True)

        df_res = pd.concat([df_res, new_df], axis=0,
                           sort=False, ignore_index=True)

    return df_res
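
# Example call (connection details below are illustrative placeholders):
# df = fetch('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}'
#            '&STARTDATE={}&ENDDATE={}',
#            '~/workspace/data/biology/header.csv',
#            'login', '127.0.0.1', 'simbad_db', 'patient_target_simbad')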


def fetch_and_fold(url, header, id, ip, db, targets):
    """ Fetches the biology results and returns them in folded format """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # engine for the sql connection
    engine = get_engine(id, ip, db)

    # fetching the targets table
    df_targets = sql2df(engine, 'patient_target_simbad')
    df_targets['prescreen'] = df_targets.loc[:, 'prescreen'].dt.date

    # fetching the features
    df_bio = fetch(url, header, id, ip, db, targets)

    # parse the sampling dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           errors='coerce',
                                           format='%Y%m%d').dt.date
    df_bio.dropna(inplace=True)

    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
                   'Resultat': 'value'}, axis=1, inplace=True)

    # joining with the targets
    df_bio = df_bio.merge(df_targets, left_on='NIP',
                          right_on='nip').drop('NIP', axis=1)

    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    # df_bio is already folded: one (feature, value) pair per row
    df_bio = df_bio.loc[:, [key1, key2, 'feature', 'value', date]]

    print('done')

    return df_bio
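
# The folded frame holds one measurement per row, e.g. (values illustrative):
#     patient_id   nip         feature       value   date
#     1            '8012 345'  'Hemoglobine' 13.2    2017-01-05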


def main_fetch_and_fold():
    description = 'Folding biology measures from SIMBAD'
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('--url', '-u',
                        help='url where the measures are stored')
    parser.add_argument('--header', '-H',
                        help='path to the header file used to read the csv')
    parser.add_argument('--id', '-I',
                        help='login to connect to the sql server')
    parser.add_argument('--ip', '-a',
                        help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets', '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output', '-o',
                        help='output path to write the folded result')

    args = parser.parse_args()

    df_bio = fetch_and_fold(args.url, args.header, args.id, args.ip,
                            args.db, args.targets)

    # df_bio is already folded, write it to disk
    output = args.output
    df_bio.to_csv(output, encoding='utf-8', sep=';')

    print('done')

    return df_bio


if __name__ == "__main__":
    main_fetch_and_fold()
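
# Example invocation (all values are illustrative placeholders):
# python biology_2.py \
#     -u 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}&STARTDATE={}&ENDDATE={}' \
#     -H ~/workspace/data/biology/header.csv \
#     -I login -a 127.0.0.1 -d simbad_db -t patient_target_simbad \
#     -o biology_folded.csv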