# prescreen/evaluation/biology_val.py
"""
Fetches biology results for the validation cohort.
"""
import pandas as pd
5
6
from clintk.utils import Unfolder
7
from datetime import timedelta
8
from bs4 import BeautifulSoup
9
from io import StringIO
10
11
import requests
12
13
14
15
def fetch(url, header_path, df_ids, timeout=None):
    """Download the biology results of every patient listed in ``df_ids``.

    One HTTP GET is issued per row of ``df_ids``. The queried window starts
    at the row's ``'DATE SIGN_OK'`` value and ends four weeks later.

    Parameters
    ----------
    url : str
        url template to the location of biology files, with three ``{}``
        placeholders filled, in order, with the patient id (spaces removed),
        the start date and the end date (dashes removed from their string
        form)

    header_path : str
        path to a ';'-separated, latin1-encoded csv file whose header row
        gives the column names of the downloaded tables

    df_ids : pd.DataFrame
        df containing target info
        columns should include ['nip', 'DATE SIGN_OK']; 'nip' values must be
        strings and 'DATE SIGN_OK' values must support adding a ``timedelta``

    timeout : float or tuple, optional
        forwarded to ``requests.get``; the default ``None`` keeps the
        previous behaviour of waiting without limit

    Returns
    -------
    pd.DataFrame
        all downloaded tables stacked together, restricted to the columns
        ['nip', 'Analyse', 'Resultat', 'Date prelvt'] and re-indexed from 0;
        an empty frame with those columns when ``df_ids`` has no rows
    """
    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns

    cols = ['nip', 'Analyse', 'Resultat', 'Date prelvt']
    window = timedelta(weeks=4)

    # Collect the per-patient tables and concatenate once at the end:
    # concatenating inside the loop copies the accumulated rows every time.
    frames = []

    for _, row in df_ids.iterrows():
        nip = row['nip']

        start_date = row['DATE SIGN_OK']
        end_date = start_date + window

        # dashes are stripped, so that e.g. '2019-01-31' is sent as '20190131'
        start = str(start_date).replace('-', '')
        stop = str(end_date).replace('-', '')

        req = requests.get(url.format(nip.replace(' ', ''), start, stop),
                           timeout=timeout)
        # only the text of the <body> element is parsed as csv
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
                             index_col=False, names=header)
        # explicit copy: the assignment below must not write into a view
        new_df = new_df.loc[:, cols].copy()

        # overwrite the downloaded id with the one used in df_ids
        new_df['nip'] = nip

        frames.append(new_df)

    if not frames:
        return pd.DataFrame(data=None, columns=cols)

    return pd.concat(frames, axis=0, sort=False, ignore_index=True)
62
63
64
def fetch_and_fold(url, header_path, targets):
    """Fetch biology results and return them as a cleaned long table.

    Parameters
    ----------
    url : str
        forwarded unchanged to ``fetch``

    header_path : str
        forwarded unchanged to ``fetch``

    targets : pd.DataFrame
        forwarded unchanged to ``fetch`` as its ``df_ids`` argument

    Returns
    -------
    pd.DataFrame
        the table returned by ``fetch`` with the columns 'Date prelvt',
        'Analyse' and 'Resultat' renamed to 'date', 'feature' and 'value'.
        'date' holds ``datetime.date`` objects and 'value' is numeric.
    """
    raw = fetch(url, header_path, targets)

    # dates are parsed as YYYYMMDD; anything else becomes missing
    sampled_on = pd.to_datetime(raw['Date prelvt'], errors='coerce',
                                format='%Y%m%d')
    raw['Date prelvt'] = sampled_on.dt.date

    # rows with any missing cell (unparseable dates included) are discarded
    new_names = {'Date prelvt': 'date', 'Analyse': 'feature',
                 'Resultat': 'value'}
    folded = raw.dropna().rename(columns=new_names)

    # conversion happens after the dropna above, so results that are not
    # numeric stay in the table as NaN
    folded['value'] = pd.to_numeric(folded['value'], errors='coerce',
                                    downcast='float')

    return folded
80
81
82
_DEFAULT_BASE_PATH = 'data/cohorte_validation'
_DEFAULT_HEADER_PATH = '/home/v_charvet/workspace/data/biology/header.csv'
_DEFAULT_URL = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
               '&STARTDATE={}&ENDDATE={}'
_DEFAULT_OUTPUT_DIR = 'data'


def _load_targets(base_path, file_names, date_col):
    """Read the given cohort files and stack their id and date columns.

    The date column is returned under the name 'DATE SIGN_OK' and holds
    ``datetime.date`` objects.
    """
    frames = []
    for name in file_names:
        # NOTE(review): parse_dates=[-2] presumably targets the
        # second-to-last column of the file - confirm with the input files
        table = pd.read_csv(base_path + '/' + name, sep=';',
                            parse_dates=[-2])
        frames.append(table.loc[:, ['nip', date_col]])

    targets = pd.concat(frames, ignore_index=True)
    targets.rename({date_col: 'DATE SIGN_OK'}, axis=1, inplace=True)
    targets['DATE SIGN_OK'] = targets['DATE SIGN_OK'].dt.date

    return targets


def main_fetch(base_path=_DEFAULT_BASE_PATH,
               header_path=_DEFAULT_HEADER_PATH,
               url=_DEFAULT_URL,
               output_dir=_DEFAULT_OUTPUT_DIR):
    """Fetch, unfold and save the biology of both validation cohorts.

    Parameters
    ----------
    base_path : str
        directory containing 'ditep_inclus.csv', 'ditep_sf.csv',
        'poumons_inclusion.csv' and 'poumons_sf.csv'

    header_path : str
        path to the csv file containing the header of the biology tables

    url : str
        url template passed to ``fetch_and_fold``

    output_dir : str
        directory receiving 'ditep_bio_unfold.csv' and
        'poumons_bio_unfold.csv'

    Returns
    -------
    tuple of pd.DataFrame
        (bio_ditep, bio_poumon): the folded biology tables of the two
        cohorts, each with the constant column 'null_id' added
    """
    # ditep cohort
    ditep = _load_targets(base_path, ['ditep_inclus.csv', 'ditep_sf.csv'],
                          'DATE SIGN_OK')
    bio_ditep = fetch_and_fold(url, header_path, ditep)

    # lung cohort: its date column is spelled with an underscore
    poumon = _load_targets(base_path,
                           ['poumons_inclusion.csv', 'poumons_sf.csv'],
                           'DATE_SIGN_OK')
    bio_poumon = fetch_and_fold(url, header_path, poumon)

    # unfolding features
    # 'null_id' is a constant column; presumably a placeholder for a key
    # that Unfolder requires - verify against clintk
    bio_ditep['null_id'] = [1] * bio_ditep.shape[0]
    bio_poumon['null_id'] = [1] * bio_poumon.shape[0]

    unfolder = Unfolder('nip', 'null_id', 'feature', 'value', 'date', False, -1)

    ditep_unfold = unfolder.fit(bio_ditep).unfold()

    poumon_unfold = unfolder.fit(bio_poumon).unfold()

    ditep_unfold.to_csv(output_dir + '/ditep_bio_unfold.csv', sep=';')
    poumon_unfold.to_csv(output_dir + '/poumons_bio_unfold.csv', sep=';')

    return bio_ditep, bio_poumon
138
139
# Script entry point: run the whole fetch / unfold pipeline.
if __name__ == '__main__':
    main_fetch()
141
142
143
144
145
146
147
148
149
150
151
152