[ae9c43]: prescreen/evaluation/biology_val.py

"""
fetches biology for validation cohort
"""
from datetime import timedelta
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

from clintk.utils import Unfolder


def fetch(url, header_path, df_ids):
    """Fetch raw biology results for each patient in df_ids.

    Parameters
    ----------
    url : str
        URL template of the biology result server; must contain three
        '{}' placeholders (NIP, start date, end date).
    header_path : str
        Path to a csv file whose header gives the result column names.
    df_ids : pd.DataFrame
        Target patients; columns should be ['nip', 'DATE SIGN_OK'].

    Returns
    -------
    pd.DataFrame
        Raw results with columns ['nip', 'Analyse', 'Resultat',
        'Date prelvt'].
    """
    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns
    cols = ['nip', 'Analyse', 'Resultat', 'Date prelvt']
    df_res = pd.DataFrame(data=None, columns=cols)

    for index, row in df_ids.iterrows():
        nip = row['nip']
        start_date = row['DATE SIGN_OK']
        # keep four weeks of biology after the inclusion date
        end_date = start_date + timedelta(weeks=4)

        # the query string expects dates formatted as yyyymmdd
        start = str(start_date).replace('-', '')
        stop = str(end_date).replace('-', '')

        req = requests.get(url.format(nip.replace(' ', ''), start, stop))
        # the server wraps the csv payload in an html page
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
                             index_col=False, names=header)
        new_df = new_df.loc[:, cols]
        new_df['nip'] = nip

        df_res = pd.concat([df_res, new_df], axis=0,
                           sort=False, ignore_index=True)

    return df_res
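

# A minimal usage sketch (the NIP, date, URL template and header file below
# are hypothetical placeholders, not real values from this project):
#
#     ids = pd.DataFrame({'nip': ['0123 456'],
#                         'DATE SIGN_OK': [pd.Timestamp('2018-01-15').date()]})
#     bio = fetch('http://server/reslabo?NIP={}&STARTDATE={}&ENDDATE={}',
#                 'header.csv', ids)
#     # bio has columns ['nip', 'Analyse', 'Resultat', 'Date prelvt']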


def fetch_and_fold(url, header_path, targets):
    """Fetch biology results and fold them into the long format
    ['nip', 'date', 'feature', 'value'] expected by the unfolder."""
    df_bio = fetch(url, header_path, targets)

    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           errors='coerce',
                                           format='%Y%m%d').dt.date
    df_bio.dropna(inplace=True)

    df_bio.rename({'Date prelvt': 'date', 'Analyse': 'feature',
                   'Resultat': 'value'}, inplace=True, axis=1)
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    return df_bio
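

# For reference, fetch_and_fold() returns one row per measurement, e.g.
# (illustrative values only):
#
#     nip        date        feature      value
#     0123 456   2018-01-16  Hemoglobine   13.2
#     0123 456   2018-01-16  Leucocytes     4.5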


def main_fetch():
    base_path = 'data/cohorte_validation'

    # ditep cohort: concatenate the inclusion and sf files
    path = base_path + '/ditep_inclus.csv'
    ditep_ok = pd.read_csv(path, sep=';',
                           parse_dates=[-2]).loc[:, ['nip', 'DATE SIGN_OK']]
    path = base_path + '/ditep_sf.csv'
    ditep_sf = pd.read_csv(path, sep=';',
                           parse_dates=[-2]).loc[:, ['nip', 'DATE SIGN_OK']]

    ditep = pd.concat([ditep_ok, ditep_sf], ignore_index=True)
    ditep['DATE SIGN_OK'] = ditep['DATE SIGN_OK'].dt.date

    url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
          '&STARTDATE={}&ENDDATE={}'
    header_path = '/home/v_charvet/workspace/data/biology/header.csv'

    bio_ditep = fetch_and_fold(url, header_path, ditep)

    # poumon (lung) cohort: same files, but the date column is named
    # 'DATE_SIGN_OK' instead of 'DATE SIGN_OK'
    path = base_path + '/poumons_inclusion.csv'
    poumon_ok = pd.read_csv(path, sep=';',
                            parse_dates=[-2]).loc[:, ['nip', 'DATE_SIGN_OK']]
    path = base_path + '/poumons_sf.csv'
    poumon_sf = pd.read_csv(path, sep=';',
                            parse_dates=[-2]).loc[:, ['nip', 'DATE_SIGN_OK']]

    poumon = pd.concat([poumon_ok, poumon_sf], ignore_index=True)
    poumon.rename({'DATE_SIGN_OK': 'DATE SIGN_OK'}, axis=1, inplace=True)
    poumon['DATE SIGN_OK'] = poumon['DATE SIGN_OK'].dt.date

    bio_poumon = fetch_and_fold(url, header_path, poumon)

    # unfold features; 'null_id' is a constant dummy key passed to Unfolder
    bio_ditep['null_id'] = [1] * bio_ditep.shape[0]
    bio_poumon['null_id'] = [1] * bio_poumon.shape[0]
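
    # Assumption about clintk.utils.Unfolder (not documented here): the
    # positional arguments are the patient key, a secondary key, then the
    # feature/value/date column names, followed by two options; it pivots
    # the long table into one column per biology feature.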
    unfolder = Unfolder('nip', 'null_id', 'feature', 'value', 'date',
                        False, -1)

    ditep_unfold = unfolder.fit(bio_ditep).unfold()
    poumon_unfold = unfolder.fit(bio_poumon).unfold()

    ditep_unfold.to_csv('data/ditep_bio_unfold.csv', sep=';')
    poumon_unfold.to_csv('data/poumons_bio_unfold.csv', sep=';')

    return bio_ditep, bio_poumon


if __name__ == "__main__":
    main_fetch()