[ae9c43]: / prescreen / vcare / biology.py


"""
fetching biology measurements
"""
import pandas as pd
from clintk.utils.connection import get_engine, sql2df
from datetime import timedelta
from bs4 import BeautifulSoup
from io import StringIO
import requests
import argparse
def fetch(url, header_path, id, ip, dbase, targets_table):
"""
il suffit de concatener toutes les tables extraites pour ensuite les fold
url : str
url to the location of biology files
header_path : str
path to csv file containing header
id : str
login to the sql database
ip : str
ip adress to the sql server
dbase : str
name of the database on the given server
targets_table : str @TODO remplacer par df_targets
name of the table containing targets information
Returns
-------
"""
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    # header_path = '~/workspace/data/biology/header.csv'

    # constant names specific to our database
    KEY1 = 'id'
    KEY2 = 'NIP'
    C1J1 = 'C1J1'

    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns

    engine = get_engine(id, ip, dbase)
    df_ids = sql2df(engine, targets_table)[[KEY1, 'nip', C1J1]]
    df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
    df_ids['patient_id'] = df_ids[KEY1]

    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
    df_res = pd.DataFrame(data=None, columns=cols)

    for index, row in df_ids.iterrows():
        # query window: from 8 weeks before C1J1 up to C1J1
        nip = row[KEY2].replace(' ', '')
        patient_id = row['patient_id']
        c1j1_date = row[C1J1].date()
        start_date = c1j1_date - timedelta(weeks=8)

        c1j1 = str(c1j1_date).replace('-', '')
        start = str(start_date).replace('-', '')

        req = requests.get(url.format(nip, start, c1j1))
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
                             index_col=False, names=header)
        new_df = new_df.loc[:, cols + ['LC']]  # LC is dropped below

        # normalize nip
        new_df[KEY2] = row[KEY2]
        # new_df[KEY2] = new_df[KEY2].map(str)
        # new_df[KEY2] = [nip[:4] + '-' + nip[4:] for nip in new_df[KEY2]]

        new_df.drop('LC', axis=1, inplace=True)

        df_res = pd.concat([df_res, new_df], axis=0,
                           sort=False, ignore_index=True)

    return df_res

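
# A minimal usage sketch for fetch(); the URL template and header path come
# from the comments above, while the login, ip, database and table names are
# hypothetical placeholders:
#
# df_raw = fetch(
#     'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}'
#     '&STARTDATE={}&ENDDATE={}',
#     '~/workspace/data/biology/header.csv',
#     'my_login', '10.0.0.1', 'my_database', 'my_targets_table')
# # df_raw then holds one row per measure, with columns
# # ['NIP', 'Analyse', 'Resultat', 'Date prelvt']
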
def fetch_and_fold(url, header, id, ip, db, targets):
    """ Fetches biology measures and folds them into a long-format dataframe
    with columns ['patient_id', 'nip', 'feature', 'value', 'date'].
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d')

    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    df_bio = fetch(url, header, id, ip, db, targets)

    # parse dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           format='%Y%m%d').dt.date
    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
                   'Resultat': 'value'}, inplace=True, axis=1)

    # joining with targets
    # df_bio = df_bio.merge(df_targets, on=None, left_on='NIP',
    #                       right_on='nip').drop('NIP', axis=1)
    df_bio = df_targets.merge(df_bio, left_on='nip', right_on='NIP')
    df_bio.drop('NIP', axis=1, inplace=True)

    # keep only measures taken before C1J1
    df_bio['C1J1'] = pd.to_datetime(df_bio['C1J1'],
                                    format='%Y%m%d').dt.date
    df_bio = df_bio[df_bio[date] <= df_bio['C1J1']]

    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
    df_bio.drop('C1J1', axis=1, inplace=True)

    # removing non-float values
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    # df_bio is already folded
    print('done')

    return df_bio

def main_fetch_and_fold():
    description = 'Folding biology measures from Ventura Care'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--url', '-u',
                        help='url to where measures are stored')
    parser.add_argument('--header', '-H',
                        help='path to the header file used to read the csv')
    parser.add_argument('--id', '-I',
                        help='login to connect to the sql server')
    parser.add_argument('--ip', '-a',
                        help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets', '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output', '-o',
                        help='output path to write the folded result')
    args = parser.parse_args()

    df_bio = fetch_and_fold(args.url, args.header, args.id, args.ip,
                            args.db, args.targets)

    # df_bio is already folded, write it to disk
    df_bio.to_csv(args.output, encoding='utf-8', sep=';')
    print('done')

    return df_bio


if __name__ == "__main__":
    main_fetch_and_fold()

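
# Example invocation (a sketch: the URL template and header path are taken
# from the comments above, the connection parameters and output path are
# hypothetical):
#
#   python biology.py \
#       --url 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}&STARTDATE={}&ENDDATE={}' \
#       --header ~/workspace/data/biology/header.csv \
#       --id my_login --ip 10.0.0.1 --db my_database \
#       --targets my_targets_table --output biology_folded.csv
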
def read_biology(url, header):
    """ Reads biology measures for one patient.

    Parameters
    ----------
    url : str
        url of the file containing the measures
    header : list
        column names

    Returns
    -------
    dict
        keys are the names of the biological parameters, values are the
        measures
    """
    req = requests.get(url)
    values = BeautifulSoup(req.content, 'html.parser').body.text

    df = pd.read_csv(StringIO(values), sep=';', header=None,
                     index_col=False, names=header)

    res_dic = {}
    for i, row in df.iterrows():
        key = row['Analyse'].replace(' ', '_')  # or lower-case 'analyse'?
        value = row['Resultat']
        res_dic[key] = value

    return res_dic
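
# A minimal usage sketch for read_biology(); the header path comes from the
# comment in fetch(), while the fully formatted result URL (NIP and dates)
# is hypothetical:
#
# header = pd.read_csv('~/workspace/data/biology/header.csv', sep=';',
#                      encoding='latin1').columns
# measures = read_biology('http://esimbad/testGSAV7/reslabo'
#                         '?FENID=resLaboPatDitep&NIP=0123456789'
#                         '&STARTDATE=20180101&ENDDATE=20180301', header)
# # measures maps each analysis name (spaces replaced by '_') to its result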