[ae9c43]: / prescreen / simbad / biology_2.py

"""
fetches biology for simbad
"""
import pandas as pd
from clintk.utils.connection import get_engine, sql2df
from datetime import timedelta
from bs4 import BeautifulSoup
from io import StringIO
import requests
import argparse


def fetch(url, header_path, id, ip, dbase, targets_table):
    """
    Concatenates all the extracted tables so that they can be folded afterwards.

    Parameters
    ----------
    url : str
        url to the location of the biology files
    header_path : str
        path to the csv file containing the header
    id : str
        login to the sql database
    ip : str
        ip address of the sql server
    dbase : str
        name of the database on the given server
    targets_table : str
        name of the table containing the targets information

    @TODO no need to fetch targets_table from sql since it is already loaded
    @TODO by the main function

    Returns
    -------
    pandas.DataFrame
        Concatenated biology results for every patient of targets_table.
    """
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    # header_path = '~/workspace/data/biology/header.csv'

    # constant names specific to our database
    KEY1 = 'id'
    KEY2 = 'NIP'

    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns

    engine = get_engine(id, ip, dbase)
    df_ids = sql2df(engine, targets_table)
    df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
    df_ids['patient_id'] = df_ids[KEY1]

    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
    df_res = pd.DataFrame(data=None, columns=cols)

    for index, row in df_ids.iterrows():
        nip = row[KEY2].replace(' ', '')
        # patient_id = row['patient_id']
        # c1j1_date = row[C1J1].date()
        # start_date = c1j1_date - timedelta(weeks=8)

        # results are fetched from the prescreen date up to 4 weeks later
        start_date = row['prescreen']
        end_date = start_date + timedelta(weeks=4)

        start = str(start_date).replace('-', '')
        stop = str(end_date).replace('-', '')

        req = requests.get(url.format(nip, start, stop))
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
                             index_col=False, names=header)
        new_df = new_df.loc[:, cols + ['LC']]

        # normalize nip
        new_df[KEY2] = row[KEY2]
        new_df.drop('LC', axis=1, inplace=True)

        df_res = pd.concat([df_res, new_df], axis=0,
                           sort=False, ignore_index=True)

    return df_res
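
# Note: fetch() returns one raw row per (NIP, Analyse, Resultat, Date prelvt);
# fetch_and_fold() below renames these columns to the folded schema
# (patient_id, nip, feature, value, date).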


def fetch_and_fold(url, header, id, ip, db, targets):
    """
    Fetches the biology measures and folds them into the long format
    (patient_id, nip, feature, value, date).
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # engine for the sql connection
    engine = get_engine(id, ip, db)

    # fetching the targets table
    df_targets = sql2df(engine, 'patient_target_simbad')
    df_targets['prescreen'] = df_targets.loc[:, 'prescreen'].dt.date

    # fetching the features
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    # header_path = '~/workspace/data/biology/header.csv'
    header_path = header

    df_bio = fetch(url, header_path, id, ip, db, targets)

    # parse dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           errors='coerce',
                                           format='%Y%m%d').dt.date
    df_bio.dropna(inplace=True)

    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
                   'Resultat': 'value'}, inplace=True, axis=1)

    # joining with the targets
    df_bio = df_bio.merge(df_targets, on=None, left_on='NIP',
                          right_on='nip').drop('NIP', axis=1)
    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)

    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    df_bio = df_bio.loc[:, [key1, key2, 'feature', 'value', date]]

    # df_bio is already folded
    print('done')

    return df_bio


def main_fetch_and_fold():
    description = 'Folding biology measures from Ventura Care'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--url', '-u',
                        help='url where the measures are stored')
    parser.add_argument('--header', '-H',
                        help='path to the header file used to read the csv')
    parser.add_argument('--id', '-I',
                        help='login to connect to the sql server')
    parser.add_argument('--ip', '-a',
                        help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets', '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output', '-o',
                        help='output path to write the folded result')
    args = parser.parse_args()

    df_bio = fetch_and_fold(args.url, args.header, args.id, args.ip,
                            args.db, args.targets)

    # df_bio is already folded
    output = args.output
    df_bio.to_csv(output, encoding='utf-8', sep=';')

    print('done')

    return df_bio


if __name__ == "__main__":
    main_fetch_and_fold()
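
# Example invocation (a sketch only: the login, ip, database and output path
# below are placeholders; the url and header path are the ones shown in the
# commented-out defaults above):
#
#   python biology_2.py \
#       --url 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}&STARTDATE={}&ENDDATE={}' \
#       --header ~/workspace/data/biology/header.csv \
#       --id my_login --ip 10.0.0.1 --db my_database \
#       --targets patient_target_simbad --output bio_simbad_folded.csv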