--- a
+++ b/prescreen/vcare/biology.py
@@ -0,0 +1,209 @@
+"""
+Fetching biology measurements.
+"""
+import pandas as pd
+
+from clintk.utils.connection import get_engine, sql2df
+from datetime import timedelta
+from bs4 import BeautifulSoup
+from io import StringIO
+
+import requests
+import argparse
+
+
+def fetch(url, header_path, id, ip, dbase, targets_table):
+    """ fetches the raw biology tables for every target patient
+
+    All the extracted tables simply need to be concatenated before being
+    folded.
+
+    Parameters
+    ----------
+    url : str
+        url to the location of the biology files
+    header_path : str
+        path to the csv file containing the header
+    id : str
+        login to the sql database
+    ip : str
+        ip address of the sql server
+    dbase : str
+        name of the database on the given server
+    targets_table : str  @TODO replace with df_targets
+        name of the table containing targets information
+
+    Returns
+    -------
+    pandas.DataFrame
+        concatenation of the raw biology measures for all target patients
+    """
+    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
+    #       '&STARTDATE={}&ENDDATE={}'
+    # header_path = '~/workspace/data/biology/header.csv'
+
+    # constant names specific to our database
+    KEY1 = 'id'
+    KEY2 = 'NIP'
+    C1J1 = 'C1J1'
+
+    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns
+
+    engine = get_engine(id, ip, dbase)
+
+    df_ids = sql2df(engine, targets_table)[[KEY1, 'nip', C1J1]]
+    df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
+    df_ids['patient_id'] = df_ids[KEY1]
+
+    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
+    df_res = pd.DataFrame(data=None, columns=cols)
+
+    for index, row in df_ids.iterrows():
+        nip = row[KEY2].replace(' ', '')
+        patient_id = row['patient_id']
+        c1j1_date = row[C1J1].date()
+        start_date = c1j1_date - timedelta(weeks=8)
+
+        c1j1 = str(c1j1_date).replace('-', '')
+        start = str(start_date).replace('-', '')
+
+        # one request per patient, over the 8 weeks preceding C1J1
+        req = requests.get(url.format(nip, start, c1j1))
+        values = BeautifulSoup(req.content, 'html.parser').body.text
+
+        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
+                             index_col=False, names=header)
+        # keep only the relevant columns (drops 'LC' among others)
+        new_df = new_df.loc[:, cols]
+
+        # normalize nip
+        new_df[KEY2] = row[KEY2]
+        # new_df[KEY2] = new_df[KEY2].map(str)
+        # new_df[KEY2] = [nip[:4] + '-' + nip[4:] for nip in new_df[KEY2]]
+
+        df_res = pd.concat([df_res, new_df], axis=0,
+                           sort=False, ignore_index=True)
+
+    return df_res
+
+
+def fetch_and_fold(url, header, id, ip, db, targets):
+    """ fetches the biology measures and returns them in folded format
+
+    The result is a long-format dataframe with one row per measurement and
+    columns 'patient_id', 'nip', 'date', 'feature' and 'value'.
+    """
+    key1, key2, date = 'patient_id', 'nip', 'date'
+    # engine for sql connection
+    engine = get_engine(id, ip, db)
+
+    # fetching targets table
+    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
+    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
+                                               format='%Y-%m-%d')
+
+    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
+    #       '&STARTDATE={}&ENDDATE={}'
+
+    df_bio = fetch(url, header, id, ip, db, targets)
+
+    # parse dates
+    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
+                                           format='%Y%m%d').dt.date
+
+    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
+                   'Resultat': 'value'}, inplace=True, axis=1)
+
+    # joining with targets
+    df_bio = df_targets.merge(df_bio, left_on='nip', right_on='NIP')
+    df_bio.drop('NIP', axis=1, inplace=True)
+
+    # keep only the measures taken before C1J1
+    df_bio['C1J1'] = df_bio['C1J1'].dt.date
+    df_bio = df_bio[df_bio[date] <= df_bio['C1J1']]
+
+    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
+    df_bio.drop('C1J1', axis=1, inplace=True)
+
+    # remove non-numeric values
+    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
+                                    downcast='float')
+
+    # df_bio is already folded
+    print('done')
+
+    return df_bio
+
+
+def read_biology(url, header):
+    """ reads biology measures for one patient
+
+    Parameters
+    ----------
+    url : str
+        url containing the file
+    header : list
+        column names
+
+    Returns
+    -------
+    dict
+        keys are the names of the biological parameters, values are the
+        measures
+    """
+    req = requests.get(url)
+    values = BeautifulSoup(req.content, 'html.parser').body.text
+
+    df = pd.read_csv(StringIO(values), sep=';', header=None,
+                     index_col=False, names=header)
+    res_dic = {}
+    for i, row in df.iterrows():
+        # replace spaces in the parameter name
+        key = row['Analyse'].replace(' ', '_')
+        value = row['Resultat']
+
+        res_dic[key] = value
+
+    return res_dic
+
+
+def main_fetch_and_fold():
+    description = 'Folding biology measures from Ventura Care'
+    parser = argparse.ArgumentParser(description=description)
+
+    parser.add_argument('--url', '-u',
+                        help='url where the measures are stored')
+    parser.add_argument('--header', '-H',
+                        help='path to the header file used to read the csv')
+    parser.add_argument('--id', '-I',
+                        help='login to connect to the sql server')
+    parser.add_argument('--ip', '-a',
+                        help='ip address of the sql server')
+    parser.add_argument('--db', '-d',
+                        help='name of the database on the sql server')
+    parser.add_argument('--targets', '-t',
+                        help='name of the table containing targets on the db')
+    parser.add_argument('--output', '-o',
+                        help='output path to write the folded result')
+
+    args = parser.parse_args()
+
+    df_bio = fetch_and_fold(args.url, args.header, args.id, args.ip,
+                            args.db, args.targets)
+
+    # df_bio is already folded
+    df_bio.to_csv(args.output, encoding='utf-8', sep=';')
+
+    print('done')
+
+    return df_bio
+
+
+if __name__ == "__main__":
+    main_fetch_and_fold()
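
Minimal usage sketch (not part of the diff): assuming the module is importable as prescreen.vcare.biology and that the Ventura Care endpoint takes the NIP, start date and end date as format() placeholders (as in the commented-out URL inside the module), the folded table could be produced as below. The login, server address, database and table names are hypothetical placeholders, not values from this repository.

    from prescreen.vcare.biology import fetch_and_fold

    # endpoint and header path taken from the comments in the module;
    # credentials and table names are placeholders
    url = ('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep'
           '&NIP={}&STARTDATE={}&ENDDATE={}')
    header = '~/workspace/data/biology/header.csv'

    df_bio = fetch_and_fold(url, header, 'my_login', '192.168.0.1',
                            'my_database', 'targets')
    df_bio.to_csv('bio_folded.csv', sep=';', encoding='utf-8')

    # equivalent command line call (placeholders again):
    # python biology.py -u "$URL" -H header.csv -I my_login -a 192.168.0.1 \
    #     -d my_database -t targets -o bio_folded.csv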