"""
fetching biology measurements
"""
import pandas as pd
from clintk.utils.connection import get_engine, sql2df
from datetime import timedelta
from bs4 import BeautifulSoup
from io import StringIO
import requests
import argparse


def fetch(url, header_path, id, ip, dbase, targets_table):
"""
il suffit de concatener toutes les tables extraites pour ensuite les fold
url : str
url to the location of biology files
header_path : str
path to csv file containing header
id : str
login to the sql database
ip : str
ip adress to the sql server
dbase : str
name of the database on the given server
targets_table : str @TODO remplacer par df_targets
name of the table containing targets information
Returns
-------
"""
# url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
# '&STARTDATE={}&ENDDATE={}'
# header_path = '~/workspace/data/biology/header.csv'
# constant names specific to our database
KEY1 = 'id'
KEY2 = 'NIP'
C1J1 = 'C1J1'
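    # the header file lists the column names of the downloaded result table;
    # it is assumed to contain at least NIP, Analyse, Resultat, Date prelvt
    # and LC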
header = pd.read_csv(header_path, sep=';', encoding='latin1').columns
engine = get_engine(id, ip, dbase)
df_ids = sql2df(engine, targets_table)[[KEY1, 'nip', C1J1]]
df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
df_ids['patient_id'] = df_ids[KEY1]
cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
df_res = pd.DataFrame(data=None, columns=cols)
for index, row in df_ids.iterrows():
nip = row[KEY2].replace(' ', '')
patient_id = row['patient_id']
c1j1_date = row[C1J1].date()
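        # measurements are fetched over the 8 weeks preceding C1J1
        # (cycle 1, day 1)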
start_date = c1j1_date - timedelta(weeks=8)
c1j1 = str(c1j1_date).replace('-', '')
start = str(start_date).replace('-', '')
req = requests.get(url.format(nip, start, c1j1))
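        # the response is an html page whose body text is a ';'-separated
        # table; parse it with the column names from the header file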
values = BeautifulSoup(req.content, 'html.parser').body.text
new_df = pd.read_csv(StringIO(values), sep=';', header=None,
index_col=False, names=header)
        new_df = new_df.loc[:, cols + ['LC']]  # keep 'LC' temporarily; dropped below
        # use the formatted NIP from the targets table so it matches 'nip'
        new_df[KEY2] = row[KEY2]
# new_df[KEY2] = new_df[KEY2].map(str)
# new_df[KEY2] = [nip[:4] + '-' + nip[4:] for nip in new_df[KEY2]]
new_df.drop('LC', axis=1, inplace=True)
df_res = pd.concat([df_res, new_df], axis=0,
sort=False, ignore_index=True)
return df_res
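

# A minimal usage sketch of `fetch`, assuming the commented-out URL template
# and header path above are valid for the target environment; the connection
# parameters and table name below are placeholders, not real values.
def _example_fetch_usage():
    example_url = ('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep'
                   '&NIP={}&STARTDATE={}&ENDDATE={}')
    example_header = '~/workspace/data/biology/header.csv'
    # placeholder credentials, database and table names
    df_raw = fetch(example_url, example_header, id='my_login',
                   ip='192.168.0.1', dbase='my_db',
                   targets_table='targets')
    # df_raw has one row per analysis, with columns
    # ['NIP', 'Analyse', 'Resultat', 'Date prelvt']
    return df_raw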


def fetch_and_fold(url, header, id, ip, db, targets):
    """
    Fetch the biology measurements and return them folded in long format,
    one row per measurement with nip, patient_id, feature, value and date
    columns.
    """
    date = 'date'  # name of the sample date column in the folded output
# engine for sql connection
engine = get_engine(id, ip, db)
# fetching targets table
df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    # parse the C1J1 date
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d')
# url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
# '&STARTDATE={}&ENDDATE={}'
    df_bio = fetch(url, header, id, ip, db, targets)
# parse_dates
df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
format='%Y%m%d').dt.date
df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
'Resultat': 'value'}, inplace=True, axis=1)
# joining with targets
# df_bio = df_bio.merge(df_targets, on=None, left_on='NIP',
# right_on='nip').drop('NIP', axis=1)
df_bio = df_targets.merge(df_bio, left_on='nip', right_on='NIP')
df_bio.drop('NIP', axis=1, inplace=True)
    # C1J1 is already a datetime; keep only the date part for the comparison
    df_bio['C1J1'] = pd.to_datetime(df_bio['C1J1']).dt.date
    # keep only measurements taken on or before C1J1
    df_bio = df_bio[df_bio[date] <= df_bio['C1J1']]
df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
df_bio.drop('C1J1', axis=1, inplace=True)
    # coerce non-numeric results to NaN and downcast to float
df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
downcast='float')
# df_bio already folded
print('done')
return df_bio
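

# A minimal sketch of how `fetch_and_fold` might be called directly (outside
# the command-line entry point below); the url, header path, credentials and
# table name are placeholders for the target environment.
def _example_fetch_and_fold_usage():
    example_url = ('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep'
                   '&NIP={}&STARTDATE={}&ENDDATE={}')
    df_folded = fetch_and_fold(example_url,
                               '~/workspace/data/biology/header.csv',
                               id='my_login', ip='192.168.0.1',
                               db='my_db', targets='targets')
    # folded output: one row per measurement, with columns
    # nip, patient_id, feature, value, date
    return df_folded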


def main_fetch_and_fold():
    """Command-line entry point: fetch and fold the biology measures,
    then write the result to csv."""
description = 'Folding biology measures from Ventura Care'
parser = argparse.ArgumentParser(description=description)
parser.add_argument('--url', '-u',
help='url to where measures are stored')
parser.add_argument('--header', '-H',
help='path to the header file to read csv')
parser.add_argument('--id', '-I',
help='id to connect to sql server')
parser.add_argument('--ip', '-a',
                        help='ip address of the sql server')
parser.add_argument('--db', '-d',
help='name of the database on the sql server')
parser.add_argument('--targets', '-t',
help='name of the table containing targets on the db')
parser.add_argument('--output', '-o',
help='output path to write the folded result')
args = parser.parse_args()
    # fetch and fold the biology measures
df_bio = fetch_and_fold(args.url, args.header, args.id, args.ip,
args.db, args.targets)
# df_bio already folded
output = args.output
df_bio.to_csv(output, encoding='utf-8', sep=';')
print('done')
return df_bio
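

# Example command line for the entry point above (a sketch; the module path,
# credentials, database and table names are placeholders):
#
#   python path/to/this_module.py \
#       -u "http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}&STARTDATE={}&ENDDATE={}" \
#       -H ~/workspace/data/biology/header.csv \
#       -I my_login -a 192.168.0.1 -d my_db -t targets \
#       -o /tmp/biology_folded.csv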
if __name__ == "__main__":
main_fetch_and_fold()


def read_biology(url, header):
    """ reads the biology measures for one patient

    Parameters
    ----------
    url : str
        url of the page containing the measures
    header : list
        column names

    Returns
    -------
    dict
        keys are the names of the biological parameters, values are the
        measures
    """
req = requests.get(url)
values = BeautifulSoup(req.content, 'html.parser').body.text
df = pd.read_csv(StringIO(values), sep=';', header=None,
index_col=False, names=header)
res_dic = {}
for i, row in df.iterrows():
        key = row['Analyse'].replace(' ', '_')  # analysis name used as dict key
value = row['Resultat']
res_dic[key] = value
return res_dic
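

# A minimal sketch of `read_biology` usage for a single patient, assuming the
# same reslabo URL template and header file as above; the NIP and the dates in
# the url are placeholders.
def _example_read_biology_usage():
    header = pd.read_csv('~/workspace/data/biology/header.csv',
                         sep=';', encoding='latin1').columns
    example_url = ('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep'
                   '&NIP=0123456789&STARTDATE=20180101&ENDDATE=20180301')
    measures = read_biology(example_url, header)
    # `measures` maps each analysis name (spaces replaced by '_') to its value
    return measures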