--- a
+++ b/prescreen/vcare/biology.py
@@ -0,0 +1,209 @@
+"""
+Fetching biology measurements.
+"""
+import pandas as pd
+
+from clintk.utils.connection import get_engine, sql2df
+from datetime import timedelta
+from bs4 import BeautifulSoup
+from io import StringIO
+
+import requests
+import argparse
+
+
+def fetch(url, header_path, id, ip, dbase, targets_table):
+    """ fetches the raw biology tables for every target patient
+
+    All the extracted tables simply need to be concatenated before being
+    folded.
+
+    Parameters
+    ----------
+    url : str
+        url to the location of the biology files
+    header_path : str
+        path to the csv file containing the header
+    id : str
+        login to the sql database
+    ip : str
+        ip address of the sql server
+    dbase : str
+        name of the database on the given server
+    targets_table : str  @TODO replace with df_targets
+        name of the table containing targets information
+
+    Returns
+    -------
+    pandas.DataFrame
+        concatenation of the raw biology measures for all target patients
+    """
+    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
+    #       '&STARTDATE={}&ENDDATE={}'
+    # header_path = '~/workspace/data/biology/header.csv'
+
+    # constant names specific to our database
+    KEY1 = 'id'
+    KEY2 = 'NIP'
+    C1J1 = 'C1J1'
+
+    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns
+
+    engine = get_engine(id, ip, dbase)
+
+    df_ids = sql2df(engine, targets_table)[[KEY1, 'nip', C1J1]]
+    df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
+    df_ids['patient_id'] = df_ids[KEY1]
+
+    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
+    df_res = pd.DataFrame(data=None, columns=cols)
+
+    for index, row in df_ids.iterrows():
+        nip = row[KEY2].replace(' ', '')
+        patient_id = row['patient_id']
+        c1j1_date = row[C1J1].date()
+        start_date = c1j1_date - timedelta(weeks=8)
+
+        c1j1 = str(c1j1_date).replace('-', '')
+        start = str(start_date).replace('-', '')
+
+        # one request per patient, over the 8 weeks preceding C1J1
+        req = requests.get(url.format(nip, start, c1j1))
+        values = BeautifulSoup(req.content, 'html.parser').body.text
+
+        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
+                             index_col=False, names=header)
+        # keep only the relevant columns (drops 'LC' among others)
+        new_df = new_df.loc[:, cols]
+
+        # normalize nip
+        new_df[KEY2] = row[KEY2]
+        # new_df[KEY2] = new_df[KEY2].map(str)
+        # new_df[KEY2] = [nip[:4] + '-' + nip[4:] for nip in new_df[KEY2]]
+
+        df_res = pd.concat([df_res, new_df], axis=0,
+                           sort=False, ignore_index=True)
+
+    return df_res
+
+
+def fetch_and_fold(url, header, id, ip, db, targets):
+    """ fetches the biology measures and returns them in folded format
+
+    The result is a long-format dataframe with one row per measurement and
+    columns 'patient_id', 'nip', 'date', 'feature' and 'value'.
+    """
+    key1, key2, date = 'patient_id', 'nip', 'date'
+    # engine for sql connection
+    engine = get_engine(id, ip, db)
+
+    # fetching targets table
+    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
+    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
+                                               format='%Y-%m-%d')
+
+    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
+    #       '&STARTDATE={}&ENDDATE={}'
+
+    df_bio = fetch(url, header, id, ip, db, targets)
+
+    # parse dates
+    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
+                                           format='%Y%m%d').dt.date
+
+    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
+                   'Resultat': 'value'}, inplace=True, axis=1)
+
+    # joining with targets
+    df_bio = df_targets.merge(df_bio, left_on='nip', right_on='NIP')
+    df_bio.drop('NIP', axis=1, inplace=True)
+
+    # keep only the measures taken before C1J1
+    df_bio['C1J1'] = df_bio['C1J1'].dt.date
+    df_bio = df_bio[df_bio[date] <= df_bio['C1J1']]
+
+    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
+    df_bio.drop('C1J1', axis=1, inplace=True)
+
+    # remove non-numeric values
+    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
+                                    downcast='float')
+
+    # df_bio is already folded
+    print('done')
+
+    return df_bio
+
+
+def read_biology(url, header):
+    """ reads biology measures for one patient
+
+    Parameters
+    ----------
+    url : str
+        url containing the file
+    header : list
+        column names
+
+    Returns
+    -------
+    dict
+        keys are the names of the biological parameters, values are the
+        measures
+    """
+    req = requests.get(url)
+    values = BeautifulSoup(req.content, 'html.parser').body.text
+
+    df = pd.read_csv(StringIO(values), sep=';', header=None,
+                     index_col=False, names=header)
+    res_dic = {}
+    for i, row in df.iterrows():
+        # replace spaces in the parameter name
+        key = row['Analyse'].replace(' ', '_')
+        value = row['Resultat']
+
+        res_dic[key] = value
+
+    return res_dic
+
+
+def main_fetch_and_fold():
+    description = 'Folding biology measures from Ventura Care'
+    parser = argparse.ArgumentParser(description=description)
+
+    parser.add_argument('--url', '-u',
+                        help='url where the measures are stored')
+    parser.add_argument('--header', '-H',
+                        help='path to the header file used to read the csv')
+    parser.add_argument('--id', '-I',
+                        help='login to connect to the sql server')
+    parser.add_argument('--ip', '-a',
+                        help='ip address of the sql server')
+    parser.add_argument('--db', '-d',
+                        help='name of the database on the sql server')
+    parser.add_argument('--targets', '-t',
+                        help='name of the table containing targets on the db')
+    parser.add_argument('--output', '-o',
+                        help='output path to write the folded result')
+
+    args = parser.parse_args()
+
+    df_bio = fetch_and_fold(args.url, args.header, args.id, args.ip,
+                            args.db, args.targets)
+
+    # df_bio is already folded
+    df_bio.to_csv(args.output, encoding='utf-8', sep=';')
+
+    print('done')
+
+    return df_bio
+
+
+if __name__ == "__main__":
+    main_fetch_and_fold()
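
Minimal usage sketch (not part of the diff): assuming the module is importable as prescreen.vcare.biology and that the Ventura Care endpoint takes the NIP, start date and end date as format() placeholders (as in the commented-out URL inside the module), the folded table could be produced as below. The login, server address, database and table names are hypothetical placeholders, not values from this repository.

    from prescreen.vcare.biology import fetch_and_fold

    # endpoint and header path taken from the comments in the module;
    # credentials and table names are placeholders
    url = ('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep'
           '&NIP={}&STARTDATE={}&ENDDATE={}')
    header = '~/workspace/data/biology/header.csv'

    df_bio = fetch_and_fold(url, header, 'my_login', '192.168.0.1',
                            'my_database', 'targets')
    df_bio.to_csv('bio_folded.csv', sep=';', encoding='utf-8')

    # equivalent command line call (placeholders again):
    # python biology.py -u "$URL" -H header.csv -I my_login -a 192.168.0.1 \
    #     -d my_database -t targets -o bio_folded.csv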