criteria_parser / Git / [811e40] /src/data.py

Models:
joseph-gordon/
criteria_parser
Downloads: 1
[811e40]: / src / data.py
History
Download this file
169 lines (124 with data), 4.7 kB

import pandas as pd
import json
from typing import List, Dict, Tuple
from settings import CHIA_PATH, FB_PATH


def get_chia_entities(clinical_trail_no: str, mode: str, entity_name: str) -> List:
    """Read a trial's .ann file and return the text of every entity of one type.

    The file is parsed as brat standoff annotations, where a text-bound
    annotation row has three tab-separated fields::

        <id>\t<type> <offsets>\t<text>

    Only rows whose id starts with "T" and whose type equals ``entity_name``
    exactly are kept. Relation/attribute rows, and rows of another type whose
    text merely mentions ``entity_name``, are ignored.

    Args:
        clinical_trail_no (str): Clinical trial number (the file-name stem)
        mode (str): File-name suffix selecting the criteria set, e.g. "_inc"
            or "_exc"
        entity_name (str): Annotation type label to extract, e.g. "Drug"

    Returns:
        List: Entity texts in file order, with runs of whitespace collapsed
            to single spaces
    """

    with open(f"{CHIA_PATH}/{clinical_trail_no}{mode}.ann", "rt") as f:
        rows = f.read().splitlines()

    entities = []

    for row in rows:
        fields = row.split("\t", 2)
        if len(fields) != 3:
            # Not a text-bound annotation (or a malformed/blank row).
            continue

        ann_id, header, text = fields
        if not ann_id.startswith("T"):
            continue

        # The header is "<type> <offsets>"; offsets may describe several
        # fragments ("0 5;10 15"), so only the first token is inspected.
        if header.split(None, 1)[:1] != [entity_name]:
            continue

        entities.append(" ".join(text.split()))

    return entities


def get_chia_annotations(entity: str, n: int = None, random: bool = False) -> List[Tuple[str, str, List[str]]]:
    """Returns (clinical_trial_no, criteria, entities) tuples from the Chia dataset

    Only rows that have at least one annotation for ``entity`` are returned.

    Args:
        entity (str): Entity column name, one of "drugs", "persons",
            "procedures", "conditions", "devices", "visits", "scopes",
            "observations", "measurements"
        n (int, optional): Maximum number of records to return. Defaults to
            None, which returns every matching record.
        random (bool, optional): If True, the matching records are shuffled
            before the first ``n`` are taken, so the result is a random
            sample rather than a fixed prefix. Defaults to False.

    Returns:
        List[Tuple[str, str, List[str]]]: One tuple per record holding the
            trial number, the criteria text and the list of entity texts

    Raises:
        AssertionError: If ``entity`` is not one of the supported names
    """
    ents = [
        "drugs",
        "persons",
        "procedures",
        "conditions",
        "devices",
        "visits",
        "scopes",
        "observations",
        "measurements",
    ]

    assert entity in ents, f"Entity must be one of {ents}"

    df = load_chia()

    annotated = df[~df[entity].isna()][["ct_no", "criteria", entity]]

    if random:
        # Shuffle the full set first; truncating before shuffling would always
        # yield the same n records in a different order.
        annotated = annotated.sample(frac=1.0)

    return annotated[:n].to_records(index=False).tolist()


def load_chia() -> pd.DataFrame:
    """Exports Chia annotated dataset as a Pandas dataframe

    Every ``*_inc.txt`` / ``*_exc.txt`` file directly under ``CHIA_PATH``
    becomes one row. Files are visited in sorted order so the row order is
    the same on every run.

    Returns:
        pd.DataFrame: One row per criteria file with the columns ``ct_no``
            (trial number), ``criteria`` (file text with lines joined by a
            space), ``mode`` ("inclusion" or "exclusion") and one column per
            entity type holding a list of entity texts, or None when the
            file has no entity of that type
    """

    records = []

    # Column name -> annotation type label looked up in the .ann files.
    ent_map = {
        "drugs": "Drug",
        "persons": "Person",
        "procedures": "Procedure",
        "conditions": "Condition",
        "devices": "Device",
        "visits": "Visit",
        "scopes": "Scope",
        "observations": "Observation",
        "measurements": "Measurement",
    }

    for mode in ["_inc", "_exc"]:
        suffix = f"{mode}.txt"

        for path in sorted(CHIA_PATH.glob(f"*{suffix}")):
            # Drop the exact suffix from the file name. str.lstrip/rstrip
            # remove *sets of characters*, not a prefix/suffix, and could eat
            # into the trial number itself.
            clinical_trial_no = path.name[: -len(suffix)]

            with open(path, "rt") as handle:
                criteria = " ".join(handle.read().splitlines())

            record = {
                "ct_no": clinical_trial_no,
                "criteria": criteria,
                "mode": "inclusion" if mode == "_inc" else "exclusion",
            }

            for column, label in ent_map.items():
                found = get_chia_entities(clinical_trial_no, mode, label)
                # Empty lists are stored as None so they register as missing.
                record[column] = found if found else None

            records.append(record)

    return pd.DataFrame(records)


def load_fb() -> Dict:
    """Loads the FB annotated dataset splits from their JSON files

    Each split file under ``FB_PATH`` is parsed as JSON and flattened into a
    dataframe with ``pd.json_normalize``.

    Returns:
        Dict: Dataframes keyed by split name: "train", "test" and "val"
    """

    split_files = (
        ("train", "fb_dset_100trials_train.json"),
        ("test", "fb_dset_100trials_test.json"),
        ("val", "fb_dset_100trials_val.json"),
    )

    frames = {}

    for split_name, file_name in split_files:
        with open(f"{FB_PATH}/{file_name}", "rt") as handle:
            frames[split_name] = pd.json_normalize(json.load(handle))

    return frames


def train_test_val_split(df: pd.DataFrame, random_seed=42, ratio=(70, 20, 10)) -> Dict:
    """Shuffles the dataset and splits it into train, test and validation sets

    The rows are shuffled with ``random_seed`` and re-indexed from 0, then cut
    into three consecutive slices. Train and test sizes are rounded down; the
    validation set receives every remaining row, so it also absorbs any
    rounding leftovers and no row is ever dropped.

    Args:
        df (pd.DataFrame): Input dataframe
        random_seed (int, optional): Random seed. Defaults to 42.
        ratio (tuple, optional): Train, test and val percentages, which must
            sum to 100. Defaults to (70, 20, 10).

    Returns:
        Dict: Dictionary with "train", "test" and "val" dataframes

    Raises:
        AssertionError: If the ratios do not sum to 100
    """

    assert sum(ratio) == 100, "Sum of ratios must be 100"

    shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # The val percentage is only needed for the sum check above: val is
    # whatever is left after train and test.
    train_ratio, test_ratio, _ = ratio

    train_end = int(len(shuffled) * train_ratio / 100)
    test_end = train_end + int(len(shuffled) * test_ratio / 100)

    return {
        "train": shuffled.iloc[:train_end],
        "test": shuffled.iloc[train_end:test_end],
        "val": shuffled.iloc[test_end:],
    }