selection-bias-benchmark / Git / [6ac965] /catenets/datasets/dataset

Models:
MarcoTheBlack/
selection-bias-benchmark
Downloads: 1
[6ac965]: / catenets / datasets / dataset_twins.py
History
Download this file
266 lines (226 with data), 6.7 kB

"""
Twins dataset
Load real-world individualized treatment effects estimation datasets

- Reference: http://data.nber.org/data/linked-birth-infant-death-data-vital-statistics-data.html
"""
# stdlib
import random
from pathlib import Path
from typing import Tuple

# third party
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import catenets.logger as log

from .network import download_if_needed


# File name of the gzipped Twins CSV; `load` joins it onto the caller-supplied
# `data_path` directory.
DATASET = "Twin_Data.csv.gz"
# Remote location passed to `download_if_needed` by `load`. The path embeds
# what looks like a commit hash, so the file should be pinned to one revision
# (NOTE(review): confirm the hash still resolves).
URL = "https://bitbucket.org/mvdschaar/mlforhealthlabpub/raw/0b0190bcd38a76c405c805f1ca774971fcd85233/data/twins/Twin_Data.csv.gz"  # noqa: E501


def preprocess(
    fn_csv: Path,
    train_ratio: float = 0.8,
    treatment_type: str = "rand",
    seed: int = 42,
    treat_prop: float = 0.5,
) -> Tuple:
    """Helper for preprocessing the Twins dataset.

    Loads the CSV, imputes coded missing values, one-hot encodes the
    categorical features, min-max scales every feature, simulates a treatment
    assignment and finally splits the samples into train and test parts.

    Note that this function seeds the *global* ``numpy.random`` and ``random``
    generators with ``seed``.

    Parameters
    ----------
    fn_csv: Path
        Dataset CSV file path.
    train_ratio: float
        The ratio of training data. A value >= 1 disables the split: all
        samples are returned as training data and the test outputs are None.
    treatment_type: str
        The treatment selection strategy. ``"rand"`` treats every sample with
        probability ``treat_prop``; ``"logistic"`` draws a random linear model
        over the scaled features and uses its sigmoid as the propensity.
    seed: int
        Random seed.
    treat_prop: float
        Treatment probability used when ``treatment_type == "rand"``. Ignored
        for ``"logistic"``.

    Returns
    -------
    train_x: np.ndarray
        Features in training data.
    train_t: np.ndarray
        Treatments in training data, shape (n_train,).
    train_y: np.ndarray
        Observed outcomes in training data, shape (n_train,).
    train_potential_y: np.ndarray
        Potential outcomes in training data, columns are (y0, y1).
    test_x: np.ndarray or None
        Features in testing data.
    test_potential_y: np.ndarray or None
        Potential outcomes in testing data, columns are (y0, y1).

    Raises
    ------
    ValueError
        If ``treatment_type`` is neither ``"rand"`` nor ``"logistic"``.
    """
    # Fail fast, before any I/O, instead of hitting an unbound `prob` later.
    if treatment_type not in ("rand", "logistic"):
        raise ValueError(
            f"Unknown treatment_type {treatment_type!r}; "
            "expected 'rand' or 'logistic'."
        )

    np.random.seed(seed)
    random.seed(seed)

    # Load original data (11400 patients, 30 features, 2 dimensional potential outcomes)
    df = pd.read_csv(fn_csv)

    # Strip straight and curly quotes from the header names.
    df.columns = [col.replace("'", "").replace("’", "") for col in df.columns]

    # 8: factor not on certificate, 9: factor not classifiable --> mode imputation
    medrisk_list = [
        "anemia",
        "cardiac",
        "lung",
        "diabetes",
        "herpes",
        "hydra",
        "hemo",
        "chyper",
        "phyper",
        "eclamp",
        "incervix",
        "pre4000",
        "dtotord",
        "preterm",
        "renal",
        "rh",
        "uterine",
        "othermr",
    ]
    # 99: missing
    other_list = ["cigar", "drink", "wtgain", "gestat", "dmeduc", "nprevist"]

    other_list2 = ["pldel", "resstatb"]  # but no samples are missing..

    bin_list = ["dmar"] + medrisk_list
    con_list = ["dmage", "mpcb"] + other_list
    cat_list = ["adequacy"] + other_list2

    for feat in medrisk_list:
        missing = df[feat].isin([8, 9])
        if missing.any():
            # The mode is taken over the whole column (missing codes included)
            # and computed once per column rather than once per missing cell.
            df.loc[missing, feat] = df[feat].mode()[0]

    for feat in other_list:
        missing = df[feat] == 99
        if missing.any():
            observed_mean = df.loc[~missing, feat].mean()
            # Cast first so the float mean is not written into an integer
            # column through implicit upcasting on assignment.
            column = df[feat].astype(float)
            column.loc[missing] = observed_mean
            df[feat] = column

    # Continuous and binary features first, then one dummy block per
    # categorical feature, in `cat_list` order.
    dummy_frames = [pd.get_dummies(df[feat], prefix=feat) for feat in cat_list]
    df_features = pd.concat([df[con_list + bin_list]] + dummy_frames, axis=1)

    # Define features: the explicit dummy names fix the column set and order
    # regardless of which category levels `get_dummies` produced.
    feat_list = con_list + bin_list + [
        "adequacy_1",
        "adequacy_2",
        "adequacy_3",
        "pldel_1",
        "pldel_2",
        "pldel_3",
        "pldel_4",
        "pldel_5",
        "resstatb_1",
        "resstatb_2",
        "resstatb_3",
        "resstatb_4",
    ]

    x = np.asarray(df_features[feat_list])

    # Binarize both potential outcomes as `value < 9999`.
    # NOTE(review): 9999 is presumably a sentinel/cap in the raw data - confirm.
    y0 = np.array(np.asarray(df["outcome(t=0)"]).reshape((-1,)) < 9999, dtype=int)
    y1 = np.array(np.asarray(df["outcome(t=1)"]).reshape((-1,)) < 9999, dtype=int)

    # Preprocessing: scale every feature column to [0, 1].
    x = MinMaxScaler().fit_transform(x)

    no = x.shape[0]

    if treatment_type == "rand":
        # assign every sample with the same probability `treat_prop`
        prob = np.full(no, treat_prop, dtype=float)
    else:
        # assign with logistic prob from a random linear model
        coef = np.random.uniform(-0.1, 0.1, size=[x.shape[1], 1])
        # Flatten to (n,): a (n, 1) propensity would make `w` (n, 1) and
        # broadcast the observed outcome below into an (n, n) matrix.
        prob = (1 / (1 + np.exp(-np.matmul(x, coef)))).reshape(-1)

    w = np.random.binomial(1, prob)
    # Observed outcome: the treated potential outcome where w == 1, else control.
    y = y1 * w + y0 * (1 - w)

    potential_y = np.vstack((y0, y1)).T

    # Train/test division
    if train_ratio < 1:
        idx = np.random.permutation(no)
        n_train = int(train_ratio * no)
        train_idx = idx[:n_train]
        test_idx = idx[n_train:]

        train_x = x[train_idx, :]
        train_w = w[train_idx]
        train_y = y[train_idx]
        train_potential_y = potential_y[train_idx, :]

        test_x = x[test_idx, :]
        test_potential_y = potential_y[test_idx, :]
    else:
        train_x = x
        train_w = w
        train_y = y
        train_potential_y = potential_y
        test_x = None
        test_potential_y = None

    return train_x, train_w, train_y, train_potential_y, test_x, test_potential_y


def load(
    data_path: Path,
    train_ratio: float = 0.8,
    treatment_type: str = "rand",
    seed: int = 42,
    treat_prop: float = 0.5,
) -> Tuple:
    """
    Twins dataset dataloader.

    Resolves the dataset file inside ``data_path``, hands it to
    ``download_if_needed`` together with the source URL, and returns the
    result of ``preprocess`` on that file.

    Parameters
    ----------
    data_path: Path
        Directory expected to hold the dataset CSV. If the file is missing,
        it will be downloaded.
    train_ratio: float
        Train/test ratio
    treatment_type: str
        Treatment generation strategy
    seed: int
        Random seed
    treat_prop: float
        Treatment proportion

    Returns
    -------
    train_x: array or pd.DataFrame
        Features in training data.
    train_t: array or pd.DataFrame
        Treatments in training data.
    train_y: array or pd.DataFrame
        Observed outcomes in training data.
    train_potential_y: array or pd.DataFrame
        Potential outcomes in training data.
    test_x: array or pd.DataFrame
        Features in testing data.
    test_potential_y: array or pd.DataFrame
        Potential outcomes in testing data.
    """
    dataset_file = data_path / DATASET

    # Make sure the file is available locally before it is parsed.
    download_if_needed(dataset_file, http_url=URL)
    log.debug(f"load dataset {dataset_file}")

    # Forward every tuning knob unchanged to the preprocessing step.
    options = {
        "train_ratio": train_ratio,
        "treatment_type": treatment_type,
        "seed": seed,
        "treat_prop": treat_prop,
    }
    return preprocess(dataset_file, **options)