"""
Twins dataset
Load real-world individualized treatment effects estimation datasets

- Reference: http://data.nber.org/data/linked-birth-infant-death-data-vital-statistics-data.html
"""
# stdlib
import random
from pathlib import Path
from typing import Tuple

# third party
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import catenets.logger as log

from .network import download_if_needed


DATASET = "Twin_Data.csv.gz"
URL = "https://bitbucket.org/mvdschaar/mlforhealthlabpub/raw/0b0190bcd38a76c405c805f1ca774971fcd85233/data/twins/Twin_Data.csv.gz"  # noqa: E501


def preprocess(
    fn_csv: Path,
    train_ratio: float = 0.8,
    treatment_type: str = "rand",
    seed: int = 42,
    treat_prop: float = 0.5,
) -> Tuple:
    """Helper for preprocessing the Twins dataset.

    Parameters
    ----------
    fn_csv: Path
        Dataset CSV file path.
    train_ratio: float
        The ratio of training data. When >= 1, no test split is made and the
        test outputs are None.
    treatment_type: str
        Treatment selection strategy: "rand" (Bernoulli with probability
        ``treat_prop``) or "logistic" (probability from a random linear model
        of the covariates).
    seed: int
        Random seed.
    treat_prop: float
        Treatment proportion used by the "rand" strategy.

    Returns
    -------
    train_x: array
        Features in training data.
    train_t: array
        Treatments in training data.
    train_y: array
        Observed outcomes in training data.
    train_potential_y: array
        Potential outcomes in training data.
    test_x: array or None
        Features in testing data (None when train_ratio >= 1).
    test_potential_y: array or None
        Potential outcomes in testing data (None when train_ratio >= 1).

    Raises
    ------
    ValueError
        If ``treatment_type`` is not one of "rand" or "logistic".
    """
    np.random.seed(seed)
    random.seed(seed)

    # Load original data (11400 patients, 30 features, 2 dimensional potential outcomes)
    df = pd.read_csv(fn_csv)

    # Normalize column names: drop straight and curly apostrophes.
    df.columns = [col.replace("'", "").replace("’", "") for col in df.columns]

    # 8: factor not on certificate, 9: factor not classifiable --> mode imputation
    medrisk_list = [
        "anemia",
        "cardiac",
        "lung",
        "diabetes",
        "herpes",
        "hydra",
        "hemo",
        "chyper",
        "phyper",
        "eclamp",
        "incervix",
        "pre4000",
        "dtotord",
        "preterm",
        "renal",
        "rh",
        "uterine",
        "othermr",
    ]
    # 99: missing
    other_list = ["cigar", "drink", "wtgain", "gestat", "dmeduc", "nprevist"]

    other_list2 = ["pldel", "resstatb"]  # but no samples are missing..

    bin_list = ["dmar"] + medrisk_list
    con_list = ["dmage", "mpcb"] + other_list
    cat_list = ["adequacy"] + other_list2

    for feat in medrisk_list:
        # Hoist the (column-constant) mode out of the per-row loop; the mode
        # is computed on the raw column, i.e. before the 8/9 codes are
        # replaced, matching the original behavior.
        mode_val = df[feat].mode()[0]
        df.loc[df[feat].isin([8, 9]), feat] = mode_val

    for feat in other_list:
        # 99 encodes a missing value; impute with the mean of observed values.
        df.loc[df[feat] == 99, feat] = df.loc[df[feat] != 99, feat].mean()

    df_features = df[con_list + bin_list]

    # One-hot encode the categorical columns.
    for feat in cat_list:
        df_features = pd.concat(
            [df_features, pd.get_dummies(df[feat], prefix=feat)], axis=1
        )

    # Final feature set and ordering (raw + one-hot columns).
    feat_list = [
        "dmage",
        "mpcb",
        "cigar",
        "drink",
        "wtgain",
        "gestat",
        "dmeduc",
        "nprevist",
        "dmar",
        "anemia",
        "cardiac",
        "lung",
        "diabetes",
        "herpes",
        "hydra",
        "hemo",
        "chyper",
        "phyper",
        "eclamp",
        "incervix",
        "pre4000",
        "dtotord",
        "preterm",
        "renal",
        "rh",
        "uterine",
        "othermr",
        "adequacy_1",
        "adequacy_2",
        "adequacy_3",
        "pldel_1",
        "pldel_2",
        "pldel_3",
        "pldel_4",
        "pldel_5",
        "resstatb_1",
        "resstatb_2",
        "resstatb_3",
        "resstatb_4",
    ]

    x = np.asarray(df_features[feat_list])

    # Binarize the two potential outcomes; values below 9999 map to 1
    # (9999 presumably encodes survival/censoring in the raw data — see the
    # NBER reference in the module docstring).
    y0 = np.asarray(df[["outcome(t=0)"]]).reshape((-1,))
    y0 = np.array(y0 < 9999, dtype=int)

    y1 = np.asarray(df[["outcome(t=1)"]]).reshape((-1,))
    y1 = np.array(y1 < 9999, dtype=int)

    # Scale all features to [0, 1].
    scaler = MinMaxScaler()
    scaler.fit(x)
    x = scaler.transform(x)

    no, dim = x.shape

    if treatment_type == "rand":
        # Bernoulli assignment with p = treat_prop for every sample.
        prob = np.ones(no) * treat_prop
    elif treatment_type == "logistic":
        # Assignment probability from a random linear model of the features.
        coef = np.random.uniform(-0.1, 0.1, size=[dim, 1])
        # Flatten to shape (no,): with the original (no, 1) shape, the
        # factual outcome below broadcast to an (no, no) matrix.
        prob = (1 / (1 + np.exp(-np.matmul(x, coef)))).reshape(-1)
    else:
        # Previously an unknown strategy crashed with NameError on `prob`.
        raise ValueError(f"Unknown treatment_type: {treatment_type}")

    w = np.random.binomial(1, prob)
    # Observed (factual) outcome.
    y = y1 * w + y0 * (1 - w)

    potential_y = np.vstack((y0, y1)).T

    # Train/test division
    if train_ratio < 1:
        idx = np.random.permutation(no)
        train_idx = idx[: int(train_ratio * no)]
        test_idx = idx[int(train_ratio * no):]

        train_x = x[train_idx, :]
        train_w = w[train_idx]
        train_y = y[train_idx]
        train_potential_y = potential_y[train_idx, :]

        test_x = x[test_idx, :]
        test_potential_y = potential_y[test_idx, :]
    else:
        train_x = x
        train_w = w
        train_y = y
        train_potential_y = potential_y
        test_x = None
        test_potential_y = None

    return train_x, train_w, train_y, train_potential_y, test_x, test_potential_y


def load(
    data_path: Path,
    train_ratio: float = 0.8,
    treatment_type: str = "rand",
    seed: int = 42,
    treat_prop: float = 0.5,
) -> Tuple:
    """
    Twins dataset dataloader.
    - Download the dataset if needed.
    - Load the dataset.
    - Preprocess the data.
    - Return train/test split.

    Parameters
    ----------
    data_path: Path
        Directory containing the CSV. If the file is missing, it will be
        downloaded there.
    train_ratio: float
        Train/test ratio
    treatment_type: str
        Treatment generation strategy ("rand" or "logistic")
    seed: int
        Random seed
    treat_prop: float
        Treatment proportion

    Returns
    -------
    train_x: array
        Features in training data.
    train_t: array
        Treatments in training data.
    train_y: array
        Observed outcomes in training data.
    train_potential_y: array
        Potential outcomes in training data.
    test_x: array or None
        Features in testing data.
    test_potential_y: array or None
        Potential outcomes in testing data.
    """
    csv = data_path / DATASET

    download_if_needed(csv, http_url=URL)

    log.debug(f"load dataset {csv}")

    return preprocess(
        csv,
        train_ratio=train_ratio,
        treatment_type=treatment_type,
        seed=seed,
        treat_prop=treat_prop,
    )