"""
Twins dataset
Load real-world individualized treatment effects estimation datasets

- Reference: http://data.nber.org/data/linked-birth-infant-death-data-vital-statistics-data.html
"""
# stdlib
import random
from pathlib import Path
from typing import Tuple

# third party
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import catenets.logger as log

from .network import download_if_needed


DATASET = "Twin_Data.csv.gz"
URL = "https://bitbucket.org/mvdschaar/mlforhealthlabpub/raw/0b0190bcd38a76c405c805f1ca774971fcd85233/data/twins/Twin_Data.csv.gz"  # noqa: E501


def preprocess(
    fn_csv: Path,
    train_ratio: float = 0.8,
    treatment_type: str = "rand",
    seed: int = 42,
    treat_prop: float = 0.5,
) -> Tuple:
    """Helper for preprocessing the Twins dataset.

    Parameters
    ----------
    fn_csv: Path
        Dataset CSV file path.
    train_ratio: float
        The ratio of training data. When >= 1, no test split is made and the
        test outputs are None.
    treatment_type: str
        Treatment selection strategy: "rand" (Bernoulli with probability
        ``treat_prop``) or "logistic" (probability from a random linear model
        of the covariates).
    seed: int
        Random seed.
    treat_prop: float
        Treatment proportion used by the "rand" strategy.

    Returns
    -------
    train_x: array
        Features in training data.
    train_t: array
        Treatments in training data.
    train_y: array
        Observed outcomes in training data.
    train_potential_y: array
        Potential outcomes in training data.
    test_x: array or None
        Features in testing data (None when train_ratio >= 1).
    test_potential_y: array or None
        Potential outcomes in testing data (None when train_ratio >= 1).

    Raises
    ------
    ValueError
        If ``treatment_type`` is not one of "rand" or "logistic".
    """
    np.random.seed(seed)
    random.seed(seed)

    # Load original data (11400 patients, 30 features, 2 dimensional potential outcomes)
    df = pd.read_csv(fn_csv)

    # Normalize column names: drop straight and curly apostrophes.
    df.columns = [col.replace("'", "").replace("’", "") for col in df.columns]

    # 8: factor not on certificate, 9: factor not classifiable --> mode imputation
    medrisk_list = [
        "anemia",
        "cardiac",
        "lung",
        "diabetes",
        "herpes",
        "hydra",
        "hemo",
        "chyper",
        "phyper",
        "eclamp",
        "incervix",
        "pre4000",
        "dtotord",
        "preterm",
        "renal",
        "rh",
        "uterine",
        "othermr",
    ]
    # 99: missing
    other_list = ["cigar", "drink", "wtgain", "gestat", "dmeduc", "nprevist"]

    other_list2 = ["pldel", "resstatb"]  # but no samples are missing..

    bin_list = ["dmar"] + medrisk_list
    con_list = ["dmage", "mpcb"] + other_list
    cat_list = ["adequacy"] + other_list2

    for feat in medrisk_list:
        # Hoist the (column-constant) mode out of the per-row loop; the mode
        # is computed on the raw column, i.e. before the 8/9 codes are
        # replaced, matching the original behavior.
        mode_val = df[feat].mode()[0]
        df.loc[df[feat].isin([8, 9]), feat] = mode_val

    for feat in other_list:
        # 99 encodes a missing value; impute with the mean of observed values.
        df.loc[df[feat] == 99, feat] = df.loc[df[feat] != 99, feat].mean()

    df_features = df[con_list + bin_list]

    # One-hot encode the categorical columns.
    for feat in cat_list:
        df_features = pd.concat(
            [df_features, pd.get_dummies(df[feat], prefix=feat)], axis=1
        )

    # Final feature set and ordering (raw + one-hot columns).
    feat_list = [
        "dmage",
        "mpcb",
        "cigar",
        "drink",
        "wtgain",
        "gestat",
        "dmeduc",
        "nprevist",
        "dmar",
        "anemia",
        "cardiac",
        "lung",
        "diabetes",
        "herpes",
        "hydra",
        "hemo",
        "chyper",
        "phyper",
        "eclamp",
        "incervix",
        "pre4000",
        "dtotord",
        "preterm",
        "renal",
        "rh",
        "uterine",
        "othermr",
        "adequacy_1",
        "adequacy_2",
        "adequacy_3",
        "pldel_1",
        "pldel_2",
        "pldel_3",
        "pldel_4",
        "pldel_5",
        "resstatb_1",
        "resstatb_2",
        "resstatb_3",
        "resstatb_4",
    ]

    x = np.asarray(df_features[feat_list])

    # Binarize the two potential outcomes; values below 9999 map to 1
    # (9999 presumably encodes survival/censoring in the raw data — see the
    # NBER reference in the module docstring).
    y0 = np.asarray(df[["outcome(t=0)"]]).reshape((-1,))
    y0 = np.array(y0 < 9999, dtype=int)

    y1 = np.asarray(df[["outcome(t=1)"]]).reshape((-1,))
    y1 = np.array(y1 < 9999, dtype=int)

    # Scale all features to [0, 1].
    scaler = MinMaxScaler()
    scaler.fit(x)
    x = scaler.transform(x)

    no, dim = x.shape

    if treatment_type == "rand":
        # Bernoulli assignment with p = treat_prop for every sample.
        prob = np.ones(no) * treat_prop
    elif treatment_type == "logistic":
        # Assignment probability from a random linear model of the features.
        coef = np.random.uniform(-0.1, 0.1, size=[dim, 1])
        # Flatten to shape (no,): with the original (no, 1) shape, the
        # factual outcome below broadcast to an (no, no) matrix.
        prob = (1 / (1 + np.exp(-np.matmul(x, coef)))).reshape(-1)
    else:
        # Previously an unknown strategy crashed with NameError on `prob`.
        raise ValueError(f"Unknown treatment_type: {treatment_type}")

    w = np.random.binomial(1, prob)
    # Observed (factual) outcome.
    y = y1 * w + y0 * (1 - w)

    potential_y = np.vstack((y0, y1)).T

    # Train/test division
    if train_ratio < 1:
        idx = np.random.permutation(no)
        train_idx = idx[: int(train_ratio * no)]
        test_idx = idx[int(train_ratio * no):]

        train_x = x[train_idx, :]
        train_w = w[train_idx]
        train_y = y[train_idx]
        train_potential_y = potential_y[train_idx, :]

        test_x = x[test_idx, :]
        test_potential_y = potential_y[test_idx, :]
    else:
        train_x = x
        train_w = w
        train_y = y
        train_potential_y = potential_y
        test_x = None
        test_potential_y = None

    return train_x, train_w, train_y, train_potential_y, test_x, test_potential_y


def load(
    data_path: Path,
    train_ratio: float = 0.8,
    treatment_type: str = "rand",
    seed: int = 42,
    treat_prop: float = 0.5,
) -> Tuple:
    """
    Twins dataset dataloader.
    - Download the dataset if needed.
    - Load the dataset.
    - Preprocess the data.
    - Return train/test split.

    Parameters
    ----------
    data_path: Path
        Directory containing the CSV. If the file is missing, it will be
        downloaded there.
    train_ratio: float
        Train/test ratio
    treatment_type: str
        Treatment generation strategy ("rand" or "logistic")
    seed: int
        Random seed
    treat_prop: float
        Treatment proportion

    Returns
    -------
    train_x: array
        Features in training data.
    train_t: array
        Treatments in training data.
    train_y: array
        Observed outcomes in training data.
    train_potential_y: array
        Potential outcomes in training data.
    test_x: array or None
        Features in testing data.
    test_potential_y: array or None
        Potential outcomes in testing data.
    """
    csv = data_path / DATASET

    download_if_needed(csv, http_url=URL)

    log.debug(f"load dataset {csv}")

    return preprocess(
        csv,
        train_ratio=train_ratio,
        treatment_type=treatment_type,
        seed=seed,
        treat_prop=treat_prop,
    )