Switch to side-by-side view

--- a
+++ b/catenets/datasets/dataset_acic2016.py
@@ -0,0 +1,325 @@
+"""
+ACIC2016 dataset
+"""
+# stdlib
+import random
+from pathlib import Path
+from typing import Any, Tuple
+import glob
+
+# third party
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.model_selection import train_test_split
+
+import catenets.logger as log
+
+from .network import download_if_needed
+
+# Seed both global RNGs as an import-time side effect. preprocess_simu()
+# re-seeds numpy with its own `i_exp` argument before drawing anything.
+np.random.seed(0)
+random.seed(0)
+
+# Identifiers passed as `file_id` to download_if_needed() in load():
+# FILE_ID for the raw "data_cf_all.tar.gz" archive, PREPROCESSED_FILE_ID
+# for "x_trans.csv".
+FILE_ID = "0B7pG5PPgj6A3N09ibmFwNWE1djA"
+PREPROCESSED_FILE_ID = "1iOfEAk402o3jYBs2Prfiz6oaailwWcR5"
+
+# Positional column indices into the covariate matrix; preprocess_simu()
+# squares exactly these columns (X[:, NUMERIC_COLS] ** 2) for the
+# non-linear part of the simulated outcomes.
+# Presumably these are the continuous covariates of the preprocessed
+# covariate file -- TODO confirm against the data.
+# NOTE(review): 24 appears twice and 23 never appears, so column 24
+# contributes two squared terms. This may be a typo for 23; it is left
+# unchanged because altering it changes every simulated outcome.
+NUMERIC_COLS = [
+    0,
+    3,
+    4,
+    16,
+    17,
+    18,
+    20,
+    21,
+    22,
+    24,
+    24,
+    25,
+    30,
+    31,
+    32,
+    33,
+    39,
+    40,
+    41,
+    53,
+    54,
+]
+# Number of squared-term coefficients drawn in preprocess_simu()
+# (counts the duplicated entry as well).
+N_NUM_COLS = len(NUMERIC_COLS)
+
+
+def get_acic_covariates(
+        fn_csv: Path, keep_categorical: bool = False, preprocessed: bool = True
+) -> np.ndarray:
+    """
+    Read the covariate CSV and return it as a 2-D numpy array.
+
+    Parameters
+    ----------
+    fn_csv: Path
+        CSV file holding the covariates (read with ``pd.read_csv``).
+    keep_categorical: bool
+        If False, the columns "x_2", "x_21" and "x_24" are dropped.
+        If True, every column whose first value is not ``np.int64`` or
+        ``np.float64`` is one-hot encoded (first category dropped); the
+        dummy columns are appended at the end and the source column removed.
+    preprocessed: bool
+        If True the values are returned as they are; otherwise every column
+        is standardised with ``StandardScaler``.
+
+    Returns
+    -------
+    np.ndarray
+        Covariate matrix with one row per row of the CSV.
+    """
+    X = pd.read_csv(fn_csv)
+    if not keep_categorical:
+        X = X.drop(columns=["x_2", "x_21", "x_24"])
+    else:
+        # encode categorical features
+        categorical_cols = []
+        # Iterate over a snapshot: dummy columns are appended to X below.
+        for col in list(X.columns):
+            if type(X.loc[X.index[0], col]) in (np.int64, np.float64):
+                continue
+
+            raw = np.array(X[[col]]).reshape((-1, 1))
+            enc = OneHotEncoder(drop="first")
+            enc.fit(raw)
+
+            # get_feature_names() was removed in scikit-learn 1.2; prefer the
+            # replacement and fall back for older installations. Both return
+            # the same default names for array input.
+            names_fn = getattr(enc, "get_feature_names_out", None)
+            if names_fn is None:
+                names_fn = enc.get_feature_names
+            dummy_names = list(names_fn())
+
+            # Transform once per column instead of once per dummy column.
+            dummies = enc.transform(raw).toarray()
+            for k, dummy_name in enumerate(dummy_names):
+                X[col + dummy_name] = dummies[:, k]
+
+            categorical_cols.append(col)
+
+        X = X.drop(columns=categorical_cols)
+
+    if preprocessed:
+        X_t = X.values
+    else:
+        scaler = StandardScaler()
+        X_t = scaler.fit_transform(X)
+    return X_t
+
+
+def preprocess_simu(
+        fn_csv: Path,
+        n_0: int = 2000,
+        n_1: int = 200,
+        n_test: int = 500,
+        error_sd: float = 1,
+        sp_lin: float = 0.6,
+        sp_nonlin: float = 0.3,
+        prop_gamma: float = 0,
+        prop_omega: float = 0,
+        ate_goal: float = 0,
+        inter: bool = True,
+        i_exp: int = 0,
+        keep_categorical: bool = False,
+        preprocessed: bool = True,
+) -> Tuple:
+    """
+    Simulate potential outcomes on the ACIC2016 covariates and split them
+    into a training and a test set.
+
+    The global numpy RNG is re-seeded with ``i_exp``; every random draw below
+    depends on the order of the statements, so reordering them changes the
+    simulated data.
+
+    Parameters
+    ----------
+    fn_csv: Path
+        Covariate CSV, forwarded to ``get_acic_covariates``.
+    n_0: int
+        Number of control units in the training set.
+    n_1: int
+        Number of treated units in the training set.
+    n_test: int
+        Number of test units (the last ``n_test`` shuffled rows).
+    error_sd: float
+        Standard deviation of the Gaussian noise added to the outcome.
+    sp_lin: float
+        Probability that a covariate gets linear baseline coefficient 1
+        (otherwise 0).
+    sp_nonlin: float
+        Probability that a squared / interaction term gets coefficient 0.1
+        (otherwise 0). If not > 0, no squared or interaction terms are added.
+    prop_gamma: float
+        Probability that a covariate gets an additive treatment-effect
+        coefficient 1 in ``mu_1``.
+    prop_omega: float
+        Probability that a baseline term is switched off (multiplied by 0)
+        in ``mu_1``.
+    ate_goal: float
+        ``mu_1`` is shifted so that the mean of ``mu_1 - mu_0`` over all rows
+        equals this value.
+    inter: bool
+        Add pairwise interaction terms; only evaluated when ``sp_nonlin > 0``.
+    i_exp: int
+        Seed passed to ``np.random.seed``.
+    keep_categorical: bool
+        Forwarded to ``get_acic_covariates``.
+    preprocessed: bool
+        Forwarded to ``get_acic_covariates``.
+
+    Returns
+    -------
+    Tuple
+        ``(X_train, w_train, y_train, po_train, X_test, w_test, y_test,
+        po_test)`` where ``po_*`` stacks ``mu_0`` and ``mu_1`` as two columns.
+    """
+    X = get_acic_covariates(
+        fn_csv, keep_categorical=keep_categorical, preprocessed=preprocessed
+    )
+    np.random.seed(i_exp)
+
+    # shuffle indices: the first n_0 + n_1 shuffled rows form the training
+    # set, the last n_test rows the test set
+    # NOTE(review): nothing checks n_0 + n_1 + n_test <= n_total; if the sum
+    # is larger, training and test rows overlap.
+    n_total, n_cov = X.shape
+    ind = np.arange(n_total)
+    np.random.shuffle(ind)
+    ind_test = ind[-n_test:]
+    ind_1 = ind[n_0: (n_0 + n_1)]
+
+    # create treatment indicator (treatment assignment does not matter in test set)
+    w = np.zeros(n_total).reshape((-1, 1))
+    w[ind_1] = 1
+
+    # create dgp
+    coeffs_ = [0, 1]
+    # sample baseline coefficients
+    beta_0 = np.random.choice(coeffs_, size=n_cov, replace=True, p=[1 - sp_lin, sp_lin])
+    # intercept drawn uniformly from the grid -1, -0.75, ..., 1
+    intercept = np.random.choice([x for x in np.arange(-1, 1.25, 0.25)])
+
+    # sample treatment effect coefficients
+    # gamma: additive effect coefficients; omega: 0/1 mask on baseline terms
+    gamma = np.random.choice(
+        coeffs_, size=n_cov, replace=True, p=[1 - prop_gamma, prop_gamma]
+    )
+    omega = np.random.choice(
+        [0, 1], replace=True, size=n_cov, p=[prop_omega, 1 - prop_omega]
+    )
+
+    # simulate mu_0 and mu_1
+    mu_0 = (intercept + np.dot(X, beta_0)).reshape((-1, 1))
+    mu_1 = (intercept + np.dot(X, gamma + beta_0 * omega)).reshape((-1, 1))
+    if sp_nonlin > 0:
+        # squared terms on the NUMERIC_COLS columns, with a fresh omega mask
+        coefs_sq = [0, 0.1]
+        beta_sq = np.random.choice(
+            coefs_sq, size=N_NUM_COLS, replace=True, p=[1 - sp_nonlin, sp_nonlin]
+        )
+        omega = np.random.choice(
+            [0, 1], replace=True, size=N_NUM_COLS, p=[prop_omega, 1 - prop_omega]
+        )
+        # NOTE(review): NUMERIC_COLS are positional indices; presumably they
+        # refer to the column layout obtained with keep_categorical=False --
+        # confirm before using keep_categorical=True here.
+        X_sq = X[:, NUMERIC_COLS] ** 2
+        mu_0 = mu_0 + np.dot(X_sq, beta_sq).reshape((-1, 1))
+        mu_1 = mu_1 + np.dot(X_sq, beta_sq * omega).reshape((-1, 1))
+
+        if inter:
+            # randomly add some interactions: products of consecutive pairs
+            # of shuffled columns
+            ind_c = np.arange(n_cov)
+            np.random.shuffle(ind_c)
+            inter_list = list()
+            # NOTE(review): the bound n_cov - 2 leaves the last one or two
+            # shuffled columns unpaired -- confirm this is intended.
+            for i in range(0, n_cov - 2, 2):
+                inter_list.append(X[:, ind_c[i]] * X[:, ind_c[i + 1]])
+
+            X_inter = np.array(inter_list).T
+            n_inter = X_inter.shape[1]
+            beta_inter = np.random.choice(
+                coefs_sq, size=n_inter, replace=True, p=[1 - sp_nonlin, sp_nonlin]
+            )
+            omega = np.random.choice(
+                [0, 1], replace=True, size=n_inter, p=[prop_omega, 1 - prop_omega]
+            )
+            mu_0 = mu_0 + np.dot(X_inter, beta_inter).reshape((-1, 1))
+            mu_1 = mu_1 + np.dot(X_inter, beta_inter * omega).reshape((-1, 1))
+
+    # shift mu_1 so the average effect over all rows equals ate_goal
+    ate = np.mean(mu_1 - mu_0)
+    mu_1 = mu_1 - ate + ate_goal
+
+    # observed outcome: the potential outcome of the assigned arm plus noise
+    y = (
+            w * mu_1
+            + (1 - w) * mu_0
+            + np.random.normal(0, error_sd, n_total).reshape((-1, 1))
+    )
+
+    X_train, y_train, w_train, mu_0_train, mu_1_train = (
+        X[ind[: (n_0 + n_1)], :],
+        y[ind[: (n_0 + n_1)]],
+        w[ind[: (n_0 + n_1)]],
+        mu_0[ind[: (n_0 + n_1)]],
+        mu_1[ind[: (n_0 + n_1)]],
+    )
+    X_test, y_test, w_test, mu_0_t, mu_1_t = (
+        X[ind_test, :],
+        y[ind_test],
+        w[ind_test],
+        mu_0[ind_test],
+        mu_1[ind_test],
+    )
+
+    # potential outcomes are returned with mu_0 and mu_1 as two columns
+    return (
+        X_train,
+        w_train,
+        y_train,
+        np.asarray([mu_0_train, mu_1_train]).squeeze().T,
+        X_test,
+        w_test,
+        y_test,
+        np.asarray([mu_0_t, mu_1_t]).squeeze().T,
+    )
+
+
+def get_acic_orig_filenames(data_path: Path, simu_num: int) -> list:
+    """
+    List the outcome files of one original ACIC2016 simulation setting.
+
+    Parameters
+    ----------
+    data_path: Path
+        Directory that contains the "data_cf_all" folder.
+    simu_num: int
+        Name of the sub-folder of "data_cf_all" to search.
+
+    Returns
+    -------
+    list
+        Sorted paths (as strings) of all files matching "zymu_*.csv".
+    """
+    pattern = data_path / ("data_cf_all/" + str(simu_num) + '/zymu_*.csv')
+    matches = glob.glob(str(pattern))
+    matches.sort()
+    return matches
+
+
+def get_acic_orig_outcomes(data_path: Path,
+                           simu_num: int,
+                           i_exp: int) -> Tuple:
+    """
+    Read the outcomes of one replication of an original ACIC2016 setting.
+
+    Parameters
+    ----------
+    data_path: Path
+        Directory that contains the "data_cf_all" folder.
+    simu_num: int
+        Simulation setting, forwarded to ``get_acic_orig_filenames``.
+    i_exp: int
+        Position of the replication file in the sorted file list.
+
+    Returns
+    -------
+    Tuple
+        ``(y, w, mu_0, mu_1)`` as numpy arrays, where ``w`` is column "z",
+        ``y`` is "y1" for rows with ``w == 1`` and "y0" for rows with
+        ``w == 0``, and ``mu_0`` / ``mu_1`` are columns "mu0" / "mu1".
+    """
+    replication_files = get_acic_orig_filenames(data_path=data_path,
+                                                simu_num=simu_num)
+    frame = pd.read_csv(replication_files[i_exp])
+
+    treatment = frame['z']
+    # observed outcome: pick the potential outcome of the assigned arm
+    observed = treatment * frame['y1'] + (1 - treatment) * frame['y0']
+
+    return (
+        observed.values,
+        treatment.values,
+        frame['mu0'].values,
+        frame['mu1'].values,
+    )
+
+
+def preprocess_acic_orig(fn_csv: Path,
+                         data_path: Path,
+                         preprocessed: bool = False,
+                         keep_categorical: bool = True,
+                         simu_num: int = 1,
+                         i_exp: int = 0,
+                         train_size: int = 4000,
+                         random_split: bool = False
+                         ) -> Tuple:
+    """
+    Combine the covariates with the original ACIC2016 outcomes and split
+    them into a training and a test set.
+
+    Parameters
+    ----------
+    fn_csv: Path
+        Covariate CSV, forwarded to ``get_acic_covariates``.
+    data_path: Path
+        Directory that contains the "data_cf_all" folder with the outcomes.
+    preprocessed: bool
+        Forwarded to ``get_acic_covariates``.
+    keep_categorical: bool
+        Forwarded to ``get_acic_covariates``.
+    simu_num: int
+        Simulation setting, forwarded to ``get_acic_orig_outcomes``.
+    i_exp: int
+        Replication index; also the ``random_state`` of the random split.
+    train_size: int
+        Number of training rows. The remaining rows form the test set.
+    random_split: bool
+        If False, the first ``train_size`` rows are the training set.
+        If True, rows are assigned with ``train_test_split``.
+
+    Returns
+    -------
+    Tuple
+        ``(X_train, w_train, y_train, po_train, X_test, w_test, y_test,
+        po_test)`` where ``po_*`` stacks ``mu_0`` and ``mu_1`` as two columns.
+    """
+    X = get_acic_covariates(
+        fn_csv, keep_categorical=keep_categorical, preprocessed=preprocessed
+    )
+
+    y, w, mu_0, mu_1 = get_acic_orig_outcomes(
+        data_path=data_path, simu_num=simu_num, i_exp=i_exp
+    )
+
+    if not random_split:
+        # deterministic split: leading rows train, trailing rows test
+        X_train, X_test = X[:train_size, :], X[train_size:, :]
+        y_train, y_test = y[:train_size], y[train_size:]
+        w_train, w_test = w[:train_size], w[train_size:]
+        mu_0_train, mu_0_test = mu_0[:train_size], mu_0[train_size:]
+        mu_1_train, mu_1_test = mu_1[:train_size], mu_1[train_size:]
+    else:
+        # Pass train_size straight through: the former
+        # ``test_size=1 - train_size`` is negative for an absolute row count
+        # (e.g. the default 4000) and makes train_test_split raise.
+        # train_test_split accepts either a count or a fraction here and
+        # uses the complement as the test set.
+        (
+            X_train, X_test,
+            y_train, y_test,
+            w_train, w_test,
+            mu_0_train, mu_0_test,
+            mu_1_train, mu_1_test,
+        ) = train_test_split(
+            X, y, w, mu_0, mu_1,
+            train_size=train_size,
+            random_state=i_exp,
+        )
+
+    return (
+        X_train,
+        w_train,
+        y_train,
+        np.asarray([mu_0_train, mu_1_train]).squeeze().T,
+        X_test,
+        w_test,
+        y_test,
+        np.asarray([mu_0_test, mu_1_test]).squeeze().T,
+    )
+
+
+def preprocess(fn_csv: Path,
+               data_path: Path,
+               preprocessed: bool = True,
+               original_acic_outcomes: bool = False,
+               **kwargs: Any,
+               ) -> Tuple:
+    """
+    Dispatch to the simulated or the original ACIC2016 outcome pipeline.
+
+    Parameters
+    ----------
+    fn_csv: Path
+        Covariate CSV.
+    data_path: Path
+        Data directory; only forwarded when ``original_acic_outcomes`` is True.
+    preprocessed: bool
+        Forwarded to the selected pipeline.
+    original_acic_outcomes: bool
+        True selects ``preprocess_acic_orig``, False ``preprocess_simu``.
+    **kwargs: Any
+        Extra keyword arguments forwarded to the selected pipeline.
+
+    Returns
+    -------
+    Tuple
+        Whatever the selected pipeline returns.
+    """
+    if original_acic_outcomes:
+        return preprocess_acic_orig(
+            fn_csv=fn_csv,
+            preprocessed=preprocessed,
+            data_path=data_path,
+            **kwargs,
+        )
+    return preprocess_simu(fn_csv=fn_csv, preprocessed=preprocessed, **kwargs)
+
+
+def load(
+        data_path: Path,
+        preprocessed: bool = True,
+        original_acic_outcomes: bool = False,
+        **kwargs: Any,
+) -> Tuple:
+    """
+    ACIC2016 dataset dataloader.
+        - Fetch the covariate file via ``download_if_needed``.
+        - Load and preprocess the data.
+        - Return train/test split.
+
+    Parameters
+    ----------
+    data_path: Path
+        Directory holding the dataset files. The covariate file is looked up
+        there as "x_trans.csv" (preprocessed) or "data_cf_all/x.csv" (raw,
+        unpacked from "data_cf_all.tar.gz").
+    preprocessed: bool
+        Switch between the raw and preprocessed versions of the dataset.
+    original_acic_outcomes: bool
+        Switch between new simulations (Inductive bias paper) and original acic outcomes
+    **kwargs: Any
+        Extra keyword arguments forwarded to ``preprocess``.
+
+    Returns
+    -------
+    train_x: np.ndarray
+        Features in training data.
+    train_t: np.ndarray
+        Treatments in training data.
+    train_y: np.ndarray
+        Observed outcomes in training data.
+    train_potential_y: np.ndarray
+        Potential outcomes in training data.
+    test_x: np.ndarray
+        Features in testing data.
+    test_t: np.ndarray
+        Treatments in testing data.
+    test_y: np.ndarray
+        Observed outcomes in testing data.
+    test_potential_y: np.ndarray
+        Potential outcomes in testing data.
+    """
+    if not preprocessed:
+        # raw variant: fetch the archive, unpack it, use the CSV inside
+        archive = data_path / "data_cf_all.tar.gz"
+        download_if_needed(
+            archive, file_id=FILE_ID, unarchive=True, unarchive_folder=data_path
+        )
+        covariates_csv = data_path / "data_cf_all/x.csv"
+    else:
+        covariates_csv = data_path / "x_trans.csv"
+        download_if_needed(covariates_csv, file_id=PREPROCESSED_FILE_ID)
+
+    log.debug(f"load dataset {covariates_csv}")
+
+    return preprocess(
+        covariates_csv,
+        data_path=data_path,
+        preprocessed=preprocessed,
+        original_acic_outcomes=original_acic_outcomes,
+        **kwargs,
+    )