# catenets/datasets/dataset_acic2016.py
1
"""
2
ACIC2016 dataset
3
"""
4
# stdlib
5
import random
6
from pathlib import Path
7
from typing import Any, Tuple
8
import glob
9
10
# third party
11
import numpy as np
12
import pandas as pd
13
from sklearn.preprocessing import OneHotEncoder, StandardScaler
14
from sklearn.model_selection import train_test_split
15
16
import catenets.logger as log
17
18
from .network import download_if_needed
19
20
# Seed both global RNGs at import time.
random.seed(0)
np.random.seed(0)

# Remote file identifiers passed to `download_if_needed` as `file_id`:
# the raw archive and the preprocessed covariate CSV respectively.
FILE_ID = "0B7pG5PPgj6A3N09ibmFwNWE1djA"
PREPROCESSED_FILE_ID = "1iOfEAk402o3jYBs2Prfiz6oaailwWcR5"

# Column positions of the covariate matrix whose squares are used as
# non-linear terms by `preprocess_simu`.
# NOTE(review): index 24 is listed twice. It is kept as-is because
# N_NUM_COLS (and therefore the number of random coefficients drawn) depends
# on it -- confirm whether this is intentional before changing it.
NUMERIC_COLS = [
    0, 3, 4, 16, 17, 18, 20, 21, 22, 24, 24,
    25, 30, 31, 32, 33, 39, 40, 41, 53, 54,
]
N_NUM_COLS = len(NUMERIC_COLS)
50
51
52
def get_acic_covariates(
        fn_csv: Path, keep_categorical: bool = False, preprocessed: bool = True
) -> np.ndarray:
    """
    Read the ACIC2016 covariate table from a CSV file and return it as an array.

    Parameters
    ----------
    fn_csv: Path
        CSV file containing the covariates.
    keep_categorical: bool
        If False, the columns ``x_2``, ``x_21`` and ``x_24`` are dropped.
        If True, every non-numeric column is replaced by one-hot indicator
        columns (first category dropped) appended after the existing columns.
    preprocessed: bool
        If True, the values are returned unchanged; otherwise every column is
        standardised with ``StandardScaler``.

    Returns
    -------
    np.ndarray
        Covariate matrix with one row per CSV record.
    """
    X = pd.read_csv(fn_csv)
    if not keep_categorical:
        X = X.drop(columns=["x_2", "x_21", "x_24"])
    else:
        # encode categorical features
        encoded_cols = []
        # Snapshot the column list: indicator columns are added to X inside the loop.
        for col in list(X.columns):
            # Integer and float columns are kept as they are; the dtype check
            # also works on an empty frame, unlike inspecting the first row.
            if X[col].dtype.kind in "iuf":
                continue

            enc = OneHotEncoder(drop="first")
            # Encode once per column instead of once per output indicator.
            indicators = enc.fit_transform(
                np.array(X[[col]]).reshape((-1, 1))
            ).toarray()

            # `get_feature_names` was removed from recent scikit-learn
            # releases; prefer its replacement and fall back for old versions.
            if hasattr(enc, "get_feature_names_out"):
                names = list(enc.get_feature_names_out())
            else:
                names = list(enc.get_feature_names())

            for k, name in enumerate(names):
                X[col + name] = indicators[:, k]

            encoded_cols.append(col)

        X = X.drop(columns=encoded_cols)

    if preprocessed:
        return X.values

    return StandardScaler().fit_transform(X)
83
84
85
def preprocess_simu(
        fn_csv: Path,
        n_0: int = 2000,
        n_1: int = 200,
        n_test: int = 500,
        error_sd: float = 1,
        sp_lin: float = 0.6,
        sp_nonlin: float = 0.3,
        prop_gamma: float = 0,
        prop_omega: float = 0,
        ate_goal: float = 0,
        inter: bool = True,
        i_exp: int = 0,
        keep_categorical: bool = False,
        preprocessed: bool = True,
) -> Tuple:
    """
    Simulate treatment assignment and outcomes on top of the ACIC2016 covariates.

    The global NumPy RNG is re-seeded with ``i_exp``, so the function is
    deterministic for fixed arguments (and changes the global RNG state).

    Parameters
    ----------
    fn_csv: Path
        CSV file with the covariates (read via ``get_acic_covariates``).
    n_0: int
        Number of control units in the training set.
    n_1: int
        Number of treated units in the training set.
    n_test: int
        Number of test units, taken from the end of the shuffled row order.
        The training rows come from the start of the same order, so the two
        sets overlap if ``n_0 + n_1 + n_test`` exceeds the number of rows.
    error_sd: float
        Standard deviation of the Gaussian noise added to the outcomes.
    sp_lin: float
        Probability that a linear baseline coefficient equals 1 (else 0).
    sp_nonlin: float
        Probability that a squared / interaction coefficient equals 0.1
        (else 0). If not positive, no non-linear terms are added at all.
    prop_gamma: float
        Probability that a treatment-effect coefficient equals 1 (else 0).
    prop_omega: float
        Probability that a baseline term is switched off in ``mu_1``.
    ate_goal: float
        ``mu_1`` is shifted so that the mean of ``mu_1 - mu_0`` over all rows
        equals this value.
    inter: bool
        Whether to add pairwise interaction terms (only if ``sp_nonlin > 0``).
    i_exp: int
        Seed for the global NumPy RNG.
    keep_categorical: bool
        Forwarded to ``get_acic_covariates``.
    preprocessed: bool
        Forwarded to ``get_acic_covariates``.

    Returns
    -------
    Tuple
        ``(X_train, w_train, y_train, mu_train, X_test, w_test, y_test,
        mu_test)`` where ``w`` and ``y`` have shape ``(n, 1)`` and ``mu`` has
        shape ``(n, 2)`` with columns ``mu_0`` and ``mu_1``.
    """
    X = get_acic_covariates(
        fn_csv, keep_categorical=keep_categorical, preprocessed=preprocessed
    )
    np.random.seed(i_exp)

    # shuffle indices
    n_total, n_cov = X.shape
    ind = np.arange(n_total)
    np.random.shuffle(ind)
    n_train = n_0 + n_1
    ind_train = ind[:n_train]
    ind_1 = ind[n_0:n_train]
    # `ind[-0:]` is the whole array, so an empty test set needs its own branch.
    ind_test = ind[-n_test:] if n_test > 0 else ind[:0]

    # create treatment indicator (treatment assignment does not matter in test set)
    w = np.zeros(n_total).reshape((-1, 1))
    w[ind_1] = 1

    # create dgp -- the order of the random draws below must not change,
    # otherwise results for a given seed are no longer reproducible.
    coeffs_ = [0, 1]
    # sample baseline coefficients
    beta_0 = np.random.choice(coeffs_, size=n_cov, replace=True, p=[1 - sp_lin, sp_lin])
    intercept = np.random.choice(np.arange(-1, 1.25, 0.25))

    # sample treatment effect coefficients
    gamma = np.random.choice(
        coeffs_, size=n_cov, replace=True, p=[1 - prop_gamma, prop_gamma]
    )
    omega = np.random.choice(
        [0, 1], replace=True, size=n_cov, p=[prop_omega, 1 - prop_omega]
    )

    # simulate mu_0 and mu_1
    mu_0 = (intercept + np.dot(X, beta_0)).reshape((-1, 1))
    mu_1 = (intercept + np.dot(X, gamma + beta_0 * omega)).reshape((-1, 1))
    if sp_nonlin > 0:
        coefs_sq = [0, 0.1]
        beta_sq = np.random.choice(
            coefs_sq, size=N_NUM_COLS, replace=True, p=[1 - sp_nonlin, sp_nonlin]
        )
        omega = np.random.choice(
            [0, 1], replace=True, size=N_NUM_COLS, p=[prop_omega, 1 - prop_omega]
        )
        X_sq = X[:, NUMERIC_COLS] ** 2
        mu_0 = mu_0 + np.dot(X_sq, beta_sq).reshape((-1, 1))
        mu_1 = mu_1 + np.dot(X_sq, beta_sq * omega).reshape((-1, 1))

        if inter:
            # randomly add some interactions: products of consecutive pairs
            # in a shuffled column order
            ind_c = np.arange(n_cov)
            np.random.shuffle(ind_c)
            inter_list = [
                X[:, ind_c[i]] * X[:, ind_c[i + 1]]
                for i in range(0, n_cov - 2, 2)
            ]

            X_inter = np.array(inter_list).T
            n_inter = X_inter.shape[1]
            beta_inter = np.random.choice(
                coefs_sq, size=n_inter, replace=True, p=[1 - sp_nonlin, sp_nonlin]
            )
            omega = np.random.choice(
                [0, 1], replace=True, size=n_inter, p=[prop_omega, 1 - prop_omega]
            )
            mu_0 = mu_0 + np.dot(X_inter, beta_inter).reshape((-1, 1))
            mu_1 = mu_1 + np.dot(X_inter, beta_inter * omega).reshape((-1, 1))

    # re-centre mu_1 so that the average treatment effect equals ate_goal
    ate = np.mean(mu_1 - mu_0)
    mu_1 = mu_1 - ate + ate_goal

    noise = np.random.normal(0, error_sd, n_total).reshape((-1, 1))
    y = w * mu_1 + (1 - w) * mu_0 + noise

    # Concatenating along axis 1 always yields shape (n, 2); the previous
    # `asarray(...).squeeze().T` collapsed to shape (2,) for a single row.
    mu_train = np.concatenate([mu_0[ind_train], mu_1[ind_train]], axis=1)
    mu_test = np.concatenate([mu_0[ind_test], mu_1[ind_test]], axis=1)

    return (
        X[ind_train, :],
        w[ind_train],
        y[ind_train],
        mu_train,
        X[ind_test, :],
        w[ind_test],
        y[ind_test],
        mu_test,
    )
199
200
201
def get_acic_orig_filenames(data_path: Path, simu_num: int) -> list:
    """
    List the original ACIC outcome files of one simulation setting.

    Parameters
    ----------
    data_path: Path
        Directory that contains the ``data_cf_all`` folder.
    simu_num: int
        Name of the sub-folder of ``data_cf_all`` to search.

    Returns
    -------
    list
        Lexicographically sorted paths (as strings) of all ``zymu_*.csv``
        files in ``data_path / "data_cf_all" / str(simu_num)``; empty if the
        folder does not exist or holds no such file.
    """
    simu_dir = data_path / "data_cf_all" / str(simu_num)
    # Escape the directory part so that glob metacharacters in the user's
    # path (e.g. "[" or "*") are matched literally; only the file name is a
    # pattern.
    pattern = Path(glob.escape(str(simu_dir))) / "zymu_*.csv"
    return sorted(glob.glob(str(pattern)))
204
205
206
def get_acic_orig_outcomes(data_path: Path,
                           simu_num: int,
                           i_exp: int) -> Tuple:
    """
    Read one original ACIC outcome file and build the observed outcome.

    The file is the ``i_exp``-th entry of the list returned by
    ``get_acic_orig_filenames`` for ``simu_num``. It must provide the columns
    ``z``, ``y0``, ``y1``, ``mu0`` and ``mu1``.

    Returns
    -------
    Tuple
        ``(y, w, mu_0, mu_1)`` as arrays, where ``w`` is the ``z`` column and
        ``y`` equals ``y1`` where ``w`` is 1 and ``y0`` where ``w`` is 0.
    """
    outcome_files = get_acic_orig_filenames(data_path=data_path,
                                            simu_num=simu_num)
    frame = pd.read_csv(outcome_files[i_exp])

    treatment = frame['z']
    # factual outcome: pick the potential outcome matching the assignment
    observed = treatment * frame['y1'] + (1 - treatment) * frame['y0']

    return (
        observed.values,
        treatment.values,
        frame['mu0'].values,
        frame['mu1'].values,
    )
217
218
219
def preprocess_acic_orig(fn_csv: Path,
                         data_path: Path,
                         preprocessed: bool = False,
                         keep_categorical: bool = True,
                         simu_num: int = 1,
                         i_exp: int = 0,
                         train_size: int = 4000,
                         random_split: bool = False
                         ) -> Tuple:
    """
    Combine the ACIC2016 covariates with one original outcome file and split
    the result into a training and a test set.

    Parameters
    ----------
    fn_csv: Path
        CSV file with the covariates.
    data_path: Path
        Directory that contains the ``data_cf_all`` folder with the outcomes.
    preprocessed: bool
        Forwarded to ``get_acic_covariates``.
    keep_categorical: bool
        Forwarded to ``get_acic_covariates``.
    simu_num: int
        Simulation setting whose outcome files are used.
    i_exp: int
        Index of the outcome file within the setting; also the random state of
        the random split.
    train_size: int
        Number of training rows. With ``random_split=True`` it is handed to
        ``train_test_split`` as ``train_size``, so a float in (0, 1) is
        interpreted there as a fraction.
    random_split: bool
        If False, the first ``train_size`` rows form the training set and the
        remaining rows the test set. If True, rows are split at random.

    Returns
    -------
    Tuple
        ``(X_train, w_train, y_train, mu_train, X_test, w_test, y_test,
        mu_test)``; ``mu_*`` has shape ``(n, 2)`` with columns ``mu_0`` and
        ``mu_1``.
    """
    X = get_acic_covariates(
        fn_csv, keep_categorical=keep_categorical, preprocessed=preprocessed
    )

    y, w, mu_0, mu_1 = get_acic_orig_outcomes(
        data_path=data_path, simu_num=simu_num, i_exp=i_exp
    )

    if not random_split:
        head, tail = slice(None, train_size), slice(train_size, None)
        X_train, X_test = X[head, :], X[tail, :]
        y_train, y_test = y[head], y[tail]
        w_train, w_test = w[head], w[tail]
        mu_0_train, mu_0_test = mu_0[head], mu_0[tail]
        mu_1_train, mu_1_test = mu_1[head], mu_1[tail]
    else:
        # Pass the requested training size directly. The previous
        # `test_size=1 - train_size` was negative for an integer row count
        # (the default), which train_test_split rejects.
        (
            X_train, X_test,
            y_train, y_test,
            w_train, w_test,
            mu_0_train, mu_0_test,
            mu_1_train, mu_1_test,
        ) = train_test_split(
            X, y, w, mu_0, mu_1, train_size=train_size, random_state=i_exp
        )

    # column_stack always yields shape (n, 2); `asarray(...).squeeze().T`
    # collapsed to shape (2,) when a split contained a single row.
    return (
        X_train,
        w_train,
        y_train,
        np.column_stack([mu_0_train, mu_1_train]),
        X_test,
        w_test,
        y_test,
        np.column_stack([mu_0_test, mu_1_test]),
    )
257
258
259
def preprocess(fn_csv: Path,
               data_path: Path,
               preprocessed: bool = True,
               original_acic_outcomes: bool = False,
               **kwargs: Any,
               ) -> Tuple:
    """
    Dispatch to the requested outcome-generation routine.

    With ``original_acic_outcomes=True`` the original ACIC outcome files below
    ``data_path`` are used (``preprocess_acic_orig``); otherwise outcomes are
    simulated (``preprocess_simu``) and ``data_path`` is not used. Extra
    keyword arguments are forwarded to the selected routine.
    """
    if original_acic_outcomes:
        return preprocess_acic_orig(
            fn_csv=fn_csv,
            preprocessed=preprocessed,
            data_path=data_path,
            **kwargs,
        )

    return preprocess_simu(fn_csv=fn_csv, preprocessed=preprocessed, **kwargs)
270
271
272
def load(
        data_path: Path,
        preprocessed: bool = True,
        original_acic_outcomes: bool = False,
        **kwargs: Any,
) -> Tuple:
    """
    ACIC2016 dataset dataloader.
        - Download the dataset if needed.
        - Load the dataset.
        - Preprocess the data.
        - Return train/test split.

    Parameters
    ----------
    data_path: Path
        Directory holding the dataset files. Missing files are downloaded
        into it.
    preprocessed: bool
        Switch between the raw and preprocessed versions of the dataset.
    original_acic_outcomes: bool
        Switch between new simulations (Inductive bias paper) and original acic outcomes
    **kwargs: Any
        Forwarded to ``preprocess`` and from there to the selected
        preprocessing routine.

    Returns
    -------
    train_x: array
        Features in training data.
    train_t: array
        Treatments in training data.
    train_y: array
        Observed outcomes in training data.
    train_potential_y: array
        Potential outcomes in training data.
    test_x: array
        Features in testing data.
    test_t: array
        Treatments in testing data.
    test_y: array
        Observed outcomes in testing data.
    test_potential_y: array
        Potential outcomes in testing data.
    """
    # The archive provides the raw covariates and the original outcome files
    # under `data_cf_all/`, so it is also required when the preprocessed
    # covariates are combined with the original outcomes.
    if not preprocessed or original_acic_outcomes:
        arch = data_path / "data_cf_all.tar.gz"

        download_if_needed(
            arch, file_id=FILE_ID, unarchive=True, unarchive_folder=data_path
        )

    if preprocessed:
        csv = data_path / "x_trans.csv"

        download_if_needed(csv, file_id=PREPROCESSED_FILE_ID)
    else:
        csv = data_path / "data_cf_all/x.csv"
    log.debug(f"load dataset {csv}")

    return preprocess(csv, data_path=data_path, preprocessed=preprocessed,
                      original_acic_outcomes=original_acic_outcomes,
                      **kwargs)