catenets/datasets/dataset_acic2016.py

"""
ACIC2016 dataset
"""
# stdlib
import glob
import random
from pathlib import Path
from typing import Any, Tuple

# third party
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import catenets.logger as log

from .network import download_if_needed

np.random.seed(0)
random.seed(0)

# file ids used by download_if_needed: the raw archive and the preprocessed covariates
FILE_ID = "0B7pG5PPgj6A3N09ibmFwNWE1djA"
PREPROCESSED_FILE_ID = "1iOfEAk402o3jYBs2Prfiz6oaailwWcR5"

# indices of the numeric covariate columns, used for the nonlinear terms of the DGP
NUMERIC_COLS = [
    0, 3, 4, 16, 17, 18, 20, 21, 22, 24, 24, 25, 30, 31, 32, 33, 39, 40, 41, 53, 54,
]
N_NUM_COLS = len(NUMERIC_COLS)


def get_acic_covariates(
    fn_csv: Path, keep_categorical: bool = False, preprocessed: bool = True
) -> np.ndarray:
    """Load the ACIC2016 covariates from CSV and return them as a numpy array."""
    X = pd.read_csv(fn_csv)
    if not keep_categorical:
        X = X.drop(columns=["x_2", "x_21", "x_24"])
    else:
        # one-hot encode the categorical (non-numeric) features
        feature_list = []
        for cols_ in X.columns:
            if type(X.loc[X.index[0], cols_]) not in [np.int64, np.float64]:
                enc = OneHotEncoder(drop="first")
                col_data = np.array(X[[cols_]]).reshape((-1, 1))
                enc.fit(col_data)
                encoded = enc.transform(col_data).toarray()
                for k, name in enumerate(enc.get_feature_names()):
                    X[cols_ + name] = encoded[:, k]

                feature_list.append(cols_)

        X.drop(feature_list, axis=1, inplace=True)

    if preprocessed:
        # the preprocessed covariates are already scaled
        X_t = X.values
    else:
        scaler = StandardScaler()
        X_t = scaler.fit_transform(X)
    return X_t
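
# Illustrative usage (a sketch; the CSV path below is hypothetical):
#     X = get_acic_covariates(Path("data/x_trans.csv"), preprocessed=True)
#     X.shape  # -> (n_samples, n_covariates)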


def preprocess_simu(
    fn_csv: Path,
    n_0: int = 2000,
    n_1: int = 200,
    n_test: int = 500,
    error_sd: float = 1,
    sp_lin: float = 0.6,
    sp_nonlin: float = 0.3,
    prop_gamma: float = 0,
    prop_omega: float = 0,
    ate_goal: float = 0,
    inter: bool = True,
    i_exp: int = 0,
    keep_categorical: bool = False,
    preprocessed: bool = True,
) -> Tuple:
    """Simulate outcomes on the ACIC2016 covariates and return a train/test split."""
    X = get_acic_covariates(
        fn_csv, keep_categorical=keep_categorical, preprocessed=preprocessed
    )
    np.random.seed(i_exp)

    # shuffle indices
    n_total, n_cov = X.shape
    ind = np.arange(n_total)
    np.random.shuffle(ind)
    ind_test = ind[-n_test:]
    ind_1 = ind[n_0 : (n_0 + n_1)]

    # create treatment indicator (treatment assignment does not matter in test set)
    w = np.zeros(n_total).reshape((-1, 1))
    w[ind_1] = 1

    # create the DGP
    coeffs_ = [0, 1]
    # shared coefficient choices for the squared and interaction terms below
    coefs_sq = [0, 0.1]
    # sample baseline coefficients
    beta_0 = np.random.choice(coeffs_, size=n_cov, replace=True, p=[1 - sp_lin, sp_lin])
    intercept = np.random.choice(np.arange(-1, 1.25, 0.25))

    # sample treatment effect coefficients
    gamma = np.random.choice(
        coeffs_, size=n_cov, replace=True, p=[1 - prop_gamma, prop_gamma]
    )
    omega = np.random.choice(
        [0, 1], replace=True, size=n_cov, p=[prop_omega, 1 - prop_omega]
    )

    # simulate mu_0 and mu_1
    mu_0 = (intercept + np.dot(X, beta_0)).reshape((-1, 1))
    mu_1 = (intercept + np.dot(X, gamma + beta_0 * omega)).reshape((-1, 1))
    if sp_nonlin > 0:
        # add squared terms for the numeric covariates
        beta_sq = np.random.choice(
            coefs_sq, size=N_NUM_COLS, replace=True, p=[1 - sp_nonlin, sp_nonlin]
        )
        omega = np.random.choice(
            [0, 1], replace=True, size=N_NUM_COLS, p=[prop_omega, 1 - prop_omega]
        )
        X_sq = X[:, NUMERIC_COLS] ** 2
        mu_0 = mu_0 + np.dot(X_sq, beta_sq).reshape((-1, 1))
        mu_1 = mu_1 + np.dot(X_sq, beta_sq * omega).reshape((-1, 1))

    if inter:
        # randomly add some interactions
        ind_c = np.arange(n_cov)
        np.random.shuffle(ind_c)
        inter_list = list()
        for i in range(0, n_cov - 2, 2):
            inter_list.append(X[:, ind_c[i]] * X[:, ind_c[i + 1]])

        X_inter = np.array(inter_list).T
        n_inter = X_inter.shape[1]
        beta_inter = np.random.choice(
            coefs_sq, size=n_inter, replace=True, p=[1 - sp_nonlin, sp_nonlin]
        )
        omega = np.random.choice(
            [0, 1], replace=True, size=n_inter, p=[prop_omega, 1 - prop_omega]
        )
        mu_0 = mu_0 + np.dot(X_inter, beta_inter).reshape((-1, 1))
        mu_1 = mu_1 + np.dot(X_inter, beta_inter * omega).reshape((-1, 1))

    # recenter the treatment effect so the ATE matches ate_goal
    ate = np.mean(mu_1 - mu_0)
    mu_1 = mu_1 - ate + ate_goal

    y = (
        w * mu_1
        + (1 - w) * mu_0
        + np.random.normal(0, error_sd, n_total).reshape((-1, 1))
    )

    X_train, y_train, w_train, mu_0_train, mu_1_train = (
        X[ind[: (n_0 + n_1)], :],
        y[ind[: (n_0 + n_1)]],
        w[ind[: (n_0 + n_1)]],
        mu_0[ind[: (n_0 + n_1)]],
        mu_1[ind[: (n_0 + n_1)]],
    )
    X_test, y_test, w_test, mu_0_t, mu_1_t = (
        X[ind_test, :],
        y[ind_test],
        w[ind_test],
        mu_0[ind_test],
        mu_1[ind_test],
    )

    return (
        X_train,
        w_train,
        y_train,
        np.asarray([mu_0_train, mu_1_train]).squeeze().T,
        X_test,
        w_test,
        y_test,
        np.asarray([mu_0_t, mu_1_t]).squeeze().T,
    )
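
# Illustrative usage (a sketch; the CSV path is hypothetical). i_exp seeds the DGP,
# so different values give independent replications of the simulation:
#     X_tr, w_tr, y_tr, pot_tr, X_te, w_te, y_te, pot_te = preprocess_simu(
#         Path("data/x_trans.csv"), i_exp=3
#     )
#     true_cate_test = pot_te[:, 1] - pot_te[:, 0]  # mu_1 - mu_0 on the test set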


def get_acic_orig_filenames(data_path: Path, simu_num: int) -> list:
    return sorted(
        glob.glob(str(data_path / "data_cf_all" / str(simu_num) / "zymu_*.csv"))
    )


def get_acic_orig_outcomes(data_path: Path, simu_num: int, i_exp: int) -> Tuple:
    file_list = get_acic_orig_filenames(data_path=data_path, simu_num=simu_num)

    out = pd.read_csv(file_list[i_exp])
    w = out["z"]
    # the observed outcome is the factual potential outcome under the assigned treatment
    y = w * out["y1"] + (1 - w) * out["y0"]
    mu_0, mu_1 = out["mu0"], out["mu1"]
    return y.values, w.values, mu_0.values, mu_1.values
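
# Illustrative usage (a sketch; assumes the raw archive was unpacked under "data/"):
#     y, w, mu_0, mu_1 = get_acic_orig_outcomes(Path("data"), simu_num=1, i_exp=0)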


def preprocess_acic_orig(
    fn_csv: Path,
    data_path: Path,
    preprocessed: bool = False,
    keep_categorical: bool = True,
    simu_num: int = 1,
    i_exp: int = 0,
    train_size: int = 4000,
    random_split: bool = False,
) -> Tuple:
    """Combine the ACIC2016 covariates with the original simulated outcomes."""
    X = get_acic_covariates(
        fn_csv, keep_categorical=keep_categorical, preprocessed=preprocessed
    )

    y, w, mu_0, mu_1 = get_acic_orig_outcomes(
        data_path=data_path, simu_num=simu_num, i_exp=i_exp
    )

    if not random_split:
        # deterministic split: first train_size rows train, the rest test
        X_train, y_train, w_train = X[:train_size, :], y[:train_size], w[:train_size]
        mu_0_train, mu_1_train = mu_0[:train_size], mu_1[:train_size]
        X_test, y_test, w_test = X[train_size:, :], y[train_size:], w[train_size:]
        mu_0_test, mu_1_test = mu_0[train_size:], mu_1[train_size:]
    else:
        # train_size is an absolute number of rows, so pass it directly to the split
        (
            X_train,
            X_test,
            y_train,
            y_test,
            w_train,
            w_test,
            mu_0_train,
            mu_0_test,
            mu_1_train,
            mu_1_test,
        ) = train_test_split(
            X, y, w, mu_0, mu_1, train_size=train_size, random_state=i_exp
        )

    return (
        X_train,
        w_train,
        y_train,
        np.asarray([mu_0_train, mu_1_train]).squeeze().T,
        X_test,
        w_test,
        y_test,
        np.asarray([mu_0_test, mu_1_test]).squeeze().T,
    )
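
# Illustrative usage (a sketch; paths are hypothetical): covariates come from fn_csv,
# outcomes from the unpacked archive under data_path.
#     out = preprocess_acic_orig(
#         Path("data/data_cf_all/x.csv"), Path("data"), simu_num=1, i_exp=0
#     )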


def preprocess(
    fn_csv: Path,
    data_path: Path,
    preprocessed: bool = True,
    original_acic_outcomes: bool = False,
    **kwargs: Any,
) -> Tuple:
    if not original_acic_outcomes:
        return preprocess_simu(fn_csv=fn_csv, preprocessed=preprocessed, **kwargs)
    else:
        return preprocess_acic_orig(
            fn_csv=fn_csv, preprocessed=preprocessed, data_path=data_path, **kwargs
        )


def load(
    data_path: Path,
    preprocessed: bool = True,
    original_acic_outcomes: bool = False,
    **kwargs: Any,
) -> Tuple:
    """
    ACIC2016 dataset dataloader.
        - Download the dataset if needed.
        - Load the dataset.
        - Preprocess the data.
        - Return train/test split.

    Parameters
    ----------
    data_path: Path
        Path to the CSV. If it is missing, it will be downloaded.
    preprocessed: bool
        Switch between the raw and preprocessed versions of the dataset.
    original_acic_outcomes: bool
        Switch between the new simulations (Inductive bias paper) and the original ACIC outcomes.

    Returns
    -------
    train_x: array or pd.DataFrame
        Features in training data.
    train_t: array or pd.DataFrame
        Treatments in training data.
    train_y: array or pd.DataFrame
        Observed outcomes in training data.
    train_potential_y: array or pd.DataFrame
        Potential outcomes in training data.
    test_x: array or pd.DataFrame
        Features in testing data.
    test_t: array or pd.DataFrame
        Treatments in testing data.
    test_y: array or pd.DataFrame
        Observed outcomes in testing data.
    test_potential_y: array or pd.DataFrame
        Potential outcomes in testing data.
    """
    if preprocessed:
        csv = data_path / "x_trans.csv"

        download_if_needed(csv, file_id=PREPROCESSED_FILE_ID)
    else:
        arch = data_path / "data_cf_all.tar.gz"

        download_if_needed(
            arch, file_id=FILE_ID, unarchive=True, unarchive_folder=data_path
        )

        csv = data_path / "data_cf_all/x.csv"

    log.debug(f"load dataset {csv}")

    return preprocess(
        csv,
        data_path=data_path,
        preprocessed=preprocessed,
        original_acic_outcomes=original_acic_outcomes,
        **kwargs,
    )
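

# Minimal end-to-end sketch, assuming "data" is a local cache directory (hypothetical)
# and that the files can be downloaded on first use; extra kwargs such as i_exp are
# forwarded to preprocess_simu.
if __name__ == "__main__":
    (
        X_train,
        w_train,
        y_train,
        pot_y_train,
        X_test,
        w_test,
        y_test,
        pot_y_test,
    ) = load(Path("data"), preprocessed=True, i_exp=0)
    print(f"train X: {X_train.shape}, test X: {X_test.shape}")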