[6ac965]: catenets/datasets/dataset_ihdp.py


"""
IHDP (Infant Health and Development Program) dataset
"""
# stdlib
import os
import random
from pathlib import Path
from typing import Any, Tuple

# third party
import numpy as np

import catenets.logger as log

from .network import download_if_needed

np.random.seed(0)
random.seed(0)

TRAIN_DATASET = "ihdp_npci_1-100.train.npz"
TEST_DATASET = "ihdp_npci_1-100.test.npz"

TRAIN_URL = "https://www.fredjo.com/files/ihdp_npci_1-100.train.npz"
TEST_URL = "https://www.fredjo.com/files/ihdp_npci_1-100.test.npz"

# helper functions
def load_data_npz(fname: Path, get_po: bool = True) -> dict:
    """
    Helper function for loading the IHDP data set (adapted from https://github.com/clinicalml/cfrnet)

    Parameters
    ----------
    fname: Path
        Dataset path
    get_po: bool, default True
        Also include the true potential outcomes (mu0, mu1) in the returned dict

    Returns
    -------
    data: dict
        Raw IHDP dict, with X, w, y and ycf keys.
    """
    data_in = np.load(fname)
    data = {"X": data_in["x"], "w": data_in["t"], "y": data_in["yf"]}
    try:
        data["ycf"] = data_in["ycf"]
    except BaseException:
        data["ycf"] = None

    if get_po:
        data["mu0"] = data_in["mu0"]
        data["mu1"] = data_in["mu1"]

    data["HAVE_TRUTH"] = data["ycf"] is not None
    data["dim"] = data["X"].shape[1]
    data["n"] = data["X"].shape[0]

    return data
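
# Example (illustrative sketch, not part of the original module): `load_data_npz`
# can be pointed directly at one of the downloaded archives; `Path("data/ihdp")`
# is a hypothetical download location. As the slicing in `get_one_data_set`
# below implies, the arrays stack all replications along the last axis:
#
#     raw = load_data_npz(Path("data/ihdp") / TRAIN_DATASET, get_po=True)
#     raw["X"].shape     # (n_units, n_features, n_experiments)
#     raw["w"].shape     # (n_units, n_experiments)
#     raw["HAVE_TRUTH"]  # True when the archive ships counterfactual outcomes
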
def prepare_ihdp_data(
    data_train: dict,
    data_test: dict,
    rescale: bool = False,
    setting: str = "C",
    return_pos: bool = False,
) -> Tuple:
    """
    Helper for preprocessing the IHDP dataset.

    Parameters
    ----------
    data_train: dict
        Train dataset
    data_test: dict
        Test dataset
    rescale: bool, default False
        Rescale the outcomes to have a similar scale
    setting: str, default "C"
        Experiment setting ("D" adds mu0 to the treated outcomes and redefines
        mu1 as mu0 + mu1)
    return_pos: bool, default False
        Also return the potential outcomes

    Returns
    -------
    X: np.ndarray
        Training feature set
    y: np.ndarray
        Training outcomes
    w: np.ndarray
        Training treatment assignments
    cate_true_in: np.ndarray
        True CATEs for the training set
    X_t: np.ndarray
        Test feature set
    cate_true_out: np.ndarray
        True CATEs for the test set
    """
    X, y, w, mu0, mu1 = (
        data_train["X"],
        data_train["y"],
        data_train["w"],
        data_train["mu0"],
        data_train["mu1"],
    )
    X_t, _, _, mu0_t, mu1_t = (
        data_test["X"],
        data_test["y"],
        data_test["w"],
        data_test["mu0"],
        data_test["mu1"],
    )

    if setting == "D":
        y[w == 1] = y[w == 1] + mu0[w == 1]
        mu1 = mu0 + mu1
        mu1_t = mu0_t + mu1_t

    if rescale:
        # rescale all outcomes to have a similar scale of CATEs if sd_cate > 1
        cate_in = mu0 - mu1
        sd_cate = np.sqrt(cate_in.var())

        if sd_cate > 1:
            # training data
            error = y - w * mu1 - (1 - w) * mu0
            mu0 = mu0 / sd_cate
            mu1 = mu1 / sd_cate
            y = w * mu1 + (1 - w) * mu0 + error

            # test data
            mu0_t = mu0_t / sd_cate
            mu1_t = mu1_t / sd_cate

    cate_true_in = mu1 - mu0
    cate_true_out = mu1_t - mu0_t

    if return_pos:
        return X, y, w, cate_true_in, X_t, cate_true_out, mu0, mu1, mu0_t, mu1_t

    return X, y, w, cate_true_in, X_t, cate_true_out
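
# Example (sketch): unpacking the full tuple when `return_pos=True`; `data_exp`
# and `data_exp_test` stand for single-experiment dicts as produced by
# `get_one_data_set` below.
#
#     (X, y, w, cate_in, X_t, cate_out,
#      mu0, mu1, mu0_t, mu1_t) = prepare_ihdp_data(
#         data_exp, data_exp_test, rescale=True, setting="C", return_pos=True
#     )
#     # cate_in equals mu1 - mu0 on the (possibly rescaled) potential outcomes
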
def get_one_data_set(D: dict, i_exp: int, get_po: bool = True) -> dict:
    """
    Helper for getting the IHDP data for one experiment. Adapted from https://github.com/clinicalml/cfrnet

    Parameters
    ----------
    D: dict
        Dict containing all experiments (replications)
    i_exp: int
        Experiment number (1-indexed)
    get_po: bool, default True
        Also include the true potential outcomes (mu0, mu1)

    Returns
    -------
    data: dict
        Dict with the data of experiment i_exp
    """
    D_exp = {}
    D_exp["X"] = D["X"][:, :, i_exp - 1]
    D_exp["w"] = D["w"][:, i_exp - 1 : i_exp]
    D_exp["y"] = D["y"][:, i_exp - 1 : i_exp]

    if D["HAVE_TRUTH"]:
        D_exp["ycf"] = D["ycf"][:, i_exp - 1 : i_exp]
    else:
        D_exp["ycf"] = None

    if get_po:
        D_exp["mu0"] = D["mu0"][:, i_exp - 1 : i_exp]
        D_exp["mu1"] = D["mu1"][:, i_exp - 1 : i_exp]

    return D_exp
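
# Example (sketch): slicing one replication out of the raw archives; `i_exp` is
# 1-indexed, so replication 5 maps to index 4 of the last axis. `Path("data/ihdp")`
# is a hypothetical download location.
#
#     data_train, data_test = load_raw(Path("data/ihdp"))
#     rep5 = get_one_data_set(data_train, i_exp=5, get_po=True)
#     rep5["X"].shape  # (n_units, n_features)
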
def load(data_path: Path, exp: int = 1, rescale: bool = False, **kwargs: Any) -> Tuple:
    """
    Get IHDP train/test datasets with treatments and labels.

    Parameters
    ----------
    data_path: Path
        Path to the dataset directory. If the data is missing, it will be downloaded.
    exp: int, default 1
        Experiment (replication) number to load
    rescale: bool, default False
        Rescale the outcomes to have a similar scale

    Returns
    -------
    X: np.ndarray
        The training feature set
    w: np.ndarray
        Training treatment assignments
    y: np.ndarray
        The training labels
    training potential outcomes: np.ndarray
        Potential outcomes for the training set
    X_t: np.ndarray
        The testing feature set
    testing potential outcomes: np.ndarray
        Potential outcomes for the testing set
    """
    data_train, data_test = load_raw(data_path)

    data_exp = get_one_data_set(data_train, i_exp=exp, get_po=True)
    data_exp_test = get_one_data_set(data_test, i_exp=exp, get_po=True)

    (
        X,
        y,
        w,
        cate_true_in,
        X_t,
        cate_true_out,
        mu0,
        mu1,
        mu0_t,
        mu1_t,
    ) = prepare_ihdp_data(
        data_exp,
        data_exp_test,
        rescale=rescale,
        return_pos=True,
    )

    return (
        X,
        w,
        y,
        np.asarray([mu0, mu1]).squeeze().T,
        X_t,
        np.asarray([mu0_t, mu1_t]).squeeze().T,
    )
def load_raw(data_path: Path) -> Tuple:
    """
    Get the IHDP raw train/test sets.

    Parameters
    ----------
    data_path: Path
        Path to the dataset directory. If the data is missing, it will be downloaded.

    Returns
    -------
    data_train: dict
        Training data
    data_test: dict
        Testing data
    """
    try:
        os.mkdir(data_path)
    except BaseException:
        pass

    train_csv = data_path / TRAIN_DATASET
    test_csv = data_path / TEST_DATASET

    log.debug(f"load raw dataset {train_csv}")

    download_if_needed(train_csv, http_url=TRAIN_URL)
    download_if_needed(test_csv, http_url=TEST_URL)

    data_train = load_data_npz(train_csv, get_po=True)
    data_test = load_data_npz(test_csv, get_po=True)

    return data_train, data_test
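

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original module). The relative
    # import above means this only runs as a module, e.g.
    #     python -m catenets.datasets.dataset_ihdp
    # "ihdp_data" is a hypothetical local download directory.
    X, w, y, po_train, X_t, po_test = load(Path("ihdp_data"), exp=1, rescale=False)
    print("X:", X.shape, "w:", w.shape, "y:", y.shape)
    print("train potential outcomes:", po_train.shape)
    print("X_t:", X_t.shape, "test potential outcomes:", po_test.shape)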