# catenets/models/diffpo/diffpo_learner.py
from typing import Any, Callable, List

import datetime
import json
import os

import numpy as np
import pandas as pd
import torch
import tqdm
from torch import nn

# Hydra
from omegaconf import DictConfig

import catenets.logger as log
from catenets.models.constants import (
    DEFAULT_BATCH_SIZE,
    DEFAULT_DIM_P_OUT,
    DEFAULT_DIM_P_R,
    DEFAULT_DIM_S_OUT,
    DEFAULT_DIM_S_R,
    DEFAULT_LAYERS_OUT,
    DEFAULT_LAYERS_R,
    DEFAULT_N_ITER,
    DEFAULT_N_ITER_MIN,
    DEFAULT_N_ITER_PRINT,
    DEFAULT_PATIENCE,
    DEFAULT_PENALTY_L2,
    DEFAULT_PENALTY_ORTHOGONAL,
    DEFAULT_SEED,
    DEFAULT_NJOBS,
    DEFAULT_STEP_SIZE,
    DEFAULT_VAL_SPLIT,
    LARGE_VAL,
)
from catenets.models.torch.base import DEVICE, BaseCATEEstimator
from catenets.models.torch.utils.model_utils import make_val_split

from .src.main_model_table import TabCSDI
from .src.utils_table import train
from .dataset_acic import get_dataloader
from .PropensityNet import load_data

torch.manual_seed(0)


class AverageMeter(object):
    """Computes and stores the average and current value."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
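
# Illustrative usage of AverageMeter (not from the original file):
#   meter = AverageMeter()
#   meter.update(0.5, n=32)   # batch mean 0.5 over 32 samples
#   meter.update(0.7, n=32)
#   meter.avg                 # -> 0.6, the sample-weighted running mean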


class DiffPOLearner(BaseCATEEstimator):
    """
    A treatment effect estimator wrapping the DiffPO diffusion model (TabCSDI).
    """

    def __init__(
        self,
        cfg: DictConfig,
        num_features: int,
        binary_y: bool,
    ) -> None:
        self.config = cfg.DiffPOLearner
        self.diffpo_path = cfg.diffpo_path
        # Make sure the conditioning dimension matches the dataset (features + treatment).
        self.config.diffusion.cond_dim = num_features + 1
        self.est = None
        self.propnet = None
        self.device = DEVICE
        self.cate_cis = None  # confidence intervals, dim: 2 x n x (num_T - 1) x dim_Y
        self.pred_outcomes = None
        # Create the DiffPO working directory if it does not exist.
        if not os.path.exists(self.diffpo_path):
            os.makedirs(self.diffpo_path)
        # Store data for the DiffPO pipeline.
        self.data_dir = self.diffpo_path + "/data/"
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

    def reshape_data(self, X: np.ndarray, w: np.ndarray, outcomes: np.ndarray) -> tuple:
        # Row layout expected by the DiffPO pipeline:
        # [w, y0, y1, y0, y1, x_1, ..., x_d] -- the potential outcomes appear twice.
        data = np.concatenate(
            [w.reshape(-1, 1), outcomes[:, 0], outcomes[:, 1], outcomes[:, 0], outcomes[:, 1], X],
            axis=1,
        )
        data_df = pd.DataFrame(data)
        # Masking array of the same shape as the data, initialized with 1s: the
        # first outcome pair is masked by treatment status, and the second copy
        # is always marked unobserved.
        mask = np.ones(data_df.shape)
        mask[:, 1] = w
        mask[:, 2] = 1 - w
        mask[:, 3] = 0
        mask[:, 4] = 0
        mask_df = pd.DataFrame(mask)
        return data_df, mask_df
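
    # A minimal sketch of what reshape_data produces, assuming `outcomes` has
    # shape (n, 2, 1) so that outcomes[:, t] is 2-D (an inference from the
    # concatenation above, not documented in this file):
    #
    #   X = np.array([[0.1, 0.2]]); w = np.array([1]); outcomes = np.array([[[2.0], [3.0]]])
    #   data row: [1.0, 2.0, 3.0, 2.0, 3.0, 0.1, 0.2]
    #   mask row: [1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0]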

    def train(self, X: np.ndarray, y: np.ndarray, w: np.ndarray, outcomes: np.ndarray) -> None:
        """
        Prepare the data and train the DiffPO learner.
        """
        log.info("Training data shapes: X: {}, Y: {}, T: {}".format(X.shape, y.shape, w.shape))
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)
        data, mask = self.reshape_data(X, w, outcomes)
        # Create destination folders if they do not exist.
        if not os.path.exists(self.data_dir + "acic2018_norm_data/"):
            os.makedirs(self.data_dir + "acic2018_norm_data/")
        if not os.path.exists(self.data_dir + "acic2018_mask/"):
            os.makedirs(self.data_dir + "acic2018_mask/")
        # Save intermediate data.
        data.to_csv(self.data_dir + "acic2018_norm_data/data_pp.csv", index=False)
        mask.to_csv(self.data_dir + "acic2018_mask/data_pp.csv", index=False)
        # Remove stale preprocessed files so the dataloader rebuilds them.
        if os.path.exists(self.data_dir + "missing_ratio-0.2_seed-1_current_id-data_max-min_norm.pk"):
            os.remove(self.data_dir + "missing_ratio-0.2_seed-1_current_id-data_max-min_norm.pk")
        if os.path.exists(self.data_dir + "missing_ratio-0.2_seed-1.pk"):
            os.remove(self.data_dir + "missing_ratio-0.2_seed-1.pk")
        # Create the output folder.
        current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        # Pipeline settings.
        nfold = 1
        current_id = "data_pp"
        device = DEVICE
        seed = 1
        testmissingratio = 0.2
        foldername = self.diffpo_path + "/save/acic_fold" + str(nfold) + "_" + current_time + "/"
        os.makedirs(foldername, exist_ok=True)
        # Every loader contains "observed_data", "observed_mask", "gt_mask", "timepoints".
        training_size = 1
        train_loader, valid_loader, _ = get_dataloader(
            seed=seed,
            nfold=nfold,
            batch_size=self.config["train"]["batch_size"],
            missing_ratio=testmissingratio,
            dataset_name=self.config["dataset"]["data_name"],
            current_id=current_id,
            training_size=training_size,
            data_path=self.data_dir,
            x_dim=X.shape[1],
        )
        # ================= First train and freeze the propensity net =================
        propnet = load_data(
            dataset_name=self.config["dataset"]["data_name"],
            current_id=current_id,
            x_dim=X.shape[1],
            data_path=self.data_dir,
        )
        # Freeze the trained propnet.
        propnet.eval()
        propnet = propnet.to(device)
        # ==============================================================================
        model = TabCSDI(self.config, self.device).to(self.device)
        # Train the model.
        train(
            model,
            self.config["train"],
            train_loader,
            valid_loader=valid_loader,
            valid_epoch_interval=self.config["train"]["valid_epoch_interval"],
            foldername=foldername,
            propnet=propnet,
        )
        directory = self.diffpo_path + "/save_model/" + current_id
        if not os.path.exists(directory):
            os.makedirs(directory)
        # Save the trained model weights.
        torch.save(model.state_dict(), directory + "/model_weights.pth")
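
    # Note: train() writes the fitted TabCSDI weights to
    # `<diffpo_path>/save_model/data_pp/model_weights.pth`; predict() below
    # reloads them from that path, so train() must be called first.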

    def predict(self, X: np.ndarray, T0: np.ndarray = None, T1: np.ndarray = None, outcomes: np.ndarray = None) -> np.ndarray:
        """
        Predict the treatment effect using the DiffPO estimator.
        """
        # Store data for the DiffPO pipeline.
        data_dir = self.data_dir
        data, mask = self.reshape_data(X, T0, outcomes)
        data.to_csv(data_dir + "acic2018_norm_data/data_pp_test.csv", index=False)
        mask.to_csv(data_dir + "acic2018_mask/data_pp_test.csv", index=False)
        # Remove stale preprocessed files so the dataloader rebuilds them.
        if os.path.exists(data_dir + "missing_ratio-0.2_seed-1_current_id-data_max-min_norm.pk"):
            os.remove(data_dir + "missing_ratio-0.2_seed-1_current_id-data_max-min_norm.pk")
        if os.path.exists(data_dir + "missing_ratio-0.2_seed-1.pk"):
            os.remove(data_dir + "missing_ratio-0.2_seed-1.pk")
        # Create the output folder.
        current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        # Pipeline settings.
        nfold = 1
        current_id = "data_pp_test"
        current_id_train = "data_pp"
        seed = 1
        testmissingratio = 0.2
        nsample = 50
        foldername = "./save/acic_fold" + str(nfold) + "_" + current_time + "/"
        os.makedirs(foldername, exist_ok=True)
        # Every loader contains "observed_data", "observed_mask", "gt_mask", "timepoints".
        training_size = 0
        _, _, test_loader = get_dataloader(
            seed=seed,
            nfold=nfold,
            batch_size=1,
            missing_ratio=testmissingratio,
            dataset_name=self.config["dataset"]["data_name"],
            current_id=current_id,
            training_size=training_size,
            data_path=data_dir,
            x_dim=X.shape[1],
        )
        # Load the trained model.
        directory = self.diffpo_path + "/save_model/" + current_id_train
        os.makedirs(directory, exist_ok=True)
        model = TabCSDI(self.config, self.device).to(self.device)
        model.load_state_dict(torch.load(directory + "/model_weights.pth"))
        # Compute and return the CATEs.
        return self.evaluate(model, test_loader, nsample, foldername=foldername)
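
    # Note: predict() returns per-unit CATE point estimates (the difference of
    # per-arm medians over `nsample` diffusion draws, computed in evaluate()
    # below) and, as a side effect, caches the potential outcomes in
    # self.pred_outcomes and the 95% CATE intervals in self.cate_cis.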

    def predict_outcomes(self, X: np.ndarray, T0: np.ndarray = None, T1: np.ndarray = None, outcomes: np.ndarray = None) -> np.ndarray:
        """
        Predict the potential outcomes using the DiffPO estimator.
        """
        # Add a trailing dimension so the result has shape (n, num_T, dim_Y).
        return self.pred_outcomes.cpu().numpy().reshape(self.pred_outcomes.shape[0], self.pred_outcomes.shape[1], 1)

    def explain(self, X: np.ndarray, background_samples: np.ndarray = None, explainer_limit: int = None) -> np.ndarray:
        """
        Explain the treatment effect via SHAP values from the wrapped estimator.
        """
        # NOTE: relies on self.est, which is not assigned anywhere in this class,
        # and the background_samples argument is not forwarded.
        if explainer_limit is None:
            explainer_limit = X.shape[0]
        return self.est.shap_values(X[:explainer_limit], background_samples=None)

    def infer_effect_ci(self, X, T0) -> np.ndarray:
        """
        Return the confidence interval of the treatment effect, aligned with the
        direction of the queried treatment T0.
        """
        cates_conf_lbs = self.cate_cis[0]
        cates_conf_ups = self.cate_cis[1]
        # Where the contrast is reversed (T0 != 0), the effect changes sign,
        # so negate and swap the interval endpoints.
        temp = cates_conf_lbs[T0 != 0]
        cates_conf_lbs[T0 != 0] = -cates_conf_ups[T0 != 0]
        cates_conf_ups[T0 != 0] = -temp
        return np.array([cates_conf_lbs, cates_conf_ups])
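
    # Worked example: if a unit's CATE interval for the 0 -> 1 contrast is
    # [0.2, 0.8], then for T0 = 1 (the reversed contrast) infer_effect_ci
    # returns [-0.8, -0.2].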

    def evaluate(self, model, test_loader, nsample=100, scaler=1, mean_scaler=0, foldername=""):
        """
        Sample potential outcomes for the test set and return CATE point estimates.
        """
        # Control the random seed in the current script.
        torch.manual_seed(0)
        np.random.seed(0)
        with torch.no_grad():
            model.eval()
            mse_total = 0
            mae_total = 0
            evalpoints_total = 0
            pehe_test = AverageMeter()
            y0_test = AverageMeter()
            y1_test = AverageMeter()
            # For uncertainty estimation.
            y0_samples = []
            y1_samples = []
            y0_true_list = []
            y1_true_list = []
            ite_samples = []
            ite_true_list = []
            pred_ites = []
            pred_y0s = []
            pred_y1s = []
            for batch_no, test_batch in enumerate(test_loader, start=1):
                # Get model outputs.
                output = model.evaluate(test_batch, nsample)
                samples, observed_data, target_mask, observed_mask, observed_tp = output
                # Extract the relevant quantities.
                y0_samples.append(samples[:, :, 0])
                y1_samples.append(samples[:, :, 1])
                ite_samples.append(samples[:, :, 1] - samples[:, :, 0])
                # Point estimate: the median over the diffusion samples.
                est_data = torch.median(samples, dim=1).values
                # Get the true ITE.
                obs_data = observed_data.squeeze(1)
                true_ite = obs_data[:, 2] - obs_data[:, 1]
                ite_true_list.append(true_ite)
                # Get the predicted ITE.
                pred_y0 = est_data[:, 0]
                pred_y1 = est_data[:, 1]
                pred_y0s.append(pred_y0)
                pred_y1s.append(pred_y1)
                y0_true_list.append(obs_data[:, 1])
                y1_true_list.append(obs_data[:, 2])
                pred_ite = pred_y1 - pred_y0
                pred_ites.append(pred_ite)
                # diff_y0 = np.mean((pred_y0.cpu().numpy() - obs_data[:, 1].cpu().numpy()) ** 2)
                # y0_test.update(diff_y0, obs_data.size(0))
                # diff_y1 = np.mean((pred_y1.cpu().numpy() - obs_data[:, 2].cpu().numpy()) ** 2)
                # y1_test.update(diff_y1, obs_data.size(0))
                # diff_ite = np.mean((true_ite.cpu().numpy() - est_ite.cpu().numpy()) ** 2)
                # pehe_test.update(diff_ite, obs_data.size(0))
            # --------------------- uncertainty estimation ---------------------
            pred_samples_y0 = torch.cat(y0_samples, dim=0)
            pred_samples_y1 = torch.cat(y1_samples, dim=0)
            pred_samples_ite = torch.cat(ite_samples, dim=0)
            truth_y0 = torch.cat(y0_true_list, dim=0)
            truth_y1 = torch.cat(y1_true_list, dim=0)
            truth_ite = torch.cat(ite_true_list, dim=0)
            prob_0, median_width_0 = self.compute_interval(pred_samples_y0, truth_y0)
            prob_1, median_width_1 = self.compute_interval(pred_samples_y1, truth_y1)
            prob_ite, median_width_ite = self.compute_interval(pred_samples_ite, truth_ite)
            # Confidence intervals, dim: 2 x n x dim_Y.
            self.cate_cis = torch.zeros(2, pred_samples_ite.shape[0], 1)
            for i in range(pred_samples_ite.shape[0]):
                lower_quantile, upper_quantile, in_quantiles = self.check_interval(
                    confidence_level=0.95, y_pred=pred_samples_ite[i, :], y_true=truth_ite[i]
                )
                self.cate_cis[0, i, 0] = lower_quantile
                self.cate_cis[1, i, 0] = upper_quantile
            # -------------------------------------------------------------------
            pred_ites = torch.cat(pred_ites, dim=0)
            pred_y0s = torch.cat(pred_y0s, dim=0)
            pred_y1s = torch.cat(pred_y1s, dim=0)
            # Stack the per-arm predictions into shape (n, num_T).
            self.pred_outcomes = torch.cat([pred_y0s.unsqueeze(1), pred_y1s.unsqueeze(1)], dim=1)
            self.cate_cis = self.cate_cis.cpu().numpy()
            return pred_ites

    def check_interval(self, confidence_level, y_pred, y_true):
        lower = (1 - confidence_level) / 2
        upper = 1 - lower
        lower_quantile = torch.quantile(y_pred, lower)
        upper_quantile = torch.quantile(y_pred, upper)
        in_quantiles = torch.logical_and(y_true >= lower_quantile, y_true <= upper_quantile)
        return lower_quantile, upper_quantile, in_quantiles
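
    # Worked example: with confidence_level=0.95, lower = 0.025 and
    # upper = 0.975, so the interval spans the central 95% of the predictive
    # draws and in_quantiles flags whether y_true falls inside it.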

    def compute_interval(self, po_samples, y_true):
        counter = 0
        width_list = []
        for i in range(po_samples.shape[0]):
            lower_quantile, upper_quantile, in_quantiles = self.check_interval(
                confidence_level=0.95, y_pred=po_samples[i, :], y_true=y_true[i]
            )
            if in_quantiles:
                counter += 1
            width = upper_quantile - lower_quantile
            width_list.append(width.unsqueeze(0))
        # Empirical coverage and the median interval width across units.
        prob = counter / po_samples.shape[0]
        all_width = torch.cat(width_list, dim=0)
        median_width = torch.median(all_width, dim=0).values
        return prob, median_width
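

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the `cfg` layout, file name, and data
# shapes below are assumptions inferred from this file, not a documented API.
#
#   from omegaconf import OmegaConf
#   cfg = OmegaConf.load("config.yaml")  # hypothetical config providing
#                                        # DiffPOLearner, diffpo_path, diffusion,
#                                        # train, and dataset sections
#   learner = DiffPOLearner(cfg, num_features=X.shape[1], binary_y=False)
#   learner.train(X, y, w, outcomes)               # outcomes assumed (n, 2, 1)
#   cate = learner.predict(X_test, T0=w_test, outcomes=outcomes_test)
#   pos = learner.predict_outcomes(X_test)         # (n, 2, 1) potential outcomes
#   cis = learner.infer_effect_ci(X_test, w_test)  # (2, n, 1) 95% CATE intervals
# ---------------------------------------------------------------------------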