[0d4320]: / dataloader.py

Download this file

53 lines (39 with data), 1.6 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from torch.utils.data import Dataset
import pickle
import numpy as np
import torch
def load_pickle(fname):
with open(fname, 'rb') as f:
return pickle.load(f)
def downsample(train_idx, neg_young, train_idx_pos):
downsamples = np.random.permutation(neg_young)[:450000]
mask=np.ones(len(train_idx), bool)
mask[downsamples] = False
downsample_idx = np.concatenate((train_idx[mask], np.repeat(train_idx_pos,50)))
return downsample_idx
class OriginalData:
def __init__(self, path):
self.path = path
self.feature_selection = load_pickle(path + 'frts_selection.pkl')
self.x = load_pickle(path + 'preprocess_x.pkl')[:, self.feature_selection]
self.y = load_pickle(path + 'y_bin.pkl')
def datasampler(self, idx_path, train = True):
idx = load_pickle(self.path + idx_path)
if train:
downsample_idx = downsample(idx, load_pickle(self.path + 'neg_young.pkl'), idx[self.y[idx] == 1])
return self.x[downsample_idx, :], self.y[downsample_idx]
return self.x, self.y
class EHRData(Dataset):
def __init__(self, data, cla):
self.data = data
self.cla = cla
def __len__(self):
return len(self.cla)
def __getitem__(self, idx):
return self.data[idx], self.cla[idx]
def collate_fn(data):
# padding
data_list = []
for datum in data:
data_list.append(np.hstack((datum[0].toarray().ravel(), datum[1])))
return torch.from_numpy(np.array(data_list)).long()