|
a |
|
b/import_data.py |
|
|
1 |
import numpy as np |
|
|
2 |
import pandas as pd |
|
|
3 |
import random |
|
|
4 |
|
|
|
5 |
|
|
|
6 |
|
|
|
7 |
# NOTE: every sample MUST have at least one observed (non-missing) view.
|
|
8 |
def import_incomplete_handwritten(filename='./data/Handwritten_Missing/data_with_missingviews.npz'):
    """Load a multi-view dataset with missing views and mean-impute the gaps.

    The archive must provide two entries: ``'X_set'`` (a pickled container
    that, after ``.tolist()``, can be indexed by ``0 .. M-1`` to obtain one
    2-D array per view) and ``'Y_onehot'`` (returned exactly as stored).

    A sample is considered to be missing a view when *every* feature of its
    row in that view is NaN. Such rows are overwritten, in place, with the
    feature-wise mean of the rows that are available in the same view.

    Args:
        filename: Path of the ``.npz`` archive. Defaults to the location the
            function has always read from.

    Returns:
        tuple: ``(X_set, Y_onehot, Mask)`` where ``Mask`` has shape
        ``(N, M)`` (``N`` = number of rows of view 0, ``M`` = number of
        views) and holds 1 for an available view and 0 for a missing one.
    """
    # SECURITY: allow_pickle=True unpickles arbitrary objects stored in the
    # archive; only load files from a trusted source.
    # The context manager closes the underlying file handle, which np.load
    # otherwise keeps open for the lifetime of the NpzFile object.
    with np.load(filename, allow_pickle=True) as npz:
        X_set = npz['X_set'].tolist()
        Y_onehot = npz['Y_onehot']

    M = len(X_set)
    num_samples = np.shape(X_set[0])[0]

    # Mask[i, m] == 1 -> view m is available for sample i; 0 -> missing.
    Mask = np.ones([num_samples, M])
    for m_idx in range(M):
        view = X_set[m_idx]
        # A row made up entirely of NaNs marks a missing view.
        missing = np.isnan(view).all(axis=1)
        Mask[missing, m_idx] = 0
        # Impute missing rows with the mean of the observed rows of this view.
        view[missing] = np.mean(view[~missing], axis=0)

    return X_set, Y_onehot, Mask
|
|
23 |
|
|
|
24 |
|
|
|
25 |
|
|
|
26 |
def _binary_onehot(labels):
    """Return an (n, 2) float array with column 0 set where labels == 0 and column 1 where labels == 1."""
    onehot = np.zeros([np.shape(labels)[0], 2])
    onehot[labels == 0, 0] = 1
    onehot[labels == 1, 1] = 1
    return onehot


def import_dataset_TCGA(year=1, filename=None):
    """Load the TCGA multi-view archive and split it into complete / incomplete samples.

    The archive must contain ``'m'`` (a 2-D mask with one column per view),
    ``'x1' .. 'xM'`` (one 2-D array per view) and ``'y'`` (labels; they are
    compared against 0 and 1 to build a two-column one-hot encoding, so a
    label equal to neither yields an all-zero row).

    For every view, rows whose *first* feature is NaN are overwritten with the
    column-wise ``nanmean`` of that view. Samples are then split by whether
    their mask row sums to the number of views ``M`` (complete) or not
    (incomplete).

    Args:
        year: Used to build the default archive name
            (``incomplete_multi_view_pca_{year}yr.npz``); cast with ``int``.
        filename: Optional explicit path of the archive. When ``None`` (the
            default) the original hard-coded location is used.

    Returns:
        tuple: ``(X_set_comp, Y_onehot_comp, Mask_comp,
        X_set_incomp, Y_onehot_incomp, Mask_incomp)`` where the ``X_set_*``
        values are dicts mapping view index ``0 .. M-1`` to the selected rows.
    """
    if filename is None:
        filename = '/media/vdslab/Genomics/TCGA/dataset/FINAL/cleaned/incomplete_multi_view_pca_{}yr.npz'.format(int(year))

    # np.load keeps the archive open until the NpzFile is closed; read
    # everything that is needed inside the context manager.
    with np.load(filename) as npz:
        Mask = npz['m']
        M = np.shape(Mask)[1]

        X_set = {}
        for m in range(M):
            view = npz['x{}'.format(m + 1)]
            # A NaN in the first feature flags the whole row as missing;
            # replace such rows with the per-feature mean of this view.
            view[np.isnan(view[:, 0]), :] = np.nanmean(view, axis=0)
            X_set[m] = view

        Y = npz['y']

    # A sample is complete when all M views are present. Comparing against M
    # (rather than a literal 4) keeps the split correct for any view count.
    is_complete = np.sum(Mask, axis=1) == M
    is_incomplete = ~is_complete

    X_set_comp = {m: X_set[m][is_complete] for m in range(M)}
    X_set_incomp = {m: X_set[m][is_incomplete] for m in range(M)}

    Mask_comp = Mask[is_complete]
    Mask_incomp = Mask[is_incomplete]

    Y_onehot_comp = _binary_onehot(Y[is_complete])
    Y_onehot_incomp = _binary_onehot(Y[is_incomplete])

    return X_set_comp, Y_onehot_comp, Mask_comp, X_set_incomp, Y_onehot_incomp, Mask_incomp