"""Loaders for incomplete multi-view datasets (Handwritten and TCGA)."""
import numpy as np
import pandas as pd
import random


## all samples MUST include at least one view.
def import_incomplete_handwritten(
        filename='./data/Handwritten_Missing/data_with_missingviews.npz'):
    """Load the Handwritten dataset with missing views and impute the gaps.

    A sample is treated as missing in a view when its entire row in that
    view is NaN. Missing rows are overwritten in place with the column-wise
    mean of the rows that are available in the same view.

    Args:
        filename: Path of the ``.npz`` archive holding ``X_set`` and
            ``Y_onehot``. Defaults to the location used by the project.

    Returns:
        A tuple ``(X_set, Y_onehot, Mask)`` where ``X_set`` is the list
        stored under ``'X_set'`` (one array per view, imputed), ``Y_onehot``
        is the array stored under ``'Y_onehot'``, and ``Mask`` is a
        ``[num_samples, num_views]`` float array with 1 for available and
        0 for missing entries.
    """
    # NOTE(review): allow_pickle=True unpickles arbitrary objects from the
    # archive; only load files from a trusted source.
    with np.load(filename, allow_pickle=True) as npz:
        X_set = npz['X_set'].tolist()
        Y_onehot = npz['Y_onehot']

    M = len(X_set)

    ### Construct Mask Vector to indicate available (m=1) or missing (m=0) values
    Mask = np.ones([np.shape(X_set[0])[0], M])
    for m_idx, view in enumerate(X_set):
        missing = np.isnan(view).all(axis=1)
        Mask[missing, m_idx] = 0
        # nanmean keeps a stray NaN inside an otherwise observed row from
        # turning every imputed value of that feature into NaN.
        view[missing] = np.nanmean(view[~missing], axis=0)

    return X_set, Y_onehot, Mask


def _binary_onehot(labels):
    """Return a ``[len(labels), 2]`` one-hot array for labels equal to 0 or 1.

    Rows whose label is neither 0 nor 1 are left as all zeros.
    """
    onehot = np.zeros([np.shape(labels)[0], 2])
    onehot[labels == 0, 0] = 1
    onehot[labels == 1, 1] = 1
    return onehot


def import_dataset_TCGA(year=1, filename=None):
    """Load the TCGA multi-view dataset, split into complete/incomplete parts.

    Each view ``x1 .. xM`` is read from the archive, where ``M`` is the
    number of columns of the mask stored under ``'m'``. Rows whose first
    feature is NaN are overwritten with the per-feature NaN-ignoring mean of
    that view. Samples are "complete" when their mask row sums to ``M``
    (every view available) and "incomplete" otherwise.

    Args:
        year: Selects the default archive
            (``incomplete_multi_view_pca_{year}yr.npz``); cast with ``int``.
            Ignored when ``filename`` is given.
        filename: Optional explicit path of the ``.npz`` archive. When
            ``None`` the project's default location is used.

    Returns:
        ``(X_set_comp, Y_onehot_comp, Mask_comp,
        X_set_incomp, Y_onehot_incomp, Mask_incomp)``. The ``X_set_*`` values
        are dicts mapping the zero-based view index to that view's rows, the
        ``Y_onehot_*`` values are two-column one-hot label arrays, and the
        ``Mask_*`` values are the matching rows of the mask.
    """
    if filename is None:
        filename = '/media/vdslab/Genomics/TCGA/dataset/FINAL/cleaned/incomplete_multi_view_pca_{}yr.npz'.format(int(year))

    with np.load(filename) as npz:
        Mask = npz['m']
        M = np.shape(Mask)[1]

        X_set = {}
        for m in range(M):
            tmp = npz['x{}'.format(m + 1)]
            # A NaN in the first feature marks the whole row as missing.
            tmp[np.isnan(tmp[:, 0]), :] = np.nanmean(tmp, axis=0)
            X_set[m] = tmp

        Y = npz['y']

    # A sample is complete only if all M views are available; computed once
    # and derived from M rather than a hard-coded view count.
    is_complete = np.sum(Mask, axis=1) == M
    is_incomplete = ~is_complete

    X_set_comp = {m: X_set[m][is_complete] for m in range(M)}
    X_set_incomp = {m: X_set[m][is_incomplete] for m in range(M)}

    Y_onehot_comp = _binary_onehot(Y[is_complete])
    Y_onehot_incomp = _binary_onehot(Y[is_incomplete])

    Mask_comp = Mask[is_complete]
    Mask_incomp = Mask[is_incomplete]

    return X_set_comp, Y_onehot_comp, Mask_comp, X_set_incomp, Y_onehot_incomp, Mask_incomp