# Tutorial - TCGA
### Generating Results in Table 3 (TCGA Dataset)
### Method: BASE1

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import tensorflow as tf

import random
import sys, os

In [2]:
from sklearn.model_selection import train_test_split

import import_data as impt
from helper import f_get_minibatch_set, evaluate
from class_Baseline_Concat import Baseline_Concat

In [4]:
year         = 1
DATASET_PATH = 'TCGA_{}YR'.format(int(year))
DATASET      = 'TCGA'

X_set_comp, Y_onehot_comp, Mask_comp, X_set_incomp, Y_onehot_incomp, Mask_incomp = impt.import_dataset_TCGA(year)

MODE       = 'incomplete'
model_name = 'Base1'

M = len(X_set_comp)

In [5]:
SEED = 1234
OUTITERATION = 5

RESULTS_AUROC_RAND = np.zeros([4, OUTITERATION+2])
RESULTS_AUPRC_RAND = np.zeros([4, OUTITERATION+2])

# IMPORT DATASET

In [6]:
out_itr = 1

tr_X_set, te_X_set, va_X_set = {}, {}, {}
for m in range(M):
    tr_X_set[m],te_X_set[m] = train_test_split(X_set_comp[m], test_size=0.2, random_state=SEED + out_itr)
    tr_X_set[m],va_X_set[m] = train_test_split(tr_X_set[m], test_size=0.2, random_state=SEED + out_itr)
    
tr_Y_onehot,te_Y_onehot, tr_M,te_M = train_test_split(Y_onehot_comp, Mask_comp, test_size=0.2, random_state=SEED + out_itr)
tr_Y_onehot,va_Y_onehot, tr_M,va_M = train_test_split(tr_Y_onehot, tr_M, test_size=0.2, random_state=SEED + out_itr)

In [7]:
if MODE == 'incomplete':
    for m in range(M):
        tr_X_set[m] = np.concatenate([tr_X_set[m], X_set_incomp[m]], axis=0)

    tr_Y_onehot = np.concatenate([tr_Y_onehot, Y_onehot_incomp], axis=0)
    tr_M        = np.concatenate([tr_M, Mask_incomp], axis=0)
    
    print(tr_M.shape)
elif MODE == 'complete':
    print(tr_M.shape)
else:
    raise ValueError('WRONG MODE!!!')
    

save_path = '{}/M{}_{}/{}/'.format(DATASET_PATH, M, MODE, model_name)
    
    
if not os.path.exists(save_path + 'itr{}/'.format(out_itr)):
    os.makedirs(save_path + 'itr{}/'.format(out_itr))

(5850, 4)


### Hyper-parameters

In [14]:
x_dim_set    = [tr_X_set[m].shape[1] for m in range(len(tr_X_set))]
y_dim        = np.shape(tr_Y_onehot)[1]

y_type       = 'binary'

z_dim        = 100

h_dim_p      = 100
num_layers_p = 2

h_dim_e      = 100
num_layers_e = 3


input_dims = {
    'x_dim_set': x_dim_set,
    'y_dim': y_dim,
    'y_type': y_type,
    'z_dim': z_dim
}

network_settings = {
    'h_dim_p': h_dim_p,
    'num_layers_p': num_layers_p,
    'h_dim_e': h_dim_e,
    'num_layers_e': num_layers_e,
    'fc_activate_fn': tf.nn.relu,
    'reg_scale': 0., 
}

mb_size  = 32
lr_rate  = 1e-4
k_prob   = 0.7


### Training

In [15]:
tf.reset_default_graph()

gpu_options = tf.GPUOptions()
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

model = Baseline_Concat(sess, "Base1", input_dims, network_settings)

In [16]:
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

ITERATION = 500000
STEPSIZE  = 500

min_loss  = 1e+8   
max_acc   = 0.0
max_flag  = 20

tr_avg_Lt, va_avg_Lt  = 0, 0

stop_flag = 0
for itr in range(ITERATION):
    x_mb_set, y_mb, m_mb          = f_get_minibatch_set(mb_size, tr_X_set, tr_Y_onehot, tr_M)
    _, Lt                         = model.train(x_mb_set, y_mb, lr_rate, k_prob)

    tr_avg_Lt  += Lt/STEPSIZE


    x_mb_set, y_mb, m_mb          = f_get_minibatch_set(min(np.shape(va_M)[0], mb_size), va_X_set, va_Y_onehot, va_M)        
    Lt                            = model.get_loss(x_mb_set, y_mb)

    va_avg_Lt  += Lt/STEPSIZE

    if (itr+1)%STEPSIZE == 0:
        y_pred = model.predict_y(va_X_set)
        auroc, auprc = evaluate(va_Y_onehot, y_pred, y_type)

        print( "{:05d}: TRAIN| LT={:.3f}   VALID| LT={:.3f}   auroc={:.4f}   auprc={:.4f}".format(
            itr+1, tr_avg_Lt, va_avg_Lt, auroc, auprc))

        if min_loss > va_avg_Lt:
            min_loss  = va_avg_Lt
            stop_flag = 0
            saver.save(sess, save_path  + 'itr{}/best_model'.format(out_itr))
            print('saved...')
        else:
            stop_flag += 1

        tr_avg_Lt = 0
        va_avg_Lt = 0 

        if stop_flag >= max_flag:
            break

print('FINISHED...')

00500: TRAIN| LT=0.969   VALID| LT=0.804   auroc=0.7378   auprc=0.2977
saved...
01000: TRAIN| LT=0.743   VALID| LT=0.716   auroc=0.7685   auprc=0.3671
saved...
01500: TRAIN| LT=0.675   VALID| LT=0.721   auroc=0.7703   auprc=0.3434
02000: TRAIN| LT=0.637   VALID| LT=0.748   auroc=0.7757   auprc=0.3746
02500: TRAIN| LT=0.610   VALID| LT=0.777   auroc=0.7757   auprc=0.3422
03000: TRAIN| LT=0.580   VALID| LT=0.746   auroc=0.7717   auprc=0.3369
03500: TRAIN| LT=0.558   VALID| LT=0.774   auroc=0.7773   auprc=0.3451
04000: TRAIN| LT=0.534   VALID| LT=0.782   auroc=0.7663   auprc=0.3426
04500: TRAIN| LT=0.518   VALID| LT=0.810   auroc=0.7709   auprc=0.3266
05000: TRAIN| LT=0.491   VALID| LT=0.845   auroc=0.7719   auprc=0.3573
05500: TRAIN| LT=0.481   VALID| LT=0.863   auroc=0.7634   auprc=0.3282
06000: TRAIN| LT=0.447   VALID| LT=0.897   auroc=0.7594   auprc=0.3321
06500: TRAIN| LT=0.425   VALID| LT=0.951   auroc=0.7658   auprc=0.3384
07000: TRAIN| LT=0.397   VALID| LT=0.932   auroc=0.7734   a

In [17]:
saver.restore(sess, save_path  + 'itr{}/best_model'.format(out_itr))

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from TCGA_1YR/M4_incomplete/Base1/itr1/best_model


### Evaluation -- (Results in Table 3)

In [21]:
mean_X = {}
for m in range(4):
    mean_X[m] = X_set_incomp[m][Mask_incomp[:,m] == 0][0, :]

for m_available in [1,2,3,4]:
    tmp_M_mis = np.zeros_like(te_M)#np.copy(te_M)

    for i in range(len(tmp_M_mis)):
        np.random.seed(SEED+out_itr+i)
        idx = np.random.choice(4, m_available, replace=False)
        tmp_M_mis[i, idx] = 1

    tmp_te_X = {}
    for m in range(M):
        tmp_te_X[m] = np.copy(te_X_set[m])
        tmp_te_X[m][tmp_M_mis[:,m] == 0] = mean_X[m] 

    y_pred = model.predict_y(tmp_te_X)
    auc1, apc1 = evaluate(te_Y_onehot, y_pred, y_type)

    RESULTS_AUROC_RAND[m_available-1, out_itr] = auc1
    RESULTS_AUPRC_RAND[m_available-1, out_itr] = apc1

    print("TEST - {} - #VIEW {}: auroc={:.4f}  auprc={:.4f}".format(MODE.upper(), m_available,  auc1, apc1))

TEST - INCOMPLETE - #VIEW 1: auroc=0.6844  auprc=0.3007
TEST - INCOMPLETE - #VIEW 2: auroc=0.7444  auprc=0.3537
TEST - INCOMPLETE - #VIEW 3: auroc=0.7774  auprc=0.3685
TEST - INCOMPLETE - #VIEW 4: auroc=0.7805  auprc=0.3945
