Subtype-GAN / Git / [5e3989] /SubtypeGAN.py

Models:
AlyssaS/
Subtype-GAN
Downloads: 1
[5e3989]: / SubtypeGAN.py
History
Download this file
629 lines (572 with data), 26.2 kB

import argparse
import sys
import numpy as np
import random
import time
import os

import tensorflow as tf
from subprocess import check_output
import h5py
import re
import math
import pandas as pd
from os.path import splitext, basename, isfile
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from sklearn import mixture
from keras import backend as K
from keras.layers import Input, Dense, Lambda, Layer, Add, BatchNormalization, Dropout, Activation, merge, Conv2D, \
    MaxPooling2D, Activation, LeakyReLU, concatenate
from keras.models import Model, Sequential
from keras.losses import mse, binary_crossentropy
from keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from keras.models import load_model
from keras.utils.generic_utils import get_custom_objects
from itertools import combinations
import bisect

random.seed(1)
np.random.seed(1)
tf.set_random_seed(1)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


class ConsensusCluster:
    def __init__(self, cluster, L, K, H, resample_proportion=0.8):
        self.cluster_ = cluster
        self.resample_proportion_ = resample_proportion
        self.L_ = L
        self.K_ = K
        self.H_ = H
        self.Mk = None
        self.Ak = None
        self.deltaK = None
        self.bestK = None

    def _internal_resample(self, data, proportion):
        ids = np.random.choice(
            range(data.shape[0]), size=int(data.shape[0] * proportion), replace=False)
        return ids, data[ids, :]

    def fit(self, data):
        Mk = np.zeros((self.K_ - self.L_, data.shape[0], data.shape[0]))
        Is = np.zeros((data.shape[0],) * 2)
        for k in range(self.L_, self.K_):
            i_ = k - self.L_
            for h in range(self.H_):
                ids, dt = self._internal_resample(data, self.resample_proportion_)
                Mh = self.cluster_(n_clusters=k).fit_predict(dt)
                ids_sorted = np.argsort(Mh)
                sorted_ = Mh[ids_sorted]
                for i in range(k):
                    ia = bisect.bisect_left(sorted_, i)
                    ib = bisect.bisect_right(sorted_, i)
                    is_ = ids_sorted[ia:ib]
                    ids_ = np.array(list(combinations(is_, 2))).T
                    if ids_.size != 0:
                        Mk[i_, ids_[0], ids_[1]] += 1
                ids_2 = np.array(list(combinations(ids, 2))).T
                Is[ids_2[0], ids_2[1]] += 1
            Mk[i_] /= Is + 1e-8
            Mk[i_] += Mk[i_].T
            Mk[i_, range(data.shape[0]), range(
                data.shape[0])] = 1
            Is.fill(0)
        self.Mk = Mk
        self.Ak = np.zeros(self.K_ - self.L_)
        for i, m in enumerate(Mk):
            hist, bins = np.histogram(m.ravel(), density=True)
            self.Ak[i] = np.sum(h * (b - a)
                                for b, a, h in zip(bins[1:], bins[:-1], np.cumsum(hist)))
        self.deltaK = np.array([(Ab - Aa) / Aa if i > 2 else Aa
                                for Ab, Aa, i in zip(self.Ak[1:], self.Ak[:-1], range(self.L_, self.K_ - 1))])
        self.bestK = np.argmax(self.deltaK) + \
                     self.L_ if self.deltaK.size > 0 else self.L_

    def predict(self):
        return self.cluster_(n_clusters=self.bestK).fit_predict(
            1 - self.Mk[self.bestK - self.L_])

    def predict_data(self, data):
        return self.cluster_(n_clusters=self.bestK).fit_predict(
            data)


class GeLU(Activation):
    def __init__(self, activation, **kwargs):
        super(GeLU, self).__init__(activation, **kwargs)
        self.__name__ = 'gelu'


def gelu(x):
    return 0.5 * x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))


get_custom_objects().update({'gelu': GeLU(gelu)})


class AE():
    def __init__(self, X_shape, n_components, epochs=100):
        self.epochs = epochs
        sample_size = X_shape[0]
        self.batch_size = 16
        sample_size = X_shape[0]
        self.epochs = 30
        self.n_components = n_components
        self.shape = X_shape[1]

    def train(self, X):
        encoding_dim = self.n_components
        original_dim = X.shape[1]
        input = Input(shape=(original_dim,))
        encoded = Dense(encoding_dim)(input)
        encoded = BatchNormalization()(encoded)
        encoded = Activation('relu')(encoded)
        z = Dense(encoding_dim, activation='relu')(encoded)
        decoded = Dense(encoding_dim, activation='relu')(z)
        output = Dense(original_dim, activation='sigmoid')(decoded)
        ae = Model(input, output)
        encoder = Model(input, z)
        ae_loss = mse(input, output)
        ae.add_loss(ae_loss)
        ae.compile(optimizer=Adam())
        print(len(ae.layers))
        print(ae.count_params())
        ae.fit(X, epochs=self.epochs, batch_size=self.batch_size, verbose=2)
        return encoder.predict(X)


class VAE():
    def __init__(self, X_shape, n_components, epochs=100):
        self.epochs = epochs
        self.batch_size = 16
        sample_size = X_shape[0]
        self.epochs = 30
        self.n_components = n_components
        self.shape = X_shape[1]

    def train(self, X):
        def sampling(args):
            z_mean, z_log_var = args
            batch = K.shape(z_mean)[0]
            dim = K.int_shape(z_mean)[1]
            epsilon = K.random_normal(shape=(batch, dim), seed=0)
            return z_mean + K.exp(0.5 * z_log_var) * epsilon

        encoding_dim = self.n_components
        original_dim = X.shape[1]
        input = Input(shape=(original_dim,))
        encoded = Dense(encoding_dim)(input)
        encoded = BatchNormalization()(encoded)
        encoded = Activation('relu')(encoded)
        z_mean = Dense(encoding_dim)(encoded)
        z_log_var = Dense(encoding_dim)(encoded)
        z = Lambda(sampling, output_shape=(encoding_dim,), name='z')([z_mean, z_log_var])
        decoded = Dense(encoding_dim, activation='relu')(z)
        output = Dense(original_dim, activation='sigmoid')(decoded)
        vae = Model(input, output)
        encoder = Model(input, z)
        reconstruction_loss = mse(input, output)
        reconstruction_loss *= original_dim
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        vae_loss = K.mean(reconstruction_loss + kl_loss)
        vae.add_loss(vae_loss)
        vae.compile(optimizer=Adam())
        print(len(vae.layers))
        print(vae.count_params())
        vae.fit(X, epochs=self.epochs, batch_size=self.batch_size, verbose=2)
        return encoder.predict(X)


class SubtypeGAN():
    def __init__(self, datasets, n_latent_dim, weight=0.001, model_path='SubtypeGAN.h5', epochs=100, batch_size=64):
        self.latent_dim = n_latent_dim
        optimizer = Adam()
        self.n = len(datasets)
        self.epochs = epochs
        self.batch_size = batch_size
        sample_size = 0
        if self.n > 1:
            sample_size = datasets[0].shape[0]
        print(sample_size)
        if sample_size > 300:
            self.epochs = 11
        else:
            self.epochs = 10
        self.epochs = 30 * batch_size
        self.shape = []
        self.weight = [0.3, 0.1, 0.1, 0.5]
        self.disc_w = 1e-4
        self.model_path = model_path
        input = []
        loss = []
        loss_weights = []
        output = []
        for i in range(self.n):
            self.shape.append(datasets[i].shape[1])
            loss.append('mse')
        loss.append('binary_crossentropy')
        self.decoder, self.disc = self.build_decoder_disc()
        self.disc.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        self.encoder = self.build_encoder()
        for i in range(self.n):
            input.append(Input(shape=(self.shape[i],)))
            loss_weights.append((1 - self.disc_w) * self.weight[i])
        loss_weights.append(self.disc_w)
        z_mean, z_log_var, z = self.encoder(input)
        output = self.decoder(z)
        self.gan = Model(input, output)
        self.gan.compile(loss=loss, loss_weights=loss_weights, optimizer=optimizer)
        print(self.gan.summary())
        return

    def build_encoder(self):
        def sampling(args):
            z_mean, z_log_var = args
            return z_mean + K.exp(0.5 * z_log_var) * K.random_normal(K.shape(z_mean), seed=0)

        encoding_dim = self.latent_dim
        X = []
        dims = []
        denses = []
        for i in range(self.n):
            X.append(Input(shape=(self.shape[i],)))
            dims.append(int(encoding_dim * self.weight[i]))
        for i in range(self.n):
            denses.append(Dense(dims[i])(X[i]))
        if self.n > 1:
            merged_dense = concatenate(denses, axis=-1)
        else:
            merged_dense = denses[0]
        model = BatchNormalization()(merged_dense)
        model = Activation('gelu')(model)
        model = Dense(encoding_dim)(model)
        z_mean = Dense(encoding_dim)(model)
        z_log_var = Dense(encoding_dim)(model)
        z = Lambda(sampling, output_shape=(encoding_dim,), name='z')([z_mean, z_log_var])
        return Model(X, [z_mean, z_log_var, z])

    def build_decoder_disc(self):
        denses = []
        X = Input(shape=(self.latent_dim,))
        model = Dense(self.latent_dim)(X)
        model = BatchNormalization()(model)
        model = Activation('gelu')(model)
        for i in range(self.n):
            denses.append(Dense(self.shape[i])(model))
        dec = Dense(1, activation='sigmoid')(model)
        denses.append(dec)
        m_decoder = Model(X, denses)
        m_disc = Model(X, dec)
        return m_decoder, m_disc

    def build_disc(self):
        X = Input(shape=(self.latent_dim,))
        dec = Dense(1, activation='sigmoid', kernel_initializer="glorot_normal")(X)
        output = Model(X, dec)
        return output

    def train(self, X_train, bTrain=True):
        model_path = self.model_path
        log_file = "./run.log"
        fp = open(log_file, 'w')
        if bTrain:
            # GAN
            valid = np.ones((self.batch_size, 1))
            fake = np.zeros((self.batch_size, 1))
            for epoch in range(self.epochs):
                #  Train Discriminator
                data = []
                idx = np.random.randint(0, X_train[0].shape[0], self.batch_size)
                for i in range(self.n):
                    data.append(X_train[i][idx])
                latent_fake = self.encoder.predict(data)[2]
                latent_real = np.random.normal(size=(self.batch_size, self.latent_dim))
                d_loss_real = self.disc.train_on_batch(latent_real, valid)
                d_loss_fake = self.disc.train_on_batch(latent_fake, fake)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
                outs = data + [valid]
                #  Train Encoder_GAN
                g_loss = self.gan.train_on_batch(data, outs)
            fp.close()
            self.encoder.save(model_path)
        else:
            self.encoder = load_model(model_path)
        mat = self.encoder.predict(X_train)[0]
        return mat


class SubtypeGAN_API(object):
    def __init__(self, model_path='./model/', epochs=200, weight=0.001):
        self.model_path = model_path
        self.score_path = './score/'
        self.epochs = epochs
        self.batch_size = 16
        self.weight = weight

    # feature extract
    def feature_gan(self, datasets, index=None, n_components=100, b_decomposition=True, weight=0.001):
        if b_decomposition:
            X = self.encoder_gan(datasets, n_components)
            fea = pd.DataFrame(data=X, index=index, columns=map(lambda x: 'v' + str(x), range(X.shape[1])))
        else:
            fea = np.concatenate(datasets)
        print("feature extract finished!")
        return fea

    def feature_vae(self, df_ori, n_components=100, b_decomposition=True):
        if b_decomposition:
            X = self.encoder_vae(df_ori, n_components)
            print(X)
            fea = pd.DataFrame(data=X, index=df_ori.index,
                               columns=map(lambda x: 'v' + str(x), range(X.shape[1])))
        else:
            fea = df_ori.copy()
        print("feature extract finished!")
        return fea

    def feature_ae(self, df_ori, n_components=100, b_decomposition=True):
        if b_decomposition:
            X = self.encoder_ae(df_ori, n_components)
            print(X)
            fea = pd.DataFrame(data=X, index=df_ori.index,
                               columns=map(lambda x: 'v' + str(x), range(X.shape[1])))
        else:
            fea = df_ori.copy()
        print("feature extract finished!")
        return fea

    def impute(self, X):
        X.fillna(X.mean())
        return X

    def encoder_gan(self, ldata, n_components=100):
        egan = SubtypeGAN(ldata, n_components, self.weight, self.model_path, self.epochs, self.batch_size)
        return egan.train(ldata)

    def encoder_vae(self, df, n_components=100):
        vae = VAE(df.shape, n_components, self.epochs)
        return vae.train(df)

    def encoder_ae(self, df, n_components=100):
        ae = AE(df.shape, n_components, self.epochs)
        return ae.train(df)

    def tsne(self, X):
        model = TSNE(n_components=2)
        return model.fit_transform(X)

    def pca(self, X):
        fea_model = PCA(n_components=200)
        return fea_model.fit_transform(X)

    def gmm(self, n_clusters=28):
        model = mixture.GaussianMixture(n_components=n_clusters, covariance_type='diag')
        return model

    def kmeans(self, n_clusters=28):
        model = KMeans(n_clusters=n_clusters, random_state=0)
        return model

    def spectral(self, n_clusters=28):
        model = SpectralClustering(n_clusters=n_clusters, random_state=0)
        return model

    def hierarchical(self, n_clusters=28):
        model = AgglomerativeClustering(n_clusters=n_clusters)
        return model


def main(argv=sys.argv):
    parser = argparse.ArgumentParser(description='SubtypeGAN v1.0')
    parser.add_argument("-i", dest='file_input', default="./input/input.list",
                        help="file input")
    parser.add_argument("-e", dest='epochs', type=int, default=200, help="Number of iterations")
    parser.add_argument("-m", dest='run_mode', default="feature", help="run_mode: feature, cluster")
    parser.add_argument("-n", dest='cluster_num', type=int, default=-1, help="cluster number")
    parser.add_argument("-w", dest='disc_weight', type=float, default=1e-4, help="weight")
    parser.add_argument("-o", dest='output_path', default="./score/", help="file output")
    parser.add_argument("-p", dest='other_approach', default="spectral", help="kmeans, spectral, tsne_gmm, tsne")
    parser.add_argument("-s", dest='surv_path',
                        default="./data/TCGA/clinical_PANCAN_patient_with_followup.tsv",
                        help="surv input")
    parser.add_argument("-t", dest='type', default="ALL", help="cancer type: BRCA, GBM")
    args = parser.parse_args()
    model_path = './model/' + args.type + '.h5'
    SubtypeGAN = SubtypeGAN_API(model_path, epochs=args.epochs, weight=args.disc_weight)
    cancer_dict = {'BRCA': 5, 'BLCA': 5, 'KIRC': 4,
                   'GBM': 3, 'LUAD': 3, 'PAAD': 2,
                   'SKCM': 4, 'STAD': 3, 'UCEC': 4, 'UVM': 4}
    if args.run_mode == 'SubtypeGAN':
        cancer_type = args.type
        if cancer_type not in cancer_dict and args.cluster_num == -1:
            print("Please set the number of clusters!")
        elif args.cluster_num == -1:
            args.cluster_num = cancer_dict[cancer_type]
        fea_tmp_file = './fea/' + cancer_type + '.fea'
        tmp_dir = './fea/' + cancer_type + '/'
        model_dir ='./model'
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        if not os.path.isdir(model_dir):
            os.mkdir(model_dir)
        ldata = []
        l = []
        nb_line = 0
        for line in open(args.file_input, 'rt'):
            base_file = splitext(basename(line.rstrip()))[0]
            fea_save_file = tmp_dir + base_file + '.fea'
            if isfile(fea_save_file):
                df_new = pd.read_csv(fea_save_file, sep=',', header=0, index_col=0)
                l = list(df_new)
            else:
                clinic_parms = ['bcr_patient_barcode', 'acronym', 'vital_status', 'days_to_death',
                                'days_to_last_followup', 'gender', 'age_at_initial_pathologic_diagnosis',
                                'pathologic_M',
                                'pathologic_N', 'pathologic_T', 'pathologic_stage']
                df = pd.read_csv(args.surv_path, header=0, sep='\t',
                                 usecols=clinic_parms)
                df['status'] = np.where(df['vital_status'] == 'Dead', 1, 0)
                df['days'] = df.apply(lambda r: r['days_to_death'] if r['status'] == 1 else r['days_to_last_followup'],
                                      axis=1)
                df.index = df['bcr_patient_barcode']

                if cancer_type == 'ALL':
                    pass
                else:
                    df = df.loc[df['acronym'] == cancer_type, ::]
                clic_save_file = './results/' + cancer_type + '.clinic'
                df_new = pd.read_csv(line.rstrip(), sep='\t', header=0, index_col=0, comment='#')
                nb_line += 1
                if nb_line == 1:
                    ids = list(df.index)
                    ids_sub = list(df_new)
                    l = list(set(ids) & set(ids_sub))
                    df_clic = df.loc[
                        l, ['status', 'days', 'gender', 'age_at_initial_pathologic_diagnosis', 'pathologic_M',
                            'pathologic_N', 'pathologic_T', 'pathologic_stage']]

                    df_clic.to_csv(clic_save_file, index=True, header=True, sep=',')
                df_new = df_new.loc[::, l]
                df_new = df_new.fillna(0)
                if 'miRNA' in base_file or 'rna' in base_file:
                    df_new = np.log2(df_new + 1)
                scaler = preprocessing.StandardScaler()
                mat = scaler.fit_transform(df_new.values.astype(float))
                df_new.iloc[::, ::] = mat
                print(df_new.shape)
                df_new.to_csv(fea_save_file, index=True, header=True, sep=',')
            df_new = df_new.T
            ldata.append(df_new.values.astype(float))
        start_time = time.time()
        vec = SubtypeGAN.feature_gan(ldata, index=l, n_components=100, weight=args.disc_weight)
        df = pd.DataFrame(data=[time.time() - start_time])
        vec.to_csv(fea_tmp_file, header=True, index=True, sep='\t')
        out_file = './results/' + cancer_type + '.SubtypeGAN.time'
        df.to_csv(out_file, header=True, index=False, sep=',')

        if isfile(fea_tmp_file):
            X = pd.read_csv(fea_tmp_file, header=0, index_col=0, sep='\t')
            X['SubtypeGAN'] = SubtypeGAN.gmm(args.cluster_num).fit_predict(X.values) + 1
            X = X.loc[:, ['SubtypeGAN']]
            out_file = './results/' + cancer_type + '.SubtypeGAN'
            X.to_csv(out_file, header=True, index=True, sep='\t')
        else:
            print('file does not exist!')

    elif args.run_mode == 'show':
        cancer_type = args.type
        fea_tmp_file = './fea/' + cancer_type + '.fea'
        out_file = './fea/' + cancer_type + '.tsne'
        if isfile(fea_tmp_file):
            df = pd.read_csv(fea_tmp_file, header=0, index_col=0, sep='\t')
            mat = df.values.astype(float)
            labels = SubtypeGAN.tsne(mat)
            print(labels.shape)
            df['x'] = labels[:, 0]
            df['y'] = labels[:, 1]
            df = df.loc[:, ['x', 'y']]
            df.to_csv(out_file, header=True, index=True, sep='\t')
        else:
            print('file does not exist!')

    elif args.run_mode == 'kmeans':
        cancer_type = args.type
        if cancer_type not in cancer_dict and args.cluster_num == -1:
            print("Please set the number of clusters!")
        elif args.cluster_num == -1:
            args.cluster_num = cancer_dict[cancer_type]
        dfs = []
        for line in open(args.file_input, 'rt'):
            base_file = splitext(basename(line.rstrip()))[0]
            fea_tmp_file = './fea/' + cancer_type + '/' + base_file + '.fea'
            dfs.append(pd.read_csv(fea_tmp_file, header=0, index_col=0, sep=','))
        X = pd.concat(dfs, axis=0).T
        print(X.head(5))
        print(X.shape)
        X['kmeans'] = SubtypeGAN.kmeans(args.cluster_num).fit_predict(X.values) + 1
        X = X.loc[:, ['kmeans']]
        out_file = './results/' + cancer_type + '.kmeans'
        X.to_csv(out_file, header=True, index=True, sep='\t')

    elif args.run_mode == 'spectral':
        cancer_type = args.type
        if cancer_type not in cancer_dict and args.cluster_num == -1:
            print("Please set the number of clusters!")
        elif args.cluster_num == -1:
            args.cluster_num = cancer_dict[cancer_type]
        dfs = []
        for line in open(args.file_input, 'rt'):
            base_file = splitext(basename(line.rstrip()))[0]
            fea_tmp_file = './fea/' + cancer_type + '/' + base_file + '.fea'
            dfs.append(pd.read_csv(fea_tmp_file, header=0, index_col=0, sep=','))
        X = pd.concat(dfs, axis=0).T
        print(X.head(5))
        print(X.shape)
        X['spectral'] = SubtypeGAN.spectral(args.cluster_num).fit_predict(X.values) + 1
        X = X.loc[:, ['spectral']]
        out_file = './results/' + cancer_type + '.spectral'
        X.to_csv(out_file, header=True, index=True, sep='\t')

    elif args.run_mode == 'ae':
        cancer_type = args.type
        if cancer_type not in cancer_dict and args.cluster_num == -1:
            print("Please set the number of clusters!")
        elif args.cluster_num == -1:
            args.cluster_num = cancer_dict[cancer_type]
        dfs = []
        for line in open(args.file_input, 'rt'):
            base_file = splitext(basename(line.rstrip()))[0]
            fea_tmp_file = './fea/' + cancer_type + '/' + base_file + '.fea'
            dfs.append(pd.read_csv(fea_tmp_file, header=0, index_col=0, sep=','))
        X = pd.concat(dfs, axis=0).T
        print(X.head(5))
        print(X.shape)
        fea_save_file = './fea/' + cancer_type + '.ae'
        start_time = time.time()
        vec = SubtypeGAN.feature_ae(X, n_components=100)
        df = pd.DataFrame(data=[time.time() - start_time])
        vec.to_csv(fea_tmp_file, header=True, index=True, sep='\t')
        out_file = './results/' + cancer_type + '.ae.time'
        df.to_csv(out_file, header=True, index=False, sep=',')
        if isfile(fea_save_file):
            X = pd.read_csv(fea_save_file, header=0, index_col=0, sep='\t')
            X['ae'] = SubtypeGAN.gmm(args.cluster_num).fit_predict(X.values) + 1
            X = X.loc[:, ['ae']]
            out_file = './results/' + cancer_type + '.ae'
            X.to_csv(out_file, header=True, index=True, sep='\t')
        else:
            print('file does not exist!')

    elif args.run_mode == 'vae':
        cancer_type = args.type
        if cancer_type not in cancer_dict and args.cluster_num == -1:
            print("Please set the number of clusters!")
        elif args.cluster_num == -1:
            args.cluster_num = cancer_dict[cancer_type]
        dfs = []
        for line in open(args.file_input, 'rt'):
            base_file = splitext(basename(line.rstrip()))[0]
            fea_tmp_file = './fea/' + cancer_type + '/' + base_file + '.fea'
            dfs.append(pd.read_csv(fea_tmp_file, header=0, index_col=0, sep=','))
        X = pd.concat(dfs, axis=0).T
        print(X.head(5))
        print(X.shape)
        fea_save_file = './fea/' + cancer_type + '.vae'
        start_time = time.time()
        vec = SubtypeGAN.feature_vae(X, n_components=100)
        df = pd.DataFrame(data=[time.time() - start_time])
        vec.to_csv(fea_tmp_file, header=True, index=True, sep='\t')
        out_file = './results/' + cancer_type + '.vae.time'
        df.to_csv(out_file, header=True, index=False, sep=',')
        if isfile(fea_save_file):
            X = pd.read_csv(fea_save_file, header=0, index_col=0, sep='\t')
            X['vae'] = SubtypeGAN.gmm(args.cluster_num).fit_predict(X.values) + 1
            X = X.loc[:, ['vae']]
            out_file = './results/' + cancer_type + '.vae'
            X.to_csv(out_file, header=True, index=True, sep='\t')
        else:
            print('file does not exist!')

    elif args.run_mode == 'cc':
        K1_dict = {'BRCA': 4, 'BLCA': 3, 'KIRC': 3,
                   'GBM': 2, 'LUAD': 3, 'PAAD': 2,
                   'SKCM': 3, 'STAD': 3, 'UCEC': 4, 'UVM': 2}
        K2_dict = {'BRCA': 8, 'BLCA': 6, 'KIRC': 6,
                   'GBM': 4, 'LUAD': 6, 'PAAD': 4,
                   'SKCM': 6, 'STAD': 6, 'UCEC': 8, 'UVM': 4}
        cancer_type = args.type
        base_file = splitext(basename(args.file_input))[0]
        fea_tmp_file = './fea/' + cancer_type + '.fea'
        fs = []
        cc_file = './results/k.cc'
        fp = open(cc_file, 'a')
        if isfile(fea_tmp_file):
            X = pd.read_csv(fea_tmp_file, header=0, index_col=0, sep='\t')
            cc = ConsensusCluster(SubtypeGAN.gmm, K1_dict[cancer_type], K2_dict[cancer_type], 10)
            cc.fit(X.values)
            X['cc'] = SubtypeGAN.gmm(cc.bestK).fit_predict(X.values) + 1
            X = X.loc[:, ['cc']]
            out_file = './results/' + cancer_type + '.cc'
            X.to_csv(out_file, header=True, index=True, sep='\t')
            fp.write("%s, k=%d\n" % (cancer_type, cc.bestK))
        else:
            print('file does not exist!')
        fp.close()


if __name__ == "__main__":
    main()