"""
This example details how to optimize the choice of the hyperparameters to cluster
a multi-omic dataset.
Multiple objective criteria can be used, such as model final cox-PH pvalues,
cox-PH pvalues for agglomerated out-of-bags samples, cluster consistency,
c-index for out-of-bags samples or for the full labels, mix score, or sum of the pvalues
Sum of pvalues formula:
sum_log_pval = - np.log10(1e-128 + full_model_pvalue) - np.log10(1e-128 + test_fold_pvalue)
Mix score formula:
mix_score = sum_log_pval * cluster_consistency * test_fold_cindex
"""
from os.path import abspath
from os.path import split
from simdeep.simdeep_tuning import SimDeepTuning
import ray
def test_instance():
    """
    Example of hyperparameter tuning with SimDeepTuning
    """
PATH_DATA = '{0}/../examples/data/'.format(split(abspath(__file__))[0])
    # Input files
TRAINING_TSV = {'RNA': 'rna_dummy.tsv', 'METH': 'meth_dummy.tsv'}
SURVIVAL_TSV = 'survival_dummy.tsv'
PROJECT_NAME = 'TestProjectTuning'
nb_threads = 2 # Number of processes to be used to fit individual survival models
    ### Below are examples of parameters that can be passed to the hyperparameter tuning
################ AUTOENCODER PARAMETERS ################
# LEVEL_DIMS_IN = [250]
# LEVEL_DIMS_OUT = [250]
# LOSS = 'binary_crossentropy'
# OPTIMIZER = 'adam'
# ACT_REG = 0
# W_REG = 0
# DROPOUT = 0.5
# DATA_SPLIT = 0
# ACTIVATION = 'tanh'
#########################################################
    ################ ADDITIONAL PARAMETERS ##################
    # stack_multi_omic=False,
    # level_dims_in=LEVEL_DIMS_IN,
    # level_dims_out=LEVEL_DIMS_OUT,
    # loss=LOSS,
    # optimizer=OPTIMIZER,
    # act_reg=ACT_REG,
    # w_reg=W_REG,
    # dropout=DROPOUT,
    # data_split=DATA_SPLIT,
    # activation=ACTIVATION,
    # path_to_save_model=PATH_TO_SAVE_MODEL,
    # pvalue_threshold=0.01,
    # nb_selected_features=10,
    # use_autoencoders=True,
    # feature_surv_analysis=True,
#########################################################
# ray.init(num_cpus=3)
    # AgglomerativeClustering is an external class that can be used as
    # a clustering algorithm because it implements a fit_predict method
    # (a sketch of a custom class with the same interface follows the dict below)
from sklearn.cluster import AgglomerativeClustering
args_to_optimize = {
'seed': [100, 200, 300, 400],
# 'nb_clusters': [2, 5],
'cluster_method': [
'mixture', # Gaussian mixture
'coxPHM', # coxPH fitting and dichotomization
'coxPHMixture', # coxPH fitting and gaussian mixture on the predicted time (1D)
AgglomerativeClustering # scikit-learn hierarchical clustering
],
        'normalization': ['default', 'alternative'],
# 'use_autoencoders': (True, False),
# 'class_selection': ('mean', 'max'),
}
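    # A minimal sketch of a custom clustering class (hypothetical, not part of
    # simdeep): any class exposing a fit_predict method that returns integer
    # labels could be listed in 'cluster_method' above, e.g.:
    # class MedianDichotomizer:
    #     """Toy 2-cluster example: split samples on their mean activity."""
    #     def __init__(self, n_clusters=2):
    #         self.n_clusters = n_clusters
    #     def fit_predict(self, matrix):
    #         import numpy as np
    #         means = np.asarray(matrix).mean(axis=1)
    #         # samples above the median mean activity form cluster 1
    #         return (means > np.median(means)).astype(int)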
    # Different normalizations can be tested
    from sklearn.preprocessing import RobustScaler
    # An external normalization class can also be used: it must implement
    # fit and fit_transform methods (see the sketch after the dict below)
normalization = {
'default': {
'NB_FEATURES_TO_KEEP': 100,
'TRAIN_RANK_NORM': True,
'TRAIN_CORR_REDUCTION': True,
'TRAIN_CORR_RANK_NORM': True,
},
'alternative': {
'CUSTOM': RobustScaler,
}
}
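    # A hypothetical sketch: since any class with fit and fit_transform
    # methods works, another scikit-learn scaler could be registered and
    # added to the 'normalization' values of args_to_optimize, e.g.:
    # from sklearn.preprocessing import StandardScaler
    # normalization['standard'] = {'CUSTOM': StandardScaler}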
tuning = SimDeepTuning(
args_to_optimize=args_to_optimize,
nb_threads=nb_threads,
survival_tsv=SURVIVAL_TSV,
training_tsv=TRAINING_TSV,
path_data=PATH_DATA,
project_name=PROJECT_NAME,
path_results=PATH_DATA,
normalization=normalization,
)
    # Possible metrics for evaluating the training set: {
# "cluster_consistency",
# "full_pvalue",
# "sum_log_pval",
# "test_fold_cindex",
# "test_fold_pval",
# "mix_score",
# }
ray.init(num_cpus=8)
tuning.fit(
metric='log_test_fold_pvalue',
num_samples=8,
max_concurrent=2,
distribute_deepprog=True,
        # iterations is useful to account for variation when fitting the DL model parameters
iterations=1)
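    # Any metric from the list above could be optimized instead (assuming its
    # name matches what SimDeepTuning.fit accepts), e.g.:
    # tuning.fit(
    #     metric='cluster_consistency',
    #     num_samples=8,
    #     max_concurrent=2,
    #     distribute_deepprog=True,
    #     iterations=1)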
table = tuning.get_results_table()
tuning.save_results_table()
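    # get_results_table presumably returns a pandas DataFrame (assumption); if
    # so, the tested configurations can be ranked by the objective, e.g.:
    # print(table.sort_values('log_test_fold_pvalue', ascending=False).head())
    # (the column name is assumed to match the metric passed to fit)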
ray.shutdown()
if __name__ == '__main__':
test_instance()