DeepProg / Git / [53737a] /simdeep/config.py

Models:
AlyssaS/
DeepProg
Downloads: 1
[53737a]: / simdeep / config.py
History
Download this file
181 lines (150 with data), 6.3 kB

from sklearn.svm import SVC
from sklearn.model_selection import KFold

from os.path import abspath
from os.path import split as pathsplit

# Absolute path of the directory containing this file
# (head of the split absolute path, i.e. without the file name itself)
PATH_THIS_FILE = pathsplit(abspath(__file__))[0]

#################### SimDeep variable ##################

# NOTE(review): presumably switches the survival analysis to the R packages
# when True -- inferred from the name only, confirm
USE_R_PACKAGES_FOR_SURVIVAL = False

# Number of clusters
NB_CLUSTERS = 2

# Clustering method.
# Possible choices: ['mixture', 'kmeans', 'coxPH', 'coxPHMixture']
CLUSTER_METHOD = "coxPHMixture"

# Method used to evaluate the clustering.
# Possible choices: ['bic', 'silhouette', 'calinski']
CLUSTER_EVAL_METHOD = "silhouette"

# Type of classifier
CLASSIFIER_TYPE = "svm"

# Criterion used to select the nodes.
# Possible choices: ['Cox-PH', 'C-index']
NODES_SELECTION = "Cox-PH"

# Features used for the classification.
# Possible choices: ['ALL_FEATURES', 'SURVIVAL_FEATURES']
CLASSIFICATION_METHOD = "ALL_FEATURES"

# NOTE(review): presumably fills the features that are unknown with 0 --
# confirm. ("UNKOWN" is misspelled, but the name is part of the public config)
FILL_UNKOWN_FEATURE_WITH_0 = True

# Construct autoencoders rather than using all the features
USE_AUTOENCODERS = True

# Refine the features with survival analysis
FEATURE_SURV_ANALYSIS = True

# Number of top features selected for classification.
# Applies only when CLASSIFICATION_METHOD == 'ALL_FEATURES'
NB_SELECTED_FEATURES = 50

CLUSTER_ARRAY = []

# Threshold for survival significance to set a node as valid
PVALUE_THRESHOLD = 0.01

# C-index threshold (experimental)
CINDEX_THRESHOLD = 0.65

# NOTE(review): presumably the number of threads used for the Cox-PH
# computations -- confirm
NB_THREADS_COXPH = 10

STACK_MULTI_OMIC = False

#### Boosting values
# Number of boosting iterations
NB_ITER = 10

# Number of simdeep instances launched in parallel
NB_THREADS = 5

# For each instance, the original dataset is split in folds and one fold is
# left out
NB_FOLDS = 5

# Method used to select the final class, according to the class
# probabilities: 'mean' or 'max'
CLASS_SELECTION = "mean"

# Cox-PH p-value threshold to reject a model for the DeepProg Boosting module
MODEL_THRES = 0.05

#### SimDeep analysis
# Save the fitted models
SAVE_FITTED_MODELS = False

# Load the existing models if found
LOAD_EXISTING_MODELS = False

# Omics to use for clustering.
# If empty, all the available omics will be used
CLUSTERING_OMICS = []
########################################################

#################### Paths to data file ################
# Name of the project
PROJECT_NAME = "test_dummy_dataset"

# Path to the folder containing the data, built from PATH_THIS_FILE
PATH_DATA = PATH_THIS_FILE + '/../examples/data/'
# Example of an absolute location:
# PATH_DATA = "/home/opoirion/data/survival_analysis_multiple/"

# Name of the tsv file containing the survival data of the training set
SURVIVAL_TSV = "survival_dummy.tsv"

# Name of the tsv file containing the survival data of the test set
SURVIVAL_TSV_TEST = "survival_test_dummy.tsv"

# NOTE(review): the original comment was truncated ("True if"); presumably
# set to True when the input matrices must be transposed -- confirm
USE_INPUT_TRANSPOSE = False

# Path of the entrez2ensg.tsv file, built from PATH_THIS_FILE
ENTREZ_TO_ENSG_FILE = PATH_THIS_FILE + "/../data/entrez2ensg.tsv"

# Fields of the survival tsv file: maps each role (patient id, survival,
# event) to the name of the corresponding field
SURVIVAL_FLAG = {
    "patient_id": "barcode",
    "survival": "days",
    "event": "recurrence",
}

# Maps each data type to the name of its tsv file (located inside PATH_DATA).
# These data will be stacked together to build the autoencoder
TRAINING_TSV = {
    "GE": "rna_dummy.tsv",
    "MIR": "mir_dummy.tsv",
    "METH": "meth_dummy.tsv",
}

# Same kind of mapping (data type -> tsv file name) for the test files
TEST_TSV = {"MIR": "mir_test_dummy.tsv"}

# Default field separator (tab)
DEFAULTSEP = "\t"

# Maps a file name to its own separator (a single space for these files).
# NOTE(review): presumably overrides DEFAULTSEP for the listed files -- confirm
SEPARATOR = {
    "0717_methyl_cnv_inter_matrix.tsv": " ",
    "0717_expr_methyl_inter_matrix.tsv": " ",
    "0717_expr_cnv_inter_matrix.tsv": " ",
}

# Path where the Keras models are saved to and loaded from
PATH_TO_SAVE_MODEL = "./"

# Path where the png images are generated
PATH_RESULTS = "./"

######## Cross-validation on the training set ############
# Splitter used for the cross-validation on the training set:
# 5 shuffled folds with a fixed random state
CROSS_VALIDATION_INSTANCE = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

# NOTE(review): presumably the index of the fold used as test fold -- confirm
TEST_FOLD = 0
##########################################################
########################################################

##################### NORMALIZATION PROCEDURE ###########
## Normalize before the autoencoder construction ########
# Normalization options. Apart from NB_FEATURES_TO_KEEP, every entry is a
# boolean switch; only the rank-norm and correlation based ones are set to True
NORMALIZATION = {
    # Variance selection of the features. 0 is all the features
    "NB_FEATURES_TO_KEEP": 100,
    "TRAIN_MIN_MAX": False,
    "TRAIN_ROBUST_SCALE": False,
    "TRAIN_ROBUST_SCALE_TWO_WAY": False,
    "TRAIN_MAD_SCALE": False,
    "TRAIN_QUANTILE_TRANSFORM": False,
    "TRAIN_NORM_SCALE": False,
    "TRAIN_RANK_NORM": True,
    "TRAIN_CORR_REDUCTION": True,
    "TRAIN_CORR_RANK_NORM": True,
}
#########################################################

##################### Autoencoder Variable ##############
# Dimensions of the intermediate layers before (LEVEL_DIMS_IN) and after
# (LEVEL_DIMS_OUT) the middle hidden layer.
# E.g. with (500, 250) on both sides there will be two hidden layers with
# 500 and 250 nodes before and after the middle hidden layer (5 hidden layers).
# When left empty, the autoencoder will have only one hidden layer.
# NOTE(review): the original comment referred to a single `LEVEL_DIMS`
# variable -- confirm that the description applies to each side
LEVEL_DIMS_IN = ()
LEVEL_DIMS_OUT = ()

# Number of nodes in the middle hidden layer
# (i.e. the new dimensions of the transformed data)
NEW_DIM = 100

# Fraction of edges being dropped out at each training iteration
# (None for no dropout)
DROPOUT = 0.5

# L2 regularization constant on the node activity
# NOTE(review): False presumably disables it -- confirm
ACT_REG = False

# L1 regularization constant on the weights
# NOTE(review): False presumably disables it -- confirm
W_REG = False

# Fraction of the dataset to be used as test set when building the autoencoder
DATA_SPLIT = None

# Activation function
ACTIVATION = "tanh"

# Number of epochs
EPOCHS = 10

# Loss function to minimize
LOSS = "binary_crossentropy"

# Optimizer (sgd for Stochastic Gradient Descent)
OPTIMIZER = "adam"
########################################################

################## CLASSIFIER ##########################
# Variables used to perform the supervised classification procedure
# that assigns labels to the test set

# Parameters of the mixture model.
# NOTE(review): presumably forwarded as keyword arguments to the mixture
# model used for clustering -- confirm against the caller
MIXTURE_PARAMS = {
    "covariance_type": "diag",
    "max_iter": 1000,
    "n_init": 100,
}

# Hyper-parameters used to perform the grid search that finds the best
# classifier: one grid per kernel, each mapping a parameter name to the list
# of candidate values
HYPER_PARAMETERS = [
    # Grid for the 'rbf' kernel
    {
        "kernel": ["rbf"],
        "class_weight": [None, "balanced"],
        "gamma": [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001],
        "C": [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
        "max_iter": [10000],
    },
    # Grid for the 'linear' kernel (no 'gamma' entry)
    {
        "kernel": ["linear"],
        "class_weight": [None, "balanced"],
        "C": [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
        "max_iter": [10000],
    },
]

# Classifier used for the grid search: the Support Vector Machine
# Classifier (SVC) imported from sklearn.svm.
# The class itself is stored here, not an instance.
CLASSIFIER = SVC
##########################################################

#################### Other variables #####################
# Seed for experiment reproducibility (if set to an integer)
SEED = 2020
##########################################################