"""Default configuration constants for SimDeep (``simdeep/config.py``).

The module only defines module-level constants, grouped by topic:
clustering / boosting options, data paths, normalization, autoencoder
hyper-parameters and the supervised classifier grid.
"""
from os.path import abspath
from os.path import split as pathsplit

from sklearn.model_selection import KFold
from sklearn.svm import SVC

# Absolute path of the directory containing this file
PATH_THIS_FILE = pathsplit(abspath(__file__))[0]

# ---------------------------------------------------------------------------
# SimDeep variables
# ---------------------------------------------------------------------------
USE_R_PACKAGES_FOR_SURVIVAL = False

# Number of clusters
NB_CLUSTERS = 2
# Possible choices: ['mixture', 'kmeans', 'coxPH', 'coxPHMixture']
CLUSTER_METHOD = 'coxPHMixture'
# Possible choices: ['bic', 'silhouette', 'calinski']
CLUSTER_EVAL_METHOD = 'silhouette'
CLASSIFIER_TYPE = 'svm'
# Possible choices: ['Cox-PH', 'C-index']
NODES_SELECTION = 'Cox-PH'
# Possible choices: ['ALL_FEATURES', 'SURVIVAL_FEATURES']
CLASSIFICATION_METHOD = 'ALL_FEATURES'
# NOTE: "UNKOWN" is misspelled, but the name is kept unchanged because it is
# part of this module's public interface.
FILL_UNKOWN_FEATURE_WITH_0 = True
# Construct autoencoders rather than using all the features
USE_AUTOENCODERS = True
# Refine features with survival analysis
FEATURE_SURV_ANALYSIS = True

# Number of top features selected for classification.
# Applies only when CLASSIFICATION_METHOD == 'ALL_FEATURES'.
NB_SELECTED_FEATURES = 50

CLUSTER_ARRAY = []
# Threshold for survival significance to set a node as valid
PVALUE_THRESHOLD = 0.01
# Experimental
CINDEX_THRESHOLD = 0.65
NB_THREADS_COXPH = 10
STACK_MULTI_OMIC = False

# ---- Boosting values ------------------------------------------------------
# Number of boosting iterations
NB_ITER = 10
# Number of simdeep instances launched in parallel
NB_THREADS = 5
# For each instance, the original dataset is split in folds and one fold
# is left out
NB_FOLDS = 5
# 'mean' or 'max': the method used to select the final class, according to
# the class probabilities
CLASS_SELECTION = 'mean'
# Cox-PH p-value threshold to reject a model for the DeepProg Boosting module
MODEL_THRES = 0.05

# ---- SimDeep analysis -----------------------------------------------------
# Save fitted models
SAVE_FITTED_MODELS = False
# Load existing models if found
LOAD_EXISTING_MODELS = False

# Which omics to use for clustering.
# If empty, all the available omics will be used.
CLUSTERING_OMICS = []

# ---------------------------------------------------------------------------
# Paths to data files
# ---------------------------------------------------------------------------
PROJECT_NAME = 'test_dummy_dataset'
# Path to the folder containing the data
PATH_DATA = PATH_THIS_FILE + "/../examples/data/"

# Name of the tsv file containing the survival data of the training set
SURVIVAL_TSV = 'survival_dummy.tsv'
# Name of the tsv file containing the survival data of the test set
SURVIVAL_TSV_TEST = 'survival_test_dummy.tsv'

# NOTE(review): the comment that documented this flag was truncated
# ("True if"); presumably it tells the loader that the input matrices are
# transposed -- confirm against the code reading it.
USE_INPUT_TRANSPOSE = False

ENTREZ_TO_ENSG_FILE = PATH_THIS_FILE + '/../data/entrez2ensg.tsv'

# Fields from the survival tsv file
SURVIVAL_FLAG = {
    'patient_id': 'barcode',
    'survival': 'days',
    'event': 'recurrence',
}

# Maps a data type to the name of its tsv file located inside PATH_DATA.
# These data will be stacked together to build the autoencoder.
TRAINING_TSV = {
    'GE': 'rna_dummy.tsv',
    'MIR': 'mir_dummy.tsv',
    'METH': 'meth_dummy.tsv',
}

TEST_TSV = {
    'MIR': 'mir_test_dummy.tsv',
}

DEFAULTSEP = '\t'

# Separators keyed by file name
# (presumably overriding DEFAULTSEP for the listed files -- confirm).
SEPARATOR = {
    '0717_methyl_cnv_inter_matrix.tsv': ' ',
    '0717_expr_methyl_inter_matrix.tsv': ' ',
    '0717_expr_cnv_inter_matrix.tsv': ' ',
}

# Path where the Keras models are saved / loaded
PATH_TO_SAVE_MODEL = './'

# Path where png images are generated
PATH_RESULTS = './'

# ---- Cross-validation on the training set ---------------------------------
CROSS_VALIDATION_INSTANCE = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

TEST_FOLD = 0

# ---------------------------------------------------------------------------
# Normalization procedure (applied before the autoencoder construction)
# ---------------------------------------------------------------------------
NORMALIZATION = {
    # Variance-based feature selection; 0 keeps all the features
    'NB_FEATURES_TO_KEEP': 100,
    'TRAIN_MIN_MAX': False,
    'TRAIN_ROBUST_SCALE': False,
    'TRAIN_ROBUST_SCALE_TWO_WAY': False,
    'TRAIN_MAD_SCALE': False,
    'TRAIN_QUANTILE_TRANSFORM': False,
    'TRAIN_NORM_SCALE': False,
    'TRAIN_RANK_NORM': True,
    'TRAIN_CORR_REDUCTION': True,
    'TRAIN_CORR_RANK_NORM': True,
}

# ---------------------------------------------------------------------------
# Autoencoder variables
# ---------------------------------------------------------------------------
# Dimensions of the intermediate layers before (LEVEL_DIMS_IN) and after
# (LEVEL_DIMS_OUT) the middle hidden layer. For instance, with [500, 250]
# there will be two hidden layers with 500 and 250 nodes before and after
# the middle hidden layer (5 hidden layers). When empty, the autoencoder
# has only one hidden layer.
LEVEL_DIMS_IN = ()
LEVEL_DIMS_OUT = ()
# Number of nodes in the middle hidden layer
# (i.e. the new dimensions of the transformed data)
NEW_DIM = 100
# Dropout applied to the edges at each training iteration
# (None for no dropout)
DROPOUT = 0.5
# L2 regularization constant on the node activity
ACT_REG = False
# L1 regularization constant on the weights
W_REG = False
# Fraction of the dataset to be used as test set when building the
# autoencoder
DATA_SPLIT = None
# Activation function
ACTIVATION = 'tanh'
# Number of epochs
EPOCHS = 10
# Loss function to minimize
LOSS = 'binary_crossentropy'
# Optimizer (sgd for Stochastic Gradient Descent)
OPTIMIZER = 'adam'

# ---------------------------------------------------------------------------
# Classifier
# ---------------------------------------------------------------------------
# Variables used to perform the supervised classification procedure
# that assigns labels to the test set

MIXTURE_PARAMS = {
    'covariance_type': 'diag',
    'max_iter': 1000,
    'n_init': 100,
}

# Hyper-parameters used to perform the grid search to find the best
# classifier
HYPER_PARAMETERS = [
    {
        'kernel': ['rbf'],
        'class_weight': [None, 'balanced'],
        'gamma': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001],
        'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
        'max_iter': [10000],
    },
    {
        'kernel': ['linear'],
        'class_weight': [None, 'balanced'],
        'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
        'max_iter': [10000],
    },
]

# Grid-search classifier: Support Vector Machine Classifier (SVC)
CLASSIFIER = SVC

# ---------------------------------------------------------------------------
# Other variables
# ---------------------------------------------------------------------------
# For experiment reproducibility (if set to an integer)
SEED = 2020