Diff of /simdeep/config.py [000000] .. [53737a]

Switch to side-by-side view

--- a
+++ b/simdeep/config.py
@@ -0,0 +1,180 @@
+from sklearn.svm import SVC
+from sklearn.model_selection import KFold
+
+from os.path import abspath
+from os.path import split as pathsplit
+
+# absolute path of this file
+PATH_THIS_FILE = pathsplit(abspath(__file__))[0]
+
+#################### SimDeep variable ##################
+
+USE_R_PACKAGES_FOR_SURVIVAL = False
+NB_CLUSTERS = 2 # Number of clusters
+CLUSTER_METHOD = 'coxPHMixture' # possible choice: ['mixture', 'kmeans', 'coxPH', 'coxPHMixture']
+CLUSTER_EVAL_METHOD = 'silhouette' # possible choice: ['bic', 'silhouette', 'calinski']
+CLASSIFIER_TYPE = 'svm'
+NODES_SELECTION = 'Cox-PH' # possible choice: ['Cox-PH', 'C-index']
+CLASSIFICATION_METHOD = 'ALL_FEATURES' # ['ALL_FEATURES', 'SURVIVAL_FEATURES']
+FILL_UNKOWN_FEATURE_WITH_0 = True
+USE_AUTOENCODERS = True # Construct autoencoders rather than using all the features
+FEATURE_SURV_ANALYSIS = True # Refine feature with survival analysis
+
+# Number of top features selected for classification
+# Applies only when CLASSIFICATION_METHOD == 'ALL_FEATURES'
+NB_SELECTED_FEATURES = 50
+
+CLUSTER_ARRAY = []
+PVALUE_THRESHOLD = 0.01 # Threshold for survival significance to set a node as valid
+CINDEX_THRESHOLD = 0.65 # experimental
+NB_THREADS_COXPH = 10
+STACK_MULTI_OMIC = False
+
+#### Boosting values
+NB_ITER = 10 # boosting iteration
+NB_THREADS= 5 # number of simdeep instance launched in parallel
+NB_FOLDS = 5 # for each instance, the original dataset is split into folds and one fold is left out
+CLASS_SELECTION = 'mean' # mean or max: the method used to select the final class, according to class probas
+MODEL_THRES = 0.05 # Cox-PH p-value threshold to reject a model for DeepProg Boosting module
+
+#### SimDeep analysis
+# save fitted models
+SAVE_FITTED_MODELS = False
+# load existing models if found
+LOAD_EXISTING_MODELS = False
+
+# Which omics to use for clustering. If empty, then all the available omics will be used
+CLUSTERING_OMICS = []
+########################################################
+
+#################### Paths to data file ################
+# path to the folder containing the data
+
+PROJECT_NAME = 'test_dummy_dataset'
+PATH_DATA = PATH_THIS_FILE + "/../examples/data/"
+# PATH_DATA = "/home/opoirion/data/survival_analysis_multiple/"
+
+# name of the tsv file containing the survival data of the training set
+SURVIVAL_TSV = 'survival_dummy.tsv'
+# name of the tsv file containing the survival data of the test set
+SURVIVAL_TSV_TEST = 'survival_test_dummy.tsv'
+
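+# True if the input matrices should be transposed when loaded
+# (NOTE(review): the original comment was left unfinished -- confirm the exact semantics)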
+USE_INPUT_TRANSPOSE = False
+
+ENTREZ_TO_ENSG_FILE = PATH_THIS_FILE + '/../data/entrez2ensg.tsv'
+
+# Field from the survival tsv file
+SURVIVAL_FLAG = {'patient_id': 'barcode',
+                  'survival': 'days',
+                 'event': 'recurrence'}
+
+# dict mapping each data type to the name of its tsv file (located inside PATH_DATA)
+# These data will be stacked together to build the autoencoder
+TRAINING_TSV = {
+    'GE': 'rna_dummy.tsv',
+    'MIR': 'mir_dummy.tsv',
+    'METH': 'meth_dummy.tsv',
+}
+
+TEST_TSV = {
+    'MIR': 'mir_test_dummy.tsv',
+}
+
+DEFAULTSEP = '\t'
+
+SEPARATOR = {
+    '0717_methyl_cnv_inter_matrix.tsv' : ' ',
+    '0717_expr_methyl_inter_matrix.tsv': ' ',
+    '0717_expr_cnv_inter_matrix.tsv': ' ',
+    }
+
+# Path where to save / load the Keras models
+PATH_TO_SAVE_MODEL = './'
+
+# Path to generate png images
+PATH_RESULTS = './'
+
+######## Cross-validation on the training set ############
+CROSS_VALIDATION_INSTANCE = KFold(n_splits=5, shuffle=True, random_state=1)
+
+TEST_FOLD = 0
+##########################################################
+########################################################
+
+##################### NORMALIZATION PROCEDURE ###########
+## Normalize before the autoencoder construction ########
+NORMALIZATION = {
+    'NB_FEATURES_TO_KEEP': 100, # number of features kept by variance selection. 0 keeps all the features
+    'TRAIN_MIN_MAX': False,
+    'TRAIN_ROBUST_SCALE': False,
+    'TRAIN_ROBUST_SCALE_TWO_WAY': False,
+    'TRAIN_MAD_SCALE': False,
+    'TRAIN_QUANTILE_TRANSFORM': False,
+    'TRAIN_NORM_SCALE': False,
+    'TRAIN_RANK_NORM': True,
+    'TRAIN_CORR_REDUCTION': True,
+    'TRAIN_CORR_RANK_NORM': True,
+}
+#########################################################
+
+##################### Autoencoder Variable ##############
+# Dimensions of the intermediate layers before and after the middle hidden layer
+# (presumably LEVEL_DIMS_IN for the layers before and LEVEL_DIMS_OUT for the layers after -- TODO confirm)
+# e.g. with dimensions [500, 250] there will be two hidden layers with 500 and 250 nodes
+# before and after the middle hidden layer (5 hidden layers in total)
+# if the dimensions are empty, then the autoencoder will have only one hidden layer
+LEVEL_DIMS_IN = ()
+LEVEL_DIMS_OUT = ()
+# Number of nodes in the middle hidden layer
+# (i.e. the new dimensions of the transformed data)
+NEW_DIM = 100
+# Fraction of edges dropped out at each training iteration (None for no dropout)
+DROPOUT = 0.5
+# L2 Regularization constant on the node activity
+ACT_REG = False
+# L1 Regularization constant on the weight
+W_REG = False
+# Fraction of the dataset to be used as test set when building the autoencoder
+DATA_SPLIT = None
+# activation function
+ACTIVATION = 'tanh'
+# Number of epochs
+EPOCHS = 10
+# Loss function to minimize
+LOSS = 'binary_crossentropy'
+# Optimizer (sgd for Stochastic Gradient Descent)
+OPTIMIZER = 'adam'
+########################################################
+
+################## CLASSIFIER ##########################
+# Variables used to perform the supervised classification procedure
+# to assign labels to the test set
+
+MIXTURE_PARAMS = {
+    'covariance_type': 'diag',
+    'max_iter': 1000,
+    'n_init': 100
+    }
+
+# Hyper parameters used to perform the grid search to find the best classifier
+HYPER_PARAMETERS = [
+    {'kernel': ['rbf'],
+     'class_weight': [None, 'balanced'],
+     'gamma': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001],
+     'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
+     'max_iter':[10000],
+     },
+    {'kernel': ['linear'],
+     'class_weight': [None, 'balanced'],
+     'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
+     'max_iter':[10000],
+     }
+]
+
+# grid search classifier using Support Vector Machine Classifier (SVC)
+CLASSIFIER = SVC
+##########################################################
+
+#################### Other variables #####################
+SEED = 2020 # for experiment reproducibility (if set to an integer)
+##########################################################