[53737a]: / simdeep / config.py

Download this file

181 lines (150 with data), 6.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from os.path import abspath
from os.path import split as pathsplit
# Absolute path of the directory holding this file
PATH_THIS_FILE = pathsplit(abspath(__file__))[0]

#################### SimDeep variable ##################
USE_R_PACKAGES_FOR_SURVIVAL = False

# Number of clusters
NB_CLUSTERS = 2

# Possible choices: ['mixture', 'kmeans', 'coxPH', 'coxPHMixture']
CLUSTER_METHOD = 'coxPHMixture'

# Possible choices: ['bic', 'silhouette', 'calinski']
CLUSTER_EVAL_METHOD = 'silhouette'

CLASSIFIER_TYPE = 'svm'

# Possible choices: ['Cox-PH', 'C-index']
NODES_SELECTION = 'Cox-PH'

# Possible choices: ['ALL_FEATURES', 'SURVIVAL_FEATURES']
CLASSIFICATION_METHOD = 'ALL_FEATURES'

FILL_UNKOWN_FEATURE_WITH_0 = True

# Construct autoencoders rather than using all the features
USE_AUTOENCODERS = True

# Refine features with survival analysis
FEATURE_SURV_ANALYSIS = True

# Number of top features selected for classification.
# Applies only when CLASSIFICATION_METHOD == 'ALL_FEATURES'
NB_SELECTED_FEATURES = 50

CLUSTER_ARRAY = []

# Threshold for survival significance to set a node as valid
PVALUE_THRESHOLD = 0.01

# Experimental
CINDEX_THRESHOLD = 0.65

NB_THREADS_COXPH = 10
STACK_MULTI_OMIC = False

#### Boosting values
# Number of boosting iterations
NB_ITER = 10
# Number of simdeep instances launched in parallel
NB_THREADS = 5
# For each instance, the original dataset is split in folds and one fold
# is left out
NB_FOLDS = 5
# 'mean' or 'max': the method used to select the final class, according
# to class probas
CLASS_SELECTION = 'mean'
# Cox-PH p-value threshold to reject a model for DeepProg Boosting module
MODEL_THRES = 0.05

#### SimDeep analysis
# Save fitted models
SAVE_FITTED_MODELS = False
# Load existing models if found
LOAD_EXISTING_MODELS = False
# Which omics to use for clustering.
# If empty, then all the available omics will be used
CLUSTERING_OMICS = []
########################################################
#################### Paths to data file ################
PROJECT_NAME = 'test_dummy_dataset'

# Folder containing the data, expressed relative to this file's directory.
# An absolute path can be used instead, e.g.:
#     "/home/opoirion/data/survival_analysis_multiple/"
PATH_DATA = ''.join((PATH_THIS_FILE, "/../examples/data/"))

# Name of the tsv file containing the survival data of the training set
SURVIVAL_TSV = 'survival_dummy.tsv'

# Name of the tsv file containing the survival data of the test set
SURVIVAL_TSV_TEST = 'survival_test_dummy.tsv'

# NOTE(review): the original comment here was left unfinished ("True if").
# Presumably set to True when the input matrices must be transposed --
# confirm against the code that reads this flag.
USE_INPUT_TRANSPOSE = False

ENTREZ_TO_ENSG_FILE = ''.join((PATH_THIS_FILE, '/../data/entrez2ensg.tsv'))
# Fields of the survival tsv file
SURVIVAL_FLAG = {
    'patient_id': 'barcode',
    'survival': 'days',
    'event': 'recurrence',
}

# Mapping {data type: name of the tsv file located inside PATH_DATA}.
# These data will be stacked together to build the autoencoder
TRAINING_TSV = {
    'GE': 'rna_dummy.tsv',
    'MIR': 'mir_dummy.tsv',
    'METH': 'meth_dummy.tsv',
}

TEST_TSV = {
    'MIR': 'mir_test_dummy.tsv',
}

# Default separator, followed by separators keyed by file name
# (presumably per-file overrides of DEFAULTSEP -- confirm against the loader)
DEFAULTSEP = '\t'

SEPARATOR = {
    '0717_methyl_cnv_inter_matrix.tsv': ' ',
    '0717_expr_methyl_inter_matrix.tsv': ' ',
    '0717_expr_cnv_inter_matrix.tsv': ' ',
}

# Path where the Keras models are saved to / loaded from
PATH_TO_SAVE_MODEL = './'

# Path where the png images are generated
PATH_RESULTS = './'
######## Cross-validation on the training set ############
# KFold splitter: 5 shuffled splits with a fixed random state
CROSS_VALIDATION_INSTANCE = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)
# NOTE(review): presumably the index of the fold held out as test fold --
# confirm against the code that reads it
TEST_FOLD = 0
##########################################################
########################################################
##################### NORMALIZATION PROCEDURE ###########
## Normalize before the autoencoder construction ########
NORMALIZATION = {
    # Variance selection of the features. 0 is all the features
    'NB_FEATURES_TO_KEEP': 100,
    'TRAIN_MIN_MAX': False,
    'TRAIN_ROBUST_SCALE': False,
    'TRAIN_ROBUST_SCALE_TWO_WAY': False,
    'TRAIN_MAD_SCALE': False,
    'TRAIN_QUANTILE_TRANSFORM': False,
    'TRAIN_NORM_SCALE': False,
    'TRAIN_RANK_NORM': True,
    'TRAIN_CORR_REDUCTION': True,
    'TRAIN_CORR_RANK_NORM': True,
}
#########################################################
##################### Autoencoder Variable ##############
# Dimensions of the intermediate layers placed before (LEVEL_DIMS_IN) and
# after (LEVEL_DIMS_OUT) the middle hidden layer.
# E.g. with dimensions [500, 250] there will be two hidden layers with 500
# and 250 nodes before and after the hidden middle layer (5 hidden layers).
# With empty dimensions, the autoencoder will have only one hidden layer
LEVEL_DIMS_IN = ()
LEVEL_DIMS_OUT = ()

# Number of nodes in the middle hidden layer
# (i.e. the new dimensions of the transformed data)
NEW_DIM = 100

# Proportion of edges dropped out at each training iteration
# (None for no dropout)
DROPOUT = 0.5

# L2 regularization constant on the node activity
ACT_REG = False

# L1 regularization constant on the weight
W_REG = False

# Fraction of the dataset to be used as test set when building the
# autoencoder
DATA_SPLIT = None

# Activation function
ACTIVATION = 'tanh'

# Number of epochs
EPOCHS = 10

# Loss function to minimize
LOSS = 'binary_crossentropy'

# Optimizer (sgd for Stochastic Gradient Descent)
OPTIMIZER = 'adam'
########################################################
################## CLASSIFIER ##########################
# Variables used to perform the supervised classification procedure
# that assigns labels to the test set
MIXTURE_PARAMS = {
    'covariance_type': 'diag',
    'max_iter': 1000,
    'n_init': 100,
}

# Hyper parameters used to perform the grid search to find the best
# classifier: one grid for the 'rbf' kernel, one for the 'linear' kernel
HYPER_PARAMETERS = [
    {
        'kernel': ['rbf'],
        'class_weight': [None, 'balanced'],
        'gamma': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001],
        'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
        'max_iter': [10000],
    },
    {
        'kernel': ['linear'],
        'class_weight': [None, 'balanced'],
        'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
        'max_iter': [10000],
    },
]
# Classifier used for the grid search: the Support Vector Machine
# Classifier (SVC). The class itself is bound here, not an instance.
CLASSIFIER = SVC
##########################################################
#################### Other variables #####################
# Seed value kept in one place for the whole configuration
SEED = 2020 # for experiment reproducibility (if set to an integer)
##########################################################