"""Default configuration constants for SimDeep / DeepProg.

Every value in this module is a module-level constant read by the rest of
the package. Override them by editing this file or by shadowing them at
run time before the models are built.
"""

from sklearn.svm import SVC
from sklearn.model_selection import KFold

from os.path import abspath
from os.path import split as pathsplit

# Absolute path of the directory containing this file; used to resolve
# the bundled example data and lookup tables below.
PATH_THIS_FILE = pathsplit(abspath(__file__))[0]

#################### SimDeep variable ##################

USE_R_PACKAGES_FOR_SURVIVAL = False
NB_CLUSTERS = 2  # Number of clusters
CLUSTER_METHOD = 'coxPHMixture'  # possible choice: ['mixture', 'kmeans', 'coxPH', 'coxPHMixture']
CLUSTER_EVAL_METHOD = 'silhouette'  # possible choice: ['bic', 'silhouette', 'calinski']
CLASSIFIER_TYPE = 'svm'
NODES_SELECTION = 'Cox-PH'  # possible choice: ['Cox-PH', 'C-index']
CLASSIFICATION_METHOD = 'ALL_FEATURES'  # ['ALL_FEATURES', 'SURVIVAL_FEATURES']
# NOTE: name keeps the historical spelling ("UNKOWN") because external code
# may reference it; fill features absent from a test matrix with 0.
FILL_UNKOWN_FEATURE_WITH_0 = True
USE_AUTOENCODERS = True  # Construct autoencoders rather than using all the features
FEATURE_SURV_ANALYSIS = True  # Refine feature with survival analysis

# Number of top features selected for classification
# Apply only when CLASSIFICATION_METHOD == 'ALL_FEATURES'
NB_SELECTED_FEATURES = 50

CLUSTER_ARRAY = []
PVALUE_THRESHOLD = 0.01  # Threshold for survival significance to set a node as valid
CINDEX_THRESHOLD = 0.65  # experimental
NB_THREADS_COXPH = 10
STACK_MULTI_OMIC = False

#### Boosting values
NB_ITER = 10  # boosting iteration
NB_THREADS = 5  # number of simdeep instances launched in parallel
NB_FOLDS = 5  # for each instance, the original dataset is split in folds and one fold is left
CLASS_SELECTION = 'mean'  # mean or max: the method used to select the final class, according to class probas
MODEL_THRES = 0.05  # Cox-PH p-value threshold to reject a model for DeepProg Boosting module

#### SimDeep analysis
# save fitted models
SAVE_FITTED_MODELS = False
# load existing models if found
LOAD_EXISTING_MODELS = False

# Which omics to use for clustering. If empty, then all the available omics will be used
CLUSTERING_OMICS = []
########################################################

#################### Paths to data file ################
# path to the folder containing the data

PROJECT_NAME = 'test_dummy_dataset'
PATH_DATA = PATH_THIS_FILE + "/../examples/data/"
# PATH_DATA = "/home/opoirion/data/survival_analysis_multiple/"

# name of the tsv file containing the survival data of the training set
SURVIVAL_TSV = 'survival_dummy.tsv'
# name of the tsv file containing the survival data of the test set
SURVIVAL_TSV_TEST = 'survival_test_dummy.tsv'

# True if the input matrices are transposed (features as rows)
# NOTE(review): original comment was truncated ("# True if") — semantics
# inferred from the name; confirm against the loader code.
USE_INPUT_TRANSPOSE = False

# Lookup table mapping Entrez gene IDs to Ensembl gene IDs.
ENTREZ_TO_ENSG_FILE = PATH_THIS_FILE + '/../data/entrez2ensg.tsv'

# Field names expected in the survival tsv file
SURVIVAL_FLAG = {
    'patient_id': 'barcode',
    'survival': 'days',
    'event': 'recurrence',
}

# dict('data type', 'name of the tsv file which are inside PATH_DATA')
# These data will be stacked together to build the autoencoder
TRAINING_TSV = {
    'GE': 'rna_dummy.tsv',
    'MIR': 'mir_dummy.tsv',
    'METH': 'meth_dummy.tsv',
}

TEST_TSV = {
    'MIR': 'mir_test_dummy.tsv',
}

# Default column separator for the tsv files; per-file overrides below.
DEFAULTSEP = '\t'

SEPARATOR = {
    '0717_methyl_cnv_inter_matrix.tsv': ' ',
    '0717_expr_methyl_inter_matrix.tsv': ' ',
    '0717_expr_cnv_inter_matrix.tsv': ' ',
}

# Path where to save/load the Keras models
PATH_TO_SAVE_MODEL = './'

# Path to generate png images
PATH_RESULTS = './'

######## Cross-validation on the training set ############
CROSS_VALIDATION_INSTANCE = KFold(n_splits=5, shuffle=True, random_state=1)

TEST_FOLD = 0
##########################################################
########################################################

##################### NORMALIZATION PROCEDURE ###########
## Normalize before the autoencoder construction ########
NORMALIZATION = {
    'NB_FEATURES_TO_KEEP': 100,  # variance selection features. 0 is all the features
    'TRAIN_MIN_MAX': False,
    'TRAIN_ROBUST_SCALE': False,
    'TRAIN_ROBUST_SCALE_TWO_WAY': False,
    'TRAIN_MAD_SCALE': False,
    'TRAIN_QUANTILE_TRANSFORM': False,
    'TRAIN_NORM_SCALE': False,
    'TRAIN_RANK_NORM': True,
    'TRAIN_CORR_REDUCTION': True,
    'TRAIN_CORR_RANK_NORM': True,
}
#########################################################

##################### Autoencoder Variable ##############
# Dimensions of the intermediate layers before and after the middle hidden layer
# if LEVEL_DIMS == [500, 250] then there will be two hidden layers with 500 and 250 nodes
# before and after the hidden middle layer (5 hidden layers)
# if LEVEL_DIMS = [], then the autoencoder will have only one hidden layer
LEVEL_DIMS_IN = ()
LEVEL_DIMS_OUT = ()
# Number of nodes in the middle hidden layer
# (i.e. the new dimensions of the transformed data)
NEW_DIM = 100
# Percentage of edges being dropout at each training iteration (None for no dropout)
DROPOUT = 0.5
# L2 Regularization constant on the node activity
ACT_REG = False
# L1 Regularization constant on the weight
W_REG = False
# Fraction of the dataset to be used as test set when building the autoencoder
DATA_SPLIT = None
# activation function
ACTIVATION = 'tanh'
# Number of epochs
EPOCHS = 10
# Loss function to minimize
LOSS = 'binary_crossentropy'
# Optimizer (sgd for Stochastic Gradient Descent)
OPTIMIZER = 'adam'
########################################################

################## CLASSIFIER ##########################
# Variables used to perform the supervised classification procedure
# to assign labels to the test set

MIXTURE_PARAMS = {
    'covariance_type': 'diag',
    'max_iter': 1000,
    'n_init': 100,
}

# Hyper parameters used to perform the grid search to find the best classifier
HYPER_PARAMETERS = [
    {'kernel': ['rbf'],
     'class_weight': [None, 'balanced'],
     'gamma': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001],
     'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
     'max_iter': [10000],
     },
    {'kernel': ['linear'],
     'class_weight': [None, 'balanced'],
     'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
     'max_iter': [10000],
     }
]

# grid search classifier using Support Vector Machine Classifier (SVC)
CLASSIFIER = SVC
##########################################################

#################### Other variables #####################
SEED = 2020  # for experiment reproducibility (if set to an integer)
##########################################################