simdeep/config.py

from sklearn.svm import SVC
from sklearn.model_selection import KFold

from os.path import abspath
from os.path import split as pathsplit

# absolute path of the folder containing this file
PATH_THIS_FILE = pathsplit(abspath(__file__))[0]

#################### SimDeep variable ##################

USE_R_PACKAGES_FOR_SURVIVAL = False
NB_CLUSTERS = 2 # Number of clusters
CLUSTER_METHOD = 'coxPHMixture' # possible choice: ['mixture', 'kmeans', 'coxPH', 'coxPHMixture']
CLUSTER_EVAL_METHOD = 'silhouette' # possible choice: ['bic', 'silhouette', 'calinski']
CLASSIFIER_TYPE = 'svm'
NODES_SELECTION = 'Cox-PH' # possible choice: ['Cox-PH', 'C-index']
CLASSIFICATION_METHOD = 'ALL_FEATURES' # ['ALL_FEATURES', 'SURVIVAL_FEATURES']
FILL_UNKOWN_FEATURE_WITH_0 = True
USE_AUTOENCODERS = True # Construct autoencoders rather than using all the features
FEATURE_SURV_ANALYSIS = True # Refine features with survival analysis

# Number of top features selected for classification
# Applies only when CLASSIFICATION_METHOD == 'ALL_FEATURES'
NB_SELECTED_FEATURES = 50

CLUSTER_ARRAY = []
PVALUE_THRESHOLD = 0.01 # Threshold for survival significance to set a node as valid
CINDEX_THRESHOLD = 0.65 # experimental
NB_THREADS_COXPH = 10
STACK_MULTI_OMIC = False

#### Boosting values
NB_ITER = 10 # number of boosting iterations
NB_THREADS = 5 # number of simdeep instances launched in parallel
NB_FOLDS = 5 # for each instance, the original dataset is split into folds and one fold is left out
CLASS_SELECTION = 'mean' # mean or max: the method used to select the final class, according to class probas
MODEL_THRES = 0.05 # Cox-PH p-value threshold to reject a model for the DeepProg Boosting module
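
# Illustrative sketch (not part of the original config): assuming `probas` is a
# numpy array of shape (n_models, n_samples, n_classes) holding each boosting
# model's class probabilities, CLASS_SELECTION could be applied roughly like:
#
#     import numpy as np
#     if CLASS_SELECTION == 'mean':
#         final_labels = probas.mean(axis=0).argmax(axis=1)
#     elif CLASS_SELECTION == 'max':
#         final_labels = probas.max(axis=0).argmax(axis=1)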

#### SimDeep analysis
# save fitted models
SAVE_FITTED_MODELS = False
# load existing models if found
LOAD_EXISTING_MODELS = False

# Which omics to use for clustering. If empty, all the available omics will be used
CLUSTERING_OMICS = []
########################################################

#################### Paths to data file ################
# path to the folder containing the data

PROJECT_NAME = 'test_dummy_dataset'
PATH_DATA = PATH_THIS_FILE + "/../examples/data/"
# PATH_DATA = "/home/opoirion/data/survival_analysis_multiple/"

# name of the tsv file containing the survival data of the training set
SURVIVAL_TSV = 'survival_dummy.tsv'
# name of the tsv file containing the survival data of the test set
SURVIVAL_TSV_TEST = 'survival_test_dummy.tsv'

# True if the input matrices need to be transposed
USE_INPUT_TRANSPOSE = False

ENTREZ_TO_ENSG_FILE = PATH_THIS_FILE + '/../data/entrez2ensg.tsv'

# Fields from the survival tsv file
SURVIVAL_FLAG = {'patient_id': 'barcode',
                 'survival': 'days',
                 'event': 'recurrence'}
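
# Illustrative sketch (not part of the original config): with the flags above,
# the survival tsv is expected to contain at least these columns (sample names
# are made up):
#
#     barcode     days    recurrence
#     sample_1    340     1
#     sample_2    1204    0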

# dict mapping each data type to the name of a tsv file inside PATH_DATA
# These data will be stacked together to build the autoencoder
TRAINING_TSV = {
    'GE': 'rna_dummy.tsv',
    'MIR': 'mir_dummy.tsv',
    'METH': 'meth_dummy.tsv',
}

TEST_TSV = {
    'MIR': 'mir_test_dummy.tsv',
}

DEFAULTSEP = '\t'

SEPARATOR = {
    '0717_methyl_cnv_inter_matrix.tsv': ' ',
    '0717_expr_methyl_inter_matrix.tsv': ' ',
    '0717_expr_cnv_inter_matrix.tsv': ' ',
}
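
# Illustrative sketch (not part of the original config): a loader would
# typically resolve the separator for a given file name like this:
#
#     sep = SEPARATOR.get(tsv_file_name, DEFAULTSEP)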

# Path where to save / load the Keras models
PATH_TO_SAVE_MODEL = './'

# Path where to generate the png images
PATH_RESULTS = './'

######## Cross-validation on the training set ############
CROSS_VALIDATION_INSTANCE = KFold(n_splits=5, shuffle=True, random_state=1)

TEST_FOLD = 0
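
# Illustrative sketch (not part of the original config): TEST_FOLD selects one
# of the KFold splits, e.g. for a dataset with `nb_samples` samples:
#
#     splits = list(CROSS_VALIDATION_INSTANCE.split(range(nb_samples)))
#     train_index, test_index = splits[TEST_FOLD]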
##########################################################
########################################################

##################### NORMALIZATION PROCEDURE ###########
## Normalize before the autoencoder construction ########
NORMALIZATION = {
    'NB_FEATURES_TO_KEEP': 100, # number of features kept by variance selection (0 keeps all features)
    'TRAIN_MIN_MAX': False,
    'TRAIN_ROBUST_SCALE': False,
    'TRAIN_ROBUST_SCALE_TWO_WAY': False,
    'TRAIN_MAD_SCALE': False,
    'TRAIN_QUANTILE_TRANSFORM': False,
    'TRAIN_NORM_SCALE': False,
    'TRAIN_RANK_NORM': True,
    'TRAIN_CORR_REDUCTION': True,
    'TRAIN_CORR_RANK_NORM': True,
}
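
# Illustrative sketch (not part of the original config): assuming
# TRAIN_RANK_NORM replaces each sample's feature values by their ranks
# (ranking per sample is an assumption here), a minimal version could be:
#
#     import numpy as np
#     from scipy.stats import rankdata
#     ranked = np.apply_along_axis(rankdata, 1, matrix)  # `matrix`: samples x features (hypothetical)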
#########################################################

##################### Autoencoder Variable ##############
# Dimensions of the intermediate layers before and after the middle hidden layer
# if LEVEL_DIMS_IN == [500, 250] then there will be two hidden layers with 500 and 250 nodes
# before and after the middle hidden layer (5 hidden layers in total)
# if LEVEL_DIMS_IN == LEVEL_DIMS_OUT == [], the autoencoder will have only one hidden layer
LEVEL_DIMS_IN = ()
LEVEL_DIMS_OUT = ()
# Number of nodes in the middle hidden layer
# (i.e. the new dimensions of the transformed data)
NEW_DIM = 100
# Fraction of edges dropped out at each training iteration (None for no dropout)
DROPOUT = 0.5
# L2 regularization constant on the node activity (False disables it)
ACT_REG = False
# L1 regularization constant on the weights (False disables it)
W_REG = False
# Fraction of the dataset to be used as test set when building the autoencoder
DATA_SPLIT = None
# activation function
ACTIVATION = 'tanh'
# Number of epochs
EPOCHS = 10
# Loss function to minimize
LOSS = 'binary_crossentropy'
# Optimizer (e.g. 'sgd' for Stochastic Gradient Descent)
OPTIMIZER = 'adam'
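
# Illustrative sketch (not part of the original config): the variables above
# could translate into a Keras autoencoder roughly as follows (a simplified
# sketch; `X` and `nb_features` are hypothetical, and the real builder lives
# in the simdeep codebase):
#
#     from keras.models import Sequential
#     from keras.layers import Dense, Dropout
#
#     model = Sequential()
#     for dim in LEVEL_DIMS_IN:
#         model.add(Dense(dim, activation=ACTIVATION))
#     model.add(Dense(NEW_DIM, activation=ACTIVATION))  # middle hidden layer
#     if DROPOUT:
#         model.add(Dropout(DROPOUT))
#     for dim in LEVEL_DIMS_OUT:
#         model.add(Dense(dim, activation=ACTIVATION))
#     model.add(Dense(nb_features, activation=ACTIVATION))  # reconstruction layer
#     model.compile(optimizer=OPTIMIZER, loss=LOSS)
#     model.fit(X, X, epochs=EPOCHS, validation_split=DATA_SPLIT or 0.0)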
########################################################

################## CLASSIFIER ##########################
# Variables used to perform the supervised classification procedure
# to assign labels to the test set

MIXTURE_PARAMS = {
    'covariance_type': 'diag',
    'max_iter': 1000,
    'n_init': 100
}
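
# Illustrative sketch (not part of the original config): for the 'mixture'
# clustering method, these parameters would typically feed a Gaussian mixture
# model (`encoded_matrix` is hypothetical):
#
#     from sklearn.mixture import GaussianMixture
#     gmm = GaussianMixture(n_components=NB_CLUSTERS, **MIXTURE_PARAMS)
#     labels = gmm.fit_predict(encoded_matrix)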

# Hyperparameters used to perform the grid search to find the best classifier
HYPER_PARAMETERS = [
    {'kernel': ['rbf'],
     'class_weight': [None, 'balanced'],
     'gamma': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001],
     'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
     'max_iter': [10000],
    },
    {'kernel': ['linear'],
     'class_weight': [None, 'balanced'],
     'C': [1000, 750, 500, 250, 100, 50, 10, 5, 1, 0.1],
     'max_iter': [10000],
    }
]

# grid search classifier using a Support Vector Machine Classifier (SVC)
CLASSIFIER = SVC
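
# Illustrative sketch (not part of the original config): a grid search over
# HYPER_PARAMETERS could be wired up as follows (`X_train` and `labels` are
# hypothetical):
#
#     from sklearn.model_selection import GridSearchCV
#     grid = GridSearchCV(CLASSIFIER(probability=True), HYPER_PARAMETERS, cv=5)
#     grid.fit(X_train, labels)
#     best_svm = grid.best_estimator_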
##########################################################

#################### Other variables #####################
SEED = 2020 # for experiment reproducibility (if set to an integer)
##########################################################