Diff of /config.conf [000000] .. [ce076b]

Switch to unified view

a b/config.conf
1
# #### BioDiscML config file #### #
2
# See https://github.com/mickaelleclercq/BioDiscML/tree/master/release/Test_datasets 
3
# for examples.
4
5
# IMPORTANT: for classification, do not use classes with numeric attributes. Else,
6
# they will be interpreted as a regression problem.
7
8
#####################
9
### BASIC OPTIONS ###
10
#####################
11
12
## Working directory. If local execution, don't set it.
13
# wd must be defined if another classifiers.conf is provided
14
# Default: wd=*empty* (local directory)
15
16
#wd=working_directory
17
18
19
## Project name, used as prefix for outfiles.
20
# Default: project=myProject
21
22
project=myProject
23
24
25
## Type of classification: Classification
26
# Set to true if we perform a classification (nominal class). 
27
# Default: doClassification=false
28
29
doClassification=false
30
31
32
# If true, set the column class name to classify.
33
# Default: classificationClassName=class
34
35
classificationClassName=class
36
37
38
## Type of classification: Regression
39
# Set to true if we perform a regression (numeric class). 
40
# Default: doRegression=false
41
doRegression=false
42
43
# If true, set the name of the class column to predict.
44
# Default: regressionClassName=class
45
46
regressionClassName=class
47
48
49
## Training input files
50
# Set infiles here if you have several datasets with a common ID column that will
51
# be used for merging (see mergingID). Only IDs existing in all files will be kept 
52
# for training; those missing in one of the files will be ignored.
53
# All commas (,) used as decimal separators in values will be changed to dots (.).
54
# You also must remove special symbols within your data (e.g.: %/\*"':éèà).
55
# Usage: trainFile=filename_in_working_directory,description 
56
# The description will be used as a prefix for features of the file to avoid  
57
# duplicated names. It can be left empty if there is no risk of duplicated names.
58
# (ex:  trainFile=myproteinsfile, protein
59
#       trainFile=mygenesfile, genes
60
#       trainFile=mymetadatafile).
61
# Default: trainFile=*empty*
62
63
#trainFile=trainFile1.csv, description
64
#trainFile=trainFile2.csv
65
66
## Predict new data input files
67
# If you have your own blind test dataset or new data, you can run biodiscml using 
68
# the -predict option (java -jar biodiscml.jar -config config.conf -predict).
69
# This function will need two defined input files: 
70
#  - A newData file (same format and structure as the training input files.
71
#   This file must contain at least all elements of the retained signature of the 
72
#   selected best model features. Features present in the newData file, but absent from 
73
#   the signature of the model will simply be ignored during the prediction)
74
#  - A model file (produced during a previous execution of biodiscml where a best
75
#    model has been identified) 
76
# Usage: newDataFile=filename_in_working_directory,description 
77
# The description will be used as a prefix for features of the file to avoid  
78
# duplicated names. It can be left empty if there is no risk of duplicated names.
79
# (ex:  newDataFile=myproteinsfile, protein
80
#       newDataFile=mygenesfile, genes
81
#       newDataFile=mymetadatafile).
82
# Default: newDataFile=*empty*
83
# Default: modelFile=*empty*
84
85
#newDataFile=newDataFile1.csv, description
86
#newDataFile=newDataFile2.csv
87
#modelFile=model.model
88
89
90
## Merging 
91
# Merging identifier, used if you have many files to merge. It is expected to be
92
# found in the first column of every file.
93
# Only rows containing identifiers that exist in all files will be considered in 
94
# the analysis.
95
# Default: mergingID=*empty*
96
97
#mergingID=identifier
98
99
100
## Sampling
101
# Perform sampling to create a random validation set not used during training and
102
# used for further evaluation. 
103
# Default: sampling=true
104
105
sampling=true
106
107
# The samplingFold option separates the set into x parts, keeping 1 for validation and the others 
108
# for training.
109
# e.g. samplingFold=3 means that the validation set will be composed of 1/3 of the 
110
# input data.
111
# Ignored if sampling=false
112
# Default: samplingFold=3
113
114
samplingFold=3
115
116
# Instead of random sampling, you can provide a validation file on which the models 
117
# will be tested. 
118
# Note that the validation file must contain the same structure and features as the 
119
# train file.
120
# You can also provide several validation files, they will be merged.
121
# If set, samplingFold options will be ignored. 
122
# Ignored if sampling=false
123
# Default: validationFile=*empty*
124
125
#validationFile=validationFile1.csv
126
127
128
## Feature exclusion
129
# Features to exclude from the dataset (separated by commas(,)).
130
# Do not exclude the identifier (usually the first column).
131
# Default: excluded=*empty*
132
133
#excluded=columnA,columnB
134
135
136
## Best model auto-selection 
137
# A specified number of models will be generated here, along with various performance 
138
# metrics and correlated features for each one. Choose how many best models to 
139
# generate and the metric on which the models will be sorted.
140
# Instead of a specific number of models, a threshold can also be set.
141
# Models will be selected based on both numberOfBestModels and 
142
# numberOfBestModelsSortingMetricThreshold conditions
143
# Metrics can be any column of the results file: 
144
#   We prefer those for classification: TEST_MCC, TEST_BER, TRAIN_TEST_BS_MCC, 
145
#                                           TRAIN_TEST_BS_BER, AVG_BER, AVG_MCC 
146
#   We prefer those for regression: TEST_CC, TEST_RMSE, TRAIN_TEST_BS_CC, 
147
#                                       TRAIN_TEST_BS_RMSE, AVG_RMSE, AVG_CC 
148
# Examples: AVG_MCC at 0.6, AVG_RMSE at 0.3, AVG_CC at 0.8
149
# See commands in readme.txt to extract specific models
150
# Default: computeBestModel=true
151
#    numberOfBestModels=1 
152
#    numberOfBestModelsSortingMetric=AVG_MCC
153
#    numberOfBestModelsSortingMetricThreshold=0.1
154
155
computeBestModel=true
156
numberOfBestModels=1
157
numberOfBestModelsSortingMetric=AVG_MCC
158
numberOfBestModelsSortingMetricThreshold=0.1
159
160
161
## Combine models
162
# If true, only one model will be computed using a combination of all models 
163
# selected with best models options. 
164
# Combination rules: 
165
#   AVG (Average of probabilities)
166
#   PROD (Product of probabilities)
167
#   MAJ (Majority voting)
168
#   MED (Median)
169
# Default: combineModels=false
170
#   combinationRule=AVG
171
172
combineModels=false
173
combinationRule=AVG
174
175
176
########################
177
### ADVANCED OPTIONS ###
178
########################
179
180
## Debug to show more outputs
181
# 2 levels of verbose, debug and debug2
182
# Also possibility to print failed models with error explanation
183
# Default:  debug=false
184
#           debug2=false
185
#           printFailedModels=false
186
187
debug=false
188
debug2=false
189
printFailedModels=false
190
191
## Maximum number of cpus to use (enter a value or "max").
192
# BioDiscML runs in low priority by regularly checking cpus available. So
193
# you can execute other software on your server and it will adapt itself. 
194
# Just be careful about available memory: limit the number of cpus used to avoid an out 
195
# of memory exception. 
196
# Default: cpus=max
197
198
cpus=max
199
200
201
## The separator (delimiter) of infiles will be detected automatically. 
202
# It is however possible to set it, but it must exist for all files.
203
# Default: separator=*empty*
204
205
#separator=\t
206
207
## Leave-One-Out cross validation
208
# If you have a very large set of samples (more than 2000), it may be better
209
# to skip Leave-One-Out cross validation by setting loocv to false
210
# Default: loocv=true
211
212
loocv=true
213
214
215
## Criterion (metrics) optimizers
216
# To test if a model generated with a feature subset is better with another 
217
# subset, we use various criterions as comparison metrics. 
218
# You can limit the list of criterions if wanted. 
219
# Available criteria for classification: AUC, MCC, FDR, BER, ACC, TPR, TNR, 
220
#   kappa, AUPRC, Fscore, Precision, Recall, TP+FN
221
# Default: coptimizers=AUC, MCC, FDR, BER, ACC
222
223
coptimizers=AUC, MCC, FDR, BER, ACC
224
225
226
## Search modes
227
# Various search modes are implemented, including topX features according to 
228
# information gain ranking, stepwise search (Forward(F), Forward-Backward(FB), 
229
# Backward(B) and Backward-Forward(BF)) and all features (all)
230
searchmodes=F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100
231
232
# Regression criterions
233
# Available criterions for regression: CC, MAE, RMSE, RAE, RRSE 
234
# Default: roptimizers=CC, RMSE
235
236
roptimizers=CC, RMSE 
237
238
239
## Maximum number of features kept after feature selection ranking.
240
# The higher this number is, the longer the training will take.
241
# Default: maxNumberOfSelectedFeatures = 1000
242
243
maxNumberOfSelectedFeatures = 1000
244
245
246
## Maximum number of features models can have.
247
# Default: maxNumberOfFeaturesInModel = 200
248
249
maxNumberOfFeaturesInModel = 200
250
251
252
## Bootstrap and repeated holdout folds
253
# Default: bootstrapFolds=100
254
255
bootstrapFolds=100
256
257
# Run without feature selection
258
# If true, maxNumberOfSelectedFeatures and maxNumberOfFeaturesInModel 
259
# will be set to maximum (which is the number of features in the dataset).
260
# Also, if true, available search mode won't be executed
261
# Default: noFeatureSelection=false
262
noFeatureSelection=false
263
264
265
## Correlated features
266
# Thresholds for Spearman and Pearson correlations
267
# Correlated feature search can be disabled by setting retrieveCorrelatedGenes to 
268
# false
269
# Default: retrieveCorrelatedGenes=true
270
# Default: spearmanCorrelation_lower = -0.99
271
#   spearmanCorrelation_upper = 0.99
272
#   pearsonCorrelation_lower = -0.99
273
#   pearsonCorrelation_upper = 0.99
274
275
retrieveCorrelatedGenes=true
276
spearmanCorrelation_lower = -0.99
277
spearmanCorrelation_upper = 0.99
278
pearsonCorrelation_lower = -0.99
279
pearsonCorrelation_upper = 0.99
280
281
282
# Retrieve features based on equivalent infogain or relieff ranking score
283
# Default: maxRankingScoreDifference = 0.005
284
#   retreiveCorrelatedGenesByRankingScore=false
285
286
maxRankingScoreDifference = 0.005
287
retreiveCorrelatedGenesByRankingScore=false
288
289
# Create model with correlated genes
290
# Default: generateModelWithCorrelatedGenes = false
291
292
293
## RUN mode
294
# BioDiscML will test all available classifier algorithms. If you wish to choose
295
# specific classifiers, you'll need to use the fast way mode and provide a 
296
# list of classifiers configurations (classifier name and hyperparameters). 
297
# Please use Weka GUI to help you choose the configurations
298
# For each configuration, you'll need to provide what optimizer to use.
299
300
# Fast mode classification
301
# Usage: ccmd=classifier with options,optimizer,search mode.
302
# Available optimizers: AUC,ACC,SEN,SPE,MCC,TP+FN,kappa and ALLOPT (all optimizers)
303
# Available search modes: F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100,top200,all and ALLSEARCH (all search modes)
304
# If no optimizer or search modes are provided, they will all be tested (equivalent to provide ALLOPT and ALLSEARCH)
305
# Default: classificationFastWay=false
306
#   ccmd=*empty*    
307
308
classificationFastWay=false
309
ccmd=bayes.NaiveBayes -K, SEN
310
ccmd=bayes.NaiveBayes -K, AUC
311
ccmd=misc.VFI -B 0.4, SEN
312
ccmd=misc.VFI -B 0.4, AUC
313
ccmd=misc.VFI -B 0.4, AUC, F
314
ccmd=misc.VFI -B 0.6
315
ccmd=misc.VFI -B 0.6, ALLOPT, FB
316
ccmd=trees.J48
317
318
# Fast mode regression
319
# Usage: rcmd=classifier with options,optimizer.
320
# Available optimizers: CC, MAE, RMSE, RAE, RRSE. 
321
# Default: regressionFastWay=false
322
#   rcmd=*empty* 
323
324
regressionFastWay=false
325
rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", CC
326
rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", RMSE
327
328
329
# #### End of configuration file #### #