# 330 lines (245 with data), 11.6 kB
# #### BioDiscML config file #### #
# See https://github.com/mickaelleclercq/BioDiscML/tree/master/release/Test_datasets
# for examples.
# IMPORTANT: for classification, do not use numeric values for the class. Otherwise,
# the task will be interpreted as a regression problem.
#####################
### BASIC OPTIONS ###
#####################
## Working directory. If local execution, don't set it.
# wd must be defined if another classifiers.conf is provided
# Default: wd=*empty* (local directory)
#wd=working_directory
## Project name, used as prefix for outfiles.
# Default: project=myProject
project=myProject
## Type of classification: Classification
# Set to true if we perform a classification (nominal class).
# Default: doClassification=false
doClassification=false
# If doClassification=true, set the name of the class column to classify.
# Default: classificationClassName=class
classificationClassName=class
## Type of classification: Regression
# Set to true if we perform a regression (numeric class).
# Default: doRegression=false
doRegression=false
# If doRegression=true, set the name of the class column to predict.
# Default: regressionClassName=class
regressionClassName=class
## Training input files
# Set infiles here if you have several datasets with a common ID column that will
# be used for merging (see mergingID). Only IDs existing in all files will be kept
# for training; those missing from any of the files will be ignored.
# All commas (,) used as decimal separators will be changed to dots (.).
# You must also remove special symbols from your data (e.g.: %/\*"':éèà).
# Usage: trainFile=filename_in_working_directory,description
# The description will be used as a prefix for features of the file to avoid
# duplicated names. It can be left empty if there is no risk of duplicated names.
# (ex: trainFile=myproteinsfile, protein
# trainFile=mygenesfile, genes
# trainFile=mymetadatafile).
# Default: trainFile=*empty*
#trainFile=trainFile1.csv, description
#trainFile=trainFile2.csv
## Predict new data input files
# If you have your own blind test dataset or new data, you can run biodiscml using
# the -predict option (java -jar biodiscml.jar -config config.conf -predict).
# This function needs two input files to be defined:
# - A newData file (same format and structure as the training input files).
# This file must contain at least all features of the signature retained by the
# selected best model. Features present in the newData file but absent from
# the signature of the model will simply be ignored during the prediction.
# - A model file (produced during a previous execution of biodiscml in which a best
# model has been identified)
# Usage: newDataFile=filename_in_working_directory,description
# The description will be used as a prefix for features of the file to avoid
# duplicated names. It can be left empty if there is no risk of duplicated names.
# (ex: newDataFile=myproteinsfile, protein
# newDataFile=mygenesfile, genes
# newDataFile=mymetadatafile).
# Default: newDataFile=*empty*
# Default: modelFile=*empty*
#newDataFile=newDataFile1.csv, description
#newDataFile=newDataFile2.csv
#modelFile=model.model
## Merging
# Merging identifier, used if you have several files to merge. It is expected to be
# found in the first column of every file.
# Only rows containing identifiers that exist in all files will be considered in
# the analysis.
# Default: mergingID=*empty*
#mergingID=identifier
## Sampling
# Perform sampling to create a random validation set not used during training and
# used for further evaluation.
# Default: sampling=true
sampling=true
# The samplingFold option splits the set into x parts, keeping 1 for validation and
# the others for training.
# e.g. samplingFold=3 means that the validation set will be composed of 1/3 of the
# input data.
# Ignored if sampling=false
# Default: samplingFold=3
samplingFold=3
# Instead of random sampling, you can provide a validation file on which the models
# will be tested.
# Note that the validation file must contain the same structure and features as the
# train file.
# You can also provide several validation files, they will be merged.
# If set, samplingFold options will be ignored.
# Ignored if sampling=false
# Default: validationFile=*empty*
#validationFile=validationFile1.csv
## Feature exclusion
# Features to exclude from the dataset (separated by commas(,)).
# Do not exclude the identifier (usually the first column).
# Default: excluded=*empty*
#excluded=columnA,columnB
## Best model auto-selection
# A specified number of models will be generated here, along with various performance
# metrics and correlated features for each one. Choose how many best models to
# generate and the metric on which the models will be sorted.
# Instead of a specific number of models, a threshold can also be set.
# Models will be selected based on both the numberOfBestModels and
# numberOfBestModelsSortingMetricThreshold conditions.
# The metric can be any column of the results file.
# Recommended for classification: TEST_MCC, TEST_BER, TRAIN_TEST_BS_MCC,
# TRAIN_TEST_BS_BER, AVG_BER, AVG_MCC
# Recommended for regression: TEST_CC, TEST_RMSE, TRAIN_TEST_BS_CC,
# TRAIN_TEST_BS_RMSE, AVG_RMSE, AVG_CC
# Examples: AVG_MCC at 0.6, AVG_RMSE at 0.3, AVG_CC at 0.8
# See commands in readme.txt to extract specific models
# Default: computeBestModel=true
# numberOfBestModels=1
# numberOfBestModelsSortingMetric=AVG_MCC
# numberOfBestModelsSortingMetricThreshold=0.1
computeBestModel=true
numberOfBestModels=1
numberOfBestModelsSortingMetric=AVG_MCC
numberOfBestModelsSortingMetricThreshold=0.1
## Combine models
# If true, only one model will be computed using a combination of all models
# selected with best models options.
# Combination rules:
# AVG (Average of probabilities)
# PROD (Product of probabilities)
# MAJ (Majority voting)
# MED (Median)
# Default: combineModels=false
# combinationRule=AVG
combineModels=false
combinationRule=AVG
########################
### ADVANCED OPTIONS ###
########################
## Debug to show more outputs
# Two levels of verbosity are available: debug and debug2.
# Failed models can also be printed with an explanation of the error.
# Default: debug=false
# debug2=false
# printFailedModels=false
debug=false
debug2=false
printFailedModels=false
## Maximum number of cpus to use (enter a value or "max").
# BioDiscML runs at low priority and regularly checks the available cpus, so
# you can run other software on your server and it will adapt itself.
# Be careful with available memory: limit the number of cpus used to avoid
# out-of-memory exceptions.
# Default: cpus=max
cpus=max
## The separator (delimiter) of infiles will be detected automatically.
# It is however possible to set it, but it must exist for all files.
# Default: separator=*empty*
#separator=\t
## Leave-One-Out cross validation
# If you have a very large set of samples (more than 2000), it may be better
# to skip Leave-One-Out cross validation by setting loocv to false
# Default: loocv=true
loocv=true
## Criterion (metrics) optimizers
# To test whether a model generated with one feature subset is better than one
# generated with another subset, we use various criteria as comparison metrics.
# You can limit the list of criteria if desired.
# Available criteria for classification: AUC, MCC, FDR, BER, ACC, TPR, TNR,
# kappa, AUPRC, Fscore, Precision, Recall, TP+FN
# Default: coptimizers=AUC, MCC, FDR, BER, ACC
coptimizers=AUC, MCC, FDR, BER, ACC
## Search modes
# Various search modes are implemented, including topX features according to
# information gain ranking, stepwise search (Forward(F), Forward-Backward(FB),
# Backward(B) and Backward-Forward(BF)) and all features (all).
searchmodes=F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100
# Regression criteria
# Available criteria for regression: CC, MAE, RMSE, RAE, RRSE
# Default: roptimizers=CC, RMSE
roptimizers=CC, RMSE
## Maximum number of features kept after feature selection ranking.
# The higher this number, the longer the training will take.
# Default: maxNumberOfSelectedFeatures = 1000
maxNumberOfSelectedFeatures = 1000
## Maximum number of features models can have.
# Default: maxNumberOfFeaturesInModel = 200
maxNumberOfFeaturesInModel = 200
## Bootstrap and repeated holdout folds
# Default: bootstrapFolds=100
bootstrapFolds=100
# Run without feature selection
# If true, maxNumberOfSelectedFeatures and maxNumberOfFeaturesInModel
# will be set to maximum (which is the number of features in the dataset).
# Also, if true, the available search modes will not be executed.
# Default: noFeatureSelection=false
noFeatureSelection=false
## Correlated features
# Thresholds for Spearman and Pearson correlations
# Correlated feature search can be disabled by setting retrieveCorrelatedGenes to
# false
# Default: retrieveCorrelatedGenes=true
# Default: spearmanCorrelation_lower = -0.99
# spearmanCorrelation_upper = 0.99
# pearsonCorrelation_lower = -0.99
# pearsonCorrelation_upper = 0.99
retrieveCorrelatedGenes=true
spearmanCorrelation_lower = -0.99
spearmanCorrelation_upper = 0.99
pearsonCorrelation_lower = -0.99
pearsonCorrelation_upper = 0.99
# Retrieve features based on equivalent infogain or relieff ranking score
# Default: maxRankingScoreDifference = 0.005
# retreiveCorrelatedGenesByRankingScore=false
maxRankingScoreDifference = 0.005
retreiveCorrelatedGenesByRankingScore=false
# Create model with correlated genes
# Default: generateModelWithCorrelatedGenes = false
#generateModelWithCorrelatedGenes = false
## RUN mode
# BioDiscML will test all available classifier algorithms. If you wish to choose
# specific classifiers, you'll need to use the fast way mode and provide a
# list of classifier configurations (classifier name and hyperparameters).
# Please use the Weka GUI to help you choose the configurations.
# For each configuration, you can specify which optimizer and search mode to use.
# Fast mode classification
# Usage: ccmd=classifier with options,optimizer,search mode
# Available optimizers: AUC,ACC,SEN,SPE,MCC,TP+FN,kappa and ALLOPT (all optimizers)
# Available search modes: F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100,top200,all and ALLSEARCH (all search modes)
# If no optimizer or search mode is provided, all of them will be tested (equivalent to providing ALLOPT and ALLSEARCH)
# Default: classificationFastWay=false
# ccmd=*empty*
classificationFastWay=false
ccmd=bayes.NaiveBayes -K, SEN
ccmd=bayes.NaiveBayes -K, AUC
ccmd=misc.VFI -B 0.4, SEN
ccmd=misc.VFI -B 0.4, AUC
ccmd=misc.VFI -B 0.4, AUC, F
ccmd=misc.VFI -B 0.6
ccmd=misc.VFI -B 0.6, ALLOPT, FB
ccmd=trees.J48
# Fast mode regression
# Usage: rcmd=classifier with options,optimizer.
# Available optimizers: CC, MAE, RMSE, RAE, RRSE.
# Default: regressionFastWay=false
# rcmd=*empty*
regressionFastWay=false
rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", CC
rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", RMSE
# #### End of configuration file #### #