# [ce076b]: / release / config.conf
#
# Download this file
#
# 330 lines (245 with data), 11.6 kB

# #### BioDiscML config file #### #
# See https://github.com/mickaelleclercq/BioDiscML/tree/master/release/Test_datasets 
# for examples.

# IMPORTANT: for classification, do not use numeric values as class labels.
# Otherwise, the task will be interpreted as a regression problem.

#####################
### BASIC OPTIONS ###
#####################

## Working directory. If local execution, don't set it.
# wd must be defined if another classifiers.conf is provided
# Default: wd=*empty* (local directory)

#wd=working_directory


## Project name, used as prefix for outfiles.
# Default: project=myProject

project=myProject


## Type of classification: Classification
# Set to true if we perform a classification (nominal class). 
# Default: doClassification=false

doClassification=false


# If doClassification=true, set the name of the class column to classify.
# Default: classificationClassName=class

classificationClassName=class


## Type of classification: Regression
# Set to true if we perform a regression (numeric class). 
# Default: doRegression=false
doRegression=false

# If doRegression=true, set the name of the class column to predict.
# Default: regressionClassName=class

regressionClassName=class


## Training input files
# Set infiles here if you have several datasets with a common ID column that will
# be used for merging (see mergingID). Only IDs existing in all files will be kept
# for training; those missing in one of the files will be ignored.
# All decimal commas (,) in values will be changed to dots (.).
# You must also remove special symbols from your data (e.g.: %/\*"':éèà).
# Usage: trainFile=filename_in_working_directory,description 
# The description will be used as a prefix for features of the file to avoid  
# duplicated names. It can be left empty if there is no risk of duplicated names.
# (ex:  trainFile=myproteinsfile, protein
#       trainFile=mygenesfile, genes
#       trainFile=mymetadatafile).
# Default: trainFile=*empty*

#trainFile=trainFile1.csv, description
#trainFile=trainFile2.csv

## Predict new data input files
# If you have your own blind test dataset or new data, you can run biodiscml using
# the -predict option (java -jar biodiscml.jar -config config.conf -predict).
# This function needs two defined input files:
#  - A newData file (same format and structure as the training input files).
#    This file must contain at least all features of the signature retained by the
#    selected best model. Features present in the newData file but absent from
#    the signature of the model will simply be ignored during the prediction.
#  - A model file (produced during a previous execution of biodiscml where a best
#    model has been identified)
# Usage: newDataFile=filename_in_working_directory,description 
# The description will be used as a prefix for features of the file to avoid  
# duplicated names. It can be left empty if there is no risk of duplicated names.
# (ex:  newDataFile=myproteinsfile, protein
#       newDataFile=mygenesfile, genes
#       newDataFile=mymetadatafile).
# Default: newDataFile=*empty*
# Default: modelFile=*empty*

#newDataFile=newDataFile1.csv, description
#newDataFile=newDataFile2.csv
#modelFile=model.model


## Merging 
# Merging identifier, used if you have many files to merge. It is expected to be
# found in the first column of every file.
# Only rows containing identifiers that exist in all files will be considered in 
# the analysis.
# Default: mergingID=*empty*

#mergingID=identifier


## Sampling
# Perform sampling to create a random validation set not used during training and
# used for further evaluation. 
# Default: sampling=true

sampling=true

# The samplingFold option splits the set into x parts, keeping 1 for validation and
# the others for training.
# e.g. samplingFold=3 means that the validation set will be composed of 1/3 of the 
# input data.
# Ignored if sampling=false
# Default: samplingFold=3

samplingFold=3

# Instead of random sampling, you can provide a validation file on which the models 
# will be tested. 
# Note that the validation file must contain the same structure and features as the 
# train file.
# You can also provide several validation files, they will be merged.
# If set, samplingFold options will be ignored. 
# Ignored if sampling=false
# Default: validationFile=*empty*

#validationFile=validationFile1.csv


## Feature exclusion
# Features to exclude from the dataset (separated by commas(,)).
# Do not exclude the identifier (usually the first column).
# Default: excluded=*empty*

#excluded=columnA,columnB


## Best model auto-selection 
# A specified number of models will be generated here, along with various performance 
# metrics and correlated features for each one. Choose how many best models to 
# generate and the metric on which the models will be sorted.
# Instead of a specific number of models, a threshold can also be set.
# Models will be selected based on both numberOfBestModels and 
# numberOfBestModelsSortingMetricThreshold conditions
# Metrics can be any column of the results file: 
# 	We prefer those for classification: TEST_MCC, TEST_BER, TRAIN_TEST_BS_MCC, 
#                                           TRAIN_TEST_BS_BER, AVG_BER, AVG_MCC 
# 	We prefer those for regression: TEST_CC, TEST_RMSE, TRAIN_TEST_BS_CC, 
#                                       TRAIN_TEST_BS_RMSE, AVG_RMSE, AVG_CC 
# Examples: AVG_MCC at 0.6, AVG_RMSE at 0.3, AVG_CC at 0.8
# See commands in readme.txt to extract specific models
# Default: computeBestModel=true
#    numberOfBestModels=1 
#    numberOfBestModelsSortingMetric=AVG_MCC
#    numberOfBestModelsSortingMetricThreshold=0.1

computeBestModel=true
numberOfBestModels=1
numberOfBestModelsSortingMetric=AVG_MCC
numberOfBestModelsSortingMetricThreshold=0.1


## Combine models
# If true, only one model will be computed using a combination of all models 
# selected with best models options. 
# Combination rules: 
# 	AVG (Average of probabilities)
#	PROD (Product of probabilities)
#	MAJ (Majority voting)
#	MED (Median)
# Default: combineModels=false
#   combinationRule=AVG

combineModels=false
combinationRule=AVG


########################
### ADVANCED OPTIONS ###
########################

## Debug to show more outputs
# Two verbosity levels are available: debug and debug2.
# Failed models can also be printed with an explanation of the error
# (printFailedModels).
# Default:  debug=false
#           debug2=false
#           printFailedModels=false

debug=false
debug2=false
printFailedModels=false

## Maximum number of cpus to use (enter a value or "max").
# BioDiscML runs in low priority by regularly checking the available cpus, so
# you can run other software on your server and it will adapt itself.
# Just be careful with available memory: limit the number of cpus used to avoid
# out-of-memory exceptions.
# Default: cpus=max

cpus=max


## The separator (delimiter) of infiles will be detected automatically. 
# It is however possible to set it explicitly, but the same separator must then
# be used in all files.
# Default: separator=*empty*

#separator=\t

## Leave-One-Out cross validation
# If you have a very large set of samples (more than 2000), it may be better
# to skip Leave-One-Out cross validation by setting loocv to false
# Default: loocv=true

loocv=true


## Criterion (metrics) optimizers
# To test whether a model generated with one feature subset is better than with
# another subset, we use various criteria as comparison metrics.
# You can limit the list of criteria if wanted.
# Available criteria for classification: AUC, MCC, FDR, BER, ACC, TPR, TNR,
# 	kappa, AUPRC, Fscore, Precision, Recall, TP+FN
# Default: coptimizers=AUC, MCC, FDR, BER, ACC

coptimizers=AUC, MCC, FDR, BER, ACC


## Search modes
# Various search modes are implemented, including topX features according to
# information gain ranking, stepwise search (Forward(F), Forward-Backward(FB),
# Backward(B) and Backward-Forward(BF)) and all features (all).
searchmodes=F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100

# Regression criteria
# Available criteria for regression: CC, MAE, RMSE, RAE, RRSE
# Default: roptimizers=CC, RMSE

roptimizers=CC, RMSE


## Maximum number of features kept after feature selection ranking.
# The higher this number, the longer the training.
# Default: maxNumberOfSelectedFeatures = 1000

maxNumberOfSelectedFeatures = 1000


## Maximum number of features models can have.
# Default: maxNumberOfFeaturesInModel = 200

maxNumberOfFeaturesInModel = 200


## Bootstrap and repeated holdout folds
# Default: bootstrapFolds=100

bootstrapFolds=100

# Run without feature selection
# If true, maxNumberOfSelectedFeatures and maxNumberOfFeaturesInModel 
# will be set to maximum (which is the number of features in the dataset).
# Also, if true, the search modes (searchmodes) will not be executed.
# Default: noFeatureSelection=false
noFeatureSelection=false


## Correlated features
# Thresholds for Spearman and Pearson correlations
# Correlated feature search can be disabled by setting retrieveCorrelatedGenes to 
# false
# Default: retrieveCorrelatedGenes=true
# Default: spearmanCorrelation_lower = -0.99
#   spearmanCorrelation_upper = 0.99
#   pearsonCorrelation_lower = -0.99
#   pearsonCorrelation_upper = 0.99

retrieveCorrelatedGenes=true
spearmanCorrelation_lower = -0.99
spearmanCorrelation_upper = 0.99
pearsonCorrelation_lower = -0.99
pearsonCorrelation_upper = 0.99


# Retrieve features based on equivalent infogain or relieff ranking score
# Default: maxRankingScoreDifference = 0.005
# 	retreiveCorrelatedGenesByRankingScore=false

maxRankingScoreDifference = 0.005
retreiveCorrelatedGenesByRankingScore=false

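# Create model with correlated genes
# Default: generateModelWithCorrelatedGenes = false
# NOTE(review): this option has no assignment in this file; confirm it is still
# supported, then uncomment the line below to set it explicitly.
#generateModelWithCorrelatedGenes = false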


## RUN mode
# BioDiscML will test all available classifier algorithms. If you wish to choose
# specific classifiers, you'll need to use the fast way mode and provide a 
# list of classifiers configurations (classifier name and hyperparameters). 
# Please use Weka GUI to help you choose the configurations
# For each configuration, you'll need to provide what optimizer to use.

# Fast mode classification
# Usage: ccmd=classifier with options[,optimizer[,search mode]]
# Available optimizers: AUC,ACC,SEN,SPE,MCC,TP+FN,kappa and ALLOPT (all optimizers)
# Available search modes: F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100,top200,all and ALLSEARCH (all search modes)
# If no optimizer or search modes are provided, they will all be tested (equivalent to provide ALLOPT and ALLSEARCH)
# Default: classificationFastWay=false
#   ccmd=*empty*    

classificationFastWay=false
ccmd=bayes.NaiveBayes -K, SEN
ccmd=bayes.NaiveBayes -K, AUC
ccmd=misc.VFI -B 0.4, SEN
ccmd=misc.VFI -B 0.4, AUC
ccmd=misc.VFI -B 0.4, AUC, F
ccmd=misc.VFI -B 0.6
ccmd=misc.VFI -B 0.6, ALLOPT, FB
ccmd=trees.J48

# Fast mode regression
# Usage: rcmd=classifier with options,optimizer.
# Available optimizers: CC, MAE, RMSE, RAE, RRSE. 
# Default: regressionFastWay=false
#   rcmd=*empty* 

regressionFastWay=false
rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", CC
rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", RMSE


# #### End of configuration file #### #