--- a +++ b/release/config.conf @@ -0,0 +1,329 @@ +# #### BioDiscML config file #### # +# See https://github.com/mickaelleclercq/BioDiscML/tree/master/release/Test_datasets +# for examples. + +# IMPORTANT: for classification, do not use classes with numeric attributes. Else, +# they will be interpreted as a regression problem. + +##################### +### BASIC OPTIONS ### +##################### + +## Working directory. If local execution, don't set it. +# wd must be defined if another classifiers.conf is provided +# Default: wd=*empty* (local directory) + +#wd=working_directory + + +## Project name, used as prefix for outfiles. +# Default: project=myProject + +project=myProject + + +## Type of classification: Classification +# Set to true if we perform a classification (nominal class). +# Default: doClassification=false + +doClassification=false + + +# If true, set the column class name to classify. +# Default: classificationClassName=class + +classificationClassName=class + + +## Type of classification: Regression +# Set to true if we perform a regression (numeric class). +# Default:doRegression=false +doRegression=false + +# If true, set the column class name to classify. +# Default: regressionClassName=class + +regressionClassName=class + + +## Training input files +# Set infiles here if you have several dataset with a common ID column that will +# be used for merging (see mergingID). Only IDs existing in all files will be kept +# for training, those missing in one of the file will be ignored. +# All decimal separated values commas (,) will be changed to dots (.). +# You also must remove special symbols within your data (e.g.: %/\*"':éèà). +# Usage: trainFile=filename_in_working_directory,description +# The description will be used as a prefix for features of the file to avoid +# duplicated names. It can be left empty if there is no risk of duplicated names. +# (ex: trainFile=myproteinsfile, protein +# trainFile=mygenesfile, genes +# trainFile=mymetadatafile). +# Default: trainFile=*empty* + +#trainFile=trainFile1.csv, description +#trainFile=trainFile2.csv + +## Predict new data input files +# If you have you own blind test dataset or new data, you can run biodiscml using +# the -predict option (java -jar biodiscml.jar -config config.conf -predict). +# This function will need two defined input files: +# - A newData file (same format and structure as the training input files. +# This file must contain at least all elements of the retained signature of the +# selected best model features. Features present in the newData file, but absent from +# the signature of the model will simply be ignored during the prediction) +# - A model file (produced during a previous execution of biodiscml where a best +# model have been identified) +# Usage: newDataFile=filename_in_working_directory,description +# The description will be used as a prefix for features of the file to avoid +# duplicated names. It can be left empty if there is no risk of duplicated names. +# (ex: newDataFile=myproteinsfile, protein +# newDataFile=mygenesfile, genes +# newDataFile=mymetadatafile). +# Default: newDataFile=*empty* +# Default: modelFile=*empty* + +#newDataFile=newDataFile1.csv, description +#newDataFile=newDataFile2.csv +#modelFile=model.model + + +## Merging +# Merging identifier, used if you have many files to merge. It is expected to be +# found in the first column of every files. +# Only rows containing identifiers that exist in all files will be considered in +# the analysis. +# Default: mergingID=*empty* + +#mergingID=identifier + + +## Sampling +# Perform sampling to create a random validation set not used during training and +# used for further evaluation. +# Default: sampling=true + +sampling=true + +# The samplingFold option separate the set in x parts, keep 1 for validation, others +# for training. +# e.g. samplingFold=3 means that the validation set will be composed of 1/3 of the +# input data. +# Ignored if sampling=false +# Default: samplingFold=3 + +samplingFold=3 + +# Instead of random sampling, you can provide a validation file on which the models +# will be tested. +# Note that the validation file must contain the same structure and features as the +# train file. +# You can also provide several validation files, they will be merged. +# If set, samplingFold options will be ignored. +# Ignored if sampling=false +# Default: validationFile=*empty* + +#validationFile=validationFile1.csv + + +## Feature exclusion +# Features to exclude from the dataset (separated by commas(,)). +# Do not exclude the identifier (usually the first column). +# Default: excluded=*empty* + +#excluded=columnA,columnB + + +## Best model auto-selection +# A specified number of models will be generated here, along with various performance +# metrics and correlated features for each one. Choose how many best models to +# generate and the metric on which the models will be sorted. +# Instead of a specific number of models, a threshold can also be set. +# Models will be selected based on both numberOfBestModels and +# numberOfBestModelsSortingMetricThreshold conditions +# Metrics can be any column of the results file: +# We prefer those for classification: TEST_MCC, TEST_BER, TRAIN_TEST_BS_MCC, +# TRAIN_TEST_BS_BER, AVG_BER, AVG_MCC +# We prefer those for regression: TEST_CC, TEST_RMSE, TRAIN_TEST_BS_CC, +# TRAIN_TEST_BS_RMSE, AVG_RMSE, AVG_CC +# Examples: AVG_MCC at 0.6, AVG_RMSE at 0.3, AVG_CC at 0.8 +# See commands in readme.txt to extract specific models +# Default: computeBestModel=true +# numberOfBestModels=1 +# numberOfBestModelsSortingMetric=AVG_MCC +# numberOfBestModelsSortingMetricThreshold=0.1 + +computeBestModel=true +numberOfBestModels=1 +numberOfBestModelsSortingMetric=AVG_MCC +numberOfBestModelsSortingMetricThreshold=0.1 + + +## Combine models +# If true, only one model will be computed using a combination of all models +# selected with best models options. +# Combination rules: +# AVG (Average of probabilities) +# PROD (Product of probabilities) +# MAJ (Majority voting) +# MED (Median) +# Default: combineModels=false +# combinationRule=AVG + +combineModels=false +combinationRule=AVG + + +######################## +### ADVANCED OPTIONS ### +######################## + +## Debug to show more outputs +# 2 levels of verbose, debug and debug2 +# Also possibility to print failed models with error explanation +# Default: debug=false +# debug2=false +# printFailedModels=false + +debug=false +debug2=false +printFailedModels=false + +## Maximum number of cpus to use (enter a value or "max"). +# BioDiscML runs in low priority by regularly checking cpus available. So +# you can execute other softwares on your server and it will adapt itself. +# Just be careful to available memory, limit number of cpus used to avoid out +# of memory exception. +# Default: cpus=max + +cpus=max + + +## The separator (delimiter) of infiles will be detected automatically. +# It is however possible to set it, but it must exist for all files. +# Default: separator=*empty* + +#separator=\t + +## Leave-One-Out cross validation +# If you have a very large set of samples (more than 2000), it may be better +# to skip Leave-One-Out cross validation by setting loocv to false +# Default: loocv=true + +loocv=true + + +## Criterion (metrics) optimizers +# To test if a model generated with a feature subset is better with another +# subset, we use various criterions as comparison metrics. +# You can limit the list of criterions if wanted. +# Avalaible criterions for classification:AUC, MCC, FDR, BER, ACC, TPR, TNR, +# kappa, AUPRC, Fscore, Precision, Recall, TP+FN +# Default: coptimizers=AUC, MCC, FDR, BER, ACC + +coptimizers=AUC, MCC, FDR, BER, ACC + + +## Search modes +# various search modes are implemented, including topX features according to +# information gain ranking, stepwise search (Forward(F), Forward-Backward(FB), +# Backward(B) and Backward-Forward(BF)) and all features (all) +searchmodes=F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100 + +# Regression criterions +# Available criterions for regression: CC, MAE, RMSE, RAE, RRSE +# Default: roptimizers=CC, RMSE + +roptimizers=CC, RMSE + + +## Maximum number of features kept after feature selection ranking. +# Higher this number is, longer will be the training. +# Default: maxNumberOfSelectedFeatures = 1000 + +maxNumberOfSelectedFeatures = 1000 + + +## Maximum number of features models can have. +# Default: maxNumberOfFeaturesInModel = 200 + +maxNumberOfFeaturesInModel = 200 + + +## Bootstrap and repeated holdout folds +# Default: bootstrapFolds=100 + +bootstrapFolds=100 + +# Run without feature selection +# If true, maxNumberOfSelectedFeatures and maxNumberOfFeaturesInModel +# will be set to maximum (which is the number of features in the dataset). +# Also, if true, available search mode won't be executed +# Default: noFeatureSelection=false +noFeatureSelection=false + + +## Correlated features +# Thresholds for Spearman and Pearson correlations +# Correlated feature search can be disabled by setting retrieveCorrelatedGenes to +# false +# Default: retrieveCorrelatedGenes=true +# Default: spearmanCorrelation_lower = -0.99 +# spearmanCorrelation_upper = 0.99 +# pearsonCorrelation_lower = -0.99 +# pearsonCorrelation_upper = 0.99 + +retrieveCorrelatedGenes=true +spearmanCorrelation_lower = -0.99 +spearmanCorrelation_upper = 0.99 +pearsonCorrelation_lower = -0.99 +pearsonCorrelation_upper = 0.99 + + +# Retrieve features based on equivalent infogain or relieff ranking score +# Default: maxRankingScoreDifference = 0.005 +# retreiveCorrelatedGenesByRankingScore=false + +maxRankingScoreDifference = 0.005 +retreiveCorrelatedGenesByRankingScore=false + +# Create model with correlated genes +# Default: generateModelWithCorrelatedGenes = false + + +## RUN mode +# BioDiscML will test all available classifier algorithms. If you wish to choose +# specific classifiers, you'll need to use the fast way mode and provide a +# list of classifiers configurations (classifier name and hyperparameters). +# Please use Weka GUI to help you choose the configurations +# For each configuration, you'll need to provide what optimizer to use. + +# Fast mode classification +# Usage: cfcmd=classifier with options,optimizer. +# Available optimizers: AUC,ACC,SEN,SPE,MCC,TP+FN,kappa and ALLOPT (all optimizers) +# Available search modes: F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100,top200,all and ALLSEARCH (all search modes) +# If no optimizer or search modes are provided, they will all be tested (equivalent to provide ALLOPT and ALLSEARCH) +# Default: classificationFastWay=false +# ccmd=*empty* + +classificationFastWay=false +ccmd=bayes.NaiveBayes -K, SEN +ccmd=bayes.NaiveBayes -K, AUC +ccmd=misc.VFI -B 0.4, SEN +ccmd=misc.VFI -B 0.4, AUC +ccmd=misc.VFI -B 0.4, AUC, F +ccmd=misc.VFI -B 0.6 +ccmd=misc.VFI -B 0.6, ALLOPT, FB +ccmd=trees.J48 + +# Fast mode regression +# Usage: rfcmd=classifier with options,optimizer. +# Available optimizers: CC, MAE, RMSE, RAE, RRSE. +# Default: regressionFastWay=false +# rcmd=*empty* + +regressionFastWay=false +rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", CC +rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", RMSE + + +# #### End of configuration file #### #