Diff of /config.conf [000000] .. [ce076b]

Switch to side-by-side view

--- a
+++ b/config.conf
@@ -0,0 +1,329 @@
+# #### BioDiscML config file #### #
+# See https://github.com/mickaelleclercq/BioDiscML/tree/master/release/Test_datasets 
+# for examples.
+
+# IMPORTANT: for classification, do not use classes with numeric attributes. Else,
+# they will be interpreted as a regression problem.
+
+#####################
+### BASIC OPTIONS ###
+#####################
+
+## Working directory. If local execution, don't set it.
+# wd must be defined if another classifiers.conf is provided
+# Default: wd=*empty* (local directory)
+
+#wd=working_directory
+
+
+## Project name, used as prefix for outfiles.
+# Default: project=myProject
+
+project=myProject
+
+
+## Type of classification: Classification
+# Set to true if we perform a classification (nominal class). 
+# Default: doClassification=false
+
+doClassification=false
+
+
+# If true, set the column class name to classify.
+# Default: classificationClassName=class
+
+classificationClassName=class
+
+
+## Type of classification: Regression
+# Set to true if we perform a regression (numeric class). 
+# Default:doRegression=false
+doRegression=false
+
+# If true, set the column class name to classify.
+# Default: regressionClassName=class
+
+regressionClassName=class
+
+
+## Training input files
+# Set infiles here if you have several dataset with a common ID column that will
+# be used for merging (see mergingID). Only IDs existing in all files will be kept 
+# for training, those missing in one of the file will be ignored.
+# All decimal separated values commas (,) will be changed to dots (.).
+# You also must remove special symbols within your data (e.g.: %/\*"':éèà).
+# Usage: trainFile=filename_in_working_directory,description 
+# The description will be used as a prefix for features of the file to avoid  
+# duplicated names. It can be left empty if there is no risk of duplicated names.
+# (ex:  trainFile=myproteinsfile, protein
+#       trainFile=mygenesfile, genes
+#       trainFile=mymetadatafile).
+# Default: trainFile=*empty*
+
+#trainFile=trainFile1.csv, description
+#trainFile=trainFile2.csv
+
+## Predict new data input files
+# If you have you own blind test dataset or new data, you can run biodiscml using 
+# the -predict option (java -jar biodiscml.jar -config config.conf -predict).
+# This function will need two defined input files: 
+#  - A newData file (same format and structure as the training input files.
+#   This file must contain at least all elements of the retained signature of the 
+#   selected best model features. Features present in the newData file, but absent from 
+#   the signature of the model will simply be ignored during the prediction)
+#  - A model file (produced during a previous execution of biodiscml where a best
+#    model have been identified) 
+# Usage: newDataFile=filename_in_working_directory,description 
+# The description will be used as a prefix for features of the file to avoid  
+# duplicated names. It can be left empty if there is no risk of duplicated names.
+# (ex:  newDataFile=myproteinsfile, protein
+#       newDataFile=mygenesfile, genes
+#       newDataFile=mymetadatafile).
+# Default: newDataFile=*empty*
+# Default: modelFile=*empty*
+
+#newDataFile=newDataFile1.csv, description
+#newDataFile=newDataFile2.csv
+#modelFile=model.model
+
+
+## Merging 
+# Merging identifier, used if you have many files to merge. It is expected to be
+# found in the first column of every files.
+# Only rows containing identifiers that exist in all files will be considered in 
+# the analysis.
+# Default: mergingID=*empty*
+
+#mergingID=identifier
+
+
+## Sampling
+# Perform sampling to create a random validation set not used during training and
+# used for further evaluation. 
+# Default: sampling=true
+
+sampling=true
+
+# The samplingFold option separate the set in x parts, keep 1 for validation, others 
+# for training.
+# e.g. samplingFold=3 means that the validation set will be composed of 1/3 of the 
+# input data.
+# Ignored if sampling=false
+# Default: samplingFold=3
+
+samplingFold=3
+
+# Instead of random sampling, you can provide a validation file on which the models 
+# will be tested. 
+# Note that the validation file must contain the same structure and features as the 
+# train file.
+# You can also provide several validation files, they will be merged.
+# If set, samplingFold options will be ignored. 
+# Ignored if sampling=false
+# Default: validationFile=*empty*
+
+#validationFile=validationFile1.csv
+
+
+## Feature exclusion
+# Features to exclude from the dataset (separated by commas(,)).
+# Do not exclude the identifier (usually the first column).
+# Default: excluded=*empty*
+
+#excluded=columnA,columnB
+
+
+## Best model auto-selection 
+# A specified number of models will be generated here, along with various performance 
+# metrics and correlated features for each one. Choose how many best models to 
+# generate and the metric on which the models will be sorted.
+# Instead of a specific number of models, a threshold can also be set.
+# Models will be selected based on both numberOfBestModels and 
+# numberOfBestModelsSortingMetricThreshold conditions
+# Metrics can be any column of the results file: 
+# 	We prefer those for classification: TEST_MCC, TEST_BER, TRAIN_TEST_BS_MCC, 
+#                                           TRAIN_TEST_BS_BER, AVG_BER, AVG_MCC 
+# 	We prefer those for regression: TEST_CC, TEST_RMSE, TRAIN_TEST_BS_CC, 
+#                                       TRAIN_TEST_BS_RMSE, AVG_RMSE, AVG_CC 
+# Examples: AVG_MCC at 0.6, AVG_RMSE at 0.3, AVG_CC at 0.8
+# See commands in readme.txt to extract specific models
+# Default: computeBestModel=true
+#    numberOfBestModels=1 
+#    numberOfBestModelsSortingMetric=AVG_MCC
+#    numberOfBestModelsSortingMetricThreshold=0.1
+
+computeBestModel=true
+numberOfBestModels=1
+numberOfBestModelsSortingMetric=AVG_MCC
+numberOfBestModelsSortingMetricThreshold=0.1
+
+
+## Combine models
+# If true, only one model will be computed using a combination of all models 
+# selected with best models options. 
+# Combination rules: 
+# 	AVG (Average of probabilities)
+#	PROD (Product of probabilities)
+#	MAJ (Majority voting)
+#	MED (Median)
+# Default: combineModels=false
+#   combinationRule=AVG
+
+combineModels=false
+combinationRule=AVG
+
+
+########################
+### ADVANCED OPTIONS ###
+########################
+
+## Debug to show more outputs
+# 2 levels of verbose, debug and debug2
+# Also possibility to print failed models with error explanation
+# Default:  debug=false
+#           debug2=false
+#           printFailedModels=false
+
+debug=false
+debug2=false
+printFailedModels=false
+
+## Maximum number of cpus to use (enter a value or "max").
+# BioDiscML runs in low priority by regularly checking cpus available. So
+# you can execute other softwares on your server and it will adapt itself. 
+# Just be careful to available memory, limit number of cpus used to avoid out 
+# of memory exception. 
+# Default: cpus=max
+
+cpus=max
+
+
+## The separator (delimiter) of infiles will be detected automatically. 
+# It is however possible to set it, but it must exist for all files.
+# Default: separator=*empty*
+
+#separator=\t
+
+## Leave-One-Out cross validation
+# If you have a very large set of samples (more than 2000), it may be better
+# to skip Leave-One-Out cross validation by setting loocv to false
+# Default: loocv=true
+
+loocv=true
+
+
+## Criterion (metrics) optimizers
+# To test if a model generated with a feature subset is better with another 
+# subset, we use various criterions as comparison metrics. 
+# You can limit the list of criterions if wanted. 
+# Avalaible criterions for classification:AUC, MCC, FDR, BER, ACC, TPR, TNR, 
+# 	kappa, AUPRC, Fscore, Precision, Recall, TP+FN
+# Default: coptimizers=AUC, MCC, FDR, BER, ACC
+
+coptimizers=AUC, MCC, FDR, BER, ACC
+
+
+## Search modes
+# various search modes are implemented, including topX features according to 
+# information gain ranking, stepwise search (Forward(F), Forward-Backward(FB), 
+# Backward(B) and Backward-Forward(BF)) and all features (all)
+searchmodes=F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100
+
+# Regression criterions
+# Available criterions for regression: CC, MAE, RMSE, RAE, RRSE 
+# Default: roptimizers=CC, RMSE
+
+roptimizers=CC, RMSE 
+
+
+## Maximum number of features kept after feature selection ranking.
+# Higher this number is, longer will be the training.
+# Default: maxNumberOfSelectedFeatures = 1000
+
+maxNumberOfSelectedFeatures = 1000
+
+
+## Maximum number of features models can have.
+# Default: maxNumberOfFeaturesInModel = 200
+
+maxNumberOfFeaturesInModel = 200
+
+
+## Bootstrap and repeated holdout folds
+# Default: bootstrapFolds=100
+
+bootstrapFolds=100
+
+# Run without feature selection
+# If true, maxNumberOfSelectedFeatures and maxNumberOfFeaturesInModel 
+# will be set to maximum (which is the number of features in the dataset).
+# Also, if true, available search mode won't be executed
+# Default: noFeatureSelection=false
+noFeatureSelection=false
+
+
+## Correlated features
+# Thresholds for Spearman and Pearson correlations
+# Correlated feature search can be disabled by setting retrieveCorrelatedGenes to 
+# false
+# Default: retrieveCorrelatedGenes=true
+# Default: spearmanCorrelation_lower = -0.99
+#   spearmanCorrelation_upper = 0.99
+#   pearsonCorrelation_lower = -0.99
+#   pearsonCorrelation_upper = 0.99
+
+retrieveCorrelatedGenes=true
+spearmanCorrelation_lower = -0.99
+spearmanCorrelation_upper = 0.99
+pearsonCorrelation_lower = -0.99
+pearsonCorrelation_upper = 0.99
+
+
+# Retrieve features based on equivalent infogain or relieff ranking score
+# Default: maxRankingScoreDifference = 0.005
+# 	retreiveCorrelatedGenesByRankingScore=false
+
+maxRankingScoreDifference = 0.005
+retreiveCorrelatedGenesByRankingScore=false
+
+# Create model with correlated genes
+# Default: generateModelWithCorrelatedGenes = false
+
+
+## RUN mode
+# BioDiscML will test all available classifier algorithms. If you wish to choose
+# specific classifiers, you'll need to use the fast way mode and provide a 
+# list of classifiers configurations (classifier name and hyperparameters). 
+# Please use Weka GUI to help you choose the configurations
+# For each configuration, you'll need to provide what optimizer to use.
+
+# Fast mode classification
+# Usage: cfcmd=classifier with options,optimizer.
+# Available optimizers: AUC,ACC,SEN,SPE,MCC,TP+FN,kappa and ALLOPT (all optimizers)
+# Available search modes: F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100,top200,all and ALLSEARCH (all search modes)
+# If no optimizer or search modes are provided, they will all be tested (equivalent to provide ALLOPT and ALLSEARCH)
+# Default: classificationFastWay=false
+#   ccmd=*empty*    
+
+classificationFastWay=false
+ccmd=bayes.NaiveBayes -K, SEN
+ccmd=bayes.NaiveBayes -K, AUC
+ccmd=misc.VFI -B 0.4, SEN
+ccmd=misc.VFI -B 0.4, AUC
+ccmd=misc.VFI -B 0.4, AUC, F
+ccmd=misc.VFI -B 0.6
+ccmd=misc.VFI -B 0.6, ALLOPT, FB
+ccmd=trees.J48
+
+# Fast mode regression
+# Usage: rfcmd=classifier with options,optimizer.
+# Available optimizers: CC, MAE, RMSE, RAE, RRSE. 
+# Default: regressionFastWay=false
+#   rcmd=*empty* 
+
+regressionFastWay=false
+rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", CC
+rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", RMSE
+
+
+# #### End of configuration file #### #