Switch to side-by-side view

--- a
+++ b/exseek/config/machine_learning.yaml
@@ -0,0 +1,233 @@
+features: null
+transpose: true
+
+selector_grid_search: true
+selector_grid_search_params:
+  cv:
+    splitter: StratifiedShuffleSplit
+    n_splits: 5
+    test_size: 0.1
+  iid: false
+  scoring: roc_auc
+
+preprocess_steps:
+  # apply log transformation
+  - log_transform:
+      name: LogTransform
+      type: transformer
+      enabled: true
+      params:
+        base: 2
+        pseudo_count: 1
+  # method to scale features across samples
+  - scale_features:
+      name: StandardScaler
+      type: scaler
+      enabled: true
+      params:
+        with_mean: true
+
+# all possible selectors
+feature_selector_params:
+  DiffExp_TTest:
+    name: DiffExpFilter
+    type: selector
+    params:
+      # rscript_path: /usr/bin/Rscript
+      score_type: neglogp
+      method: ttest
+  MaxFeatures_RandomForest:
+    name: MaxFeatures
+    type: selector
+    params:
+      classifier: RandomForestClassifier
+      grid_search: true
+      grid_search_params:
+        param_grid:
+          n_estimators: [25, 50, 75]
+          max_depth: [3, 4, 5]
+  MaxFeatures_LogRegL2:
+    name: MaxFeatures
+    type: selector
+    params:
+      classifier: LogisticRegression
+      # parameters for the classifier used for feature selection
+      classifier_params:
+        penalty: l2
+        solver: liblinear
+      # grid search for hyper-parameters for the classifier
+      grid_search: true
+      grid_search_params:
+        param_grid:
+          C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
+  MaxFeatures_LogRegL1:
+    name: MaxFeatures
+    type: selector
+    params:
+      classifier: LogisticRegression
+      # parameters for the classifier used for feature selection
+      classifier_params:
+        penalty: l1
+        solver: liblinear
+      # grid search for hyper-parameters for the classifier
+      grid_search: true
+      grid_search_params:
+        param_grid:
+          C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
+  MaxFeatures_ElasticNet:
+    name: MaxFeatures
+    type: selector
+    params:
+      classifier: SGDClassifier
+      classifier_params:
+        penalty: elasticnet
+        max_iter: 100
+        tol: 0.001
+      grid_search: true
+      grid_search_params:
+        param_grid:
+          alpha: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
+          l1_ratio: [0.15, 0.30, 0.45, 0.60, 0.75, 0.90]
+  SIS:
+    name: SIS
+    type: selector
+    params:
+      # rscript_path: /usr/bin/Rscript
+      sis_params:
+        family: binomial
+        tune: bic
+        penalty: lasso
+        varISIS: cons
+  ReliefF:
+    name: ReliefF
+    type: selector
+    params:
+      n_jobs: 1
+  SURF:
+    name: SURF
+    type: selector
+    params:
+      n_jobs: 1
+  MultiSURF:
+    name: MultiSURF
+    type: selector
+    params:
+      n_jobs: 1
+  RandomSubset_RandomForest:
+    name: RandomSubsetSelector
+    type: selector
+    params:
+      subset_size: 50
+      n_subsets: 20
+      classifier: RandomForestClassifier
+      grid_search: true
+      grid_search_params:
+        param_grid:
+          n_estimators: [25, 50, 75]
+          max_depth: [3, 4, 5]
+  RandomSubset_LogRegL2:
+    name: RandomSubsetSelector
+    type: selector
+    params:
+      subset_size: 50
+      n_subsets: 20
+      classifier: LogisticRegression
+      # parameters for the classifier used for feature selection
+      classifier_params:
+        penalty: l2
+        solver: liblinear
+      # grid search for hyper-parameters for the classifier
+      grid_search: true
+      grid_search_params:
+        param_grid:
+          C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
+  RandomSubset_LogRegL1:
+    name: RandomSubsetSelector
+    type: selector
+    params:
+      subset_size: 50
+      n_subsets: 20
+      classifier: LogisticRegression
+      # parameters for the classifier used for feature selection
+      classifier_params:
+        penalty: l1
+        solver: liblinear
+      # grid search for hyper-parameters for the classifier
+      grid_search: true
+      grid_search_params:
+        param_grid:
+          C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
+
+# template for grid_search_params in classifiers
+classifier_grid_search_params:
+  cv:
+    splitter: StratifiedShuffleSplit
+    n_splits: 5
+    test_size: 0.1
+  iid: false
+  scoring: roc_auc
+
+classifier_params:
+  LogRegL2:
+    classifier: LogisticRegression
+    # parameters for the classifier used for feature selection
+    classifier_params:
+      penalty: l2
+      solver: liblinear
+    # grid search for hyper-parameters for the classifier
+    grid_search: true
+    grid_search_params:
+      param_grid:
+        C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
+  RandomForest:
+    classifier: RandomForestClassifier
+    grid_search: true
+    grid_search_params:
+      param_grid:
+        n_estimators: [25, 50, 75]
+        max_depth: [3, 4, 5]
+  RBFSVM:
+    classifier: SVC
+    classifier_params:
+      kernel: rbf
+      gamma: scale
+    grid_search: true
+    grid_search_params:
+      param_grid:
+        C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
+  DecisionTree:
+    classifier: DecisionTreeClassifier
+    grid_search: true
+    grid_search_params:
+      param_grid:
+        max_depth: [2, 3, 4, 5, 6, 7, 8]
+  MLP:
+    classifier: MLPClassifier
+    classifier_params:
+      activation: relu
+      solver: adam
+      max_iter: 40
+    grid_search: true
+    grid_search_params:
+      param_grid:
+        hidden_layer_sizes: [[50], [100], [150], [200], [250], [300]]
+
+# cross-validation parameters for performance evaluation
+cv_params:
+  splitter: StratifiedShuffleSplit
+  # number of train-test splits for cross-validation
+  n_splits: 5
+  # number or proportion of samples to use as test set
+  test_size: 0.1
+  # scoring metric for performance evaluation
+  scoring: roc_auc
+# method for computing sample weight
+#  balanced: compute sample weight from data such that classes are balanced
+sample_weight: balanced
+
+# list of feature selector names
+feature_selectors: [DiffExp_TTest, MaxFeatures_RandomForest, MaxFeatures_LogRegL2, MaxFeatures_LogRegL1,
+  MaxFeatures_ElasticNet, SIS, ReliefF, SURF, MultiSURF, RandomSubset_RandomForest,
+  RandomSubset_LogRegL2, RandomSubset_LogRegL1]
+# list of classifier names
+classifiers: [LogRegL2, RandomForest, RBFSVM, DecisionTree, MLP]