---
# Machine-learning settings (exseek/config/machine_learning.yaml).
# Sections, in order: input options, grid search used during feature
# selection, preprocessing steps, feature-selector definitions, classifier
# definitions, evaluation cross-validation, and the lists of selector and
# classifier names.

# NOTE(review): the meaning of a null `features` value and of `transpose`
# is not visible in this file — confirm against the consuming code.
features: null
transpose: true

# Grid search around the feature selectors.
# NOTE(review): `iid` and `scoring` are kept as siblings of `cv` (search-level
# options rather than splitter arguments) — confirm against the loader.
# NOTE(review): scikit-learn deprecated GridSearchCV's `iid` in 0.22 and
# removed it in 0.24 — check the pinned version if this mapping is forwarded
# to GridSearchCV unchanged.
selector_grid_search: true
selector_grid_search_params:
  cv:
    splitter: StratifiedShuffleSplit
    n_splits: 5
    test_size: 0.1
  iid: false
  scoring: roc_auc

# Ordered preprocessing steps; each list item is a single-key mapping whose
# key is the step label.
preprocess_steps:
  # log-transform the values (base 2, pseudo-count of 1)
  - log_transform:
      name: LogTransform
      type: transformer
      enabled: true
      params:
        base: 2
        pseudo_count: 1
  # scale each feature across samples
  - scale_features:
      name: StandardScaler
      type: scaler
      enabled: true
      params:
        with_mean: true

# Definitions of every available feature selector, keyed by selector label.
feature_selector_params:
  DiffExp_TTest:
    name: DiffExpFilter
    type: selector
    params:
      # rscript_path: /usr/bin/Rscript
      score_type: neglogp
      method: ttest
  MaxFeatures_RandomForest:
    name: MaxFeatures
    type: selector
    params:
      classifier: RandomForestClassifier
      grid_search: true
      grid_search_params:
        param_grid:
          n_estimators: [25, 50, 75]
          max_depth: [3, 4, 5]
  MaxFeatures_LogRegL2:
    name: MaxFeatures
    type: selector
    params:
      classifier: LogisticRegression
      # settings of the classifier that drives feature selection
      classifier_params:
        penalty: l2
        solver: liblinear
      # hyper-parameter grid searched for that classifier
      grid_search: true
      grid_search_params:
        param_grid:
          C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1,
              10, 100, 1000, 10000, 100000]
  MaxFeatures_LogRegL1:
    name: MaxFeatures
    type: selector
    params:
      classifier: LogisticRegression
      # settings of the classifier that drives feature selection
      classifier_params:
        penalty: l1
        solver: liblinear
      # hyper-parameter grid searched for that classifier
      grid_search: true
      grid_search_params:
        param_grid:
          C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1,
              10, 100, 1000, 10000, 100000]
  MaxFeatures_ElasticNet:
    name: MaxFeatures
    type: selector
    params:
      classifier: SGDClassifier
      classifier_params:
        penalty: elasticnet
        max_iter: 100
        tol: 0.001
      grid_search: true
      grid_search_params:
        param_grid:
          alpha: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1,
                  10, 100, 1000, 10000, 100000]
          l1_ratio: [0.15, 0.30, 0.45, 0.60, 0.75, 0.90]
  SIS:
    name: SIS
    type: selector
    params:
      # rscript_path: /usr/bin/Rscript
      sis_params:
        family: binomial
        tune: bic
        penalty: lasso
        varISIS: cons
  ReliefF:
    name: ReliefF
    type: selector
    params:
      n_jobs: 1
  SURF:
    name: SURF
    type: selector
    params:
      n_jobs: 1
  MultiSURF:
    name: MultiSURF
    type: selector
    params:
      n_jobs: 1
  RandomSubset_RandomForest:
    name: RandomSubsetSelector
    type: selector
    params:
      subset_size: 50
      n_subsets: 20
      classifier: RandomForestClassifier
      grid_search: true
      grid_search_params:
        param_grid:
          n_estimators: [25, 50, 75]
          max_depth: [3, 4, 5]
  RandomSubset_LogRegL2:
    name: RandomSubsetSelector
    type: selector
    params:
      subset_size: 50
      n_subsets: 20
      classifier: LogisticRegression
      # settings of the classifier that drives feature selection
      classifier_params:
        penalty: l2
        solver: liblinear
      # hyper-parameter grid searched for that classifier
      grid_search: true
      grid_search_params:
        param_grid:
          C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1,
              10, 100, 1000, 10000, 100000]
  RandomSubset_LogRegL1:
    name: RandomSubsetSelector
    type: selector
    params:
      subset_size: 50
      n_subsets: 20
      classifier: LogisticRegression
      # settings of the classifier that drives feature selection
      classifier_params:
        penalty: l1
        solver: liblinear
      # hyper-parameter grid searched for that classifier
      grid_search: true
      grid_search_params:
        param_grid:
          C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1,
              10, 100, 1000, 10000, 100000]

# Shared grid-search template for the classifiers below.
# NOTE(review): same `iid` caveat and `cv`-sibling placement as in
# selector_grid_search_params — confirm.
classifier_grid_search_params:
  cv:
    splitter: StratifiedShuffleSplit
    n_splits: 5
    test_size: 0.1
  iid: false
  scoring: roc_auc

# Definitions of every available classifier, keyed by classifier label.
classifier_params:
  LogRegL2:
    classifier: LogisticRegression
    # fixed settings of the classifier
    classifier_params:
      penalty: l2
      solver: liblinear
    # hyper-parameter grid searched for the classifier
    grid_search: true
    grid_search_params:
      param_grid:
        C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1,
            10, 100, 1000, 10000, 100000]
  RandomForest:
    classifier: RandomForestClassifier
    grid_search: true
    grid_search_params:
      param_grid:
        n_estimators: [25, 50, 75]
        max_depth: [3, 4, 5]
  RBFSVM:
    classifier: SVC
    classifier_params:
      kernel: rbf
      gamma: scale
    grid_search: true
    grid_search_params:
      param_grid:
        C: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1,
            10, 100, 1000, 10000, 100000]
  DecisionTree:
    classifier: DecisionTreeClassifier
    grid_search: true
    grid_search_params:
      param_grid:
        max_depth: [2, 3, 4, 5, 6, 7, 8]
  MLP:
    classifier: MLPClassifier
    classifier_params:
      activation: relu
      solver: adam
      max_iter: 40
    grid_search: true
    grid_search_params:
      param_grid:
        # each candidate is a one-element list (a single layer size)
        hidden_layer_sizes:
          - [50]
          - [100]
          - [150]
          - [200]
          - [250]
          - [300]

# Cross-validation used for performance evaluation.
cv_params:
  splitter: StratifiedShuffleSplit
  # how many train-test splits to draw
  n_splits: 5
  # test-set size, as a count or a proportion of the samples
  test_size: 0.1
  # metric used to score performance
  scoring: roc_auc
# How sample weights are computed.
# balanced: derive the weights from the data so that classes are balanced
sample_weight: balanced

# Names of the feature selectors (keys of feature_selector_params).
feature_selectors:
  - DiffExp_TTest
  - MaxFeatures_RandomForest
  - MaxFeatures_LogRegL2
  - MaxFeatures_LogRegL1
  - MaxFeatures_ElasticNet
  - SIS
  - ReliefF
  - SURF
  - MultiSURF
  - RandomSubset_RandomForest
  - RandomSubset_LogRegL2
  - RandomSubset_LogRegL1
# Names of the classifiers (keys of classifier_params).
classifiers:
  - LogRegL2
  - RandomForest
  - RBFSVM
  - DecisionTree
  - MLP