--- /dev/null
+++ b/src/scpanel/SVMRFECV.py
@@ -0,0 +1,742 @@
+"""Recursive feature elimination for feature ranking"""
+
+import math
+import numbers
+
+import numpy as np
+from joblib import Parallel, effective_n_jobs
+from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier
+from sklearn.feature_selection._base import SelectorMixin, _get_feature_importances
+from sklearn.metrics import check_scoring
+from sklearn.model_selection import check_cv
+from sklearn.model_selection._validation import _score
+from sklearn.utils._tags import _safe_tags
+from sklearn.utils.deprecation import deprecated
+from sklearn.utils.fixes import delayed
+from sklearn.utils.metaestimators import _safe_split, if_delegate_has_method
+from sklearn.utils.validation import check_is_fitted
+from numpy import ndarray
+from sklearn.svm._classes import SVC
+from typing import Dict, List, Optional
+
+# def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
+#     """
+#     Return the score for a fit across one fold.
+#     """
+#     X_train, y_train = _safe_split(estimator, X, y, train)
+#     X_test, y_test = _safe_split(estimator, X, y, test, train)
+#     return rfe._fit(
+#         X_train,
+#         y_train,
+#         lambda estimator, features: _score(
+#             estimator, X_test[:, features], y_test, scorer
+#         ),
+#     ).scores_
+
+
+def _rfe_single_fit(
+    rfe, estimator, X, y, train_idx, val_idx, scorer, sample_weight=None
+):
+    """
+    Return the scores and the feature ranking for a fit across one fold.
+    """
+    X_train, y_train = X[train_idx], y[train_idx]
+    X_test, y_test = X[val_idx], y[val_idx]
+
+    def step_score(estimator, features):
+        return _score(estimator, X_test[:, features], y_test, scorer)
+
+    # Fit the RFE once and read both attributes off the fitted object;
+    # fitting separately for `scores_` and `ranking_` would double the work.
+    fit_params = {} if sample_weight is None else {"sample_weight": sample_weight}
+    fitted = rfe._fit(X_train, y_train, step_score, **fit_params)
+    return fitted.scores_, fitted.ranking_
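+
+
+# A minimal sketch of how `_rfe_single_fit` is driven for one fold (`X`, `y`,
+# `train_idx` and `val_idx` are hypothetical arrays; kept as a comment so that
+# importing this module stays side-effect free):
+#
+#     rfe = RFE(SVC(kernel="linear"), n_features_to_select=1)
+#     scores, ranking = _rfe_single_fit(
+#         rfe, rfe.estimator, X, y, train_idx, val_idx,
+#         check_scoring(rfe.estimator, scoring="accuracy"),
+#     )
+#     # `scores` holds one validation score per elimination step and
+#     # `ranking` the per-feature ranks learned on this fold's training data.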
+
+
+class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
+    """Feature ranking with recursive feature elimination.
+    Given an external estimator that assigns weights to features (e.g., the
+    coefficients of a linear model), the goal of recursive feature elimination
+    (RFE) is to select features by recursively considering smaller and smaller
+    sets of features. First, the estimator is trained on the initial set of
+    features and the importance of each feature is obtained either through
+    any specific attribute or callable.
+    Then, the least important features are pruned from the current set of
+    features. That procedure is recursively repeated on the pruned set until
+    the desired number of features to select is eventually reached.
+    Read more in the :ref:`User Guide <rfe>`.
+    Parameters
+    ----------
+    estimator : ``Estimator`` instance
+        A supervised learning estimator with a ``fit`` method that provides
+        information about feature importance
+        (e.g. `coef_`, `feature_importances_`).
+    n_features_to_select : int or float, default=None
+        The number of features to select. If `None`, half of the features are
+        selected. If integer, the parameter is the absolute number of features
+        to select. If float between 0 and 1, it is the fraction of features to
+        select.
+        .. versionchanged:: 0.24
+           Added float values for fractions.
+    step : float, default=1
+        Fraction of the features still remaining to remove at each iteration
+        (rounded up). Unlike scikit-learn's RFE, this variant always treats
+        ``step`` as a fraction of the remaining features, so the default of 1
+        eliminates all excess features in a single round.
+    verbose : int, default=0
+        Controls verbosity of output.
+    importance_getter : str or callable, default='auto'
+        If 'auto', uses the feature importance either through a `coef_`
+        or `feature_importances_` attributes of estimator.
+        Also accepts a string that specifies an attribute name/path
+        for extracting feature importance (implemented with `attrgetter`).
+        For example, give `regressor_.coef_` in case of
+        :class:`~sklearn.compose.TransformedTargetRegressor` or
+        `named_steps.clf.feature_importances_` in case of
+        :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.
+        If `callable`, overrides the default feature importance getter.
+        The callable is passed with the fitted estimator and it should
+        return importance for each feature.
+        .. versionadded:: 0.24
+    Attributes
+    ----------
+    classes_ : ndarray of shape (n_classes,)
+        The class labels. Only available when `estimator` is a classifier.
+    estimator_ : ``Estimator`` instance
+        The fitted estimator used to select features.
+    importances_ : list of ndarray
+        The feature importances collected at each elimination round, one
+        array per round over the features still remaining at that round.
+    n_features_ : int
+        The number of selected features.
+    n_features_in_ : int
+        Number of features seen during :term:`fit`. Only defined if the
+        underlying estimator exposes such an attribute when fit.
+        .. versionadded:: 0.24
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+        .. versionadded:: 1.0
+    ranking_ : ndarray of shape (n_features,)
+        The feature ranking, such that ``ranking_[i]`` corresponds to the
+        ranking position of the i-th feature. Selected (i.e., estimated
+        best) features are assigned rank 1.
+    support_ : ndarray of shape (n_features,)
+        The mask of selected features.
+    See Also
+    --------
+    RFECV : Recursive feature elimination with built-in cross-validated
+        selection of the best number of features.
+    SelectFromModel : Feature selection based on thresholds of importance
+        weights.
+    SequentialFeatureSelector : Sequential cross-validation based feature
+        selection. Does not rely on importance weights.
+    Notes
+    -----
+    Allows NaN/Inf in the input if the underlying estimator does as well.
+    References
+    ----------
+    .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection
+           for cancer classification using support vector machines",
+           Mach. Learn., 46(1-3), 389--422, 2002.
+    Examples
+    --------
+    The following example shows how to retrieve the 5 most informative
+    features in the Friedman #1 dataset.
+    >>> from sklearn.datasets import make_friedman1
+    >>> from sklearn.svm import SVR
+    >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
+    >>> estimator = SVR(kernel="linear")
+    >>> selector = RFE(estimator, n_features_to_select=5, step=1)
+    >>> selector = selector.fit(X, y)
+    >>> int(selector.support_.sum())
+    5
+    >>> selector.ranking_.shape
+    (10,)
+    """
+
+    def __init__(
+        self,
+        estimator: SVC,
+        *,
+        n_features_to_select=None,
+        step=1,
+        verbose=0,
+        importance_getter="auto",
+    ) -> None:
+        self.estimator = estimator
+        self.n_features_to_select = n_features_to_select
+        self.step = step
+        self.importance_getter = importance_getter
+        self.verbose = verbose
+
+    @property
+    def _estimator_type(self):
+        return self.estimator._estimator_type
+
+    @property
+    def classes_(self):
+        """Class labels available when `estimator` is a classifier.
+        Returns
+        -------
+        ndarray of shape (n_classes,)
+        """
+        return self.estimator_.classes_
+
+    def fit(self, X: ndarray, y: ndarray, **fit_params) -> "RFE":
+        """Fit the RFE model and then the underlying estimator on the selected features.
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The training input samples.
+        y : array-like of shape (n_samples,)
+            The target values.
+        **fit_params : dict
+            Additional parameters passed to the `fit` method of the underlying
+            estimator.
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        return self._fit(X, y, **fit_params)
+
+    def _fit(self, X: ndarray, y: ndarray, step_score=None, **fit_params) -> "RFE":
+        # Parameter step_score controls the calculation of self.scores_.
+        # step_score is not exposed to users and is used when implementing
+        # RFECV; self.scores_ will not be calculated when calling _fit
+        # through fit.
+
+        tags = self._get_tags()
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse="csc",
+            ensure_min_features=2,
+            force_all_finite=not tags.get("allow_nan", True),
+            multi_output=True,
+        )
+        error_msg = (
+            "n_features_to_select must be either None, a "
+            "positive integer representing the absolute "
+            "number of features or a float in (0.0, 1.0] "
+            "representing a percentage of features to "
+            f"select. Got {self.n_features_to_select}"
+        )
+
+        # Initialization
+        n_features = X.shape[1]
+        if self.n_features_to_select is None:
+            n_features_to_select = n_features // 2
+        elif self.n_features_to_select < 0:
+            raise ValueError(error_msg)
+        elif isinstance(self.n_features_to_select, numbers.Integral):  # int
+            n_features_to_select = self.n_features_to_select
+        elif self.n_features_to_select > 1.0:  # float > 1
+            raise ValueError(error_msg)
+        else:  # float in (0.0, 1.0]
+            n_features_to_select = int(n_features * self.n_features_to_select)
+
+        # if 0.0 < self.step < 1.0:
+        #     step = int(max(1, self.step * n_features))
+        # else:
+        #     step = int(self.step)
+        # if step <= 0:
+        #     raise ValueError("Step must be >0")
+
+        support_ = np.ones(n_features, dtype=bool)
+        ranking_ = np.ones(n_features, dtype=int)
+
+        if step_score:
+            self.scores_ = []
+
+        # Collect the feature importance scores from each round of elimination
+        self.importances_ = []
+
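+        # Note on `step` semantics in the loop below (this variant differs
+        # from stock scikit-learn): `step` is taken as a fraction of the
+        # features *still remaining*, so the number eliminated shrinks as the
+        # loop progresses. A hypothetical trace with n_features=100, step=0.2
+        # and n_features_to_select=10:
+        #     round 1: ceil(0.2 * 100) = 20 removed -> 80 features left
+        #     round 2: ceil(0.2 * 80)  = 16 removed -> 64 features left
+        #     ... and so on, until only 10 remain.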
+        # Elimination
+        while np.sum(support_) > n_features_to_select:
+            # Remaining features
+            features = np.arange(n_features)[support_]
+
+            # Rank the remaining features
+            estimator = clone(self.estimator)
+            if self.verbose > 0:
+                print("Fitting estimator with %d features." % np.sum(support_))
+
+            estimator.fit(X[:, features], y, **fit_params)
+
+            # Get importances and rank them
+            importances = _get_feature_importances(
+                estimator,
+                self.importance_getter,
+                transform_func="square",
+            )
+            ranks = np.argsort(importances)
+
+            # for sparse case ranks is matrix
+            ranks = np.ravel(ranks)
+
+            # Fraction of the remaining features to eliminate in this round
+            nstep = self.step * np.sum(support_)
+
+            # Eliminate the worst features
+            threshold = min(math.ceil(nstep), np.sum(support_) - n_features_to_select)
+
+            # Compute step score on the previous selection iteration because
+            # 'estimator' must use features that have not been eliminated yet
+            if step_score:
+                self.scores_.append(step_score(estimator, features))
+
+            self.importances_.append(importances)
+
+            support_[features[ranks][:threshold]] = False
+            ranking_[np.logical_not(support_)] += 1
+
+        # Set final attributes
+        features = np.arange(n_features)[support_]
+        self.estimator_ = clone(self.estimator)
+        self.estimator_.fit(X[:, features], y, **fit_params)
+
+        # Compute step score when only n_features_to_select features are left
+        if step_score:
+            self.scores_.append(step_score(self.estimator_, features))
+        self.n_features_ = support_.sum()
+        self.support_ = support_
+        self.ranking_ = ranking_
+
+        return self
+
+    @if_delegate_has_method(delegate="estimator")
+    def predict(self, X):
+        """Reduce X to the selected features and then predict using the underlying estimator.
+        Parameters
+        ----------
+        X : array of shape [n_samples, n_features]
+            The input samples.
+        Returns
+        -------
+        y : array of shape [n_samples]
+            The predicted target values.
+        """
+        check_is_fitted(self)
+        return self.estimator_.predict(self.transform(X))
+
+    @if_delegate_has_method(delegate="estimator")
+    def score(self, X, y, **fit_params):
+        """Reduce X to the selected features and return the score of the underlying estimator.
+        Parameters
+        ----------
+        X : array of shape [n_samples, n_features]
+            The input samples.
+        y : array of shape [n_samples]
+            The target values.
+        **fit_params : dict
+            Parameters to pass to the `score` method of the underlying
+            estimator.
+            .. versionadded:: 1.0
+        Returns
+        -------
+        score : float
+            Score of the underlying base estimator computed with the selected
+            features returned by `rfe.transform(X)` and `y`.
+        """
+        check_is_fitted(self)
+        return self.estimator_.score(self.transform(X), y, **fit_params)
+
+    def _get_support_mask(self):
+        check_is_fitted(self)
+        return self.support_
+
+    @if_delegate_has_method(delegate="estimator")
+    def decision_function(self, X):
+        """Compute the decision function of ``X``.
+        Parameters
+        ----------
+        X : {array-like or sparse matrix} of shape (n_samples, n_features)
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csr_matrix``.
+        Returns
+        -------
+        score : array, shape = [n_samples, n_classes] or [n_samples]
+            The decision function of the input samples. The order of the
+            classes corresponds to that in the attribute :term:`classes_`.
+            Regression and binary classification produce an array of shape
+            [n_samples].
+        """
+        check_is_fitted(self)
+        return self.estimator_.decision_function(self.transform(X))
+
+    @if_delegate_has_method(delegate="estimator")
+    def predict_proba(self, X):
+        """Predict class probabilities for X.
+        Parameters
+        ----------
+        X : {array-like or sparse matrix} of shape (n_samples, n_features)
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csr_matrix``.
+        Returns
+        -------
+        p : array of shape (n_samples, n_classes)
+            The class probabilities of the input samples. The order of the
+            classes corresponds to that in the attribute :term:`classes_`.
+        """
+        check_is_fitted(self)
+        return self.estimator_.predict_proba(self.transform(X))
+
+    @if_delegate_has_method(delegate="estimator")
+    def predict_log_proba(self, X):
+        """Predict class log-probabilities for X.
+        Parameters
+        ----------
+        X : array of shape [n_samples, n_features]
+            The input samples.
+        Returns
+        -------
+        p : array of shape (n_samples, n_classes)
+            The class log-probabilities of the input samples. The order of the
+            classes corresponds to that in the attribute :term:`classes_`.
+        """
+        check_is_fitted(self)
+        return self.estimator_.predict_log_proba(self.transform(X))
+
+    def _more_tags(self) -> Dict[str, bool]:
+        return {
+            "poor_score": True,
+            "allow_nan": _safe_tags(self.estimator, key="allow_nan"),
+            "requires_y": True,
+        }
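+
+
+# A minimal sketch of this RFE variant on its own (`X` and `y` are
+# hypothetical arrays; kept as a comment to avoid import-time side effects):
+#
+#     selector = RFE(SVC(kernel="linear"), n_features_to_select=5, step=0.2)
+#     selector.fit(X, y)
+#     selector.support_         # boolean mask of the 5 surviving features
+#     selector.importances_[0]  # squared SVM coefficients from round one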
+
+
+class RFECV(RFE):
+    """Recursive feature elimination with cross-validated scoring over user-supplied folds.
+    See glossary entry for :term:`cross-validation estimator`.
+    Read more in the :ref:`User Guide <rfe>`.
+    Parameters
+    ----------
+    estimator : ``Estimator`` instance
+        A supervised learning estimator with a ``fit`` method that provides
+        information about feature importance either through a ``coef_``
+        attribute or through a ``feature_importances_`` attribute.
+    step : float, default=1
+        Fraction of the remaining features to remove at each iteration
+        (rounded up); see :class:`RFE` for this variant's interpretation of
+        ``step``. The last iteration may remove fewer features in order to
+        reach ``min_features_to_select``.
+    min_features_to_select : int, default=1
+        The minimum number of features to be selected. This number of features
+        will always be scored, even if the difference between the original
+        feature count and ``min_features_to_select`` isn't divisible by
+        ``step``.
+        .. versionadded:: 0.20
+    cv : int, cross-validation generator or an iterable, default=None
+        Retained for API compatibility but currently unused: the folds are
+        passed explicitly to :meth:`fit` via ``train_idx_list`` and
+        ``val_idx_list``.
+    scoring : str, callable or None, default=None
+        A string (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+    verbose : int, default=0
+        Controls verbosity of output.
+    n_jobs : int or None, default=None
+        Number of cores to run in parallel while fitting across folds.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+        .. versionadded:: 0.18
+    importance_getter : str or callable, default='auto'
+        If 'auto', uses the feature importance either through a `coef_`
+        or `feature_importances_` attributes of estimator.
+        Also accepts a string that specifies an attribute name/path
+        for extracting feature importance.
+        For example, give `regressor_.coef_` in case of
+        :class:`~sklearn.compose.TransformedTargetRegressor` or
+        `named_steps.clf.feature_importances_` in case of
+        :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.
+        If `callable`, overrides the default feature importance getter.
+        The callable is passed with the fitted estimator and it should
+        return importance for each feature.
+        .. versionadded:: 0.24
+    Attributes
+    ----------
+    grid_scores_ : ndarray of shape (n_subsets_of_features,)
+        The cross-validation scores such that
+        ``grid_scores_[i]`` corresponds to
+        the CV score of the i-th subset of features.
+        .. deprecated:: 1.0
+            The `grid_scores_` attribute is deprecated in version 1.0 in favor
+            of `cv_results_` and will be removed in version 1.2.
+    cv_results_ : dict of ndarrays
+        A dict with keys:
+        split(k)_test_score : ndarray of shape (n_subsets_of_features,)
+            The cross-validation scores across the (k)th fold.
+        mean_test_score : ndarray of shape (n_subsets_of_features,)
+            Mean of scores over the folds.
+        std_test_score : ndarray of shape (n_subsets_of_features,)
+            Standard deviation of scores over the folds.
+        .. versionadded:: 1.0
+    cv_ranking_ : dict of ndarrays
+        The per-fold feature rankings (keys ``split(k)_test_score``) together
+        with their mean (key ``mean_feature_ranking``).
+    top_features : dict of ndarrays
+        The mean feature ranking over the folds, under the key
+        ``mean_feature_ranking``.
+    scores : ndarray of shape (n_splits, n_subsets_of_features)
+        The per-fold validation scores, ordered from the full feature set
+        down to ``min_features_to_select`` features.
+    n_features_in_ : int
+        Number of features seen during :term:`fit`. Only defined if the
+        underlying estimator exposes such an attribute when fit.
+        .. versionadded:: 0.24
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+        .. versionadded:: 1.0
+    ranking_ : ndarray of shape (n_splits, n_features)
+        The per-fold feature rankings, such that ``ranking_[k, i]``
+        corresponds to the ranking position of the i-th feature in fold k.
+        Selected (i.e., estimated best) features are assigned rank 1.
+    See Also
+    --------
+    RFE : Recursive feature elimination.
+    Notes
+    -----
+    Unlike scikit-learn's RFECV, this variant does not re-fit a final
+    estimator on a selected feature subset: ``fit`` populates ``cv_results_``,
+    ``cv_ranking_``, ``top_features``, ``ranking_`` and ``scores``, while
+    ``estimator_``, ``support_``, ``n_features_`` and ``classes_`` are not
+    set.
+    ``grid_scores_`` contains one entry per evaluated subset of features,
+    i.e. the number of elimination rounds plus one.
+    Allows NaN/Inf in the input if the underlying estimator does as well.
+    References
+    ----------
+    .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection
+           for cancer classification using support vector machines",
+           Mach. Learn., 46(1-3), 389--422, 2002.
+    Examples
+    --------
+    The following example shows how to score feature subsets of the
+    Friedman #1 dataset across user-supplied folds.
+    >>> from sklearn.datasets import make_friedman1
+    >>> from sklearn.model_selection import KFold
+    >>> from sklearn.svm import SVR
+    >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
+    >>> splits = list(KFold(n_splits=5).split(X, y))
+    >>> train_idx_list = [train for train, _ in splits]
+    >>> val_idx_list = [val for _, val in splits]
+    >>> selector = RFECV(SVR(kernel="linear"), step=0.5, min_features_to_select=1)
+    >>> selector = selector.fit(X, y, train_idx_list, val_idx_list)
+    >>> selector.ranking_.shape
+    (5, 10)
+    """
+
+    def __init__(
+        self,
+        estimator: SVC,
+        *,
+        step=1,
+        min_features_to_select=1,
+        cv=None,
+        scoring=None,
+        verbose=0,
+        n_jobs=None,
+        importance_getter="auto",
+    ) -> None:
+        self.estimator = estimator
+        self.step = step
+        self.importance_getter = importance_getter
+        self.cv = cv
+        self.scoring = scoring
+        self.verbose = verbose
+        self.n_jobs = n_jobs
+        self.min_features_to_select = min_features_to_select
+
+    def fit(
+        self,
+        X: ndarray,
+        y: ndarray,
+        train_idx_list: List[List[int]],
+        val_idx_list: List[List[int]],
+        groups=None,
+        sample_weight_list: Optional[List[List[float]]] = None,
+    ) -> "RFECV":
+        """Fit the RFE model and score each feature-subset size across the given folds.
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training vector, where `n_samples` is the number of samples and
+            `n_features` is the total number of features.
+        y : array-like of shape (n_samples,)
+            Target values (integers for classification, real numbers for
+            regression).
+        train_idx_list : list of array-like of int
+            Training-sample indices for each fold.
+        val_idx_list : list of array-like of int
+            Validation-sample indices for each fold, aligned with
+            ``train_idx_list``.
+        groups : array-like of shape (n_samples,) or None, default=None
+            Group labels for the samples. Retained for API compatibility;
+            currently unused because the folds are supplied explicitly.
+        sample_weight_list : list of array-like of float or None, default=None
+            Per-fold sample weights forwarded to the estimator's ``fit``;
+            if None, all samples are weighted equally.
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        tags = self._get_tags()
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse="csr",
+            ensure_min_features=2,
+            force_all_finite=not tags.get("allow_nan", True),
+            multi_output=True,
+        )
+
+        # Initialization
+        # cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
+        scorer = check_scoring(self.estimator, scoring=self.scoring)
+        n_features = X.shape[1]
+
+        # if 0.0 < self.step < 1.0:
+        #     step = int(max(1, self.step * n_features))
+        # else:
+        #     step = int(self.step)
+        # if step <= 0:
+        #     raise ValueError("Step must be >0")
+
+        # Build an RFE object, which will evaluate and score each possible
+        # feature count, down to self.min_features_to_select
+        rfe = RFE(
+            estimator=self.estimator,
+            n_features_to_select=self.min_features_to_select,
+            importance_getter=self.importance_getter,
+            step=self.step,
+            verbose=self.verbose,
+        )
+
+        # Score each candidate number of features across the folds. Unlike
+        # stock scikit-learn, the best count is not chosen and re-fitted
+        # here; see the commented-out block further down.
+
+        # Note that joblib raises a non-picklable error for bound methods
+        # even if n_jobs is set to 1 with the default multiprocessing
+        # backend.
+        # This branching is done so as to make sure that user code that
+        # sets n_jobs to 1 and provides bound methods as scorers is not
+        # broken with the addition of the n_jobs parameter in version 0.18.
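+
+        # Each parallel task below returns a (scores_, ranking_) pair for one
+        # fold: scores_ holds one validation score per elimination round
+        # (plus the final subset) and ranking_ one rank per feature.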
+
+        if effective_n_jobs(self.n_jobs) == 1:
+            parallel, func = list, _rfe_single_fit
+        else:
+            parallel = Parallel(n_jobs=self.n_jobs)
+            func = delayed(_rfe_single_fit)
+
+        # Without per-fold weights, pair every fold with a weight of None so
+        # that the zip below still works.
+        if sample_weight_list is None:
+            sample_weight_list = [None] * len(train_idx_list)
+
+        res = parallel(
+            func(rfe, self.estimator, X, y, train_idx, val_idx, scorer, sample_weight)
+            for train_idx, val_idx, sample_weight in zip(
+                train_idx_list, val_idx_list, sample_weight_list
+            )
+        )
+
+        # scores = _rfe_single_fit(rfe, self.estimator, X, y, X_ts, y_ts, scorer)
+
+        scores = []
+        ranking_ = []
+        for fold_scores, fold_ranking in res:
+            scores.append(fold_scores)
+            ranking_.append(fold_ranking)
+
+        scores = np.array(scores)
+        ranking_ = np.array(ranking_)
+
+        # scores_sum = np.sum(scores, axis=0)
+        # scores_sum_rev = scores_sum[::-1]
+        # argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1  # index of the maximum value
+        # n_features_to_select = max(
+        #     n_features - (argmax_idx * step), self.min_features_to_select
+        # )
+
+        # # Re-execute an elimination with best_k over the whole set
+        # rfe = RFE(
+        #     estimator=self.estimator,
+        #     n_features_to_select=n_features_to_select,
+        #     step=self.step,
+        #     importance_getter=self.importance_getter,
+        #     verbose=self.verbose,
+        # )
+
+        # rfe.fit(X, y)
+
+        # # Set final attributes
+        # self.support_ = rfe.support_
+        # self.n_features_ = rfe.n_features_
+        # self.ranking_ = rfe.ranking_
+        # self.estimator_ = clone(self.estimator)
+        # self.estimator_.fit(self.transform(X), y)
+
+        # Reverse so that, as before, index i corresponds to the i-th
+        # smallest feature subset rather than the i-th elimination round
+        scores_rev = scores[:, ::-1]
+        self.cv_results_ = {}
+        self.cv_ranking_ = {}
+        self.cv_results_["mean_test_score"] = np.mean(scores_rev, axis=0)
+        self.cv_results_["std_test_score"] = np.std(scores_rev, axis=0)
+        self.cv_ranking_["mean_feature_ranking"] = np.mean(ranking_, axis=0)
+
+        self.top_features = {}
+        self.top_features["mean_feature_ranking"] = np.mean(ranking_, axis=0)
+
+        for i in range(scores.shape[0]):
+            self.cv_results_[f"split{i}_test_score"] = scores_rev[i]
+            self.cv_ranking_[f"split{i}_test_score"] = ranking_[i]
+
+        self.ranking_ = ranking_
+        self.scores = scores
+
+        return self
+
+    # TODO: Remove in v1.2 when grid_scores_ is removed
+    # mypy error: Decorated property not supported
+    @deprecated(  # type: ignore
+        "The `grid_scores_` attribute is deprecated in version 1.0 in favor "
+        "of `cv_results_` and will be removed in version 1.2."
+    )
+    @property
+    def grid_scores_(self):
+        # Subtract 2 for the mean_test_score and std_test_score keys
+        grid_size = len(self.cv_results_) - 2
+        return np.asarray(
+            [self.cv_results_[f"split{i}_test_score"] for i in range(grid_size)]
+        ).T
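+
+
+if __name__ == "__main__":
+    # A minimal end-to-end sketch on synthetic data (not part of the scpanel
+    # API): build explicit train/validation folds and run this RFECV variant.
+    from sklearn.datasets import make_classification
+    from sklearn.model_selection import StratifiedKFold
+
+    X_demo, y_demo = make_classification(
+        n_samples=60, n_features=12, n_informative=4, random_state=0
+    )
+    demo_splits = list(StratifiedKFold(n_splits=3).split(X_demo, y_demo))
+    demo_train_idx = [train for train, _ in demo_splits]
+    demo_val_idx = [val for _, val in demo_splits]
+
+    demo_selector = RFECV(SVC(kernel="linear"), step=0.2, min_features_to_select=2)
+    demo_selector.fit(X_demo, y_demo, demo_train_idx, demo_val_idx)
+    print(demo_selector.cv_results_["mean_test_score"])
+    print(demo_selector.cv_ranking_["mean_feature_ranking"])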