T-CARER / Git / [973ab6] /Stats/FeatureSelection.py

Models:
RaymondKing/
T-CARER
Downloads: 1
[973ab6]: / Stats / FeatureSelection.py
History
Download this file
266 lines (241 with data), 13.3 kB

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright 2017 University of Westminster. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""It is an interface for ranking features importance.
"""

from typing import List, TypeVar, Any
from sklearn import ensemble
from sklearn import feature_selection
from sklearn import tree
from sklearn import svm
from sklearn.svm import SVR
from sklearn.linear_model import RandomizedLogisticRegression
import logging
from Configs.CONSTANTS import CONSTANTS

# Annotation-only placeholder used for the ``features_indep_df`` parameters in
# this module. pandas itself is not imported here; presumably this stands in
# for ``pandas.DataFrame`` (the code only relies on ``.shape[1]``) -- confirm.
# NOTE(review): the TypeVar is named 'DataFrame', not 'PandasDataFrame'; static
# type checkers usually expect the two names to match.
PandasDataFrame = TypeVar('DataFrame')

# Module metadata.
# NOTE(review): the file header above states the Apache License 2.0 and a 2017
# University of Westminster copyright, whereas the values below say "GPL" and
# 2016 -- confirm which is authoritative.
__author__ = "Mohsen Mesgarpour"
__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
__credits__ = ["Mohsen Mesgarpour"]
__license__ = "GPL"
__version__ = "1.1"
__maintainer__ = "Mohsen Mesgarpour"
__email__ = "mohsen.mesgarpour@gmail.com"
__status__ = "Release"


class FeatureSelection:
    """Thin wrappers around scikit-learn estimators used to rank or select features.

    Every public method builds one scikit-learn estimator/selector, fits it on the
    supplied independent features and target, and returns the fitted object. The
    caller reads the ranking from the fitted object's attributes (listed in each
    method's docstring).
    """

    def __init__(self):
        """Initialise the objects and constants.

        Fetches the application logger (named by ``CONSTANTS.app_name``) and logs
        this module's name at debug level.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)

    def rank_random_forest_breiman(self,
                                   features_indep_df: PandasDataFrame,
                                   feature_target: List,
                                   n_jobs: int=-1,
                                   **kwargs: Any) -> object:
        """Use Breiman's Random Forest Classifier to rank features.
        Attributes:
        model.estimators_
        model.classes_
        model.n_classes_
        model.n_features_
        model.n_outputs_
        model.feature_importances_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param n_jobs: number of CPUs to use. If '-1', use all the CPUs.
        :param kwargs: forwarded unchanged to ``ensemble.RandomForestClassifier``, e.g.
        n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True,
        oob_score=False, random_state=None, verbose=0, warm_start=False, class_weight=None
        :return: the fitted importance ranking model.
        """
        self.__logger.debug("Run Random Forest Classifier (Brieman).")
        classifier = ensemble.RandomForestClassifier(n_jobs=n_jobs, **kwargs)
        return classifier.fit(features_indep_df, feature_target)

    def rank_random_logistic_regression(self,
                                        features_indep_df: PandasDataFrame,
                                        feature_target: List,
                                        n_jobs: int=-1,
                                        **kwargs: Any) -> object:
        """Use Randomized Logistic Regression to rank features.
        Attributes:
        model.scores_
        model.all_scores_

        NOTE: ``RandomizedLogisticRegression`` was deprecated in scikit-learn 0.19 and removed in 0.21,
        so this method (and the module-level import it depends on) needs an older scikit-learn.

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param n_jobs: number of CPUs to use during the resampling. If '-1', use all the CPUs.
        :param kwargs: forwarded unchanged to ``RandomizedLogisticRegression``, e.g.
        C=1, scaling=0.5, sample_fraction=0.75, n_resampling=200, selection_threshold=0.25, tol=0.001,
        fit_intercept=True, verbose=False, normalize=True, random_state=None, pre_dispatch='3*n_jobs'
        :return: the fitted importance ranking model.
        """
        self.__logger.debug("Run Random Logistic Regression.")
        classifier = RandomizedLogisticRegression(n_jobs=n_jobs, **kwargs)
        return classifier.fit(features_indep_df, feature_target)

    def rank_svm_c_support(self,
                           features_indep_df: PandasDataFrame,
                           feature_target: List,
                           **kwargs: Any) -> object:
        """Use C-Support Vector Classification (SVC) to rank features.
        The C parameter trades off misclassification of training examples against simplicity of the
        decision surface. The kernel is whatever is passed through ``kwargs``; no kernel is forced here.
        Attributes:
        model.support_
        model.support_vectors_
        model.n_support_
        model.dual_coef_
        model.coef_ (scikit-learn only provides it for kernel='linear')
        model.intercept_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param kwargs: forwarded unchanged to ``svm.SVC``, e.g.
        C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
        tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None,
        random_state=None
        :return: the fitted importance ranking model.
        """
        self.__logger.debug("Run C-Support Vector Classification.")
        classifier = svm.SVC(**kwargs)
        return classifier.fit(features_indep_df, feature_target)

    def rank_tree_brieman(self,
                          features_indep_df: PandasDataFrame,
                          feature_target: List,
                          **kwargs: Any) -> object:
        """Use Breiman's decision tree classifier to rank features.
        (The method name keeps its historical spelling so existing callers continue to work.)
        Attributes:
        model.classes_
        model.feature_importances_
        model.max_features_
        model.n_classes_
        model.n_features_
        model.n_outputs_
        model.tree_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param kwargs: forwarded unchanged to ``tree.DecisionTreeClassifier``, e.g.
        criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None,
        min_impurity_split=1e-07, class_weight=None, presort=False
        :return: the fitted importance ranking model.
        """
        self.__logger.debug("Run Decision Tree Classifier (Brieman).")
        classifier = tree.DecisionTreeClassifier(**kwargs)
        return classifier.fit(features_indep_df, feature_target)

    def rank_tree_gbrt(self,
                       features_indep_df: PandasDataFrame,
                       feature_target: List,
                       **kwargs: Any) -> object:
        """Use Gradient Boosted Regression Trees (GBRT) to rank features.
        Note that the underlying model is a regressor (``GradientBoostingRegressor``), not a classifier.
        Attributes:
        model.feature_importances_
        model.train_score_
        model.loss_
        model.init
        model.estimators_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param kwargs: forwarded unchanged to ``ensemble.GradientBoostingRegressor``, e.g.
        loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse',
        min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_split=1e-07,
        init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
        presort='auto'
        :return: the fitted importance ranking model.
        """
        self.__logger.debug("Run Gradient Boosted Regression Trees (GBRT).")
        regressor = ensemble.GradientBoostingRegressor(**kwargs)
        return regressor.fit(features_indep_df, feature_target)

    def selector_logistic_rfe(self,
                              features_indep_df: PandasDataFrame,
                              feature_target: List,
                              kernel: str="linear",
                              n_jobs: int=-1,
                              **kwargs: Any) -> object:
        """Select top features using recursive feature elimination with cross-validated selection of
        the best number of features (RFECV).
        Despite the method name, the wrapped estimator is a Support Vector Regressor (``SVR``).
        Attributes:
        model.n_features_
        model.support_
        model.ranking_
        model.grid_scores_
        model.estimator_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param kernel: the kernel type handed to ``SVR``; it defaults to 'linear'. RFECV ranks features
        through the estimator's ``coef_`` / ``feature_importances_``, and scikit-learn's SVR only exposes
        ``coef_`` for the linear kernel, so other kernels ('poly', 'rbf', 'sigmoid', 'precomputed' or a
        callable) are expected to fail during fitting.
        :param n_jobs: number of CPUs to use during the cross-validation. If '-1', use all the CPUs.
        :param kwargs: forwarded unchanged to ``feature_selection.RFECV``, e.g.
        step=1, cv=None, scoring=None, verbose=0
        :return: the fitted feature selection model.
        """
        self.__logger.debug("Run Feature Ranking with Recursive Feature Elimination.")
        estimator = SVR(kernel=kernel)
        selector = feature_selection.RFECV(estimator=estimator, n_jobs=n_jobs, **kwargs)
        return selector.fit(features_indep_df, feature_target)

    def selector_univarite_selection_kbest_chi2(self,
                                                features_indep_df: PandasDataFrame,
                                                feature_target: List,
                                                kbest: int) -> object:
        """Select features according to the k highest scores, using 'chi2':
        Chi-squared stats of non-negative features for classification tasks.
        Attributes:
        model.scores_
        model.pvalues_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param kbest: the fraction of the columns to keep (it is multiplied by the number of columns and
        truncated to an integer, so 0.5 keeps half of them), regardless of the ``int`` annotation.
        The string 'all' bypasses selection, for use in a parameter search.
        :return: the fitted feature selection model.
        """
        self.__logger.debug("Select features according to the k highest scores, using 'chi2'.")
        return self.__selector_univarite_selection_kbest(features_indep_df,
                                                         feature_target,
                                                         feature_selection.chi2,
                                                         kbest)

    def selector_univarite_selection_kbest_f_classif(self,
                                                     features_indep_df: PandasDataFrame,
                                                     feature_target: List,
                                                     kbest: int) -> object:
        """Select features according to the k highest scores, using 'f_classif':
        ANOVA F-value between label/feature for classification tasks.
        Attributes:
        model.scores_
        model.pvalues_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param kbest: the fraction of the columns to keep (it is multiplied by the number of columns and
        truncated to an integer, so 0.5 keeps half of them), regardless of the ``int`` annotation.
        The string 'all' bypasses selection, for use in a parameter search.
        :return: the fitted feature selection model.
        """
        self.__logger.debug("Select features according to the k highest scores, using 'f_classif'.")
        return self.__selector_univarite_selection_kbest(features_indep_df,
                                                         feature_target,
                                                         feature_selection.f_classif,
                                                         kbest)

    def __selector_univarite_selection_kbest(self,
                                             features_indep_df: PandasDataFrame,
                                             feature_target: List,
                                             score_func: Any,
                                             kbest: int) -> object:
        """Select features according to the k highest scores.
        Attributes:
        model.scores_
        model.pvalues_

        :param features_indep_df: the independent features, which are inputted into the model.
        It must expose ``shape[1]`` (the number of columns).
        :param feature_target: the target feature, which is being estimated.
        :param score_func: Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or
        a single array with scores.
        :param kbest: the fraction of the columns to keep; ``k = int(float(kbest) * number_of_columns)``.
        The string 'all' is passed to ``SelectKBest`` untouched and bypasses selection.
        :return: the fitted feature selection model.
        """
        self.__logger.debug("Run Univariate Feature Selection with Configurable Strategy.")
        if isinstance(kbest, str) and kbest == "all":
            # 'all' is a sentinel understood by SelectKBest itself; converting it with float()
            # (as the numeric path below does) would raise ValueError.
            k = kbest
        else:
            # Numeric input (including numeric strings) is a fraction of the available columns.
            k = int(float(kbest) * features_indep_df.shape[1])
        selector = feature_selection.SelectKBest(score_func=score_func, k=k)
        return selector.fit(features_indep_df, feature_target)