T-CARER / Git / Diff of /Stats/FeatureSelection.py

Models:
RaymondKing/
T-CARER
Downloads: 1
Diff of /Stats/FeatureSelection.py [000000] .. [b4a150]
Switch to side-by-side view

--- a
+++ b/Stats/FeatureSelection.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+#
+# Copyright 2017 University of Westminster. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""It is an interface for ranking features importance.
+"""
+
+from typing import List, TypeVar, Any
+from sklearn import ensemble
+from sklearn import feature_selection
+from sklearn import tree
+from sklearn import svm
+from sklearn.svm import SVR
+from sklearn.linear_model import RandomizedLogisticRegression
+import logging
+from Configs.CONSTANTS import CONSTANTS
+
+PandasDataFrame = TypeVar('PandasDataFrame')  # placeholder type, name now matches the variable it is bound to
+
+__author__ = "Mohsen Mesgarpour"
+__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
+__credits__ = ["Mohsen Mesgarpour"]
+__license__ = "GPL"  # NOTE(review): the licence header of this file grants Apache-2.0 - confirm which applies
+__version__ = "1.1"
+__maintainer__ = "Mohsen Mesgarpour"
+__email__ = "mohsen.mesgarpour@gmail.com"
+__status__ = "Release"
+
+
+class FeatureSelection:
+    """Thin wrappers around scikit-learn estimators for ranking and selecting features."""
+
+    def __init__(self):
+        """Set up the application logger and log this module's name at debug level."""
+        self.__logger = logging.getLogger(CONSTANTS.app_name)
+        self.__logger.debug(__name__)
+
+    def rank_random_forest_breiman(self,
+                                   features_indep_df: PandasDataFrame,
+                                   feature_target: List,
+                                   n_jobs: int = -1,
+                                   **kwargs: Any) -> object:
+        """Use Breiman Random Forest Classifier to rank features.
+        Attributes:
+        model.estimators_
+        model.classes_
+        model.n_classes_
+        model.n_features_
+        model.n_outputs_
+        model.feature_importances_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param n_jobs: number of jobs to run in parallel. If '-1', use all the CPUs.
+        :param kwargs: n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,
+        min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True,
+        oob_score=False, random_state=None, verbose=0, warm_start=False, class_weight=None
+        :return: the fitted importance ranking model.
+        """
+        self.__logger.debug("Run Random Forest Classifier (Brieman).")
+        classifier = ensemble.RandomForestClassifier(n_jobs=n_jobs, **kwargs)
+        return classifier.fit(features_indep_df, feature_target)
+
+    def rank_random_logistic_regression(self,
+                                        features_indep_df: PandasDataFrame,
+                                        feature_target: List,
+                                        n_jobs: int = -1,
+                                        **kwargs: Any) -> object:
+        """Use Randomized Logistic Regression to rank features.
+        NOTE: RandomizedLogisticRegression was deprecated in scikit-learn 0.19 and removed in 0.21.
+        Attributes:
+        model.scores_
+        model.all_scores_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param n_jobs: number of CPUs to use during the resampling. If '-1', use all the CPUs.
+        :param kwargs: C=1, scaling=0.5, sample_fraction=0.75, n_resampling=200, selection_threshold=0.25, tol=0.001,
+        fit_intercept=True, verbose=False, normalize=True, random_state=None, pre_dispatch='3*n_jobs'
+        :return: the fitted importance ranking model.
+        """
+        self.__logger.debug("Run Random Logistic Regression.")
+        classifier = RandomizedLogisticRegression(n_jobs=n_jobs, **kwargs)
+        return classifier.fit(features_indep_df, feature_target)
+
+    def rank_svm_c_support(self,
+                           features_indep_df: PandasDataFrame,
+                           feature_target: List,
+                           **kwargs: Any) -> object:
+        """Use C-Support Vector Classification (SVC); the C parameter trades off misclassification of training
+        examples against simplicity of the decision surface. The kernel defaults to 'rbf' (see kwargs below), in
+        which case model.coef_ is unavailable; pass kernel='linear' to obtain per-feature weights.
+        Attributes:
+        model.support_
+        model.support_vectors_
+        model.n_support_
+        model.dual_coef_
+        model.coef_
+        model.intercept_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param kwargs: C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
+        tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None,
+        random_state=None
+        :return: the fitted importance ranking model.
+        """
+        self.__logger.debug("Run C-Support Vector Classification.")
+        classifier = svm.SVC(**kwargs)
+        return classifier.fit(features_indep_df, feature_target)
+
+    def rank_tree_brieman(self,
+                          features_indep_df: PandasDataFrame,
+                          feature_target: List,
+                          **kwargs: Any) -> object:
+        """Use Breiman decision tree classifier to rank features.
+        Attributes:
+        model.classes_
+        model.feature_importances_
+        model.max_features_
+        model.n_classes_
+        model.n_features_
+        model.n_outputs_
+        model.tree_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param kwargs: criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
+        min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None,
+        min_impurity_split=1e-07, class_weight=None, presort=False
+        :return: the fitted importance ranking model.
+        """
+        self.__logger.debug("Run Decision Tree Classifier (Brieman).")
+        classifier = tree.DecisionTreeClassifier(**kwargs)
+        return classifier.fit(features_indep_df, feature_target)
+
+    def rank_tree_gbrt(self,
+                       features_indep_df: PandasDataFrame,
+                       feature_target: List,
+                       **kwargs: Any) -> object:
+        """Use Gradient Boosted Regression Trees (GBRT), i.e. a GradientBoostingRegressor, to rank features.
+        Attributes:
+        model.feature_importances_
+        model.train_score_
+        model.loss_
+        model.init
+        model.estimators_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param kwargs: loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse',
+        min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_split=1e-07,
+        init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
+        presort='auto'
+        :return: the fitted importance ranking model.
+        """
+        self.__logger.debug("Run Gradient Boosted Regression Trees (GBRT).")
+        regressor = ensemble.GradientBoostingRegressor(**kwargs)
+        return regressor.fit(features_indep_df, feature_target)
+
+    def selector_logistic_rfe(self,
+                              features_indep_df: PandasDataFrame,
+                              feature_target: List,
+                              kernel: str = "linear",
+                              n_jobs: int = -1,
+                              **kwargs: Any) -> object:
+        """Select top features using recursive feature elimination with cross-validated selection of the best
+        number of features (RFECV). Despite the method name, the wrapped estimator is a support vector regressor.
+        Attributes:
+        model.n_features_
+        model.support_
+        model.ranking_
+        model.grid_scores_
+        model.estimator_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param kernel: kernel type passed to SVR (default 'linear'). RFECV ranks features from the estimator's
+        coefficients, which SVR only exposes for the 'linear' kernel, so other kernels are not expected to work.
+        :param n_jobs: number of CPUs to use during the cross-validation. If '-1', use all the CPUs.
+        :param kwargs: step=1, cv=None, scoring=None, verbose=0
+        :return: the fitted feature selection model.
+        """
+        self.__logger.debug("Run Feature Ranking with Recursive Feature Elimination.")
+        estimator = SVR(kernel=kernel)
+        selector = feature_selection.RFECV(estimator=estimator, n_jobs=n_jobs, **kwargs)
+        return selector.fit(features_indep_df, feature_target)
+
+    def selector_univarite_selection_kbest_chi2(self,
+                                                features_indep_df: PandasDataFrame,
+                                                feature_target: List,
+                                                kbest: float) -> object:
+        """Select features according to the k highest scores, using 'chi2':
+        Chi-squared stats of non-negative features for classification tasks.
+        Attributes:
+        model.scores_
+        model.pvalues_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param kbest: fraction of the feature columns to keep (not a count): it is multiplied by the number of
+        columns and truncated to an integer before being passed to SelectKBest. Use 1.0 to keep every feature.
+        :return: the fitted feature selection model.
+        """
+        self.__logger.debug("Select features according to the k highest scores, using 'chi2'.")
+        return self.__selector_univarite_selection_kbest(features_indep_df, feature_target,
+                                                         feature_selection.chi2, kbest)
+
+    def selector_univarite_selection_kbest_f_classif(self,
+                                                     features_indep_df: PandasDataFrame,
+                                                     feature_target: List,
+                                                     kbest: float) -> object:
+        """Select features according to the k highest scores, using 'f_classif':
+        ANOVA F-value between label/feature for classification tasks.
+        Attributes:
+        model.scores_
+        model.pvalues_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param kbest: fraction of the feature columns to keep (not a count): it is multiplied by the number of
+        columns and truncated to an integer before being passed to SelectKBest. Use 1.0 to keep every feature.
+        :return: the fitted feature selection model.
+        """
+        self.__logger.debug("Select features according to the k highest scores, using 'f_classif'.")
+        return self.__selector_univarite_selection_kbest(features_indep_df, feature_target,
+                                                         feature_selection.f_classif, kbest)
+
+    def __selector_univarite_selection_kbest(self,
+                                             features_indep_df: PandasDataFrame,
+                                             feature_target: List,
+                                             score_func: Any,
+                                             kbest: float) -> object:
+        """Select features according to the k highest scores.
+        Attributes:
+        model.scores_
+        model.pvalues_
+
+        :param features_indep_df: the independent features, which are inputted into the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param score_func: Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or
+        a single array with scores.
+        :param kbest: fraction of the feature columns to keep (not a count): it is multiplied by the number of
+        columns and truncated to an integer before being passed to SelectKBest. Use 1.0 to keep every feature.
+        :return: the fitted feature selection model.
+        """
+        self.__logger.debug("Run Univariate Feature Selection with Configurable Strategy.")
+        # Round off float noise before truncating, otherwise e.g. 0.29 * 100 == 28.999... would yield 28, not 29.
+        kbest = int(round(float(kbest) * features_indep_df.shape[1], 9))
+        selector = feature_selection.SelectKBest(score_func=score_func, k=kbest)
+        return selector.fit(features_indep_df, feature_target)
+