Switch to unified view

a b/python/feature_selection.py
1
from sklearn.feature_selection import SelectFromModel
2
from sklearn.linear_model import LassoCV
3
from sklearn.feature_selection import SelectPercentile, f_classif
4
from sklearn.feature_selection import SelectKBest
5
from sklearn.feature_selection import chi2
6
7
import numpy as np
8
# https://machinelearningmastery.com/feature-selection-machine-learning-python/
9
10
def run_feature_selection(features, labels, feature_selection, best_features):
    """Select a subset of columns from ``features`` using the chosen strategy.

    Reference: https://machinelearningmastery.com/feature-selection-machine-learning-python/

    Parameters
    ----------
    features : ndarray of shape (n_samples, n_features)
        Input feature matrix.
    labels : ndarray of shape (n_samples,)
        Targets used by the univariate scoring functions / the Lasso model.
    feature_selection : str
        Strategy name: 'select_K_Best', 'LassoCV' or 'slct_percentile'.
    best_features : int
        Number of top-scoring columns to keep (univariate strategies).
        The 'LassoCV' strategy instead keeps every feature whose
        coefficient survives the SelectFromModel threshold.

    Returns
    -------
    features_selected : ndarray of shape (n_samples, n_selected)
        The selected columns of ``features``.
    features_index_sorted : ndarray
        Column indices ranked best-first (univariate strategies), or the
        indices of the features retained by the model ('LassoCV').

    Raises
    ------
    ValueError
        If ``feature_selection`` is not a recognised strategy name.
    """
    if feature_selection == 'select_K_Best':
        # f_classif handles any real-valued features; chi2 would require
        # non-negative features. k is parameterized (was hard-coded to 4;
        # scores_ do not depend on k, so this is behavior-safe).
        selector = SelectKBest(score_func=f_classif, k=best_features)
        selector.fit(features, labels)
        # Rank all columns by ANOVA F-score, best first, and keep the top ones.
        features_index_sorted = np.argsort(-selector.scores_)
        features_selected = features[:, features_index_sorted[:best_features]]

    elif feature_selection == 'LassoCV':
        # SelectFromModel + LassoCV: the L1 penalty promotes sparsity, so
        # uninformative coefficients are driven to (near) zero and only
        # features with importance >= threshold are kept.
        clf = LassoCV()
        sfm = SelectFromModel(clf, threshold=0.95)
        sfm.fit(features, labels)
        # BUG FIX: the original stored `sfm.transform(features).shape[1]`
        # (an int) instead of the transformed matrix, which made the
        # `.shape` access below fail; it also never defined
        # features_index_sorted for this branch (NameError at return).
        features_selected = sfm.transform(features)
        features_index_sorted = np.where(sfm.get_support())[0]

    elif feature_selection == 'slct_percentile':
        # Univariate selection: fit only to obtain per-feature scores.
        # The percentile value does not affect the final cut here —
        # we re-rank by score and keep the top `best_features` columns.
        selector = SelectPercentile(f_classif, percentile=10)
        selector.fit(features, labels)
        features_index_sorted = np.argsort(-selector.scores_)
        features_selected = features[:, features_index_sorted[:best_features]]

    else:
        # Fail fast with a clear message instead of a NameError at the
        # return statement below.
        raise ValueError(
            "Unknown feature_selection strategy: " + repr(feature_selection)
        )

    print("Selected only " + str(features_selected.shape) + " features ")

    return features_selected, features_index_sorted