Diff of /intelligenes/selection.py [000000] .. [0a2626]

Switch to unified view

a b/intelligenes/selection.py
1
# (Packages/Libraries) Matrix Manipulation
2
import pandas as pd 
3
4
# (Packages/Libraries) Statistical Analysis & Machine Learning
5
from sklearn.model_selection import train_test_split
6
from sklearn.preprocessing import MinMaxScaler
7
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE
8
from sklearn.tree import DecisionTreeClassifier
9
from scipy.stats import pearsonr
10
11
# (Packages/Libraries) Miscellaneous
12
import argparse
13
import warnings
14
from sklearn.exceptions import ConvergenceWarning
15
import os
16
from datetime import datetime
17
from pathlib import Path
18
19
class FeatureSelection:
    """Select features from a CSV dataset with up to four independent selectors.

    The input file is read with pandas and must contain a ``Type`` column
    (used as the target) and an ``ID`` column; every other column is treated
    as a feature. The data is split into train/test partitions and each
    enabled selector is fitted on the training partition only:

    * Recursive Feature Elimination (decision-tree estimator),
    * Pearson's correlation against the target,
    * Chi-square test,
    * ANOVA F-test.

    ``selected_attributes`` returns the features retained by *every* enabled
    selector (inner join of the individual results).
    """

    # Significance level applied to the Pearson, chi-square and ANOVA p-values.
    P_VALUE_THRESHOLD = 0.05
    # Fraction of the feature count used as the upper bound on kept RFE rankings.
    RFE_TOP_FRACTION = 0.10

    def __init__(self: 'FeatureSelection', cgit_file: str, output_dir: str, random_state: int = 42, test_size: float = 0.3, use_rfe = True, use_pearson = True, use_chi2 = True, use_anova = True, use_normalization = False):
        """Load the dataset, optionally normalize it, and create the train/test split.

        Args:
            cgit_file: Path to the input CSV (needs ``Type`` and ``ID`` columns).
            output_dir: Output directory; stored on the instance but not used
                by this class itself.
            random_state: Seed for the train/test split and the RFE estimator.
            test_size: Value forwarded to ``train_test_split(test_size=...)``.
            use_rfe: Enable Recursive Feature Elimination.
            use_pearson: Enable Pearson's correlation.
            use_chi2: Enable the chi-square test.
            use_anova: Enable the ANOVA F-test.
            use_normalization: Min-max scale all features before splitting.
        """
        self.cgit_file = cgit_file
        self.output_dir = output_dir
        self.random_state = random_state
        self.test_size = test_size
        self.use_rfe = use_rfe
        self.use_pearson = use_pearson
        self.use_chi2 = use_chi2
        self.use_anova = use_anova
        self.use_normalization = use_normalization

        self.df = pd.read_csv(self.cgit_file)

        self.y = self.df['Type']
        self.X = self.df.drop(['Type', 'ID'], axis = 1)

        if self.use_normalization:
            # Keep the original index so X stays row-aligned with y.
            self.X = pd.DataFrame(MinMaxScaler().fit_transform(self.X),
                                  columns = self.X.columns,
                                  index = self.X.index)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size = self.test_size, random_state = self.random_state)

        # Filled by execute_selectors(); one DataFrame per enabled selector.
        self.selectors = []

    def rfe_selector(self: 'FeatureSelection'):
        """Rank features with RFE and keep the best-ranked ones.

        Returns:
            DataFrame with columns ``attributes`` and ``rfe_rankings`` sorted
            by ranking, limited to rankings no greater than 10% of the feature
            count (at least one feature is always kept), or ``None`` when RFE
            is disabled.
        """
        if not self.use_rfe:
            return None

        print("Recursive Feature Elimination...")
        # n_features_to_select = 1 makes RFE eliminate down to a single
        # feature, which yields a ranking for every column.
        rfe_selection = RFE(estimator = DecisionTreeClassifier(random_state = self.random_state),
                            n_features_to_select = 1).fit(self.X_train, self.y_train)
        rfe_df = pd.DataFrame({'attributes': self.X_train.columns,
                               'rfe_rankings': rfe_selection.ranking_})

        # With fewer than 10 features the 10% cutoff truncates to 0, which
        # would discard everything; always keep at least the top-ranked one.
        cutoff = max(1, int(self.X_train.shape[1] * self.RFE_TOP_FRACTION))
        kept = rfe_df[rfe_df['rfe_rankings'] <= cutoff]
        return kept.sort_values(by = 'rfe_rankings')

    def pearson_selector(self: 'FeatureSelection'):
        """Keep features whose Pearson correlation with the target is significant.

        Returns:
            DataFrame with columns ``attributes`` and ``pearson_p-value`` for
            features with p-value below 0.05, or ``None`` when disabled.
        """
        if not self.use_pearson:
            return None

        print("Pearson's Correlation...")
        columns = self.X_train.columns
        # pearsonr returns (statistic, p-value); only the p-value is used.
        p_values = [pearsonr(self.X_train[column], self.y_train)[1] for column in columns]
        pearson_df = pd.DataFrame({'attributes': columns,
                                   'pearson_p-value': p_values})

        return pearson_df[pearson_df['pearson_p-value'] < self.P_VALUE_THRESHOLD]

    def _univariate_selector(self: 'FeatureSelection', score_func, pvalue_column: str):
        """Fit a univariate test on the training data and keep significant features."""
        # Only the per-feature p-values are consumed, so k = 'all' is used:
        # it yields the same p-values as any k while remaining valid for
        # datasets with fewer than 10 features.
        selection = SelectKBest(score_func = score_func, k = 'all').fit(self.X_train, self.y_train)
        scores = pd.DataFrame({'attributes': self.X_train.columns,
                               pvalue_column: selection.pvalues_})
        return scores[scores[pvalue_column] < self.P_VALUE_THRESHOLD]

    def chi2_selector(self: 'FeatureSelection'):
        """Keep features with a significant chi-square statistic.

        NOTE: scikit-learn's ``chi2`` is documented as requiring non-negative
        feature values; ``use_normalization`` guarantees that.

        Returns:
            DataFrame with columns ``attributes`` and ``chi2_p-value`` for
            features with p-value below 0.05, or ``None`` when disabled.
        """
        if not self.use_chi2:
            return None

        print("Chi-Square Test...")
        return self._univariate_selector(chi2, 'chi2_p-value')

    def anova_selector(self: 'FeatureSelection'):
        """Keep features with a significant ANOVA F-test.

        Returns:
            DataFrame with columns ``attributes`` and ``anova_p-value`` for
            features with p-value below 0.05, or ``None`` when disabled.
        """
        if not self.use_anova:
            return None

        print("ANOVA...")
        return self._univariate_selector(f_classif, 'anova_p-value')

    def execute_selectors(self: 'FeatureSelection'):
        """Run every enabled selector and store the results in ``self.selectors``."""
        results = [self.rfe_selector(),
                   self.pearson_selector(),
                   self.chi2_selector(),
                   self.anova_selector()]

        # Disabled selectors return None and are skipped.
        self.selectors = [df for df in results if df is not None]

    def selected_attributes(self: 'FeatureSelection'):
        """Combine the selector results into one table.

        Returns:
            DataFrame with a ``Features`` column plus one score column per
            executed selector, restricted to features kept by all of them.
            If no selector results are stored, every feature is returned.
        """
        selected_attributes = pd.DataFrame({'attributes': self.X_train.columns})
        # Inner joins keep only the features present in every selector result,
        # so no missing values can be introduced here.
        for df in self.selectors:
            selected_attributes = selected_attributes.merge(df, how = 'inner', on = 'attributes')

        return selected_attributes.rename(columns = {
            'attributes': 'Features',
            'rfe_rankings': 'RFE Rankings',
            'pearson_p-value': "Pearson's Correlation (p-value)",
            'chi2_p-value': 'Chi-Square Test (p-value)',
            'anova_p-value': 'ANOVA (p-value)'
        })
115
    
116
def main():
    """Command-line entry point: run feature selection and write the result CSV.

    Parses the command-line options, builds a ``FeatureSelection`` pipeline,
    executes the enabled selectors and writes the selected features to
    ``<output_dir>/<input stem>_<timestamp>_Selected-Features.csv``.
    """
    print("\n")
    print("IntelliGenes Feature Selection/Biomarker Location...")

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--cgit_file', required = True,
                        help = "input CSV file containing 'Type' and 'ID' columns")
    parser.add_argument('-o', '--output_dir', required = True,
                        help = "directory for the selected-features CSV (created if missing)")
    parser.add_argument('--random_state', type = int, default = 42,
                        help = "seed for the train/test split and the RFE estimator")
    parser.add_argument('--test_size', type = float, default = 0.3,
                        help = "test_size value passed to the train/test split")
    parser.add_argument('--no_rfe', action = 'store_true',
                        help = "disable Recursive Feature Elimination")
    parser.add_argument('--no_pearson', action = 'store_true',
                        help = "disable Pearson's correlation")
    parser.add_argument('--no_chi2', action = 'store_true',
                        help = "disable the chi-square test")
    parser.add_argument('--no_anova', action = 'store_true',
                        help = "disable the ANOVA F-test")
    parser.add_argument('--normalize', action = 'store_true',
                        help = "min-max normalize the features before selection")
    args = parser.parse_args()

    # The CLI exposes opt-out flags; the class takes opt-in booleans.
    pipeline = FeatureSelection(
        cgit_file = args.cgit_file,
        output_dir = args.output_dir,
        random_state = args.random_state,
        test_size = args.test_size,
        use_rfe = not args.no_rfe,
        use_pearson = not args.no_pearson,
        use_chi2 = not args.no_chi2,
        use_anova = not args.no_anova,
        use_normalization = args.normalize
    )

    pipeline.execute_selectors()
    features_df = pipeline.selected_attributes()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.output_dir, exist_ok = True)

    # The timestamp in the file name keeps repeated runs from overwriting each other.
    file_name = Path(args.cgit_file).stem
    features_name = f"{file_name}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_Selected-Features.csv"
    features_file = os.path.join(args.output_dir, features_name)

    features_df.to_csv(features_file, index = False)
    print("\n Selected Features:", features_file, "\n")
156
157
if __name__ == '__main__':
    # Run the command-line interface only when executed as a script, not on import.
    main()