|
a |
|
b/intelligenes/selection.py |
|
|
1 |
# (Packages/Libraries) Matrix Manipulation |
|
|
2 |
import pandas as pd |
|
|
3 |
|
|
|
4 |
# (Packages/Libraries) Statistical Analysis & Machine Learning |
|
|
5 |
from sklearn.model_selection import train_test_split |
|
|
6 |
from sklearn.preprocessing import MinMaxScaler |
|
|
7 |
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE |
|
|
8 |
from sklearn.tree import DecisionTreeClassifier |
|
|
9 |
from scipy.stats import pearsonr |
|
|
10 |
|
|
|
11 |
# (Packages/Libraries) Miscellaneous |
|
|
12 |
import argparse |
|
|
13 |
import warnings |
|
|
14 |
from sklearn.exceptions import ConvergenceWarning |
|
|
15 |
import os |
|
|
16 |
from datetime import datetime |
|
|
17 |
from pathlib import Path |
|
|
18 |
|
|
|
19 |
class FeatureSelection:
    """Filter the feature columns of a CSV dataset with up to four selectors.

    The input CSV is read with pandas and must contain a ``Type`` column
    (used as the target ``y``) and an ``ID`` column; every other column is
    treated as a feature. The data is split into train/test partitions and
    each enabled selector is fitted on the training partition only.

    Selectors (each can be switched off through its ``use_*`` flag):

    * Recursive Feature Elimination with a decision tree (``use_rfe``)
    * Pearson's correlation p-value (``use_pearson``)
    * Chi-square test p-value (``use_chi2``)
    * ANOVA F-test p-value (``use_anova``)

    ``selected_attributes`` returns the features kept by *all* selectors
    that were run.
    """

    # Significance cut-off applied by every p-value based selector.
    P_VALUE_THRESHOLD = 0.05
    # RFE keeps features whose rank is <= this fraction of the feature count.
    RFE_TOP_FRACTION = 0.10

    def __init__(self, cgit_file: str, output_dir: str,
                 random_state: int = 42, test_size: float = 0.3,
                 use_rfe: bool = True, use_pearson: bool = True,
                 use_chi2: bool = True, use_anova: bool = True,
                 use_normalization: bool = False):
        """Load the dataset, optionally normalise it, and split it.

        Args:
            cgit_file: Path of the CSV file to read.
            output_dir: Output directory; stored on the instance but not
                used by this class itself.
            random_state: Seed passed to ``train_test_split`` and to the
                decision tree used by RFE.
            test_size: ``test_size`` argument forwarded to
                ``train_test_split``.
            use_rfe: Run Recursive Feature Elimination.
            use_pearson: Run the Pearson's correlation selector.
            use_chi2: Run the chi-square selector.
            use_anova: Run the ANOVA selector.
            use_normalization: Rescale the features with ``MinMaxScaler``
                before splitting.

        Raises:
            KeyError: If the CSV has no ``Type`` or ``ID`` column.
        """
        self.cgit_file = cgit_file
        self.output_dir = output_dir
        self.random_state = random_state
        self.test_size = test_size
        self.use_rfe = use_rfe
        self.use_pearson = use_pearson
        self.use_chi2 = use_chi2
        self.use_anova = use_anova
        self.use_normalization = use_normalization

        self.df = pd.read_csv(self.cgit_file)

        self.y = self.df['Type']
        self.X = self.df.drop(['Type', 'ID'], axis=1)

        if self.use_normalization:
            # NOTE(review): the scaler is fitted on the full dataset, i.e.
            # before the train/test split below - confirm this is intended.
            self.X = pd.DataFrame(MinMaxScaler().fit_transform(self.X),
                                  columns=self.X.columns)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y,
            test_size=self.test_size,
            random_state=self.random_state,
        )

        # Filled by execute_selectors(): one DataFrame per selector that ran.
        self.selectors = []

    def rfe_selector(self) -> 'pd.DataFrame | None':
        """Rank features with RFE and keep the best-ranked fraction.

        Returns:
            A DataFrame with columns ``attributes`` and ``rfe_rankings``,
            sorted by rank and limited to ranks not greater than
            ``int(n_features * RFE_TOP_FRACTION)``, or ``None`` when
            ``use_rfe`` is false.
        """
        if not self.use_rfe:
            return None

        print("Recursive Feature Elimination...")
        estimator = DecisionTreeClassifier(random_state=self.random_state)
        # n_features_to_select = 1 forces RFE to eliminate down to a single
        # feature, so every feature receives a distinct rank.
        rfe_selection = RFE(estimator=estimator,
                            n_features_to_select=1).fit(self.X_train, self.y_train)
        rfe_df = pd.DataFrame({'attributes': self.X_train.columns,
                               'rfe_rankings': rfe_selection.ranking_})

        # NOTE(review): with fewer than 10 features the cut-off truncates to
        # 0 and no feature is kept - confirm this is the desired behaviour.
        cutoff = int(self.X_train.shape[1] * self.RFE_TOP_FRACTION)
        rfe_df = rfe_df.sort_values(by='rfe_rankings')
        return rfe_df[rfe_df['rfe_rankings'] <= cutoff]

    def pearson_selector(self) -> 'pd.DataFrame | None':
        """Keep features whose Pearson correlation with the target is significant.

        Returns:
            A DataFrame with columns ``attributes`` and ``pearson_p-value``
            containing the features whose p-value is below
            ``P_VALUE_THRESHOLD``, or ``None`` when ``use_pearson`` is false.
        """
        if not self.use_pearson:
            return None

        print("Pearson's Correlation...")
        # pearsonr returns (statistic, p-value); only the p-value is kept.
        p_values = [pearsonr(self.X_train[column], self.y_train)[1]
                    for column in self.X_train.columns]
        pearson_df = pd.DataFrame({'attributes': self.X_train.columns,
                                   'pearson_p-value': p_values})
        return pearson_df[pearson_df['pearson_p-value'] < self.P_VALUE_THRESHOLD]

    def _p_value_selector(self, score_func, column: str) -> 'pd.DataFrame':
        """Score every training feature with *score_func* and keep significant ones.

        ``k='all'`` is used because only ``pvalues_`` is read: the p-values
        do not depend on ``k``, and a fixed ``k`` is rejected or warned about
        when the dataset has fewer features than ``k``.
        """
        fitted = SelectKBest(score_func=score_func, k='all').fit(self.X_train, self.y_train)
        table = pd.DataFrame({'attributes': self.X_train.columns,
                              column: fitted.pvalues_})
        return table[table[column] < self.P_VALUE_THRESHOLD]

    def chi2_selector(self) -> 'pd.DataFrame | None':
        """Keep features with a significant chi-square test p-value.

        Returns:
            A DataFrame with columns ``attributes`` and ``chi2_p-value``,
            or ``None`` when ``use_chi2`` is false.
        """
        if not self.use_chi2:
            return None

        print("Chi-Square Test...")
        # NOTE(review): scikit-learn's chi2 expects non-negative features;
        # presumably callers enable normalisation when needed - confirm.
        return self._p_value_selector(chi2, 'chi2_p-value')

    def anova_selector(self) -> 'pd.DataFrame | None':
        """Keep features with a significant ANOVA F-test p-value.

        Returns:
            A DataFrame with columns ``attributes`` and ``anova_p-value``,
            or ``None`` when ``use_anova`` is false.
        """
        if not self.use_anova:
            return None

        print("ANOVA...")
        return self._p_value_selector(f_classif, 'anova_p-value')

    def execute_selectors(self) -> None:
        """Run every selector and store the non-``None`` results in ``self.selectors``."""
        results = [self.rfe_selector(),
                   self.pearson_selector(),
                   self.chi2_selector(),
                   self.anova_selector()]

        # Disabled selectors return None and are left out.
        self.selectors = [df for df in results if df is not None]

    def selected_attributes(self) -> 'pd.DataFrame':
        """Intersect the selector results into one table.

        Starts from all training feature names and inner-merges each stored
        selector result on ``attributes``, so only features retained by
        every selector remain. Columns are renamed to display headers.

        Returns:
            A DataFrame with a ``Features`` column plus one score column per
            selector that contributed.
        """
        selected_attributes = pd.DataFrame({'attributes': self.X_train.columns})
        for df in self.selectors:
            selected_attributes = selected_attributes.merge(df, how='inner', on='attributes')

        selector_cols = ['rfe_rankings', 'pearson_p-value', 'chi2_p-value', 'anova_p-value']
        selectors_used = [col for col in selector_cols if col in selected_attributes.columns]
        # The prefix before the first underscore maps a score column to its
        # use_* flag (e.g. 'chi2_p-value' -> use_chi2). Rows with missing
        # scores are dropped only if a contributing selector's flag is off.
        if any(not getattr(self, f"use_{selector.split('_')[0]}") for selector in selectors_used):
            selected_attributes = selected_attributes.dropna(subset=selectors_used, how='any')

        selected_attributes = selected_attributes.rename(columns={
            'attributes': 'Features',
            'rfe_rankings': 'RFE Rankings',
            'pearson_p-value': "Pearson's Correlation (p-value)",
            'chi2_p-value': 'Chi-Square Test (p-value)',
            'anova_p-value': 'ANOVA (p-value)',
        })

        return selected_attributes
|
|
115 |
|
|
|
116 |
def main():
    """Command-line entry point: run feature selection and write a CSV.

    Parses the command-line arguments, builds a ``FeatureSelection``
    pipeline, runs every enabled selector, and writes the selected features
    to ``<input stem>_<timestamp>_Selected-Features.csv`` inside the output
    directory, creating that directory if needed.
    """
    print("\n")
    print("IntelliGenes Feature Selection/Biomarker Location...")

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--cgit_file', required=True,
                        help="input CSV file (needs 'Type' and 'ID' columns)")
    parser.add_argument('-o', '--output_dir', required=True,
                        help="directory the selected-features CSV is written to")
    parser.add_argument('--random_state', type=int, default=42,
                        help="random seed (default: 42)")
    parser.add_argument('--test_size', type=float, default=0.3,
                        help="test split size (default: 0.3)")
    parser.add_argument('--no_rfe', action='store_true',
                        help="skip Recursive Feature Elimination")
    parser.add_argument('--no_pearson', action='store_true',
                        help="skip Pearson's correlation")
    parser.add_argument('--no_chi2', action='store_true',
                        help="skip the chi-square test")
    parser.add_argument('--no_anova', action='store_true',
                        help="skip ANOVA")
    parser.add_argument('--normalize', action='store_true',
                        help="min-max normalize the features before selection")
    args = parser.parse_args()

    # The CLI exposes opt-out switches; the class takes opt-in flags.
    pipeline = FeatureSelection(
        cgit_file=args.cgit_file,
        output_dir=args.output_dir,
        random_state=args.random_state,
        test_size=args.test_size,
        use_rfe=not args.no_rfe,
        use_pearson=not args.no_pearson,
        use_chi2=not args.no_chi2,
        use_anova=not args.no_anova,
        use_normalization=args.normalize,
    )

    pipeline.execute_selectors()
    features_df = pipeline.selected_attributes()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.output_dir, exist_ok=True)

    file_name = Path(args.cgit_file).stem
    # 12-hour timestamp with AM/PM marker, kept as part of the file name.
    timestamp = datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')
    features_name = f"{file_name}_{timestamp}_Selected-Features.csv"
    features_file = os.path.join(args.output_dir, features_name)

    features_df.to_csv(features_file, index=False)
    print("\n Selected Features:", features_file, "\n")


if __name__ == '__main__':
    main()