[0a2626]: / intelligenes / selection.py

Download this file

159 lines (127 with data), 7.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# (Packages/Libraries) Matrix Manipulation
import pandas as pd
# (Packages/Libraries) Statistical Analysis & Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import pearsonr
# (Packages/Libraries) Miscellaneous
import argparse
import warnings
from sklearn.exceptions import ConvergenceWarning
import os
from datetime import datetime
from pathlib import Path
class FeatureSelection:
    """Select features from a CSV by intersecting several statistical selectors.

    The CSV is read with pandas. Its ``Type`` column is used as the target,
    its ``ID`` column is discarded, and every remaining column is treated as
    a feature. The data is split into train/test partitions and each enabled
    selector is fitted on the training partition only.

    Args:
        cgit_file: Path of the input CSV file.
        output_dir: Output directory; stored on the instance but not used by
            any method of this class.
        random_state: Seed for the train/test split and the RFE estimator.
        test_size: Fraction of rows held out by ``train_test_split``.
        use_rfe: Enable Recursive Feature Elimination.
        use_pearson: Enable the Pearson correlation selector.
        use_chi2: Enable the chi-square selector.
        use_anova: Enable the ANOVA F-test selector.
        use_normalization: Min-max scale the features before splitting.
    """

    def __init__(self, cgit_file: str, output_dir: str,
                 random_state: int = 42, test_size: float = 0.3,
                 use_rfe=True, use_pearson=True, use_chi2=True,
                 use_anova=True, use_normalization=False):
        self.cgit_file = cgit_file
        self.output_dir = output_dir
        self.random_state = random_state
        self.test_size = test_size
        self.use_rfe = use_rfe
        self.use_pearson = use_pearson
        self.use_chi2 = use_chi2
        self.use_anova = use_anova
        self.use_normalization = use_normalization

        self.df = pd.read_csv(self.cgit_file)
        self.y = self.df['Type']
        self.X = self.df.drop(['Type', 'ID'], axis=1)

        if self.use_normalization:
            # NOTE(review): the scaler is fitted on the full data set before
            # the split, so test rows influence the scaling of training rows.
            # Keep the original index so X stays row-aligned with y.
            self.X = pd.DataFrame(MinMaxScaler().fit_transform(self.X),
                                  columns=self.X.columns,
                                  index=self.X.index)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y,
            test_size=self.test_size,
            random_state=self.random_state)

        # Filled by execute_selectors(); one DataFrame per enabled selector.
        self.selectors = []

    def rfe_selector(self) -> 'pd.DataFrame | None':
        """Rank features with RFE and keep the best-ranked tenth of them.

        Returns:
            A DataFrame with columns ``attributes`` and ``rfe_rankings``,
            sorted by ranking, or ``None`` when RFE is disabled.
        """
        if not self.use_rfe:
            return None

        print("Recursive Feature Elimination...")
        estimator = DecisionTreeClassifier(random_state=self.random_state)
        rfe_selection = RFE(estimator=estimator,
                            n_features_to_select=1).fit(self.X_train, self.y_train)
        rfe_df = pd.DataFrame({'attributes': self.X_train.columns,
                               'rfe_rankings': rfe_selection.ranking_})

        # Keep rankings within the top 10% of the feature count. With fewer
        # than ten features that cutoff truncates to 0 and would discard
        # every feature, so always keep at least the best-ranked one.
        cutoff = max(1, int(self.X_train.shape[1] * .10))
        kept = rfe_df[rfe_df['rfe_rankings'] <= cutoff]
        return kept.sort_values(by='rfe_rankings')

    def pearson_selector(self) -> 'pd.DataFrame | None':
        """Keep features whose Pearson correlation with the target has p < 0.05.

        Returns:
            A DataFrame with columns ``attributes`` and ``pearson_p-value``,
            or ``None`` when the selector is disabled.
        """
        if not self.use_pearson:
            return None

        print("Pearson's Correlation...")
        # pearsonr returns (statistic, p-value); only the p-value is used.
        p_values = [pearsonr(self.X_train[column], self.y_train)[1]
                    for column in self.X_train.columns]
        pearson_df = pd.DataFrame({'attributes': self.X_train.columns,
                                   'pearson_p-value': p_values})
        return pearson_df[pearson_df['pearson_p-value'] < 0.05]

    def chi2_selector(self) -> 'pd.DataFrame | None':
        """Keep features whose chi-square test p-value is below 0.05.

        NOTE(review): scikit-learn documents chi2 as requiring non-negative
        feature values; confirm the input satisfies this or enable
        normalization.

        Returns:
            A DataFrame with columns ``attributes`` and ``chi2_p-value``,
            or ``None`` when the selector is disabled.
        """
        if not self.use_chi2:
            return None

        print("Chi-Square Test...")
        # Only pvalues_ is read, so k is irrelevant; 'all' avoids the
        # error/warning a fixed k=10 triggers with fewer than ten features.
        chi2_selection = SelectKBest(score_func=chi2, k='all').fit(self.X_train, self.y_train)
        chi2_df = pd.DataFrame({'attributes': self.X_train.columns,
                                'chi2_p-value': chi2_selection.pvalues_})
        return chi2_df[chi2_df['chi2_p-value'] < 0.05]

    def anova_selector(self) -> 'pd.DataFrame | None':
        """Keep features whose ANOVA F-test p-value is below 0.05.

        Returns:
            A DataFrame with columns ``attributes`` and ``anova_p-value``,
            or ``None`` when the selector is disabled.
        """
        if not self.use_anova:
            return None

        print("ANOVA...")
        # Only pvalues_ is read, so k is irrelevant; 'all' avoids the
        # error/warning a fixed k=10 triggers with fewer than ten features.
        anova_selection = SelectKBest(score_func=f_classif, k='all').fit(self.X_train, self.y_train)
        anova_df = pd.DataFrame({'attributes': self.X_train.columns,
                                 'anova_p-value': anova_selection.pvalues_})
        return anova_df[anova_df['anova_p-value'] < 0.05]

    def execute_selectors(self) -> None:
        """Run every selector and store the results of the enabled ones."""
        results = [self.rfe_selector(),
                   self.pearson_selector(),
                   self.chi2_selector(),
                   self.anova_selector()]
        # Disabled selectors return None and are skipped.
        self.selectors = [df for df in results if df is not None]

    def selected_attributes(self) -> 'pd.DataFrame':
        """Intersect the stored selector results into one table.

        A feature is kept only if it appears in the result of every stored
        selector (inner merge on the feature name). With no stored selectors
        every training feature is returned.

        Returns:
            A DataFrame with a ``Features`` column plus one human-readable
            column per selector that contributed.
        """
        merged = pd.DataFrame({'attributes': self.X_train.columns})
        for selector_df in self.selectors:
            merged = merged.merge(selector_df, how='inner', on='attributes')

        return merged.rename(columns={
            'attributes': 'Features',
            'rfe_rankings': 'RFE Rankings',
            'pearson_p-value': "Pearson's Correlation (p-value)",
            'chi2_p-value': 'Chi-Square Test (p-value)',
            'anova_p-value': 'ANOVA (p-value)',
        })
def main():
    """Command-line entry point.

    Parses the arguments, runs the enabled selectors through
    ``FeatureSelection``, and writes the selected features to
    ``<input stem>_<timestamp>_Selected-Features.csv`` inside the output
    directory, creating that directory if needed.
    """
    print("\n")
    print("IntelliGenes Feature Selection/Biomarker Location...")

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--cgit_file', required=True,
                        help="input CSV file with 'ID' and 'Type' columns plus feature columns")
    parser.add_argument('-o', '--output_dir', required=True,
                        help="directory the selected-features CSV is written to")
    parser.add_argument('--random_state', type=int, default=42,
                        help="seed for the train/test split and RFE estimator")
    parser.add_argument('--test_size', type=float, default=0.3,
                        help="fraction of rows held out as the test partition")
    parser.add_argument('--no_rfe', action='store_true',
                        help="disable Recursive Feature Elimination")
    parser.add_argument('--no_pearson', action='store_true',
                        help="disable the Pearson correlation selector")
    parser.add_argument('--no_chi2', action='store_true',
                        help="disable the chi-square selector")
    parser.add_argument('--no_anova', action='store_true',
                        help="disable the ANOVA selector")
    parser.add_argument('--normalize', action='store_true',
                        help="min-max scale the features before selection")
    args = parser.parse_args()

    pipeline = FeatureSelection(
        cgit_file=args.cgit_file,
        output_dir=args.output_dir,
        random_state=args.random_state,
        test_size=args.test_size,
        use_rfe=not args.no_rfe,
        use_pearson=not args.no_pearson,
        use_chi2=not args.no_chi2,
        use_anova=not args.no_anova,
        use_normalization=args.normalize,
    )
    pipeline.execute_selectors()
    features_df = pipeline.selected_attributes()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.output_dir, exist_ok=True)

    file_name = Path(args.cgit_file).stem
    # Timestamp the output name so repeated runs do not overwrite each other.
    features_name = f"{file_name}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_Selected-Features.csv"
    features_file = os.path.join(args.output_dir, features_name)
    features_df.to_csv(features_file, index=False)
    print("\n Selected Features:", features_file, "\n")
# Run the command-line pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()