[e6e569]: / FeatureSelection / feature_selection.py

import sys
import os
# Add the parent directory to the system path
sys.path.append(os.path.abspath('../')) # Adjust the path as needed
from my_util import df_to_corr_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from joblib import Parallel, delayed
from pickle import dump, load
TARGET_NUM_OF_FEATURES = 50
# Read data
training_file = "../TrainDataset2024.xls"
data = pd.read_excel(training_file)
data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
data_no_na = data.replace(999, np.nan)
data_no_na.dropna(ignore_index=True, inplace=True)
X = data_no_na.drop('pCR (outcome)', axis=1)
y = data_no_na['pCR (outcome)']
# Drop highly correlated features
CORR_THRESHOLD = 0.9
# Create a correlation matrix
correlation_matrix = X.corr()
highly_correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > CORR_THRESHOLD:
            highly_correlated_features.add(correlation_matrix.columns[i])
X_no_highly_correlated = X.drop(columns=highly_correlated_features)
scaler = StandardScaler()
Xs = scaler.fit_transform(X_no_highly_correlated)
Xs = pd.DataFrame(Xs, columns=X_no_highly_correlated.columns)
def process_k_best(K, i):
    # i is only a run index; mutual_info_classif is stochastic, so repeated
    # fits of SelectKBest can return different top-K features
    k_best = SelectKBest(score_func=mutual_info_classif, k=K)
    k_best.fit(Xs, y)
    return k_best.get_feature_names_out()
# Count how often each feature survives across different values of K
features = {}
# Run the repeated selections in parallel
for K in range(1, TARGET_NUM_OF_FEATURES + 5):
    best = {}
    # Repeat the K-best selection K + 5 times in parallel
    results = Parallel(n_jobs=-1)(delayed(process_k_best)(K, i) for i in range(K + 5))
    for feature_list in results:
        for feature in feature_list:
            if feature in best:
                best[feature] += 1
            else:
                best[feature] = 1
    sorted_best = dict(sorted(best.items(), key=lambda item: item[1], reverse=True))
    # Credit features that were selected in nearly every repeat for this K
    for key in best:
        if best[key] > (K - 2):
            features[key] = features.get(key, 0) + 1
sorted_features = dict(sorted(features.items(), key=lambda item: item[1], reverse=True))
feature_names = list(sorted_features.keys())
# Number of features tied at the highest selection count in the final run
num_max = sum(np.array(list(sorted_best.values())) == max(sorted_best.values()))
num_of_features = max(num_max, TARGET_NUM_OF_FEATURES)
# Always keep these features regardless of ranking, then add the top-ranked ones
important_features = ["Gene", "ER", "HER2"]
selected_features = list(set(important_features + feature_names[:num_of_features]))
num_of_features = len(selected_features)
print(f"Best {num_of_features} features are: ")
print(selected_features)
os.makedirs("pkl", exist_ok=True)  # Ensure the output directory exists
with open(f"pkl/{num_of_features}_selected_features.pkl", "wb") as file:
    dump(selected_features, file)
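
# --- Illustrative downstream usage (a sketch, not part of the original pipeline) ---
# Assuming a later script has access to the same pkl/ directory, the saved
# feature list could be reloaded and used to subset a dataframe, e.g.:
#
#     with open(f"pkl/{num_of_features}_selected_features.pkl", "rb") as file:
#         selected_features = load(file)
#     X_selected = data_no_na[selected_features]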