intelligenes/classification.py

# (Packages/Libraries) Matrix Manipulation
import pandas as pd
import numpy as np

# (Packages/Libraries) Statistical Analysis & Machine Learning
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import shap
import matplotlib.pyplot as plt

# (Packages/Libraries) Miscellaneous
import os
from datetime import datetime
import argparse
import warnings
from pathlib import Path

warnings.filterwarnings("ignore", message = "No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored")

class DiseasePrediction:

    # Initialize DiseasePrediction Class
    def __init__(self: 'DiseasePrediction', cgit_file: str,
                 features_file: str,
                 output_dir: str,
                 voting: str,
                 random_state: int = 42,
                 test_size: float = 0.3,
                 n_splits: int = 5,
                 use_rf: bool = True,
                 use_svm: bool = True,
                 use_xgb: bool = True,
                 use_knn: bool = True,
                 use_mlp: bool = True,
                 use_tuning: bool = False,
                 use_normalization: bool = False,
                 use_igenes: bool = True,
                 use_visualization: bool = False):

        self.cgit_file = Path(cgit_file).stem
        self.df = pd.read_csv(cgit_file)
        self.features_file = features_file
        self.output_dir = output_dir
        self.voting = voting
        self.random_state = random_state
        self.test_size = test_size
        self.n_splits = n_splits
        self.use_rf = use_rf
        self.use_svm = use_svm
        self.use_xgb = use_xgb
        self.use_knn = use_knn
        self.use_mlp = use_mlp
        self.use_tuning = use_tuning
        self.use_normalization = use_normalization
        self.use_igenes = use_igenes
        self.use_visualization = use_visualization

        self.features = pd.read_csv(self.features_file)['Features'].values.flatten().tolist()
        if not self.features or self.features[0] == "Features":
            raise ValueError("Features not included.")

        self.y = self.df['Type']
        self.X = self.df[self.features]

        self.kfold = KFold(n_splits = self.n_splits, shuffle = True, random_state = self.random_state)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = self.test_size, random_state = self.random_state)

        self.classifiers = []
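
        # The 'Type' column is assumed to be a binary 0/1 outcome (presumably
        # 1 = cases, 0 = controls); predict_proba(...)[:, 1] and roc_auc_score
        # in the classifier methods below rely on that encoding.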

    # Random Forest Classifier
    def rf_classifier(self: 'DiseasePrediction'):
        if self.use_rf:
            print('Random Forest...')
            if self.use_normalization:
                pipeline = Pipeline([('scaling', StandardScaler()), ('rf_clf', RandomForestClassifier(random_state = self.random_state))])
            else:
                pipeline = Pipeline([('rf_clf', RandomForestClassifier(random_state = self.random_state))])

            if self.use_tuning:
                rf_parameters = {
                    'rf_clf__max_features': np.arange(1, self.X_train.shape[1] + 1),
                    'rf_clf__min_samples_split': np.arange(2, 11),
                    'rf_clf__min_samples_leaf': np.concatenate([np.arange(1, 11), [100, 150]])
                }

                rf_clf = GridSearchCV(pipeline, param_grid = rf_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
                rf_clf = rf_clf.best_estimator_
            else:
                rf_clf = pipeline.fit(self.X_train, self.y_train)

            if self.use_igenes or self.use_visualization:
                # TreeExplainer returns one SHAP matrix per class for random forests; [1] keeps the positive (cases) class
                rf_importances = shap.TreeExplainer(rf_clf.named_steps['rf_clf']).shap_values(self.X_test)[1]

                if self.use_igenes:
                    rf_importances_extracted = np.mean(rf_importances, axis = 0)
                    rf_importances_normalized = (np.abs(rf_importances_extracted) / np.sum(np.abs(rf_importances_extracted))) * 100
                    rf_hhi = np.sum(np.square(rf_importances_normalized))

                if self.use_visualization:
                    shap.summary_plot(rf_importances, self.X_test, plot_type = "dot", show = False)

                    plt.title("Random Forest Feature Importances", fontsize = 16)
                    plt.xlabel("SHAP Value", fontsize = 14)
                    plt.ylabel("Feature", fontsize = 14)
                    plt.tight_layout()

                    file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_RF-SHAP.png"
                    rf_plot = os.path.join(self.output_dir, file_name)

                    if not os.path.exists(self.output_dir):
                        os.makedirs(self.output_dir)

                    plt.savefig(rf_plot)
                    plt.close()

            y_pred = rf_clf.predict(self.X_test)
            y_proba = rf_clf.predict_proba(self.X_test)[:, 1]

            rf_accuracy = accuracy_score(self.y_test, y_pred)
            rf_roc_auc = roc_auc_score(self.y_test, y_proba)
            rf_f1 = f1_score(self.y_test, y_pred, average = 'weighted')

            if self.use_igenes:
                return rf_clf, rf_accuracy, rf_roc_auc, rf_f1, rf_importances_extracted / np.max(np.abs(rf_importances_extracted)), rf_hhi, rf_importances
            else:
                return rf_clf, rf_accuracy, rf_roc_auc, rf_f1

        return None
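
    # Each classifier method condenses its SHAP matrix the same way: the mean
    # SHAP value per feature, percentage shares s_i = 100 * |m_i| / sum_j |m_j|,
    # and a Herfindahl-Hirschman Index HHI = sum_i s_i**2, which runs from
    # 10000 / n_features (importance spread evenly) up to 10000 (a single
    # feature dominates). igenes_scores() later uses the HHI to weight classifiers.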

    # Support Vector Machine Classifier
    def svm_classifier(self: 'DiseasePrediction'):
        if self.use_svm:
            print('Support Vector Machine...')
            if self.use_normalization:
                pipeline = Pipeline([('scaling', StandardScaler()), ('svm_clf', SVC(random_state = self.random_state, kernel = 'linear', probability = True))])
            else:
                pipeline = Pipeline([('svm_clf', SVC(random_state = self.random_state, kernel = 'linear', probability = True))])

            if self.use_tuning:
                # gamma has no effect with a linear kernel, so that grid axis only adds runtime
                svm_parameters = {
                    "svm_clf__kernel": ['linear'],
                    'svm_clf__gamma': [0.001, 0.01, 0.1, 1, 10],
                    'svm_clf__C': [0.01, 0.1, 1, 10, 50, 100, 500, 1000, 5000, 10000]
                }

                svm_clf = GridSearchCV(pipeline, param_grid = svm_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
                svm_clf = svm_clf.best_estimator_
            else:
                svm_clf = pipeline.fit(self.X_train, self.y_train)

            if self.use_igenes or self.use_visualization:
                svm_importances = shap.LinearExplainer(svm_clf.named_steps['svm_clf'], masker = shap.maskers.Independent(self.X_train)).shap_values(self.X_test)

                if self.use_igenes:
                    svm_importances_extracted = np.mean(svm_importances, axis = 0)
                    svm_importances_normalized = (np.abs(svm_importances_extracted) / np.sum(np.abs(svm_importances_extracted))) * 100
                    svm_hhi = np.sum(np.square(svm_importances_normalized))

                if self.use_visualization:
                    shap.summary_plot(svm_importances, self.X_test, plot_type = "dot", show = False)

                    plt.title("Support Vector Machine Feature Importances", fontsize = 16)
                    plt.xlabel("SHAP Value", fontsize = 14)
                    plt.ylabel("Feature", fontsize = 14)
                    plt.tight_layout()

                    file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_SVM-SHAP.png"
                    svm_plot = os.path.join(self.output_dir, file_name)

                    if not os.path.exists(self.output_dir):
                        os.makedirs(self.output_dir)

                    plt.savefig(svm_plot)
                    plt.close()

            y_pred = svm_clf.predict(self.X_test)
            y_proba = svm_clf.predict_proba(self.X_test)[:, 1]

            svm_accuracy = accuracy_score(self.y_test, y_pred)
            svm_roc_auc = roc_auc_score(self.y_test, y_proba)
            svm_f1 = f1_score(self.y_test, y_pred, average = 'weighted')

            if self.use_igenes:
                return svm_clf, svm_accuracy, svm_roc_auc, svm_f1, svm_importances_extracted / np.max(np.abs(svm_importances_extracted)), svm_hhi, svm_importances
            else:
                return svm_clf, svm_accuracy, svm_roc_auc, svm_f1

        return None

    # XGBoost Classifier
    def xgb_classifier(self: 'DiseasePrediction'):
        if self.use_xgb:
            print('XGBoost...')
            if self.use_normalization:
                pipeline = Pipeline([('scaling', StandardScaler()), ('xgb_clf', xgb.XGBClassifier(random_state = self.random_state, objective = "binary:logistic"))])
            else:
                pipeline = Pipeline([('xgb_clf', xgb.XGBClassifier(random_state = self.random_state, objective = "binary:logistic"))])

            if self.use_tuning:
                xgb_parameters = {
                    "xgb_clf__n_estimators": [int(x) for x in np.linspace(100, 500, 5)],
                    "xgb_clf__max_depth": [int(x) for x in np.linspace(3, 9, 4)],
                    "xgb_clf__gamma": [0.01, 0.1],
                    "xgb_clf__learning_rate": [0.001, 0.01, 0.1, 1]
                }

                xgb_clf = GridSearchCV(pipeline, param_grid = xgb_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
                xgb_clf = xgb_clf.best_estimator_
            else:
                xgb_clf = pipeline.fit(self.X_train, self.y_train)

            if self.use_igenes or self.use_visualization:
                xgb_importances = shap.TreeExplainer(xgb_clf.named_steps['xgb_clf']).shap_values(self.X_test)

                if self.use_igenes:
                    xgb_importances_extracted = np.mean(xgb_importances, axis = 0)
                    xgb_importances_normalized = (np.abs(xgb_importances_extracted) / np.sum(np.abs(xgb_importances_extracted))) * 100
                    xgb_hhi = np.sum(np.square(xgb_importances_normalized))

                if self.use_visualization:
                    shap.summary_plot(xgb_importances, self.X_test, plot_type = "dot", show = False)

                    plt.title("XGBoost Feature Importances", fontsize = 16)
                    plt.xlabel("SHAP Value", fontsize = 14)
                    plt.ylabel("Feature", fontsize = 14)
                    plt.tight_layout()

                    file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_XGB-SHAP.png"
                    xgb_plot = os.path.join(self.output_dir, file_name)

                    if not os.path.exists(self.output_dir):
                        os.makedirs(self.output_dir)

                    plt.savefig(xgb_plot)
                    plt.close()

            y_pred = xgb_clf.predict(self.X_test)
            y_proba = xgb_clf.predict_proba(self.X_test)[:, 1]

            xgb_accuracy = accuracy_score(self.y_test, y_pred)
            xgb_roc_auc = roc_auc_score(self.y_test, y_proba)
            xgb_f1 = f1_score(self.y_test, y_pred, average = 'weighted')

            if self.use_igenes:
                return xgb_clf, xgb_accuracy, xgb_roc_auc, xgb_f1, xgb_importances_extracted / np.max(np.abs(xgb_importances_extracted)), xgb_hhi, xgb_importances
            else:
                return xgb_clf, xgb_accuracy, xgb_roc_auc, xgb_f1

        return None

    # k-Nearest Neighbors Classifier
    def knn_classifier(self: 'DiseasePrediction'):
        if self.use_knn:
            print('k-Nearest Neighbors...')
            if self.use_normalization:
                pipeline = Pipeline([('scaling', StandardScaler()), ('knn_clf', KNeighborsClassifier())])
            else:
                pipeline = Pipeline([('knn_clf', KNeighborsClassifier())])

            if self.use_tuning:
                knn_parameters = {
                    "knn_clf__leaf_size": list(range(1, 50)),
                    "knn_clf__n_neighbors": list(range(1, min(len(self.X_train) - 1, 30) + 1)),
                    "knn_clf__p": [1, 2]
                }

                knn_clf = GridSearchCV(pipeline, param_grid = knn_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
                knn_clf = knn_clf.best_estimator_
            else:
                knn_clf = pipeline.fit(self.X_train, self.y_train)

            if self.use_igenes or self.use_visualization:
                knn_importances = shap.KernelExplainer(knn_clf.named_steps['knn_clf'].predict, shap.sample(self.X_train, 1000)).shap_values(self.X_test)

                if self.use_igenes:
                    knn_importances_extracted = np.mean(knn_importances, axis = 0)
                    knn_importances_normalized = (np.abs(knn_importances_extracted) / np.sum(np.abs(knn_importances_extracted))) * 100
                    knn_hhi = np.sum(np.square(knn_importances_normalized))

                if self.use_visualization:
                    shap.summary_plot(knn_importances, self.X_test, plot_type = "dot", show = False)

                    plt.title("k-Nearest Neighbors Feature Importances", fontsize = 16)
                    plt.xlabel("SHAP Value", fontsize = 14)
                    plt.ylabel("Feature", fontsize = 14)
                    plt.tight_layout()

                    file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_kNN-SHAP.png"
                    knn_plot = os.path.join(self.output_dir, file_name)

                    if not os.path.exists(self.output_dir):
                        os.makedirs(self.output_dir)

                    plt.savefig(knn_plot)
                    plt.close()

            y_pred = knn_clf.predict(self.X_test)
            y_proba = knn_clf.predict_proba(self.X_test)[:, 1]

            knn_accuracy = accuracy_score(self.y_test, y_pred)
            knn_roc_auc = roc_auc_score(self.y_test, y_proba)
            knn_f1 = f1_score(self.y_test, y_pred, average = 'weighted')

            if self.use_igenes:
                return knn_clf, knn_accuracy, knn_roc_auc, knn_f1, knn_importances_extracted / np.max(np.abs(knn_importances_extracted)), knn_hhi, knn_importances
            else:
                return knn_clf, knn_accuracy, knn_roc_auc, knn_f1

        return None

    # Multi-Layer Perceptron Classifier
    def mlp_classifier(self: 'DiseasePrediction'):
        if self.use_mlp:
            print('Multi-Layer Perceptron...')
            if self.use_normalization:
                pipeline = Pipeline([('scaling', StandardScaler()), ('mlp_clf', MLPClassifier(random_state = self.random_state, max_iter = 3000))])
            else:
                pipeline = Pipeline([('mlp_clf', MLPClassifier(random_state = self.random_state, max_iter = 2000))])

            if self.use_tuning:
                mlp_parameters = {
                    'mlp_clf__hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,), (100, 100), (100, 100, 100)],
                    'mlp_clf__activation': ['tanh', 'relu'],
                    'mlp_clf__solver': ['sgd', 'adam'],
                    'mlp_clf__alpha': [0.0001, 0.001, 0.01, 0.05, 0.1],
                    'mlp_clf__learning_rate': ['constant', 'adaptive'],
                    'mlp_clf__learning_rate_init': [0.001, 0.01, 0.1],
                    'mlp_clf__momentum': [0.1, 0.2, 0.5, 0.9]
                }

                mlp_clf = GridSearchCV(pipeline, param_grid = mlp_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
                mlp_clf = mlp_clf.best_estimator_
            else:
                mlp_clf = pipeline.fit(self.X_train, self.y_train)

            if self.use_igenes or self.use_visualization:
                mlp_importances = shap.KernelExplainer(mlp_clf.named_steps['mlp_clf'].predict, shap.sample(self.X_train, 1000)).shap_values(self.X_test)

                if self.use_igenes:
                    mlp_importances_extracted = np.mean(mlp_importances, axis = 0)
                    mlp_importances_normalized = (np.abs(mlp_importances_extracted) / np.sum(np.abs(mlp_importances_extracted))) * 100
                    mlp_hhi = np.sum(np.square(mlp_importances_normalized))

                if self.use_visualization:
                    shap.summary_plot(mlp_importances, self.X_test, plot_type = "dot", show = False)

                    plt.title("Multi-Layer Perceptron Feature Importances", fontsize = 16)
                    plt.xlabel("SHAP Value", fontsize = 14)
                    plt.ylabel("Feature", fontsize = 14)
                    plt.tight_layout()

                    file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_MLP-SHAP.png"
                    mlp_plot = os.path.join(self.output_dir, file_name)

                    if not os.path.exists(self.output_dir):
                        os.makedirs(self.output_dir)

                    plt.savefig(mlp_plot)
                    plt.close()

            y_pred = mlp_clf.predict(self.X_test)
            y_proba = mlp_clf.predict_proba(self.X_test)[:, 1]

            mlp_accuracy = accuracy_score(self.y_test, y_pred)
            mlp_roc_auc = roc_auc_score(self.y_test, y_proba)
            mlp_f1 = f1_score(self.y_test, y_pred, average = 'weighted')

            if self.use_igenes:
                return mlp_clf, mlp_accuracy, mlp_roc_auc, mlp_f1, mlp_importances_extracted / np.max(np.abs(mlp_importances_extracted)), mlp_hhi, mlp_importances
            else:
                return mlp_clf, mlp_accuracy, mlp_roc_auc, mlp_f1

        return None
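
    # Explainer choice follows model structure: TreeExplainer for the tree
    # ensembles (Random Forest, XGBoost), LinearExplainer for the linear-kernel
    # SVM, and the model-agnostic (but slower) KernelExplainer for k-NN and the
    # MLP, each backed by a sample of at most 1000 training rows.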

    # Voting Classifier
    def voting_classifier(self: 'DiseasePrediction'):
        if self.classifiers:
            print('Voting Classifier...')
            voting_clf = VotingClassifier(estimators = [(clf[0], clf[1]["Classifier"]) for clf in self.classifiers], voting = self.voting).fit(self.X_train, self.y_train)

            y_pred = voting_clf.predict(self.X_test)
            y_proba = voting_clf.predict_proba(self.X_test)[:, 1]

            voting_accuracy = accuracy_score(self.y_test, y_pred)
            voting_roc_auc = roc_auc_score(self.y_test, y_proba)
            voting_f1 = f1_score(self.y_test, y_pred, average = 'weighted')

            return voting_clf, voting_accuracy, voting_roc_auc, voting_f1

        return None
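
    # With voting = 'soft' (the CLI default), the ensemble averages the members'
    # predicted probabilities; with 'hard', it takes a majority vote of labels.
    # scikit-learn's VotingClassifier does not support predict_proba under hard
    # voting, so the ROC-AUC above requires soft voting.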

    # Execute Classifiers
    def execute_classifiers(self: 'DiseasePrediction'):
        classifiers = [
            ("Random Forest", self.rf_classifier),
            ("Support Vector Machine", self.svm_classifier),
            ("XGBoost", self.xgb_classifier),
            ("k-Nearest Neighbors", self.knn_classifier),
            ("Multi-Layer Perceptron", self.mlp_classifier)
        ]

        self.classifiers = []
        for name, classifier in classifiers:
            metrics_tuple = classifier()
            if metrics_tuple is not None:
                clf, accuracy, roc_auc, f1, *rest = metrics_tuple

                classifier_info = {
                    "Classifier": clf,
                    "Accuracy": accuracy,
                    "ROC-AUC": roc_auc,
                    "F1": f1
                }

                if len(rest) == 2:
                    classifier_info["Importances"] = rest[0]
                    classifier_info["HHI"] = rest[1]
                elif len(rest) == 3:
                    classifier_info["Importances"] = rest[0]
                    classifier_info["HHI"] = rest[1]
                    classifier_info["SHAP Values"] = rest[2]

                self.classifiers.append((name, classifier_info))

        voting_metrics_tuple = self.voting_classifier()
        if voting_metrics_tuple:
            voting_clf, voting_accuracy, voting_roc_auc, voting_f1 = voting_metrics_tuple
            self.classifiers.append(("Voting Classifier", {
                "Classifier": voting_clf,
                "Accuracy": voting_accuracy,
                "ROC-AUC": voting_roc_auc,
                "F1": voting_f1
            }))

    # Generate Classifier Metrics
    def classifier_metrics(self: 'DiseasePrediction'):
        if hasattr(self, 'classifiers') and self.classifiers:
            metrics_df = pd.DataFrame({
                'Classifier': [clf[0] for clf in self.classifiers],
                'Accuracy': [clf[1]['Accuracy'] for clf in self.classifiers],
                'ROC-AUC': [clf[1]['ROC-AUC'] for clf in self.classifiers],
                'F1': [clf[1]['F1'] for clf in self.classifiers]
            })

            return metrics_df

    # Generate I-Genes Profile
    def igenes_scores(self: 'DiseasePrediction'):
        if self.use_igenes:
            print('Generating I-Genes Scores...')

            if self.classifiers:
                normalized_importances = []
                herfindahl_hirschman_indices = []
                SHAP_values = []

                for name, metrics in self.classifiers:
                    if name != "Voting Classifier":
                        if "Importances" in metrics:
                            normalized_importances.append(metrics["Importances"])
                            herfindahl_hirschman_indices.append(metrics["HHI"])
                        if "SHAP Values" in metrics:
                            SHAP_values.append(metrics["SHAP Values"])

                if not normalized_importances or not herfindahl_hirschman_indices or not SHAP_values:
                    return

                # Count only the classifiers that contributed importances (the Voting Classifier does not)
                num_classifiers = len(normalized_importances)
                hhi_weights = (np.array(herfindahl_hirschman_indices) / np.sum(herfindahl_hirschman_indices))
                initial_weights = (1 / num_classifiers)
                final_weights = (initial_weights + (initial_weights * hhi_weights))

                igenes_scores = np.zeros_like(normalized_importances[0])
                for weight, importance in zip(final_weights, normalized_importances):
                    igenes_scores += weight * np.abs(importance)

                # A net-positive mean SHAP value pushes predictions toward cases, a net-negative one toward controls
                case_control_predictions = ['Cases' if importances >= 0 else 'Controls' for importances in np.sum(normalized_importances, axis = 0)]

                igenes_df = pd.DataFrame({
                    'Feature': self.X_train.columns,
                    'I-Genes Score': np.round(igenes_scores, 4),
                    'Prediction': case_control_predictions
                })

                # Rescale each classifier's SHAP matrix once, then aggregate across classifiers
                SHAP_normalized = [SHAP / np.max(np.abs(SHAP)) for SHAP in SHAP_values]

                feature_expression = []
                for i, prediction in enumerate(case_control_predictions):
                    if prediction in ['Cases', 'Controls']:
                        aggregated_shap = np.sum([SHAP[:, i] for SHAP in SHAP_normalized], axis = 0)

                        relevant_shap_samples = aggregated_shap > 0 if prediction == 'Cases' else aggregated_shap < 0
                        if len(relevant_shap_samples) != len(self.X_test):
                            raise ValueError("Length of SHAP samples does not match the number of rows in X_test.")

                        feature_average = self.X_test.iloc[relevant_shap_samples, i].mean()
                        overall_average = self.X_test.iloc[:, i].mean()

                        if feature_average > overall_average:
                            feature_expression.append('Overexpression')
                        elif feature_average < overall_average:
                            feature_expression.append('Underexpression')
                        else:
                            feature_expression.append('Inconclusive')
                    else:
                        feature_expression.append('Not Applicable')

                igenes_df['Expression'] = feature_expression
                igenes_df['I-Genes Rankings'] = igenes_df['I-Genes Score'].rank(ascending = False).astype(int)
                igenes_df = igenes_df.sort_values(by = 'I-Genes Rankings')

                return igenes_df
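
    # Weighting used above: with K per-classifier importance vectors and
    # Herfindahl-Hirschman indices HHI_1..HHI_K, classifier k receives
    # w_k = (1 / K) * (1 + HHI_k / (HHI_1 + ... + HHI_K)), and a feature's
    # I-Genes score is sum_k w_k * |importance_k|.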

# Execute DiseasePrediction Class
def main():
    print("IntelliGenes Disease Prediction and I-Genes Scores...")

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--cgit_file', required = True)
    parser.add_argument('-f', '--features_file', required = True)
    parser.add_argument('-o', '--output_dir', required = True)
    parser.add_argument('--voting', type = str, default = 'soft')
    parser.add_argument('--random_state', type = int, default = 42)
    parser.add_argument('--test_size', type = float, default = 0.3)
    parser.add_argument('--n_splits', type = int, default = 5)
    parser.add_argument('--no_rf', action = 'store_true')
    parser.add_argument('--no_svm', action = 'store_true')
    parser.add_argument('--no_xgb', action = 'store_true')
    parser.add_argument('--no_knn', action = 'store_true')
    parser.add_argument('--no_mlp', action = 'store_true')
    parser.add_argument('--tune', action = 'store_true')
    parser.add_argument('--normalize', action = 'store_true')
    parser.add_argument('--no_igenes', action = 'store_true')
    parser.add_argument('--no_visualizations', action = 'store_true')

    args = parser.parse_args()

    pipeline = DiseasePrediction(cgit_file = args.cgit_file,
                                 features_file = args.features_file,
                                 output_dir = args.output_dir,
                                 voting = args.voting,
                                 random_state = args.random_state,
                                 test_size = args.test_size,
                                 n_splits = args.n_splits,
                                 use_rf = not args.no_rf,
                                 use_svm = not args.no_svm,
                                 use_xgb = not args.no_xgb,
                                 use_knn = not args.no_knn,
                                 use_mlp = not args.no_mlp,
                                 use_tuning = args.tune,
                                 use_normalization = args.normalize,
                                 use_igenes = not args.no_igenes,
                                 use_visualization = not args.no_visualizations)

    pipeline.execute_classifiers()
    metrics_df = pipeline.classifier_metrics()
    igenes_df = pipeline.igenes_scores()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    file_name = Path(args.cgit_file).stem
    metrics_name = f"{file_name}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_Classifier-Metrics.csv"
    metrics_file = os.path.join(args.output_dir, metrics_name)

    metrics_df.to_csv(metrics_file, index = False)
    print("\n Classifier Metrics:", metrics_file, "\n")

    if not args.no_igenes:
        igenes_name = f"{file_name}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_I-Genes-Score.csv"
        igenes_file = os.path.join(args.output_dir, igenes_name)

        igenes_df.to_csv(igenes_file, index = False)
        print("\n I-Genes Scores:", igenes_file, "\n")

if __name__ == '__main__':
    main()
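
# Example invocation (file names are illustrative):
#
#   python classification.py -i diabetes_cgit.csv -f selected_features.csv -o results --tune --normalize
#
# The CGIT file must contain a binary 'Type' column plus every feature named in
# the features file's 'Features' column; classifier metrics, I-Genes scores,
# and any SHAP plots are written to the output directory with timestamped names.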