Switch to unified view

a b/intelligenes/classification.py
1
# (Packages/Libraries) Matrix Manipulation
2
import pandas as pd
3
import numpy as np
4
5
# (Packages/Libraries) Statistical Analysis & Machine Learning
6
import sklearn
7
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
8
from sklearn.pipeline import Pipeline
9
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
10
from sklearn.svm import SVC
11
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
12
from sklearn.preprocessing import StandardScaler
13
from sklearn.neighbors import KNeighborsClassifier
14
from sklearn.neural_network import MLPClassifier
15
import xgboost as xgb
16
import shap
17
import matplotlib.pyplot as plt
18
19
# (Packages/Libraries) Miscellaneous
20
import os
21
from datetime import datetime
22
import argparse
23
import warnings
24
from pathlib import Path
25
26
warnings.filterwarnings("ignore", message = "No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored")
27
28
class DiseasePrediction:
29
    
30
    # Initialize DiseasePrediction Class 
31
    def __init__(self: 'DiseasePrediction', cgit_file: str,
32
                                            features_file: str, 
33
                                            output_dir: str,
34
                                            voting: str,
35
                                            random_state: 42,
36
                                            test_size: 0.3,
37
                                            n_splits: 5,
38
                                            use_rf = True,
39
                                            use_svm = True,
40
                                            use_xgb = True,
41
                                            use_knn = True,
42
                                            use_mlp = True,
43
                                            use_tuning = False,
44
                                            use_normalization = False,
45
                                            use_igenes = True,
46
                                            use_visualization = False):
47
        
48
49
        self.cgit_file = Path(cgit_file).stem
50
        self.df = pd.read_csv(cgit_file)
51
        self.features_file = features_file
52
        self.output_dir = output_dir
53
        self.voting = voting
54
        self.random_state = random_state
55
        self.test_size = test_size
56
        self.n_splits = n_splits
57
        self.use_rf = use_rf
58
        self.use_svm = use_svm
59
        self.use_xgb = use_xgb
60
        self.use_knn = use_knn
61
        self.use_mlp = use_mlp
62
        self.use_tuning = use_tuning
63
        self.use_normalization = use_normalization
64
        self.use_igenes = use_igenes
65
        self. use_visualization = use_visualization
66
        
67
        self.features = pd.read_csv(self.features_file)['Features'].values.flatten().tolist()
68
        if not self.features or self.features[0] == "Features":
69
            raise ValueError("Features not included.")
70
        
71
        self.y = self.df['Type']
72
        self.X = self.df[self.features]
73
        
74
        self.kfold = KFold(n_splits = self.n_splits, shuffle = True, random_state = self.random_state)
75
        
76
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = self.test_size, random_state = self.random_state)
77
        
78
        self.classifiers = [] 
79
            
80
    # Random Forest Classifier 
81
    def rf_classifier(self: 'DiseasePrediction'):
82
        if self.use_rf:         
83
            print('Random Forest...')
84
            if self.use_normalization:
85
                pipeline = Pipeline([('scaling', StandardScaler()), ('rf_clf', RandomForestClassifier(random_state = self.random_state))])
86
            else:
87
                pipeline = Pipeline([('rf_clf', RandomForestClassifier(random_state = self.random_state))])
88
            
89
            if self.use_tuning:
90
                rf_parameters = {
91
                    'rf_clf__max_features': np.arange(1, self.X_train.shape[1] + 1),
92
                    'rf_clf__min_samples_split': np.arange(2, 11),
93
                    'rf_clf__min_samples_leaf': np.concatenate([np.arange(1, 11), [100, 150]])  
94
                }
95
                
96
                rf_clf = GridSearchCV(pipeline, param_grid = rf_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
97
                rf_clf = rf_clf.best_estimator_
98
            else: 
99
                rf_clf = pipeline.fit(self.X_train, self.y_train)
100
                
101
            if self.use_igenes or self.use_visualization:
102
                rf_importances = shap.TreeExplainer(rf_clf.named_steps['rf_clf']).shap_values(self.X_test)[1]
103
            
104
            if self.use_igenes:
105
                rf_importances_extracted = np.mean(rf_importances, axis = 0)
106
                rf_importances_normalized = (np.abs(rf_importances_extracted) / np.sum(np.abs(rf_importances_extracted))) * 100
107
                rf_hhi = np.sum(np.square(rf_importances_normalized))
108
                
109
            if self.use_visualization:
110
               shap.summary_plot(rf_importances, self.X_test, plot_type = "dot", show = False)
111
               
112
               plt.title("Random Forest Feature Importances", fontsize = 16)
113
               plt.xlabel("SHAP Value", fontsize = 14)
114
               plt.ylabel("Feature", fontsize = 14)
115
               plt.tight_layout()
116
               
117
               file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_RF-SHAP.png"
118
               rf_plot = os.path.join(self.output_dir, file_name)
119
               
120
               if not os.path.exists(self.output_dir):
121
                   os.makedirs(self.output_dir)
122
                   
123
               plt.savefig(rf_plot)
124
               plt.close()
125
               
126
            y_pred = rf_clf.predict(self.X_test)
127
            y_proba = rf_clf.predict_proba(self.X_test)[:, 1]
128
            
129
            rf_accuracy = accuracy_score(self.y_test, y_pred)
130
            rf_roc_auc = roc_auc_score(self.y_test, y_proba)
131
            rf_f1 = f1_score(self.y_test, y_pred, average = 'weighted')
132
            
133
            if self.use_igenes:
134
                return rf_clf, rf_accuracy, rf_roc_auc, rf_f1, rf_importances_extracted / np.max(np.abs(rf_importances_extracted)), rf_hhi, rf_importances
135
            else:
136
                return rf_clf, rf_accuracy, rf_roc_auc, rf_f1                  
137
        
138
        return None
139
    
140
    # Support Vector Machine Classifier 
141
    def svm_classifier(self: 'DiseasePrediction'):
142
        if self.use_svm:
143
            print('Support Vector Machine...')
144
            if self.use_normalization:
145
                pipeline = Pipeline([('scaling', StandardScaler()), ('svm_clf', SVC(random_state = self.random_state, kernel = 'linear', probability = True))])
146
            else:
147
                pipeline = Pipeline([('svm_clf', SVC(random_state = self.random_state, kernel = 'linear', probability = True))])
148
                
149
            if self.use_tuning:
150
                svm_parameters = {
151
                    "svm_clf__kernel": ['linear'],
152
                    'svm_clf__gamma': [0.001, 0.01, 0.1, 1, 10],
153
                    'svm_clf__C': [0.01, 0.1, 1, 10, 50, 100, 500, 1000, 5000, 10000]
154
                }
155
                
156
                svm_clf = GridSearchCV(pipeline, param_grid = svm_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
157
                svm_clf = svm_clf.best_estimator_
158
            else: 
159
                svm_clf = pipeline.fit(self.X_train, self.y_train)
160
                
161
            if self.use_igenes or self.use_visualization:
162
                svm_importances = shap.LinearExplainer(svm_clf.named_steps['svm_clf'], masker = shap.maskers.Independent(self.X_train)).shap_values(self.X_test)
163
            
164
            if self.use_igenes:
165
                svm_importances_extracted = np.mean(svm_importances, axis = 0)
166
                svm_importances_normalized = (np.abs(svm_importances_extracted) / np.sum(np.abs(svm_importances_extracted))) * 100
167
                svm_hhi = np.sum(np.square(svm_importances_normalized))
168
                
169
            if self.use_visualization:
170
                shap.summary_plot(svm_importances, self.X_test, plot_type = "dot", show = False)
171
                
172
                plt.title("Support Vector Machine Feature Importances", fontsize = 16)
173
                plt.xlabel("SHAP Value", fontsize = 14)
174
                plt.ylabel("Feature", fontsize = 14)
175
                plt.tight_layout()
176
                
177
                file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_SVM-SHAP.png"
178
                svm_plot = os.path.join(self.output_dir, file_name)
179
                
180
                if not os.path.exists(self.output_dir):
181
                   os.makedirs(self.output_dir)
182
                   
183
                plt.savefig(svm_plot)
184
                plt.close()
185
            
186
            y_pred = svm_clf.predict(self.X_test)
187
            y_proba = svm_clf.predict_proba(self.X_test)[:, 1]
188
            
189
            svm_accuracy = accuracy_score(self.y_test, y_pred)
190
            svm_roc_auc = roc_auc_score(self.y_test, y_proba)
191
            svm_f1 = f1_score(self.y_test, y_pred, average = 'weighted')
192
            
193
            if self.use_igenes:
194
                return svm_clf, svm_accuracy, svm_roc_auc, svm_f1, svm_importances_extracted / np.max(np.abs(svm_importances_extracted)), svm_hhi, svm_importances
195
            else: 
196
                return svm_clf, svm_accuracy, svm_roc_auc
197
            
198
        return None
199
    
200
    # XGBoost Classifier 
201
    def xgb_classifier(self: 'DiseasePrediction'):
202
        if self.use_xgb: 
203
            print('XGBoost...')
204
            if self.use_normalization:
205
                pipeline = Pipeline([('scaling', StandardScaler()), ('xgb_clf', xgb.XGBClassifier(random_state = self.random_state, objective = "reg:squarederror"))])
206
            else:
207
                pipeline = Pipeline([('xgb_clf', xgb.XGBClassifier(random_state = self.random_state, objective = "binary:logistic"))])
208
                
209
            if self.use_tuning:
210
                xgb_parameters = {
211
                    "xgb_clf__n_estimators": [int(x) for x in np.linspace(100, 500, 5)],
212
                    "xgb_clf__max_depth": [int(x) for x in np.linspace(3, 9, 4)],
213
                    "xgb_clf__gamma": [0.01, 0.1],
214
                    "xgb_clf__learning_rate": [0.001, 0.01, 0.1, 1]
215
                }
216
                
217
                xgb_clf = GridSearchCV(pipeline, param_grid = xgb_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
218
                xgb_clf = xgb_clf.best_estimator_
219
            else: 
220
                xgb_clf = pipeline.fit(self.X_train, self.y_train)
221
                
222
            if self.use_igenes or self.use_visualization:
223
                xgb_importances = shap.TreeExplainer(xgb_clf.named_steps['xgb_clf']).shap_values(self.X_test)
224
            
225
            if self.use_igenes:
226
                xgb_importances_extracted = np.mean(xgb_importances, axis = 0)
227
                xgb_importances_normalized = (np.abs(xgb_importances_extracted) / np.sum(np.abs(xgb_importances_extracted))) * 100
228
                xgb_hhi = np.sum(np.square(xgb_importances_normalized))
229
                
230
            if self.use_visualization:
231
                shap.summary_plot(xgb_importances, self.X_test, plot_type = "dot", show = False)
232
                
233
                plt.title("XGBoost Feature Importances", fontsize = 16)
234
                plt.xlabel("SHAP Value", fontsize = 14)
235
                plt.ylabel("Feature", fontsize = 14)
236
                plt.tight_layout()
237
                
238
                file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_XGB-SHAP.png"
239
                xgb_plot = os.path.join(self.output_dir, file_name)
240
                
241
                if not os.path.exists(self.output_dir):
242
                    os.makedirs(self.output_dir)
243
                
244
                plt.savefig(xgb_plot)
245
                plt.close()
246
                
247
            y_pred = xgb_clf.predict(self.X_test)
248
            y_proba = xgb_clf.predict_proba(self.X_test)[:, 1]
249
            
250
            xgb_accuracy = accuracy_score(self.y_test, y_pred)
251
            xgb_roc_auc = roc_auc_score(self.y_test, y_proba)
252
            xgb_f1 = f1_score(self.y_test, y_pred, average = 'weighted')
253
            
254
            if self.use_igenes:
255
                return xgb_clf, xgb_accuracy, xgb_roc_auc, xgb_f1, xgb_importances_extracted / np.max(np.abs(xgb_importances_extracted)), xgb_hhi, xgb_importances
256
            else: 
257
                return xgb_clf, xgb_accuracy, xgb_roc_auc, xgb_f1
258
            
259
        return None
260
    
261
    # k-Nearest Neighbors Classifier
262
    def knn_classifier(self: 'DiseasePrediction'):
263
        if self.use_knn: 
264
            print('k-Nearest Neighbors...')
265
            if self.use_normalization:
266
                pipeline = Pipeline([('scaling', StandardScaler()), ('knn_clf', sklearn.neighbors.KNeighborsClassifier())])
267
            else:
268
                pipeline = Pipeline([('knn_clf', sklearn.neighbors.KNeighborsClassifier())])
269
                
270
            if self.use_tuning:
271
                knn_parameters = {
272
                    "knn_clf__leaf_size": list(range(1, 50)),
273
                    "knn_clf__n_neighbors": list(range(1, min(len(self.X_train) - 1, 30) + 1)),
274
                    "knn_clf__p": [1, 2]
275
                }
276
                
277
                knn_clf = GridSearchCV(pipeline, param_grid = knn_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
278
                knn_clf = knn_clf.best_estimator_
279
            else:
280
                knn_clf = pipeline.fit(self.X_train, self.y_train)
281
                
282
            if self.use_igenes or self.use_visualization:
283
                knn_importances = shap.KernelExplainer(knn_clf.named_steps['knn_clf'].predict, shap.sample(self.X_train, 1000)).shap_values(self.X_test)
284
            
285
            if self.use_igenes:
286
                knn_importances_extracted = np.mean(knn_importances, axis = 0)
287
                knn_importances_normalized = (np.abs(knn_importances_extracted) / np.sum(np.abs(knn_importances_extracted))) * 100
288
                knn_hhi = np.sum(np.square(knn_importances_normalized))
289
            
290
            if self.use_visualization:
291
                shap.summary_plot(knn_importances, self.X_test, plot_type = "dot", show = False)
292
                
293
                plt.title("k-Nearest Neighbors Feature Importances", fontsize = 16)
294
                plt.xlabel("SHAP Value", fontsize = 14)
295
                plt.ylabel("Feature", fontsize = 14)
296
                plt.tight_layout()
297
                
298
                file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_kNN-SHAP.png"
299
                knn_plot = os.path.join(self.output_dir, file_name)
300
                
301
                if not os.path.exists(self.output_dir):
302
                    os.makedirs(self.output_dir)
303
                    
304
                plt.savefig(knn_plot)
305
                plt.close()
306
            
307
            y_pred = knn_clf.predict(self.X_test)
308
            y_proba = knn_clf.predict_proba(self.X_test)[:, 1]
309
            
310
            knn_accuracy = accuracy_score(self.y_test, y_pred)
311
            knn_roc_auc = roc_auc_score(self.y_test, y_proba)
312
            knn_f1 = f1_score(self.y_test, y_pred, average = 'weighted')
313
            
314
            if self.use_igenes:
315
                return knn_clf, knn_accuracy, knn_roc_auc, knn_f1, knn_importances_extracted / np.max(np.abs(knn_importances_extracted)), knn_hhi, knn_importances
316
            else: 
317
                return knn_clf, knn_accuracy, knn_roc_auc, knn_f1
318
            
319
        return None
320
    
321
    # Multi-Layer Perceptron Classifier
322
    def mlp_classifier(self: 'DiseasePrediction'):
323
        if self.use_mlp:
324
            print('Multi-Layer Perceptron...')
325
            if self.use_normalization:
326
                pipeline = Pipeline([('scaling', StandardScaler()), ('mlp_clf', MLPClassifier(random_state = self.random_state, max_iter = 3000))])
327
            else:
328
                pipeline = Pipeline([('mlp_clf', MLPClassifier(random_state = self.random_state, max_iter = 2000))])
329
            
330
            if self.use_tuning:
331
                mlp_parameters = {
332
                    'mlp_clf__hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,), (100, 100,), (100, 100, 100)],
333
                    'mlp_clf__activation': ['tanh', 'relu'],
334
                    'mlp_clf__solver': ['sgd', 'adam'],
335
                    'mlp_clf__alpha': [0.0001, 0.001, 0.01, 0.05, 0.1],
336
                    'mlp_clf__learning_rate': ['constant', 'adaptive'],
337
                    'mlp_clf__learning_rate_init': [0.001, 0.01, 0.1],
338
                    'mlp_clf__momentum': [0.1, 0.2, 0.5, 0.9],
339
                }
340
                
341
                mlp_clf = GridSearchCV(pipeline, param_grid = mlp_parameters, cv = self.kfold, scoring = 'accuracy', n_jobs = -1, verbose = 0).fit(self.X_train, self.y_train)
342
                mlp_clf = mlp_clf.best_estimator_
343
            else:
344
                mlp_clf = pipeline.fit(self.X_train, self.y_train)
345
                
346
            if self.use_igenes or self.use_visualization:
347
                mlp_importances =  shap.KernelExplainer(mlp_clf.named_steps['mlp_clf'].predict, shap.sample(self.X_train, 1000)).shap_values(self.X_test)
348
                
349
            if self.use_igenes:
350
                mlp_importances_extracted = np.mean(mlp_importances, axis = 0)
351
                mlp_importances_normalized = (np.abs(mlp_importances_extracted) / np.sum(np.abs(mlp_importances_extracted))) * 100
352
                mlp_hhi = np.sum(np.square(mlp_importances_normalized))
353
                
354
            if self.use_visualization:
355
                shap.summary_plot(mlp_importances, self.X_test, plot_type = "dot", show = False)
356
                
357
                plt.title("Multi-Layer Perceptron Feature Importances", fontsize = 16)
358
                plt.xlabel("SHAP Value", fontsize = 14)
359
                plt.ylabel("Feature", fontsize = 14)
360
                plt.tight_layout()
361
                
362
                file_name = f"{self.cgit_file}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_MLP-SHAP.png"
363
                mlp_plot = os.path.join(self.output_dir, file_name)
364
                
365
                if not os.path.exists(self.output_dir):
366
                    os.makedirs(self.output_dir)
367
                    
368
                plt.savefig(mlp_plot)
369
                plt.close()
370
                
371
            y_pred = mlp_clf.predict(self.X_test)
372
            y_proba = mlp_clf.predict_proba(self.X_test)[:, 1]
373
            
374
            mlp_accuracy = accuracy_score(self.y_test, y_pred)
375
            mlp_roc_auc = roc_auc_score(self.y_test, y_proba)
376
            mlp_f1 = f1_score(self.y_test, y_pred, average = 'weighted')           
377
            
378
            if self.use_igenes:
379
                return mlp_clf, mlp_accuracy, mlp_roc_auc, mlp_f1, mlp_importances_extracted / np.max(np.abs(mlp_importances_extracted)), mlp_hhi, mlp_importances
380
            else:
381
                return mlp_clf, mlp_accuracy, mlp_roc_auc, mlp_f1
382
            
383
        return None
384
    
385
    # Voting Classifier
386
    def voting_classifier(self: 'DiseasePrediction'):
387
        if self.classifiers:
388
            print('Voting Classifier...')
389
            voting_clf = VotingClassifier(estimators = [(clf[0], clf[1]["Classifier"]) for clf in self.classifiers], voting = self.voting).fit(self.X_train, self.y_train)
390
            
391
            y_pred = voting_clf.predict(self.X_test)
392
            y_proba = voting_clf.predict_proba(self.X_test)[:, 1]
393
            
394
            voting_accuracy = accuracy_score(self.y_test, y_pred) 
395
            voting_roc_auc = roc_auc_score(self.y_test, y_proba) 
396
            voting_f1 = f1_score(self.y_test, y_pred, average = 'weighted')
397
            
398
            return voting_clf, voting_accuracy, voting_roc_auc, voting_f1
399
        
400
        return None
401
    
402
    # Execute Classifiers
403
    def execute_classifiers(self: 'DiseasePrediction'):
404
        classifiers = [
405
            ("Random Forest", self.rf_classifier),
406
            ("Support Vector Machine", self.svm_classifier),
407
            ("XGBoost", self.xgb_classifier),
408
            ("k-Nearest Neighbors", self.knn_classifier),
409
            ("Multi-Layer Perceptron", self.mlp_classifier)
410
        ]
411
412
        self.classifiers = []
413
        for name, classifier in classifiers:
414
            metrics_tuple = classifier()
415
            if metrics_tuple is not None:
416
                clf, accuracy, roc_auc, f1, *rest = metrics_tuple
417
418
                classifier_info = {
419
                    "Classifier": clf,
420
                    "Accuracy": accuracy,
421
                    "ROC-AUC": roc_auc,
422
                    "F1": f1
423
                }
424
425
                if len(rest) == 2:
426
                    classifier_info["Importances"] = rest[0]
427
                    classifier_info["HHI"] = rest[1]
428
                elif len(rest) == 3:
429
                    classifier_info["Importances"] = rest[0]
430
                    classifier_info["HHI"] = rest[1]
431
                    classifier_info["SHAP Values"] = rest[2]
432
433
                self.classifiers.append((name, classifier_info))
434
                
435
        voting_metrics_tuple = self.voting_classifier()
436
        if voting_metrics_tuple:
437
            voting_clf, voting_accuracy, voting_roc_auc, voting_f1 = voting_metrics_tuple
438
            self.classifiers.append(("Voting Classifier", {
439
                "Classifier": voting_clf,
440
                "Accuracy": voting_accuracy,
441
                "ROC-AUC": voting_roc_auc,
442
                "F1": voting_f1
443
            }))
444
            
445
    # Generate Classifier Metrics
446
    def classifier_metrics(self: 'DiseasePrediction'):   
447
        if hasattr(self, 'classifiers') and self.classifiers:
448
            metrics_df = pd.DataFrame({
449
                'Classifier': [clf[0] for clf in self.classifiers],
450
                'Accuracy': [clf[1]['Accuracy'] for clf in self.classifiers],
451
                'ROC-AUC': [clf[1]['ROC-AUC'] for clf in self.classifiers],
452
                'F1': [clf[1]['F1'] for clf in self.classifiers]
453
            })
454
        
455
            return metrics_df
456
    
457
    # Generate I-Genes Profile
458
    def igenes_scores(self: 'DiseasePrediction'):
459
        if self.use_igenes:
460
            print('Generating I-Genes Scores...')
461
462
            if self.classifiers:
463
                normalized_importances = []
464
                herfindahl_hirschman_indices = []
465
                SHAP_values = []
466
467
                for name, metrics in self.classifiers:
468
                    if name != "Voting Classifier":
469
                        if "Importances" in metrics:
470
                            normalized_importances.append(metrics["Importances"])
471
                            herfindahl_hirschman_indices.append(metrics["HHI"])
472
                        if "SHAP Values" in metrics:
473
                            SHAP_values.append(metrics["SHAP Values"])
474
475
                if not normalized_importances or not herfindahl_hirschman_indices or not SHAP_values:
476
                    return
477
478
                num_classifiers = len(self.classifiers)
479
                hhi_weights = (np.array(herfindahl_hirschman_indices) / np.sum(herfindahl_hirschman_indices))
480
                initial_weights = (1 / (num_classifiers))
481
                final_weights = (initial_weights + (initial_weights * hhi_weights))
482
                                
483
                igenes_scores = np.zeros_like(normalized_importances[0])
484
                for weight, importance in zip(final_weights, normalized_importances):
485
                    igenes_scores += weight * np.abs(importance)
486
                
487
                case_control_predictions = ['Cases' if importances > 0 else 'Controls' if importances < 0 else 'Cases' for importances in np.sum(normalized_importances, axis = 0)]
488
                
489
                igenes_df = pd.DataFrame({
490
                    'Feature': self.X_train.columns,
491
                    'I-Genes Score': np.round(igenes_scores, 4),
492
                    'Prediction': case_control_predictions,  
493
                })
494
                
495
                feature_expression = []
496
                for i, prediction in enumerate(case_control_predictions):
497
                    if prediction in ['Cases', 'Controls']:
498
                        SHAP_normalized = [SHAP / np.max(np.abs(SHAP)) for SHAP in SHAP_values]
499
                        aggregated_shap = np.sum([SHAP[:, i] for SHAP in SHAP_normalized], axis = 0)
500
                    
501
                        relevant_shap_samples = aggregated_shap > 0 if prediction == 'Positive' else aggregated_shap < 0
502
                        if len(relevant_shap_samples) != len(self.X_test):
503
                            raise ValueError("Length of SHAP samples does not match the number of rows in X_train.")
504
505
                        feature_average = self.X_test.iloc[relevant_shap_samples, i].mean()
506
                        overall_average = self.X_test.iloc[:, i].mean()
507
508
                        if feature_average > overall_average:
509
                            feature_expression.append('Overexpression')
510
                        elif feature_average < overall_average:
511
                            feature_expression.append('Underexpression')
512
                        else:
513
                            feature_expression.append('Inconclusive')
514
                    else:
515
                        feature_expression.append('Not Applicable')
516
517
                igenes_df['Expression'] = feature_expression
518
                igenes_df['I-Genes Rankings'] = igenes_df['I-Genes Score'].rank(ascending = False).astype(int)
519
                igenes_df = igenes_df.sort_values(by = 'I-Genes Rankings')
520
521
                return igenes_df
522
    
523
# Execute DiseasePrediction Class
524
def main():
525
    print("IntelliGenes Disease Prediction and I-Genes Scores...")
526
    
527
    parser = argparse.ArgumentParser()
528
    parser.add_argument('-i', '--cgit_file', required = True)
529
    parser.add_argument('-f', '--features_file', required = True)
530
    parser.add_argument('-o', '--output_dir', required = True)
531
    parser.add_argument('--voting', type = str, default = 'soft')
532
    parser.add_argument('--random_state', type = int, default = 42)
533
    parser.add_argument('--test_size', type = float, default = 0.3)
534
    parser.add_argument('--n_splits', type = int, default = 5)
535
    parser.add_argument('--no_rf', action = 'store_true')
536
    parser.add_argument('--no_svm', action = 'store_true')
537
    parser.add_argument('--no_xgb', action = 'store_true')
538
    parser.add_argument('--no_knn', action = 'store_true')
539
    parser.add_argument('--no_mlp', action = 'store_true')
540
    parser.add_argument('--tune', action = 'store_true')
541
    parser.add_argument('--normalize', action = 'store_true')
542
    parser.add_argument('--no_igenes', action = 'store_true')
543
    parser.add_argument('--no_visualizations', action = 'store_true')
544
545
    args = parser.parse_args()
546
547
    pipeline = DiseasePrediction(cgit_file = args.cgit_file,
548
                                features_file = args.features_file,
549
                                output_dir = args.output_dir,
550
                                voting = args.voting,
551
                                random_state = args.random_state,
552
                                test_size = args.test_size,
553
                                n_splits = args.n_splits,
554
                                use_rf = not args.no_rf,
555
                                use_svm = not args.no_svm,
556
                                use_xgb = not args.no_xgb,
557
                                use_knn = not args.no_knn,
558
                                use_mlp = not args.no_mlp,
559
                                use_tuning = args.tune,
560
                                use_normalization = args.normalize,
561
                                use_igenes = not args.no_igenes,
562
                                use_visualization = not args.no_visualizations)
563
    
564
    pipeline.execute_classifiers()
565
    metrics_df = pipeline.classifier_metrics()
566
    igenes_df = pipeline.igenes_scores()
567
    
568
    if not os.path.exists(args.output_dir):
569
        os.makedirs(args.output_dir)
570
    
571
    file_name = Path(args.cgit_file).stem
572
    metrics_name = f"{file_name}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_Classifier-Metrics.csv"
573
    metrics_file = os.path.join(args.output_dir, metrics_name)
574
    
575
    metrics_df.to_csv(metrics_file, index = False)
576
    print("\n Clasifier Metrics:", metrics_file, "\n")
577
    
578
    if args.no_igenes is False: 
579
        igenes_name = f"{file_name}_{datetime.now().strftime('%m-%d-%Y-%I-%M-%S-%p')}_I-Genes-Score.csv"
580
        igenes_file = os.path.join(args.output_dir, igenes_name)
581
        
582
        igenes_df.to_csv(igenes_file, index = False)
583
        print("\n I-Genes Scores:", igenes_file, "\n")
584
                       
585
if __name__ == '__main__':
586
    main()