|
a |
|
b/analysis/utils.py |
|
|
1 |
import numpy as np |
|
|
2 |
from read_file import read_file |
|
|
3 |
from eli5.sklearn import PermutationImportance |
|
|
4 |
import matplotlib.pyplot as plt |
|
|
5 |
from sklearn.metrics import roc_curve, auc |
|
|
6 |
import pdb |
|
|
7 |
|
|
|
8 |
def feature_importance(save_file, clf, feature_names, training_features, training_classes,
                       random_state, clf_name, clf_params, coefs=None, perm=True):
    """Append per-feature importance scores for a trained estimator to a TSV file.

    Each output row is: algorithm name, parameter string, seed, feature name,
    score. Rows are appended to ``save_file`` with its ``.csv`` suffix replaced
    by ``.perm_score`` (perm=True) or ``.imp_score`` (perm=False).

    Parameters
    ----------
    save_file : str
        Base results path (typically ending in ``.csv``).
    clf : estimator
        Trained classifier (or pipeline) to score.
    feature_names : sequence of str
        One name per column of ``training_features``.
    training_features, training_classes : array-like
        Data used when importance scores must be computed from scratch.
    random_state : int
        Seed recorded in each row and passed to the permutation scorer.
    clf_name, clf_params : str
        Identifiers recorded in each output row.
    coefs : array-like, optional
        Precomputed importance scores; computed via ``compute_imp_score``
        when None.
    perm : bool
        Whether permutation importance is used (also selects the extension).

    Raises
    ------
    ValueError
        If the number of scores does not match the number of feature names.
    """
    if coefs is None:
        coefs = compute_imp_score(clf, clf_name, training_features, training_classes,
                                  random_state, perm)

    # A length mismatch would silently mislabel every score, so fail loudly.
    # ValueError instead of assert: asserts are stripped under `python -O`.
    if len(coefs) != len(feature_names):
        raise ValueError('got {} importance scores for {} features'.format(
            len(coefs), len(feature_names)))

    # Build all rows at once; ''.join avoids quadratic string accumulation.
    # An empty coefs sequence yields an empty string, as before.
    out_text = ''.join(
        '\t'.join([clf_name,
                   clf_params,
                   str(random_state),
                   feature_names[i],
                   str(c)]) + '\n'
        for i, c in enumerate(coefs))

    ext = '.perm_score' if perm else '.imp_score'
    with open(save_file.split('.csv')[0] + ext, 'a') as out:
        out.write(out_text)
|
|
29 |
|
|
|
30 |
def compute_imp_score(pipe, clf_name, training_features, training_classes, random_state,
                      perm):
    """Return a 1-D array of feature importance scores for a fitted estimator.

    Scores come from, in order of preference:
      1. ``coef_`` on the estimator (linear models): absolute values,
         normalized to sum to 1;
      2. for ``clf_name == 'ScaleLR'``, ``coef_`` of the pipeline's ``'lr'``
         step, handled the same way;
      3. ``feature_importances_`` (tree ensembles), used as-is.
    If none of these exist, or if ``perm`` is True, permutation importance is
    computed instead with eli5's ``PermutationImportance`` (5 shuffles, no
    refit).

    Parameters
    ----------
    pipe : fitted estimator or pipeline
    clf_name : str
        Estimator label; 'ScaleLR' marks a scaling pipeline whose 'lr' step
        holds the coefficients.
    training_features, training_classes : array-like
        Data for the permutation scorer (unused otherwise).
    random_state : int
        Seed for the permutation scorer.
    perm : bool
        Force permutation importance even when native scores exist.
    """
    clf = pipe
    if hasattr(clf, 'coef_'):
        # Linear model: weight magnitudes, normalized to a distribution.
        coefs = np.abs(clf.coef_.flatten())
        coefs = coefs / np.sum(coefs)
    elif clf_name == 'ScaleLR':
        # Scaling pipeline: the logistic-regression step holds the weights.
        coefs = np.abs(clf.named_steps['lr'].coef_.flatten())
        coefs = coefs / np.sum(coefs)
    else:
        coefs = getattr(clf, 'feature_importances_', None)

    if coefs is None or perm:
        # Distinct local name: the original shadowed the boolean `perm`
        # parameter with the PermutationImportance instance.
        perm_scorer = PermutationImportance(
            estimator=clf,
            n_iter=5,
            random_state=random_state,
            refit=False
        )
        perm_scorer.fit(training_features, training_classes)
        coefs = perm_scorer.feature_importances_

    return coefs
|
|
59 |
|
|
|
60 |
# def plot_imp_score(save_file, coefs, feature_names, seed): |
|
|
61 |
# # plot bar charts for top 10 important features |
|
|
62 |
# num_bar = min(10, len(coefs)) |
|
|
63 |
# indices = np.argsort(coefs)[-num_bar:] |
|
|
64 |
# h=plt.figure() |
|
|
65 |
# plt.title("Feature importances") |
|
|
66 |
# plt.barh(range(num_bar), coefs[indices], color="r", align="center") |
|
|
67 |
# plt.yticks(range(num_bar), feature_names[indices]) |
|
|
68 |
# plt.ylim([-1, num_bar]) |
|
|
69 |
# h.tight_layout() |
|
|
70 |
# plt.savefig(save_file.split('.')[0] + '_imp_score_' + str(seed) + '.pdf') |
|
|
71 |
|
|
|
72 |
######################################################################################### ROC Curve |
|
|
73 |
|
|
|
74 |
def roc(save_file, y_true, probabilities, random_state, clf_name, clf_params):
    """Append receiver operating characteristic (ROC) curve data to a TSV file.

    Each output row is: algorithm name, parameter string, seed, false positive
    rate, true positive rate, AUC (the AUC is repeated on every row of the
    curve). Rows are appended to ``save_file`` with its ``.csv`` suffix
    replaced by ``.roc``.

    Parameters
    ----------
    save_file : str
        Base results path (typically ending in ``.csv``).
    y_true : array-like
        Ground-truth binary labels.
    probabilities : array-like
        Predicted scores/probabilities for the positive class.
    random_state : int
        Seed recorded in each output row.
    clf_name, clf_params : str
        Identifiers recorded in each output row.
    """
    fpr, tpr, _ = roc_curve(y_true, probabilities)
    AUC = auc(fpr, tpr)

    # Build all rows at once; ''.join avoids quadratic string accumulation.
    out_text = ''.join(
        '\t'.join([clf_name,
                   clf_params,
                   str(random_state),
                   str(f),
                   str(t),
                   str(AUC)]) + '\n'
        for f, t in zip(fpr, tpr))

    with open(save_file.split('.csv')[0] + '.roc', 'a') as out:
        out.write(out_text)
|
|
93 |
|
|
|
94 |
|