analysis/utils.py

import numpy as np
from read_file import read_file
from eli5.sklearn import PermutationImportance
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import pdb


def feature_importance(save_file, clf, feature_names, training_features,
                       training_classes, random_state, clf_name, clf_params,
                       coefs=None, perm=True):
    """Prints feature importance information for a trained estimator (clf)."""
    if coefs is None:
        coefs = compute_imp_score(clf, clf_name, training_features,
                                  training_classes, random_state, perm)
    assert len(coefs) == len(feature_names)

    out_text = ''
    # one row per feature: algorithm, parameters, seed, feature, score
    for i, c in enumerate(coefs):
        out_text += '\t'.join([clf_name,
                               clf_params,
                               str(random_state),
                               feature_names[i],
                               str(c)]) + '\n'

    ext = '.perm_score' if perm else '.imp_score'
    with open(save_file.split('.csv')[0] + ext, 'a') as out:
        out.write(out_text)
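
# Illustrative usage sketch (not from the original pipeline): the file name,
# feature names, and the fitted scikit-learn estimator `lr`, `X_train`, and
# `y_train` below are assumed. A call such as
#
#   feature_importance('results/run0.csv', lr, ['age', 'bmi', 'glucose'],
#                      X_train, y_train, random_state=42,
#                      clf_name='LogisticRegression', clf_params='C=1.0',
#                      perm=True)
#
# appends one tab-separated row per feature to results/run0.perm_score:
#   clf_name  clf_params  seed  feature_name  score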


def compute_imp_score(pipe, clf_name, training_features, training_classes,
                      random_state, perm):
    """Returns feature importance scores for a trained estimator.

    Uses normalized |coef_| for linear models (or the 'lr' step of the
    ScaleLR pipeline), or feature_importances_ when available; if neither
    exists, or if perm is True, computes eli5 permutation importance on the
    already-fitted estimator.
    """
    # clf = pipe.named_steps[clf_name]
    clf = pipe
    # pdb.set_trace()
    if hasattr(clf, 'coef_'):
        coefs = np.abs(clf.coef_.flatten())
        coefs = coefs / np.sum(coefs)
    elif clf_name == 'ScaleLR':
        coefs = np.abs(clf.named_steps['lr'].coef_.flatten())
        coefs = coefs / np.sum(coefs)
    else:
        coefs = getattr(clf, 'feature_importances_', None)
    # print('coefs:', coefs)

    if coefs is None or perm:
        # permutation importance on the already-fitted estimator
        perm_est = PermutationImportance(
            estimator=clf,
            n_iter=5,
            random_state=random_state,
            refit=False
        )
        perm_est.fit(training_features, training_classes)
        coefs = perm_est.feature_importances_

    # return (coefs - np.min(coefs)) / (np.max(coefs) - np.min(coefs))
    # return coefs / np.sum(coefs)
    return coefs
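
# Illustrative sketch (assumed names): for an estimator with neither coef_
# nor feature_importances_, e.g. a fitted KNeighborsClassifier `knn`,
# compute_imp_score falls back to eli5's PermutationImportance:
#
#   scores = compute_imp_score(knn, 'KNeighborsClassifier',
#                              X_train, y_train, random_state=42, perm=True)
#
# Note that permutation scores are returned as-is, whereas the coef_-based
# branches normalize the scores to sum to 1.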


# def plot_imp_score(save_file, coefs, feature_names, seed):
#     # plot bar charts for the top 10 most important features
#     num_bar = min(10, len(coefs))
#     indices = np.argsort(coefs)[-num_bar:]
#     h = plt.figure()
#     plt.title("Feature importances")
#     plt.barh(range(num_bar), coefs[indices], color="r", align="center")
#     plt.yticks(range(num_bar), feature_names[indices])
#     plt.ylim([-1, num_bar])
#     h.tight_layout()
#     plt.savefig(save_file.split('.')[0] + '_imp_score_' + str(seed) + '.pdf')


##############################################################################
# ROC Curve
def roc(save_file, y_true, probabilities, random_state, clf_name, clf_params):
    """Prints receiver operating characteristic curve data."""
    # pdb.set_trace()
    fpr, tpr, _ = roc_curve(y_true, probabilities)
    AUC = auc(fpr, tpr)

    # print results
    out_text = ''
    for f, t in zip(fpr, tpr):
        out_text += '\t'.join([clf_name,
                               clf_params,
                               str(random_state),
                               str(f),
                               str(t),
                               str(AUC)]) + '\n'

    with open(save_file.split('.csv')[0] + '.roc', 'a') as out:
        out.write(out_text)
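

# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (not part of the original analysis): the synthetic
# data, file name, and parameter strings below are illustrative, assuming
# scikit-learn, eli5, and the project-local read_file module are importable.
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    lr = LogisticRegression().fit(X_train, y_train)
    feature_names = ['f{}'.format(i) for i in range(X.shape[1])]

    # writes example_results.perm_score (one tab-separated row per feature)
    feature_importance('example_results.csv', lr, feature_names,
                       X_train, y_train, random_state=0,
                       clf_name='LogisticRegression', clf_params='default',
                       perm=True)

    # writes example_results.roc (fpr, tpr, and AUC for each ROC point)
    roc('example_results.csv', y_test, lr.predict_proba(X_test)[:, 1],
        random_state=0, clf_name='LogisticRegression', clf_params='default')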