# analysis/utils.py
import numpy as np
from read_file import read_file
from eli5.sklearn import PermutationImportance
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import pdb


def feature_importance(save_file, clf, feature_names, training_features, training_classes,
                       random_state, clf_name, clf_params, coefs=None, perm=True):
    """Prints feature importance information for a trained estimator (clf)."""
    if coefs is None:
        coefs = compute_imp_score(clf, clf_name, training_features, training_classes,
                                  random_state, perm)

    assert len(coefs) == len(feature_names)

    out_text = ''
    # one tab-separated row per feature: algorithm, parameters, seed, feature, score
    for i, c in enumerate(coefs):
        out_text += '\t'.join([clf_name,
                               clf_params,
                               str(random_state),
                               feature_names[i],
                               str(c)]) + '\n'

    # permutation scores and coefficient/impurity scores go to separate files
    ext = '.perm_score' if perm else '.imp_score'
    with open(save_file.split('.csv')[0] + ext, 'a') as out:
        out.write(out_text)

def compute_imp_score(pipe, clf_name, training_features, training_classes, random_state,
                      perm):
    """Returns one importance score per training feature, using the estimator's own
    coefficients/importances when available and permutation importance otherwise
    (or whenever perm is True)."""
    # clf = pipe.named_steps[clf_name]
    clf = pipe
    if hasattr(clf, 'coef_'):
        # linear models: normalized absolute coefficients
        coefs = np.abs(clf.coef_.flatten())
        coefs = coefs / np.sum(coefs)
    elif clf_name == 'ScaleLR':
        # scaler + logistic regression pipeline: use the LR step's coefficients
        coefs = np.abs(clf.named_steps['lr'].coef_.flatten())
        coefs = coefs / np.sum(coefs)
    else:
        # tree-based ensembles expose feature_importances_; otherwise None
        coefs = getattr(clf, 'feature_importances_', None)

    if coefs is None or perm:
        # fall back to (or explicitly request) eli5 permutation importance
        perm_est = PermutationImportance(estimator=clf,
                                         n_iter=5,
                                         random_state=random_state,
                                         refit=False)
        perm_est.fit(training_features, training_classes)
        coefs = perm_est.feature_importances_

    # return (coefs-np.min(coefs))/(np.max(coefs)-np.min(coefs))
    # return coefs/np.sum(coefs)
    return coefs

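# Usage sketch (illustrative, not part of the original pipeline): for a fitted
# tree-based estimator, perm=False returns the estimator's feature_importances_
# as-is, while perm=True always takes the eli5 permutation-importance path.
# The names `rf`, `X`, and `y` below are hypothetical placeholders.
#
#     rf = RandomForestClassifier(random_state=42).fit(X, y)
#     scores = compute_imp_score(rf, 'RandomForestClassifier', X, y,
#                                random_state=42, perm=True)
#     assert len(scores) == X.shape[1]
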
# def plot_imp_score(save_file, coefs, feature_names, seed):
#     # plot a bar chart of the 10 most important features
#     num_bar = min(10, len(coefs))
#     indices = np.argsort(coefs)[-num_bar:]
#     h = plt.figure()
#     plt.title("Feature importances")
#     plt.barh(range(num_bar), coefs[indices], color="r", align="center")
#     plt.yticks(range(num_bar), feature_names[indices])
#     plt.ylim([-1, num_bar])
#     h.tight_layout()
#     plt.savefig(save_file.split('.')[0] + '_imp_score_' + str(seed) + '.pdf')

######################################################################################### ROC Curve

def roc(save_file, y_true, probabilities, random_state, clf_name, clf_params):
    """Prints receiver operating characteristic (ROC) curve data."""
    fpr, tpr, _ = roc_curve(y_true, probabilities)
    AUC = auc(fpr, tpr)

    # one tab-separated row per ROC point: algorithm, parameters, seed, fpr, tpr, auc
    out_text = ''
    for f, t in zip(fpr, tpr):
        out_text += '\t'.join([clf_name,
                               clf_params,
                               str(random_state),
                               str(f),
                               str(t),
                               str(AUC)]) + '\n'

    with open(save_file.split('.csv')[0] + '.roc', 'a') as out:
        out.write(out_text)
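
######################################################################################### Example usage
# Minimal usage sketch, not part of the original experiment pipeline: it fits a
# hypothetical RandomForestClassifier on synthetic data and writes permutation
# importance scores ('results.perm_score') and ROC points ('results.roc') next to
# a hypothetical 'results.csv'. All names and parameters here are illustrative.
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    # small synthetic binary classification problem
    X, y = make_classification(n_samples=200, n_features=5, n_informative=3,
                               random_state=42)
    names = ['x{}'.format(i) for i in range(X.shape[1])]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # permutation importance scores on the training data
    feature_importance('results.csv', clf, names, X_train, y_train,
                       random_state=42, clf_name='RandomForestClassifier',
                       clf_params='n_estimators=100', perm=True)

    # ROC curve points for held-out predictions
    roc('results.csv', y_test, clf.predict_proba(X_test)[:, 1],
        random_state=42, clf_name='RandomForestClassifier',
        clf_params='n_estimators=100')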