Diff of /analysis/utils.py [000000] .. [23d48c]

Switch to side-by-side view

--- a
+++ b/analysis/utils.py
@@ -0,0 +1,94 @@
+import numpy as np
+from read_file import read_file
+from eli5.sklearn import PermutationImportance
+import matplotlib.pyplot as plt
+from sklearn.metrics import roc_curve, auc
+import pdb
+
+def feature_importance(save_file, clf, feature_names, training_features, training_classes, 
+                       random_state, clf_name, clf_params, coefs=None, perm=True):
+    """ prints feature importance information for a trained estimator (clf)"""
+    if coefs is None:
+        coefs = compute_imp_score(clf, clf_name, training_features, training_classes, 
+                                  random_state, perm)
+    
+    assert(len(coefs)==len(feature_names))
+
+    out_text=''
+    # algorithm seed    feature score
+    for i,c in enumerate(coefs):
+        out_text += '\t'.join([clf_name,
+                               clf_params,
+                               str(random_state),
+                               feature_names[i],
+                               str(c)])+'\n'
+       
+    ext = '.perm_score' if perm else '.imp_score'
+    with open(save_file.split('.csv')[0] + ext,'a') as out:
+        out.write(out_text)
+
+def compute_imp_score(pipe, clf_name, training_features, training_classes, random_state, 
+                      perm):
+    # clf = pipe.named_steps[clf_name]  
+    clf = pipe
+    # pdb.set_trace()
+    if hasattr(clf, 'coef_') :
+        coefs = np.abs(clf.coef_.flatten())
+        coefs = coefs/np.sum(coefs)
+    elif clf_name == 'ScaleLR':
+        coefs = np.abs(clf.named_steps['lr'].coef_.flatten())
+        coefs = coefs/np.sum(coefs)
+    else:
+        coefs = getattr(clf, 'feature_importances_', None)
+    # print('coefs:',coefs)
+   
+    if coefs is None or perm:
+        perm = PermutationImportance(
+                                    estimator=clf,
+                                    n_iter=5,
+                                    random_state=random_state,
+                                    refit=False
+                                    )
+        perm.fit(training_features, training_classes)
+        coefs = perm.feature_importances_
+
+    
+    #return (coefs-np.min(coefs))/(np.max(coefs)-np.min(coefs))
+    # return coefs/np.sum(coefs)
+    return coefs
+
+# def plot_imp_score(save_file, coefs, feature_names, seed):
+#     # plot bar charts for top 10 important features
+#     num_bar = min(10, len(coefs))
+#     indices = np.argsort(coefs)[-num_bar:]
+#     h=plt.figure()
+#     plt.title("Feature importances")
+#     plt.barh(range(num_bar), coefs[indices], color="r", align="center")
+#     plt.yticks(range(num_bar), feature_names[indices])
+#     plt.ylim([-1, num_bar])
+#     h.tight_layout()
+#     plt.savefig(save_file.split('.')[0] + '_imp_score_' + str(seed) + '.pdf')
+
+######################################################################################### ROC Curve
+
+def roc(save_file, y_true, probabilities, random_state, clf_name, clf_params):
+    """prints receiver operator chacteristic curve data"""
+
+    # pdb.set_trace()
+    fpr,tpr,_ = roc_curve(y_true, probabilities)
+
+    AUC = auc(fpr,tpr)
+    # print results
+    out_text=''
+    for f,t in zip(fpr,tpr):
+        out_text += '\t'.join([clf_name,
+                               clf_params,
+                               str(random_state),
+                               str(f),
+                               str(t),
+                               str(AUC)])+'\n'
+
+    with open(save_file.split('.csv')[0] + '.roc','a') as out:
+        out.write(out_text)
+
+