|
a |
|
b/utils/analysis_utils.py |
|
|
1 |
def impute_with_median(df):
    """Fill missing values in each column of a DataFrame with the column median.

    The DataFrame is modified in place and the same object is returned, so
    callers may use either the return value or the original reference.

    Columns that contain no missing values are left untouched. Columns for
    which a median cannot be computed (for example, text columns) are
    skipped, leaving their missing values as they are.

    Args:
        df: Pandas DataFrame to impute.

    Returns:
        The same DataFrame object, with missing values replaced where a
        median was available.
    """
    # Snapshot the column labels first, since columns are reassigned below.
    for column in list(df):
        col_data = df[column]

        # Vectorised check instead of a Python-level sum over every element.
        if not col_data.isna().any():
            continue

        try:
            col_median = col_data.median()
        except (TypeError, ValueError):
            # Non-numeric data has no median; leave the column unchanged
            # rather than aborting the whole imputation.
            continue

        # Assign the filled copy back instead of mutating an extracted
        # Series in place, which is fragile under pandas copy semantics.
        df[column] = col_data.fillna(col_median)

    return df
|
|
19 |
|
|
|
20 |
|
|
|
21 |
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve with pyplot.

    Plots ``tpr`` against ``fpr`` as a line of width 2, adds a dashed black
    diagonal from (0, 0) to (1, 1), fixes both axes to the range [0, 1],
    labels the axes and turns the grid on. Nothing is returned and the
    figure is neither shown nor saved here.

    NOTE(review): relies on a module-level ``plt`` name (presumably
    ``matplotlib.pyplot``); no import is visible in this excerpt -- confirm.

    Args:
        fpr: False Positive Rate values, passed to ``plt.plot`` as x.
        tpr: True Positive Rate values, passed to ``plt.plot`` as y.
        label: Optional label forwarded to ``plt.plot`` for the curve.
    """
    unit_interval = [0, 1]

    # The curve itself, followed by the diagonal reference line.
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot(unit_interval, unit_interval, 'k--')

    # Limits are [xmin, xmax, ymin, ymax].
    plt.axis(unit_interval + unit_interval)

    axis_titles = (
        (plt.xlabel, 'False Positive Rate'),
        (plt.ylabel, 'True Positive Rate'),
    )
    for set_title, text in axis_titles:
        set_title(text, fontsize=16)

    plt.grid(True)