#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright 2017 University of Westminster. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""It consists of a set of custom plots, using Matplotlib and Scikit libraries.
"""
from typing import Dict, List, Tuple, TypeVar, Any
from sklearn import metrics
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
import numpy as np
import itertools
PandasDataFrame = TypeVar('PandasDataFrame')
MatplotlibFigure = TypeVar('MatplotlibFigure')
__author__ = "Mohsen Mesgarpour"
__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
__credits__ = ["Mohsen Mesgarpour"]
__license__ = "GPL"
__version__ = "1.1"
__maintainer__ = "Mohsen Mesgarpour"
__email__ = "mohsen.mesgarpour@gmail.com"
__status__ = "Release"
class Plots:
@staticmethod
def confusion_matrix(predicted_scores: List,
feature_target: List,
                         model_labels: List=[0, 1],
normalize: bool=False,
title: str='Confusion Matrix',
cmap: str="Blues") -> [MatplotlibFigure, Dict]:
"""Plot the confusion matrix.
:param predicted_scores: the predicted Scores.
:param feature_target: the target feature, which is being estimated.
:param model_labals: the target labels (default [0, 1]).
:param normalize: to normalise the labels.
:param title: the figure title.
:param cmap: the plot color.
:return: the plot object, and the data used to plot.
"""
        summaries = dict()
        tick_marks = np.arange(len(model_labels))
        # Compute the confusion matrix, normalising it first if requested, so that
        # the plotted image and the cell annotations stay consistent.
        summaries["cnf_matrix"] = confusion_matrix(feature_target, predicted_scores)
        if normalize:
            summaries["cnf_matrix"] = \
                summaries["cnf_matrix"].astype('float') / summaries["cnf_matrix"].sum(axis=1)[:, np.newaxis]
        np.set_printoptions(precision=2)
        # plot metadata
        fig, ax = plt.subplots(nrows=1, ncols=1)
        ax.set_title(title)
        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(model_labels, rotation=45)
        ax.set_yticks(tick_marks)
        ax.set_yticklabels(model_labels)
        # plot matrix
        image = ax.imshow(summaries["cnf_matrix"], interpolation='nearest', cmap=cmap)
        fig.colorbar(image, ax=ax)
        thresh = summaries["cnf_matrix"].max() / 2.
        for i, j in itertools.product(range(summaries["cnf_matrix"].shape[0]),
                                      range(summaries["cnf_matrix"].shape[1])):
            ax.text(j, i, summaries["cnf_matrix"][i, j],
                    horizontalalignment="center",
                    color="white" if summaries["cnf_matrix"][i, j] > thresh else "black")
        fig.tight_layout()
return fig, summaries
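    # A minimal usage sketch (hypothetical data); note that class predictions,
    # not probabilities, are expected here:
    #   y_true = [0, 1, 1, 0, 1]
    #   y_pred = [0, 1, 0, 0, 1]
    #   fig, summaries = Plots.confusion_matrix(y_pred, y_true, normalize=True)
    #   fig.savefig("confusion_matrix.png")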
@staticmethod
def stepwise_model(summaries: Dict,
title: str="Step-Wise Train & Test",
lw: int=2) -> MatplotlibFigure:
"""Plot a performance summary plot for the step-wise training and testing.
        :param summaries: the summary statistics which will be used for plotting.
            It must contain 'Step', plus 'Train_Precision', 'Train_Recall', 'Train_ROC',
            'Test_Precision', 'Test_Recall', and 'Test_ROC' for each training and testing step.
:param title: the figure title.
:param lw: the line-width.
:return: the plot object.
"""
# plot metadata
        fig = plt.figure()
plt.title(title)
plt.ylim([0.0, 1.05])
plt.xlabel('Number of Features')
plt.ylabel('Summary Statistics')
plt.grid()
plt.plot(summaries["Step"], summaries["Train_Precision"], lw=lw, color='r', label='Train - Precision')
plt.plot(summaries["Step"], summaries["Train_Recall"], lw=lw, color='g', label='Train - Recall')
plt.plot(summaries["Step"], summaries["Train_ROC"], lw=lw, color='b', label='Train - ROC')
plt.plot(summaries["Step"], summaries["Test_Precision"], lw=lw, color='brown', label='Test - Precision')
plt.plot(summaries["Step"], summaries["Test_Recall"], lw=lw, color='orange', label='Test - Recall')
plt.plot(summaries["Step"], summaries["Test_ROC"], lw=lw, color='pink', label='Test - ROC')
plt.legend(loc="lower left")
return fig
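    # A minimal sketch of the expected 'summaries' argument (hypothetical values),
    # one list entry per step-wise training/testing step:
    #   summaries = {"Step": [1, 2, 3],
    #                "Train_Precision": [0.70, 0.80, 0.85], "Train_Recall": [0.60, 0.70, 0.75],
    #                "Train_ROC": [0.72, 0.80, 0.84], "Test_Precision": [0.65, 0.70, 0.72],
    #                "Test_Recall": [0.55, 0.60, 0.65], "Test_ROC": [0.68, 0.72, 0.75]}
    #   fig = Plots.stepwise_model(summaries)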
@staticmethod
def precision_recall(predicted_scores: List,
feature_target: List,
title: str="Precision-Recall Curve",
                         lw: int=2) -> Tuple[MatplotlibFigure, Dict]:
"""Plot the precision-recall curve.
"The precision-recall plot is a model-wide measure for evaluating binary classifiers
and closely related to the ROC plot."
        :param predicted_scores: the predicted scores.
:param feature_target: the target feature, which is being estimated.
:param title: the figure title.
:param lw: the line-width.
:return: the plot object, and the data used to plot.
"""
summaries = dict()
# calculate
summaries["precision"], summaries["recall"], _ = metrics.precision_recall_curve(
feature_target, predicted_scores)
# summaries
summaries["avg_precision"] = metrics.average_precision_score(feature_target, predicted_scores)
# plot metadata
        fig = plt.figure()
plt.title(title + ' Average Precision={0:0.2f}'.format(summaries["avg_precision"]))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid()
plt.plot(summaries["precision"], summaries["recall"],
lw=lw, color='navy', label='Precision-Recall curve')
plt.legend(loc="lower left")
return fig, summaries
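    # A minimal usage sketch (hypothetical scores); 'predicted_scores' should be
    # probability estimates for the positive class:
    #   y_true = [0, 0, 1, 1]
    #   y_scores = [0.1, 0.4, 0.35, 0.8]
    #   fig, summaries = Plots.precision_recall(y_scores, y_true)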
@staticmethod
def precision_recall_multiple(predicted_scores_list: List,
feature_target_list: List,
label_list: List,
marker_list: List,
linestyle_list: List,
color_list: List,
title: str="Precision-Recall Curve",
lw: int=2,
markersize: int=6,
markevery: int=10000,
legend_prop: int=2,
                                  legend_markerscale: int=2) -> Tuple[MatplotlibFigure, Dict]:
"""Plot the precision-recall curve.
"The precision-recall plot is a model-wide measure for evaluating binary classifiers
and closely related to the ROC plot."
        :param predicted_scores_list: the predicted scores (one or multiple).
:param feature_target_list: the target feature, which is being estimated (one or multiple).
:param label_list: the line label (one or multiple).
:param marker_list: the line marker (one or multiple).
:param linestyle_list: the line style (one or multiple).
:param color_list: the line color (one or multiple).
:param title: the figure title.
:param lw: the line-width.
:param markersize: the marker size.
        :param markevery: mark every N-th point.
        :param legend_prop: the legend font size (in points).
        :param legend_markerscale: the legend's marker scale.
:return: the plot object, and the data used to plot.
"""
# calculate summaries
summaries = [None] * len(predicted_scores_list)
for i in range(len(predicted_scores_list)):
summaries[i] = dict()
summaries[i]["precision"], summaries[i]["recall"], _ = metrics.precision_recall_curve(
feature_target_list[i], predicted_scores_list[i])
summaries[i]["avg_precision"] = metrics.average_precision_score(
feature_target_list[i], predicted_scores_list[i])
# plot metadata
        fig = plt.figure(figsize=(10, 10))
plt.title(title)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid()
for i in range(len(predicted_scores_list)):
plt.plot(summaries[i]["precision"], summaries[i]["recall"],
markersize=markersize,
marker=marker_list[i],
markevery=markevery,
linestyle=linestyle_list[i],
linewidth=lw,
color=color_list[i],
label='Avg. Precision (' + label_list[i] + ')={0:0.2f}'.format(summaries[i]["avg_precision"]))
plt.legend(loc="lower left", prop={'size': legend_prop}, markerscale=legend_markerscale)
return fig, summaries
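    # A minimal sketch for overlaying two models (hypothetical scores and names);
    # every style list must hold one entry per model:
    #   fig, summaries = Plots.precision_recall_multiple(
    #       predicted_scores_list=[scores_a, scores_b],
    #       feature_target_list=[y_true, y_true],
    #       label_list=["Model A", "Model B"],
    #       marker_list=["o", "s"],
    #       linestyle_list=["-", "--"],
    #       color_list=["navy", "darkorange"])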
@staticmethod
def roc(predicted_scores: List,
feature_target: List,
title: str="ROC Curve",
            lw: int=2) -> Tuple[MatplotlibFigure, Dict]:
"""Plot the Receiver Operating Characteristic (ROC)
:param predicted_scores: the predicted Scores.
:param feature_target: the target feature, which is being estimated.
:param title: the figure title.
:param lw: the line-width.
:return: the plot object, and the data used to plot.
"""
summaries = dict()
# calculate
summaries["fpr"], summaries["tpr"], _ = metrics.roc_curve(feature_target, predicted_scores)
# summaries
summaries["roc_auc"] = metrics.auc(summaries["fpr"], summaries["tpr"])
# plot metadata
        fig = plt.figure()
plt.title(title + ' AUC={0:0.2f}'.format(summaries["roc_auc"]))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid()
plt.plot(summaries["fpr"], summaries["tpr"], color='r',
lw=lw, label='ROC curve (area = %0.2f)' % summaries["roc_auc"])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.legend(loc="lower right")
return fig, summaries
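    # A minimal usage sketch (hypothetical scores):
    #   y_true = [0, 0, 1, 1]
    #   y_scores = [0.1, 0.4, 0.35, 0.8]
    #   fig, summaries = Plots.roc(y_scores, y_true)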
@staticmethod
def roc_multiple(predicted_scores_list: List,
feature_target_list: List,
label_list: List,
marker_list: List,
linestyle_list: List,
color_list: List,
title: str="ROC Curve",
lw: int=2,
markersize: int=6,
markevery: int=10000,
legend_prop: int=2,
                     legend_markerscale: int=2) -> Tuple[MatplotlibFigure, Dict]:
"""Plot the Receiver Operating Characteristic (ROC)
:param predicted_scores_list: the predicted Scores (one or multiple).
:param feature_target_list: the target feature, which is being estimated (one or multiple).
:param label_list: the line label (one or multiple).
:param marker_list: the line marker (one or multiple).
:param linestyle_list: the line style (one or multiple).
:param color_list: the line color (one or multiple).
:param title: the figure title.
:param lw: the line-width.
:param markersize: the marker size.
        :param markevery: mark every N-th point.
        :param legend_prop: the legend font size (in points).
:param legend_markerscale: the legend's marker scale.
:return: the plot object, and the data used to plot.
"""
# calculate summaries
summaries = [None] * len(predicted_scores_list)
for i in range(len(predicted_scores_list)):
summaries[i] = dict()
summaries[i]["fpr"], summaries[i]["tpr"], _ = metrics.roc_curve(
feature_target_list[i], predicted_scores_list[i])
summaries[i]["roc_auc"] = metrics.auc(
summaries[i]["fpr"], summaries[i]["tpr"])
# plot metadata
        fig = plt.figure(figsize=(10, 10))
plt.title(title)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid()
for i in range(len(predicted_scores_list)):
plt.plot(summaries[i]["fpr"], summaries[i]["tpr"],
markersize=markersize,
marker=marker_list[i],
markevery=markevery,
linestyle=linestyle_list[i],
linewidth=lw,
color=color_list[i],
label='AUC(' + label_list[i] + ')={0:0.2f}'.format(summaries[i]["roc_auc"]))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.legend(loc="lower right", prop={'size': legend_prop}, markerscale=legend_markerscale)
return fig, summaries
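    # Usage mirrors precision_recall_multiple above: pass parallel lists of scores,
    # targets, labels and line styles (hypothetical names):
    #   fig, summaries = Plots.roc_multiple([scores_a, scores_b], [y_true, y_true],
    #                                       ["Model A", "Model B"], ["o", "s"],
    #                                       ["-", "--"], ["navy", "darkorange"])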
@staticmethod
def learning_curve(estimator: Any,
features_indep_df: PandasDataFrame,
feature_target: List,
title: str="Learning Curve",
ylim: List=None,
cv: int=None,
n_jobs: int=-1,
                       train_sizes: List=np.linspace(.1, 1.0, 5)) -> Tuple[MatplotlibFigure, Dict]:
"""Plot the learning curve.
"A learning curve shows the validation and training score of an estimator for varying numbers of training
samples. It is a tool to find out how much we benefit from adding more training data and whether the estimator
suffers more from a variance error or a bias error. If both the validation score and the training score
converge to a value that is too low with increasing size of the training set, we will not benefit much
from more training data."
        :param estimator: the object type that implements the "fit" and "predict" methods.
            An object of that type which is cloned for each validation.
:param features_indep_df: the independent features, which are inputted into the model.
:param feature_target: the target feature, which is being estimated.
:param title: the figure title.
:param ylim: the y-limit for the axis.
:param cv: the cross-validation splitting strategy (optional).
:param n_jobs: the number of jobs to run in parallel (default -1).
:param train_sizes: the size of the training samples for the learning curve.
:return: the plot object, and the data used to plot.
"""
summaries = dict()
        # calculate, via sklearn.model_selection.learning_curve (the module-level import)
        train_sizes, train_scores, test_scores = learning_curve(
estimator, features_indep_df, feature_target,
cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
# summaries
summaries["train_scores_mean"] = np.mean(train_scores, axis=1)
summaries["train_scores_std"] = np.std(train_scores, axis=1)
summaries["test_scores_mean"] = np.mean(test_scores, axis=1)
summaries["test_scores_std"] = np.std(test_scores, axis=1)
# plot metadata
        fig = plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()
# plot curves
plt.fill_between(train_sizes, summaries["train_scores_mean"] - summaries["train_scores_std"],
summaries["train_scores_mean"] + summaries["train_scores_std"], alpha=0.1, color="r")
plt.fill_between(train_sizes, summaries["test_scores_mean"] - summaries["test_scores_std"],
summaries["test_scores_mean"] + summaries["test_scores_std"], alpha=0.1, color="g")
plt.plot(train_sizes, summaries["train_scores_mean"], 'o-', color="r", label="Training score")
plt.plot(train_sizes, summaries["test_scores_mean"], 'o-', color="g", label="Cross-validation score")
plt.legend(loc="best")
return fig, summaries
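    # A minimal usage sketch, assuming a scikit-learn estimator and a pandas
    # DataFrame of independent features (hypothetical names):
    #   from sklearn.linear_model import LogisticRegression
    #   fig, summaries = Plots.learning_curve(
    #       LogisticRegression(), features_df, y_true, cv=5)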
@staticmethod
def validation_curve(estimator: Any,
features_indep_df: PandasDataFrame,
feature_target: List,
param_name: str,
param_range: List,
title: str="Learning Curve",
ylim: List=None,
cv: int=None,
lw: int=2,
                         n_jobs: int=-1) -> Tuple[MatplotlibFigure, Dict]:
"""Plot the validation curve
"it is sometimes helpful to plot the influence of a single hyperparameter on the training score and the
validation score to find out whether the estimator is overfitting or underfitting for some hyperparameter
values."
        :param estimator: the object type that implements the "fit" and "predict" methods.
            An object of that type which is cloned for each validation.
:param features_indep_df: the independent features, which are inputted into the model.
:param feature_target: the target feature, which is being estimated.
        :param param_name: the name of the parameter that will be varied.
:param param_range: the values of the parameter that will be evaluated.
:param title: the figure title.
:param ylim: the y-limit for the axis.
:param cv: the cross-validation splitting strategy (optional).
:param lw: the line-width.
:param n_jobs: the number of jobs to run in parallel (default -1).
:return: the plot object, and the data used to plot.
"""
summaries = dict()
        # train & test, via sklearn.model_selection.validation_curve (the module-level import)
        train_scores, test_scores = validation_curve(
estimator, features_indep_df, feature_target,
param_name=param_name, param_range=param_range,
cv=cv, scoring="accuracy", n_jobs=n_jobs)
# summaries
summaries["train_scores_mean"] = np.mean(train_scores, axis=1)
summaries["train_scores_std"] = np.std(train_scores, axis=1)
summaries["test_scores_mean"] = np.mean(test_scores, axis=1)
summaries["test_scores_std"] = np.std(test_scores, axis=1)
# plot metadata
        fig = plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("$\gamma$")
plt.ylabel("Score")
plt.grid()
# plot curves
plt.semilogx(param_range, summaries["train_scores_mean"], label="Training score", color="darkorange", lw=lw)
plt.fill_between(param_range, summaries["train_scores_mean"] - summaries["train_scores_std"],
summaries["train_scores_mean"] + summaries["train_scores_std"], alpha=0.2,
color="darkorange", lw=lw)
plt.semilogx(param_range, summaries["test_scores_mean"], label="Cross-validation score", color="navy", lw=lw)
plt.fill_between(param_range, summaries["test_scores_mean"] - summaries["test_scores_std"],
summaries["test_scores_mean"] + summaries["test_scores_std"], alpha=0.2, color="navy", lw=lw)
plt.legend(loc="best")
return fig, summaries
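    # A minimal usage sketch, varying a single hyperparameter (hypothetical names):
    #   from sklearn.svm import SVC
    #   fig, summaries = Plots.validation_curve(
    #       SVC(), features_df, y_true,
    #       param_name="gamma", param_range=np.logspace(-6, -1, 5), cv=5)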
@staticmethod
def distribution_bar(feature: List,
feature_name: str,
title: str,
                         ylim: List=[0.0, 1.05]) -> MatplotlibFigure:
"""Plot distribution, using bar plot.
:param feature: the value of the feature.
:param feature_name: the name of the feature.
:param title: the figure title.
:param ylim: the y-limit for the axis.
:return: the plot object.
"""
uniques = np.unique(feature)
uniques.sort()
# plot metadata
        fig = plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel(feature_name)
plt.ylabel("Probability")
plt.grid()
# plot curves
        plt.hist(feature, bins=uniques, density=True, facecolor='green', alpha=0.5)
return fig
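    # A minimal usage sketch for a discrete feature (hypothetical data):
    #   fig = Plots.distribution_bar([0, 1, 1, 2, 2, 2], "readmissions", "Readmissions Distribution")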
@staticmethod
def distribution_hist(feature: List,
feature_name: str,
title: str,
num_bins: int=50,
                          ylim: List=[0.0, 1.05]) -> MatplotlibFigure:
"""Plot distribution, using histogram.
:param feature: the value of the feature.
:param feature_name: the name of the feature.
:param title: the figure title.
:param num_bins: number of bins in the histogram.
:param ylim: the y-limit for the axis.
:return: the plot object.
"""
# plot metadata
        fig = plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel(feature_name)
plt.ylabel("Probability")
plt.grid()
        # plot the histogram of the data
        plt.hist(feature, num_bins, density=True, facecolor='green', alpha=0.5)
return fig
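    # A minimal usage sketch for a continuous feature (hypothetical data):
    #   fig = Plots.distribution_hist(np.random.randn(1000).tolist(), "age", "Age Distribution", num_bins=30)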
@staticmethod
def distribution_kde(feature: List,
feature_name: str,
title: str,
x_values: List=None,
kernel: str="gaussian",
bandwidth: float=0.5,
                         ylim: List=[0.0, 1.05]) -> MatplotlibFigure:
"""Plot distribution, using Kernel Density Estimation (KDE).
:param feature: the value of the feature.
:param feature_name: the name of the feature.
:param title: the figure title.
        :param x_values: the grid to use for plotting (default: based on the feature range and size).
        :param kernel: the kernel to use. Valid kernels are 'gaussian', 'tophat', 'epanechnikov',
            'exponential', 'linear' and 'cosine'.
:param bandwidth: the bandwidth of the kernel.
:param ylim: the y-limit for the axis.
:return: the plot object.
"""
if x_values is None:
x_values = np.linspace(min(feature), max(feature), len(feature))[:, np.newaxis]
else:
x_values = x_values[:, np.newaxis]
# plot metadata
        fig = plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel(feature_name)
plt.ylabel("Probability")
plt.grid()
        # fit the kernel density estimator, then plot the estimated density curve
kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(np.array(feature)[:, np.newaxis])
log_dens = kde.score_samples(x_values)
plt.plot(x_values[:, 0], np.exp(log_dens), '-', label="kernel = '{0}'".format(kernel))
return fig
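

# A minimal, self-contained smoke test on synthetic data (not part of the
# original API); it exercises a few of the plots when the module is run directly:
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    demo_targets = rng.randint(0, 2, size=200)
    # noisy probability-like scores that loosely track the labels
    demo_scores = np.clip(demo_targets * 0.6 + rng.rand(200) * 0.5, 0.0, 1.0)
    Plots.roc(demo_scores, demo_targets)
    Plots.precision_recall(demo_scores, demo_targets)
    Plots.distribution_kde(demo_scores.tolist(), "score", "Score KDE", ylim=None)
    plt.show()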