--- a
+++ b/Stats/Plots.py
@@ -0,0 +1,542 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+#
+# Copyright 2017 University of Westminster. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A set of custom plots, built on the Matplotlib and Scikit-learn libraries.
+"""
+
+from typing import Dict, List, TypeVar, Any
+from sklearn import metrics
+from sklearn.model_selection import validation_curve
+from sklearn.model_selection import learning_curve
+from sklearn.metrics import confusion_matrix
+from sklearn.neighbors import KernelDensity
+import matplotlib.pyplot as plt
+import numpy as np
+import itertools
+
+PandasDataFrame = TypeVar('DataFrame')
+MatplotlibFigure = TypeVar('Figure')
+
+__author__ = "Mohsen Mesgarpour"
+__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
+__credits__ = ["Mohsen Mesgarpour"]
+__license__ = "GPL"
+__version__ = "1.1"
+__maintainer__ = "Mohsen Mesgarpour"
+__email__ = "mohsen.mesgarpour@gmail.com"
+__status__ = "Release"
+
+
+class Plots:
+    @staticmethod
+    def confusion_matrix(predicted_scores: List,
+                         feature_target: List,
+                         model_labels: List=list([0, 1]),
+                         normalize: bool=False,
+                         title: str='Confusion Matrix',
+                         cmap: str="Blues") -> [MatplotlibFigure, Dict]:
+        """Plot the confusion matrix.
+        :param predicted_scores: the predicted scores.
+        :param feature_target: the target feature, which is being estimated.
+        :param model_labels: the target labels (default [0, 1]).
+        :param normalize: normalise the confusion matrix by the true-label counts.
+        :param title: the figure title.
+        :param cmap: the colour map of the plot.
+        :return: the plot object, and the data used to plot.
+        """
+        summaries = dict()
+        tick_marks = np.arange(len(model_labels))
+
+        # Compute the confusion matrix, and normalise it if requested
+        summaries["cnf_matrix"] = confusion_matrix(feature_target, predicted_scores)
+        np.set_printoptions(precision=2)
+        if normalize:
+            summaries["cnf_matrix"] = \
+                summaries["cnf_matrix"].astype('float') / summaries["cnf_matrix"].sum(axis=1)[:, np.newaxis]
+
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.title(title)
+        plt.ylabel('True label')
+        plt.xlabel('Predicted label')
+        plt.xticks(tick_marks, model_labels, rotation=45)
+        plt.yticks(tick_marks, model_labels)
+        plt.grid()
+
+        # plot matrix
+        plt.imshow(summaries["cnf_matrix"], interpolation='nearest', cmap=cmap)
+        plt.colorbar()
+
+        thresh = summaries["cnf_matrix"].max() / 2.
+        for i, j in itertools.product(range(summaries["cnf_matrix"].shape[0]),
+                                      range(summaries["cnf_matrix"].shape[1])):
+            plt.text(j, i, summaries["cnf_matrix"][i, j],
+                     horizontalalignment="center",
+                     color="white" if summaries["cnf_matrix"][i, j] > thresh else "black")
+        plt.tight_layout()
+        return fig, summaries
+
+    @staticmethod
+    def stepwise_model(summaries: Dict,
+                       title: str="Step-Wise Train & Test",
+                       lw: int=2) -> MatplotlibFigure:
+        """Plot a performance summary for the step-wise training and testing.
+        :param summaries: the summary statistics which will be used for plotting.
+        It must contain 'Step', 'Train_Precision', 'Train_Recall', 'Train_ROC', 'Test_Precision', 'Test_Recall'
+        and 'Test_ROC' for each training and testing step.
+        :param title: the figure title.
+        :param lw: the line-width.
+        :return: the plot object.
+        """
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.clf()
+        plt.title(title)
+        plt.ylim([0.0, 1.05])
+        plt.xlabel('Number of Features')
+        plt.ylabel('Summary Statistics')
+        plt.grid()
+
+        plt.plot(summaries["Step"], summaries["Train_Precision"], lw=lw, color='r', label='Train - Precision')
+        plt.plot(summaries["Step"], summaries["Train_Recall"], lw=lw, color='g', label='Train - Recall')
+        plt.plot(summaries["Step"], summaries["Train_ROC"], lw=lw, color='b', label='Train - ROC')
+
+        plt.plot(summaries["Step"], summaries["Test_Precision"], lw=lw, color='brown', label='Test - Precision')
+        plt.plot(summaries["Step"], summaries["Test_Recall"], lw=lw, color='orange', label='Test - Recall')
+        plt.plot(summaries["Step"], summaries["Test_ROC"], lw=lw, color='pink', label='Test - ROC')
+        plt.legend(loc="lower left")
+        return fig
+
+    @staticmethod
+    def precision_recall(predicted_scores: List,
+                         feature_target: List,
+                         title: str="Precision-Recall Curve",
+                         lw: int=2) -> [MatplotlibFigure, Dict]:
+        """Plot the precision-recall curve.
+        "The precision-recall plot is a model-wide measure for evaluating binary classifiers
+        and closely related to the ROC plot."
+        :param predicted_scores: the predicted scores.
+        :param feature_target: the target feature, which is being estimated.
+        :param title: the figure title.
+        :param lw: the line-width.
+        :return: the plot object, and the data used to plot.
+        """
+        summaries = dict()
+
+        # calculate
+        summaries["precision"], summaries["recall"], _ = metrics.precision_recall_curve(
+            feature_target, predicted_scores)
+
+        # summaries
+        summaries["avg_precision"] = metrics.average_precision_score(feature_target, predicted_scores)
+
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.clf()
+        plt.title(title + ' Average Precision={0:0.2f}'.format(summaries["avg_precision"]))
+        plt.xlim([0.0, 1.0])
+        plt.ylim([0.0, 1.05])
+        plt.xlabel('Recall')
+        plt.ylabel('Precision')
+        plt.grid()
+
+        # plot recall on the x-axis and precision on the y-axis
+        plt.plot(summaries["recall"], summaries["precision"],
+                 lw=lw, color='navy', label='Precision-Recall curve')
+        plt.legend(loc="lower left")
+        return fig, summaries
+
+    @staticmethod
+    def precision_recall_multiple(predicted_scores_list: List,
+                                  feature_target_list: List,
+                                  label_list: List,
+                                  marker_list: List,
+                                  linestyle_list: List,
+                                  color_list: List,
+                                  title: str="Precision-Recall Curve",
+                                  lw: int=2,
+                                  markersize: int=6,
+                                  markevery: int=10000,
+                                  legend_prop: int=2,
+                                  legend_markerscale: int=2) -> [MatplotlibFigure, Dict]:
+        """Plot the precision-recall curve.
+        "The precision-recall plot is a model-wide measure for evaluating binary classifiers
+        and closely related to the ROC plot."
+        :param predicted_scores_list: the predicted scores (one or multiple).
+        :param feature_target_list: the target feature, which is being estimated (one or multiple).
+        :param label_list: the line label (one or multiple).
+        :param marker_list: the line marker (one or multiple).
+        :param linestyle_list: the line style (one or multiple).
+        :param color_list: the line color (one or multiple).
+        :param title: the figure title.
+        :param lw: the line-width.
+        :param markersize: the marker size.
+        :param markevery: mark every n-th point.
+        :param legend_prop: the legend font size.
+        :param legend_markerscale: the legend's marker scale.
+        :return: the plot object, and the data used to plot.
+        """
+
+        # calculate summaries
+        summaries = [None] * len(predicted_scores_list)
+        for i in range(len(predicted_scores_list)):
+            summaries[i] = dict()
+            summaries[i]["precision"], summaries[i]["recall"], _ = metrics.precision_recall_curve(
+                feature_target_list[i], predicted_scores_list[i])
+            summaries[i]["avg_precision"] = metrics.average_precision_score(
+                feature_target_list[i], predicted_scores_list[i])
+
+        # plot metadata
+        fig = plt.figure(figsize=(10, 10))
+        ax = fig.add_subplot(1, 1, 1)
+        plt.clf()
+        plt.title(title)
+        plt.xlim([0.0, 1.0])
+        plt.ylim([0.0, 1.05])
+        plt.xlabel('Recall')
+        plt.ylabel('Precision')
+        plt.grid()
+
+        # plot recall on the x-axis and precision on the y-axis, one curve per model
+        for i in range(len(predicted_scores_list)):
+            plt.plot(summaries[i]["recall"], summaries[i]["precision"],
+                     markersize=markersize,
+                     marker=marker_list[i],
+                     markevery=markevery,
+                     linestyle=linestyle_list[i],
+                     linewidth=lw,
+                     color=color_list[i],
+                     label='Avg. Precision (' + label_list[i] + ')={0:0.2f}'.format(summaries[i]["avg_precision"]))
+
+        plt.legend(loc="lower left", prop={'size': legend_prop}, markerscale=legend_markerscale)
+        return fig, summaries
+
+    @staticmethod
+    def roc(predicted_scores: List,
+            feature_target: List,
+            title: str="ROC Curve",
+            lw: int=2) -> [MatplotlibFigure, Dict]:
+        """Plot the Receiver Operating Characteristic (ROC) curve.
+        :param predicted_scores: the predicted scores.
+        :param feature_target: the target feature, which is being estimated.
+        :param title: the figure title.
+        :param lw: the line-width.
+        :return: the plot object, and the data used to plot.
+        """
+        summaries = dict()
+
+        # calculate
+        summaries["fpr"], summaries["tpr"], _ = metrics.roc_curve(feature_target, predicted_scores)
+
+        # summaries
+        summaries["roc_auc"] = metrics.auc(summaries["fpr"], summaries["tpr"])
+
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.clf()
+        plt.title(title + ' AUC={0:0.2f}'.format(summaries["roc_auc"]))
+        plt.xlim([0.0, 1.0])
+        plt.ylim([0.0, 1.05])
+        plt.xlabel('False Positive Rate')
+        plt.ylabel('True Positive Rate')
+        plt.grid()
+
+        plt.plot(summaries["fpr"], summaries["tpr"], color='r',
+                 lw=lw, label='ROC curve (area = %0.2f)' % summaries["roc_auc"])
+        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
+        plt.legend(loc="lower right")
+        return fig, summaries
+
+    @staticmethod
+    def roc_multiple(predicted_scores_list: List,
+                     feature_target_list: List,
+                     label_list: List,
+                     marker_list: List,
+                     linestyle_list: List,
+                     color_list: List,
+                     title: str="ROC Curve",
+                     lw: int=2,
+                     markersize: int=6,
+                     markevery: int=10000,
+                     legend_prop: int=2,
+                     legend_markerscale: int=2) -> [MatplotlibFigure, Dict]:
+        """Plot the Receiver Operating Characteristic (ROC) curve.
+        :param predicted_scores_list: the predicted scores (one or multiple).
+        :param feature_target_list: the target feature, which is being estimated (one or multiple).
+        :param label_list: the line label (one or multiple).
+        :param marker_list: the line marker (one or multiple).
+        :param linestyle_list: the line style (one or multiple).
+        :param color_list: the line color (one or multiple).
+        :param title: the figure title.
+        :param lw: the line-width.
+        :param markersize: the marker size.
+        :param markevery: mark every n-th point.
+        :param legend_prop: the legend font size.
+        :param legend_markerscale: the legend's marker scale.
+        :return: the plot object, and the data used to plot.
+        """
+
+        # calculate summaries
+        summaries = [None] * len(predicted_scores_list)
+        for i in range(len(predicted_scores_list)):
+            summaries[i] = dict()
+            summaries[i]["fpr"], summaries[i]["tpr"], _ = metrics.roc_curve(
+                feature_target_list[i], predicted_scores_list[i])
+            summaries[i]["roc_auc"] = metrics.auc(
+                summaries[i]["fpr"], summaries[i]["tpr"])
+
+        # plot metadata
+        plt.clf()
+        fig = plt.figure(figsize=(10, 10))
+        ax = fig.add_subplot(1, 1, 1)
+        plt.title(title)
+        plt.xlim([0.0, 1.0])
+        plt.ylim([0.0, 1.05])
+        plt.xlabel('False Positive Rate')
+        plt.ylabel('True Positive Rate')
+        plt.grid()
+
+        # plot one ROC curve per model
+        for i in range(len(predicted_scores_list)):
+            plt.plot(summaries[i]["fpr"], summaries[i]["tpr"],
+                     markersize=markersize,
+                     marker=marker_list[i],
+                     markevery=markevery,
+                     linestyle=linestyle_list[i],
+                     linewidth=lw,
+                     color=color_list[i],
+                     label='AUC(' + label_list[i] + ')={0:0.2f}'.format(summaries[i]["roc_auc"]))
+
+        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
+        plt.legend(loc="lower right", prop={'size': legend_prop}, markerscale=legend_markerscale)
+        return fig, summaries
+
+    @staticmethod
+    def learning_curve(estimator: Any,
+                       features_indep_df: PandasDataFrame,
+                       feature_target: List,
+                       title: str="Learning Curve",
+                       ylim: List=None,
+                       cv: int=None,
+                       n_jobs: int=-1,
+                       train_sizes: List=np.linspace(.1, 1.0, 5)) -> [MatplotlibFigure, Dict]:
+        """Plot the learning curve.
+        "A learning curve shows the validation and training score of an estimator for varying numbers of training
+        samples. It is a tool to find out how much we benefit from adding more training data and whether the
+        estimator suffers more from a variance error or a bias error. If both the validation score and the training
+        score converge to a value that is too low with increasing size of the training set, we will not benefit
+        much from more training data."
+        :param estimator: the object type that implements the “fit” and “predict” methods.
+        An object of that type which is cloned for each validation.
+        :param features_indep_df: the independent features, which are input to the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param title: the figure title.
+        :param ylim: the y-limit for the axis.
+        :param cv: the cross-validation splitting strategy (optional).
+        :param n_jobs: the number of jobs to run in parallel (default -1).
+        :param train_sizes: the sizes of the training subsets used to generate the learning curve.
+        :return: the plot object, and the data used to plot.
+        """
+        summaries = dict()
+
+        # calculate
+        train_sizes, train_scores, test_scores = learning_curve(
+            estimator, features_indep_df, feature_target,
+            cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
+
+        # summaries
+        summaries["train_scores_mean"] = np.mean(train_scores, axis=1)
+        summaries["train_scores_std"] = np.std(train_scores, axis=1)
+        summaries["test_scores_mean"] = np.mean(test_scores, axis=1)
+        summaries["test_scores_std"] = np.std(test_scores, axis=1)
+
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.clf()
+        plt.title(title)
+        if ylim is not None:
+            plt.ylim(*ylim)
+        plt.xlabel("Training examples")
+        plt.ylabel("Score")
+        plt.grid()
+
+        # plot curves
+        plt.fill_between(train_sizes, summaries["train_scores_mean"] - summaries["train_scores_std"],
+                         summaries["train_scores_mean"] + summaries["train_scores_std"], alpha=0.1, color="r")
+        plt.fill_between(train_sizes, summaries["test_scores_mean"] - summaries["test_scores_std"],
+                         summaries["test_scores_mean"] + summaries["test_scores_std"], alpha=0.1, color="g")
+        plt.plot(train_sizes, summaries["train_scores_mean"], 'o-', color="r", label="Training score")
+        plt.plot(train_sizes, summaries["test_scores_mean"], 'o-', color="g", label="Cross-validation score")
+        plt.legend(loc="best")
+        return fig, summaries
+
+    @staticmethod
+    def validation_curve(estimator: Any,
+                         features_indep_df: PandasDataFrame,
+                         feature_target: List,
+                         param_name: str,
+                         param_range: List,
+                         title: str="Validation Curve",
+                         ylim: List=None,
+                         cv: int=None,
+                         lw: int=2,
+                         n_jobs: int=-1) -> [MatplotlibFigure, Dict]:
+        """Plot the validation curve.
+        "it is sometimes helpful to plot the influence of a single hyperparameter on the training score and the
+        validation score to find out whether the estimator is overfitting or underfitting for some hyperparameter
+        values."
+        :param estimator: the object type that implements the “fit” and “predict” methods.
+        An object of that type which is cloned for each validation.
+        :param features_indep_df: the independent features, which are input to the model.
+        :param feature_target: the target feature, which is being estimated.
+        :param param_name: the name of the parameter that will be varied.
+        :param param_range: the values of the parameter that will be evaluated.
+        :param title: the figure title.
+        :param ylim: the y-limit for the axis.
+        :param cv: the cross-validation splitting strategy (optional).
+        :param lw: the line-width.
+        :param n_jobs: the number of jobs to run in parallel (default -1).
+        :return: the plot object, and the data used to plot.
+        """
+        summaries = dict()
+
+        # train & test
+        train_scores, test_scores = validation_curve(
+            estimator, features_indep_df, feature_target,
+            param_name=param_name, param_range=param_range,
+            cv=cv, scoring="accuracy", n_jobs=n_jobs)
+
+        # summaries
+        summaries["train_scores_mean"] = np.mean(train_scores, axis=1)
+        summaries["train_scores_std"] = np.std(train_scores, axis=1)
+        summaries["test_scores_mean"] = np.mean(test_scores, axis=1)
+        summaries["test_scores_std"] = np.std(test_scores, axis=1)
+
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.clf()
+        plt.title(title)
+        if ylim is not None:
+            plt.ylim(*ylim)
+        plt.xlabel(param_name)
+        plt.ylabel("Score")
+        plt.grid()
+
+        # plot curves
+        plt.semilogx(param_range, summaries["train_scores_mean"], label="Training score", color="darkorange", lw=lw)
+        plt.fill_between(param_range, summaries["train_scores_mean"] - summaries["train_scores_std"],
+                         summaries["train_scores_mean"] + summaries["train_scores_std"], alpha=0.2,
+                         color="darkorange", lw=lw)
+        plt.semilogx(param_range, summaries["test_scores_mean"], label="Cross-validation score", color="navy", lw=lw)
+        plt.fill_between(param_range, summaries["test_scores_mean"] - summaries["test_scores_std"],
+                         summaries["test_scores_mean"] + summaries["test_scores_std"], alpha=0.2, color="navy", lw=lw)
+        plt.legend(loc="best")
+        return fig, summaries
+
+    @staticmethod
+    def distribution_bar(feature: List,
+                         feature_name: str,
+                         title: str,
+                         ylim: List=[0.0, 1.05]) -> MatplotlibFigure:
+        """Plot the distribution, using a bar plot over the unique values.
+        :param feature: the values of the feature.
+        :param feature_name: the name of the feature.
+        :param title: the figure title.
+        :param ylim: the y-limit for the axis.
+        :return: the plot object.
+        """
+        uniques = np.unique(feature)
+        uniques.sort()
+
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.clf()
+        plt.title(title)
+        if ylim is not None:
+            plt.ylim(*ylim)
+        plt.xlabel(feature_name)
+        plt.ylabel("Probability")
+        plt.grid()
+
+        # plot the distribution over the unique values
+        plt.hist(feature, bins=uniques, density=True, facecolor='green', alpha=0.5)
+        return fig
+
+    @staticmethod
+    def distribution_hist(feature: List,
+                          feature_name: str,
+                          title: str,
+                          num_bins: int=50,
+                          ylim: List=[0.0, 1.05]) -> MatplotlibFigure:
+        """Plot the distribution, using a histogram.
+        :param feature: the values of the feature.
+        :param feature_name: the name of the feature.
+        :param title: the figure title.
+        :param num_bins: the number of bins in the histogram.
+        :param ylim: the y-limit for the axis.
+        :return: the plot object.
+        """
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.clf()
+        plt.title(title)
+        if ylim is not None:
+            plt.ylim(*ylim)
+        plt.xlabel(feature_name)
+        plt.ylabel("Probability")
+        plt.grid()
+
+        # plot the histogram of the data
+        plt.hist(feature, bins=num_bins, density=True, facecolor='green', alpha=0.5)
+        return fig
+
+    @staticmethod
+    def distribution_kde(feature: List,
+                         feature_name: str,
+                         title: str,
+                         x_values: List=None,
+                         kernel: str="gaussian",
+                         bandwidth: float=0.5,
+                         ylim: List=[0.0, 1.05]) -> MatplotlibFigure:
+        """Plot the distribution, using Kernel Density Estimation (KDE).
+        :param feature: the values of the feature.
+        :param feature_name: the name of the feature.
+        :param title: the figure title.
+        :param x_values: the grid to use for plotting (default: based on the feature range and size).
+        :param kernel: the kernel to use. Valid kernels are 'gaussian', 'tophat', 'epanechnikov',
+        'exponential', 'linear' and 'cosine'.
+        :param bandwidth: the bandwidth of the kernel.
+        :param ylim: the y-limit for the axis.
+        :return: the plot object.
+        """
+        if x_values is None:
+            x_values = np.linspace(min(feature), max(feature), len(feature))[:, np.newaxis]
+        else:
+            x_values = np.array(x_values)[:, np.newaxis]
+
+        # plot metadata
+        fig, ax = plt.subplots(nrows=1, ncols=1)
+        plt.clf()
+        plt.title(title)
+        if ylim is not None:
+            plt.ylim(*ylim)
+        plt.xlabel(feature_name)
+        plt.ylabel("Probability")
+        plt.grid()
+
+        # fit the KDE and plot the estimated density
+        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(np.array(feature)[:, np.newaxis])
+        log_dens = kde.score_samples(x_values)
+        plt.plot(x_values[:, 0], np.exp(log_dens), '-', label="kernel = '{0}'".format(kernel))
+        plt.legend(loc="best")
+        return fig
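
The snippet below is a minimal usage sketch, not part of the patch, showing how the classification plots in this module might be driven from a scikit-learn model. The dataset, the LogisticRegression estimator and all variable names (X_train, X_test, y_train, y_test, y_scores, y_pred) are illustrative assumptions; any binary classifier that exposes predict_proba would do.

# Usage sketch (illustrative only): evaluate a binary classifier with the Plots helpers.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from Stats.Plots import Plots

# Assumed example data and model.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

y_scores = model.predict_proba(X_test)[:, 1]   # continuous scores for the ROC / PR curves
y_pred = model.predict(X_test)                 # hard labels for the confusion matrix

fig_roc, roc_summary = Plots.roc(y_scores, y_test)
fig_pr, pr_summary = Plots.precision_recall(y_scores, y_test)
fig_cm, cm_summary = Plots.confusion_matrix(y_pred, y_test, normalize=True)
fig_roc.savefig("roc.png")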
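
Similarly, a hedged sketch of how learning_curve and validation_curve might be called to diagnose bias and variance; the RandomForestClassifier, the max_depth parameter grid and the variable names are assumptions for illustration only.

# Usage sketch (illustrative only): diagnose under/over-fitting with the curve helpers.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from Stats.Plots import Plots

X, y = load_breast_cancer(return_X_y=True)
estimator = RandomForestClassifier(n_estimators=50, random_state=0)

# Learning curve: training vs. cross-validation score as the training set grows.
fig_lc, lc_summary = Plots.learning_curve(estimator, X, y, cv=5, n_jobs=-1)

# Validation curve: score as a single hyperparameter (here max_depth) is varied.
fig_vc, vc_summary = Plots.validation_curve(
    estimator, X, y,
    param_name="max_depth",
    param_range=[2, 4, 8, 16, 32],
    cv=5, n_jobs=-1)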
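
Finally, a brief sketch of the distribution helpers; the synthetic feature values and the variable names are again illustrative assumptions, and ylim is disabled so the y-axis follows the data.

# Usage sketch (illustrative only): visualise a feature's distribution.
import numpy as np
from Stats.Plots import Plots

rng = np.random.RandomState(0)
feature_values = rng.gamma(shape=2.0, scale=3.0, size=1000)  # assumed synthetic feature

fig_hist = Plots.distribution_hist(feature_values, "Feature value", "Feature Histogram",
                                   num_bins=30, ylim=None)
fig_kde = Plots.distribution_kde(feature_values, "Feature value", "Feature KDE",
                                 kernel="gaussian", bandwidth=1.0, ylim=None)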