--- a +++ b/Stats/TrainingMethod.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +# +# Copyright 2017 University of Westminster. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""It applies the training functions +""" + +from typing import Dict, List, TypeVar, Any +from Stats._LogisticRegression import _LogisticRegression +from Stats._LogisticRegressionCV import _LogisticRegressionCV +from Stats._MixedLinearModel import _MixedLinearModel +from Stats._RandomForestClassifier import _RandomForestClassifier +from Stats._GradientBoostingClassifier import _GradientBoostingClassifier +from Stats._DecisionTreeClassifier import _DecisionTreeClassifier +from Stats._KNeighborsClassifier import _KNeighborsClassifier +from Stats._NaiveBayes import _NaiveBayes +from Stats._NeuralNetwork import _NeuralNetwork +from ReadersWriters.ReadersWriters import ReadersWriters +from Configs.CONSTANTS import CONSTANTS +from sklearn.model_selection import cross_val_score +import numpy as np +import sys +import logging + +PandasDataFrame = TypeVar('DataFrame') +NumpyNDArray = TypeVar('ndarray') +CollectionsOrderedDict = TypeVar('OrderedDict') + +__author__ = "Mohsen Mesgarpour" +__copyright__ = "Copyright 2016, https://github.com/mesgarpour" +__credits__ = ["Mohsen Mesgarpour"] +__license__ = "GPL" +__version__ = "1.1" +__maintainer__ = "Mohsen Mesgarpour" +__email__ = "mohsen.mesgarpour@gmail.com" +__status__ = "Release" + + +class TrainingMethod: + def __init__(self, + method_name: str, + path: str=None, + title: str=None): + """Initialise the objects and constants. + :param method_name: the training method that will be used + (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation, + 'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier, + 'dtc' Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes, + 'nn': Multi-Layer Perceptron (MLP) Neural Network}). + :param path: the directory path of the saved trained model file, using this application (if applicable). + :param title: the file name of the saved trained model file, using this application + """ + self.__logger = logging.getLogger(CONSTANTS.app_name) + self.__logger.debug(__name__) + + self.__readers_writers = ReadersWriters() + self.__method = None + self.method_name = method_name + self.model_labels = None + self.model_train = None + self.model_predict = dict() + self.model_cross_validate = None + if method_name is not None: + self.__init__method(method_name) + else: + self.load(path, title) + + def __init__method(self, + method_name: str, + model_labels: List=None, + model_train: Any=None, + model_predict: Dict=None, + model_cross_validate: NumpyNDArray=None): + """Initialise the selected training method. + :param method_name: the training method that will be used + (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation, + 'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier, + 'dtc' Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes, + 'nn': Multi-Layer Perceptron (MLP) Neural Network}). + :param model_labels: the features names to be inputted into the model. + Note: the order of features will be preserved internally. + :param model_train: the training model. + :param model_predict: the prediction outputs. + :param model_cross_validate: the cross-validation model. + """ + self.__logger.debug("Initialise the training method.") + if method_name == "lr": + self.__method = _LogisticRegression() + elif method_name == "lr_cv": + self.__method = _LogisticRegressionCV() + elif method_name == "mlm": + self.__method = _MixedLinearModel() + elif method_name == "rfc": + self.__method = _RandomForestClassifier() + elif method_name == "gbc": + self.__method = _GradientBoostingClassifier() + elif method_name == "dtc": + self.__method = _DecisionTreeClassifier() + elif method_name == "knc": + self.__method = _KNeighborsClassifier() + elif method_name == "nb": + self.__method = _NaiveBayes() + elif method_name == "nn": + self.__method = _NeuralNetwork() + else: + self.__logger.error(__name__ + " - Invalid training method: " + str(method_name)) + sys.exit() + + self.model_labels = model_labels + self.model_train = model_train + self.model_predict = dict() if model_predict is None else model_predict + self.model_cross_validate = model_cross_validate + + def train(self, + features_indep_df: PandasDataFrame, + feature_target: List, + **kwargs: Any) -> Any: + """Perform the training, using the selected method. + :param features_indep_df: the independent features, which are inputted into the model. + :param feature_target: the target feature, which is being estimated. + :param kwargs: the training method's argument. + :return: the trained model. + """ + self.__logger.debug("Train.") + self.model_labels = list(features_indep_df.columns.values) + self.model_train = self.__method.train( + features_indep_df[self.model_labels], feature_target, self.model_labels, **kwargs) + return self.model_train + + def plot(self) -> Any: + """Plot the tree diagram. + :return: the model graph. + """ + self.__logger.debug("Plot.") + return self.__method.plot(self.model_train, self.model_labels, ["True", "False"]) + + def train_summaries(self) -> Any: + """ Produce the training summary. + :return: the training summary. + """ + self.__logger.debug("Summarise training model.") + return self.__method.train_summaries(self.model_train) + + def predict(self, + features_indep_df: PandasDataFrame, + sample_name: str) -> PandasDataFrame: + """Predict probability of labels, using the training model. + :param features_indep_df: the independent features, which are inputted into the model. + :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate'). + :return: the predicted probabilities, and the predicted labels. + """ + self.__logger.debug("Predict.") + self.model_predict[sample_name] = self.__method.predict(self.model_train, features_indep_df[self.model_labels]) + return self.model_predict[sample_name] + + def predict_summaries(self, + feature_target: List, + sample_name: str) -> CollectionsOrderedDict: + """roduce summary statistics for the prediction performance. + :param feature_target: the target feature, which is being estimated. + :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate'). + :return: the prediction summaries. + """ + self.__logger.debug("Summarise predictions.") + self.model_predict[sample_name]['target'] = feature_target + return self.__method.predict_summaries(self.model_predict[sample_name], feature_target) + + def predict_summaries_risk_bands(self, + feature_target: List, + sample_name: str, + cutoffs: List=np.arange(0, 1.05, 0.05)) -> CollectionsOrderedDict: + """Produce a summary statistics table for a range of cut-off points. + :param feature_target: the target feature, which is being estimated. + :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate'). + :param cutoffs: a list of risk cut-off points. + :return: the summary statistics table for the cut-off points. + """ + self.__logger.debug("Summarise predictions.") + self.model_predict[sample_name]['target'] = feature_target + return self.__method.predict_summaries_cutoffs_table( + self.model_predict[sample_name]['score'], feature_target, cutoffs) + + def cross_validate(self, + features_indep_df: PandasDataFrame, + feature_target: List, + scoring: str="neg_mean_squared_error", + cv: int=10) -> Any: + """Evaluate the model by performing cross-validation. + :param features_indep_df: the independent features, which are inputted into the model. + :param feature_target: the target feature, which is being estimated. + :param scoring: the scoring method (default: 'neg_mean_squared_error'). + :param cv: the cross-validation splitting strategy (optional). + :return: the cross-validation summary + """ + self.__logger.info("Cross-Validate") + + self.model_cross_validate = cross_val_score( + self.model_train, features_indep_df[self.model_labels], feature_target, scoring=scoring, cv=cv) + return self.model_cross_validate + + def cross_validate_summaries(self) -> Any: + """Produce a summary of the applied cross-validation + :return: the cross-validation summary + """ + return self.model_cross_validate + + def save_model(self, + path: str, + title: str): + """Save (pickle) the training model, as well as predictions and cross-validations. + Note: summaries statistics won't not saved. + :param path: the directory path of the saved trained model file, using this application (if applicable). + :param title: the file name of the saved trained model file, using this application. + """ + self.__logger.info("Saving model") + objects = dict() + objects['method_name'] = self.method_name + objects['model_labels'] = self.model_labels + objects['model_train'] = self.model_train + objects['model_predict'] = self.model_predict + objects['model_cross_validate'] = self.model_cross_validate + self.__readers_writers.save_serialised(path, title, objects=objects) + + def save_model_compressed(self, + path: str, + title: str): + """Save (pickle) & compressthe training model, as well as predictions and cross-validations. + Note: summaries statistics won't not saved. + :param path: the directory path of the saved trained model file, using this application (if applicable). + :param title: the file name of the saved trained model file, using this application. + """ + self.__logger.debug("Save model.") + objects = dict() + objects['method_name'] = self.method_name + objects['model_labels'] = self.model_labels + objects['model_train'] = self.model_train + objects['model_predict'] = self.model_predict + objects['model_cross_validate'] = self.model_cross_validate + self.__readers_writers.save_serialised_compressed(path, title, objects=objects) + + def load(self, + path: str, + title: str): + """Load (unpickle) the model, which was saved using this application. + :param path: the directory path of the saved trained model file, using this application (if applicable). + :param title: the file name of the saved trained model file, using this application + """ + self.__logger.debug("Load model.") + objects = self.__readers_writers.load_serialised(path, title) + try: + self.__init__method(method_name=objects['method_name'], + model_labels=objects['model_labels'], + model_train=objects['model_train'], + model_predict=objects['model_predict'], + model_cross_validate=objects['model_cross_validate']) + except(): + self.__logger.error(__name__ + " - Invalid field(s) in the model file: " + path) + sys.exit()