T-CARER / Git / [973ab6] /Stats/TrainingMethod.py

Models:
RaymondKing/
T-CARER
Downloads: 1
[973ab6]: / Stats / TrainingMethod.py
History
Download this file
272 lines (251 with data), 13.2 kB

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright 2017 University of Westminster. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""It applies the training functions
"""

from typing import Dict, List, TypeVar, Any
from Stats._LogisticRegression import _LogisticRegression
from Stats._LogisticRegressionCV import _LogisticRegressionCV
from Stats._MixedLinearModel import _MixedLinearModel
from Stats._RandomForestClassifier import _RandomForestClassifier
from Stats._GradientBoostingClassifier import _GradientBoostingClassifier
from Stats._DecisionTreeClassifier import _DecisionTreeClassifier
from Stats._KNeighborsClassifier import _KNeighborsClassifier
from Stats._NaiveBayes import _NaiveBayes
from Stats._NeuralNetwork import _NeuralNetwork
from ReadersWriters.ReadersWriters import ReadersWriters
from Configs.CONSTANTS import CONSTANTS
from sklearn.model_selection import cross_val_score
import numpy as np
import sys
import logging

# Placeholder types used only in annotations, so that this module does not
# need to import pandas / numpy / collections types just for type hints.
# The string passed to TypeVar must equal the name it is assigned to
# (PEP 484); static type checkers reject a mismatch.
PandasDataFrame = TypeVar('PandasDataFrame')
NumpyNDArray = TypeVar('NumpyNDArray')
CollectionsOrderedDict = TypeVar('CollectionsOrderedDict')

# Module metadata (authorship, licensing, version and release status).
# NOTE(review): the licence header at the top of this file declares the
# Apache License 2.0 and a 2017 University of Westminster copyright, whereas
# the values below say "GPL" and "Copyright 2016" - confirm which is intended.
__author__ = "Mohsen Mesgarpour"
__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
__credits__ = ["Mohsen Mesgarpour"]
__license__ = "GPL"
__version__ = "1.1"
__maintainer__ = "Mohsen Mesgarpour"
__email__ = "mohsen.mesgarpour@gmail.com"
__status__ = "Release"


class TrainingMethod:
    """Single entry point for training, predicting with, evaluating and persisting a model.

    The instance wraps one training back-end (selected by ``method_name``) and keeps
    the state produced along the way in public attributes:

    - ``method_name``: the short code of the selected training method.
    - ``model_labels``: the feature names used for training, in their original order.
    - ``model_train``: the trained model returned by the back-end.
    - ``model_predict``: the prediction outputs, keyed by sample name.
    - ``model_cross_validate``: the result of the latest cross-validation.
    """

    def __init__(self,
                 method_name: str,
                 path: str=None,
                 title: str=None):
        """Initialise the objects and constants.
        :param method_name: the training method that will be used
        (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation,
        'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier,
        'dtc' Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes,
        'nn': Multi-Layer Perceptron (MLP) Neural Network}).
        If it is None, a previously saved model is loaded from ``path`` and ``title`` instead.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)

        self.__readers_writers = ReadersWriters()
        self.__method = None
        self.method_name = method_name
        self.model_labels = None
        self.model_train = None
        self.model_predict = dict()
        self.model_cross_validate = None
        if method_name is not None:
            self.__init__method(method_name)
        else:
            # No method given: restore everything (including the method name) from file.
            self.load(path, title)

    def __init__method(self,
                       method_name: str,
                       model_labels: List=None,
                       model_train: Any=None,
                       model_predict: Dict=None,
                       model_cross_validate: NumpyNDArray=None):
        """Initialise the selected training method and (re)set the model state.
        :param method_name: the training method that will be used
        (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation,
        'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier,
        'dtc' Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes,
        'nn': Multi-Layer Perceptron (MLP) Neural Network}).
        :param model_labels: the features names to be inputted into the model.
        Note: the order of features will be preserved internally.
        :param model_train: the training model.
        :param model_predict: the prediction outputs.
        :param model_cross_validate: the cross-validation model.
        :raises SystemExit: if ``method_name`` is not one of the supported options.
        """
        self.__logger.debug("Initialise the training method.")
        if method_name == "lr":
            self.__method = _LogisticRegression()
        elif method_name == "lr_cv":
            self.__method = _LogisticRegressionCV()
        elif method_name == "mlm":
            self.__method = _MixedLinearModel()
        elif method_name == "rfc":
            self.__method = _RandomForestClassifier()
        elif method_name == "gbc":
            self.__method = _GradientBoostingClassifier()
        elif method_name == "dtc":
            self.__method = _DecisionTreeClassifier()
        elif method_name == "knc":
            self.__method = _KNeighborsClassifier()
        elif method_name == "nb":
            self.__method = _NaiveBayes()
        elif method_name == "nn":
            self.__method = _NeuralNetwork()
        else:
            self.__logger.error(__name__ + " - Invalid training method: " + str(method_name))
            # Exit with a non-zero status so that the failure is visible to the caller's shell.
            sys.exit(1)

        # Keep the public name in sync with the selected back-end. Without this, a model
        # restored through load() would keep method_name=None and could not be re-saved
        # into a file that loads again.
        self.method_name = method_name
        self.model_labels = model_labels
        self.model_train = model_train
        self.model_predict = dict() if model_predict is None else model_predict
        self.model_cross_validate = model_cross_validate

    def __model_state(self) -> Dict:
        """Collect the fields that are persisted by the save methods.
        :return: a dictionary of the method name, labels, trained model, predictions and cross-validation.
        """
        return {
            'method_name': self.method_name,
            'model_labels': self.model_labels,
            'model_train': self.model_train,
            'model_predict': self.model_predict,
            'model_cross_validate': self.model_cross_validate,
        }

    def train(self,
              features_indep_df: PandasDataFrame,
              feature_target: List,
              **kwargs: Any) -> Any:
        """Perform the training, using the selected method.
        The column names of ``features_indep_df`` are stored in ``model_labels`` and reused
        by the prediction and cross-validation methods to select the same columns.
        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param kwargs: the training method's argument.
        :return: the trained model.
        """
        self.__logger.debug("Train.")
        self.model_labels = list(features_indep_df.columns.values)
        self.model_train = self.__method.train(
            features_indep_df[self.model_labels], feature_target, self.model_labels, **kwargs)
        return self.model_train

    def plot(self) -> Any:
        """Plot the tree diagram.
        :return: the model graph.
        """
        self.__logger.debug("Plot.")
        # NOTE(review): the class names are passed as ["True", "False"]; confirm that this order
        # matches the class order expected by the back-end's plot function.
        return self.__method.plot(self.model_train, self.model_labels, ["True", "False"])

    def train_summaries(self) -> Any:
        """Produce the training summary.
        :return: the training summary.
        """
        self.__logger.debug("Summarise training model.")
        return self.__method.train_summaries(self.model_train)

    def predict(self,
                features_indep_df: PandasDataFrame,
                sample_name: str) -> PandasDataFrame:
        """Predict probability of labels, using the training model.
        The output is stored in ``model_predict[sample_name]``, replacing any previous
        prediction stored under the same sample name.
        :param features_indep_df: the independent features, which are inputted into the model.
        :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate').
        :return: the predicted probabilities, and the predicted labels.
        """
        self.__logger.debug("Predict.")
        self.model_predict[sample_name] = self.__method.predict(self.model_train, features_indep_df[self.model_labels])
        return self.model_predict[sample_name]

    def predict_summaries(self,
                          feature_target: List,
                          sample_name: str) -> CollectionsOrderedDict:
        """Produce summary statistics for the prediction performance.
        Requires ``predict`` to have been called for ``sample_name`` beforehand;
        the target is stored alongside the predictions in a 'target' field.
        :param feature_target: the target feature, which is being estimated.
        :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate').
        :return: the prediction summaries.
        """
        self.__logger.debug("Summarise predictions.")
        self.model_predict[sample_name]['target'] = feature_target
        return self.__method.predict_summaries(self.model_predict[sample_name], feature_target)

    def predict_summaries_risk_bands(self,
                                     feature_target: List,
                                     sample_name: str,
                                     cutoffs: List=None) -> CollectionsOrderedDict:
        """Produce a summary statistics table for a range of cut-off points.
        Requires ``predict`` to have been called for ``sample_name`` beforehand;
        the stored 'score' field of that prediction is summarised.
        :param feature_target: the target feature, which is being estimated.
        :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate').
        :param cutoffs: a list of risk cut-off points (default: ``np.arange(0, 1.05, 0.05)``).
        :return: the summary statistics table for the cut-off points.
        """
        self.__logger.debug("Summarise predictions.")
        if cutoffs is None:
            # Build the default per call: an array default in the signature would be created
            # once and shared (and could be mutated) across all calls.
            cutoffs = np.arange(0, 1.05, 0.05)
        self.model_predict[sample_name]['target'] = feature_target
        return self.__method.predict_summaries_cutoffs_table(
            self.model_predict[sample_name]['score'], feature_target, cutoffs)

    def cross_validate(self,
                       features_indep_df: PandasDataFrame,
                       feature_target: List,
                       scoring: str="neg_mean_squared_error",
                       cv: int=10) -> Any:
        """Evaluate the model by performing cross-validation.
        The trained model (``model_train``) is passed as the estimator to
        scikit-learn's ``cross_val_score``, using the columns in ``model_labels``.
        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param scoring: the scoring method (default: 'neg_mean_squared_error').
        :param cv: the cross-validation splitting strategy (optional).
        :return: the cross-validation summary
        """
        self.__logger.info("Cross-Validate")

        self.model_cross_validate = cross_val_score(
            self.model_train, features_indep_df[self.model_labels], feature_target, scoring=scoring, cv=cv)
        return self.model_cross_validate

    def cross_validate_summaries(self) -> Any:
        """Produce a summary of the applied cross-validation
        :return: the cross-validation summary (None if ``cross_validate`` has not produced one yet).
        """
        return self.model_cross_validate

    def save_model(self,
                   path: str,
                   title: str):
        """Save (pickle) the training model, as well as predictions and cross-validations.
        Note: summary statistics are not saved.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application.
        """
        self.__logger.info("Saving model")
        self.__readers_writers.save_serialised(path, title, objects=self.__model_state())

    def save_model_compressed(self,
                              path: str,
                              title: str):
        """Save (pickle) & compress the training model, as well as predictions and cross-validations.
        Note: summary statistics are not saved.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application.
        """
        self.__logger.debug("Save model.")
        self.__readers_writers.save_serialised_compressed(path, title, objects=self.__model_state())

    def load(self,
             path: str,
             title: str):
        """Load (unpickle) the model, which was saved using this application.
        SECURITY NOTE: unpickling can execute arbitrary code; only load model files from trusted sources.
        NOTE(review): this always reads through ``load_serialised``; whether that can read files written by
        ``save_model_compressed`` is not visible from this class - confirm in ReadersWriters.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application
        :raises SystemExit: if the loaded content does not provide all of the expected fields.
        """
        self.__logger.debug("Load model.")
        objects = self.__readers_writers.load_serialised(path, title)
        required = ('method_name', 'model_labels', 'model_train', 'model_predict', 'model_cross_validate')
        try:
            # KeyError: a field is missing; TypeError: the loaded object is not a mapping (e.g. None).
            fields = {key: objects[key] for key in required}
        except (KeyError, TypeError):
            self.__logger.error(__name__ + " - Invalid field(s) in the model file: " + str(path))
            sys.exit(1)
        # Kept outside the try block so that errors raised while initialising the
        # training method are not misreported as an invalid model file.
        self.__init__method(**fields)