Diff of /Stats/FactoringThread.py [000000] .. [b4a150]

Switch to side-by-side view

--- a
+++ b/Stats/FactoringThread.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+#
+# Copyright 2017 University of Westminster. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+""" It applies feature factorisation (categorisation) using independent threads.
+"""
+
+from typing import Dict, TypeVar
+from sklearn import preprocessing
+import pandas as pd
+
+PandasDataFrame = TypeVar('DataFrame')
+
+__author__ = "Mohsen Mesgarpour"
+__copyright__ = "Copyright 2016, https://github.com/mesgarpour"  # NOTE(review): header says 2017, University of Westminster
+__credits__ = ["Mohsen Mesgarpour"]
+__license__ = "GPL"  # NOTE(review): the file header grants Apache License 2.0 - confirm which applies
+__version__ = "1.1"
+__maintainer__ = "Mohsen Mesgarpour"
+__email__ = "mohsen.mesgarpour@gmail.com"
+__status__ = "Release"
+
+
+class FactoringThread:
+    def __init__(self,
+                 df: PandasDataFrame,
+                 categories_dic: Dict,
+                 labels_dic: Dict):
+        """Initialise the objects and constants.
+        :param df: the inputted dataframe to process.
+        :param categories_dic: the categorisation dictionary.
+        :param labels_dic: the name of the new features.
+        """
+        self.__df = df
+        self.__categories_dic = categories_dic
+        self.__labels_dic = labels_dic
+
+    def factor_arr_multiple(self,
+                            label_group: str) -> PandasDataFrame:
+        """Categorise multiple features.
+        :param label_group: the names of features to be categorised.
+        :return: the categorised features.
+        """
+        labels_encoded = list(self.__categories_dic[label_group].keys())
+        df_encoded = self.__factor_arr(labels_encoded[0], label_group)
+
+        if len(labels_encoded) > 1:
+            for label in labels_encoded[1:]:
+                df_encoded = df_encoded.add(self.__factor_arr(label, label_group))
+        return df_encoded
+
+    def factor_arr(self,
+                   label: str) -> PandasDataFrame:
+        """Categorise a single feature.
+        :param label: the name of the feature to be categorised.
+        :return: the categorised feature.
+        """
+        df_encoded = preprocessing.label_binarize(self.__df[label], classes=self.__categories_dic[label])
+        df_encoded = pd.DataFrame(df_encoded, columns=self.__labels_dic[label])
+        return df_encoded
+
+    def __factor_arr(self,
+                     label: str,
+                     label_group: str) -> PandasDataFrame:
+        """Categorise a list using the 'preprocessing.label_binarize'.
+        :param label: the name of the feature to be categorised.
+        :param label_group: the name of the feature group in the categorisation dictionary.
+        :return: the categorised feature.
+        """
+        df_encoded = preprocessing.label_binarize(self.__df[label], classes=self.__categories_dic[label_group][label])
+        df_encoded = pd.DataFrame(df_encoded, columns=self.__labels_dic[label_group])
+        return df_encoded