#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright 2017 University of Westminster. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
""" It applies feature factorisation (categorisation) using independent threads.
"""
20
21
from typing import Dict, TypeVar
22
from sklearn import preprocessing
23
import pandas as pd
24
25
# Placeholder type name for annotating pandas DataFrame arguments and return values.
# The string handed to TypeVar has to match the name it is assigned to.
PandasDataFrame = TypeVar('PandasDataFrame')

# Module metadata.
# NOTE(review): the file header states the Apache License 2.0, whereas
# __license__ below says "GPL" -- confirm which one is intended.
__author__ = "Mohsen Mesgarpour"
__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
__credits__ = ["Mohsen Mesgarpour"]
__license__ = "GPL"
__version__ = "1.1"
__maintainer__ = "Mohsen Mesgarpour"
__email__ = "mohsen.mesgarpour@gmail.com"
__status__ = "Release"
35
36
37
class FactoringThread:
    """Binarise categorical features of a dataframe into indicator columns.

    The instance keeps the dataframe together with two dictionaries: one holding the
    categories (classes) of each feature or feature group, and one holding the column
    names given to the binarised output.
    """

    def __init__(self,
                 df: pd.DataFrame,
                 categories_dic: Dict,
                 labels_dic: Dict):
        """Initialise the objects and constants.

        :param df: the inputted dataframe to process.
        :param categories_dic: the categorisation dictionary. For a single feature it maps
            the feature name to its classes; for a feature group it maps the group name to
            a dictionary of ``{feature name: classes}``.
        :param labels_dic: the names of the new features, keyed by feature name or by
            feature group name.
        """
        self.__df = df
        self.__categories_dic = categories_dic
        self.__labels_dic = labels_dic

    def factor_arr_multiple(self,
                            label_group: str) -> pd.DataFrame:
        """Categorise multiple features that belong to one feature group.

        Every feature listed under ``label_group`` in the categorisation dictionary is
        binarised separately, and the resulting frames are combined with an element-wise
        ``DataFrame.add``.

        :param label_group: the name of the feature group to be categorised.
        :return: the categorised features.
        :raises IndexError: if the group does not contain any feature.
        """
        labels_encoded = list(self.__categories_dic[label_group].keys())
        if not labels_encoded:
            # Same exception type as indexing the empty list, but with a usable message.
            raise IndexError("No features are defined for the group: " + str(label_group))

        df_encoded = self.__factor_arr(labels_encoded[0], label_group)
        # Slicing an one-element list yields an empty list, so no length guard is needed.
        for label in labels_encoded[1:]:
            df_encoded = df_encoded.add(self.__factor_arr(label, label_group))
        return df_encoded

    def factor_arr(self,
                   label: str) -> pd.DataFrame:
        """Categorise a single feature.

        :param label: the name of the feature to be categorised.
        :return: the categorised feature.
        """
        # NOTE: for exactly two classes 'label_binarize' returns a single column, so the
        # matching entry of the labels dictionary must then hold a single name.
        df_encoded = preprocessing.label_binarize(self.__df[label], classes=self.__categories_dic[label])
        # The frame is built from a plain array: it carries a default integer index,
        # not the index of the inputted dataframe.
        df_encoded = pd.DataFrame(df_encoded, columns=self.__labels_dic[label])
        return df_encoded

    def __factor_arr(self,
                     label: str,
                     label_group: str) -> pd.DataFrame:
        """Categorise one feature of a group using the 'preprocessing.label_binarize'.

        :param label: the name of the feature to be categorised.
        :param label_group: the name of the feature group in the categorisation dictionary.
        :return: the categorised feature, with the columns named after the group's labels.
        """
        # Classes are looked up per feature, whereas the column names are shared by the
        # whole group, which lets the frames of one group be added together.
        df_encoded = preprocessing.label_binarize(self.__df[label], classes=self.__categories_dic[label_group][label])
        df_encoded = pd.DataFrame(df_encoded, columns=self.__labels_dic[label_group])
        return df_encoded