|
a |
|
b/Stats/FactoringThread.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
# -*- coding: UTF-8 -*- |
|
|
3 |
# |
|
|
4 |
# Copyright 2017 University of Westminster. All Rights Reserved. |
|
|
5 |
# |
|
|
6 |
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
|
7 |
# you may not use this file except in compliance with the License. |
|
|
8 |
# You may obtain a copy of the License at |
|
|
9 |
# |
|
|
10 |
# http://www.apache.org/licenses/LICENSE-2.0 |
|
|
11 |
# |
|
|
12 |
# Unless required by applicable law or agreed to in writing, software |
|
|
13 |
# distributed under the License is distributed on an "AS IS" BASIS, |
|
|
14 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
|
15 |
# See the License for the specific language governing permissions and |
|
|
16 |
# limitations under the License. |
|
|
17 |
# ============================================================================== |
|
|
18 |
""" It applies feature factorisation (categorisation) using independent threads. |
|
|
19 |
""" |
|
|
20 |
|
|
|
21 |
from typing import Dict, TypeVar |
|
|
22 |
from sklearn import preprocessing |
|
|
23 |
import pandas as pd |
|
|
24 |
|
|
|
25 |
# Annotation alias for pandas dataframes. The TypeVar's name matches the
# variable it is assigned to, as the typing documentation requires.
PandasDataFrame = TypeVar('PandasDataFrame')

__author__ = "Mohsen Mesgarpour"
__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
__credits__ = ["Mohsen Mesgarpour"]
# NOTE(review): the header of this file licenses it under Apache 2.0, while
# this field says "GPL" - confirm which one is intended.
__license__ = "GPL"
__version__ = "1.1"
__maintainer__ = "Mohsen Mesgarpour"
__email__ = "mohsen.mesgarpour@gmail.com"
__status__ = "Release"
|
|
35 |
|
|
|
36 |
|
|
|
37 |
class FactoringThread:
    """Binarise categorical features of a dataframe.

    An instance holds the dataframe to process together with two lookup
    dictionaries: one giving the categories (classes) to binarise against and
    one giving the column names of the resulting features.
    """

    def __init__(self,
                 df: pd.DataFrame,
                 categories_dic: Dict,
                 labels_dic: Dict):
        """Initialise the objects and constants.
        :param df: the inputted dataframe to process.
        :param categories_dic: the categorisation dictionary.
        :param labels_dic: the name of the new features.
        """
        self.__df = df
        self.__categories_dic = categories_dic
        self.__labels_dic = labels_dic

    def factor_arr_multiple(self,
                            label_group: str) -> pd.DataFrame:
        """Categorise multiple features.

        Every feature listed under ``label_group`` in the categorisation
        dictionary is binarised separately, and the resulting frames are
        summed element-wise into a single frame.
        :param label_group: the names of features to be categorised.
        :return: the categorised features.
        :raises KeyError: if ``label_group`` is not in the categorisation dictionary.
        :raises IndexError: if ``label_group`` defines no features.
        """
        labels = list(self.__categories_dic[label_group])
        if not labels:
            # Same exception type as indexing the empty list would raise, but
            # with a message that names the offending group.
            raise IndexError(
                "no features are defined for the label group {!r}".format(label_group))

        df_encoded = self.__factor_arr(labels[0], label_group)
        # Slicing an one-element list yields nothing, so no length check is needed.
        for label in labels[1:]:
            df_encoded = df_encoded.add(self.__factor_arr(label, label_group))
        return df_encoded

    def factor_arr(self,
                   label: str) -> pd.DataFrame:
        """Categorise a single feature.
        :param label: the name of the feature to be categorised.
        :return: the categorised feature.
        """
        return self.__binarise(label,
                               self.__categories_dic[label],
                               self.__labels_dic[label])

    def __factor_arr(self,
                     label: str,
                     label_group: str) -> pd.DataFrame:
        """Categorise a list using the 'preprocessing.label_binarize'.
        :param label: the name of the feature to be categorised.
        :param label_group: the name of the feature group in the categorisation dictionary.
        :return: the categorised feature.
        """
        return self.__binarise(label,
                               self.__categories_dic[label_group][label],
                               self.__labels_dic[label_group])

    def __binarise(self,
                   label: str,
                   classes,
                   columns) -> pd.DataFrame:
        """Binarise one column of the dataframe and wrap the result in a dataframe.
        :param label: the name of the dataframe column to binarise.
        :param classes: the classes passed to 'preprocessing.label_binarize'.
        :param columns: the column names given to the resulting dataframe.
        :return: the binarised column; it carries a default index, not the
            index of the inputted dataframe.
        """
        # NOTE(review): scikit-learn documents that label_binarize returns a
        # single column for two-class problems - confirm that the entries of
        # the labels dictionary account for that.
        encoded = preprocessing.label_binarize(self.__df[label], classes=classes)
        return pd.DataFrame(encoded, columns=columns)