Switch to unified view

a b/Features/FeatureParserThread.py
1
#!/usr/bin/env python
2
# -*- coding: UTF-8 -*-
3
#
4
# Copyright 2017 University of Westminster. All Rights Reserved.
5
#
6
# Licensed under the Apache License, Version 2.0 (the "License");
7
# you may not use this file except in compliance with the License.
8
# You may obtain a copy of the License at
9
#
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
#
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
17
# ==============================================================================
18
"""Reads and parses the variables, then generates features, in threaded batches.
19
"""
20
21
from typing import List, TypeVar, Dict
22
import numpy as np
23
import statistics
24
from scipy.stats import itemfreq
25
26
# Placeholder type names intended for use in annotations. They are plain
# TypeVars labelled 'DataFrame' / 'ndarray', not the real pandas / numpy classes.
PandasDataFrame = TypeVar('DataFrame')
27
NumpyNdarray = TypeVar('ndarray')
28
29
# Module metadata.
__author__ = "Mohsen Mesgarpour"
30
__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
31
__credits__ = ["Mohsen Mesgarpour"]
32
# NOTE(review): the file header above states the Apache License 2.0, while this
# says "GPL" -- confirm which licence actually applies.
__license__ = "GPL"
33
__version__ = "1.1"
34
__maintainer__ = "Mohsen Mesgarpour"
35
__email__ = "mohsen.mesgarpour@gmail.com"
36
__status__ = "Release"
37
38
39
class FeatureParserThread:
    """Static helpers that parse a single variable cell and derive features from it.

    A cell is a string in which ``|`` separates groups and ``,`` separates the
    values inside a group, e.g. ``"1,2|2,3"``.
    """

    @staticmethod
    def _split_cell(variable_cell: str) -> List[str]:
        """Split a non-empty cell into its individual values (as strings).

        Duplicates inside one ``|``-separated group are dropped, duplicates across
        groups are kept, and empty values are replaced with ``'0'``.
        :param variable_cell: the raw, non-empty cell.
        :return: the list of parsed values.
        """
        values = []
        for group in variable_cell.split('|'):
            # dict.fromkeys de-duplicates like set() but keeps first-seen order
            # (dicts are insertion-ordered on Python 3.7+), so the output is
            # reproducible between runs.
            values.extend(dict.fromkeys(group.split(',')))
        # Note: replace empty values with '0'
        return [value if value != "" else "0" for value in values]

    @staticmethod
    def aggregate_cell(postfixes: List[str],
                       variable_type: str,
                       prevalence: Dict,
                       variable_cell: str) -> np.ndarray:
        """Aggregate the variable value, based on the selected aggregated functions.

        Supported postfixes:
        ``prevalence_<k>``: frequency, in this cell, of the k-th (1-based) entry of ``prevalence``;
        ``max_freq_<k>``: the k-th (1-based) most frequent value in this cell
        (ties are broken by the larger value first);
        ``others_cnt``: the number of distinct values in this cell;
        ``max``, ``avg``, ``min``, ``median``: computed on the parsed values.
        A feature whose index ``k`` is out of range is left at 0.

        :param postfixes: the names of the aggregation functions to apply, one feature per entry.
        :param variable_type: the type of the input variable; values are converted to ``int``
            only when this is ``"INT"``.
        :param prevalence: the prevalent values of the variable, looked up by 0-based position
            (anything supporting ``len()`` and integer indexing).
        :param variable_cell: the variable value (a single row) to aggregate; may be None or empty.
        :return: the aggregated value (a single row), one float per postfix; all zeros when the
            cell is None or empty.
        :raises ValueError: if a postfix is not supported, or a value cannot be converted to int.
        """
        features_temp = np.zeros([len(postfixes)])

        # if null or empty
        if variable_cell is None or variable_cell == "":
            return features_temp

        # parse variables
        values = FeatureParserThread._split_cell(variable_cell)
        if variable_type == "INT":
            values = list(map(int, values))
        # NOTE(review): for any other variable_type the values stay strings, so
        # 'avg' / 'median' will fail inside the statistics module -- confirm that
        # only "INT" variables reach those postfixes.

        # generate freq. table
        # np.unique(return_counts=True) is used instead of scipy.stats.itemfreq,
        # which was deprecated in SciPy 1.0 and removed in SciPy 1.3.
        unique_values, counts = np.unique(np.asarray(values), return_counts=True)
        unique_values = unique_values.astype(int)
        # lexsort uses the LAST key as the primary one: ascending by frequency then
        # value; reversing yields most frequent first, larger value first on ties.
        ranking = np.lexsort((unique_values, counts))[::-1]
        freq_sorted = unique_values[ranking]
        freq_dic = dict(zip(map(str, unique_values.tolist()), counts.tolist()))

        # set
        for p, postfix in enumerate(postfixes):
            if len(postfix) > 11 and postfix.startswith("prevalence_"):
                index = int(postfix.split('_')[1]) - 1
                if 0 <= index < len(prevalence):
                    # freq_dic is keyed by the string form of each value, so the
                    # lookup must use the same string key as the membership test.
                    key = str(prevalence[index])
                    if key in freq_dic:
                        features_temp[p] = freq_dic[key]
            elif len(postfix) > 9 and postfix.startswith("max_freq_"):
                index = int(postfix[9:]) - 1
                if 0 <= index < len(freq_sorted):
                    features_temp[p] = freq_sorted[index]
            elif postfix == "others_cnt":
                features_temp[p] = len(freq_sorted)
            elif postfix == "max":
                features_temp[p] = max(values)
            elif postfix == "avg":
                features_temp[p] = statistics.mean(values)
            elif postfix == "min":
                features_temp[p] = min(values)
            elif postfix == "median":
                features_temp[p] = statistics.median(values)
            else:
                raise ValueError("Unsupported postfix {!r} in {!r}".format(postfix, postfixes))
        return features_temp

    @staticmethod
    def prevalence_cell(variable_cell: str) -> List:
        """Parse the inputted variable value (a single row), to a list of value.

        :param variable_cell: the variable value (a single row), to calculate the prevalence;
            may be None or empty.
        :return: the list of values (as strings) of the current variable value; an empty list
            when the cell is None or empty.
        """
        # if null or empty
        if variable_cell is None or variable_cell == "":
            return []
        return FeatureParserThread._split_cell(variable_cell)