|
a |
|
b/Features/FeatureParserThread.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
# -*- coding: UTF-8 -*- |
|
|
3 |
# |
|
|
4 |
# Copyright 2017 University of Westminster. All Rights Reserved. |
|
|
5 |
# |
|
|
6 |
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
|
7 |
# you may not use this file except in compliance with the License. |
|
|
8 |
# You may obtain a copy of the License at |
|
|
9 |
# |
|
|
10 |
# http://www.apache.org/licenses/LICENSE-2.0 |
|
|
11 |
# |
|
|
12 |
# Unless required by applicable law or agreed to in writing, software |
|
|
13 |
# distributed under the License is distributed on an "AS IS" BASIS, |
|
|
14 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
|
15 |
# See the License for the specific language governing permissions and |
|
|
16 |
# limitations under the License. |
|
|
17 |
# ============================================================================== |
|
|
18 |
""" It reads and parses the variables, then it generate features, in threaded batches. |
|
|
19 |
""" |
|
|
20 |
|
|
|
21 |
from typing import List, TypeVar, Dict |
|
|
22 |
import numpy as np |
|
|
23 |
import statistics |
|
|
24 |
from scipy.stats import itemfreq |
|
|
25 |
|
|
|
26 |
# Placeholder types used only in annotations, so that pandas does not have to
# be imported by this module. The string passed to TypeVar must equal the name
# of the variable it is assigned to, otherwise static type checkers reject it.
PandasDataFrame = TypeVar('PandasDataFrame')
NumpyNdarray = TypeVar('NumpyNdarray')

# Module metadata.
__author__ = "Mohsen Mesgarpour"
__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
__credits__ = ["Mohsen Mesgarpour"]
# NOTE(review): the file header licenses this module under Apache 2.0, while
# this field says GPL -- confirm which one is intended.
__license__ = "GPL"
__version__ = "1.1"
__maintainer__ = "Mohsen Mesgarpour"
__email__ = "mohsen.mesgarpour@gmail.com"
__status__ = "Release"
|
|
37 |
|
|
|
38 |
|
|
|
39 |
class FeatureParserThread:
    """Stateless helpers that parse one delimited variable cell into values or features.

    A cell is a string in which ``|`` separates groups and ``,`` separates the
    values inside a group, e.g. ``"1,3|3,5"``.
    """

    @staticmethod
    def _split_cell(variable_cell: str) -> List:
        """Split a non-empty cell into a flat list of its raw values.

        A value repeated inside one ``|`` group is kept once; repeats across
        different groups are all kept. Empty tokens are replaced by the
        integer ``0``.
        :param variable_cell: the non-empty variable value (a single row).
        :return: the raw values, as strings (or ``0`` for empty tokens).
        """
        values = []
        for group in variable_cell.split('|'):
            # dict.fromkeys de-duplicates like set(), but keeps first-occurrence
            # order (Python 3.7+), so the output does not vary between runs.
            for token in dict.fromkeys(group.split(',')):
                values.append(token if token != "" else 0)
        return values

    @staticmethod
    def aggregate_cell(postfixes: List[str],
                       variable_type: str,
                       prevalence: Dict,
                       variable_cell: str) -> np.ndarray:
        """Aggregate the variable value, based on the selected aggregated functions.

        Supported postfixes:
          - ``prevalence_N``: how many times the N-th (1-based) entry of ``prevalence``
            occurs in this cell; left at 0 when N is beyond ``len(prevalence)`` or the
            entry does not occur.
          - ``max_freq_N``: the N-th most frequent value of the cell (ties go to the
            larger value); left at 0 when the cell has fewer than N distinct values.
          - ``others_cnt``: the number of distinct values in the cell.
          - ``max``, ``avg``, ``min``, ``median``: the statistic over all parsed values.

        :param postfixes: the names of the aggregation functions, one per output element.
        :param variable_type: the type of the input variable; ``"INT"`` casts every
            value to ``int``. NOTE(review): other types are passed through uncast, yet
            the frequency table still requires integer-like values -- confirm callers.
        :param prevalence: the prevalent values, looked up by zero-based integer position.
            NOTE(review): annotated as Dict, but used like a sequence -- verify against caller.
        :param variable_cell: the variable value (a single row) to aggregate.
        :return: the aggregated value (a single row), one float per postfix; all zeros
            when the cell is None or empty.
        :raises ValueError: if a postfix is not supported, or a value cannot be cast to int.
        """
        features_temp = np.zeros([len(postfixes)])

        # if null or empty
        if variable_cell is None or variable_cell == "":
            return features_temp

        # parse variables
        values = FeatureParserThread._split_cell(variable_cell)
        if variable_type == "INT":
            values = list(map(int, values))

        # generate freq. table
        # np.unique(..., return_counts=True) replaces scipy.stats.itemfreq, which was
        # deprecated in SciPy 1.0 and removed in SciPy 1.3.
        distinct, counts = np.unique(values, return_counts=True)
        freq_table = [(int(value), int(count)) for value, count in zip(distinct, counts)]
        # most frequent first; ties are resolved in favour of the larger value
        ranked_values = [value for value, _ in
                         sorted(freq_table, key=lambda row: (row[1], row[0]), reverse=True)]
        freq_by_value = {str(value): count for value, count in freq_table}

        # set
        for p, postfix in enumerate(postfixes):
            if len(postfix) > 11 and postfix.startswith("prevalence_"):
                index = int(postfix.split('_')[1]) - 1
                if index < len(prevalence):
                    # The table is keyed by str, so the lookup key must be str as well;
                    # a non-str prevalence entry used to raise KeyError here.
                    features_temp[p] = freq_by_value.get(str(prevalence[index]), 0)
            elif len(postfix) > 9 and postfix.startswith("max_freq_"):
                index = int(postfix[9:]) - 1
                if len(ranked_values) > index:
                    features_temp[p] = ranked_values[index]
            elif postfix == "others_cnt":
                features_temp[p] = len(ranked_values)
            elif postfix == "max":
                features_temp[p] = max(values)
            elif postfix == "avg":
                features_temp[p] = statistics.mean(values)
            elif postfix == "min":
                features_temp[p] = min(values)
            elif postfix == "median":
                features_temp[p] = statistics.median(values)
            else:
                raise ValueError("Unsupported aggregation postfix: {!r}".format(postfix))
        return features_temp

    @staticmethod
    def prevalence_cell(variable_cell: str) -> List:
        """Parse the inputted variable value (a single row), to a list of value.

        :param variable_cell: the variable value (a single row), to calculate the prevalence.
        :return: the list of values of the current variable value, each as a string
            (empty tokens become ``'0'``); an empty list when the cell is None or empty.
        """
        # if null or empty
        if variable_cell is None or variable_cell == "":
            return []
        return list(map(str, FeatureParserThread._split_cell(variable_cell)))