|
a |
|
b/Stats/YeoJohnson.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
# -*- coding: UTF-8 -*- |
|
|
3 |
# |
|
|
4 |
# Copyright 2017 University of Westminster. All Rights Reserved. |
|
|
5 |
# |
|
|
6 |
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
|
7 |
# you may not use this file except in compliance with the License. |
|
|
8 |
# You may obtain a copy of the License at |
|
|
9 |
# |
|
|
10 |
# http://www.apache.org/licenses/LICENSE-2.0 |
|
|
11 |
# |
|
|
12 |
# Unless required by applicable law or agreed to in writing, software |
|
|
13 |
# distributed under the License is distributed on an "AS IS" BASIS, |
|
|
14 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
|
15 |
# See the License for the specific language governing permissions and |
|
|
16 |
# limitations under the License. |
|
|
17 |
# ============================================================================== |
|
|
18 |
""" It computes the Yeo-Johnson transformation, which is an extension of the Box-Cox transformation |
|
|
19 |
but can handle both positive and negative values. |
|
|
20 |
|
|
|
21 |
References: |
|
|
22 |
Weisberg, S. (2001). Yeo-Johnson Power Transformations. |
|
|
23 |
Department of Applied Statistics, University of Minnesota. Retrieved June, 1, 2003. |
|
|
24 |
https://www.stat.umn.edu/arc/yjpower.pdf |
|
|
25 |
|
|
|
26 |
Adapted from CRAN - Package VGAM |
|
|
27 |
""" |
|
|
28 |
|
|
|
29 |
import sys
import warnings
from typing import Callable, List, TypeVar, Union

import numpy as np
import pandas as pd
|
|
34 |
|
|
|
35 |
# Type aliases used in annotations. They are plain aliases of the concrete
# NumPy / pandas classes rather than TypeVars: nothing here is generic, and a
# TypeVar whose declared name differs from the variable it is bound to is
# rejected by static type checkers.
NumpyNDArray = np.ndarray
PandasSeries = pd.Series
|
|
37 |
|
|
|
38 |
|
|
|
39 |
class YeoJohnson:
    """Yeo-Johnson power transformation, its inverse and its derivatives with respect to lambda.

    For a value ``y`` and a power parameter ``lambda`` the transformation is::

        ((y + 1)^lambda - 1) / lambda                  y >= 0, lambda != 0
        log(y + 1)                                     y >= 0, lambda == 0
        -((1 - y)^(2 - lambda) - 1) / (2 - lambda)     y <  0, lambda != 2
        -log(1 - y)                                    y <  0, lambda == 2

    ``lambda == 0`` and ``lambda == 2`` are decided with the ``epsilon`` tolerance.
    """

    def fit(self,
            y: Union[List, np.ndarray, pd.Series],
            lmbda: Union[int, float, List, np.ndarray, pd.Series],
            derivative: int = 0,
            epsilon: float = np.finfo(float).eps,
            inverse: bool = False) -> np.ndarray:
        """Calculate the yeo-johnson transformation for a feature.
        :param y: the variable to be transformed (numeric array).
        :param lmbda: the function's Lambda value (numeric value or array).
        :param derivative: the derivative with respect to lambda.
        (non-negative integer; default: ordinary function evaluation).
        :param epsilon: the lambda's tolerance (positive value).
        :param inverse: the inverse transformation option (logical value).
        :return: the Yeo-Johnson transformation or its inverse, or its derivatives with respect to lambda, of y.
            The result is a new float array; the input is never modified. Entries that are neither
            ``>= 0`` nor ``< 0`` (i.e. NaN) are returned unchanged.
        :raises TypeError: if 'y' is not a list / ndarray / Series, or 'inverse' is not boolean.
        :raises ValueError: if 'lmbda', 'derivative' or 'epsilon' are invalid, or if a derivative
            is requested together with the inverse transformation.
        """
        # Validate arguments
        self.__validate(y, lmbda, derivative, epsilon, inverse)

        # Work on a private float copy; a scalar lambda is broadcast to one value per element.
        y = np.array(y, dtype=float)
        lmbda = np.broadcast_to(np.asarray(lmbda, dtype=float), y.shape)

        # Boolean masks (not index tuples) select the four branches of the piecewise definition.
        lmbda_nonzero = np.abs(lmbda) > epsilon
        lmbda_not_two = np.abs(lmbda - 2) > epsilon
        regions = (
            (y >= 0) & lmbda_nonzero,
            (y >= 0) & ~lmbda_nonzero,
            (y < 0) & lmbda_not_two,
            (y < 0) & ~lmbda_not_two,
        )

        # Invalid powers / logs yield NaN or inf; the accompanying warnings are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            if inverse:
                return self._inverse(y, lmbda, regions)

            result = self._transform(y, lmbda, regions)
            # Each derivative is obtained from the previous one, so build them up in order.
            for order in range(1, int(derivative) + 1):
                result = self._differentiate(y, lmbda, regions, order, result)
        return result

    @staticmethod
    def _transform(y, lmbda, regions):
        """Evaluate the ordinary (derivative 0) transformation on each region."""
        pos_power, pos_log, neg_power, neg_log = regions
        out = y.copy()  # values outside every region (NaN) pass through unchanged

        out[pos_power] = np.divide(np.power(y[pos_power] + 1, lmbda[pos_power]) - 1,
                                   lmbda[pos_power])
        out[pos_log] = np.log1p(y[pos_log])
        out[neg_power] = np.divide(-(np.power(-y[neg_power] + 1, 2 - lmbda[neg_power]) - 1),
                                   2 - lmbda[neg_power])
        out[neg_log] = -np.log1p(-y[neg_log])
        return out

    @staticmethod
    def _inverse(y, lmbda, regions):
        """Evaluate the inverse transformation on each region."""
        pos_power, pos_log, neg_power, neg_log = regions
        out = y.copy()  # values outside every region (NaN) pass through unchanged

        out[pos_power] = np.power(np.multiply(y[pos_power], lmbda[pos_power]) + 1,
                                  1 / lmbda[pos_power]) - 1
        out[pos_log] = np.expm1(y[pos_log])
        out[neg_power] = 1 - np.power(np.multiply(-(2 - lmbda[neg_power]), y[neg_power]) + 1,
                                      1 / (2 - lmbda[neg_power]))
        out[neg_log] = -np.expm1(-y[neg_log])
        return out

    @staticmethod
    def _differentiate(y, lmbda, regions, order, previous):
        """Evaluate the derivative of the given order from the derivative of order - 1."""
        pos_power, pos_log, neg_power, neg_log = regions
        out = y.copy()  # values outside every region (NaN) pass through unchanged

        out[pos_power] = np.divide(
            np.multiply(np.power(y[pos_power] + 1, lmbda[pos_power]),
                        np.power(np.log1p(y[pos_power]), order))
            - np.multiply(order, previous[pos_power]),
            lmbda[pos_power])
        out[pos_log] = np.divide(np.power(np.log1p(y[pos_log]), order + 1), order + 1)
        out[neg_power] = np.divide(
            -(np.multiply(np.power(-y[neg_power] + 1, 2 - lmbda[neg_power]),
                          np.power(-np.log1p(-y[neg_power]), order))
              - np.multiply(order, previous[neg_power])),
            2 - lmbda[neg_power])
        out[neg_log] = np.divide(np.power(-np.log1p(-y[neg_log]), order + 1), order + 1)
        return out

    @staticmethod
    def __validate(y: Union[List, np.ndarray, pd.Series],
                   lmbda: Union[int, float, List, np.ndarray, pd.Series],
                   derivative: int,
                   epsilon: float,
                   inverse: bool):
        """Validate the input arguments.
        :param y: the variable to be transformed (numeric array).
        :param lmbda: the function's Lambda value (numeric value or array).
        :param derivative: the derivative with respect to lambda.
        (non-negative integer; default: ordinary function evaluation).
        :param epsilon: the lambda's tolerance (positive value).
        :param inverse: the inverse transformation option (logical value).
        :raises TypeError: if 'y' or 'inverse' have an unsupported type.
        :raises ValueError: if 'lmbda', 'derivative' or 'epsilon' are invalid, or if
            'derivative' is non-zero while 'inverse' is True.
        """
        # NumPy scalar types are accepted alongside the built-in numbers.
        number = (int, float, np.integer, np.floating)
        array_like = (list, np.ndarray, pd.Series)

        if not isinstance(y, array_like):
            raise TypeError("Argument 'y' must be a list")
        if not isinstance(lmbda, number):
            if not isinstance(lmbda, array_like) or len(lmbda) != len(y):
                raise ValueError("Argument 'lmbda' must be a number "
                                 "or a list, which its length matches 'y' argument")
        # A fractional derivative order is meaningless, so reject it up front.
        if (not isinstance(derivative, number) or derivative < 0
                or not float(derivative).is_integer()):
            raise ValueError("Argument 'derivative' must be a non-negative integer")
        if not isinstance(epsilon, number) or epsilon <= 0:
            raise ValueError("Argument 'epsilon' must be a positive number")
        if not isinstance(inverse, bool):
            raise TypeError("Argument 'inverse' must be boolean")
        if inverse is True and derivative != 0:
            raise ValueError("Argument 'derivative' must be zero "
                             "when argument 'inverse' is 'True'")