Diff of /Stats/YeoJohnson.py [000000] .. [b4a150]

Switch to unified view

a b/Stats/YeoJohnson.py
1
#!/usr/bin/env python
2
# -*- coding: UTF-8 -*-
3
#
4
# Copyright 2017 University of Westminster. All Rights Reserved.
5
#
6
# Licensed under the Apache License, Version 2.0 (the "License");
7
# you may not use this file except in compliance with the License.
8
# You may obtain a copy of the License at
9
#
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
#
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
17
# ==============================================================================
18
""" It computes the Yeo-Johnson transofrmation, which is an extension of Box-Cox transformation
19
but can handle both positive and negative values.
20
21
References:
22
Weisberg, S. (2001). Yeo-Johnson Power Transformations.
23
Department of Applied Statistics, University of Minnesota. Retrieved June, 1, 2003.
24
https://www.stat.umn.edu/arc/yjpower.pdf
25
26
Adapted from CRAN - Package VGAM
27
"""
28
29
from typing import List, TypeVar, Callable
30
import sys
31
import warnings
32
import numpy as np
33
import pandas as pd
34
35
# Aliases for the array-like types accepted as inputs. They are plain aliases
# of the concrete classes (not TypeVars), so they work both in annotations and
# in isinstance() checks.
NumpyNDArray = np.ndarray
PandasSeries = pd.Series
37
38
39
class YeoJohnson:
    """Yeo-Johnson power transformation, its inverse and its lambda-derivatives.

    All computations are element-wise. Each element falls into one of four
    regions, depending on the sign of ``y`` and on whether ``lmbda`` is within
    ``epsilon`` of the singular value of the formula (0 for ``y >= 0``,
    2 for ``y < 0``), where the logarithmic limit is used instead.
    """

    def fit(self,
            y: "list | np.ndarray | pd.Series",
            lmbda: "int | float | list | np.ndarray | pd.Series",
            derivative: int = 0,
            epsilon: float = np.finfo(float).eps,
            inverse: bool = False) -> np.ndarray:
        """Calculate the yeo-johnson transformation for a feature.

        :param y: the variable to be transformed (numeric array).
        :param lmbda: the function's Lambda value (numeric value, or an array
            with the same length as ``y``); it is broadcast to the shape of ``y``.
        :param derivative: the order of the derivative with respect to lambda
            (non-negative integer; default 0: ordinary function evaluation).
        :param epsilon: the lambda's tolerance (positive value).
        :param inverse: the inverse transformation option (logical value);
            only valid together with ``derivative == 0``.
        :return: a new float ``numpy.ndarray`` with the Yeo-Johnson
            transformation, its inverse, or its derivative with respect to
            lambda, of ``y``. The input is never modified. Elements that are
            NaN belong to no region and are returned unchanged.
        :raises TypeError: if ``y`` is not array-like or ``inverse`` is not bool.
        :raises ValueError: if ``lmbda``, ``derivative`` or ``epsilon`` is invalid.
        """
        self.__validate(y, lmbda, derivative, epsilon, inverse)

        # Work on float copies so that the caller's data is left untouched and
        # scalar, list, ndarray and Series lambdas are all handled the same way.
        y = np.array(y, dtype=float)
        lmbda = np.broadcast_to(np.asarray(lmbda, dtype=float), y.shape)
        order = int(derivative)

        # Boolean masks of the four regions, computed once from the ORIGINAL
        # values (never from partially transformed results).
        lam_nonzero = np.abs(lmbda) > epsilon
        lam_not_two = np.abs(lmbda - 2) > epsilon
        non_negative = y >= 0
        negative = y < 0
        regions = (non_negative & lam_nonzero,
                   non_negative & ~lam_nonzero,
                   negative & lam_not_two,
                   negative & ~lam_not_two)

        # Out-of-domain values (e.g. a negative base of a fractional power)
        # yield NaN; the accompanying numpy warnings are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            if inverse:
                return self.__inverse(y, lmbda, regions)
            return self.__forward(y, lmbda, regions, order)

    @staticmethod
    def __inverse(y: np.ndarray, lmbda: np.ndarray, regions: tuple) -> np.ndarray:
        """Apply the inverse Yeo-Johnson transformation region by region."""
        pos_pow, pos_log, neg_pow, neg_log = regions
        result = y.copy()

        lam = lmbda[pos_pow]
        result[pos_pow] = np.power(y[pos_pow] * lam + 1, 1 / lam) - 1

        result[pos_log] = np.expm1(y[pos_log])

        two_minus_lam = 2 - lmbda[neg_pow]
        result[neg_pow] = 1 - np.power(-two_minus_lam * y[neg_pow] + 1, 1 / two_minus_lam)

        result[neg_log] = -np.expm1(-y[neg_log])
        return result

    @staticmethod
    def __forward(y: np.ndarray, lmbda: np.ndarray, regions: tuple, order: int) -> np.ndarray:
        """Evaluate the transformation (order 0) or its order-th lambda-derivative.

        The k-th derivative is built iteratively from the (k-1)-th one, which
        is the same recurrence the recursive formulation uses.
        """
        pos_pow, pos_log, neg_pow, neg_log = regions
        y_pp, lam_pp = y[pos_pow], lmbda[pos_pow]
        y_pl = y[pos_log]
        y_np, lam_np = y[neg_pow], 2 - lmbda[neg_pow]
        y_nl = y[neg_log]

        # Order 0: the transformation itself.
        psi = y.copy()
        psi[pos_pow] = (np.power(y_pp + 1, lam_pp) - 1) / lam_pp
        psi[pos_log] = np.log1p(y_pl)
        psi[neg_pow] = -(np.power(1 - y_np, lam_np) - 1) / lam_np
        psi[neg_log] = -np.log1p(-y_nl)

        for k in range(1, order + 1):
            nxt = y.copy()
            nxt[pos_pow] = (np.power(y_pp + 1, lam_pp) * np.power(np.log1p(y_pp), k)
                            - k * psi[pos_pow]) / lam_pp
            nxt[pos_log] = np.power(np.log1p(y_pl), k + 1) / (k + 1)
            nxt[neg_pow] = -(np.power(1 - y_np, lam_np) * np.power(-np.log1p(-y_nl.dtype.type(0) - (-y_np) * -1 if False else -y_np), k)
                             - k * psi[neg_pow]) / lam_np if False else \
                -(np.power(1 - y_np, lam_np) * np.power(-np.log1p(-y_np), k)
                  - k * psi[neg_pow]) / lam_np
            nxt[neg_log] = np.power(-np.log1p(-y_nl), k + 1) / (k + 1)
            psi = nxt
        return psi

    @staticmethod
    def __validate(y: "list | np.ndarray | pd.Series",
                   lmbda: "int | float | list | np.ndarray | pd.Series",
                   derivative: int,
                   epsilon: float,
                   inverse: bool) -> None:
        """Validate the input arguments.

        :param y: the variable to be transformed (numeric array).
        :param lmbda: the function's Lambda value (numeric value or array).
        :param derivative: the derivative with respect to lambda.
        (non-negative integer; default: ordinary function evaluation).
        :param epsilon: the lambda's tolerance (positive value).
        :param inverse: the inverse transformation option (logical value).
        :raises TypeError: if ``y`` is not a list/ndarray/Series or ``inverse``
            is not a bool.
        :raises ValueError: if ``lmbda`` is neither a number nor an array
            matching the length of ``y``, if ``derivative`` is not a
            non-negative integer, if ``epsilon`` is not positive, or if a
            derivative is requested together with the inverse.
        """
        # np.integer / np.floating cover numpy scalars such as np.int64.
        numeric = (int, float, np.integer, np.floating)
        array_like = (list, np.ndarray, pd.Series)

        if not isinstance(y, array_like):
            raise TypeError("Argument 'y' must be a list")
        if not isinstance(lmbda, numeric):
            if not isinstance(lmbda, array_like) or len(lmbda) != len(y):
                raise ValueError("Argument 'lmbda' must be a number "
                                 "or a list, which its length matches 'y' argument")
        if (not isinstance(derivative, numeric) or derivative < 0
                or not float(derivative).is_integer()):
            raise ValueError("Argument 'derivative' must be a non-negative integer")
        # "not epsilon > 0" (rather than "epsilon <= 0") also rejects NaN.
        if not isinstance(epsilon, numeric) or not epsilon > 0:
            raise ValueError("Argument 'epsilon' must be a positive number")
        if not isinstance(inverse, bool):
            raise TypeError("Argument 'inverse' must be boolean")
        if inverse and derivative != 0:
            raise ValueError("Argument 'derivative' must be zero "
                             "when argument 'inverse' is 'True'")