a b/src/move/data/preprocessing.py
1
# Names exported by a wildcard import of this module.
# NOTE(review): `feature_stats` is defined in this module without a leading
# underscore but is not listed here — confirm whether that is intentional.
__all__ = ["one_hot_encode", "one_hot_encode_single", "scale"]
2
3
from typing import Any, Optional
4
5
import numpy as np
6
import pandas as pd
7
from numpy.typing import ArrayLike
8
from sklearn.preprocessing import scale as standardize
9
10
from move.core.typing import BoolArray, FloatArray, IntArray
11
12
13
def _category_name(value: Any) -> str:
    """Return the mapping key used for a category.

    Strings are returned unchanged; any other value is converted with
    ``int`` and rendered as a string (e.g., ``2.0`` becomes ``"2"``).
    """
    return value if isinstance(value, str) else str(int(value))


def one_hot_encode(x_: ArrayLike) -> tuple[IntArray, dict[str, int]]:
    """One-hot encode a matrix with samples in its rows and features in its
    columns. Columns share number of classes.

    Missing values (as detected by ``pandas.isna``) are not treated as a
    class: they are left out of the mapping and encoded as all-zero vectors.

    Args:
        x_: a 1D or 2D matrix, can be numerical or contain strings

    Returns:
        A 3D one-hot encoded matrix (extra dim corresponds to number of
        classes) and a mapping between classes and corresponding codes
    """
    x: np.ndarray = np.copy(x_)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    shape = x.shape
    # Locate missing values *before* the string cast below: once cast, they
    # become ordinary text (e.g., "nan") that sorts among the real categories
    # and can no longer be removed by position.
    is_valid = ~np.asarray(pd.isna(x), dtype=bool).ravel()
    if x.dtype == object:
        x = x.astype(str)
    # Only valid entries take part in category discovery, so no NaN class is
    # ever created and nothing has to be stripped afterwards.
    categories, codes = np.unique(x.ravel()[is_valid], return_inverse=True)
    num_classes = len(categories)
    encoded_x = np.zeros((x.size, num_classes), dtype=np.uint8)
    # Index with the codes as returned by NumPy; narrowing them to uint8 would
    # wrap around as soon as there are more than 256 classes.
    encoded_x[np.flatnonzero(is_valid), codes.ravel()] = 1
    encoded_x = encoded_x.reshape(*shape, num_classes)
    mapping = {
        _category_name(category): code for code, category in enumerate(categories)
    }
    return encoded_x, mapping
48
49
50
def one_hot_encode_single(mapping: dict[str, int], value: Optional[str]) -> IntArray:
    """Encode one category as a one-hot row using an existing mapping.

    Args:
        mapping: category-to-code lookup dictionary
        value: category to encode; a missing value yields an all-zero row

    Returns:
        2D integer array of shape ``(1, len(mapping))``

    Raises:
        KeyError: if ``str(value)`` is not a key of ``mapping``
    """
    num_classes = len(mapping)
    row = np.zeros((1, num_classes), dtype=int)
    if pd.isna(value):
        # Missing category: leave every position unset.
        return row
    row[0, mapping[str(value)]] = 1
    return row
65
66
67
def scale(x: np.ndarray, log2: bool = False) -> tuple[FloatArray, BoolArray]:
    """Standardize features to zero mean and unit variance, replacing NaN
    values in the result with 0.

    Args:
        x: 2D array with samples in its rows and features in its columns
        log2: whether to transform the values with ``log2(x + 1)`` before
            scaling

    Returns:
        Tuple containing (1) scaled output, restricted to the columns kept by
        the mask, and (2) a 1D mask marking columns (i.e., features) without
        zero variance
    """
    values = np.log2(x + 1) if log2 else x
    # NaN-ignoring spread per feature; (near-)constant features are dropped.
    column_std = np.nanstd(values, axis=0)
    mask_1d = ~np.isclose(column_std, 0.0)
    kept_columns = values[:, mask_1d]
    scaled_x = standardize(kept_columns, axis=0)
    nan_entries = np.isnan(scaled_x)
    scaled_x[nan_entries] = 0
    return scaled_x, mask_1d
84
85
86
def feature_stats(x: ArrayLike) -> tuple[FloatArray, FloatArray, FloatArray]:
    """Compute the minimum, maximum and standard deviation of each column
    (feature) of an array of continuous values, ignoring NaN values.

    Args:
        x: 2D array with samples in its rows and features in its columns

    Returns:
        Tuple containing (1) the minimum, (2) the maximum, and (3) the
        standard deviation per feature (column)
    """
    # Same reduction along the sample axis for each statistic.
    reducers = (np.nanmin, np.nanmax, np.nanstd)
    minimum, maximum, std = (reduce(x, axis=0) for reduce in reducers)
    return minimum, maximum, std