|
a |
|
b/src/move/data/preprocessing.py |
|
|
1 |
__all__ = ["one_hot_encode", "one_hot_encode_single", "scale"] |
|
|
2 |
|
|
|
3 |
from typing import Any, Optional |
|
|
4 |
|
|
|
5 |
import numpy as np |
|
|
6 |
import pandas as pd |
|
|
7 |
from numpy.typing import ArrayLike |
|
|
8 |
from sklearn.preprocessing import scale as standardize |
|
|
9 |
|
|
|
10 |
from move.core.typing import BoolArray, FloatArray, IntArray |
|
|
11 |
|
|
|
12 |
|
|
|
13 |
def _category_name(value: Any) -> str:
    """Return the mapping key used for a category value.

    Strings are returned unchanged; any other value is converted to an
    integer first, so a numeric category such as ``2.0`` is named ``"2"``.
    """
    return value if isinstance(value, str) else str(int(value))


def one_hot_encode(x_: ArrayLike) -> tuple[IntArray, dict[str, int]]:
    """One-hot encode a matrix with samples in its rows and features in its
    columns. Columns share number of classes.

    Missing values (as detected by ``pandas.isna``) are not treated as a
    class: they are encoded as an all-zero vector and do not appear in the
    returned mapping.

    Args:
        x_: a 1D or 2D matrix, can be numerical or contain strings

    Returns:
        A 3D one-hot encoded matrix (extra dim corresponds to number of
        classes) and a mapping between classes and corresponding codes
    """
    x: np.ndarray = np.copy(x_)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    shape = x.shape
    # Locate missing entries *before* the string conversion below: afterwards
    # NaN becomes the ordinary string "nan", which does not necessarily sort
    # last among the categories and so cannot be dropped by position.
    is_na = np.asarray(pd.isna(x)).ravel()
    if x.dtype == object:
        x = x.astype(str)
    flat_x = x.ravel()
    # Only non-missing entries contribute categories, so no NaN column is ever
    # created and nothing has to be trimmed afterwards.
    categories, codes = np.unique(flat_x[~is_na], return_inverse=True)
    num_classes = len(categories)
    encoded_x = np.zeros((x.size, num_classes), dtype=np.uint8)
    # Codes keep their native integer dtype; narrowing them to uint8 would
    # wrap around as soon as there are more than 256 classes.
    encoded_x[np.flatnonzero(~is_na), codes.ravel()] = 1
    encoded_x = encoded_x.reshape(*shape, num_classes)
    mapping = {
        _category_name(category): code for code, category in enumerate(categories)
    }
    return encoded_x, mapping
|
|
48 |
|
|
|
49 |
|
|
|
50 |
def one_hot_encode_single(mapping: dict[str, int], value: Optional[str]) -> IntArray:
    """Build the one-hot row for one value using an existing mapping.

    Args:
        mapping: category-to-code lookup dictionary
        value: category to encode; a missing value (as detected by
            ``pandas.isna``) yields an all-zero row

    Returns:
        2D integer array with a single row and one column per mapping entry

    Raises:
        KeyError: if ``str(value)`` is not a key of the mapping
    """
    row = np.zeros((1, len(mapping)), dtype=int)
    if pd.isna(value):
        # Missing values have no class, so the row stays all zeros.
        return row
    column = mapping[str(value)]
    row[0, column] = 1
    return row
|
|
65 |
|
|
|
66 |
|
|
|
67 |
def scale(x: np.ndarray, log2: bool = False) -> tuple[FloatArray, BoolArray]:
    """Center to mean and scale to unit variance. Convert NaN values to 0.

    Columns whose NaN-ignoring standard deviation is close to zero are
    dropped before scaling.

    Args:
        x: 2D array with samples in its rows and features in its columns
        log2: if True, transform the input with ``log2(x + 1)`` before scaling

    Returns:
        Tuple containing (1) scaled output and (2) a 1D mask marking columns
        (i.e., features) without zero variance
    """
    values = np.log2(x + 1) if log2 else x
    column_std = np.nanstd(values, axis=0)
    mask_1d = ~np.isclose(column_std, 0.0)
    scaled_x = standardize(values[:, mask_1d], axis=0)
    # Entries that are still NaN after standardization are set to zero.
    nan_entries = np.isnan(scaled_x)
    scaled_x[nan_entries] = 0
    return scaled_x, mask_1d
|
|
84 |
|
|
|
85 |
|
|
|
86 |
def feature_stats(x: ArrayLike) -> tuple[FloatArray, FloatArray, FloatArray]:
    """Compute per-feature summary statistics of continuous values.

    All three statistics are taken along the first axis and ignore NaN
    entries.

    Args:
        x: 2D array with samples in its rows and features in its columns

    Returns:
        Tuple containing, per feature (column): (1) the minimum, (2) the
        maximum, and (3) the standard deviation
    """
    reducers = (np.nanmin, np.nanmax, np.nanstd)
    lowest, highest, spread = (reduce(x, axis=0) for reduce in reducers)
    return lowest, highest, spread