Data: Tabular Time Series Specialty: Endocrinology Laboratory: Blood Tests EHR: Demographics Diagnoses Medications Omics: Genomics Multi-omics Transcriptomics Wearable: Activity Clinical Purpose: Treatment Response Assessment Task: Biomarker Discovery
[c23b31]: / src / move / data / preprocessing.py

Download this file

104 lines (81 with data), 3.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
__all__ = ["one_hot_encode", "one_hot_encode_single", "scale"]
from typing import Any, Optional
import numpy as np
import pandas as pd
from numpy.typing import ArrayLike
from sklearn.preprocessing import scale as standardize
from move.core.typing import BoolArray, FloatArray, IntArray
def _category_name(value: Any) -> str:
    """Return the key used for a category in the class-to-code mapping.

    Strings are returned unchanged. Any other value is converted with
    ``int`` first and then ``str``, so a float category such as ``1.0``
    becomes ``"1"``.

    NOTE: because of the ``int`` conversion, non-integral numbers are
    truncated (``1.5`` also becomes ``"1"``).
    """
    return value if isinstance(value, str) else str(int(value))


def one_hot_encode(x_: ArrayLike) -> tuple[IntArray, dict[str, int]]:
    """One-hot encode a matrix with samples in its rows and features in its
    columns. Columns share number of classes.

    Missing entries (anything ``pandas.isna`` flags: NaN, None, ...) do not
    get a class of their own; they are encoded as an all-zero vector.

    Args:
        x_: a 1D or 2D matrix, can be numerical or contain strings

    Returns:
        A 3D one-hot encoded matrix (extra dim corresponds to number of
        classes) and a mapping between classes and corresponding codes
    """
    x: np.ndarray = np.copy(x_)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    shape = x.shape

    flat = x.ravel()
    is_na = np.asarray(pd.isna(flat), dtype=bool)

    # Only observed values take part in category discovery. Stringifying
    # the missing ones would turn them into "nan"/"None" categories whose
    # sort position among the real categories is arbitrary.
    observed = flat[~is_na]
    if observed.dtype == object:
        observed = observed.astype(str)
    categories, codes = np.unique(observed, return_inverse=True)
    num_classes = len(categories)

    encoded_x = np.zeros((flat.size, num_classes), dtype=np.uint8)
    # Index with the integer codes exactly as returned by np.unique:
    # narrowing them to uint8 would wrap around beyond 256 categories.
    encoded_x[np.flatnonzero(~is_na), codes] = 1
    encoded_x = encoded_x.reshape(*shape, num_classes)

    mapping = {
        _category_name(category): code for code, category in enumerate(categories)
    }
    return encoded_x, mapping
def one_hot_encode_single(mapping: dict[str, int], value: Optional[str]) -> IntArray:
    """Build the one-hot row for one value using an existing mapping.

    Args:
        mapping: category-to-code lookup dictionary
        value: category to encode; a missing value (as judged by
            ``pandas.isna``) yields a row of zeros

    Returns:
        2D integer array of shape ``(1, len(mapping))``

    Raises:
        KeyError: if ``str(value)`` is not a key of ``mapping``
    """
    one_hot = np.zeros((1, len(mapping)), dtype=int)
    if pd.isna(value):
        # Missing category: no position is switched on.
        return one_hot
    one_hot[0, mapping[str(value)]] = 1
    return one_hot
def scale(x: np.ndarray, log2: bool = False) -> tuple[FloatArray, BoolArray]:
    """Standardize each feature: center to mean and scale to unit variance.
    NaN values in the standardized output are replaced by 0.

    Args:
        x: 2D array with samples in its rows and features in its columns
        log2: if True, apply ``log2(x + 1)`` to the data before scaling

    Returns:
        Tuple containing (1) the scaled output, restricted to the columns
        selected by the mask, and (2) a 1D mask marking columns (i.e.,
        features) whose standard deviation is not close to zero
    """
    values = np.log2(x + 1) if log2 else x
    # Standard deviation per feature, ignoring NaNs.
    column_std = np.nanstd(values, axis=0)
    mask_1d = ~np.isclose(column_std, 0.0)
    standardized = standardize(values[:, mask_1d], axis=0)
    standardized[np.isnan(standardized)] = 0
    return standardized, mask_1d
def feature_stats(x: ArrayLike) -> tuple[FloatArray, FloatArray, FloatArray]:
    """Summarize an array of continuous values column by column.

    NaN entries are ignored when computing each statistic.

    Args:
        x: 2D array with samples in its rows and features in its columns

    Returns:
        Three arrays with one entry per feature (column), in this order:
        minimum, maximum and standard deviation
    """
    return (
        np.nanmin(x, axis=0),
        np.nanmax(x, axis=0),
        np.nanstd(x, axis=0),
    )