[e5f1db]: / tests / preprocessing / test_balanced_sampling.py

Download this file

63 lines (49 with data), 2.2 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from pathlib import Path
import pytest
import ehrapy as ep
from ehrapy.io._read import read_csv
from tests.conftest import TEST_DATA_PATH
# Directory that contains this test module.
CURRENT_DIR = Path(__file__).parent
@pytest.fixture
def adata_mini():
    """Read ``encode/dataset1.csv`` from the test data directory.

    ``clinic_day`` is passed as an obs-only column so the tests can use it
    as the grouping key via ``adata.obs``.
    """
    dataset_path = f"{TEST_DATA_PATH}/encode/dataset1.csv"
    obs_only_columns = ["clinic_day"]
    return read_csv(dataset_path, columns_obs_only=obs_only_columns)
def test_balanced_sampling_basic(adata_mini):
    """Check argument validation and both resampling methods of ``ep.pp.balanced_sample``.

    Covers:
    * a missing ``key`` is expected to raise ``TypeError``;
    * an unknown ``key`` or unknown ``method`` is expected to raise ``ValueError``;
    * ``RandomUnderSampler`` is expected to yield 4 observations and
      ``RandomOverSampler`` 8, with every ``clinic_day`` group equally sized;
    * with ``copy=False`` the call is expected to return ``None`` and modify
      the passed object instead.
    """
    group_key = "clinic_day"
    # (sampling method, number of observations expected after balancing)
    expectations = (("RandomUnderSampler", 4), ("RandomOverSampler", 8))

    def check_balanced(adata, expected_n_obs):
        # Balanced means the smallest and largest group have the same size.
        assert adata.n_obs == expected_n_obs
        group_sizes = adata.obs.clinic_day.value_counts()
        assert group_sizes.min() == group_sizes.max()

    # no key
    with pytest.raises(TypeError):
        ep.pp.balanced_sample(adata_mini)

    # invalid key
    with pytest.raises(ValueError):
        ep.pp.balanced_sample(adata_mini, key="non_existing_column")

    # invalid method
    with pytest.raises(ValueError):
        ep.pp.balanced_sample(adata_mini, key=group_key, method="non_existing_method")

    # copy=True: the resampled data is returned as a new object
    for sampler, expected_n_obs in expectations:
        resampled = ep.pp.balanced_sample(adata_mini, key=group_key, method=sampler, copy=True)
        check_balanced(resampled, expected_n_obs)

    # copy=False: work on a private copy so the shared fixture object is not the one modified
    for sampler, expected_n_obs in expectations:
        working_copy = adata_mini.copy()
        result = ep.pp.balanced_sample(working_copy, key=group_key, method=sampler, copy=False)
        assert result is None
        check_balanced(working_copy, expected_n_obs)