[e5f1db]: / tests / preprocessing / test_bias.py

Download this file

111 lines (91 with data), 4.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
import numpy as np
import pandas as pd
import pytest
import ehrapy as ep
from ehrapy.anndata._constants import CATEGORICAL_TAG, FEATURE_TYPE_KEY, NUMERIC_TAG
@pytest.fixture
def adata(rng):
    """Build the 100-row AnnData object shared by the bias-detection tests.

    Columns, in order:

    * ``corr1`` / ``corr2`` / ``corr3``: one random integer vector, the same
      vector doubled, and the same vector negated.
    * ``contin1``: 50 draws from ``[0, 20)`` followed by 50 draws from
      ``[20, 40)``.
    * ``cat1``: 50 zeros followed by 50 ones.
    * ``cat2``: values 10 and 11 in runs of 10, 40, 30 and 20 rows.

    The first four variables are tagged numeric and the last two categorical.

    Args:
        rng: Random generator fixture exposing ``integers``; defined outside
            this file (presumably a NumPy ``Generator`` -- confirm in conftest).

    Returns:
        The object produced by ``ep.ad.df_to_anndata`` with feature types set.
    """
    # The three draws happen in the same order as the columns consume them, so
    # a seeded generator yields the same data as building everything inline.
    base = rng.integers(0, 100, 100)
    lower_half = rng.integers(0, 20, 50).tolist()
    upper_half = rng.integers(20, 40, 50).tolist()

    cat2_runs = ((10, 10), (11, 40), (10, 30), (11, 20))  # (value, repeat count)
    columns = {
        "corr1": base,
        "corr2": base * 2,
        "corr3": base * -1,
        "contin1": lower_half + upper_half,
        "cat1": [group for group in (0, 1) for _ in range(50)],
        "cat2": [value for value, count in cat2_runs for _ in range(count)],
    }

    result = ep.ad.df_to_anndata(pd.DataFrame(columns))
    # Feature types are assigned positionally: four numeric, then two categorical.
    feature_types = [NUMERIC_TAG] * 4
    feature_types += [CATEGORICAL_TAG] * 2
    result.var[FEATURE_TYPE_KEY] = feature_types
    return result
def test_detect_bias_all_sensitive_features(adata):
    """Check every result table of ``detect_bias`` when all features are sensitive.

    Covers the ``feature_correlations``, ``standardized_mean_differences``,
    ``categorical_value_counts`` and ``feature_importances`` entries of the
    returned mapping.

    Computed floating-point values (correlation coefficients and group
    percentages) are compared with ``pytest.approx`` rather than ``==`` so
    that representation noise cannot fail the test.

    Args:
        adata: The synthetic dataset provided by the ``adata`` fixture.
    """

    def _cell(frame, conditions, column):
        """Return ``column`` from the first row of ``frame`` matching all ``conditions``."""
        mask = pd.Series(True, index=frame.index)
        for key, expected in conditions.items():
            mask &= frame[key] == expected
        # Raises IndexError when nothing matches, which fails the test loudly.
        return frame[mask][column].values[0]

    results = ep.pp.detect_bias(
        adata, "all", run_feature_importances=True, corr_method="spearman", feature_importance_threshold=0.4
    )

    # --- Feature correlations -------------------------------------------
    assert "feature_correlations" in results
    df = results["feature_correlations"]
    assert len(df) == 4
    cc_key = "Spearman CC"
    assert _cell(df, {"Feature 1": "corr1", "Feature 2": "corr2"}, cc_key) == pytest.approx(1)
    assert _cell(df, {"Feature 1": "corr1", "Feature 2": "corr3"}, cc_key) == pytest.approx(-1)
    assert _cell(df, {"Feature 1": "corr2", "Feature 2": "corr3"}, cc_key) == pytest.approx(-1)
    assert _cell(df, {"Feature 1": "contin1", "Feature 2": "cat1"}, cc_key) > 0.5

    # --- Standardized mean differences ----------------------------------
    assert "standardized_mean_differences" in results
    df = results["standardized_mean_differences"]
    assert len(df) == 4  # Both groups of cat1, cat2 respectively show a high SMD with contin1
    smd_key = "Standardized Mean Difference"
    assert _cell(df, {"Sensitive Feature": "cat1", "Sensitive Group": 0}, smd_key) < 0
    assert _cell(df, {"Sensitive Feature": "cat1", "Sensitive Group": 1}, smd_key) > 0
    assert _cell(df, {"Sensitive Feature": "cat2", "Sensitive Group": 10}, smd_key) > 0
    assert _cell(df, {"Sensitive Feature": "cat2", "Sensitive Group": 11}, smd_key) < 0

    # --- Categorical value counts ---------------------------------------
    assert "categorical_value_counts" in results
    df = results["categorical_value_counts"]
    assert len(df) == 2
    cat1_group0 = {"Sensitive Feature": "cat1", "Sensitive Group": 0}
    cat2_group10 = {"Sensitive Feature": "cat2", "Sensitive Group": 10}
    assert _cell(df, cat1_group0, "Group 1 Percentage") == pytest.approx(0.2)
    assert _cell(df, cat1_group0, "Group 2 Percentage") == pytest.approx(0.8)
    assert _cell(df, cat2_group10, "Group 1 Percentage") == pytest.approx(0.25)
    assert _cell(df, cat2_group10, "Group 2 Percentage") == pytest.approx(0.75)

    # --- Feature importances --------------------------------------------
    assert "feature_importances" in results
    df = results["feature_importances"]
    assert len(df) >= 7  # 6 for the pairwise correlating features and one/two for contin1, which predicts cat1
def test_detect_bias_specified_sensitive_features(adata):
    """Check ``detect_bias`` restricted to ``contin1`` and ``cat1`` with ``copy=True``.

    With ``copy=True`` the call returns a ``(results, adata_copy)`` pair; the
    test asserts that the ``"smd"`` entry is written to the copy's ``uns`` and
    not to the input object, then inspects each result table.

    Group percentages are compared with ``pytest.approx`` rather than ``==``
    so that floating-point representation noise cannot fail the test.

    Args:
        adata: The synthetic dataset provided by the ``adata`` fixture.
    """
    results, result_adata = ep.pp.detect_bias(
        adata,
        ["contin1", "cat1"],
        run_feature_importances=True,
        corr_method="spearman",
        feature_importance_threshold=0.5,
        prediction_confidence_threshold=0.4,
        copy=True,
    )

    # The input object must stay untouched; only the returned copy is annotated.
    assert "smd" not in adata.uns
    assert "smd" in result_adata.uns

    # --- Feature correlations -------------------------------------------
    assert "feature_correlations" in results
    df = results["feature_correlations"]
    assert len(df) == 2  # cat1 & contin1 and contin1 & cat1
    assert np.all(df["Spearman CC"] > 0.5)

    # --- Standardized mean differences ----------------------------------
    assert "standardized_mean_differences" in results
    df = results["standardized_mean_differences"]
    assert len(df) == 2  # Both groups of cat1 show a high SMD with contin1
    smd_key = "Standardized Mean Difference"
    is_cat1 = df["Sensitive Feature"] == "cat1"
    assert df[is_cat1 & (df["Sensitive Group"] == 0)][smd_key].values[0] < -1
    assert df[is_cat1 & (df["Sensitive Group"] == 1)][smd_key].values[0] > 1

    # --- Categorical value counts ---------------------------------------
    assert "categorical_value_counts" in results
    df = results["categorical_value_counts"]
    assert len(df) == 1
    cat1_group0 = df[(df["Sensitive Feature"] == "cat1") & (df["Sensitive Group"] == 0)]
    assert cat1_group0["Group 1 Percentage"].values[0] == pytest.approx(0.2)
    assert cat1_group0["Group 2 Percentage"].values[0] == pytest.approx(0.8)

    # --- Feature importances --------------------------------------------
    assert "feature_importances" in results
    df = results["feature_importances"]
    assert len(df) == 2  # contin1 predicts cat1 and cat1 predicts contin1
def test_unencoded_data():
    """``detect_bias`` must raise ``ValueError`` when a categorical column holds raw strings.

    The frame pairs a string-valued column (``Unencoded``) with an integer
    column (``Encoded``); both are tagged categorical before the call.
    """
    frame = pd.DataFrame(
        {
            "Unencoded": ["A", "B", "C", "D", "E", "F"],
            "Encoded": [1, 2, 3, 4, 5, 6],
        }
    )
    unencoded_adata = ep.ad.df_to_anndata(frame)
    unencoded_adata.var[FEATURE_TYPE_KEY] = [CATEGORICAL_TAG, CATEGORICAL_TAG]

    with pytest.raises(ValueError):
        ep.pp.detect_bias(unencoded_adata, "all")