Data: Tabular Time Series Specialty: Endocrinology Laboratory: Blood Tests EHR: Demographics Diagnoses Medications Omics: Genomics Multi-omics Transcriptomics Wearable: Activity Clinical Purpose: Treatment Response Assessment Task: Biomarker Discovery
[c23b31]: / src / move / visualization / feature_importance.py

Download this file

188 lines (164 with data), 6.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
__all__ = ["plot_categorical_feature_importance", "plot_continuous_feature_importance"]
import matplotlib
import matplotlib.figure
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.cm import ScalarMappable
from matplotlib.colors import TwoSlopeNorm
from move.core.typing import FloatArray
from move.visualization.style import ( # color_cycle,
DEFAULT_DIVERGING_PALETTE,
DEFAULT_PLOT_STYLE,
DEFAULT_QUALITATIVE_PALETTE,
style_settings,
)
def plot_categorical_feature_importance(
    diffs: FloatArray,
    feature_values: FloatArray,
    feature_names: list[str],
    feature_mapping: dict[str, int],
    style: str = DEFAULT_PLOT_STYLE,
    colormap: str = DEFAULT_QUALITATIVE_PALETTE,
) -> matplotlib.figure.Figure:
    """Plot a beeswarm displaying the top ten categorical features, based on
    their impact on the latent space when perturbed.

    Features are ranked by the sum of their absolute differences over all
    samples; at most ten are shown. Entries whose values sum to zero along the
    category axis are treated as missing and left out of the plot. The
    category of every remaining entry is the index of its largest value along
    the category axis.

    Args:
        diffs:
            Impact caused by perturbing each feature, a 2D array (`num_samples`
            x `num_features`).
        feature_values:
            Values of the features, a 3D array (`num_samples` x `num_features`
            x `num_categories`).
        feature_names:
            Names of the features.
        feature_mapping:
            Mapping feature values to category names. Each legend label (the
            category index rendered as text) that is a key of this mapping is
            replaced by the corresponding value.
        style:
            Name of style to apply to the plot.
        colormap:
            Name of colormap to apply to the legend.

    Raises:
        ValueError:
            If inputs do not have expected number of dimensions or have
            ill-matched shapes.

    Returns:
        Figure
    """
    if feature_values.ndim != 3:
        raise ValueError("Expected feature values to have three dimensions.")
    if diffs.ndim != 2:
        raise ValueError("Expected differences to have two dimensions.")
    # Compare the leading (samples x features) axes only, so that an empty
    # category axis is reported as a shape problem rather than an IndexError
    if feature_values.shape[:2] != diffs.shape:
        raise ValueError("Feature values and differences shapes do not match.")

    # Select top 10 absolute sum difference
    top10_ids = np.argsort(np.sum(np.abs(diffs), axis=0))[::-1][:10]

    # Force figure aspect ratio to 1:1 or 2:1 (if less than 5 features)
    width: float = max(matplotlib.rcParams["figure.figsize"])
    figsize = (width, width / 2) if top10_ids.size < 5 else (width, width)

    # All flattened vectors below are laid out feature by feature: first every
    # sample of the top feature, then every sample of the second one, etc.
    is_nan = (feature_values.sum(axis=2) == 0)[:, top10_ids].T.ravel()
    keep = ~is_nan
    categories = np.argmax(feature_values, axis=2)  # 3D => 2D

    num_samples = diffs.shape[0]
    # One feature name per plotted point, in the same layout as above
    perturbed_features = np.repeat(np.take(feature_names, top10_ids), num_samples)

    data = pd.DataFrame(
        dict(
            x=diffs.T[top10_ids, :].ravel()[keep],
            y=perturbed_features[keep],
            category=categories.T[top10_ids, :].ravel()[keep],
        )
    )

    with style_settings(style):
        fig, ax = plt.subplots(figsize=figsize)
        sns.stripplot(
            data=data, x="x", y="y", hue="category", size=1, ax=ax, palette=colormap
        )
        ax.set(xlabel="Impact on latent space", ylabel="Feature")
        # Fix labels in legend. No legend may have been drawn (e.g., nothing
        # left to plot), in which case there is nothing to relabel.
        legend = ax.get_legend()
        if legend is not None:
            for text in legend.get_texts():
                code = text.get_text()
                if code in feature_mapping:
                    text.set_text(feature_mapping[code])
    return fig
def plot_continuous_feature_importance(
    diffs: FloatArray,
    feature_values: FloatArray,
    feature_names: list[str],
    style: str = DEFAULT_PLOT_STYLE,
    colormap: str = DEFAULT_DIVERGING_PALETTE,
) -> matplotlib.figure.Figure:
    """Plot a beeswarm displaying the top ten continuous features, based on
    their impact on the latent space when perturbed.

    Features are ranked by the sum of their absolute differences over all
    samples; at most ten are shown. Entries whose feature value equals zero
    are treated as missing and left out of the plot. Points are colored by
    their feature value using a diverging scale centered at zero, which is
    also shown as a colorbar.

    Args:
        diffs:
            Impact caused by perturbing each feature, a 2D array (`num_samples`
            x `num_features`).
        feature_values:
            Values of the features, a 2D array (`num_samples` x `num_features`).
        feature_names:
            Names of the features.
        style:
            Name of style to apply to the plot.
        colormap:
            Name of colormap to apply to the colorbar.

    Raises:
        ValueError:
            If inputs do not have two dimensions or have ill-matched shapes.

    Returns:
        Figure
    """
    if diffs.ndim != 2:
        raise ValueError("Expected differences to have two dimensions.")
    if feature_values.ndim != 2:
        raise ValueError("Expected feature values to have two dimensions.")
    if feature_values.shape != diffs.shape:
        raise ValueError("Feature values and differences shapes do not match.")

    # Select top 10 absolute sum difference
    top10_ids = np.argsort(np.sum(np.abs(diffs), axis=0))[::-1][:10]

    # Force figure aspect ratio to 1:1 or 2:1 (if less than 5 features)
    width: float = max(matplotlib.rcParams["figure.figsize"])
    figsize = (width, width / 2) if top10_ids.size < 5 else (width, width)

    # All flattened vectors below are laid out feature by feature: first every
    # sample of the top feature, then every sample of the second one, etc.
    is_nan = (feature_values == 0)[:, top10_ids].T.ravel()
    keep = ~is_nan

    num_samples = diffs.shape[0]
    # One feature name per plotted point, in the same layout as above
    perturbed_features = np.repeat(np.take(feature_names, top10_ids), num_samples)

    data = pd.DataFrame(
        dict(
            x=diffs.T[top10_ids, :].ravel()[keep],
            y=perturbed_features[keep],
            value=feature_values.T[top10_ids, :].ravel()[keep],
        )
    )

    # To obtain a colormap, we map the feature values to 25 discrete categories
    # using the two-slope norm. We then assign one color to each category
    # using the scalar mappable.
    num_bins = 25  # must be odd, so the middle bin sits on the center (zero)
    half = num_bins // 2

    vmin, vmax = data["value"].min(), data["value"].max()
    # The two-slope norm needs vmin < 0 < vmax. If all plotted values share the
    # same sign, mirror the populated side instead of letting the norm raise.
    if vmin >= 0.0:
        vmin = -vmax
    elif vmax <= 0.0:
        vmax = -vmin

    norm = TwoSlopeNorm(0.0, vmin, vmax)
    sm = ScalarMappable(norm, colormap)

    # The norm maps values to [0, 1], so the maximum would land in bin
    # `num_bins`; clip it into the last available bin.
    bins = np.ma.compressed(norm(data["value"]) * num_bins).astype(int)
    data["category"] = np.clip(bins, 0, num_bins - 1)

    colors = np.empty((num_bins, 4))  # 25 colors x 4 channels
    colors[: half + 1, :] = sm.to_rgba(np.linspace(vmin, 0, half + 1))  # first slope
    colors[half:, :] = sm.to_rgba(np.linspace(0, vmax, half + 1))  # second slope
    # Key the colors by bin: a plain list would be assigned positionally to
    # the bins that happen to be present, shifting colors if any bin is empty.
    palette = {bin_id: tuple(rgba) for bin_id, rgba in enumerate(colors.tolist())}

    with style_settings(style):
        fig, ax = plt.subplots(figsize=figsize)
        sns.stripplot(
            data=data, x="x", y="y", hue="category", ax=ax, palette=palette, size=2
        )
        ax.set(xlabel="Impact on latent space", ylabel="Feature")
        # Add colormap (the per-bin legend is replaced by a colorbar)
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        cbar = fig.colorbar(sm, ax=ax)
        cbar.ax.set(ylabel="Feature value")
    return fig