MOVE / Git / [c23b31] /src/move/visualization/feature

Models:
AlyssaS/
MOVE
Downloads: 1
Data:
Tabular
Time Series Specialty:
Endocrinology Laboratory:
Blood Tests EHR:
Demographics
Diagnoses
Medications Omics:
Genomics
Multi-omics
Transcriptomics Wearable:
Activity Clinical Purpose:
Treatment Response Assessment Task:
Biomarker Discovery
[c23b31]: / src / move / visualization / feature_importance.py
History
Download this file
188 lines (164 with data), 6.6 kB

# Public API: the two feature-importance plotting functions defined below.
__all__ = ["plot_categorical_feature_importance", "plot_continuous_feature_importance"]

import matplotlib
import matplotlib.figure
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.cm import ScalarMappable
from matplotlib.colors import TwoSlopeNorm

from move.core.typing import FloatArray
from move.visualization.style import (  # color_cycle,
    DEFAULT_DIVERGING_PALETTE,
    DEFAULT_PLOT_STYLE,
    DEFAULT_QUALITATIVE_PALETTE,
    style_settings,
)


def plot_categorical_feature_importance(
    diffs: FloatArray,
    feature_values: FloatArray,
    feature_names: list[str],
    feature_mapping: dict[str, int],
    style: str = DEFAULT_PLOT_STYLE,
    colormap: str = DEFAULT_QUALITATIVE_PALETTE,
) -> matplotlib.figure.Figure:
    """Plot a beeswarm displaying the top ten categorical features, based on
    their impact on the latent space when perturbed.

    Features are ranked by the sum of their absolute differences over all
    samples. Samples whose category vector sums to zero for a feature are
    left out of the plot (presumably these encode missing values).

    Args:
        diffs:
            Impact caused by perturbing each feature, a 2D array (`num_samples`
            x `num_features`).
        feature_values:
            Values of the features, a 3D array (`num_samples` x `num_features`
            x `num_categories`).
        feature_names:
            Names of the features.
        feature_mapping:
            Mapping used to relabel the legend. Each legend entry whose text
            is a key of this mapping is replaced by the corresponding value.
        style:
            Name of style to apply to the plot.
        colormap:
            Name of colormap to apply to the legend.

    Raises:
        ValueError:
            If inputs do not have expected number of dimensions or have
            ill-matched shapes.

    Returns:
        Figure
    """

    if feature_values.ndim != 3:
        raise ValueError("Expected feature values to have three dimensions.")
    if diffs.ndim != 2:
        raise ValueError("Expected differences to have two dimensions.")
    # Compare the leading two axes directly; indexing the category axis would
    # raise an IndexError (not a ValueError) if that axis were empty.
    if feature_values.shape[:2] != diffs.shape:
        raise ValueError("Feature values and differences shapes do not match.")

    # Select top 10 absolute sum difference
    top10_ids = np.argsort(np.sum(np.abs(diffs), axis=0))[::-1][:10]

    # Force figure aspect ratio to 1:1 or 2:1 (if less than 5 features)
    width: float = max(matplotlib.rcParams["figure.figsize"])
    figsize = (width, width)
    if top10_ids.size < 5:
        figsize = (width, width / 2)

    # Mask of (feature, sample) pairs with an all-zero category vector,
    # flattened feature-major so it lines up with the columns built below.
    is_nan = (feature_values.sum(axis=2) == 0)[:, top10_ids].T.ravel()
    # Collapse the category axis into a single integer code per sample
    feature_values = np.argmax(feature_values, axis=2)  # 3D => 2D

    num_samples = diffs.shape[0]
    order = np.take(feature_names, top10_ids)
    # Repeat each feature name once per sample (feature-major order)
    perturbed_features = [name for name in order for _ in range(num_samples)]
    data = pd.DataFrame(
        dict(
            x=diffs.T[top10_ids, :].ravel()[~is_nan],
            y=np.compress(~is_nan, perturbed_features),
            category=feature_values.T[top10_ids, :].ravel()[~is_nan],
        )
    )
    with style_settings(style):
        fig, ax = plt.subplots(figsize=figsize)
        sns.stripplot(
            data=data, x="x", y="y", hue="category", size=1, ax=ax, palette=colormap
        )
        ax.set(xlabel="Impact on latent space", ylabel="Feature")
        # Fix labels in legend. The legend may be absent (e.g., nothing was
        # drawn), in which case there is nothing to relabel.
        legend = ax.get_legend()
        if legend is not None:
            for text in legend.get_texts():
                code = text.get_text()
                if code in feature_mapping:
                    text.set_text(feature_mapping[code])
    return fig


def plot_continuous_feature_importance(
    diffs: FloatArray,
    feature_values: FloatArray,
    feature_names: list[str],
    style: str = DEFAULT_PLOT_STYLE,
    colormap: str = DEFAULT_DIVERGING_PALETTE,
) -> matplotlib.figure.Figure:
    """Plot a beeswarm displaying the top ten continuous features, based on
    their impact on the latent space when perturbed.

    Features are ranked by the sum of their absolute differences over all
    samples. Points are colored by feature value using a diverging colormap
    centered at zero. Samples whose feature value is exactly zero are left
    out of the plot (presumably zero encodes a missing value).

    Args:
        diffs:
            Impact caused by perturbing each feature, a 2D array (`num_samples`
            x `num_features`).
        feature_values:
            Values of the features, a 2D array (`num_samples` x `num_features`).
        feature_names:
            Names of the features.
        style:
            Name of style to apply to the plot.
        colormap:
            Name of colormap to apply to the colorbar.

    Raises:
        ValueError:
            If inputs do not have two dimensions or have ill-matched shapes.

    Returns:
        Figure
    """
    if diffs.ndim != 2:
        raise ValueError("Expected differences to have two dimensions.")
    # Check dimensionality explicitly so a 1D input raises the documented
    # ValueError instead of an IndexError.
    if feature_values.ndim != 2:
        raise ValueError("Expected feature values to have two dimensions.")
    if feature_values.shape != diffs.shape:
        raise ValueError("Feature values and differences shapes do not match.")

    # Select top 10 absolute sum difference
    top10_ids = np.argsort(np.sum(np.abs(diffs), axis=0))[::-1][:10]

    # Force figure aspect ratio to 1:1 or 2:1 (if less than 5 features)
    width: float = max(matplotlib.rcParams["figure.figsize"])
    figsize = (width, width)
    if top10_ids.size < 5:
        figsize = (width, width / 2)

    # Mask of (feature, sample) pairs whose value is zero, flattened
    # feature-major so it lines up with the columns built below.
    is_nan = (feature_values == 0)[:, top10_ids].T.ravel()

    num_samples = diffs.shape[0]
    order = np.take(feature_names, top10_ids)
    # Repeat each feature name once per sample (feature-major order)
    perturbed_features = [name for name in order for _ in range(num_samples)]
    data = pd.DataFrame(
        dict(
            x=diffs.T[top10_ids, :].ravel()[~is_nan],
            y=np.compress(~is_nan, perturbed_features),
            value=feature_values.T[top10_ids, :].ravel()[~is_nan],
        )
    )

    # To obtain a colormap, we map the feature values to 25 discrete categories
    # using the two-slope norm. We then assign one color to each category
    # using the scalar mappable.
    num_bins = 25
    half = num_bins // 2  # index of the bin holding the center color
    vmin, vmax = data["value"].min(), data["value"].max()
    # The two-slope norm needs vmin < 0 < vmax. If the plotted values are all
    # on one side of zero (or there are none), mirror the available bound.
    if not vmin < 0.0:
        vmin = -vmax if vmax > 0.0 else -1.0
    if not vmax > 0.0:
        vmax = -vmin
    norm = TwoSlopeNorm(0.0, vmin, vmax)
    sm = ScalarMappable(norm, colormap)
    bins = np.ma.compressed(norm(data["value"]) * num_bins).astype(int)
    # The maximum value normalizes to exactly 1.0, which would land in a
    # non-existent bin; clip so every category has a color.
    data["category"] = np.clip(bins, 0, num_bins - 1)
    colors = np.empty((num_bins, 4))  # 25 colors x 4 channels
    colors[: half + 1, :] = sm.to_rgba(np.linspace(vmin, 0, half + 1))  # first slope
    colors[half:, :] = sm.to_rgba(np.linspace(0, vmax, num_bins - half))  # second slope
    # Key colors by bin index so that empty bins do not shift the colors
    # assigned to the remaining categories.
    palette = {index: tuple(rgba) for index, rgba in enumerate(colors.tolist())}

    with style_settings(style):
        fig, ax = plt.subplots(figsize=figsize)
        sns.stripplot(
            data=data, x="x", y="y", hue="category", ax=ax, palette=palette, size=2
        )
        ax.set(xlabel="Impact on latent space", ylabel="Feature")
        # Replace the per-category legend with a continuous colorbar
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        cbar = fig.colorbar(sm, ax=ax)
        cbar.ax.set(ylabel="Feature value")
    return fig