"""Model comparison visualizations."""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from nestkit.plotting._style import _apply_axis_limits, _get_ax
if TYPE_CHECKING:
from matplotlib.axes import Axes
[docs]
def plot_comparison(
    comparator,
    metric: str,
    threshold: str = "default",
    point_alpha: float = 0.6,
    line_alpha: float = 0.15,
    ylim: tuple[float, float] | None = None,
    ax=None,
    **kwargs,
) -> Axes:
    """Paired box-and-strip plot of per-fold scores across models.

    One box is drawn per model in ``comparator._results``. Each individual
    score is overlaid as a marker, and scores sharing the same fold index
    are connected across models by a faint line.

    Parameters
    ----------
    comparator : ModelComparator
        Fitted model comparator containing two or more result sets.
    metric : str
        Name of the scoring metric to compare.
    threshold : {'default', 'optimized'}, optional
        Which threshold's scores to use.
    point_alpha : float, optional
        Opacity of individual fold score markers.
    line_alpha : float, optional
        Opacity of lines connecting paired observations.
    ylim : tuple of float or None, optional
        Explicit y-axis limits, handed to ``_apply_axis_limits``.
    ax : matplotlib.axes.Axes or None, optional
        Axes to plot on. If ``None``, a new figure is created.
    **kwargs
        Additional keyword arguments forwarded to ``Axes.boxplot``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes with the plot.

    Notes
    -----
    If the models do not all have the same number of scores, connecting
    lines are drawn only for the fold indices present in every model.
    """
    ax = _get_ax(ax)
    models = list(comparator._results.keys())
    all_scores = [
        comparator._get_scores(name, metric, threshold) for name in models
    ]
    # boxplot places the i-th box at x = i + 1 by default.
    positions = list(range(1, len(models) + 1))

    ax.boxplot(all_scores, **kwargs)
    # Label the boxes through the axes API instead of boxplot's ``labels=``
    # keyword, which was renamed to ``tick_labels`` in Matplotlib 3.9; this
    # form behaves the same on old and new releases.
    ax.set_xticks(positions)
    ax.set_xticklabels(models)

    for pos, scores in zip(positions, all_scores):
        ax.scatter(
            np.full(len(scores), pos), scores, alpha=point_alpha, zorder=3
        )

    # zip(*...) yields one tuple of scores per fold index and stops at the
    # shortest model, so ragged or empty inputs cannot raise IndexError.
    for fold_scores in zip(*all_scores):
        ax.plot(positions, fold_scores, "k-", alpha=line_alpha)

    ax.set_ylabel(metric)
    ax.set_title(f"Model Comparison: {metric}")
    _apply_axis_limits(ax, ylim=ylim)
    return ax
[docs]
def plot_score_differences(
    comparator,
    metric: str,
    model_a: str,
    model_b: str,
    threshold: str = "default",
    bar_alpha: float = 0.7,
    bar_color: str | None = None,
    ylim: tuple[float, float] | None = None,
    ax=None,
    **kwargs,
) -> Axes:
    """Per-fold score differences between two models.

    Draws one bar per fold with height ``score_a - score_b``, a thin
    reference line at zero and a dashed line at the mean difference.

    Parameters
    ----------
    comparator : ModelComparator
        Fitted model comparator containing the two models.
    metric : str
        Name of the scoring metric to compare.
    model_a, model_b : str
        Names of the two models to compare.
    threshold : {'default', 'optimized'}, optional
        Which threshold's scores to use.
    bar_alpha : float, optional
        Opacity of the bars.
    bar_color : str or None, optional
        Color of the bars. ``None`` uses the default color cycle.
    ylim : tuple of float or None, optional
        Explicit y-axis limits, handed to ``_apply_axis_limits``.
    ax : matplotlib.axes.Axes or None, optional
        Axes to plot on. If ``None``, a new figure is created.
    **kwargs
        Additional keyword arguments forwarded to ``Axes.bar``. They take
        precedence over ``bar_alpha`` / ``bar_color`` on a name clash.

    Returns
    -------
    matplotlib.axes.Axes
        The axes with the plot.

    Raises
    ------
    ValueError
        If the two models do not have the same number of scores.
    """
    ax = _get_ax(ax)
    # Coerce to arrays so the subtraction is element-wise even when the
    # comparator hands back plain sequences.
    scores_a = np.asarray(comparator._get_scores(model_a, metric, threshold))
    scores_b = np.asarray(comparator._get_scores(model_b, metric, threshold))
    if scores_a.shape != scores_b.shape:
        # Without this check a length-1 input would silently broadcast.
        raise ValueError(
            f"Cannot pair scores of {model_a!r} and {model_b!r}: "
            f"shapes {scores_a.shape} and {scores_b.shape} differ"
        )
    diffs = scores_a - scores_b
    mean_diff = np.mean(diffs)

    bar_kw = {"alpha": bar_alpha}
    if bar_color is not None:
        bar_kw["color"] = bar_color
    bar_kw.update(kwargs)
    ax.bar(range(len(diffs)), diffs, **bar_kw)

    ax.axhline(0, color="black", linestyle="-", linewidth=0.5)
    ax.axhline(
        mean_diff, color="red", linestyle="--", label=f"Mean={mean_diff:.4f}"
    )
    ax.set_xlabel("Fold")
    ax.set_ylabel(f"Score diff ({model_a} - {model_b})")
    ax.set_title(f"Score Differences: {metric}")
    ax.legend()
    _apply_axis_limits(ax, ylim=ylim, full_range=False)
    return ax
[docs]
def plot_bayesian_posterior(
    comparator,
    metric: str,
    model_a: str,
    model_b: str,
    rope: float = 0.01,
    threshold: str = "default",
    color_a: str = "blue",
    color_b: str = "red",
    color_rope: str = "gray",
    fill_alpha: float = 0.3,
    ax=None,
    **kwargs,
) -> Axes:
    """Posterior distribution of score differences with ROPE.

    Plots a Student-t density for the difference ``model_a - model_b``,
    centred on the mean per-fold difference with ``n - 1`` degrees of
    freedom. The scale is the sample standard deviation multiplied by
    ``sqrt(1 / n + n_test / n_train)``, where the average test and train
    fold sizes are taken from the first result set in the comparator. The
    areas right of ``+rope``, left of ``-rope`` and in between are shaded
    and labelled with the probabilities returned by
    ``comparator.bayesian_comparison``.

    Parameters
    ----------
    comparator : ModelComparator
        Fitted model comparator containing the two models.
    metric : str
        Name of the scoring metric to compare.
    model_a, model_b : str
        Names of the two models to compare.
    rope : float, optional
        Half-width of the Region of Practical Equivalence.
    threshold : {'default', 'optimized'}, optional
        Which threshold's scores to use.
    color_a : str, optional
        Fill color for the "A is better" region.
    color_b : str, optional
        Fill color for the "B is better" region.
    color_rope : str, optional
        Fill color for the equivalence region.
    fill_alpha : float, optional
        Opacity of all filled regions.
    ax : matplotlib.axes.Axes or None, optional
        Axes to plot on. If ``None``, a new figure is created.
    **kwargs
        Additional keyword arguments forwarded to the ``Axes.plot`` call
        that draws the density curve.

    Returns
    -------
    matplotlib.axes.Axes
        The axes with the plot.
    """
    ax = _get_ax(ax)
    result = comparator.bayesian_comparison(
        metric, model_a, model_b, rope, threshold
    )
    scores_a = np.asarray(comparator._get_scores(model_a, metric, threshold))
    scores_b = np.asarray(comparator._get_scores(model_b, metric, threshold))
    diffs = scores_a - scores_b

    # Imported lazily so scipy is only needed when this plot is requested.
    from scipy.stats import t as t_dist

    n = len(diffs)
    mean = np.mean(diffs)
    sd = np.std(diffs, ddof=1)

    # Fold sizes come from the first stored result set.
    ref_results = next(iter(comparator._results.values()))
    n_test = np.mean([len(fr.test_indices) for fr in ref_results.fold_results_])
    n_train = np.mean(
        [len(fr.train_indices) for fr in ref_results.fold_results_]
    )
    # The n_test / n_train term widens the plain 1/n standard error.
    se = sd * np.sqrt((1.0 / n) + (n_test / n_train))

    lower, upper = mean - 4 * se, mean + 4 * se
    x = np.linspace(lower, upper, 200)
    # fill_between only shades between consecutive grid points that both
    # satisfy ``where``. Put the ROPE edges on the grid so the three regions
    # meet exactly and a ROPE narrower than one grid step is still drawn.
    edges = np.array([-rope, rope], dtype=float)
    x = np.union1d(x, edges[(edges > lower) & (edges < upper)])
    pdf = t_dist.pdf(x, df=n - 1, loc=mean, scale=se)

    ax.plot(x, pdf, "k-", **kwargs)
    # Inclusive comparisons make each outer region share its boundary point
    # with the ROPE band, so no unshaded sliver is left between them.
    ax.fill_between(
        x,
        pdf,
        where=(x >= rope),
        alpha=fill_alpha,
        color=color_a,
        label=f"P({model_a} better)={result['p_a_better']:.3f}",
    )
    ax.fill_between(
        x,
        pdf,
        where=(x <= -rope),
        alpha=fill_alpha,
        color=color_b,
        label=f"P({model_b} better)={result['p_b_better']:.3f}",
    )
    ax.fill_between(
        x,
        pdf,
        where=(np.abs(x) <= rope),
        alpha=fill_alpha,
        color=color_rope,
        label=f"P(equiv)={result['p_equivalent']:.3f}",
    )
    ax.axvline(0, color="black", linestyle="--", alpha=0.5)
    ax.set_xlabel(f"Score difference ({model_a} - {model_b})")
    ax.set_ylabel("Density")
    ax.set_title(f"Bayesian Comparison: {metric}")
    ax.legend(fontsize=7)
    return ax
[docs]
def plot_critical_difference(
    comparator,
    metric: str,
    threshold: str = "default",
    bar_alpha: float = 0.7,
    bar_color: str | None = None,
    ax=None,
    **kwargs,
) -> Axes:
    """Average-rank chart in the spirit of a Demsar critical difference diagram.

    Models are ranked within every outer fold (rank 1 = highest score, ties
    share the average rank). The mean rank of each model is drawn as a
    horizontal bar, sorted by mean rank, on an inverted x-axis. No
    critical-difference interval is computed or drawn.

    Parameters
    ----------
    comparator : ModelComparator
        Fitted model comparator containing three or more result sets.
    metric : str
        Name of the scoring metric to rank.
    threshold : {'default', 'optimized'}, optional
        Which threshold's scores to use.
    bar_alpha : float, optional
        Opacity of the bars.
    bar_color : str or None, optional
        Color of the bars. ``None`` uses the default color cycle.
    ax : matplotlib.axes.Axes or None, optional
        Axes to plot on. If ``None``, a new figure is created.
    **kwargs
        Additional keyword arguments forwarded to ``Axes.barh``. They take
        precedence over ``bar_alpha`` / ``bar_color`` on a name clash.

    Returns
    -------
    matplotlib.axes.Axes
        The axes with the plot. With fewer than three models only a
        "Need >= 3 models" message is drawn.
    """
    ax = _get_ax(ax)
    models = list(comparator._results.keys())
    if len(models) < 3:
        # Axes-fraction coordinates keep the message centred whatever data
        # limits the supplied axes already has.
        ax.text(
            0.5,
            0.5,
            "Need >= 3 models",
            ha="center",
            va="center",
            transform=ax.transAxes,
        )
        return ax

    n_folds = comparator._results[models[0]].n_outer_folds_
    scores_by_model = {
        name: comparator._get_scores(name, metric, threshold)
        for name in models
    }

    from scipy.stats import rankdata

    ranks_by_model = {}
    for fold_idx in range(n_folds):
        fold_scores = np.array(
            [scores_by_model[name][fold_idx] for name in models]
        )
        # Negate so the highest score receives rank 1.
        for name, rank in zip(models, rankdata(-fold_scores)):
            ranks_by_model.setdefault(name, []).append(rank)

    mean_ranks = {
        name: np.mean(ranks) for name, ranks in ranks_by_model.items()
    }
    sorted_models = sorted(mean_ranks, key=mean_ranks.get)
    y_pos = np.arange(len(sorted_models))

    bar_kw = {"alpha": bar_alpha}
    if bar_color is not None:
        bar_kw["color"] = bar_color
    bar_kw.update(kwargs)
    ax.barh(y_pos, [mean_ranks[name] for name in sorted_models], **bar_kw)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(sorted_models)
    ax.set_xlabel("Average rank")
    ax.set_title(f"Critical Difference Diagram: {metric}")
    # invert_xaxis() toggles, so only flip when not already inverted;
    # otherwise drawing twice on the same axes would undo the inversion.
    if not ax.xaxis_inverted():
        ax.invert_xaxis()
    return ax