Source code for nestkit.plotting.comparison

"""Model comparison visualizations."""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from nestkit.plotting._style import _apply_axis_limits, _get_ax

if TYPE_CHECKING:
    from matplotlib.axes import Axes


def plot_comparison(
    comparator,
    metric: str,
    threshold: str = "default",
    point_alpha: float = 0.6,
    line_alpha: float = 0.15,
    ylim: tuple[float, float] | None = None,
    ax=None,
    **kwargs,
) -> Axes:
    """Paired box-and-strip plot of per-fold scores across models.

    One box is drawn per model, the individual fold scores are overlaid as
    markers, and thin lines connect the scores that share a fold index.

    Parameters
    ----------
    comparator : ModelComparator
        Fitted model comparator containing two or more result sets. Model
        names are taken from ``comparator._results`` and scores are read
        with ``comparator._get_scores(name, metric, threshold)``.
    metric : str
        Name of the scoring metric to compare.
    threshold : {'default', 'optimized'}, optional
        Which threshold's scores to use.
    point_alpha : float, optional
        Opacity of individual fold score markers.
    line_alpha : float, optional
        Opacity of lines connecting paired observations.
    ylim : tuple of float or None, optional
        Explicit y-axis limits.
    ax : matplotlib.axes.Axes or None, optional
        Axes to plot on; resolved through ``_get_ax``.
    **kwargs
        Accepted but currently not forwarded to any matplotlib call.

    Returns
    -------
    matplotlib.axes.Axes
        The axes with the plot.

    Raises
    ------
    ValueError
        If the comparator holds no result sets.
    """
    ax = _get_ax(ax)

    models = list(comparator._results.keys())
    if not models:
        raise ValueError("comparator contains no results to plot")

    all_scores = [
        comparator._get_scores(name, metric, threshold) for name in models
    ]
    # Boxplots are placed at 1..N by default; reuse those positions for the
    # strip markers and the connecting lines.
    positions = list(range(1, len(models) + 1))

    # Set the tick labels explicitly instead of passing ``labels=`` to
    # ``boxplot``: that parameter was deprecated in Matplotlib 3.9 in favour
    # of ``tick_labels``, which older releases do not accept.
    ax.boxplot(all_scores)
    ax.set_xticks(positions)
    ax.set_xticklabels(models)

    for position, scores in zip(positions, all_scores):
        ax.scatter(
            np.full(len(scores), position),
            scores,
            alpha=point_alpha,
            zorder=3,
        )

    # Only fold indices present for every model can be connected; bounding
    # the loop by the shortest score vector avoids an IndexError when the
    # models do not all have the same number of folds.
    n_paired = min(len(scores) for scores in all_scores)
    for fold in range(n_paired):
        ax.plot(
            positions,
            [scores[fold] for scores in all_scores],
            "k-",
            alpha=line_alpha,
        )

    ax.set_ylabel(metric)
    ax.set_title(f"Model Comparison: {metric}")
    _apply_axis_limits(ax, ylim=ylim)
    return ax
def plot_score_differences(
    comparator,
    metric: str,
    model_a: str,
    model_b: str,
    threshold: str = "default",
    bar_alpha: float = 0.7,
    bar_color: str | None = None,
    ylim: tuple[float, float] | None = None,
    ax=None,
    **kwargs,
) -> Axes:
    """Bar chart of per-fold score differences between two models.

    Each bar is ``score(model_a) - score(model_b)`` for one fold. A solid
    black line marks zero and a dashed red line marks the mean difference,
    which is also reported in the legend.

    Parameters
    ----------
    comparator : ModelComparator
        Fitted model comparator containing the two models; scores are read
        with ``comparator._get_scores(name, metric, threshold)``.
    metric : str
        Name of the scoring metric to compare.
    model_a, model_b : str
        Names of the two models to compare.
    threshold : {'default', 'optimized'}, optional
        Which threshold's scores to use.
    bar_alpha : float, optional
        Opacity of the bars.
    bar_color : str or None, optional
        Color of the bars. ``None`` leaves the color to matplotlib.
    ylim : tuple of float or None, optional
        Explicit y-axis limits.
    ax : matplotlib.axes.Axes or None, optional
        Axes to plot on; resolved through ``_get_ax``.
    **kwargs
        Accepted but currently not forwarded to any matplotlib call.

    Returns
    -------
    matplotlib.axes.Axes
        The axes with the plot.
    """
    ax = _get_ax(ax)

    first = comparator._get_scores(model_a, metric, threshold)
    second = comparator._get_scores(model_b, metric, threshold)
    diffs = first - second
    mean_diff = np.mean(diffs)

    # Only pass ``color`` when the caller chose one, so matplotlib's own
    # default applies otherwise.
    bar_style = (
        {"alpha": bar_alpha}
        if bar_color is None
        else {"alpha": bar_alpha, "color": bar_color}
    )
    ax.bar(range(len(diffs)), diffs, **bar_style)

    # Reference lines: zero difference and the mean difference.
    ax.axhline(0, color="black", linestyle="-", linewidth=0.5)
    ax.axhline(
        mean_diff,
        color="red",
        linestyle="--",
        label=f"Mean={mean_diff:.4f}",
    )

    ax.set_xlabel("Fold")
    ax.set_ylabel(f"Score diff ({model_a} - {model_b})")
    ax.set_title(f"Score Differences: {metric}")
    ax.legend()
    _apply_axis_limits(ax, ylim=ylim, full_range=False)
    return ax
def plot_bayesian_posterior(
    comparator,
    metric: str,
    model_a: str,
    model_b: str,
    rope: float = 0.01,
    threshold: str = "default",
    color_a: str = "blue",
    color_b: str = "red",
    color_rope: str = "gray",
    fill_alpha: float = 0.3,
    ax=None,
    **kwargs,
) -> Axes:
    """Posterior distribution of score differences with ROPE.

    The curve is a Student-t density with ``n - 1`` degrees of freedom,
    centred on the mean per-fold difference ``model_a - model_b``. Its scale
    is the sample standard deviation multiplied by
    ``sqrt(1 / n + n_test / n_train)``, where ``n_test`` and ``n_train`` are
    the mean fold sizes of the first result set stored in the comparator.
    The three shaded regions are annotated with the probabilities returned
    by ``comparator.bayesian_comparison``.

    Parameters
    ----------
    comparator : ModelComparator
        Fitted model comparator containing the two models.
    metric : str
        Name of the scoring metric to compare.
    model_a, model_b : str
        Names of the two models to compare.
    rope : float, optional
        Half-width of the Region of Practical Equivalence.
    threshold : {'default', 'optimized'}, optional
        Which threshold's scores to use.
    color_a : str, optional
        Fill color for the region where the difference exceeds ``rope``.
    color_b : str, optional
        Fill color for the region where the difference is below ``-rope``.
    color_rope : str, optional
        Fill color for the equivalence region.
    fill_alpha : float, optional
        Opacity of all filled regions.
    ax : matplotlib.axes.Axes or None, optional
        Axes to plot on; resolved through ``_get_ax``.
    **kwargs
        Accepted but currently not forwarded to any matplotlib call.

    Returns
    -------
    matplotlib.axes.Axes
        The axes with the plot.
    """
    # Imported lazily so SciPy is only needed when this plot is requested.
    from scipy.stats import t as t_dist

    ax = _get_ax(ax)
    result = comparator.bayesian_comparison(
        metric, model_a, model_b, rope, threshold
    )

    scores_a = comparator._get_scores(model_a, metric, threshold)
    scores_b = comparator._get_scores(model_b, metric, threshold)
    diffs = scores_a - scores_b

    n = len(diffs)
    mean = np.mean(diffs)
    sd = np.std(diffs, ddof=1)

    # Fold sizes are taken from the first stored result set.
    ref_results = next(iter(comparator._results.values()))
    n_test = np.mean([len(fr.test_indices) for fr in ref_results.fold_results_])
    n_train = np.mean([len(fr.train_indices) for fr in ref_results.fold_results_])
    # The n_test / n_train term widens the scale beyond sd / sqrt(n).
    se = sd * np.sqrt((1.0 / n) + (n_test / n_train))

    x = np.linspace(mean - 4 * se, mean + 4 * se, 200)
    pdf = t_dist.pdf(x, df=n - 1, loc=mean, scale=se)
    ax.plot(x, pdf, "k-")

    # Legend labels name both models. The previous labels interpolated a
    # model's own name on the right-hand side ("P(A>{model_a})"), which
    # produced a meaningless comparison.
    ax.fill_between(
        x,
        pdf,
        where=(x > rope),
        alpha=fill_alpha,
        color=color_a,
        label=f"P({model_a}>{model_b})={result['p_a_better']:.3f}",
    )
    ax.fill_between(
        x,
        pdf,
        where=(x < -rope),
        alpha=fill_alpha,
        color=color_b,
        label=f"P({model_b}>{model_a})={result['p_b_better']:.3f}",
    )
    ax.fill_between(
        x,
        pdf,
        where=(np.abs(x) <= rope),
        alpha=fill_alpha,
        color=color_rope,
        label=f"P(equiv)={result['p_equivalent']:.3f}",
    )

    ax.axvline(0, color="black", linestyle="--", alpha=0.5)
    ax.set_xlabel(f"Score difference ({model_a} - {model_b})")
    ax.set_ylabel("Density")
    ax.set_title(f"Bayesian Comparison: {metric}")
    ax.legend(fontsize=7)
    return ax
def plot_critical_difference(
    comparator,
    metric: str,
    threshold: str = "default",
    bar_alpha: float = 0.7,
    bar_color: str | None = None,
    ax=None,
    **kwargs,
) -> Axes:
    """Horizontal bar chart of each model's average rank across folds.

    Within every fold the models are ranked by score, with rank 1 going to
    the highest score. The per-fold ranks are averaged per model and drawn
    as horizontal bars on an inverted x-axis. Only the mean ranks are
    drawn; no critical-difference interval is computed here.

    Parameters
    ----------
    comparator : ModelComparator
        Fitted model comparator containing three or more result sets.
    metric : str
        Name of the scoring metric to rank.
    threshold : {'default', 'optimized'}, optional
        Which threshold's scores to use.
    bar_alpha : float, optional
        Opacity of the bars.
    bar_color : str or None, optional
        Color of the bars. ``None`` leaves the color to matplotlib.
    ax : matplotlib.axes.Axes or None, optional
        Axes to plot on; resolved through ``_get_ax``.
    **kwargs
        Accepted but currently not forwarded to any matplotlib call.

    Returns
    -------
    matplotlib.axes.Axes
        The axes with the plot. With fewer than three models it contains
        only a "Need >= 3 models" placeholder message.
    """
    ax = _get_ax(ax)

    models = list(comparator._results.keys())
    if len(models) < 3:
        # Axes-fraction coordinates keep the message centred even when a
        # caller-supplied axes has data limits other than the default 0-1.
        ax.text(
            0.5,
            0.5,
            "Need >= 3 models",
            ha="center",
            va="center",
            transform=ax.transAxes,
        )
        return ax

    # Imported lazily so SciPy is only needed when ranks are computed.
    from scipy.stats import rankdata

    # The fold count is read from the first model's result set.
    n_folds = comparator._results[models[0]].n_outer_folds_
    scores_by_model = {
        name: comparator._get_scores(name, metric, threshold) for name in models
    }

    ranks_by_model = {}
    for fold_idx in range(n_folds):
        fold_scores = np.array([scores_by_model[m][fold_idx] for m in models])
        # Negate so that the highest score receives rank 1.
        fold_ranks = rankdata(-fold_scores)
        for name, rank in zip(models, fold_ranks):
            ranks_by_model.setdefault(name, []).append(rank)

    mean_ranks = {name: np.mean(ranks) for name, ranks in ranks_by_model.items()}
    sorted_models = sorted(mean_ranks, key=mean_ranks.get)
    y_pos = np.arange(len(sorted_models))

    # Only pass ``color`` when the caller chose one.
    bar_kw = {"alpha": bar_alpha}
    if bar_color is not None:
        bar_kw["color"] = bar_color

    ax.barh(y_pos, [mean_ranks[m] for m in sorted_models], **bar_kw)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(sorted_models)
    ax.set_xlabel("Average rank")
    ax.set_title(f"Critical Difference Diagram: {metric}")
    ax.invert_xaxis()
    return ax