Source code for nestkit.comparison.comparator

"""Statistical model comparison for nested cross-validation results.

This module provides :class:`NestedCVComparator`, which performs
statistically rigorous pairwise and multi-model comparisons using
corrected paired t-tests, Bayesian tests, and Holm--Bonferroni
multiple-comparison correction.

References
----------
.. [1] Nadeau, C. and Bengio, Y. (2003). "Inference for the
       Generalization Error." *Machine Learning*, 52(3), 239--281.
.. [2] Benavoli, A., Corani, G., Demsar, J., and Zaffalon, M. (2017).
       "Time for a Change: a Tutorial for Comparing Multiple Classifiers
       Through Bayesian Analysis." *JMLR*, 18(77), 1--36.
.. [3] Demsar, J. (2006). "Statistical Comparisons of Classifiers over
       Multiple Data Sets." *JMLR*, 7, 1--30.
"""

from __future__ import annotations

from itertools import combinations

import numpy as np
import pandas as pd
from scipy.stats import t as t_dist

from nestkit.comparison.statistical_tests import (
    bayesian_correlated_ttest,
    holm_bonferroni_correction,
    nadeau_bengio_corrected_ttest,
)
from nestkit.results._base import _BaseNestedCVResults



[docs]
class NestedCVComparator:
    """Statistically rigorous comparison of nested cross-validation results.

    Provides corrected paired t-tests (Nadeau & Bengio, 2003), Bayesian
    correlated t-tests (Benavoli et al., 2017), and Holm--Bonferroni
    multiple-comparison correction (Demsar, 2006) for comparing two or
    more models that were evaluated on **identical** outer folds.

    All registered models must share the same outer-fold split indices;
    this is validated automatically when a new model is added.

    Attributes
    ----------
    _results : dict[str, _BaseNestedCVResults]
        Mapping from model name to its nested CV results object.

    Examples
    --------
    >>> comparator = NestedCVComparator()  # doctest: +SKIP
    >>> comparator.add("rf", rf_results)  # doctest: +SKIP
    >>> comparator.add("svm", svm_results)  # doctest: +SKIP
    >>> comparator.summary("accuracy")  # doctest: +SKIP

    See Also
    --------
    nestkit.comparison.statistical_tests.nadeau_bengio_corrected_ttest
    nestkit.comparison.statistical_tests.bayesian_correlated_ttest

    References
    ----------
    .. [1] Nadeau, C. and Bengio, Y. (2003). "Inference for the
           Generalization Error." *Machine Learning*, 52(3), 239--281.
    .. [2] Benavoli, A. et al. (2017). *JMLR*, 18(77), 1--36.
    .. [3] Demsar, J. (2006). *JMLR*, 7, 1--30.
    """

    def __init__(self):
        self._results: dict[str, _BaseNestedCVResults] = {}


[docs]
    def add(self, name: str, results: _BaseNestedCVResults) -> None:
        """Register a model's nested cross-validation results.

        Parameters
        ----------
        name : str
            Unique human-readable identifier for the model (e.g.,
            ``"random_forest"``).
        results : _BaseNestedCVResults
            Fitted nested CV results object.  Must contain per-fold
            ``test_indices`` that match every previously registered model.

        Raises
        ------
        ValueError
            If the outer-fold structure of *results* does not match that
            of models already registered.

        See Also
        --------
        _validate_fold_alignment : Alignment check called internally.
        """
        if self._results:
            self._validate_fold_alignment(name, results)
        self._results[name] = results


    def _validate_fold_alignment(self, name: str, new_results: _BaseNestedCVResults) -> None:
        """Verify that outer-fold indices match exactly.

        Compares the number of outer folds and the test-set indices of
        every fold against the first registered model.

        Parameters
        ----------
        name : str
            Name of the model being added.
        new_results : _BaseNestedCVResults
            Nested CV results to validate.

        Raises
        ------
        ValueError
            If the number of outer folds or any fold's test indices
            differ from the reference model.
        """
        ref_name, ref_results = next(iter(self._results.items()))

        if new_results.n_outer_folds_ != ref_results.n_outer_folds_:
            raise ValueError(
                f"Model '{name}' has {new_results.n_outer_folds_} outer folds, "
                f"but '{ref_name}' has {ref_results.n_outer_folds_}. "
                f"All models must use identical outer_cv splits."
            )

        for k in range(ref_results.n_outer_folds_):
            ref_test = ref_results.fold_results_[k].test_indices
            new_test = new_results.fold_results_[k].test_indices
            if not np.array_equal(ref_test, new_test):
                raise ValueError(
                    f"Outer fold {k}: test indices for '{name}' do not match "
                    f"'{ref_name}'. All models must be evaluated on identical "
                    f"outer folds for valid comparison."
                )

    def _get_scores(self, model: str, metric: str, threshold: str = "default") -> np.ndarray:
        """Extract per-fold scores for a registered model.

        Parameters
        ----------
        model : str
            Name of the registered model.
        metric : str
            Scoring metric key (e.g., ``"accuracy"``, ``"roc_auc"``).
        threshold : {"default", "optimized"}, default="default"
            Whether to use default or threshold-optimized outer scores.

        Returns
        -------
        numpy.ndarray
            1-D array of shape ``(n_outer_folds,)`` with per-fold scores.

        Raises
        ------
        ValueError
            If *threshold* is ``"optimized"`` but the model has no
            optimized scores.
        KeyError
            If *model* was not previously registered via :meth:`add`.
        """
        results = self._results[model]
        if threshold == "optimized":
            if not hasattr(results, "outer_scores_optimized_"):
                raise ValueError(f"Model '{model}' has no optimized scores.")
            return results.outer_scores_optimized_[metric].values
        return results.outer_scores_default_[metric].values


[docs]
    def summary(self, metric: str, threshold: str = "default") -> pd.DataFrame:
        """Produce a side-by-side summary table of all registered models.

        For each model the table includes the mean, standard deviation,
        median, 95 % confidence interval (Nadeau--Bengio corrected,
        t-distribution), min, max, and inter-quartile range of the
        outer-fold scores.

        Parameters
        ----------
        metric : str
            Scoring metric key to summarise.
        threshold : {"default", "optimized"}, default="default"
            Which score variant to use.

        Returns
        -------
        pandas.DataFrame
            One row per model with columns ``model``, ``mean``, ``std``,
            ``median``, ``ci_lower``, ``ci_upper``, ``min``, ``max``,
            ``iqr``.

        Examples
        --------
        >>> comparator.summary("roc_auc")  # doctest: +SKIP
        """
        rows = []
        for name in self._results:
            scores = self._get_scores(name, metric, threshold)
            mean = float(np.mean(scores))
            std = float(np.std(scores, ddof=1))
            n = len(scores)

            # Nadeau-Bengio corrected CI using t-distribution
            results_obj = self._results[name]
            n_test = np.mean([len(fr.test_indices) for fr in results_obj.fold_results_])
            n_train = np.mean([len(fr.train_indices) for fr in results_obj.fold_results_])
            correction = np.sqrt((1.0 / n) + (n_test / n_train))
            t_crit = float(t_dist.ppf(0.975, df=n - 1))
            ci_half = t_crit * correction * std

            rows.append(
                {
                    "model": name,
                    "mean": mean,
                    "std": std,
                    "median": float(np.median(scores)),
                    "ci_lower": mean - ci_half,
                    "ci_upper": mean + ci_half,
                    "min": float(np.min(scores)),
                    "max": float(np.max(scores)),
                    "iqr": float(np.subtract(*np.percentile(scores, [75, 25]))),
                }
            )
        return pd.DataFrame(rows)



[docs]
    def corrected_paired_ttest(
        self, metric: str, model_a: str, model_b: str, threshold: str = "default"
    ) -> dict:
        """Perform the Nadeau--Bengio corrected paired t-test.

        Accounts for the non-independence of cross-validation fold
        scores caused by overlapping training sets.

        Parameters
        ----------
        metric : str
            Scoring metric key.
        model_a : str
            Name of the first model.
        model_b : str
            Name of the second model.
        threshold : {"default", "optimized"}, default="default"
            Which score variant to use.

        Returns
        -------
        dict
            Test results including ``t_statistic``, ``p_value``,
            ``mean_difference``, ``corrected_std``, ``ci_lower``,
            ``ci_upper``, ``n_folds``, ``significant_at_005``,
            ``significant_at_001``.

        See Also
        --------
        nestkit.comparison.statistical_tests.nadeau_bengio_corrected_ttest

        References
        ----------
        .. [1] Nadeau, C. and Bengio, Y. (2003). *Machine Learning*,
               52(3), 239--281.
        """
        scores_a = self._get_scores(model_a, metric, threshold)
        scores_b = self._get_scores(model_b, metric, threshold)
        folds = self._results[model_a].fold_results_
        n_test = np.mean([len(fr.test_indices) for fr in folds])
        n_train = np.mean([len(fr.train_indices) for fr in folds])
        return nadeau_bengio_corrected_ttest(scores_a, scores_b, n_train, n_test)



[docs]
    def pairwise_corrected_ttest(self, metric: str, threshold: str = "default") -> pd.DataFrame:
        """Run corrected paired t-tests for every model pair.

        All ``C(n, 2)`` pairwise Nadeau--Bengio tests are performed and
        then adjusted for multiple comparisons via the step-down
        Holm--Bonferroni procedure.

        Parameters
        ----------
        metric : str
            Scoring metric key.
        threshold : {"default", "optimized"}, default="default"
            Which score variant to use.

        Returns
        -------
        pandas.DataFrame
            One row per pair with columns ``model_a``, ``model_b``,
            all keys from :func:`nadeau_bengio_corrected_ttest`, and
            ``p_value_corrected``.

        See Also
        --------
        corrected_paired_ttest : Single-pair test.
        nestkit.comparison.statistical_tests.holm_bonferroni_correction

        References
        ----------
        .. [1] Demsar, J. (2006). *JMLR*, 7, 1--30.
        """
        models = list(self._results.keys())
        rows = []
        p_values = []

        for a, b in combinations(models, 2):
            result = self.corrected_paired_ttest(metric, a, b, threshold)
            rows.append({"model_a": a, "model_b": b, **result})
            p_values.append(result["p_value"])

        if rows:
            corrected = holm_bonferroni_correction(p_values)
            for i, row in enumerate(rows):
                row["p_value_corrected"] = corrected[i]

        return pd.DataFrame(rows)



[docs]
    def bayesian_comparison(
        self,
        metric: str,
        model_a: str,
        model_b: str,
        rope: float = 0.01,
        threshold: str = "default",
    ) -> dict:
        """Perform a Bayesian correlated t-test between two models.

        Uses a Student-t posterior over the mean score difference and
        partitions the probability mass into three regions: model A
        better, practically equivalent (within the ROPE), and model B
        better.

        Parameters
        ----------
        metric : str
            Scoring metric key.
        model_a : str
            Name of the first model.
        model_b : str
            Name of the second model.
        rope : float, default=0.01
            Half-width of the Region of Practical Equivalence.
        threshold : {"default", "optimized"}, default="default"
            Which score variant to use.

        Returns
        -------
        dict
            Posterior probabilities and diagnostics: ``p_a_better``,
            ``p_equivalent``, ``p_b_better``, ``rope``,
            ``mean_difference``, ``hdi_lower``, ``hdi_upper``.

        See Also
        --------
        nestkit.comparison.statistical_tests.bayesian_correlated_ttest

        References
        ----------
        .. [1] Benavoli, A. et al. (2017). *JMLR*, 18(77), 1--36.
        """
        scores_a = self._get_scores(model_a, metric, threshold)
        scores_b = self._get_scores(model_b, metric, threshold)
        folds = self._results[model_a].fold_results_
        n_test = np.mean([len(fr.test_indices) for fr in folds])
        n_train = np.mean([len(fr.train_indices) for fr in folds])
        return bayesian_correlated_ttest(scores_a, scores_b, n_train, n_test, rope)



[docs]
    def rank_models(self, metric: str, threshold: str = "default") -> pd.DataFrame:
        """Rank all registered models by mean outer-fold score.

        Returns the same summary table as :meth:`summary` sorted in
        descending order of ``mean`` with an additional ``rank`` column
        (1 = best).

        Parameters
        ----------
        metric : str
            Scoring metric key.
        threshold : {"default", "optimized"}, default="default"
            Which score variant to use.

        Returns
        -------
        pandas.DataFrame
            Sorted summary table with an extra ``rank`` column.

        See Also
        --------
        summary : Unsorted summary table.
        """
        summary = self.summary(metric, threshold)
        summary = summary.sort_values("mean", ascending=False).reset_index(drop=True)
        summary["rank"] = range(1, len(summary) + 1)
        return summary