"""Statistical model comparison for nested cross-validation results.
This module provides :class:`NestedCVComparator`, which performs
statistically rigorous pairwise and multi-model comparisons using
corrected paired t-tests, Bayesian tests, and Holm--Bonferroni
multiple-comparison correction.
References
----------
.. [1] Nadeau, C. and Bengio, Y. (2003). "Inference for the
Generalization Error." *Machine Learning*, 52(3), 239--281.
.. [2] Benavoli, A., Corani, G., Demsar, J., and Zaffalon, M. (2017).
"Time for a Change: a Tutorial for Comparing Multiple Classifiers
Through Bayesian Analysis." *JMLR*, 18(77), 1--36.
.. [3] Demsar, J. (2006). "Statistical Comparisons of Classifiers over
Multiple Data Sets." *JMLR*, 7, 1--30.
"""
from __future__ import annotations
from itertools import combinations
import numpy as np
import pandas as pd
from scipy.stats import t as t_dist
from nestkit.comparison.statistical_tests import (
bayesian_correlated_ttest,
holm_bonferroni_correction,
nadeau_bengio_corrected_ttest,
)
from nestkit.results._base import _BaseNestedCVResults
[docs]
class NestedCVComparator:
"""Statistically rigorous comparison of nested cross-validation results.
Provides corrected paired t-tests (Nadeau & Bengio, 2003), Bayesian
correlated t-tests (Benavoli et al., 2017), and Holm--Bonferroni
multiple-comparison correction (Demsar, 2006) for comparing two or
more models that were evaluated on **identical** outer folds.
All registered models must share the same outer-fold split indices;
this is validated automatically when a new model is added.
Attributes
----------
_results : dict[str, _BaseNestedCVResults]
Mapping from model name to its nested CV results object.
Examples
--------
>>> comparator = NestedCVComparator() # doctest: +SKIP
>>> comparator.add("rf", rf_results) # doctest: +SKIP
>>> comparator.add("svm", svm_results) # doctest: +SKIP
>>> comparator.summary("accuracy") # doctest: +SKIP
See Also
--------
nestkit.comparison.statistical_tests.nadeau_bengio_corrected_ttest
nestkit.comparison.statistical_tests.bayesian_correlated_ttest
References
----------
.. [1] Nadeau, C. and Bengio, Y. (2003). "Inference for the
Generalization Error." *Machine Learning*, 52(3), 239--281.
.. [2] Benavoli, A. et al. (2017). *JMLR*, 18(77), 1--36.
.. [3] Demsar, J. (2006). *JMLR*, 7, 1--30.
"""
def __init__(self):
self._results: dict[str, _BaseNestedCVResults] = {}
[docs]
def add(self, name: str, results: _BaseNestedCVResults) -> None:
"""Register a model's nested cross-validation results.
Parameters
----------
name : str
Unique human-readable identifier for the model (e.g.,
``"random_forest"``).
results : _BaseNestedCVResults
Fitted nested CV results object. Must contain per-fold
``test_indices`` that match every previously registered model.
Raises
------
ValueError
If the outer-fold structure of *results* does not match that
of models already registered.
See Also
--------
_validate_fold_alignment : Alignment check called internally.
"""
if self._results:
self._validate_fold_alignment(name, results)
self._results[name] = results
def _validate_fold_alignment(self, name: str, new_results: _BaseNestedCVResults) -> None:
"""Verify that outer-fold indices match exactly.
Compares the number of outer folds and the test-set indices of
every fold against the first registered model.
Parameters
----------
name : str
Name of the model being added.
new_results : _BaseNestedCVResults
Nested CV results to validate.
Raises
------
ValueError
If the number of outer folds or any fold's test indices
differ from the reference model.
"""
ref_name, ref_results = next(iter(self._results.items()))
if new_results.n_outer_folds_ != ref_results.n_outer_folds_:
raise ValueError(
f"Model '{name}' has {new_results.n_outer_folds_} outer folds, "
f"but '{ref_name}' has {ref_results.n_outer_folds_}. "
f"All models must use identical outer_cv splits."
)
for k in range(ref_results.n_outer_folds_):
ref_test = ref_results.fold_results_[k].test_indices
new_test = new_results.fold_results_[k].test_indices
if not np.array_equal(ref_test, new_test):
raise ValueError(
f"Outer fold {k}: test indices for '{name}' do not match "
f"'{ref_name}'. All models must be evaluated on identical "
f"outer folds for valid comparison."
)
def _get_scores(self, model: str, metric: str, threshold: str = "default") -> np.ndarray:
"""Extract per-fold scores for a registered model.
Parameters
----------
model : str
Name of the registered model.
metric : str
Scoring metric key (e.g., ``"accuracy"``, ``"roc_auc"``).
threshold : {"default", "optimized"}, default="default"
Whether to use default or threshold-optimized outer scores.
Returns
-------
numpy.ndarray
1-D array of shape ``(n_outer_folds,)`` with per-fold scores.
Raises
------
ValueError
If *threshold* is ``"optimized"`` but the model has no
optimized scores.
KeyError
If *model* was not previously registered via :meth:`add`.
"""
results = self._results[model]
if threshold == "optimized":
if not hasattr(results, "outer_scores_optimized_"):
raise ValueError(f"Model '{model}' has no optimized scores.")
return results.outer_scores_optimized_[metric].values
return results.outer_scores_default_[metric].values
[docs]
def summary(self, metric: str, threshold: str = "default") -> pd.DataFrame:
"""Produce a side-by-side summary table of all registered models.
For each model the table includes the mean, standard deviation,
median, 95 % confidence interval (Nadeau--Bengio corrected,
t-distribution), min, max, and inter-quartile range of the
outer-fold scores.
Parameters
----------
metric : str
Scoring metric key to summarise.
threshold : {"default", "optimized"}, default="default"
Which score variant to use.
Returns
-------
pandas.DataFrame
One row per model with columns ``model``, ``mean``, ``std``,
``median``, ``ci_lower``, ``ci_upper``, ``min``, ``max``,
``iqr``.
Examples
--------
>>> comparator.summary("roc_auc") # doctest: +SKIP
"""
rows = []
for name in self._results:
scores = self._get_scores(name, metric, threshold)
mean = float(np.mean(scores))
std = float(np.std(scores, ddof=1))
n = len(scores)
# Nadeau-Bengio corrected CI using t-distribution
results_obj = self._results[name]
n_test = np.mean([len(fr.test_indices) for fr in results_obj.fold_results_])
n_train = np.mean([len(fr.train_indices) for fr in results_obj.fold_results_])
correction = np.sqrt((1.0 / n) + (n_test / n_train))
t_crit = float(t_dist.ppf(0.975, df=n - 1))
ci_half = t_crit * correction * std
rows.append(
{
"model": name,
"mean": mean,
"std": std,
"median": float(np.median(scores)),
"ci_lower": mean - ci_half,
"ci_upper": mean + ci_half,
"min": float(np.min(scores)),
"max": float(np.max(scores)),
"iqr": float(np.subtract(*np.percentile(scores, [75, 25]))),
}
)
return pd.DataFrame(rows)
[docs]
def corrected_paired_ttest(
self, metric: str, model_a: str, model_b: str, threshold: str = "default"
) -> dict:
"""Perform the Nadeau--Bengio corrected paired t-test.
Accounts for the non-independence of cross-validation fold
scores caused by overlapping training sets.
Parameters
----------
metric : str
Scoring metric key.
model_a : str
Name of the first model.
model_b : str
Name of the second model.
threshold : {"default", "optimized"}, default="default"
Which score variant to use.
Returns
-------
dict
Test results including ``t_statistic``, ``p_value``,
``mean_difference``, ``corrected_std``, ``ci_lower``,
``ci_upper``, ``n_folds``, ``significant_at_005``,
``significant_at_001``.
See Also
--------
nestkit.comparison.statistical_tests.nadeau_bengio_corrected_ttest
References
----------
.. [1] Nadeau, C. and Bengio, Y. (2003). *Machine Learning*,
52(3), 239--281.
"""
scores_a = self._get_scores(model_a, metric, threshold)
scores_b = self._get_scores(model_b, metric, threshold)
folds = self._results[model_a].fold_results_
n_test = np.mean([len(fr.test_indices) for fr in folds])
n_train = np.mean([len(fr.train_indices) for fr in folds])
return nadeau_bengio_corrected_ttest(scores_a, scores_b, n_train, n_test)
[docs]
def pairwise_corrected_ttest(self, metric: str, threshold: str = "default") -> pd.DataFrame:
"""Run corrected paired t-tests for every model pair.
All ``C(n, 2)`` pairwise Nadeau--Bengio tests are performed and
then adjusted for multiple comparisons via the step-down
Holm--Bonferroni procedure.
Parameters
----------
metric : str
Scoring metric key.
threshold : {"default", "optimized"}, default="default"
Which score variant to use.
Returns
-------
pandas.DataFrame
One row per pair with columns ``model_a``, ``model_b``,
all keys from :func:`nadeau_bengio_corrected_ttest`, and
``p_value_corrected``.
See Also
--------
corrected_paired_ttest : Single-pair test.
nestkit.comparison.statistical_tests.holm_bonferroni_correction
References
----------
.. [1] Demsar, J. (2006). *JMLR*, 7, 1--30.
"""
models = list(self._results.keys())
rows = []
p_values = []
for a, b in combinations(models, 2):
result = self.corrected_paired_ttest(metric, a, b, threshold)
rows.append({"model_a": a, "model_b": b, **result})
p_values.append(result["p_value"])
if rows:
corrected = holm_bonferroni_correction(p_values)
for i, row in enumerate(rows):
row["p_value_corrected"] = corrected[i]
return pd.DataFrame(rows)
[docs]
def bayesian_comparison(
self,
metric: str,
model_a: str,
model_b: str,
rope: float = 0.01,
threshold: str = "default",
) -> dict:
"""Perform a Bayesian correlated t-test between two models.
Uses a Student-t posterior over the mean score difference and
partitions the probability mass into three regions: model A
better, practically equivalent (within the ROPE), and model B
better.
Parameters
----------
metric : str
Scoring metric key.
model_a : str
Name of the first model.
model_b : str
Name of the second model.
rope : float, default=0.01
Half-width of the Region of Practical Equivalence.
threshold : {"default", "optimized"}, default="default"
Which score variant to use.
Returns
-------
dict
Posterior probabilities and diagnostics: ``p_a_better``,
``p_equivalent``, ``p_b_better``, ``rope``,
``mean_difference``, ``hdi_lower``, ``hdi_upper``.
See Also
--------
nestkit.comparison.statistical_tests.bayesian_correlated_ttest
References
----------
.. [1] Benavoli, A. et al. (2017). *JMLR*, 18(77), 1--36.
"""
scores_a = self._get_scores(model_a, metric, threshold)
scores_b = self._get_scores(model_b, metric, threshold)
folds = self._results[model_a].fold_results_
n_test = np.mean([len(fr.test_indices) for fr in folds])
n_train = np.mean([len(fr.train_indices) for fr in folds])
return bayesian_correlated_ttest(scores_a, scores_b, n_train, n_test, rope)
[docs]
def rank_models(self, metric: str, threshold: str = "default") -> pd.DataFrame:
"""Rank all registered models by mean outer-fold score.
Returns the same summary table as :meth:`summary` sorted in
descending order of ``mean`` with an additional ``rank`` column
(1 = best).
Parameters
----------
metric : str
Scoring metric key.
threshold : {"default", "optimized"}, default="default"
Which score variant to use.
Returns
-------
pandas.DataFrame
Sorted summary table with an extra ``rank`` column.
See Also
--------
summary : Unsorted summary table.
"""
summary = self.summary(metric, threshold)
summary = summary.sort_values("mean", ascending=False).reset_index(drop=True)
summary["rank"] = range(1, len(summary) + 1)
return summary