Source code for nestkit.calibration.diagnostics

"""Calibration diagnostic metrics.

Provides ``CalibrationDiagnostics``, a collection of static methods for
evaluating the quality of predicted probabilities before and after
post-hoc calibration.  Includes Expected Calibration Error (ECE),
Maximum Calibration Error (MCE), Brier score and its decomposition, and
reliability diagram data.
"""

from __future__ import annotations

from itertools import pairwise

import numpy as np
import pandas as pd

from nestkit._validation import extract_positive_proba


def _make_bins(p: np.ndarray, n_bins: int, strategy: str) -> np.ndarray:
    """Return bin edges for calibration diagnostics.

    Parameters
    ----------
    p : numpy.ndarray
        1-D array of predicted probabilities.
    n_bins : int
        Desired number of bins.
    strategy : {"quantile", "uniform"}
        ``"quantile"`` produces bins with approximately equal sample
        counts (adaptive binning).  ``"uniform"`` produces equal-width
        bins over [0, 1].

    Returns
    -------
    numpy.ndarray
        Sorted, unique bin edges.  For quantile bins the actual number
        of bins may be smaller than *n_bins* when many probabilities
        share the same value.
    """
    if strategy == "quantile":
        return np.unique(np.quantile(p, np.linspace(0, 1, n_bins + 1)))
    if strategy == "uniform":
        return np.linspace(0, 1, n_bins + 1)
    raise ValueError(f"Unknown binning strategy: {strategy!r}. Use 'quantile' or 'uniform'.")


def _bin_mask(p: np.ndarray, lo: float, hi: float, *, is_last: bool) -> np.ndarray:
    """Return a boolean mask selecting samples in the bin ``[lo, hi)`` (or ``[lo, hi]`` for the last bin)."""
    if is_last:
        return (p >= lo) & (p <= hi)
    return (p >= lo) & (p < hi)


[docs] class CalibrationDiagnostics: """Assess calibration quality before and after post-hoc calibration. All methods are static and operate on arrays of true labels and predicted probabilities. No fitting or state is required. See Also -------- nestkit.calibration.calibrators.PostHocCalibrator : Apply post-hoc calibration to predicted probabilities. Examples -------- >>> import numpy as np >>> from nestkit.calibration.diagnostics import CalibrationDiagnostics >>> y_true = np.array([0, 0, 1, 1, 1]) >>> y_proba = np.array([0.1, 0.3, 0.6, 0.8, 0.9]) >>> CalibrationDiagnostics.brier_score(y_true, y_proba) # doctest: +SKIP 0.042... """
[docs] @staticmethod def expected_calibration_error( y_true: np.ndarray, y_proba: np.ndarray, n_bins: int = 10, strategy: str = "quantile", ) -> float: """Compute the Expected Calibration Error (ECE). ECE is the weighted average of the absolute difference between observed accuracy and mean predicted confidence within each probability bin. Parameters ---------- y_true : numpy.ndarray of shape (n_samples,) True binary labels (0 or 1). y_proba : numpy.ndarray of shape (n_samples,) or (n_samples, 2) Predicted probabilities. If 2-D, the positive-class column is extracted. n_bins : int, default 10 Number of bins. strategy : {"quantile", "uniform"}, default "quantile" Binning strategy. ``"quantile"`` produces bins with approximately equal sample counts; ``"uniform"`` produces equal-width bins over [0, 1]. Returns ------- float Expected Calibration Error in [0, 1]. Notes ----- ECE is defined as: .. math:: \\text{ECE} = \\sum_{b=1}^{B} \\frac{n_b}{N} \\left| \\text{acc}(b) - \\text{conf}(b) \\right| where *B* is the number of bins, :math:`n_b` the number of samples in bin *b*, *N* the total number of samples, :math:`\\text{acc}(b)` the observed accuracy in bin *b*, and :math:`\\text{conf}(b)` the mean predicted probability in bin *b*. References ---------- .. [1] Naeini, M.P., Cooper, G.F., and Hauskrecht, M. (2015). "Obtaining Well Calibrated Probabilities Using Bayesian Binning into Quantiles." *AAAI*. Examples -------- >>> import numpy as np >>> from nestkit.calibration.diagnostics import CalibrationDiagnostics >>> y_true = np.array([0, 0, 1, 1]) >>> y_proba = np.array([0.2, 0.3, 0.7, 0.8]) >>> CalibrationDiagnostics.expected_calibration_error( ... y_true, y_proba, n_bins=5 ... ) # doctest: +SKIP 0.0... """ p = extract_positive_proba(y_proba) bins = _make_bins(p, n_bins, strategy) bin_pairs = list(pairwise(bins)) ece = 0.0 n_total = len(y_true) for i, (lo, hi) in enumerate(bin_pairs): mask = _bin_mask(p, lo, hi, is_last=(i == len(bin_pairs) - 1)) if mask.sum() == 0: continue bin_acc = y_true[mask].mean() bin_conf = p[mask].mean() ece += mask.sum() / n_total * abs(bin_acc - bin_conf) return float(ece)
[docs] @staticmethod def maximum_calibration_error( y_true: np.ndarray, y_proba: np.ndarray, n_bins: int = 10, strategy: str = "quantile", ) -> float: """Compute the Maximum Calibration Error (MCE). MCE is the worst-case (maximum) absolute difference between observed accuracy and mean predicted confidence across all bins. Parameters ---------- y_true : numpy.ndarray of shape (n_samples,) True binary labels (0 or 1). y_proba : numpy.ndarray of shape (n_samples,) or (n_samples, 2) Predicted probabilities. n_bins : int, default 10 Number of bins. strategy : {"quantile", "uniform"}, default "quantile" Binning strategy. ``"quantile"`` produces bins with approximately equal sample counts; ``"uniform"`` produces equal-width bins over [0, 1]. Returns ------- float Maximum Calibration Error in [0, 1]. Notes ----- MCE is defined as: .. math:: \\text{MCE} = \\max_{b \\in \\{1, \\ldots, B\\}} \\left| \\text{acc}(b) - \\text{conf}(b) \\right| This is useful for identifying the single worst-calibrated probability region. Examples -------- >>> import numpy as np >>> from nestkit.calibration.diagnostics import CalibrationDiagnostics >>> y_true = np.array([0, 0, 1, 1]) >>> y_proba = np.array([0.2, 0.3, 0.7, 0.8]) >>> CalibrationDiagnostics.maximum_calibration_error( ... y_true, y_proba, n_bins=5 ... ) # doctest: +SKIP 0.0... """ p = extract_positive_proba(y_proba) bins = _make_bins(p, n_bins, strategy) bin_pairs = list(pairwise(bins)) mce = 0.0 for i, (lo, hi) in enumerate(bin_pairs): mask = _bin_mask(p, lo, hi, is_last=(i == len(bin_pairs) - 1)) if mask.sum() == 0: continue bin_acc = y_true[mask].mean() bin_conf = p[mask].mean() mce = max(mce, abs(bin_acc - bin_conf)) return float(mce)
[docs] @staticmethod def brier_score(y_true: np.ndarray, y_proba: np.ndarray) -> float: """Compute the Brier score (mean squared error of probabilities). Parameters ---------- y_true : numpy.ndarray of shape (n_samples,) True binary labels (0 or 1). y_proba : numpy.ndarray of shape (n_samples,) or (n_samples, 2) Predicted probabilities. Returns ------- float Brier score in [0, 1]. Lower is better; 0 indicates perfect probabilistic predictions. Notes ----- The Brier score is defined as: .. math:: \\text{BS} = \\frac{1}{N} \\sum_{i=1}^{N} (p_i - y_i)^2 It can be decomposed into reliability, resolution, and uncertainty (see :meth:`brier_decomposition`). References ---------- .. [1] Brier, G.W. (1950). "Verification of Forecasts Expressed in Terms of Probability." *Monthly Weather Review*, 78(1), 1--3. Examples -------- >>> import numpy as np >>> from nestkit.calibration.diagnostics import CalibrationDiagnostics >>> CalibrationDiagnostics.brier_score( ... np.array([0, 1]), np.array([0.0, 1.0]) ... ) 0.0 See Also -------- brier_decomposition : Reliability--resolution--uncertainty decomposition of the Brier score. """ p = extract_positive_proba(y_proba) return float(np.mean((p - y_true) ** 2))
[docs] @staticmethod def brier_decomposition( y_true: np.ndarray, y_proba: np.ndarray, n_bins: int = 10, strategy: str = "quantile", ) -> dict: """Decompose the Brier score into reliability, resolution, and uncertainty. Parameters ---------- y_true : numpy.ndarray of shape (n_samples,) True binary labels (0 or 1). y_proba : numpy.ndarray of shape (n_samples,) or (n_samples, 2) Predicted probabilities. n_bins : int, default 10 Number of bins. strategy : {"quantile", "uniform"}, default "quantile" Binning strategy. ``"quantile"`` produces bins with approximately equal sample counts; ``"uniform"`` produces equal-width bins over [0, 1]. Returns ------- dict Dictionary with keys: * ``"reliability"`` -- Calibration component (lower is better). Measures how close the predicted probabilities are to the observed frequencies within each bin. * ``"resolution"`` -- Resolution component (higher is better). Measures how much the per-bin frequencies deviate from the overall base rate. * ``"uncertainty"`` -- Uncertainty component. Equal to ``p_bar * (1 - p_bar)`` where ``p_bar`` is the base rate. This is independent of the model. Notes ----- The decomposition satisfies: .. math:: \\text{BS} = \\text{Reliability} - \\text{Resolution} + \\text{Uncertainty} References ---------- .. [1] Murphy, A.H. (1973). "A New Vector Partition of the Probability Score." *Journal of Applied Meteorology*, 12(4), 595--600. Examples -------- >>> import numpy as np >>> from nestkit.calibration.diagnostics import CalibrationDiagnostics >>> decomp = CalibrationDiagnostics.brier_decomposition( ... np.array([0, 0, 1, 1]), ... np.array([0.1, 0.2, 0.8, 0.9]), ... ) # doctest: +SKIP >>> decomp.keys() # doctest: +SKIP dict_keys(['reliability', 'resolution', 'uncertainty']) See Also -------- brier_score : The scalar Brier score. """ p = extract_positive_proba(y_proba) n = len(y_true) climatology = y_true.mean() uncertainty = climatology * (1 - climatology) bins = _make_bins(p, n_bins, strategy) bin_pairs = list(pairwise(bins)) reliability = 0.0 resolution = 0.0 for i, (lo, hi) in enumerate(bin_pairs): mask = _bin_mask(p, lo, hi, is_last=(i == len(bin_pairs) - 1)) n_k = mask.sum() if n_k == 0: continue bin_acc = y_true[mask].mean() bin_conf = p[mask].mean() reliability += n_k / n * (bin_conf - bin_acc) ** 2 resolution += n_k / n * (bin_acc - climatology) ** 2 return { "reliability": float(reliability), "resolution": float(resolution), "uncertainty": float(uncertainty), }
[docs] @staticmethod def reliability_diagram_data( y_true: np.ndarray, y_proba: np.ndarray, n_bins: int = 10, strategy: str = "quantile", ) -> pd.DataFrame: """Compute binned data for reliability (calibration) diagrams. Returns a DataFrame with one row per bin, suitable for plotting a reliability diagram (mean predicted probability vs. observed fraction of positives). Parameters ---------- y_true : numpy.ndarray of shape (n_samples,) True binary labels (0 or 1). y_proba : numpy.ndarray of shape (n_samples,) or (n_samples, 2) Predicted probabilities. n_bins : int, default 10 Number of bins. strategy : {"quantile", "uniform"}, default "quantile" Binning strategy. ``"quantile"`` produces bins with approximately equal sample counts; ``"uniform"`` produces equal-width bins over [0, 1]. Returns ------- pandas.DataFrame DataFrame with columns: * ``bin_lower`` -- Lower edge of the bin. * ``bin_upper`` -- Upper edge of the bin. * ``bin_mid`` -- Midpoint of the bin. * ``mean_predicted`` -- Mean predicted probability in the bin (NaN if the bin is empty). * ``fraction_positive`` -- Observed fraction of positive samples in the bin (NaN if empty). * ``count`` -- Number of samples in the bin. Examples -------- >>> import numpy as np >>> from nestkit.calibration.diagnostics import CalibrationDiagnostics >>> df = CalibrationDiagnostics.reliability_diagram_data( ... np.array([0, 0, 1, 1]), ... np.array([0.1, 0.2, 0.8, 0.9]), ... ) # doctest: +SKIP >>> df.columns.tolist() # doctest: +SKIP ['bin_lower', 'bin_upper', 'bin_mid', 'mean_predicted', 'fraction_positive', 'count'] """ p = extract_positive_proba(y_proba) bins = _make_bins(p, n_bins, strategy) bin_pairs = list(pairwise(bins)) rows = [] for i, (lo, hi) in enumerate(bin_pairs): mask = _bin_mask(p, lo, hi, is_last=(i == len(bin_pairs) - 1)) n_k = mask.sum() if n_k == 0: rows.append( { "bin_lower": lo, "bin_upper": hi, "bin_mid": (lo + hi) / 2, "mean_predicted": np.nan, "fraction_positive": np.nan, "count": 0, } ) else: rows.append( { "bin_lower": lo, "bin_upper": hi, "bin_mid": (lo + hi) / 2, "mean_predicted": float(p[mask].mean()), "fraction_positive": float(y_true[mask].mean()), "count": int(n_k), } ) return pd.DataFrame(rows)
[docs] @staticmethod def compare_before_after( y_true: np.ndarray, raw_proba: np.ndarray, cal_proba: np.ndarray ) -> pd.DataFrame: """Side-by-side comparison of calibration metrics before and after calibration. Computes ECE, MCE, and Brier score for both the raw and calibrated predicted probabilities and returns them in a two-row DataFrame. Parameters ---------- y_true : numpy.ndarray of shape (n_samples,) True binary labels (0 or 1). raw_proba : numpy.ndarray of shape (n_samples,) or (n_samples, 2) Raw (uncalibrated) predicted probabilities. cal_proba : numpy.ndarray of shape (n_samples,) or (n_samples, 2) Calibrated predicted probabilities. Returns ------- pandas.DataFrame DataFrame with columns ``stage`` (``"raw"`` or ``"calibrated"``), ``ece``, ``mce``, ``brier``. Examples -------- >>> import numpy as np >>> from nestkit.calibration.diagnostics import CalibrationDiagnostics >>> y_true = np.array([0, 0, 1, 1]) >>> raw = np.array([0.3, 0.4, 0.6, 0.7]) >>> cal = np.array([0.2, 0.3, 0.7, 0.8]) >>> df = CalibrationDiagnostics.compare_before_after( ... y_true, raw, cal ... ) # doctest: +SKIP >>> df["stage"].tolist() # doctest: +SKIP ['raw', 'calibrated'] See Also -------- expected_calibration_error : ECE computation. maximum_calibration_error : MCE computation. brier_score : Brier score computation. """ diag = CalibrationDiagnostics return pd.DataFrame( [ { "stage": "raw", "ece": diag.expected_calibration_error(y_true, raw_proba), "mce": diag.maximum_calibration_error(y_true, raw_proba), "brier": diag.brier_score(y_true, raw_proba), }, { "stage": "calibrated", "ece": diag.expected_calibration_error(y_true, cal_proba), "mce": diag.maximum_calibration_error(y_true, cal_proba), "brier": diag.brier_score(y_true, cal_proba), }, ] )