# Source code for nestkit.calibration.calibrators

"""Post-hoc probability calibration."""

from __future__ import annotations

import logging

import numpy as np
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

from nestkit._constants import _EPS
from nestkit._validation import extract_positive_proba

logger = logging.getLogger("nestkit")



# [docs]
class PostHocCalibrator:
    """Unified interface for post-hoc probability calibration.

    Binary-only. For multiclass, use one calibrator per class via OVR
    decomposition at the NestedCVClassifier level.

    Parameters
    ----------
    method : {"sigmoid", "isotonic", "beta", "venn_abers"}
        Calibration method.  ``"sigmoid"`` applies logistic recalibration
        on probability logits (sometimes called temperature scaling on
        probabilities).  This differs from classical Platt scaling, which
        operates on raw decision function scores rather than probabilities.
        ``"isotonic"`` fits an isotonic regression bounded to ``[0, 1]``.
        ``"beta"`` fits a logistic regression on ``log(p)`` and
        ``log(1 - p)``.  ``"venn_abers"`` stores the calibration set and, at
        prediction time, refits an isotonic regression with each test score
        inserted once with label 0 and once with label 1.

        The value is validated when :meth:`fit` or :meth:`predict_proba` is
        called, not at construction time.
    """

    # Supported methods; each one maps to a `_fit_<name>` / `_predict_<name>` pair.
    _METHODS = ("sigmoid", "isotonic", "beta", "venn_abers")

    def __init__(self, method: str = "sigmoid"):
        self.method = method
        self._calibrator = None
        self._is_fitted = False
        # Sorted calibration scores/labels, only populated by the Venn-ABERS fit.
        self._va_scores = None
        self._va_labels = None

    def fit(self, y_proba: np.ndarray, y_true: np.ndarray) -> "PostHocCalibrator":
        """Fit calibration mapping from uncalibrated probs to calibrated.

        Parameters
        ----------
        y_proba : array of shape (n_samples,) or (n_samples, 2)
            Uncalibrated predicted probabilities.
        y_true : array-like of shape (n_samples,)
            True binary labels.

        Returns
        -------
        self : PostHocCalibrator
            The fitted calibrator.

        Raises
        ------
        ValueError
            If ``method`` is unknown, or if ``y_true`` does not have one
            label per probability.
        """
        fit_impl = self._resolve("_fit_")

        # A failed refit must not leave the instance claiming to be fitted
        # while holding a half-updated state.
        self._is_fitted = False

        p = extract_positive_proba(y_proba)
        y = self._check_labels(p, y_true)
        fit_impl(p, y)

        self._is_fitted = True
        return self

    def predict_proba(self, y_proba: np.ndarray) -> np.ndarray:
        """Apply calibration mapping.

        Parameters
        ----------
        y_proba : array of shape (n_samples,) or (n_samples, 2)
            Uncalibrated probabilities.

        Returns
        -------
        calibrated_proba : array of shape (n_samples, 2)
            Column 0 is ``1 - p`` and column 1 is the calibrated positive
            class probability ``p``, clipped to ``[0, 1]``.

        Raises
        ------
        RuntimeError
            If the calibrator has not been fitted.
        ValueError
            If ``method`` is unknown.
        """
        if not self._is_fitted:
            raise RuntimeError("PostHocCalibrator is not fitted.")

        predict_impl = self._resolve("_predict_")
        p = extract_positive_proba(y_proba)

        cal_p = np.clip(predict_impl(p), 0, 1)
        return np.column_stack([1 - cal_p, cal_p])

    # --- Shared helpers ---

    def _resolve(self, prefix: str):
        """Return the bound ``<prefix><method>`` implementation for ``self.method``."""
        if self.method not in self._METHODS:
            raise ValueError(f"Unknown calibration method: {self.method}")
        return getattr(self, prefix + self.method)

    @staticmethod
    def _check_labels(p: np.ndarray, y_true) -> np.ndarray:
        """Return ``y_true`` as a 1-D array with one label per entry of ``p``.

        Converting here means lists and pandas Series work for every method;
        the Venn-ABERS path indexes the labels positionally.
        """
        y = np.asarray(y_true)
        if y.ndim == 2 and y.shape[1] == 1:
            # Accept a single-column label matrix.
            y = y.ravel()
        n_samples = len(p)
        if y.ndim != 1 or y.shape[0] != n_samples:
            raise ValueError(
                f"y_true has shape {y.shape}, expected ({n_samples},) "
                "to match y_proba."
            )
        return y

    # --- Sigmoid (logistic recalibration on probability logits) ---

    @staticmethod
    def _logit_features(p: np.ndarray) -> np.ndarray:
        """Return ``logit(p)`` as a single-column feature matrix."""
        # Clip away from 0 and 1 so the logit stays finite.
        p_clipped = np.clip(p, _EPS, 1 - _EPS)
        return np.log(p_clipped / (1 - p_clipped)).reshape(-1, 1)

    def _fit_sigmoid(self, p: np.ndarray, y: np.ndarray) -> None:
        # Very large C makes the regularisation term negligible.
        self._calibrator = LogisticRegression(C=1e10, solver="lbfgs", max_iter=1000)
        self._calibrator.fit(self._logit_features(p), y)

    def _predict_sigmoid(self, p: np.ndarray) -> np.ndarray:
        return self._calibrator.predict_proba(self._logit_features(p))[:, 1]

    # --- Isotonic ---

    def _fit_isotonic(self, p: np.ndarray, y: np.ndarray) -> None:
        self._calibrator = IsotonicRegression(out_of_bounds="clip", y_min=0, y_max=1)
        self._calibrator.fit(p, y)

    def _predict_isotonic(self, p: np.ndarray) -> np.ndarray:
        return self._calibrator.predict(p)

    # --- Beta calibration (Kull et al., 2017) ---

    @staticmethod
    def _beta_features(p: np.ndarray) -> np.ndarray:
        """Return the two-column matrix ``[log(p), log(1 - p)]``."""
        # Clip away from 0 and 1 so both logarithms stay finite.
        p_clipped = np.clip(p, _EPS, 1 - _EPS)
        return np.column_stack([np.log(p_clipped), np.log(1 - p_clipped)])

    def _fit_beta(self, p: np.ndarray, y: np.ndarray) -> None:
        # 3-parameter beta calibration: logit(q) = a * log(p) + b * log(1-p) + c
        # Fit via logistic regression on log(p) and log(1-p)
        self._calibrator = LogisticRegression(C=1e10, solver="lbfgs", max_iter=1000)
        self._calibrator.fit(self._beta_features(p), y)

    def _predict_beta(self, p: np.ndarray) -> np.ndarray:
        return self._calibrator.predict_proba(self._beta_features(p))[:, 1]

    # --- Venn-ABERS ---

    def _fit_venn_abers(self, p: np.ndarray, y: np.ndarray) -> None:
        # Store the calibration data sorted by score; prediction inserts each
        # test score into this sorted set.
        scores = np.asarray(p, dtype=float)
        labels = np.asarray(y, dtype=float)
        order = np.argsort(scores, kind="stable")
        self._va_scores = scores[order]
        self._va_labels = labels[order]

    def _predict_venn_abers(self, p: np.ndarray) -> np.ndarray:
        cal_probs = np.zeros(len(p))

        for i, score in enumerate(p):
            # Isotonic prediction at `score` with the test point added once
            # as a negative and once as a positive example.
            p0 = self._va_isotonic_with(score, 0)
            p1 = self._va_isotonic_with(score, 1)
            # Merge the pair into one probability: p1 / (1 - p0 + p1).
            # _EPS is a small guard against a zero denominator.
            cal_probs[i] = p1 / (1 - p0 + p1 + _EPS)

        return cal_probs

    def _va_isotonic_with(self, score: float, label: int) -> float:
        """Insert (score, label) into calibration set and return isotonic prediction."""
        # The stored scores are already sorted, so a binary search plus one
        # insert keeps the order without re-sorting on every call.
        pos = np.searchsorted(self._va_scores, score)
        scores = np.insert(self._va_scores, pos, score)
        labels = np.insert(self._va_labels, pos, label)

        iso = IsotonicRegression(out_of_bounds="clip", y_min=0, y_max=1)
        iso.fit(scores, labels)
        return float(iso.predict([score])[0])