# Source code for nestkit.thresholding.criteria

"""Built-in threshold optimization criteria.

All criterion functions follow the signature
``(y_true, y_proba, threshold) -> float`` and are designed to be
**maximised** by ``argmax``.  For cost-based criteria (where the goal is
minimisation), the returned value is negated so that ``argmax`` still
selects the optimum.

See Also
--------
nestkit.thresholding.strategies.optimize_threshold :
    Grid search that maximises a criterion over thresholds.
"""

from __future__ import annotations

import numpy as np
from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    fbeta_score,
    precision_score,
    recall_score,
)


def youden_j(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> float:
    """Compute Youden's J statistic at the given threshold.

    Youden's J is defined as ``sensitivity + specificity - 1`` and ranges
    from -1 (complete misclassification) to +1 (perfect classification).
    Maximising J yields the threshold that best separates the two classes.

    Parameters
    ----------
    y_true : numpy.ndarray of shape (n_samples,)
        True binary labels (0 or 1).
    y_proba : numpy.ndarray of shape (n_samples,)
        Predicted positive-class probabilities.  Any array-like is
        accepted; it is converted with ``numpy.asarray``.
    threshold : float
        Decision threshold in [0, 1].  A sample is predicted positive when
        ``y_proba >= threshold``.

    Returns
    -------
    float
        Youden's J in [-1, 1], as a built-in ``float``.  When ``y_true``
        contains no positives the sensitivity term is taken to be 0, and
        when it contains no negatives the specificity term is taken to
        be 0.

    Notes
    -----
    .. math::

        J = \\text{sensitivity} + \\text{specificity} - 1
          = \\frac{TP}{TP + FN} + \\frac{TN}{TN + FP} - 1

    This is equivalent to the vertical distance between the ROC curve and
    the diagonal chance line.

    Examples
    --------
    >>> import numpy as np
    >>> from nestkit.thresholding.criteria import youden_j
    >>> youden_j(np.array([0, 0, 1, 1]), np.array([0.1, 0.4, 0.6, 0.9]), 0.5)
    1.0
    """
    y_pred = (np.asarray(y_proba) >= threshold).astype(int)

    # labels=[0, 1] fixes the row/column order, so ravel() always unpacks
    # as (tn, fp, fn, tp) even when one class is missing from the data.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    n_pos = tp + fn
    n_neg = tn + fp
    # Guard the ratios so an absent class contributes 0 instead of dividing
    # by zero.
    sensitivity = tp / n_pos if n_pos > 0 else 0.0
    specificity = tn / n_neg if n_neg > 0 else 0.0

    # The counts are NumPy integers, so the arithmetic yields a NumPy
    # scalar; convert to a built-in float to honour the annotation (and so
    # the value prints as ``1.0`` rather than ``np.float64(1.0)``).
    return float(sensitivity + specificity - 1.0)
def f_beta_criterion(beta: float = 1.0):
    """Create a criterion function that maximises the F-beta score.

    Parameters
    ----------
    beta : float, default 1.0
        The beta parameter of the F-beta score.  ``beta < 1`` weights
        precision higher; ``beta > 1`` weights recall higher.  ``beta = 1``
        gives the standard F1 score.  Must be non-negative.

    Returns
    -------
    callable
        A criterion function with signature
        ``(y_true, y_proba, threshold) -> float``.  Its ``__name__`` is
        set to ``f"f_{beta}"``.

    Raises
    ------
    ValueError
        If ``beta`` is negative.

    Notes
    -----
    The F-beta score is defined as:

    .. math::

        F_\\beta = (1 + \\beta^2) \\cdot
            \\frac{\\text{precision} \\cdot \\text{recall}}
                  {\\beta^2 \\cdot \\text{precision} + \\text{recall}}

    Examples
    --------
    >>> import numpy as np
    >>> from nestkit.thresholding.criteria import f_beta_criterion
    >>> f1_criterion = f_beta_criterion(beta=1.0)
    >>> f1_criterion(
    ...     np.array([0, 0, 1, 1]),
    ...     np.array([0.1, 0.4, 0.6, 0.9]),
    ...     0.5,
    ... )  # doctest: +SKIP
    1.0

    See Also
    --------
    youden_j : Alternative criterion based on sensitivity + specificity.
    """
    # Fail at construction time rather than on the first evaluation inside
    # a threshold search.
    if beta < 0:
        raise ValueError(f"beta must be non-negative, got {beta!r}")

    def _criterion(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> float:
        y_pred = (np.asarray(y_proba) >= threshold).astype(int)
        score = fbeta_score(y_true, y_pred, beta=beta, zero_division=0.0)
        # Normalise to a built-in float regardless of what the metric
        # returns.
        return float(score)

    _criterion.__name__ = f"f_{beta}"
    return _criterion
def cost_sensitive(cost_matrix):
    """Create a criterion that minimises expected misclassification cost.

    The returned function computes the *negative* total cost so that
    ``argmax`` corresponds to ``argmin`` of cost.

    Parameters
    ----------
    cost_matrix : array-like of shape (2, 2)
        Cost matrix ``[[C_TN, C_FP], [C_FN, C_TP]]`` where:

        * ``C_TN`` -- cost of a true negative (usually 0).
        * ``C_FP`` -- cost of a false positive.
        * ``C_FN`` -- cost of a false negative.
        * ``C_TP`` -- cost of a true positive (usually 0).

    Returns
    -------
    callable
        A criterion function with signature
        ``(y_true, y_proba, threshold) -> float`` returning negative
        total cost.

    Raises
    ------
    ValueError
        If ``cost_matrix`` cannot be converted to a numeric array of
        shape (2, 2).

    Notes
    -----
    The total cost is:

    .. math::

        \\text{Cost} = C_{TN} \\cdot TN + C_{FP} \\cdot FP
                     + C_{FN} \\cdot FN + C_{TP} \\cdot TP

    The function returns ``-\\text{Cost}`` so that maximisation via
    ``argmax`` yields the cost-minimising threshold.

    Examples
    --------
    >>> import numpy as np
    >>> from nestkit.thresholding.criteria import cost_sensitive
    >>> # FP costs 1, FN costs 5
    >>> criterion = cost_sensitive([[0, 1], [5, 0]])
    >>> criterion(
    ...     np.array([0, 0, 1, 1]),
    ...     np.array([0.1, 0.4, 0.6, 0.9]),
    ...     0.5,
    ... )  # doctest: +SKIP
    0.0

    See Also
    --------
    youden_j : Cost-agnostic criterion.
    """
    # Validate up front: plain [i][j] indexing would silently accept a
    # larger matrix and use only its top-left corner.
    costs = np.asarray(cost_matrix, dtype=float)
    if costs.shape != (2, 2):
        raise ValueError(
            f"cost_matrix must have shape (2, 2), got {costs.shape}"
        )
    (c_tn, c_fp), (c_fn, c_tp) = costs.tolist()

    def _criterion(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> float:
        y_pred = (np.asarray(y_proba) >= threshold).astype(int)
        # labels=[0, 1] fixes the layout so ravel() unpacks as
        # (tn, fp, fn, tp).
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        total_cost = c_tn * tn + c_fp * fp + c_fn * fn + c_tp * tp
        # Subtract from 0.0 instead of negating so a zero cost is reported
        # as 0.0 rather than -0.0; float() strips the NumPy scalar type.
        return float(0.0 - total_cost)

    _criterion.__name__ = "cost_sensitive"
    return _criterion
def balanced_accuracy_criterion(
    y_true: np.ndarray, y_proba: np.ndarray, threshold: float
) -> float:
    """Maximise balanced accuracy at the given threshold.

    Balanced accuracy is the arithmetic mean of sensitivity and
    specificity, equivalent to ``(Youden's J + 1) / 2`` when both classes
    are present in ``y_true``.

    Parameters
    ----------
    y_true : numpy.ndarray of shape (n_samples,)
        True binary labels (0 or 1).
    y_proba : numpy.ndarray of shape (n_samples,)
        Predicted positive-class probabilities.  Any array-like is
        accepted; it is converted with ``numpy.asarray``.
    threshold : float
        Decision threshold in [0, 1].  A sample is predicted positive when
        ``y_proba >= threshold``.

    Returns
    -------
    float
        Balanced accuracy in [0, 1], as a built-in ``float``.

    Notes
    -----
    .. math::

        \\text{BA} = \\frac{\\text{sensitivity} + \\text{specificity}}{2}

    Examples
    --------
    >>> import numpy as np
    >>> from nestkit.thresholding.criteria import balanced_accuracy_criterion
    >>> balanced_accuracy_criterion(
    ...     np.array([0, 0, 1, 1]),
    ...     np.array([0.1, 0.4, 0.6, 0.9]),
    ...     0.5,
    ... )
    1.0

    See Also
    --------
    youden_j : Equivalent to ``2 * balanced_accuracy - 1``.
    """
    y_pred = (np.asarray(y_proba) >= threshold).astype(int)
    # Convert explicitly so the declared ``float`` return type (and the
    # doctest output above) holds whatever scalar type the metric returns.
    return float(balanced_accuracy_score(y_true, y_pred))
def precision_at_recall(min_recall: float = 0.90):
    """Create a criterion that maximises precision subject to a minimum recall.

    Thresholds that produce a recall below ``min_recall`` receive a score
    of -1, effectively excluding them from selection.

    Parameters
    ----------
    min_recall : float, default 0.90
        Minimum acceptable recall.  Must be in (0, 1].

    Returns
    -------
    callable
        A criterion function with signature
        ``(y_true, y_proba, threshold) -> float``.  Returns ``precision``
        when ``recall >= min_recall``, else ``-1``.  Its ``__name__`` is
        set to ``f"precision_at_recall_{min_recall}"``.

    Raises
    ------
    ValueError
        If ``min_recall`` is not in (0, 1].

    Notes
    -----
    This implements a constrained optimisation: among all thresholds
    achieving at least ``min_recall``, select the one with the highest
    precision.  Precision is never negative, so the penalty of -1 ranks
    every infeasible threshold below every feasible one.

    Examples
    --------
    >>> import numpy as np
    >>> from nestkit.thresholding.criteria import precision_at_recall
    >>> criterion = precision_at_recall(min_recall=0.80)
    >>> criterion(
    ...     np.array([0, 0, 1, 1, 1]),
    ...     np.array([0.1, 0.3, 0.6, 0.7, 0.9]),
    ...     0.5,
    ... )  # doctest: +SKIP
    1.0

    See Also
    --------
    f_beta_criterion : Unconstrained precision--recall trade-off.
    """
    # Enforce the documented range.  Recall cannot exceed 1, so a larger
    # bound would mark every threshold infeasible and leave argmax with
    # nothing meaningful to choose between.  The negated form also rejects
    # NaN.
    if not 0 < min_recall <= 1:
        raise ValueError(f"min_recall must be in (0, 1], got {min_recall!r}")

    def _criterion(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> float:
        y_pred = (np.asarray(y_proba) >= threshold).astype(int)
        recall = recall_score(y_true, y_pred, zero_division=0.0)
        if recall < min_recall:
            # Constraint violated: sentinel score below any real precision.
            return -1.0
        # Normalise to a built-in float regardless of what the metric
        # returns.
        return float(precision_score(y_true, y_pred, zero_division=0.0))

    _criterion.__name__ = f"precision_at_recall_{min_recall}"
    return _criterion