"""Built-in threshold optimization criteria.
All criterion functions follow the signature
``(y_true, y_proba, threshold) -> float`` and are designed to be
**maximised** by ``argmax``. For cost-based criteria (where the goal is
minimisation), the returned value is negated so that ``argmax`` still
selects the optimum.
See Also
--------
nestkit.thresholding.strategies.optimize_threshold :
Grid search that maximises a criterion over thresholds.
"""
from __future__ import annotations
import numpy as np
from sklearn.metrics import (
balanced_accuracy_score,
confusion_matrix,
fbeta_score,
precision_score,
recall_score,
)
[docs]
def youden_j(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> float:
"""Compute Youden's J statistic at the given threshold.
Youden's J is defined as ``sensitivity + specificity - 1`` and
ranges from -1 (complete misclassification) to +1 (perfect
classification). Maximising J yields the threshold that best
separates the two classes.
Parameters
----------
y_true : numpy.ndarray of shape (n_samples,)
True binary labels (0 or 1).
y_proba : numpy.ndarray of shape (n_samples,)
Predicted positive-class probabilities.
threshold : float
Decision threshold in [0, 1].
Returns
-------
float
Youden's J in [-1, 1].
Notes
-----
.. math::
J = \\text{sensitivity} + \\text{specificity} - 1
= \\frac{TP}{TP + FN} + \\frac{TN}{TN + FP} - 1
This is equivalent to the vertical distance between the ROC curve
and the diagonal chance line.
Examples
--------
>>> import numpy as np
>>> from nestkit.thresholding.criteria import youden_j
>>> youden_j(np.array([0, 0, 1, 1]), np.array([0.1, 0.4, 0.6, 0.9]), 0.5)
1.0
"""
y_pred = (y_proba >= threshold).astype(int)
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
return sensitivity + specificity - 1.0
[docs]
def f_beta_criterion(beta: float = 1.0):
"""Create a criterion function that maximises the F-beta score.
Parameters
----------
beta : float, default 1.0
The beta parameter of the F-beta score. ``beta < 1`` weights
precision higher; ``beta > 1`` weights recall higher.
``beta = 1`` gives the standard F1 score.
Returns
-------
callable
A criterion function with signature
``(y_true, y_proba, threshold) -> float``.
Notes
-----
The F-beta score is defined as:
.. math::
F_\\beta = (1 + \\beta^2) \\cdot
\\frac{\\text{precision} \\cdot \\text{recall}}
{\\beta^2 \\cdot \\text{precision} + \\text{recall}}
Examples
--------
>>> import numpy as np
>>> from nestkit.thresholding.criteria import f_beta_criterion
>>> f1_criterion = f_beta_criterion(beta=1.0)
>>> f1_criterion(
... np.array([0, 0, 1, 1]),
... np.array([0.1, 0.4, 0.6, 0.9]),
... 0.5,
... ) # doctest: +SKIP
1.0
See Also
--------
youden_j : Alternative criterion based on sensitivity + specificity.
"""
def _criterion(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> float:
y_pred = (y_proba >= threshold).astype(int)
return fbeta_score(y_true, y_pred, beta=beta, zero_division=0.0)
_criterion.__name__ = f"f_{beta}"
return _criterion
[docs]
def cost_sensitive(cost_matrix):
"""Create a criterion that minimises expected misclassification cost.
The returned function computes the *negative* total cost so that
``argmax`` corresponds to ``argmin`` of cost.
Parameters
----------
cost_matrix : array-like of shape (2, 2)
Cost matrix ``[[C_TN, C_FP], [C_FN, C_TP]]`` where:
* ``C_TN`` -- cost of a true negative (usually 0).
* ``C_FP`` -- cost of a false positive.
* ``C_FN`` -- cost of a false negative.
* ``C_TP`` -- cost of a true positive (usually 0).
Returns
-------
callable
A criterion function with signature
``(y_true, y_proba, threshold) -> float`` returning negative
total cost.
Notes
-----
The total cost is:
.. math::
\\text{Cost} = C_{TN} \\cdot TN + C_{FP} \\cdot FP
+ C_{FN} \\cdot FN + C_{TP} \\cdot TP
The function returns ``-\\text{Cost}`` so that maximisation via
``argmax`` yields the cost-minimising threshold.
Examples
--------
>>> import numpy as np
>>> from nestkit.thresholding.criteria import cost_sensitive
>>> # FP costs 1, FN costs 5
>>> criterion = cost_sensitive([[0, 1], [5, 0]])
>>> criterion(
... np.array([0, 0, 1, 1]),
... np.array([0.1, 0.4, 0.6, 0.9]),
... 0.5,
... ) # doctest: +SKIP
0
See Also
--------
youden_j : Cost-agnostic criterion.
"""
c_tn, c_fp = cost_matrix[0][0], cost_matrix[0][1]
c_fn, c_tp = cost_matrix[1][0], cost_matrix[1][1]
def _criterion(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> float:
y_pred = (y_proba >= threshold).astype(int)
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
total_cost = c_tn * tn + c_fp * fp + c_fn * fn + c_tp * tp
return -total_cost
_criterion.__name__ = "cost_sensitive"
return _criterion
[docs]
def balanced_accuracy_criterion(
y_true: np.ndarray, y_proba: np.ndarray, threshold: float
) -> float:
"""Maximise balanced accuracy at the given threshold.
Balanced accuracy is the arithmetic mean of sensitivity and
specificity, equivalent to ``(Youden's J + 1) / 2``.
Parameters
----------
y_true : numpy.ndarray of shape (n_samples,)
True binary labels (0 or 1).
y_proba : numpy.ndarray of shape (n_samples,)
Predicted positive-class probabilities.
threshold : float
Decision threshold in [0, 1].
Returns
-------
float
Balanced accuracy in [0, 1].
Notes
-----
.. math::
\\text{BA} = \\frac{\\text{sensitivity} + \\text{specificity}}{2}
Examples
--------
>>> import numpy as np
>>> from nestkit.thresholding.criteria import balanced_accuracy_criterion
>>> balanced_accuracy_criterion(
... np.array([0, 0, 1, 1]),
... np.array([0.1, 0.4, 0.6, 0.9]),
... 0.5,
... )
1.0
See Also
--------
youden_j : Equivalent to ``2 * balanced_accuracy - 1``.
"""
y_pred = (y_proba >= threshold).astype(int)
return balanced_accuracy_score(y_true, y_pred)
[docs]
def precision_at_recall(min_recall: float = 0.90):
"""Create a criterion that maximises precision subject to a minimum recall.
Thresholds that produce a recall below ``min_recall`` receive a
score of -1, effectively excluding them from selection.
Parameters
----------
min_recall : float, default 0.90
Minimum acceptable recall. Must be in (0, 1].
Returns
-------
callable
A criterion function with signature
``(y_true, y_proba, threshold) -> float``. Returns
``precision`` when ``recall >= min_recall``, else ``-1``.
Notes
-----
This implements a constrained optimisation: among all thresholds
achieving at least ``min_recall``, select the one with the highest
precision. The penalty of -1 for violating the recall constraint
ensures that ``argmax`` never selects an infeasible threshold.
Examples
--------
>>> import numpy as np
>>> from nestkit.thresholding.criteria import precision_at_recall
>>> criterion = precision_at_recall(min_recall=0.80)
>>> criterion(
... np.array([0, 0, 1, 1, 1]),
... np.array([0.1, 0.3, 0.6, 0.7, 0.9]),
... 0.5,
... ) # doctest: +SKIP
1.0
See Also
--------
f_beta_criterion : Unconstrained precision--recall trade-off.
"""
def _criterion(y_true: np.ndarray, y_proba: np.ndarray, threshold: float) -> float:
y_pred = (y_proba >= threshold).astype(int)
rec = recall_score(y_true, y_pred, zero_division=0.0)
if rec < min_recall:
return -1.0
return precision_score(y_true, y_pred, zero_division=0.0)
_criterion.__name__ = f"precision_at_recall_{min_recall}"
return _criterion