Source code for nestkit.results.regressor_results
"""Regressor-specific results containers."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from nestkit.results._base import _BaseNestedCVResults
[docs]
@dataclass
class RegressorOuterFoldResult:
    """Result of a single outer fold evaluation (regression).

    A plain record; it performs no validation or computation of its own.
    The notes below describe how ``RegressorResults`` consumes the fields.

    * ``fold_idx`` labels the fold in every per-fold table.
    * ``test_indices`` becomes the row index of the fold's predictions
      (or selects from the caller-supplied original index).
    * ``best_params`` / ``best_inner_score`` / ``inner_cv_results`` carry
      the outcome of the inner search; ``inner_cv_results`` is handed to
      ``InnerCVReport``.
    * ``y_true`` / ``y_pred`` / ``residuals`` become columns of the
      predictions table, so ``residuals`` must have the same length as
      ``y_true`` by the time results are finalized.  It defaults to an
      empty array and is NOT derived automatically here.
    * ``outer_scores`` maps a metric name to that fold's outer score.
    * ``coverage`` is averaged across the folds that provide it.

    ``train_indices``, ``fit_time``, ``score_time`` and ``fitted_estimator``
    are only stored here.
    NOTE(review): time units are presumably seconds -- confirm with the
    code that fills them in.
    """

    # Fold identity and the outer split.
    fold_idx: int
    train_indices: np.ndarray
    test_indices: np.ndarray

    # Outcome of the inner hyper-parameter search.
    best_params: dict
    best_inner_score: float
    inner_cv_results: dict

    # Timings and the fitted model (may be None).
    fit_time: float
    score_time: float
    fitted_estimator: BaseEstimator | None

    # Outer test-set targets and predictions.
    y_true: np.ndarray
    y_pred: np.ndarray

    # Mutable defaults go through factories so instances never share them.
    outer_scores: dict = field(default_factory=dict)
    residuals: np.ndarray = field(default_factory=lambda: np.empty(0))

    # Optional prediction intervals
    prediction_interval_lower: np.ndarray | None = field(default=None)
    prediction_interval_upper: np.ndarray | None = field(default=None)
    coverage: float | None = field(default=None)

    # Optional Mondrian conformal binning
    mondrian_bin_assignments: np.ndarray | None = field(default=None)
[docs]
class RegressorResults(_BaseNestedCVResults):
    """Aggregated nested CV results for regression.

    Per-fold records are read from ``self.fold_results_``, which this class
    does not populate itself (presumably filled by the base class or by the
    caller before :meth:`finalize` runs -- confirm against the base class).

    :meth:`finalize` derives the following attributes:

    * ``best_params_per_fold_`` -- each fold's ``best_params``.
    * ``inner_reports_`` -- one ``InnerCVReport`` per fold.
    * ``outer_scores_default_`` -- DataFrame with one row of outer scores
      per fold; ``summary_default_`` is ``_compute_summary`` applied to it.
    * ``predictions_`` -- out-of-fold predictions for all folds, sorted by
      index.
    * ``generalization_gap_`` -- DataFrame pairing each fold's
      ``best_inner_score`` with its ``outer_<metric>`` scores.
    * ``residual_stats_`` -- mean/std/median/skewness/kurtosis of the
      pooled residuals.
    * ``prediction_interval_coverage_`` -- mean and per-fold coverage, or
      ``None`` when no fold reports a coverage.
    * ``mondrian_coverage_per_bin_`` -- empirical interval coverage per
      Mondrian bin, or ``None`` when it cannot be computed.
    """

    def __init__(
        self,
        n_outer_folds: int,
        feature_names: list[str] | None = None,
        original_index: Any | None = None,
    ):
        super().__init__(n_outer_folds, feature_names, original_index)

    def finalize(self) -> None:
        """Compute every aggregate attribute from ``self.fold_results_``.

        Calling this again after a successful run is a no-op.  If any step
        raises, the finalized flag is cleared before the exception
        propagates, so the object is never left marked as finalized with
        only part of its attributes set and a later call can retry.
        """
        if self._finalized:
            return
        # The flag is still set up front (as before) in case base-class
        # helpers consult it while the aggregates are being built.
        self._finalized = True
        try:
            self.best_params_per_fold_ = [fr.best_params for fr in self.fold_results_]

            # Imported lazily.  NOTE(review): presumably to avoid a circular
            # import between the results and inner packages -- confirm.
            from nestkit.inner.tuning_report import InnerCVReport

            self.inner_reports_ = [
                InnerCVReport(fr.inner_cv_results, fr.fold_idx) for fr in self.fold_results_
            ]
            self.outer_scores_default_ = pd.DataFrame(
                [fr.outer_scores for fr in self.fold_results_]
            )
            self.summary_default_ = self._compute_summary(self.outer_scores_default_)
            self._build_predictions_df()
            self._compute_generalization_gap()
            self._compute_residual_stats()
        except BaseException:
            # Roll back so a failed run is not mistaken for a finished one.
            self._finalized = False
            raise

    def _build_predictions_df(self) -> None:
        """Stack per-fold predictions into ``self.predictions_``.

        Each fold contributes ``y_true``, ``y_pred``, ``residual`` and
        ``fold_idx`` columns, plus ``pi_lower``/``pi_upper`` and
        ``mondrian_bin`` when the fold carries them.  Rows are indexed by
        the original index (when one was supplied) or by the raw test
        indices, and the combined frame is sorted by that index.
        """
        dfs = []
        for fr in self.fold_results_:
            # pandas requires the three arrays to have equal length;
            # the scalar fold_idx is broadcast to every row.
            fold_df = pd.DataFrame(
                {
                    "y_true": fr.y_true,
                    "y_pred": fr.y_pred,
                    "residual": fr.residuals,
                    "fold_idx": fr.fold_idx,
                }
            )
            if fr.prediction_interval_lower is not None:
                fold_df["pi_lower"] = fr.prediction_interval_lower
                fold_df["pi_upper"] = fr.prediction_interval_upper
            if fr.mondrian_bin_assignments is not None:
                fold_df["mondrian_bin"] = fr.mondrian_bin_assignments
            if self._original_index is not None:
                fold_df.index = self._original_index[fr.test_indices]
            else:
                fold_df.index = fr.test_indices
            dfs.append(fold_df)
        self.predictions_ = pd.concat(dfs).sort_index()

    def _compute_generalization_gap(self) -> None:
        """Tabulate inner vs. outer scores per fold in ``generalization_gap_``.

        One row per fold with ``fold_idx``, ``best_inner_score`` and one
        ``outer_<metric>`` column per outer metric.  No difference is
        computed here; the table only places the scores side by side.
        """
        rows = []
        for fr in self.fold_results_:
            row = {"fold_idx": fr.fold_idx, "best_inner_score": fr.best_inner_score}
            row.update({f"outer_{metric}": val for metric, val in fr.outer_scores.items()})
            rows.append(row)
        self.generalization_gap_ = pd.DataFrame(rows)

    def _compute_residual_stats(self) -> None:
        """Set residual statistics and both coverage summaries.

        Populates ``residual_stats_`` from the residuals pooled over all
        folds, then ``prediction_interval_coverage_`` and
        ``mondrian_coverage_per_bin_``.
        """
        all_residuals = np.concatenate([fr.residuals for fr in self.fold_results_])
        self.residual_stats_ = {
            "mean": float(np.mean(all_residuals)),
            "std": float(np.std(all_residuals, ddof=1)),  # sample std
            "median": float(np.median(all_residuals)),
            "skewness": float(_skewness(all_residuals)),
            "kurtosis": float(_kurtosis(all_residuals)),
        }
        self.prediction_interval_coverage_ = self._interval_coverage_summary()
        self.mondrian_coverage_per_bin_ = self._mondrian_coverage_per_bin()

    def _interval_coverage_summary(self) -> dict[str, Any] | None:
        """Return mean and per-fold coverage, or ``None`` if no fold has one."""
        coverages = [fr.coverage for fr in self.fold_results_ if fr.coverage is not None]
        if not coverages:
            return None
        return {"mean": float(np.mean(coverages)), "per_fold": coverages}

    def _mondrian_coverage_per_bin(self) -> dict[int, float] | None:
        """Return empirical interval coverage per Mondrian bin.

        Only folds that carry bin assignments AND both interval bounds are
        pooled; a fold with bins but no intervals cannot contribute to a
        coverage estimate and is skipped rather than breaking the
        concatenation.  Returns ``None`` when no fold qualifies.
        """
        usable = [
            fr
            for fr in self.fold_results_
            if fr.mondrian_bin_assignments is not None
            and fr.prediction_interval_lower is not None
            and fr.prediction_interval_upper is not None
        ]
        if not usable:
            return None

        all_bins = np.concatenate([fr.mondrian_bin_assignments for fr in usable])
        all_y_true = np.concatenate([fr.y_true for fr in usable])
        all_lower = np.concatenate([fr.prediction_interval_lower for fr in usable])
        all_upper = np.concatenate([fr.prediction_interval_upper for fr in usable])

        # True where the target falls inside its (closed) interval; computed
        # once and then averaged within each bin.
        covered = (all_y_true >= all_lower) & (all_y_true <= all_upper)
        return {int(b): float(np.mean(covered[all_bins == b])) for b in np.unique(all_bins)}
def _skewness(x: np.ndarray) -> float:
    """Return the bias-adjusted sample skewness of *x*.

    Computes ``n / ((n - 1)(n - 2)) * sum(((x - mean) / s) ** 3)`` where
    ``s`` is the sample standard deviation (``ddof=1``).

    Returns ``0.0`` when fewer than three values are given or when the
    sample standard deviation is exactly zero.
    """
    count = len(x)
    if count < 3:
        return 0.0

    centre = np.mean(x)
    spread = np.std(x, ddof=1)
    # Exact comparison: only a spread of precisely zero is treated as
    # degenerate (avoids dividing by zero below).
    if spread == 0:
        return 0.0

    standardized = (x - centre) / spread
    correction = count / ((count - 1) * (count - 2))
    return correction * np.sum(standardized ** 3)
def _kurtosis(x: np.ndarray) -> float:
    """Return the bias-adjusted sample excess kurtosis of *x*.

    Computes
    ``n(n + 1) / ((n - 1)(n - 2)(n - 3)) * sum(((x - mean) / s) ** 4)
    - 3(n - 1)**2 / ((n - 2)(n - 3))``
    where ``s`` is the sample standard deviation (``ddof=1``).

    Returns ``0.0`` when fewer than four values are given or when the
    sample standard deviation is exactly zero.
    """
    count = len(x)
    if count < 4:
        return 0.0

    centre = np.mean(x)
    spread = np.std(x, ddof=1)
    # Exact comparison: only a spread of precisely zero is treated as
    # degenerate (avoids dividing by zero below).
    if spread == 0:
        return 0.0

    fourth_power_total = np.sum(((x - centre) / spread) ** 4)
    scale = count * (count + 1) / ((count - 1) * (count - 2) * (count - 3))
    # Subtracted so that the result is *excess* kurtosis.
    offset = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
    return scale * fourth_power_total - offset