Source code for nestkit.inner.tuning_report

"""Inner CV tuning report.

Provides :class:`InnerCVReport` for inspecting and diagnosing the
hyperparameter search that takes place inside each outer fold of a
nested cross-validation procedure.
"""

from __future__ import annotations

import numpy as np
import pandas as pd



[docs]
class InnerCVReport:
    """Diagnostics for the inner CV hyperparameter search of a single outer fold.

    Wraps the ``cv_results_`` dictionary produced by scikit-learn search
    objects (``GridSearchCV``, ``RandomizedSearchCV``, etc.) and exposes
    convenience methods for ranking configurations, estimating parameter
    importance, and examining score distributions.

    Parameters
    ----------
    cv_results : dict
        The ``cv_results_`` dictionary from the inner search object.
    outer_fold_idx : int
        Zero-based index of the outer fold this report belongs to.

    Attributes
    ----------
    cv_results_ : dict
        Raw ``cv_results_`` dictionary (stored by reference, not copied).
    outer_fold_idx : int
        Outer fold index.

    Examples
    --------
    >>> report = InnerCVReport(search.cv_results_, outer_fold_idx=0)  # doctest: +SKIP
    >>> report.top_k(3, metric="roc_auc")  # doctest: +SKIP

    See Also
    --------
    nestkit.inner.search.build_search : Construct the inner search object.
    """

    # Prefix used for hyperparameter columns in ``cv_results_``.
    _PARAM_PREFIX = "param_"

    def __init__(self, cv_results: dict, outer_fold_idx: int):
        self.cv_results_ = cv_results
        self.outer_fold_idx = outer_fold_idx

    def to_dataframe(self) -> pd.DataFrame:
        """Convert the full ``cv_results_`` dictionary to a DataFrame.

        Returns
        -------
        pandas.DataFrame
            One row per hyperparameter configuration with all columns
            from scikit-learn's ``cv_results_`` (parameters, mean/std
            scores, ranks, fit times, etc.).  A new frame is built on
            every call.
        """
        return pd.DataFrame(self.cv_results_)

    def ranking(self, metric: str | None = None) -> pd.DataFrame:
        """Return all configurations ranked by mean inner-CV score.

        Sorts by the ``rank_test_<metric>`` column if it exists;
        otherwise falls back to descending ``mean_test_<metric>``.
        If neither column is present the frame is returned unsorted.

        Parameters
        ----------
        metric : str or None, optional
            Metric name (e.g., ``"roc_auc"``).  If ``None`` (or empty),
            uses the default ``"score"`` suffix from scikit-learn's
            single-metric results.

        Returns
        -------
        pandas.DataFrame
            Sorted configurations with all ``cv_results_`` columns.
            Tied configurations keep their original relative order.
        """
        df = self.to_dataframe()

        rank_col = self._rank_col(metric)
        if rank_col in df.columns:
            # A stable sort keeps tied ranks in their original order so that
            # repeated calls (and ``top_k``) are deterministic.
            ordered = df.sort_values(rank_col, kind="mergesort")
            return ordered.reset_index(drop=True)

        score_col = self._score_col(metric)
        if score_col in df.columns:
            ordered = df.sort_values(score_col, ascending=False, kind="mergesort")
            return ordered.reset_index(drop=True)

        return df

    def top_k(self, k: int = 5, metric: str | None = None) -> pd.DataFrame:
        """Return the top *k* hyperparameter configurations.

        Parameters
        ----------
        k : int, default=5
            Number of top configurations to return.  The value is
            forwarded unchanged to :meth:`pandas.DataFrame.head`.
        metric : str or None, optional
            Metric name for ranking.  See :meth:`ranking`.

        Returns
        -------
        pandas.DataFrame
            The *k* best-ranked rows from :meth:`ranking`.
        """
        return self.ranking(metric).head(k)

    def param_importance(self, metric: str | None = None) -> pd.DataFrame:
        """Estimate the marginal importance of each hyperparameter.

        Groups configurations by each ``param_*`` column and computes
        the variance of the group means (a simplified, fANOVA-inspired
        measure).  Higher variance indicates that the parameter has a
        larger effect on the inner-CV score.

        Parameters
        ----------
        metric : str or None, optional
            Metric name.  See :meth:`ranking`.

        Returns
        -------
        pandas.DataFrame
            Columns: ``parameter``, ``variance_explained``,
            ``n_unique``, ``relative_importance``.  Sorted by
            ``variance_explained`` in descending order.  Returns an
            empty DataFrame if the score column is not found or if
            there are no ``param_*`` columns.

        Notes
        -----
        This is a first-order marginal analysis and does not account
        for interactions between hyperparameters.  For a full fANOVA
        decomposition, consider dedicated tools such as ``fanova``.
        """
        df = self.to_dataframe()
        score_col = self._score_col(metric)
        if score_col not in df.columns:
            return pd.DataFrame()

        prefix = self._PARAM_PREFIX
        rows = []
        for col in df.columns:
            if not col.startswith(prefix):
                continue
            variance = df.groupby(col)[score_col].mean().var()
            rows.append(
                {
                    # Strip only the leading prefix: ``str.replace`` would also
                    # mangle names that contain "param_" elsewhere.
                    "parameter": col[len(prefix):],
                    # A single group (or all-NaN scores) yields NaN variance;
                    # report that as "no measurable effect".
                    "variance_explained": 0.0 if np.isnan(variance) else variance,
                    "n_unique": df[col].nunique(),
                }
            )

        result = pd.DataFrame(rows)
        if not result.empty:
            total = result["variance_explained"].sum()
            if total > 0:
                result["relative_importance"] = result["variance_explained"] / total
            else:
                result["relative_importance"] = 0.0
            # Stable sort: parameters with equal variance keep column order.
            result = result.sort_values(
                "variance_explained", ascending=False, kind="mergesort"
            )
        return result.reset_index(drop=True)

    def score_distribution(self, param: str, metric: str | None = None) -> pd.DataFrame:
        """Show mean inner-CV score as a function of a single hyperparameter.

        Useful for generating 1-D parameter-sweep plots.

        Parameters
        ----------
        param : str
            Hyperparameter name **without** the ``param_`` prefix.
        metric : str or None, optional
            Metric name.  See :meth:`ranking`.

        Returns
        -------
        pandas.DataFrame
            Columns include ``param_<param>``, the mean score column,
            and (if available) the standard deviation column.  Sorted
            by the parameter value; when the values are not mutually
            comparable (e.g. a grid mixing strings and numbers) rows
            are ordered by the string form of the value instead.
            Returns an empty DataFrame if the requested columns are
            not found.
        """
        df = self.to_dataframe()
        score_col = self._score_col(metric)
        param_col = f"{self._PARAM_PREFIX}{param}"
        if score_col not in df.columns or param_col not in df.columns:
            return pd.DataFrame()

        cols = [param_col, score_col]
        std_col = self._std_col(metric)
        if std_col in df.columns:
            cols.append(std_col)

        subset = df[cols]
        try:
            ordered = subset.sort_values(param_col)
        except TypeError:
            # Object-dtype parameter columns can hold mixed types that Python
            # refuses to compare; fall back to a deterministic textual order.
            order = subset[param_col].map(str).sort_values(kind="mergesort").index
            ordered = subset.loc[order]
        return ordered.reset_index(drop=True)

    @staticmethod
    def _metric_col(prefix: str, metric: str | None) -> str:
        """Build ``<prefix>_test_<metric>``, defaulting the suffix to ``score``."""
        suffix = metric if metric else "score"
        return f"{prefix}_test_{suffix}"

    def _rank_col(self, metric: str | None) -> str:
        """Name of the rank column for *metric*."""
        return self._metric_col("rank", metric)

    def _score_col(self, metric: str | None) -> str:
        """Name of the mean test-score column for *metric*."""
        return self._metric_col("mean", metric)

    def _std_col(self, metric: str | None) -> str:
        """Name of the test-score standard-deviation column for *metric*."""
        return self._metric_col("std", metric)