Source code for pystatistics.gam.solution

"""
User-facing GAM solution with R-style summary output.

Wraps a :class:`Result[GAMParams]` and a list of :class:`SmoothInfo`
objects, exposing convenient properties and an ``mgcv``-style
``summary()`` method.
"""

from __future__ import annotations

from typing import Any

import numpy as np
from numpy.typing import NDArray
from scipy import stats as sp_stats

from pystatistics.core.result import Result, SolutionReprMixin
from pystatistics.gam._common import GAMParams, SmoothInfo
from pystatistics.regression._formatting import significance_stars



[docs]
class GAMSolution(SolutionReprMixin):
    """User-facing GAM result with convenient properties and R-style output.

    Wraps a ``Result[GAMParams]`` and provides:

    * Coefficient access.
    * Smooth-term information (EDF, approximate significance tests).
    * R-style ``summary()`` matching ``mgcv::summary.gam()``.
    """

    __slots__ = ("_result", "_smooth_infos", "_names")


[docs]
    def __init__(
        self,
        _result: Result[GAMParams],
        _smooth_infos: list[SmoothInfo],
        _names: list[str] | None = None,
    ) -> None:
        """Initialise the solution wrapper.

        Args:
            _result: The generic ``Result`` envelope carrying ``GAMParams``.
            _smooth_infos: One ``SmoothInfo`` per smooth term.
            _names: Optional names for parametric coefficients.
        """
        self._result = _result
        self._smooth_infos = _smooth_infos
        self._names = _names


    # ------------------------------------------------------------------
    # Properties delegating to GAMParams
    # ------------------------------------------------------------------

    @property
    def params(self) -> GAMParams:
        """The underlying frozen parameter payload."""
        return self._result.params

    @property
    def coefficients(self) -> NDArray:
        """Full coefficient vector (parametric + basis)."""
        return self._result.params.coefficients

    @property
    def fitted_values(self) -> NDArray:
        """Response-scale predictions."""
        return self._result.params.fitted_values

    @property
    def residuals(self) -> NDArray:
        """Working residuals (y - mu_hat)."""
        return self._result.params.residuals

    @property
    def edf(self) -> NDArray:
        """Effective degrees of freedom per smooth term."""
        return self._result.params.edf

    @property
    def total_edf(self) -> float:
        """Total effective degrees of freedom."""
        return self._result.params.total_edf

    @property
    def smooth_terms(self) -> list[SmoothInfo]:
        """Information about each smooth term."""
        return list(self._smooth_infos)

    @property
    def deviance(self) -> float:
        """Model deviance."""
        return self._result.params.deviance

    @property
    def null_deviance(self) -> float:
        """Null-model deviance."""
        return self._result.params.null_deviance

    @property
    def deviance_explained(self) -> float:
        """Proportion of null deviance explained: ``1 - dev / null_dev``."""
        if self.null_deviance == 0.0:
            return 0.0
        return 1.0 - self.deviance / self.null_deviance

    @property
    def aic(self) -> float:
        """Akaike information criterion."""
        return self._result.params.aic

    @property
    def gcv(self) -> float:
        """GCV score."""
        return self._result.params.gcv

    @property
    def scale(self) -> float:
        """Estimated dispersion parameter."""
        return self._result.params.scale

    @property
    def ubre(self) -> float:
        """UBRE / scaled-AIC score (the criterion mgcv's GCV.Cp minimises
        for known-scale families)."""
        return self._result.params.ubre

    @property
    def lambdas(self) -> NDArray:
        """Selected (or user-fixed) smoothing parameters, one per smooth.

        Reported in the same penalty scaling mgcv reports ``sp`` in; divide
        by ``params.s_scales`` for function-space units.
        """
        return self._result.params.lambdas

    @property
    def covariance(self) -> NDArray:
        """Bayesian posterior covariance of the coefficients (mgcv ``Vp``)."""
        return self._result.params.covariance

    @property
    def se(self) -> NDArray:
        """Standard errors for ALL coefficients (posterior, mgcv-style)."""
        return np.sqrt(np.maximum(
            np.diag(self._result.params.covariance), 0.0,
        ))

    @property
    def reml_score(self) -> float | None:
        """Laplace REML criterion (``None`` unless ``method='REML'``)."""
        return self._result.params.reml_score

    @property
    def converged(self) -> bool:
        """Whether the P-IRLS algorithm converged."""
        return self._result.params.converged

    @property
    def outer_converged(self) -> bool:
        """Whether the smoothing-parameter search converged off-boundary."""
        return self._result.params.outer_converged

    @property
    def n_iter(self) -> int:
        """Number of P-IRLS iterations executed."""
        return self._result.params.n_iter

    # ------------------------------------------------------------------
    # R-squared (adjusted)
    # ------------------------------------------------------------------

    @property
    def r_squared_adj(self) -> float:
        """Adjusted R-squared based on deviance explained.

        Matches the formula used by ``mgcv``::

            R_adj = 1 - (deviance / df_resid) / (null_deviance / df_null)
        """
        n = self._result.params.n_obs
        df_resid = max(n - self.total_edf, 1.0)
        df_null = max(n - 1.0, 1.0)
        if self.null_deviance == 0.0:
            return 0.0
        return 1.0 - (self.deviance / df_resid) / (self.null_deviance / df_null)

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------


[docs]
    def summary(self) -> str:
        """R-style summary matching ``summary(gam(...))``.

        Includes:

        * Family and link function.
        * Parametric coefficient table with standard errors and tests.
        * Approximate significance tests for each smooth term.
        * Adjusted R-squared, deviance explained, GCV, and scale.

        Returns:
            Multi-line summary string.
        """
        p = self._result.params
        lines: list[str] = []

        lines.append(f"Family: {p.family_name}")
        lines.append(f"Link function: {p.link_name}")
        lines.append("")

        # Formula reconstruction: parametric terms first, then smooths.
        n_param = (
            self._smooth_infos[0].coef_indices[0]
            if self._smooth_infos else len(p.coefficients)
        )
        if self._names:
            param_terms = [
                nm for nm in self._names
                if nm.strip("()") not in ("Intercept", "intercept")
            ]
        else:
            # Unnamed: report the count honestly rather than inventing names.
            param_terms = (
                [f"X[{i}]" for i in range(1, n_param)] if n_param > 1 else []
            )
        pieces = param_terms + [si.term_name for si in self._smooth_infos]
        formula = f"y ~ {' + '.join(pieces)}" if pieces else "y ~ 1"
        lines.append(f"Formula: {formula}")
        lines.append("")

        # Parametric coefficients
        lines.append("Parametric coefficients:")
        # names length is validated against the parametric block in gam();
        # n_param always comes from the model structure, never from names.
        param_names = (
            list(self._names) if self._names
            else [f"B[{i}]" for i in range(n_param)]
        )
        stat_label = "z" if p.dispersion_fixed else "t"
        lines.append(
            f"{'':>16s} {'Estimate':>10s} {'Std. Error':>10s} "
            f"{stat_label + ' value':>10s} {'Pr(>|' + stat_label + '|)':>10s}"
        )

        # Known-scale families: z reference; estimated scale: t with the
        # residual df (mgcv's summary.gam convention).
        scale_known = p.dispersion_fixed
        resid_df = max(p.n_obs - p.total_edf, 1.0)
        for i, name in enumerate(param_names):
            coef_val = float(p.coefficients[i])
            se = self._param_se(i)
            z_val = coef_val / se if se > 0 else 0.0
            if scale_known:
                p_val = 2.0 * (1.0 - sp_stats.norm.cdf(abs(z_val)))
            else:
                p_val = 2.0 * float(sp_stats.t.sf(abs(z_val), resid_df))
            stars = significance_stars(p_val)
            p_str = f"{p_val:.4e}" if p_val >= 1e-4 else "<2e-16"
            lines.append(
                f"{name:>16s} {coef_val:10.4f} {se:10.4f} "
                f"{z_val:10.3f} {p_str:>10s} {stars}"
            )
        lines.append("")

        # Approximate significance of smooth terms (mgcv labels the
        # statistic Chi.sq for fixed dispersion, F when it is estimated).
        lines.append("Approximate significance of smooth terms:")
        smooth_stat = "Chi.sq" if p.dispersion_fixed else "F"
        lines.append(
            f"{'':>16s} {'edf':>8s} {'Ref.df':>8s} "
            f"{smooth_stat:>10s} {'p-value':>10s}"
        )
        for si in self._smooth_infos:
            stars = significance_stars(si.p_value)
            p_str = f"{si.p_value:.4e}" if si.p_value >= 1e-4 else "<2e-16"
            lines.append(
                f"{si.term_name:>16s} {si.edf:8.3f} {si.ref_df:8.3f} "
                f"{si.chi_sq:10.2f} {p_str:>10s} {stars}"
            )
        lines.append("")

        # Diagnostics footer
        dev_expl_pct = self.deviance_explained * 100.0
        r2 = self.r_squared_adj
        lines.append(
            f"R-sq.(adj) = {r2:.3f}   "
            f"Deviance explained = {dev_expl_pct:.1f}%"
        )
        if p.method == "REML" and p.reml_score is not None:
            lines.append(
                f"-REML = {p.reml_score:.4g}  "
                f"Scale est. = {p.scale:.4g}  n = {p.n_obs}"
            )
        elif p.dispersion_fixed:
            # Known scale: UBRE was the selection criterion (mgcv GCV.Cp).
            lines.append(
                f"UBRE = {p.ubre:.4g}  Scale est. = {p.scale:.4g}  "
                f"n = {p.n_obs}"
            )
        else:
            lines.append(
                f"GCV = {p.gcv:.4g}  Scale est. = {p.scale:.4g}  n = {p.n_obs}"
            )

        return "\n".join(lines)


    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _param_se(self, idx: int) -> float:
        """Standard error for the *idx*-th coefficient.

        From the stored Bayesian posterior covariance
        ``V_beta = scale * (X'WX + sum lam*S)^{-1}`` (mgcv's ``Vp``) —
        the same quantity ``summary.gam`` uses for its parametric table.
        """
        v = float(self._result.params.covariance[idx, idx])
        return float(np.sqrt(v)) if v > 0.0 else 0.0

    def __repr__(self) -> str:
        p = self._result.params
        n_smooth = len(self._smooth_infos)
        return (
            f"GAMSolution(family={p.family_name!r}, "
            f"n_smooths={n_smooth}, "
            f"deviance_explained={self.deviance_explained:.1%})"
        )