Source code for pystatistics.gam.solution

"""
User-facing GAM solution with R-style summary output.

Wraps a :class:`Result[GAMParams]` and a list of :class:`SmoothInfo`
objects, exposing convenient properties and an ``mgcv``-style
``summary()`` method.
"""

from __future__ import annotations

from typing import Any

import numpy as np
from numpy.typing import NDArray
from scipy import stats as sp_stats

from pystatistics.core.result import Result, SolutionReprMixin
from pystatistics.gam._common import GAMParams, SmoothInfo
from pystatistics.regression._formatting import significance_stars


[docs] class GAMSolution(SolutionReprMixin): """User-facing GAM result with convenient properties and R-style output. Wraps a ``Result[GAMParams]`` and provides: * Coefficient access. * Smooth-term information (EDF, approximate significance tests). * R-style ``summary()`` matching ``mgcv::summary.gam()``. """ __slots__ = ("_result", "_smooth_infos", "_names")
[docs] def __init__( self, _result: Result[GAMParams], _smooth_infos: list[SmoothInfo], _names: list[str] | None = None, ) -> None: """Initialise the solution wrapper. Args: _result: The generic ``Result`` envelope carrying ``GAMParams``. _smooth_infos: One ``SmoothInfo`` per smooth term. _names: Optional names for parametric coefficients. """ self._result = _result self._smooth_infos = _smooth_infos self._names = _names
# ------------------------------------------------------------------ # Properties delegating to GAMParams # ------------------------------------------------------------------ @property def params(self) -> GAMParams: """The underlying frozen parameter payload.""" return self._result.params @property def coefficients(self) -> NDArray: """Full coefficient vector (parametric + basis).""" return self._result.params.coefficients @property def fitted_values(self) -> NDArray: """Response-scale predictions.""" return self._result.params.fitted_values @property def residuals(self) -> NDArray: """Working residuals (y - mu_hat).""" return self._result.params.residuals @property def edf(self) -> NDArray: """Effective degrees of freedom per smooth term.""" return self._result.params.edf @property def total_edf(self) -> float: """Total effective degrees of freedom.""" return self._result.params.total_edf @property def smooth_terms(self) -> list[SmoothInfo]: """Information about each smooth term.""" return list(self._smooth_infos) @property def deviance(self) -> float: """Model deviance.""" return self._result.params.deviance @property def null_deviance(self) -> float: """Null-model deviance.""" return self._result.params.null_deviance @property def deviance_explained(self) -> float: """Proportion of null deviance explained: ``1 - dev / null_dev``.""" if self.null_deviance == 0.0: return 0.0 return 1.0 - self.deviance / self.null_deviance @property def aic(self) -> float: """Akaike information criterion.""" return self._result.params.aic @property def gcv(self) -> float: """GCV score.""" return self._result.params.gcv @property def scale(self) -> float: """Estimated dispersion parameter.""" return self._result.params.scale @property def ubre(self) -> float: """UBRE / scaled-AIC score (the criterion mgcv's GCV.Cp minimises for known-scale families).""" return self._result.params.ubre @property def lambdas(self) -> NDArray: """Selected (or user-fixed) smoothing parameters, one per smooth. Reported in the same penalty scaling mgcv reports ``sp`` in; divide by ``params.s_scales`` for function-space units. """ return self._result.params.lambdas @property def covariance(self) -> NDArray: """Bayesian posterior covariance of the coefficients (mgcv ``Vp``).""" return self._result.params.covariance @property def se(self) -> NDArray: """Standard errors for ALL coefficients (posterior, mgcv-style).""" return np.sqrt(np.maximum( np.diag(self._result.params.covariance), 0.0, )) @property def reml_score(self) -> float | None: """Laplace REML criterion (``None`` unless ``method='REML'``).""" return self._result.params.reml_score @property def converged(self) -> bool: """Whether the P-IRLS algorithm converged.""" return self._result.params.converged @property def outer_converged(self) -> bool: """Whether the smoothing-parameter search converged off-boundary.""" return self._result.params.outer_converged @property def n_iter(self) -> int: """Number of P-IRLS iterations executed.""" return self._result.params.n_iter # ------------------------------------------------------------------ # R-squared (adjusted) # ------------------------------------------------------------------ @property def r_squared_adj(self) -> float: """Adjusted R-squared based on deviance explained. Matches the formula used by ``mgcv``:: R_adj = 1 - (deviance / df_resid) / (null_deviance / df_null) """ n = self._result.params.n_obs df_resid = max(n - self.total_edf, 1.0) df_null = max(n - 1.0, 1.0) if self.null_deviance == 0.0: return 0.0 return 1.0 - (self.deviance / df_resid) / (self.null_deviance / df_null) # ------------------------------------------------------------------ # Summary # ------------------------------------------------------------------
[docs] def summary(self) -> str: """R-style summary matching ``summary(gam(...))``. Includes: * Family and link function. * Parametric coefficient table with standard errors and tests. * Approximate significance tests for each smooth term. * Adjusted R-squared, deviance explained, GCV, and scale. Returns: Multi-line summary string. """ p = self._result.params lines: list[str] = [] lines.append(f"Family: {p.family_name}") lines.append(f"Link function: {p.link_name}") lines.append("") # Formula reconstruction: parametric terms first, then smooths. n_param = ( self._smooth_infos[0].coef_indices[0] if self._smooth_infos else len(p.coefficients) ) if self._names: param_terms = [ nm for nm in self._names if nm.strip("()") not in ("Intercept", "intercept") ] else: # Unnamed: report the count honestly rather than inventing names. param_terms = ( [f"X[{i}]" for i in range(1, n_param)] if n_param > 1 else [] ) pieces = param_terms + [si.term_name for si in self._smooth_infos] formula = f"y ~ {' + '.join(pieces)}" if pieces else "y ~ 1" lines.append(f"Formula: {formula}") lines.append("") # Parametric coefficients lines.append("Parametric coefficients:") # names length is validated against the parametric block in gam(); # n_param always comes from the model structure, never from names. param_names = ( list(self._names) if self._names else [f"B[{i}]" for i in range(n_param)] ) stat_label = "z" if p.dispersion_fixed else "t" lines.append( f"{'':>16s} {'Estimate':>10s} {'Std. Error':>10s} " f"{stat_label + ' value':>10s} {'Pr(>|' + stat_label + '|)':>10s}" ) # Known-scale families: z reference; estimated scale: t with the # residual df (mgcv's summary.gam convention). scale_known = p.dispersion_fixed resid_df = max(p.n_obs - p.total_edf, 1.0) for i, name in enumerate(param_names): coef_val = float(p.coefficients[i]) se = self._param_se(i) z_val = coef_val / se if se > 0 else 0.0 if scale_known: p_val = 2.0 * (1.0 - sp_stats.norm.cdf(abs(z_val))) else: p_val = 2.0 * float(sp_stats.t.sf(abs(z_val), resid_df)) stars = significance_stars(p_val) p_str = f"{p_val:.4e}" if p_val >= 1e-4 else "<2e-16" lines.append( f"{name:>16s} {coef_val:10.4f} {se:10.4f} " f"{z_val:10.3f} {p_str:>10s} {stars}" ) lines.append("") # Approximate significance of smooth terms (mgcv labels the # statistic Chi.sq for fixed dispersion, F when it is estimated). lines.append("Approximate significance of smooth terms:") smooth_stat = "Chi.sq" if p.dispersion_fixed else "F" lines.append( f"{'':>16s} {'edf':>8s} {'Ref.df':>8s} " f"{smooth_stat:>10s} {'p-value':>10s}" ) for si in self._smooth_infos: stars = significance_stars(si.p_value) p_str = f"{si.p_value:.4e}" if si.p_value >= 1e-4 else "<2e-16" lines.append( f"{si.term_name:>16s} {si.edf:8.3f} {si.ref_df:8.3f} " f"{si.chi_sq:10.2f} {p_str:>10s} {stars}" ) lines.append("") # Diagnostics footer dev_expl_pct = self.deviance_explained * 100.0 r2 = self.r_squared_adj lines.append( f"R-sq.(adj) = {r2:.3f} " f"Deviance explained = {dev_expl_pct:.1f}%" ) if p.method == "REML" and p.reml_score is not None: lines.append( f"-REML = {p.reml_score:.4g} " f"Scale est. = {p.scale:.4g} n = {p.n_obs}" ) elif p.dispersion_fixed: # Known scale: UBRE was the selection criterion (mgcv GCV.Cp). lines.append( f"UBRE = {p.ubre:.4g} Scale est. = {p.scale:.4g} " f"n = {p.n_obs}" ) else: lines.append( f"GCV = {p.gcv:.4g} Scale est. = {p.scale:.4g} n = {p.n_obs}" ) return "\n".join(lines)
# ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ def _param_se(self, idx: int) -> float: """Standard error for the *idx*-th coefficient. From the stored Bayesian posterior covariance ``V_beta = scale * (X'WX + sum lam*S)^{-1}`` (mgcv's ``Vp``) — the same quantity ``summary.gam`` uses for its parametric table. """ v = float(self._result.params.covariance[idx, idx]) return float(np.sqrt(v)) if v > 0.0 else 0.0 def __repr__(self) -> str: p = self._result.params n_smooth = len(self._smooth_infos) return ( f"GAMSolution(family={p.family_name!r}, " f"n_smooths={n_smooth}, " f"deviance_explained={self.deviance_explained:.1%})" )