Source code for pylimma.selmod

# SPDX-License-Identifier: GPL-3.0-or-later
#
# This module is a Python port of code from R limma. Original R copyrights:
#   selmod.R  Copyright (C) 2008 Alicia Oshlack, Gordon Smyth
# Python port: Copyright (C) 2026 John Mulvey
"""
``select_model`` - AIC / BIC / Mallows' Cp model selection for gene-wise
linear models. Port of R limma's ``selectModel``.
"""

from __future__ import annotations

from typing import Sequence

import numpy as np
import pandas as pd


[docs] def select_model( y: np.ndarray, design_list: Sequence[np.ndarray] | dict, criterion: str = "aic", df_prior: float = 0, s2_prior: float | None = None, s2_true: np.ndarray | None = None, **lmfit_kwargs, ) -> dict: """ Gene-wise model comparison via AIC, BIC, or Mallows' Cp. Port of R limma's ``selectModel(y, designlist, criterion, df.prior, s2.prior, s2.true, ...)`` (``selmod.R``). Parameters ---------- y : ndarray or DataFrame Expression matrix, rows = genes, columns = arrays. No NaNs. design_list : sequence or dict of design matrices Collection of candidate design matrices. When a dict is supplied its keys become the model names (matching R's ``names(designlist)``). criterion : {"aic", "bic", "mallowscp"}, default "aic" df_prior : float, default 0 Prior degrees of freedom for the variance estimate. s2_prior : float, optional Prior variance. Required when ``df_prior > 0``. s2_true : ndarray, optional True per-gene variance (required when ``criterion="mallowscp"``). **lmfit_kwargs : forwarded to :func:`lm_fit`. Returns ------- dict ``IC``: ``DataFrame`` of per-model information criteria. ``pref``: ``pd.Categorical`` of preferred model per gene. ``criterion``: the criterion actually used. """ from .lmfit import lm_fit y = np.asarray(y, dtype=np.float64) if np.any(np.isnan(y)): raise ValueError("NAs not allowed") narrays = y.shape[1] if isinstance(design_list, dict): model_names = list(design_list.keys()) designs = list(design_list.values()) else: designs = list(design_list) model_names = [str(i + 1) for i in range(len(designs))] nmodels = len(designs) if df_prior > 0 and s2_prior is None: raise ValueError("s2_prior must be set when df_prior > 0") if df_prior == 0: s2_prior = 0.0 criterion = criterion.lower() if criterion not in ("aic", "bic", "mallowscp"): raise ValueError("criterion must be one of 'aic', 'bic', 'mallowscp'") IC = None if criterion == "mallowscp": if s2_true is None: raise ValueError("Need s2.true values") s2_true = np.asarray(s2_true, dtype=np.float64) for i, design in enumerate(designs): fit = lm_fit(y, design, **lmfit_kwargs) df_res = np.asarray(fit["df_residual"], dtype=np.float64) npar = narrays - float(df_res[0]) if IC is None: IC = np.full((y.shape[0], nmodels), np.nan) if s2_true.size != y.shape[0] and s2_true.size != 1: raise ValueError("s2_true wrong length") sigma = np.asarray(fit["sigma"], dtype=np.float64) IC[:, i] = df_res * sigma**2 / s2_true + npar * 2 - narrays else: ntotal = df_prior + narrays penalty = np.log(narrays) if criterion == "bic" else 2.0 for i, design in enumerate(designs): fit = lm_fit(y, design, **lmfit_kwargs) df_res = np.asarray(fit["df_residual"], dtype=np.float64) sigma = np.asarray(fit["sigma"], dtype=np.float64) npar = narrays - float(df_res[0]) + 1 s2_post = (df_prior * s2_prior + df_res * sigma**2) / ntotal if IC is None: IC = np.full((y.shape[0], nmodels), np.nan) IC[:, i] = ntotal * np.log(s2_post) + npar * penalty IC_df = pd.DataFrame(IC, columns=model_names) pref_idx = np.argmin(IC, axis=1) pref = pd.Categorical.from_codes(pref_idx, categories=model_names) return {"IC": IC_df, "pref": pref, "criterion": criterion}