Source code for pylimma.contrasts

# SPDX-License-Identifier: GPL-3.0-or-later
#
# This module is a Python port of code from R limma. Original R copyrights:
#   contrasts.R                Copyright (C) 2002-2024 Gordon Smyth
#   contrastAsCoef.R           Copyright (C) 2013-2025 Gordon Smyth
#   modelmatrix.R              Copyright (C) 2003-2005 Gordon Smyth
# Python port: Copyright (C) 2026 John Mulvey
"""
Contrast matrices and contrast fitting for pylimma.

Implements:
- model_matrix(): create design matrices from formula strings
- make_contrasts(): create contrast matrices from expressions
- contrasts_fit(): apply contrasts to a fitted model
- contrast_as_coef(): re-express contrasts as coefficients
"""

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from scipy import linalg

from .classes import MArrayLM, _resolve_fit_input

if TYPE_CHECKING:
    pass



[docs]
def model_matrix(
    formula: str,
    data: pd.DataFrame,
) -> np.ndarray:
    """
    Create a design matrix from a formula and data.

    This function creates design matrices from R-style formula strings,
    matching the behaviour of R's model.matrix() with default contrasts
    (contr.treatment / dummy coding).

    Parameters
    ----------
    formula : str
        R-style formula string. Examples:
        - "~ group" : intercept + dummy variables for group (reference coding)
        - "~ 0 + group" or "~ group - 1" : no intercept (cell-means coding)
        - "~ group + batch" : additive model with two factors
        - "~ group + age" : factor plus numeric covariate
    data : DataFrame
        Data containing the variables referenced in the formula.
        Columns should include all variables used in the formula.

    Returns
    -------
    ndarray
        Design matrix of shape (n_samples, n_coefficients).
        Use model_matrix_with_names() if column names are needed.

    Examples
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({
    ...     'group': ['A', 'A', 'B', 'B', 'C', 'C'],
    ...     'age': [25, 30, 35, 40, 45, 50]
    ... })
    >>> model_matrix("~ group", data)
    array([[1., 0., 0.],
           [1., 0., 0.],
           [1., 1., 0.],
           [1., 1., 0.],
           [1., 0., 1.],
           [1., 0., 1.]])

    >>> model_matrix("~ 0 + group", data)
    array([[1., 0., 0.],
           [1., 0., 0.],
           [0., 1., 0.],
           [0., 1., 0.],
           [0., 0., 1.],
           [0., 0., 1.]])

    Notes
    -----
    This function uses patsy for formula parsing with Treatment coding
    to match R's default contr.treatment contrast scheme.

    See Also
    --------
    make_contrasts : Create contrast matrices for hypothesis testing
    """
    try:
        import patsy
    except ImportError:
        raise ImportError(
            "patsy is required for formula-based design matrices. Install with: pip install patsy"
        )

    # Use Treatment coding to match R's contr.treatment (reference coding)
    # This is patsy's default, but we set it explicitly for clarity
    design_info = patsy.dmatrix(formula, data, return_type="dataframe")

    return np.asarray(design_info, dtype=np.float64)



def model_matrix_with_names(
    formula: str,
    data: pd.DataFrame,
) -> pd.DataFrame:
    """
    Create a design matrix from a formula, returning a DataFrame with column names.

    This is the same as model_matrix() but returns a DataFrame preserving
    the column names, which is useful for inspecting the design structure.

    Parameters
    ----------
    formula : str
        R-style formula string (see model_matrix for examples).
    data : DataFrame
        Data containing the variables referenced in the formula.

    Returns
    -------
    DataFrame
        Design matrix with named columns.

    See Also
    --------
    model_matrix : Returns a numpy array (faster, no column names)
    """
    try:
        import patsy
    except ImportError:
        raise ImportError(
            "patsy is required for formula-based design matrices. Install with: pip install patsy"
        )

    return patsy.dmatrix(formula, data, return_type="dataframe")



[docs]
def make_contrasts(
    *contrasts_args: str,
    contrasts: list[str] | None = None,
    levels: list[str] | np.ndarray | pd.DataFrame,
    **named_contrasts: str,
) -> pd.DataFrame:
    """
    Construct a contrast matrix from contrast expressions.

    Parameters
    ----------
    *contrasts_args : str
        Contrast expressions like "B-A", "C-A", "(B+C)/2-A".
        Each expression becomes a column in the contrast matrix.
        The column name is the expression itself.
    contrasts : list of str, optional
        Alternative way to pass contrasts as a list (R parity).
        Cannot be used together with positional contrasts.
    levels : list of str, ndarray, or DataFrame
        Coefficient names. Can be:
        - List of coefficient names
        - Design matrix (column names extracted)
        - Factor (levels extracted)
    **named_contrasts : str
        Named contrast expressions. The keyword becomes the column name,
        the value is the expression. E.g., TreatmentVsControl="B-A".

    Returns
    -------
    DataFrame
        Contrast matrix of shape (n_levels, n_contrasts).
        Row index contains level names, columns contain contrast names.

    Examples
    --------
    >>> # Unnamed contrasts (expression becomes name)
    >>> make_contrasts("B-A", "C-A", levels=['A', 'B', 'C'])
           B-A  C-A
    A     -1.0 -1.0
    B      1.0  0.0
    C      0.0  1.0

    >>> # Named contrasts
    >>> make_contrasts(
    ...     TreatmentVsControl="B-A",
    ...     DrugVsDMSO="C-A",
    ...     levels=['A', 'B', 'C']
    ... )
           TreatmentVsControl  DrugVsDMSO
    A                    -1.0        -1.0
    B                     1.0         0.0
    C                     0.0         1.0

    >>> # Mixed: unnamed and named
    >>> make_contrasts("C-B", AvsRest="A-(B+C)/2", levels=['A', 'B', 'C'])

    Notes
    -----
    The contrast expressions are evaluated in an environment where each
    level name is bound to an indicator vector. For example, with levels
    ['A', 'B', 'C'], the expression "B-A" evaluates to [0, 1, 0] - [1, 0, 0]
    = [-1, 1, 0].
    """
    # Extract level names
    if isinstance(levels, pd.DataFrame):
        levels = list(levels.columns)
    elif isinstance(levels, np.ndarray):
        if hasattr(levels, "columns"):
            levels = list(levels.columns)
        elif levels.ndim == 2:
            levels = [f"x{i}" for i in range(levels.shape[1])]
        else:
            levels = list(levels)
    else:
        levels = list(levels)

    # Handle R's "(Intercept)" naming
    if levels and levels[0] == "(Intercept)":
        levels[0] = "Intercept"

    n = len(levels)
    if n < 1:
        raise ValueError("No levels to construct contrasts from")

    # Validate level names are valid Python identifiers (R parity)
    invalid = [lev for lev in levels if not lev.isidentifier()]
    if invalid:
        raise ValueError(
            f"Level names must be valid Python identifiers. Invalid names: {', '.join(invalid)}"
        )

    # Create indicator vectors for each level
    indicators = {lev: np.zeros(n) for lev in levels}
    for i, lev in enumerate(levels):
        indicators[lev][i] = 1.0

    # Handle contrasts= parameter (R parity)
    if contrasts is not None:
        if contrasts_args:
            raise ValueError("Cannot specify both positional contrasts and contrasts= parameter")
        contrast_exprs = list(contrasts)
    else:
        contrast_exprs = list(contrasts_args)

    # Combine unnamed and named contrasts
    # Unnamed: expression is both the name and expression
    # Named: keyword is name, value is expression
    all_contrasts = [(expr, expr) for expr in contrast_exprs]
    all_contrasts.extend((name, expr) for name, expr in named_contrasts.items())

    if not all_contrasts:
        raise ValueError("No contrasts specified")

    # Evaluate each contrast expression
    n_contrasts = len(all_contrasts)
    contrast_matrix = np.zeros((n, n_contrasts))
    contrast_names = []

    for j, (name, expr) in enumerate(all_contrasts):
        contrast_names.append(name)

        # Evaluate the expression
        try:
            result = eval(expr, {"__builtins__": {}}, indicators)
            contrast_matrix[:, j] = np.asarray(result)
        except Exception as e:
            raise ValueError(f"Could not evaluate contrast expression '{expr}': {e}")

    # Return as DataFrame with named rows and columns
    return pd.DataFrame(contrast_matrix, index=levels, columns=contrast_names)




[docs]
def contrasts_fit(
    data,
    contrasts: np.ndarray | pd.DataFrame | None = None,
    coefficients: int | str | list | None = None,
    key: str = "pylimma",
) -> dict | None:
    """
    Apply contrast matrix to a fitted model.

    Transforms coefficients and standard errors to reflect contrasts of
    interest rather than the original model parameterisation.

    Parameters
    ----------
    data : AnnData or dict
        Either an AnnData object with fit results in adata.uns[key],
        or a dict returned by lm_fit().
    contrasts : ndarray or DataFrame, optional
        Contrast matrix of shape (n_original_coefs, n_contrasts).
        Each column defines a contrast. If DataFrame, column names are
        preserved as contrast names.
    coefficients : int, str, or list, optional
        Alternative to `contrasts`. Specifies which coefficients to keep
        in the revised fit object. Can be indices (int), names (str), or
        a list of either. This is a simpler way to subset coefficients
        without defining a full contrast matrix.

        .. warning::
           Integer indices are **0-based** (Python convention). R's
           ``contrasts.fit(fit, coefficients=c(2, 3))`` uses 1-based
           indices; the equivalent pylimma call is
           ``contrasts_fit(fit, coefficients=[1, 2])``. Prefer string
           names when porting R code to avoid silent off-by-one errors.
    key : str, default "pylimma"
        Key for fit results in adata.uns (AnnData input only).

    Returns
    -------
    dict or None
        If input is dict, returns updated dict with transformed coefficients.
        If input is AnnData, updates adata.uns[key] in place and returns None.

    Notes
    -----
    Exactly one of `contrasts` or `coefficients` must be provided.

    The `coefficients` parameter provides a simpler way to specify the
    contrasts matrix when the desired contrasts are just a subset of
    the original coefficients.

    The transformation preserves the relationship between coefficients and
    their standard errors. For orthogonal designs, the standard errors
    transform simply. For non-orthogonal designs, the correlation structure
    is accounted for.

    Any previous test statistics (t, p-values, etc.) are removed since they
    are no longer valid after the transformation.

    References
    ----------
    Smyth, G. K. (2004). Linear models and empirical Bayes methods for
    assessing differential expression in microarray experiments.
    Statistical Applications in Genetics and Molecular Biology, 3(1), Article 3.
    """
    fit, _adata, _adata_key = _resolve_fit_input(data, key)
    is_anndata = _adata is not None

    # Validate fit object
    if "coefficients" not in fit:
        raise ValueError("fit must contain coefficients")
    if "stdev_unscaled" not in fit:
        raise ValueError("fit must contain stdev_unscaled")

    # Check that exactly one of contrasts or coefficients is provided
    if contrasts is None and coefficients is None:
        raise ValueError("Must specify either 'contrasts' or 'coefficients'")
    if contrasts is not None and coefficients is not None:
        raise ValueError("Cannot specify both 'contrasts' and 'coefficients'")

    # Remove any previous test statistics
    fit = {k: v for k, v in fit.items() if k not in ("t", "p_value", "lods", "F", "F_p_value")}

    # Get dimensions
    fit_coef = fit["coefficients"]
    stdev_unscaled = fit["stdev_unscaled"]
    n_genes, n_coef = fit_coef.shape

    # Handle coefficients parameter - convert to contrast matrix
    contrast_names = None
    if coefficients is not None:
        # Normalize to list
        if not isinstance(coefficients, (list, tuple)):
            coefficients = [coefficients]

        # Get coefficient names from fit if available
        coef_names = fit.get("coef_names")

        # Convert names to indices and build contrast matrix
        coef_indices = []
        selected_names = []
        for c in coefficients:
            if isinstance(c, str):
                if coef_names is None:
                    raise ValueError(f"Cannot use coefficient name '{c}' - no coef_names in fit")
                if c not in coef_names:
                    raise ValueError(f"Coefficient '{c}' not found. Available: {coef_names}")
                idx = coef_names.index(c)
                coef_indices.append(idx)
                selected_names.append(c)
            else:
                coef_indices.append(int(c))
                if coef_names is not None:
                    selected_names.append(coef_names[int(c)])
                else:
                    selected_names.append(f"coef{int(c)}")

        # Build identity-like contrast matrix for selected coefficients
        contrasts = np.zeros((n_coef, len(coef_indices)))
        for j, idx in enumerate(coef_indices):
            contrasts[idx, j] = 1.0
        contrast_names = selected_names

    # Extract contrast names if DataFrame, then convert to array.
    # Capture row names too so we can replicate R's row/col name check.
    contrast_rownames = None
    if isinstance(contrasts, pd.DataFrame):
        contrast_names = list(contrasts.columns)
        contrast_rownames = list(contrasts.index)
        contrasts = contrasts.values

    # R contrasts.R:31: `if(!is.numeric(contrasts)) stop("contrasts
    # must be a numeric matrix")`. Reject logical / character inputs
    # before the float coercion silently turns booleans into 0/1 and
    # parses string digits.
    contrasts_raw = np.asarray(contrasts)
    if contrasts_raw.dtype == np.bool_ or not np.issubdtype(contrasts_raw.dtype, np.number):
        raise ValueError("contrasts must be a numeric matrix")
    contrasts = contrasts_raw.astype(np.float64, copy=False)
    if contrasts.ndim == 1:
        contrasts = contrasts.reshape(-1, 1)
    if contrasts.shape[0] != n_coef:
        raise ValueError(
            f"Number of rows in contrasts ({contrasts.shape[0]}) must match "
            f"number of coefficients ({n_coef})"
        )
    if np.any(np.isnan(contrasts)):
        raise ValueError("NAs not allowed in contrasts")

    # R contrasts.R:35-40: rn = rownames(contrasts), cn = colnames(
    # fit$coefficients); rename "(Intercept)" -> "Intercept" in both
    # then warn if they don't match.
    fit_coef_names = fit.get("coef_names")
    if contrast_rownames is not None and fit_coef_names is not None:
        rn = list(contrast_rownames)
        cn = list(fit_coef_names)
        if rn and rn[0] == "(Intercept)":
            rn[0] = "Intercept"
        if cn and cn[0] == "(Intercept)":
            cn[0] = "Intercept"
        if rn != cn:
            warnings.warn("row names of contrasts don't match col names of coefficients")

    fit["contrasts"] = contrasts
    n_contrasts = contrasts.shape[1]

    # Store contrast names (generate default names if not provided)
    if contrast_names is None:
        contrast_names = [f"contrast{i}" for i in range(n_contrasts)]
    fit["contrast_names"] = contrast_names

    # Handle empty contrasts
    if n_contrasts == 0:
        fit["coefficients"] = fit_coef[:, :0]
        fit["stdev_unscaled"] = stdev_unscaled[:, :0]
        # Match R's fit[,0] subsetting: every coefficient-indexed slot
        # collapses to its 0-col form so the returned MArrayLM is
        # internally shape-consistent.
        if fit.get("cov_coefficients") is not None:
            cov = np.asarray(fit["cov_coefficients"])
            fit["cov_coefficients"] = cov[:0, :0]
        if fit.get("var_prior") is not None:
            fit["var_prior"] = np.asarray(fit["var_prior"])[:0]
        if fit.get("coef_names") is not None:
            fit["coef_names"] = []
        if is_anndata:
            # Plain dict for h5ad compatibility; see lm_fit.
            data.uns[key] = dict(fit)
            return None
        if not isinstance(fit, MArrayLM):
            fit = MArrayLM(fit)
        return fit

    # R's contrasts.fit strips rows (coefficients) that are zero in every
    # contrast column before the orthogonality check (contrasts.R:77-85).
    # Comment there says "Not necessary but can make the function faster",
    # but removing those rows also changes which correlations enter the
    # lower.tri(cormatrix) orthog test and can flip the code path on
    # pathological contrasts that leave an unused-but-correlated
    # coefficient.
    contrasts_all_zero = np.where(np.all(contrasts == 0, axis=1))[0]
    if contrasts_all_zero.size and contrasts_all_zero.size < n_coef:
        keep = np.setdiff1d(np.arange(n_coef), contrasts_all_zero)
        contrasts = contrasts[keep, :]
        fit_coef = fit_coef[:, keep]
        stdev_unscaled = stdev_unscaled[:, keep]
        if fit.get("cov_coefficients") is not None:
            cov_existing = np.asarray(fit["cov_coefficients"])
            fit["cov_coefficients"] = cov_existing[np.ix_(keep, keep)]
        n_coef = len(keep)
        fit["contrasts"] = contrasts

    # Get or construct covariance matrix
    if "cov_coefficients" not in fit or fit["cov_coefficients"] is None:
        warnings.warn("cov_coefficients not found in fit - assuming coefficients are orthogonal")
        var_coef = np.nanmean(stdev_unscaled**2, axis=0)
        cov_coefficients = np.diag(var_coef)
        cormatrix = np.eye(n_coef)
        orthog = True
    else:
        cov_coefficients = fit["cov_coefficients"]
        # Handle NaN in cov matrix (non-estimable coefficients)
        valid_idx = ~np.isnan(np.diag(cov_coefficients))
        if not np.all(valid_idx):
            # Reduce to estimable coefficients
            est_idx = np.where(valid_idx)[0]
            cov_coefficients = cov_coefficients[np.ix_(est_idx, est_idx)]

            # Check contrasts only use estimable coefficients
            non_est_idx = np.where(~valid_idx)[0]
            if np.any(contrasts[non_est_idx, :] != 0):
                raise ValueError("Trying to take contrast of non-estimable coefficient")

            contrasts = contrasts[est_idx, :]
            fit_coef = fit_coef[:, est_idx]
            stdev_unscaled = stdev_unscaled[:, est_idx]
            n_coef = len(est_idx)

        # Compute correlation matrix
        std = np.sqrt(np.diag(cov_coefficients))
        std[std == 0] = 1  # Avoid division by zero
        cormatrix = cov_coefficients / np.outer(std, std)

        # Check if orthogonal
        if cormatrix.size < 2:
            orthog = True
        else:
            orthog = np.sum(np.abs(cormatrix[np.tril_indices(n_coef, -1)])) < 1e-12

    # Handle NA coefficients
    na_coef = np.any(np.isnan(fit_coef))
    if na_coef:
        na_mask = np.isnan(fit_coef)
        fit_coef = fit_coef.copy()
        stdev_unscaled = stdev_unscaled.copy()
        fit_coef[na_mask] = 0
        stdev_unscaled[na_mask] = 1e30

    # Transform coefficients
    new_coefficients = fit_coef @ contrasts

    # Transform covariance matrix
    R = np.linalg.cholesky(cov_coefficients).T  # Upper triangular
    new_cov = (R @ contrasts).T @ (R @ contrasts)
    fit["cov_coefficients"] = new_cov

    # Transform standard errors
    if orthog:
        # Simple case: variances add
        new_stdev = np.sqrt(stdev_unscaled**2 @ contrasts**2)
    else:
        # Non-orthogonal: need to account for correlations
        R_cor = np.linalg.cholesky(cormatrix).T
        new_stdev = np.zeros((n_genes, n_contrasts))
        for i in range(n_genes):
            # Scale contrasts by stdev
            scaled_contrasts = stdev_unscaled[i, :, np.newaxis] * contrasts
            RUC = R_cor @ scaled_contrasts
            new_stdev[i, :] = np.sqrt(np.sum(RUC**2, axis=0))

    # Restore NAs
    if na_coef:
        large_stdev = new_stdev > 1e20
        new_coefficients[large_stdev] = np.nan
        new_stdev[large_stdev] = np.nan

    fit["coefficients"] = new_coefficients
    fit["stdev_unscaled"] = new_stdev

    if not isinstance(fit, MArrayLM):
        fit = MArrayLM(fit)

    if is_anndata:
        # Plain dict for h5ad compatibility; see lm_fit.
        data.uns[key] = dict(fit)
        return None
    return fit



def contrast_as_coef(
    design,
    contrast=None,
    first: bool = True,
) -> dict:
    """
    Reform a design matrix so that contrasts become simple coefficients.

    Port of R limma's ``contrastAsCoef``. Re-parameterises ``design`` so
    that fitting the new design directly estimates the requested
    contrasts as coefficients.

    Parameters
    ----------
    design : ndarray or DataFrame
        Design matrix of shape (n_samples, n_coef).
    contrast : ndarray or DataFrame, optional
        Contrast matrix of shape (n_coef, n_contrasts). If ``None``,
        ``design`` is returned unchanged (matching R).
    first : bool, default True
        If True, contrast columns are placed first in the new design.
        If False, they are moved to the end.

    Returns
    -------
    dict
        ``design`` : DataFrame
            Re-parameterised design with shape (n_samples, n_coef) and
            named columns.
        ``coef`` : list of int
            0-based indices of the contrast columns within the new
            design.
        ``qr`` : dict
            QR decomposition of ``contrast`` with keys ``q``, ``r``,
            ``pivot``, ``rank``.
    """
    from .lmfit import _qr_r_style

    if isinstance(design, pd.DataFrame):
        design_arr = np.asarray(design.values, dtype=np.float64)
    else:
        design_arr = np.asarray(design, dtype=np.float64)
    if design_arr.ndim == 1:
        design_arr = design_arr.reshape(-1, 1)

    if contrast is None:
        return design_arr

    contrast_names = None
    if isinstance(contrast, pd.DataFrame):
        contrast_names = list(contrast.columns)
        contrast_arr = np.asarray(contrast.values, dtype=np.float64)
    else:
        contrast_arr = np.asarray(contrast, dtype=np.float64)
    if contrast_arr.ndim == 1:
        contrast_arr = contrast_arr.reshape(-1, 1)

    if design_arr.shape[1] != contrast_arr.shape[0]:
        raise ValueError("Length of contrast doesn't match ncol(design)")

    q, r, pivot, rank = _qr_r_style(contrast_arr)
    if rank == 0:
        raise ValueError("contrast is all zero")

    designT = q.T @ design_arr.T
    R_block = r[:rank, :rank]
    designT[:rank, :] = linalg.solve_triangular(R_block, designT[:rank, :])
    new_design = designT.T

    n_cols = new_design.shape[1]
    col_names = [f"Q{i + 1}" for i in range(n_cols)]
    if contrast_names is None:
        col_names[:rank] = [f"C{int(pivot[i]) + 1}" for i in range(rank)]
    else:
        col_names[:rank] = list(contrast_names[:rank])
    coef = list(range(rank))

    if not first:
        non_coef = [i for i in range(n_cols) if i not in coef]
        new_design = np.column_stack([new_design[:, non_coef], new_design[:, coef]])
        col_names = [col_names[i] for i in non_coef] + [col_names[i] for i in coef]
        coef = list(range(n_cols - rank, n_cols))

    new_design_df = pd.DataFrame(new_design, columns=col_names)

    return {
        "design": new_design_df,
        "coef": coef,
        "qr": {"q": q, "r": r, "pivot": pivot, "rank": rank},
    }