Source code for pylimma.contrasts

# SPDX-License-Identifier: GPL-3.0-or-later
#
# This module is a Python port of code from R limma. Original R copyrights:
#   contrasts.R                Copyright (C) 2002-2024 Gordon Smyth
#   contrastAsCoef.R           Copyright (C) 2013-2025 Gordon Smyth
#   modelmatrix.R              Copyright (C) 2003-2005 Gordon Smyth
# Python port: Copyright (C) 2026 John Mulvey
"""
Contrast matrices and contrast fitting for pylimma.

Implements:
- model_matrix(): create design matrices from formula strings
- make_contrasts(): create contrast matrices from expressions
- contrasts_fit(): apply contrasts to a fitted model
- contrast_as_coef(): re-express contrasts as coefficients
"""

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from scipy import linalg

from .classes import MArrayLM, _resolve_fit_input

if TYPE_CHECKING:
    pass


[docs] def model_matrix( formula: str, data: pd.DataFrame, ) -> np.ndarray: """ Create a design matrix from a formula and data. This function creates design matrices from R-style formula strings, matching the behaviour of R's model.matrix() with default contrasts (contr.treatment / dummy coding). Parameters ---------- formula : str R-style formula string. Examples: - "~ group" : intercept + dummy variables for group (reference coding) - "~ 0 + group" or "~ group - 1" : no intercept (cell-means coding) - "~ group + batch" : additive model with two factors - "~ group + age" : factor plus numeric covariate data : DataFrame Data containing the variables referenced in the formula. Columns should include all variables used in the formula. Returns ------- ndarray Design matrix of shape (n_samples, n_coefficients). Use model_matrix_with_names() if column names are needed. Examples -------- >>> import pandas as pd >>> data = pd.DataFrame({ ... 'group': ['A', 'A', 'B', 'B', 'C', 'C'], ... 'age': [25, 30, 35, 40, 45, 50] ... }) >>> model_matrix("~ group", data) array([[1., 0., 0.], [1., 0., 0.], [1., 1., 0.], [1., 1., 0.], [1., 0., 1.], [1., 0., 1.]]) >>> model_matrix("~ 0 + group", data) array([[1., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 1., 0.], [0., 0., 1.], [0., 0., 1.]]) Notes ----- This function uses patsy for formula parsing with Treatment coding to match R's default contr.treatment contrast scheme. See Also -------- make_contrasts : Create contrast matrices for hypothesis testing """ try: import patsy except ImportError: raise ImportError( "patsy is required for formula-based design matrices. Install with: pip install patsy" ) # Use Treatment coding to match R's contr.treatment (reference coding) # This is patsy's default, but we set it explicitly for clarity design_info = patsy.dmatrix(formula, data, return_type="dataframe") return np.asarray(design_info, dtype=np.float64)
def model_matrix_with_names( formula: str, data: pd.DataFrame, ) -> pd.DataFrame: """ Create a design matrix from a formula, returning a DataFrame with column names. This is the same as model_matrix() but returns a DataFrame preserving the column names, which is useful for inspecting the design structure. Parameters ---------- formula : str R-style formula string (see model_matrix for examples). data : DataFrame Data containing the variables referenced in the formula. Returns ------- DataFrame Design matrix with named columns. See Also -------- model_matrix : Returns a numpy array (faster, no column names) """ try: import patsy except ImportError: raise ImportError( "patsy is required for formula-based design matrices. Install with: pip install patsy" ) return patsy.dmatrix(formula, data, return_type="dataframe")
[docs] def make_contrasts( *contrasts_args: str, contrasts: list[str] | None = None, levels: list[str] | np.ndarray | pd.DataFrame, **named_contrasts: str, ) -> pd.DataFrame: """ Construct a contrast matrix from contrast expressions. Parameters ---------- *contrasts_args : str Contrast expressions like "B-A", "C-A", "(B+C)/2-A". Each expression becomes a column in the contrast matrix. The column name is the expression itself. contrasts : list of str, optional Alternative way to pass contrasts as a list (R parity). Cannot be used together with positional contrasts. levels : list of str, ndarray, or DataFrame Coefficient names. Can be: - List of coefficient names - Design matrix (column names extracted) - Factor (levels extracted) **named_contrasts : str Named contrast expressions. The keyword becomes the column name, the value is the expression. E.g., TreatmentVsControl="B-A". Returns ------- DataFrame Contrast matrix of shape (n_levels, n_contrasts). Row index contains level names, columns contain contrast names. Examples -------- >>> # Unnamed contrasts (expression becomes name) >>> make_contrasts("B-A", "C-A", levels=['A', 'B', 'C']) B-A C-A A -1.0 -1.0 B 1.0 0.0 C 0.0 1.0 >>> # Named contrasts >>> make_contrasts( ... TreatmentVsControl="B-A", ... DrugVsDMSO="C-A", ... levels=['A', 'B', 'C'] ... ) TreatmentVsControl DrugVsDMSO A -1.0 -1.0 B 1.0 0.0 C 0.0 1.0 >>> # Mixed: unnamed and named >>> make_contrasts("C-B", AvsRest="A-(B+C)/2", levels=['A', 'B', 'C']) Notes ----- The contrast expressions are evaluated in an environment where each level name is bound to an indicator vector. For example, with levels ['A', 'B', 'C'], the expression "B-A" evaluates to [0, 1, 0] - [1, 0, 0] = [-1, 1, 0]. """ # Extract level names if isinstance(levels, pd.DataFrame): levels = list(levels.columns) elif isinstance(levels, np.ndarray): if hasattr(levels, "columns"): levels = list(levels.columns) elif levels.ndim == 2: levels = [f"x{i}" for i in range(levels.shape[1])] else: levels = list(levels) else: levels = list(levels) # Handle R's "(Intercept)" naming if levels and levels[0] == "(Intercept)": levels[0] = "Intercept" n = len(levels) if n < 1: raise ValueError("No levels to construct contrasts from") # Validate level names are valid Python identifiers (R parity) invalid = [lev for lev in levels if not lev.isidentifier()] if invalid: raise ValueError( f"Level names must be valid Python identifiers. Invalid names: {', '.join(invalid)}" ) # Create indicator vectors for each level indicators = {lev: np.zeros(n) for lev in levels} for i, lev in enumerate(levels): indicators[lev][i] = 1.0 # Handle contrasts= parameter (R parity) if contrasts is not None: if contrasts_args: raise ValueError("Cannot specify both positional contrasts and contrasts= parameter") contrast_exprs = list(contrasts) else: contrast_exprs = list(contrasts_args) # Combine unnamed and named contrasts # Unnamed: expression is both the name and expression # Named: keyword is name, value is expression all_contrasts = [(expr, expr) for expr in contrast_exprs] all_contrasts.extend((name, expr) for name, expr in named_contrasts.items()) if not all_contrasts: raise ValueError("No contrasts specified") # Evaluate each contrast expression n_contrasts = len(all_contrasts) contrast_matrix = np.zeros((n, n_contrasts)) contrast_names = [] for j, (name, expr) in enumerate(all_contrasts): contrast_names.append(name) # Evaluate the expression try: result = eval(expr, {"__builtins__": {}}, indicators) contrast_matrix[:, j] = np.asarray(result) except Exception as e: raise ValueError(f"Could not evaluate contrast expression '{expr}': {e}") # Return as DataFrame with named rows and columns return pd.DataFrame(contrast_matrix, index=levels, columns=contrast_names)
[docs] def contrasts_fit( data, contrasts: np.ndarray | pd.DataFrame | None = None, coefficients: int | str | list | None = None, key: str = "pylimma", ) -> dict | None: """ Apply contrast matrix to a fitted model. Transforms coefficients and standard errors to reflect contrasts of interest rather than the original model parameterisation. Parameters ---------- data : AnnData or dict Either an AnnData object with fit results in adata.uns[key], or a dict returned by lm_fit(). contrasts : ndarray or DataFrame, optional Contrast matrix of shape (n_original_coefs, n_contrasts). Each column defines a contrast. If DataFrame, column names are preserved as contrast names. coefficients : int, str, or list, optional Alternative to `contrasts`. Specifies which coefficients to keep in the revised fit object. Can be indices (int), names (str), or a list of either. This is a simpler way to subset coefficients without defining a full contrast matrix. .. warning:: Integer indices are **0-based** (Python convention). R's ``contrasts.fit(fit, coefficients=c(2, 3))`` uses 1-based indices; the equivalent pylimma call is ``contrasts_fit(fit, coefficients=[1, 2])``. Prefer string names when porting R code to avoid silent off-by-one errors. key : str, default "pylimma" Key for fit results in adata.uns (AnnData input only). Returns ------- dict or None If input is dict, returns updated dict with transformed coefficients. If input is AnnData, updates adata.uns[key] in place and returns None. Notes ----- Exactly one of `contrasts` or `coefficients` must be provided. The `coefficients` parameter provides a simpler way to specify the contrasts matrix when the desired contrasts are just a subset of the original coefficients. The transformation preserves the relationship between coefficients and their standard errors. For orthogonal designs, the standard errors transform simply. For non-orthogonal designs, the correlation structure is accounted for. Any previous test statistics (t, p-values, etc.) are removed since they are no longer valid after the transformation. References ---------- Smyth, G. K. (2004). Linear models and empirical Bayes methods for assessing differential expression in microarray experiments. Statistical Applications in Genetics and Molecular Biology, 3(1), Article 3. """ fit, _adata, _adata_key = _resolve_fit_input(data, key) is_anndata = _adata is not None # Validate fit object if "coefficients" not in fit: raise ValueError("fit must contain coefficients") if "stdev_unscaled" not in fit: raise ValueError("fit must contain stdev_unscaled") # Check that exactly one of contrasts or coefficients is provided if contrasts is None and coefficients is None: raise ValueError("Must specify either 'contrasts' or 'coefficients'") if contrasts is not None and coefficients is not None: raise ValueError("Cannot specify both 'contrasts' and 'coefficients'") # Remove any previous test statistics fit = {k: v for k, v in fit.items() if k not in ("t", "p_value", "lods", "F", "F_p_value")} # Get dimensions fit_coef = fit["coefficients"] stdev_unscaled = fit["stdev_unscaled"] n_genes, n_coef = fit_coef.shape # Handle coefficients parameter - convert to contrast matrix contrast_names = None if coefficients is not None: # Normalize to list if not isinstance(coefficients, (list, tuple)): coefficients = [coefficients] # Get coefficient names from fit if available coef_names = fit.get("coef_names") # Convert names to indices and build contrast matrix coef_indices = [] selected_names = [] for c in coefficients: if isinstance(c, str): if coef_names is None: raise ValueError(f"Cannot use coefficient name '{c}' - no coef_names in fit") if c not in coef_names: raise ValueError(f"Coefficient '{c}' not found. Available: {coef_names}") idx = coef_names.index(c) coef_indices.append(idx) selected_names.append(c) else: coef_indices.append(int(c)) if coef_names is not None: selected_names.append(coef_names[int(c)]) else: selected_names.append(f"coef{int(c)}") # Build identity-like contrast matrix for selected coefficients contrasts = np.zeros((n_coef, len(coef_indices))) for j, idx in enumerate(coef_indices): contrasts[idx, j] = 1.0 contrast_names = selected_names # Extract contrast names if DataFrame, then convert to array. # Capture row names too so we can replicate R's row/col name check. contrast_rownames = None if isinstance(contrasts, pd.DataFrame): contrast_names = list(contrasts.columns) contrast_rownames = list(contrasts.index) contrasts = contrasts.values # R contrasts.R:31: `if(!is.numeric(contrasts)) stop("contrasts # must be a numeric matrix")`. Reject logical / character inputs # before the float coercion silently turns booleans into 0/1 and # parses string digits. contrasts_raw = np.asarray(contrasts) if contrasts_raw.dtype == np.bool_ or not np.issubdtype(contrasts_raw.dtype, np.number): raise ValueError("contrasts must be a numeric matrix") contrasts = contrasts_raw.astype(np.float64, copy=False) if contrasts.ndim == 1: contrasts = contrasts.reshape(-1, 1) if contrasts.shape[0] != n_coef: raise ValueError( f"Number of rows in contrasts ({contrasts.shape[0]}) must match " f"number of coefficients ({n_coef})" ) if np.any(np.isnan(contrasts)): raise ValueError("NAs not allowed in contrasts") # R contrasts.R:35-40: rn = rownames(contrasts), cn = colnames( # fit$coefficients); rename "(Intercept)" -> "Intercept" in both # then warn if they don't match. fit_coef_names = fit.get("coef_names") if contrast_rownames is not None and fit_coef_names is not None: rn = list(contrast_rownames) cn = list(fit_coef_names) if rn and rn[0] == "(Intercept)": rn[0] = "Intercept" if cn and cn[0] == "(Intercept)": cn[0] = "Intercept" if rn != cn: warnings.warn("row names of contrasts don't match col names of coefficients") fit["contrasts"] = contrasts n_contrasts = contrasts.shape[1] # Store contrast names (generate default names if not provided) if contrast_names is None: contrast_names = [f"contrast{i}" for i in range(n_contrasts)] fit["contrast_names"] = contrast_names # Handle empty contrasts if n_contrasts == 0: fit["coefficients"] = fit_coef[:, :0] fit["stdev_unscaled"] = stdev_unscaled[:, :0] # Match R's fit[,0] subsetting: every coefficient-indexed slot # collapses to its 0-col form so the returned MArrayLM is # internally shape-consistent. if fit.get("cov_coefficients") is not None: cov = np.asarray(fit["cov_coefficients"]) fit["cov_coefficients"] = cov[:0, :0] if fit.get("var_prior") is not None: fit["var_prior"] = np.asarray(fit["var_prior"])[:0] if fit.get("coef_names") is not None: fit["coef_names"] = [] if is_anndata: # Plain dict for h5ad compatibility; see lm_fit. data.uns[key] = dict(fit) return None if not isinstance(fit, MArrayLM): fit = MArrayLM(fit) return fit # R's contrasts.fit strips rows (coefficients) that are zero in every # contrast column before the orthogonality check (contrasts.R:77-85). # Comment there says "Not necessary but can make the function faster", # but removing those rows also changes which correlations enter the # lower.tri(cormatrix) orthog test and can flip the code path on # pathological contrasts that leave an unused-but-correlated # coefficient. contrasts_all_zero = np.where(np.all(contrasts == 0, axis=1))[0] if contrasts_all_zero.size and contrasts_all_zero.size < n_coef: keep = np.setdiff1d(np.arange(n_coef), contrasts_all_zero) contrasts = contrasts[keep, :] fit_coef = fit_coef[:, keep] stdev_unscaled = stdev_unscaled[:, keep] if fit.get("cov_coefficients") is not None: cov_existing = np.asarray(fit["cov_coefficients"]) fit["cov_coefficients"] = cov_existing[np.ix_(keep, keep)] n_coef = len(keep) fit["contrasts"] = contrasts # Get or construct covariance matrix if "cov_coefficients" not in fit or fit["cov_coefficients"] is None: warnings.warn("cov_coefficients not found in fit - assuming coefficients are orthogonal") var_coef = np.nanmean(stdev_unscaled**2, axis=0) cov_coefficients = np.diag(var_coef) cormatrix = np.eye(n_coef) orthog = True else: cov_coefficients = fit["cov_coefficients"] # Handle NaN in cov matrix (non-estimable coefficients) valid_idx = ~np.isnan(np.diag(cov_coefficients)) if not np.all(valid_idx): # Reduce to estimable coefficients est_idx = np.where(valid_idx)[0] cov_coefficients = cov_coefficients[np.ix_(est_idx, est_idx)] # Check contrasts only use estimable coefficients non_est_idx = np.where(~valid_idx)[0] if np.any(contrasts[non_est_idx, :] != 0): raise ValueError("Trying to take contrast of non-estimable coefficient") contrasts = contrasts[est_idx, :] fit_coef = fit_coef[:, est_idx] stdev_unscaled = stdev_unscaled[:, est_idx] n_coef = len(est_idx) # Compute correlation matrix std = np.sqrt(np.diag(cov_coefficients)) std[std == 0] = 1 # Avoid division by zero cormatrix = cov_coefficients / np.outer(std, std) # Check if orthogonal if cormatrix.size < 2: orthog = True else: orthog = np.sum(np.abs(cormatrix[np.tril_indices(n_coef, -1)])) < 1e-12 # Handle NA coefficients na_coef = np.any(np.isnan(fit_coef)) if na_coef: na_mask = np.isnan(fit_coef) fit_coef = fit_coef.copy() stdev_unscaled = stdev_unscaled.copy() fit_coef[na_mask] = 0 stdev_unscaled[na_mask] = 1e30 # Transform coefficients new_coefficients = fit_coef @ contrasts # Transform covariance matrix R = np.linalg.cholesky(cov_coefficients).T # Upper triangular new_cov = (R @ contrasts).T @ (R @ contrasts) fit["cov_coefficients"] = new_cov # Transform standard errors if orthog: # Simple case: variances add new_stdev = np.sqrt(stdev_unscaled**2 @ contrasts**2) else: # Non-orthogonal: need to account for correlations R_cor = np.linalg.cholesky(cormatrix).T new_stdev = np.zeros((n_genes, n_contrasts)) for i in range(n_genes): # Scale contrasts by stdev scaled_contrasts = stdev_unscaled[i, :, np.newaxis] * contrasts RUC = R_cor @ scaled_contrasts new_stdev[i, :] = np.sqrt(np.sum(RUC**2, axis=0)) # Restore NAs if na_coef: large_stdev = new_stdev > 1e20 new_coefficients[large_stdev] = np.nan new_stdev[large_stdev] = np.nan fit["coefficients"] = new_coefficients fit["stdev_unscaled"] = new_stdev if not isinstance(fit, MArrayLM): fit = MArrayLM(fit) if is_anndata: # Plain dict for h5ad compatibility; see lm_fit. data.uns[key] = dict(fit) return None return fit
def contrast_as_coef( design, contrast=None, first: bool = True, ) -> dict: """ Reform a design matrix so that contrasts become simple coefficients. Port of R limma's ``contrastAsCoef``. Re-parameterises ``design`` so that fitting the new design directly estimates the requested contrasts as coefficients. Parameters ---------- design : ndarray or DataFrame Design matrix of shape (n_samples, n_coef). contrast : ndarray or DataFrame, optional Contrast matrix of shape (n_coef, n_contrasts). If ``None``, ``design`` is returned unchanged (matching R). first : bool, default True If True, contrast columns are placed first in the new design. If False, they are moved to the end. Returns ------- dict ``design`` : DataFrame Re-parameterised design with shape (n_samples, n_coef) and named columns. ``coef`` : list of int 0-based indices of the contrast columns within the new design. ``qr`` : dict QR decomposition of ``contrast`` with keys ``q``, ``r``, ``pivot``, ``rank``. """ from .lmfit import _qr_r_style if isinstance(design, pd.DataFrame): design_arr = np.asarray(design.values, dtype=np.float64) else: design_arr = np.asarray(design, dtype=np.float64) if design_arr.ndim == 1: design_arr = design_arr.reshape(-1, 1) if contrast is None: return design_arr contrast_names = None if isinstance(contrast, pd.DataFrame): contrast_names = list(contrast.columns) contrast_arr = np.asarray(contrast.values, dtype=np.float64) else: contrast_arr = np.asarray(contrast, dtype=np.float64) if contrast_arr.ndim == 1: contrast_arr = contrast_arr.reshape(-1, 1) if design_arr.shape[1] != contrast_arr.shape[0]: raise ValueError("Length of contrast doesn't match ncol(design)") q, r, pivot, rank = _qr_r_style(contrast_arr) if rank == 0: raise ValueError("contrast is all zero") designT = q.T @ design_arr.T R_block = r[:rank, :rank] designT[:rank, :] = linalg.solve_triangular(R_block, designT[:rank, :]) new_design = designT.T n_cols = new_design.shape[1] col_names = [f"Q{i + 1}" for i in range(n_cols)] if contrast_names is None: col_names[:rank] = [f"C{int(pivot[i]) + 1}" for i in range(rank)] else: col_names[:rank] = list(contrast_names[:rank]) coef = list(range(rank)) if not first: non_coef = [i for i in range(n_cols) if i not in coef] new_design = np.column_stack([new_design[:, non_coef], new_design[:, coef]]) col_names = [col_names[i] for i in non_coef] + [col_names[i] for i in coef] coef = list(range(n_cols - rank, n_cols)) new_design_df = pd.DataFrame(new_design, columns=col_names) return { "design": new_design_df, "coef": coef, "qr": {"q": q, "r": r, "pivot": pivot, "rank": rank}, }