# SPDX-License-Identifier: GPL-3.0-or-later
#
# This module is a Python port of code from R limma. Original R copyrights:
# toptable.R Copyright (C) 2003-2023 Gordon Smyth
# topTableF.R Copyright (C) 2006-2022 Gordon Smyth
# Python port: Copyright (C) 2026 John Mulvey
"""
Top table output for pylimma.
Implements:
- top_table(): extract and format results as a DataFrame
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
import pandas as pd
from scipy import stats
from .classes import _resolve_fit_input
from .utils import p_adjust
if TYPE_CHECKING:
pass
def _handle_duplicated_rownames(
genes: list, genelist_df: pd.DataFrame | None
) -> tuple[list, pd.DataFrame | None]:
"""
Handle duplicated gene names following R limma's logic.
If gene names have duplicates:
- Move original names to genelist_df as 'ID' column (or 'ID0' if 'ID' exists)
- Replace gene names with 1-indexed integers
Parameters
----------
genes : list
Gene names/identifiers
genelist_df : DataFrame or None
Gene annotation DataFrame
Returns
-------
tuple
(genes, genelist_df) - updated gene names and genelist DataFrame
"""
# Check for duplicates
if len(genes) != len(set(genes)):
# Duplicates found - move original names to a column
if genelist_df is None:
genelist_df = pd.DataFrame({"ID": genes})
elif "ID" in genelist_df.columns:
genelist_df = genelist_df.copy()
genelist_df["ID0"] = genes
else:
genelist_df = genelist_df.copy()
genelist_df["ID"] = genes
# Replace with 1-indexed integers
genes = [i + 1 for i in range(len(genes))]
return genes, genelist_df
[docs]
def top_table(
data,
coef: int | str | list | None = None,
number: int = 10,
genelist: pd.DataFrame | list | np.ndarray | None = None,
adjust_method: str = "BH",
sort_by: str = "B",
resort_by: str | None = None,
p_value: float = 1.0,
fc: float | None = None,
lfc: float = 0.0,
confint: bool | float = False,
key: str = "pylimma",
) -> pd.DataFrame:
r"""
Extract a table of top-ranked genes from a linear model fit.
Parameters
----------
data : AnnData or dict
Either an AnnData object with fit results in adata.uns[key],
or a dict from e_bayes().
coef : int, str, list, or None
Coefficient(s) to extract. If None or multiple coefficients,
returns F-statistic results. If single coefficient (int or str),
returns t-statistic results for that coefficient.
number : int, default 10
Maximum number of genes to return. Use np.inf for all genes.
genelist : DataFrame, list, or array, optional
Gene annotations to include in output. If a DataFrame, columns
are merged with results. If a list/array, used as gene identifiers.
If None (default), uses fit["genes"] if available.
For AnnData input, fit["genes"] only carries var_names; annotation
columns in ``adata.var`` (e.g. ``symbol``, ``chromosome``) are not
duplicated into the fit. Pass ``genelist=adata.var`` explicitly
to merge those columns into the output table.
adjust_method : str, default "BH"
Multiple testing adjustment method. Options: "BH", "bonferroni",
"holm", "none".
sort_by : str, default "B"
Column to sort by. Options:
- "B" or "b": B-statistic (log-odds of DE)
- "p" or "P": p-value
- "t" or "T": t-statistic (absolute value)
- "logFC" or "log_fc": log fold-change (absolute value)
- "AveExpr" or "ave_expr": average expression
- "F": F-statistic (for multiple coefficients)
- "none": no sorting
resort_by : str, optional
Secondary sort column applied after filtering by p_value/lfc.
Same options as sort_by. Useful when you want to filter by
p-value but display sorted by fold-change.
p_value : float, default 1.0
Adjusted p-value cutoff. Only genes with adj_p_value <= p_value
are returned.
fc : float, optional
Minimum fold-change required (linear scale, must be >= 1).
Only genes with fold-change >= fc are returned. This is an
alternative to `lfc`; if both are specified, `fc` takes precedence.
lfc : float, default 0.0
Log2 fold-change cutoff. Only genes with \|log_fc\| >= lfc are
returned. Equivalent to log2(fc).
confint : bool or float, default False
If True, compute 95% confidence intervals for log fold-changes.
If a float, specifies the confidence level (e.g., 0.99 for 99%).
Only applies for single coefficient tests.
key : str, default "pylimma"
Key for fit results in adata.uns (AnnData input only).
Returns
-------
DataFrame
Table of top genes with columns:
- gene: gene identifier (if available)
- log_fc: log fold-change
- ci_l, ci_r: confidence interval bounds (if confint=True)
- ave_expr: average expression
- t: moderated t-statistic
- p_value: raw p-value
- adj_p_value: adjusted p-value
- b: B-statistic (log-odds)
For multiple coefficients (F-test):
- One log_fc column per coefficient
- F: F-statistic
- p_value: F-test p-value
- adj_p_value: adjusted p-value
Notes
-----
This function must be called after e_bayes(). The results are sorted
and filtered according to the specified parameters.
Examples
--------
>>> fit = lm_fit(expr, design)
>>> fit = e_bayes(fit)
>>> top_table(fit, coef=1, number=20) # Top 20 genes for coefficient 1
>>> top_table(fit, coef=None) # F-test across all coefficients
>>> top_table(fit, coef=1, confint=True) # With 95% CI
"""
fit, _adata, _adata_key = _resolve_fit_input(data, key)
# Validate fit
if "t" not in fit and "F" not in fit:
raise ValueError("Need to run e_bayes() first")
if "coefficients" not in fit:
raise ValueError("coefficients not found in fit")
coefficients = fit["coefficients"]
n_genes, n_coefs = coefficients.shape
# Determine which coefficients to use
is_treat = "treat_lfc" in fit
# R's topTreat defaults to sort.by="p", unlike topTable which defaults to "B"
# Match this behaviour when treat results are detected
if is_treat and sort_by == "B":
sort_by = "P"
if coef is None:
if is_treat:
# treat results: default to last coefficient only (R parity)
coef_idx = [n_coefs - 1]
elif n_coefs > 1:
# Use all coefficients, but remove intercept if present (like R)
coef_idx = list(range(n_coefs))
coef_names_check = fit.get("contrast_names") or fit.get("coef_names")
if coef_names_check is not None:
try:
intercept_idx = coef_names_check.index("(Intercept)")
coef_idx.remove(intercept_idx)
import warnings
warnings.warn("Removing intercept from test coefficients")
except ValueError:
pass # No intercept found
else:
coef_idx = [n_coefs - 1] # Use last coefficient (like R)
elif isinstance(coef, (list, tuple)):
coef_idx = list(coef)
else:
coef_idx = [coef]
# Convert string coefficient/contrast names to indices
# Use contrast_names if available (after contrasts_fit), otherwise coef_names
coef_names = fit.get("contrast_names") or fit.get("coef_names")
if coef_names is not None:
new_coef_idx = []
for c in coef_idx:
if isinstance(c, str):
if c not in coef_names:
raise ValueError(
f"Coefficient/contrast '{c}' not found. Available: {coef_names}"
)
new_coef_idx.append(coef_names.index(c))
else:
new_coef_idx.append(c)
coef_idx = new_coef_idx
# Get genelist from fit if not provided. Track whether the value
# came from the user (explicit) or from `fit["genes"]` (auto-default
# from rownames in pylimma). R only wraps explicit vectors into an
# ID column (toptable.R:168); rownames are kept as row index.
genelist_explicit = genelist is not None
if genelist is None:
genelist = fit.get("genes")
# Handle fc parameter (convert to lfc, fc takes precedence)
if fc is not None:
if fc < 1:
raise ValueError("fc must be >= 1 (fold-change cannot be less than 1)")
lfc = np.log2(fc)
# Dispatch to appropriate function
if len(coef_idx) > 1:
if is_treat:
raise ValueError(
"Treat p-values can only be displayed for single coefficients. "
"Specify a single coef or use e_bayes() instead of treat()."
)
if confint:
import warnings
warnings.warn("confint is ignored for F-test (multiple coefficients)")
# R toptable.R:46: `if(sort.by=="B") sort.by <- "F"` - silent
# translation when routing to .topTableF.
if sort_by == "B":
sort_by = "F"
return top_table_f(
fit,
coef_idx=coef_idx,
number=number,
genelist=genelist,
adjust_method=adjust_method,
sort_by=sort_by,
resort_by=resort_by,
p_value=p_value,
lfc=lfc,
_genelist_explicit=genelist_explicit,
)
else:
return _top_table_t(
fit,
coef=coef_idx[0],
number=number,
genelist=genelist,
adjust_method=adjust_method,
sort_by=sort_by,
resort_by=resort_by,
p_value=p_value,
lfc=lfc,
confint=confint,
_genelist_explicit=genelist_explicit,
)
def _top_table_t(
fit: dict,
coef: int,
number: int,
genelist,
adjust_method: str,
sort_by: str,
resort_by: str | None,
p_value: float,
lfc: float,
confint: bool | float,
_genelist_explicit: bool = False,
) -> pd.DataFrame:
"""Top table for single coefficient (t-statistics)."""
coefficients = fit["coefficients"]
n_genes = coefficients.shape[0]
# Extract statistics. R toptable.R:273-275 conditions the AveExpr
# and B columns on slot presence (`if(!is.null(A))`,
# `if(include.B)`). Track presence so the DataFrame builder can
# omit the columns when their source is missing - matching R's
# `data.frame$col <- NULL` no-op behaviour.
log_fc = coefficients[:, coef]
t_stat = fit["t"][:, coef]
p_val = fit["p_value"][:, coef]
lods = fit.get("lods")
has_lods = lods is not None
if has_lods:
b_stat = lods[:, coef] if lods.ndim > 1 else lods
else:
# R toptable.R:209-211: missing lods + sort/resort.by="B" raises.
if sort_by == "B":
raise ValueError(
"Trying to sort.by B, but B-statistic (lods) not found in MArrayLM object"
)
if resort_by == "B":
raise ValueError(
"Trying to resort.by B, but B-statistic (lods) not found in MArrayLM object"
)
b_stat = np.full(n_genes, np.nan)
has_amean = "Amean" in fit and fit["Amean"] is not None
if has_amean:
ave_expr = np.asarray(fit["Amean"])
# R: `if(NCOL(A)>1) A <- rowMeans(A, na.rm=TRUE)`.
if ave_expr.ndim > 1 and ave_expr.shape[1] > 1:
ave_expr = np.nanmean(ave_expr, axis=1)
else:
ave_expr = np.full(n_genes, np.nan)
# Handle genelist. R toptable.R:168 wraps an explicit vector
# genelist as `data.frame(ID=genelist)` so it appears as an ID
# column. pylimma additionally auto-defaults `genelist` to
# `fit["genes"]` (set from rownames in lm_fit) - that auto-default
# is treated as the row index, NOT an ID column, to preserve
# rownames flow through the pipeline.
if genelist is None:
genes = [f"gene{i + 1}" for i in range(n_genes)]
genelist_df = None
elif isinstance(genelist, pd.DataFrame):
genes = list(genelist.index) if len(genelist.index) == n_genes else list(range(n_genes))
genelist_df = genelist.copy()
elif isinstance(genelist, (list, np.ndarray)):
if len(genelist) == n_genes:
if _genelist_explicit:
# Explicit vector → ID column, row index from fit rownames
# (or 1..N fallback matching R toptable.R:170-172).
genelist_df = pd.DataFrame({"ID": list(genelist)})
genes = list(range(1, n_genes + 1))
else:
# Auto-default from fit["genes"] → use as row index
genes = list(genelist)
genelist_df = None
else:
genes = [f"gene{i}" for i in range(n_genes)]
genelist_df = None
else:
genes = [f"gene{i + 1}" for i in range(n_genes)]
genelist_df = None
# Handle duplicated gene names (DataFrame index must be unique)
genes, genelist_df = _handle_duplicated_rownames(genes, genelist_df)
# Confidence intervals
margin_error = None
if confint:
stdev_unscaled = fit.get("stdev_unscaled")
s2_post = fit.get("s2_post")
df_total = fit.get("df_total")
if stdev_unscaled is None or s2_post is None or df_total is None:
raise ValueError(
"Need stdev_unscaled, s2_post, df_total in fit for confidence intervals"
)
if isinstance(confint, float):
alpha = (1 + confint) / 2
else:
alpha = 0.975 # 95% CI
margin_error = np.sqrt(s2_post) * stdev_unscaled[:, coef] * stats.t.ppf(alpha, df_total)
# Multiple testing adjustment
adj_p_val = p_adjust(p_val, method=adjust_method)
# Filter by p-value and lfc thresholds.
# R toptable.R:233-246 gates filtering on `p.value < 1 | lfc > 0`,
# masking NaN sig rows to FALSE only inside that block. With default
# arguments R does no filtering and preserves NaN-p-value rows.
keep = np.ones(n_genes, dtype=bool)
if p_value < 1 or lfc > 0:
if p_value < 1:
keep &= adj_p_val <= p_value
if lfc > 0:
# `abs(M) >= lfc` (toptable.R:234) - inclusive boundary.
keep &= np.abs(log_fc) >= lfc
keep &= ~np.isnan(p_val) # R: sig[is.na(sig)] <- FALSE
if not np.any(keep):
return pd.DataFrame()
# Apply filter
idx = np.where(keep)[0]
log_fc = log_fc[idx]
t_stat = t_stat[idx]
p_val = p_val[idx]
adj_p_val = adj_p_val[idx]
b_stat = b_stat[idx]
ave_expr = ave_expr[idx]
genes = [genes[i] for i in idx]
if genelist_df is not None:
genelist_df = genelist_df.iloc[idx].reset_index(drop=True)
if margin_error is not None:
margin_error = margin_error[idx]
# Helper function for sorting
# use_abs: primary sort uses abs() for t and logFC; resort uses signed values (R behaviour)
def _get_sort_order(sort_col, data_dict, default_order, use_abs=True):
# R toptable.R:188: `sort.by="A" || sort.by=="Amean"` -> "AveExpr".
sort_by_map = {
"B": "b",
"b": "b",
"P": "p",
"p": "p",
"T": "t",
"t": "t",
"logFC": "logFC",
"log_fc": "logFC",
"M": "logFC",
"AveExpr": "AveExpr",
"ave_expr": "AveExpr",
"A": "AveExpr",
"Amean": "AveExpr",
"none": "none",
}
col = sort_by_map.get(sort_col, sort_col)
# Use stable descending sort (np.argsort on negated values with
# kind='stable') to match R's order(-x) tie-break behaviour.
if col == "b":
return np.argsort(-data_dict["b"], kind="stable")
elif col == "p":
return np.argsort(data_dict["p"], kind="stable")
elif col == "t":
vals = np.abs(data_dict["t"]) if use_abs else data_dict["t"]
return np.argsort(-vals, kind="stable")
elif col == "logFC":
vals = np.abs(data_dict["logFC"]) if use_abs else data_dict["logFC"]
return np.argsort(-vals, kind="stable")
elif col == "AveExpr":
return np.argsort(-data_dict["AveExpr"], kind="stable")
else:
return default_order
data_dict = {"b": b_stat, "p": p_val, "t": t_stat, "logFC": log_fc, "AveExpr": ave_expr}
# Sort by primary column (uses absolute values for t and logFC)
order = _get_sort_order(sort_by, data_dict, np.arange(len(log_fc)), use_abs=True)
# Limit number
if number < len(order):
order = order[:number]
# Apply primary sort
log_fc = log_fc[order]
t_stat = t_stat[order]
p_val = p_val[order]
adj_p_val = adj_p_val[order]
b_stat = b_stat[order]
ave_expr = ave_expr[order]
genes = [genes[i] for i in order]
if genelist_df is not None:
genelist_df = genelist_df.iloc[order].reset_index(drop=True)
if margin_error is not None:
margin_error = margin_error[order]
# Resort if requested (uses signed values for t and logFC, matching R)
if resort_by is not None:
data_dict = {"b": b_stat, "p": p_val, "t": t_stat, "logFC": log_fc, "AveExpr": ave_expr}
resort_order = _get_sort_order(resort_by, data_dict, np.arange(len(log_fc)), use_abs=False)
log_fc = log_fc[resort_order]
t_stat = t_stat[resort_order]
p_val = p_val[resort_order]
adj_p_val = adj_p_val[resort_order]
b_stat = b_stat[resort_order]
ave_expr = ave_expr[resort_order]
genes = [genes[i] for i in resort_order]
if genelist_df is not None:
genelist_df = genelist_df.iloc[resort_order].reset_index(drop=True)
if margin_error is not None:
margin_error = margin_error[resort_order]
# Build DataFrame. R toptable.R:268-280 puts AveExpr / B in the
# output only when their source slots are non-NULL. Mirror that.
cols: dict[str, np.ndarray] = {"log_fc": log_fc}
if confint and margin_error is not None:
cols["ci_l"] = log_fc - margin_error
cols["ci_r"] = log_fc + margin_error
if has_amean:
cols["ave_expr"] = ave_expr
cols["t"] = t_stat
cols["p_value"] = p_val
cols["adj_p_value"] = adj_p_val
if has_lods:
cols["b"] = b_stat
df = pd.DataFrame(cols)
# Add genelist columns if DataFrame
if genelist_df is not None:
for col in genelist_df.columns:
if col not in df.columns:
df[col] = genelist_df[col].values
df.index = genes
df.index.name = "gene"
return df
[docs]
def top_table_f(
fit: dict,
number: int = 10,
genelist=None,
adjust_method: str = "BH",
sort_by: str = "F",
p_value: float = 1.0,
fc: float | None = None,
lfc: float | None = None,
*,
coef_idx: list[int] | None = None,
resort_by: str | None = None,
_genelist_explicit: bool | None = None,
) -> pd.DataFrame:
"""
Top table for multiple coefficients ranked by F-statistic.
Port of R's ``limma::topTableF``. Positional arguments follow R's
order exactly; ``coef_idx`` and ``resort_by`` are pylimma-only
extensions and keyword-only.
Parameters
----------
fit : dict
Fit object from :func:`e_bayes` containing F-statistics.
number : int, default 10
Maximum number of genes to return.
genelist : DataFrame, list, or array, optional
Gene annotations to include in output. When None, uses
``fit.get("genes")`` (matches R's ``genelist=fit$genes``
default).
For AnnData input, fit["genes"] only carries var_names; pass
``genelist=adata.var`` explicitly to merge annotation columns
(e.g. ``symbol``, ``chromosome``) into the output table.
adjust_method : str, default "BH"
Multiple-testing adjustment method.
sort_by : str, default "F"
Column to sort by. ``"F"`` or ``"none"`` (R).
p_value : float, default 1.0
Adjusted p-value cutoff.
fc : float, optional
Fold-change cutoff. If given, sets ``lfc = log2(fc)``.
Must be >= 1.
lfc : float, optional
Log fold-change cutoff. Defaults to 0 when both ``fc`` and
``lfc`` are None (R's NULL).
coef_idx : list of int, optional (keyword-only)
Indices of coefficients to include. Defaults to all columns
of ``fit["coefficients"]``, matching R's ``topTableF`` which
always uses the whole coefficient matrix.
resort_by : str, optional (keyword-only)
pylimma extension: secondary sort column applied after
``sort_by`` + truncation.
Returns
-------
DataFrame
Table of top genes ranked by F-statistic.
"""
# R topTableF.R:7 emits a deprecation message on every call. Mirror
# it with DeprecationWarning so downstream tooling (pytest's
# warning capture, IDE linters) sees an equivalent signal.
import warnings as _warnings
_warnings.warn(
"top_table_f is obsolete and will be removed in a future "
"version of pylimma. Please consider using top_table instead.",
DeprecationWarning,
stacklevel=2,
)
# Resolve lfc from fc / lfc exactly as R does (toptable.R:33-39):
# when fc is supplied, lfc is silently overridden with log2(fc) -
# any user-supplied lfc value is discarded without warning.
if fc is not None:
if fc < 1:
raise ValueError("fc must be greater than or equal to 1")
lfc = float(np.log2(fc))
elif lfc is None:
lfc = 0.0
# R topTableF.R:38: `sort.by <- match.arg(sort.by, c("F","none"))`.
# match.arg errors on anything else; mirror that here.
if sort_by not in ("F", "none"):
raise ValueError(f"sort_by={sort_by!r} not recognised. Must be one of 'F', 'none'.")
# R topTableF.R:12: `rn <- rownames(M)`. When fit["coefficients"]
# is a DataFrame, surface its index as `coef_rownames` so duplicate
# detection (R-B11a topTableF.R:27-28) can promote them to an ID
# column. Plain ndarray inputs leave coef_rownames at None.
coef_obj = fit["coefficients"]
coef_rownames = None
if isinstance(coef_obj, pd.DataFrame):
coef_rownames = list(coef_obj.index)
coef_matrix = np.asarray(coef_obj)
if coef_idx is None:
# R's topTableF uses the whole coefficient matrix.
coef_idx = list(range(coef_matrix.shape[1]))
coefficients = coef_matrix[:, coef_idx]
n_genes = coefficients.shape[0]
# Default genelist from fit, matching R's genelist=fit$genes.
# _genelist_explicit is 3-state: True/False from the wrapper
# (which has already handled auto-default), None when the user
# invoked top_table_f directly. In the direct-call case we need
# to auto-detect: a non-None genelist at entry IS explicit.
if _genelist_explicit is None:
if genelist is not None:
_genelist_explicit = True
else:
_genelist_explicit = False
genelist = fit.get("genes")
elif genelist is None:
# Wrapper passed an explicit flag but no genelist - this can
# happen if the wrapper's auto-default also returned None.
genelist = fit.get("genes")
# F-statistics
f_stat = fit.get("F")
f_p_val = fit.get("F_p_value")
if f_stat is None or f_p_val is None:
raise ValueError("F-statistics not found. Run e_bayes() with multiple coefficients.")
# R topTableF.R:89: `tab$AveExpr <- Amean[o]`. When Amean is NULL,
# R's `data.frame$col <- NULL` is a no-op, so the column is absent.
# Track absence explicitly so the DataFrame builder can omit it.
has_amean = "Amean" in fit and fit["Amean"] is not None
if has_amean:
ave_expr = np.asarray(fit["Amean"])
# R: `if(NCOL(A)>1) A <- rowMeans(A, na.rm=TRUE)`.
if ave_expr.ndim > 1 and ave_expr.shape[1] > 1:
ave_expr = np.nanmean(ave_expr, axis=1)
else:
ave_expr = np.full(n_genes, np.nan)
# Handle genelist. R topTableF.R:85 wraps an explicit vector as
# `data.frame(ProbeID=genelist)` so it appears as a ProbeID column.
# pylimma's auto-default from `fit["genes"]` is used as the row
# index instead - see `_top_table_t` for the same pattern.
if genelist is None:
genes = (
list(coef_rownames)
if coef_rownames is not None
else [f"gene{i + 1}" for i in range(n_genes)]
)
genelist_df = None
elif isinstance(genelist, pd.DataFrame):
genes = list(genelist.index) if len(genelist.index) == n_genes else list(range(n_genes))
genelist_df = genelist.copy()
elif isinstance(genelist, (list, np.ndarray)):
if len(genelist) == n_genes:
if _genelist_explicit:
genelist_df = pd.DataFrame({"ProbeID": list(genelist)})
genes = list(range(1, n_genes + 1))
else:
genes = list(genelist)
genelist_df = None
else:
genes = [f"gene{i}" for i in range(n_genes)]
genelist_df = None
else:
genes = [f"gene{i + 1}" for i in range(n_genes)]
genelist_df = None
# R topTableF.R:87-100: when rownames(M) (i.e. fit$coefficients
# rownames) has duplicates, promote them to an ID/ID0 column and
# reset rn to 1..N. coef_rownames is None for ndarray fits;
# _handle_duplicated_rownames() catches the simpler case of
# duplicates inside `genes` itself.
if coef_rownames is not None and len(coef_rownames) != len(set(coef_rownames)):
if genelist_df is None:
genelist_df = pd.DataFrame({"ID": list(coef_rownames)})
elif "ID" in genelist_df.columns:
genelist_df = genelist_df.copy()
genelist_df["ID0"] = list(coef_rownames)
else:
genelist_df = genelist_df.copy()
genelist_df["ID"] = list(coef_rownames)
genes = list(range(1, n_genes + 1))
else:
# Handle duplicated gene names (DataFrame index must be unique)
genes, genelist_df = _handle_duplicated_rownames(genes, genelist_df)
# Multiple testing adjustment
adj_p_val = p_adjust(f_p_val, method=adjust_method)
# Filter (R topTableF.R: NaN sig dropped only when filtering is active).
keep = np.ones(n_genes, dtype=bool)
if p_value < 1 or lfc > 0:
if p_value < 1:
keep &= adj_p_val <= p_value
if lfc > 0:
keep &= np.any(np.abs(coefficients) > lfc, axis=1)
keep &= ~np.isnan(f_p_val)
if not np.any(keep):
return pd.DataFrame()
idx = np.where(keep)[0]
coefficients = coefficients[idx]
f_stat = f_stat[idx]
f_p_val = f_p_val[idx]
adj_p_val = adj_p_val[idx]
ave_expr = ave_expr[idx]
genes = [genes[i] for i in idx]
if genelist_df is not None:
genelist_df = genelist_df.iloc[idx].reset_index(drop=True)
# Helper function for sorting
def _get_sort_order(sort_col, f_stat, f_p_val, ave_expr, default_order):
sort_by_map = {
"F": "F",
"f": "F",
"P": "p",
"p": "p",
"AveExpr": "AveExpr",
"ave_expr": "AveExpr",
"A": "AveExpr",
"none": "none",
}
col = sort_by_map.get(sort_col, "F" if sort_col == "B" else sort_col)
if col == "F":
return np.argsort(f_p_val, kind="stable") # ascending p for F
elif col == "p":
return np.argsort(f_p_val, kind="stable")
elif col == "AveExpr":
return np.argsort(-ave_expr, kind="stable")
else:
return default_order
# Sort by primary column
order = _get_sort_order(sort_by, f_stat, f_p_val, ave_expr, np.arange(len(f_stat)))
if number < len(order):
order = order[:number]
# Apply primary sort
coefficients = coefficients[order]
f_stat = f_stat[order]
f_p_val = f_p_val[order]
adj_p_val = adj_p_val[order]
ave_expr = ave_expr[order]
genes = [genes[i] for i in order]
if genelist_df is not None:
genelist_df = genelist_df.iloc[order].reset_index(drop=True)
# Resort if requested
if resort_by is not None:
resort_order = _get_sort_order(resort_by, f_stat, f_p_val, ave_expr, np.arange(len(f_stat)))
coefficients = coefficients[resort_order]
f_stat = f_stat[resort_order]
f_p_val = f_p_val[resort_order]
adj_p_val = adj_p_val[resort_order]
ave_expr = ave_expr[resort_order]
genes = [genes[i] for i in resort_order]
if genelist_df is not None:
genelist_df = genelist_df.iloc[resort_order].reset_index(drop=True)
# Build DataFrame with coefficient columns
# Use contrast_names if available, otherwise generate default names
stored_names = fit.get("contrast_names") or fit.get("coef_names")
# R topTableF.R:13: default colnames are `Coef1, Coef2, ...`
# (1-based, no separator) when fit$coefficients has none.
if stored_names is not None:
coef_names = [stored_names[i] for i in coef_idx]
else:
coef_names = [f"Coef{i + 1}" for i in coef_idx]
# R topTableF.R:88 builds the table with genelist FIRST, then
# coefficient columns: `tab <- data.frame(genelist[o,,drop=FALSE],
# M[o,,drop=FALSE])`. Mirror that column order.
cols: dict[str, np.ndarray] = {}
if genelist_df is not None:
for col in genelist_df.columns:
cols[col] = genelist_df[col].values
for j, name in enumerate(coef_names):
cols[name] = coefficients[:, j]
if has_amean:
cols["ave_expr"] = ave_expr
cols["F"] = f_stat
cols["p_value"] = f_p_val
cols["adj_p_value"] = adj_p_val
df = pd.DataFrame(cols)
df.index = genes
df.index.name = "gene"
return df