Source code for pylimma.toptable

# SPDX-License-Identifier: GPL-3.0-or-later
#
# This module is a Python port of code from R limma. Original R copyrights:
#   toptable.R                 Copyright (C) 2003-2023 Gordon Smyth
#   topTableF.R                Copyright (C) 2006-2022 Gordon Smyth
# Python port: Copyright (C) 2026 John Mulvey
"""
Top table output for pylimma.

Implements:
- top_table(): extract and format results as a DataFrame
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from scipy import stats

from .classes import _resolve_fit_input
from .utils import p_adjust

if TYPE_CHECKING:
    pass


def _handle_duplicated_rownames(
    genes: list, genelist_df: pd.DataFrame | None
) -> tuple[list, pd.DataFrame | None]:
    """
    Handle duplicated gene names following R limma's logic.

    If gene names have duplicates:
    - Move original names to genelist_df as 'ID' column (or 'ID0' if 'ID' exists)
    - Replace gene names with 1-indexed integers

    Parameters
    ----------
    genes : list
        Gene names/identifiers
    genelist_df : DataFrame or None
        Gene annotation DataFrame

    Returns
    -------
    tuple
        (genes, genelist_df) - updated gene names and genelist DataFrame
    """
    # Check for duplicates
    if len(genes) != len(set(genes)):
        # Duplicates found - move original names to a column
        if genelist_df is None:
            genelist_df = pd.DataFrame({"ID": genes})
        elif "ID" in genelist_df.columns:
            genelist_df = genelist_df.copy()
            genelist_df["ID0"] = genes
        else:
            genelist_df = genelist_df.copy()
            genelist_df["ID"] = genes
        # Replace with 1-indexed integers
        genes = [i + 1 for i in range(len(genes))]

    return genes, genelist_df



[docs]
def top_table(
    data,
    coef: int | str | list | None = None,
    number: int = 10,
    genelist: pd.DataFrame | list | np.ndarray | None = None,
    adjust_method: str = "BH",
    sort_by: str = "B",
    resort_by: str | None = None,
    p_value: float = 1.0,
    fc: float | None = None,
    lfc: float = 0.0,
    confint: bool | float = False,
    key: str = "pylimma",
) -> pd.DataFrame:
    r"""
    Extract a table of top-ranked genes from a linear model fit.

    Parameters
    ----------
    data : AnnData or dict
        Either an AnnData object with fit results in adata.uns[key],
        or a dict from e_bayes().
    coef : int, str, list, or None
        Coefficient(s) to extract. If None or multiple coefficients,
        returns F-statistic results. If single coefficient (int or str),
        returns t-statistic results for that coefficient.
    number : int, default 10
        Maximum number of genes to return. Use np.inf for all genes.
    genelist : DataFrame, list, or array, optional
        Gene annotations to include in output. If a DataFrame, columns
        are merged with results. If a list/array, used as gene identifiers.
        If None (default), uses fit["genes"] if available.

        For AnnData input, fit["genes"] only carries var_names; annotation
        columns in ``adata.var`` (e.g. ``symbol``, ``chromosome``) are not
        duplicated into the fit. Pass ``genelist=adata.var`` explicitly
        to merge those columns into the output table.
    adjust_method : str, default "BH"
        Multiple testing adjustment method. Options: "BH", "bonferroni",
        "holm", "none".
    sort_by : str, default "B"
        Column to sort by. Options:
        - "B" or "b": B-statistic (log-odds of DE)
        - "p" or "P": p-value
        - "t" or "T": t-statistic (absolute value)
        - "logFC" or "log_fc": log fold-change (absolute value)
        - "AveExpr" or "ave_expr": average expression
        - "F": F-statistic (for multiple coefficients)
        - "none": no sorting
    resort_by : str, optional
        Secondary sort column applied after filtering by p_value/lfc.
        Same options as sort_by. Useful when you want to filter by
        p-value but display sorted by fold-change.
    p_value : float, default 1.0
        Adjusted p-value cutoff. Only genes with adj_p_value <= p_value
        are returned.
    fc : float, optional
        Minimum fold-change required (linear scale, must be >= 1).
        Only genes with fold-change >= fc are returned. This is an
        alternative to `lfc`; if both are specified, `fc` takes precedence.
    lfc : float, default 0.0
        Log2 fold-change cutoff. Only genes with \|log_fc\| >= lfc are
        returned. Equivalent to log2(fc).
    confint : bool or float, default False
        If True, compute 95% confidence intervals for log fold-changes.
        If a float, specifies the confidence level (e.g., 0.99 for 99%).
        Only applies for single coefficient tests.
    key : str, default "pylimma"
        Key for fit results in adata.uns (AnnData input only).

    Returns
    -------
    DataFrame
        Table of top genes with columns:
        - gene: gene identifier (if available)
        - log_fc: log fold-change
        - ci_l, ci_r: confidence interval bounds (if confint=True)
        - ave_expr: average expression
        - t: moderated t-statistic
        - p_value: raw p-value
        - adj_p_value: adjusted p-value
        - b: B-statistic (log-odds)

        For multiple coefficients (F-test):
        - One log_fc column per coefficient
        - F: F-statistic
        - p_value: F-test p-value
        - adj_p_value: adjusted p-value

    Notes
    -----
    This function must be called after e_bayes(). The results are sorted
    and filtered according to the specified parameters.

    Examples
    --------
    >>> fit = lm_fit(expr, design)
    >>> fit = e_bayes(fit)
    >>> top_table(fit, coef=1, number=20)  # Top 20 genes for coefficient 1
    >>> top_table(fit, coef=None)  # F-test across all coefficients
    >>> top_table(fit, coef=1, confint=True)  # With 95% CI
    """
    fit, _adata, _adata_key = _resolve_fit_input(data, key)

    # Validate fit
    if "t" not in fit and "F" not in fit:
        raise ValueError("Need to run e_bayes() first")
    if "coefficients" not in fit:
        raise ValueError("coefficients not found in fit")

    coefficients = fit["coefficients"]
    n_genes, n_coefs = coefficients.shape

    # Determine which coefficients to use
    is_treat = "treat_lfc" in fit

    # R's topTreat defaults to sort.by="p", unlike topTable which defaults to "B"
    # Match this behaviour when treat results are detected
    if is_treat and sort_by == "B":
        sort_by = "P"
    if coef is None:
        if is_treat:
            # treat results: default to last coefficient only (R parity)
            coef_idx = [n_coefs - 1]
        elif n_coefs > 1:
            # Use all coefficients, but remove intercept if present (like R)
            coef_idx = list(range(n_coefs))
            coef_names_check = fit.get("contrast_names") or fit.get("coef_names")
            if coef_names_check is not None:
                try:
                    intercept_idx = coef_names_check.index("(Intercept)")
                    coef_idx.remove(intercept_idx)
                    import warnings

                    warnings.warn("Removing intercept from test coefficients")
                except ValueError:
                    pass  # No intercept found
        else:
            coef_idx = [n_coefs - 1]  # Use last coefficient (like R)
    elif isinstance(coef, (list, tuple)):
        coef_idx = list(coef)
    else:
        coef_idx = [coef]

    # Convert string coefficient/contrast names to indices
    # Use contrast_names if available (after contrasts_fit), otherwise coef_names
    coef_names = fit.get("contrast_names") or fit.get("coef_names")
    if coef_names is not None:
        new_coef_idx = []
        for c in coef_idx:
            if isinstance(c, str):
                if c not in coef_names:
                    raise ValueError(
                        f"Coefficient/contrast '{c}' not found. Available: {coef_names}"
                    )
                new_coef_idx.append(coef_names.index(c))
            else:
                new_coef_idx.append(c)
        coef_idx = new_coef_idx

    # Get genelist from fit if not provided. Track whether the value
    # came from the user (explicit) or from `fit["genes"]` (auto-default
    # from rownames in pylimma). R only wraps explicit vectors into an
    # ID column (toptable.R:168); rownames are kept as row index.
    genelist_explicit = genelist is not None
    if genelist is None:
        genelist = fit.get("genes")

    # Handle fc parameter (convert to lfc, fc takes precedence)
    if fc is not None:
        if fc < 1:
            raise ValueError("fc must be >= 1 (fold-change cannot be less than 1)")
        lfc = np.log2(fc)

    # Dispatch to appropriate function
    if len(coef_idx) > 1:
        if is_treat:
            raise ValueError(
                "Treat p-values can only be displayed for single coefficients. "
                "Specify a single coef or use e_bayes() instead of treat()."
            )
        if confint:
            import warnings

            warnings.warn("confint is ignored for F-test (multiple coefficients)")
        # R toptable.R:46: `if(sort.by=="B") sort.by <- "F"` - silent
        # translation when routing to .topTableF.
        if sort_by == "B":
            sort_by = "F"
        return top_table_f(
            fit,
            coef_idx=coef_idx,
            number=number,
            genelist=genelist,
            adjust_method=adjust_method,
            sort_by=sort_by,
            resort_by=resort_by,
            p_value=p_value,
            lfc=lfc,
            _genelist_explicit=genelist_explicit,
        )
    else:
        return _top_table_t(
            fit,
            coef=coef_idx[0],
            number=number,
            genelist=genelist,
            adjust_method=adjust_method,
            sort_by=sort_by,
            resort_by=resort_by,
            p_value=p_value,
            lfc=lfc,
            confint=confint,
            _genelist_explicit=genelist_explicit,
        )



def _top_table_t(
    fit: dict,
    coef: int,
    number: int,
    genelist,
    adjust_method: str,
    sort_by: str,
    resort_by: str | None,
    p_value: float,
    lfc: float,
    confint: bool | float,
    _genelist_explicit: bool = False,
) -> pd.DataFrame:
    """Top table for single coefficient (t-statistics)."""
    coefficients = fit["coefficients"]
    n_genes = coefficients.shape[0]

    # Extract statistics. R toptable.R:273-275 conditions the AveExpr
    # and B columns on slot presence (`if(!is.null(A))`,
    # `if(include.B)`). Track presence so the DataFrame builder can
    # omit the columns when their source is missing - matching R's
    # `data.frame$col <- NULL` no-op behaviour.
    log_fc = coefficients[:, coef]
    t_stat = fit["t"][:, coef]
    p_val = fit["p_value"][:, coef]
    lods = fit.get("lods")
    has_lods = lods is not None
    if has_lods:
        b_stat = lods[:, coef] if lods.ndim > 1 else lods
    else:
        # R toptable.R:209-211: missing lods + sort/resort.by="B" raises.
        if sort_by == "B":
            raise ValueError(
                "Trying to sort.by B, but B-statistic (lods) not found in MArrayLM object"
            )
        if resort_by == "B":
            raise ValueError(
                "Trying to resort.by B, but B-statistic (lods) not found in MArrayLM object"
            )
        b_stat = np.full(n_genes, np.nan)
    has_amean = "Amean" in fit and fit["Amean"] is not None
    if has_amean:
        ave_expr = np.asarray(fit["Amean"])
        # R: `if(NCOL(A)>1) A <- rowMeans(A, na.rm=TRUE)`.
        if ave_expr.ndim > 1 and ave_expr.shape[1] > 1:
            ave_expr = np.nanmean(ave_expr, axis=1)
    else:
        ave_expr = np.full(n_genes, np.nan)

    # Handle genelist. R toptable.R:168 wraps an explicit vector
    # genelist as `data.frame(ID=genelist)` so it appears as an ID
    # column. pylimma additionally auto-defaults `genelist` to
    # `fit["genes"]` (set from rownames in lm_fit) - that auto-default
    # is treated as the row index, NOT an ID column, to preserve
    # rownames flow through the pipeline.
    if genelist is None:
        genes = [f"gene{i + 1}" for i in range(n_genes)]
        genelist_df = None
    elif isinstance(genelist, pd.DataFrame):
        genes = list(genelist.index) if len(genelist.index) == n_genes else list(range(n_genes))
        genelist_df = genelist.copy()
    elif isinstance(genelist, (list, np.ndarray)):
        if len(genelist) == n_genes:
            if _genelist_explicit:
                # Explicit vector → ID column, row index from fit rownames
                # (or 1..N fallback matching R toptable.R:170-172).
                genelist_df = pd.DataFrame({"ID": list(genelist)})
                genes = list(range(1, n_genes + 1))
            else:
                # Auto-default from fit["genes"] → use as row index
                genes = list(genelist)
                genelist_df = None
        else:
            genes = [f"gene{i}" for i in range(n_genes)]
            genelist_df = None
    else:
        genes = [f"gene{i + 1}" for i in range(n_genes)]
        genelist_df = None

    # Handle duplicated gene names (DataFrame index must be unique)
    genes, genelist_df = _handle_duplicated_rownames(genes, genelist_df)

    # Confidence intervals
    margin_error = None
    if confint:
        stdev_unscaled = fit.get("stdev_unscaled")
        s2_post = fit.get("s2_post")
        df_total = fit.get("df_total")
        if stdev_unscaled is None or s2_post is None or df_total is None:
            raise ValueError(
                "Need stdev_unscaled, s2_post, df_total in fit for confidence intervals"
            )
        if isinstance(confint, float):
            alpha = (1 + confint) / 2
        else:
            alpha = 0.975  # 95% CI
        margin_error = np.sqrt(s2_post) * stdev_unscaled[:, coef] * stats.t.ppf(alpha, df_total)

    # Multiple testing adjustment
    adj_p_val = p_adjust(p_val, method=adjust_method)

    # Filter by p-value and lfc thresholds.
    # R toptable.R:233-246 gates filtering on `p.value < 1 | lfc > 0`,
    # masking NaN sig rows to FALSE only inside that block. With default
    # arguments R does no filtering and preserves NaN-p-value rows.
    keep = np.ones(n_genes, dtype=bool)
    if p_value < 1 or lfc > 0:
        if p_value < 1:
            keep &= adj_p_val <= p_value
        if lfc > 0:
            # `abs(M) >= lfc` (toptable.R:234) - inclusive boundary.
            keep &= np.abs(log_fc) >= lfc
        keep &= ~np.isnan(p_val)  # R: sig[is.na(sig)] <- FALSE

    if not np.any(keep):
        return pd.DataFrame()

    # Apply filter
    idx = np.where(keep)[0]
    log_fc = log_fc[idx]
    t_stat = t_stat[idx]
    p_val = p_val[idx]
    adj_p_val = adj_p_val[idx]
    b_stat = b_stat[idx]
    ave_expr = ave_expr[idx]
    genes = [genes[i] for i in idx]
    if genelist_df is not None:
        genelist_df = genelist_df.iloc[idx].reset_index(drop=True)
    if margin_error is not None:
        margin_error = margin_error[idx]

    # Helper function for sorting
    # use_abs: primary sort uses abs() for t and logFC; resort uses signed values (R behaviour)
    def _get_sort_order(sort_col, data_dict, default_order, use_abs=True):
        # R toptable.R:188: `sort.by="A" || sort.by=="Amean"` -> "AveExpr".
        sort_by_map = {
            "B": "b",
            "b": "b",
            "P": "p",
            "p": "p",
            "T": "t",
            "t": "t",
            "logFC": "logFC",
            "log_fc": "logFC",
            "M": "logFC",
            "AveExpr": "AveExpr",
            "ave_expr": "AveExpr",
            "A": "AveExpr",
            "Amean": "AveExpr",
            "none": "none",
        }
        col = sort_by_map.get(sort_col, sort_col)
        # Use stable descending sort (np.argsort on negated values with
        # kind='stable') to match R's order(-x) tie-break behaviour.
        if col == "b":
            return np.argsort(-data_dict["b"], kind="stable")
        elif col == "p":
            return np.argsort(data_dict["p"], kind="stable")
        elif col == "t":
            vals = np.abs(data_dict["t"]) if use_abs else data_dict["t"]
            return np.argsort(-vals, kind="stable")
        elif col == "logFC":
            vals = np.abs(data_dict["logFC"]) if use_abs else data_dict["logFC"]
            return np.argsort(-vals, kind="stable")
        elif col == "AveExpr":
            return np.argsort(-data_dict["AveExpr"], kind="stable")
        else:
            return default_order

    data_dict = {"b": b_stat, "p": p_val, "t": t_stat, "logFC": log_fc, "AveExpr": ave_expr}

    # Sort by primary column (uses absolute values for t and logFC)
    order = _get_sort_order(sort_by, data_dict, np.arange(len(log_fc)), use_abs=True)

    # Limit number
    if number < len(order):
        order = order[:number]

    # Apply primary sort
    log_fc = log_fc[order]
    t_stat = t_stat[order]
    p_val = p_val[order]
    adj_p_val = adj_p_val[order]
    b_stat = b_stat[order]
    ave_expr = ave_expr[order]
    genes = [genes[i] for i in order]
    if genelist_df is not None:
        genelist_df = genelist_df.iloc[order].reset_index(drop=True)
    if margin_error is not None:
        margin_error = margin_error[order]

    # Resort if requested (uses signed values for t and logFC, matching R)
    if resort_by is not None:
        data_dict = {"b": b_stat, "p": p_val, "t": t_stat, "logFC": log_fc, "AveExpr": ave_expr}
        resort_order = _get_sort_order(resort_by, data_dict, np.arange(len(log_fc)), use_abs=False)
        log_fc = log_fc[resort_order]
        t_stat = t_stat[resort_order]
        p_val = p_val[resort_order]
        adj_p_val = adj_p_val[resort_order]
        b_stat = b_stat[resort_order]
        ave_expr = ave_expr[resort_order]
        genes = [genes[i] for i in resort_order]
        if genelist_df is not None:
            genelist_df = genelist_df.iloc[resort_order].reset_index(drop=True)
        if margin_error is not None:
            margin_error = margin_error[resort_order]

    # Build DataFrame. R toptable.R:268-280 puts AveExpr / B in the
    # output only when their source slots are non-NULL. Mirror that.
    cols: dict[str, np.ndarray] = {"log_fc": log_fc}
    if confint and margin_error is not None:
        cols["ci_l"] = log_fc - margin_error
        cols["ci_r"] = log_fc + margin_error
    if has_amean:
        cols["ave_expr"] = ave_expr
    cols["t"] = t_stat
    cols["p_value"] = p_val
    cols["adj_p_value"] = adj_p_val
    if has_lods:
        cols["b"] = b_stat
    df = pd.DataFrame(cols)

    # Add genelist columns if DataFrame
    if genelist_df is not None:
        for col in genelist_df.columns:
            if col not in df.columns:
                df[col] = genelist_df[col].values

    df.index = genes
    df.index.name = "gene"

    return df



[docs]
def top_table_f(
    fit: dict,
    number: int = 10,
    genelist=None,
    adjust_method: str = "BH",
    sort_by: str = "F",
    p_value: float = 1.0,
    fc: float | None = None,
    lfc: float | None = None,
    *,
    coef_idx: list[int] | None = None,
    resort_by: str | None = None,
    _genelist_explicit: bool | None = None,
) -> pd.DataFrame:
    """
    Top table for multiple coefficients ranked by F-statistic.

    Port of R's ``limma::topTableF``. Positional arguments follow R's
    order exactly; ``coef_idx`` and ``resort_by`` are pylimma-only
    extensions and keyword-only.

    Parameters
    ----------
    fit : dict
        Fit object from :func:`e_bayes` containing F-statistics.
    number : int, default 10
        Maximum number of genes to return.
    genelist : DataFrame, list, or array, optional
        Gene annotations to include in output. When None, uses
        ``fit.get("genes")`` (matches R's ``genelist=fit$genes``
        default).

        For AnnData input, fit["genes"] only carries var_names; pass
        ``genelist=adata.var`` explicitly to merge annotation columns
        (e.g. ``symbol``, ``chromosome``) into the output table.
    adjust_method : str, default "BH"
        Multiple-testing adjustment method.
    sort_by : str, default "F"
        Column to sort by. ``"F"`` or ``"none"`` (R).
    p_value : float, default 1.0
        Adjusted p-value cutoff.
    fc : float, optional
        Fold-change cutoff. If given, sets ``lfc = log2(fc)``.
        Must be >= 1.
    lfc : float, optional
        Log fold-change cutoff. Defaults to 0 when both ``fc`` and
        ``lfc`` are None (R's NULL).
    coef_idx : list of int, optional (keyword-only)
        Indices of coefficients to include. Defaults to all columns
        of ``fit["coefficients"]``, matching R's ``topTableF`` which
        always uses the whole coefficient matrix.
    resort_by : str, optional (keyword-only)
        pylimma extension: secondary sort column applied after
        ``sort_by`` + truncation.

    Returns
    -------
    DataFrame
        Table of top genes ranked by F-statistic.
    """
    # R topTableF.R:7 emits a deprecation message on every call. Mirror
    # it with DeprecationWarning so downstream tooling (pytest's
    # warning capture, IDE linters) sees an equivalent signal.
    import warnings as _warnings

    _warnings.warn(
        "top_table_f is obsolete and will be removed in a future "
        "version of pylimma. Please consider using top_table instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Resolve lfc from fc / lfc exactly as R does (toptable.R:33-39):
    # when fc is supplied, lfc is silently overridden with log2(fc) -
    # any user-supplied lfc value is discarded without warning.
    if fc is not None:
        if fc < 1:
            raise ValueError("fc must be greater than or equal to 1")
        lfc = float(np.log2(fc))
    elif lfc is None:
        lfc = 0.0

    # R topTableF.R:38: `sort.by <- match.arg(sort.by, c("F","none"))`.
    # match.arg errors on anything else; mirror that here.
    if sort_by not in ("F", "none"):
        raise ValueError(f"sort_by={sort_by!r} not recognised. Must be one of 'F', 'none'.")

    # R topTableF.R:12: `rn <- rownames(M)`. When fit["coefficients"]
    # is a DataFrame, surface its index as `coef_rownames` so duplicate
    # detection (R-B11a topTableF.R:27-28) can promote them to an ID
    # column. Plain ndarray inputs leave coef_rownames at None.
    coef_obj = fit["coefficients"]
    coef_rownames = None
    if isinstance(coef_obj, pd.DataFrame):
        coef_rownames = list(coef_obj.index)
    coef_matrix = np.asarray(coef_obj)
    if coef_idx is None:
        # R's topTableF uses the whole coefficient matrix.
        coef_idx = list(range(coef_matrix.shape[1]))
    coefficients = coef_matrix[:, coef_idx]
    n_genes = coefficients.shape[0]

    # Default genelist from fit, matching R's genelist=fit$genes.
    # _genelist_explicit is 3-state: True/False from the wrapper
    # (which has already handled auto-default), None when the user
    # invoked top_table_f directly. In the direct-call case we need
    # to auto-detect: a non-None genelist at entry IS explicit.
    if _genelist_explicit is None:
        if genelist is not None:
            _genelist_explicit = True
        else:
            _genelist_explicit = False
            genelist = fit.get("genes")
    elif genelist is None:
        # Wrapper passed an explicit flag but no genelist - this can
        # happen if the wrapper's auto-default also returned None.
        genelist = fit.get("genes")

    # F-statistics
    f_stat = fit.get("F")
    f_p_val = fit.get("F_p_value")

    if f_stat is None or f_p_val is None:
        raise ValueError("F-statistics not found. Run e_bayes() with multiple coefficients.")

    # R topTableF.R:89: `tab$AveExpr <- Amean[o]`. When Amean is NULL,
    # R's `data.frame$col <- NULL` is a no-op, so the column is absent.
    # Track absence explicitly so the DataFrame builder can omit it.
    has_amean = "Amean" in fit and fit["Amean"] is not None
    if has_amean:
        ave_expr = np.asarray(fit["Amean"])
        # R: `if(NCOL(A)>1) A <- rowMeans(A, na.rm=TRUE)`.
        if ave_expr.ndim > 1 and ave_expr.shape[1] > 1:
            ave_expr = np.nanmean(ave_expr, axis=1)
    else:
        ave_expr = np.full(n_genes, np.nan)

    # Handle genelist. R topTableF.R:85 wraps an explicit vector as
    # `data.frame(ProbeID=genelist)` so it appears as a ProbeID column.
    # pylimma's auto-default from `fit["genes"]` is used as the row
    # index instead - see `_top_table_t` for the same pattern.
    if genelist is None:
        genes = (
            list(coef_rownames)
            if coef_rownames is not None
            else [f"gene{i + 1}" for i in range(n_genes)]
        )
        genelist_df = None
    elif isinstance(genelist, pd.DataFrame):
        genes = list(genelist.index) if len(genelist.index) == n_genes else list(range(n_genes))
        genelist_df = genelist.copy()
    elif isinstance(genelist, (list, np.ndarray)):
        if len(genelist) == n_genes:
            if _genelist_explicit:
                genelist_df = pd.DataFrame({"ProbeID": list(genelist)})
                genes = list(range(1, n_genes + 1))
            else:
                genes = list(genelist)
                genelist_df = None
        else:
            genes = [f"gene{i}" for i in range(n_genes)]
            genelist_df = None
    else:
        genes = [f"gene{i + 1}" for i in range(n_genes)]
        genelist_df = None

    # R topTableF.R:87-100: when rownames(M) (i.e. fit$coefficients
    # rownames) has duplicates, promote them to an ID/ID0 column and
    # reset rn to 1..N. coef_rownames is None for ndarray fits;
    # _handle_duplicated_rownames() catches the simpler case of
    # duplicates inside `genes` itself.
    if coef_rownames is not None and len(coef_rownames) != len(set(coef_rownames)):
        if genelist_df is None:
            genelist_df = pd.DataFrame({"ID": list(coef_rownames)})
        elif "ID" in genelist_df.columns:
            genelist_df = genelist_df.copy()
            genelist_df["ID0"] = list(coef_rownames)
        else:
            genelist_df = genelist_df.copy()
            genelist_df["ID"] = list(coef_rownames)
        genes = list(range(1, n_genes + 1))
    else:
        # Handle duplicated gene names (DataFrame index must be unique)
        genes, genelist_df = _handle_duplicated_rownames(genes, genelist_df)

    # Multiple testing adjustment
    adj_p_val = p_adjust(f_p_val, method=adjust_method)

    # Filter (R topTableF.R: NaN sig dropped only when filtering is active).
    keep = np.ones(n_genes, dtype=bool)
    if p_value < 1 or lfc > 0:
        if p_value < 1:
            keep &= adj_p_val <= p_value
        if lfc > 0:
            keep &= np.any(np.abs(coefficients) > lfc, axis=1)
        keep &= ~np.isnan(f_p_val)

    if not np.any(keep):
        return pd.DataFrame()

    idx = np.where(keep)[0]
    coefficients = coefficients[idx]
    f_stat = f_stat[idx]
    f_p_val = f_p_val[idx]
    adj_p_val = adj_p_val[idx]
    ave_expr = ave_expr[idx]
    genes = [genes[i] for i in idx]
    if genelist_df is not None:
        genelist_df = genelist_df.iloc[idx].reset_index(drop=True)

    # Helper function for sorting
    def _get_sort_order(sort_col, f_stat, f_p_val, ave_expr, default_order):
        sort_by_map = {
            "F": "F",
            "f": "F",
            "P": "p",
            "p": "p",
            "AveExpr": "AveExpr",
            "ave_expr": "AveExpr",
            "A": "AveExpr",
            "none": "none",
        }
        col = sort_by_map.get(sort_col, "F" if sort_col == "B" else sort_col)
        if col == "F":
            return np.argsort(f_p_val, kind="stable")  # ascending p for F
        elif col == "p":
            return np.argsort(f_p_val, kind="stable")
        elif col == "AveExpr":
            return np.argsort(-ave_expr, kind="stable")
        else:
            return default_order

    # Sort by primary column
    order = _get_sort_order(sort_by, f_stat, f_p_val, ave_expr, np.arange(len(f_stat)))

    if number < len(order):
        order = order[:number]

    # Apply primary sort
    coefficients = coefficients[order]
    f_stat = f_stat[order]
    f_p_val = f_p_val[order]
    adj_p_val = adj_p_val[order]
    ave_expr = ave_expr[order]
    genes = [genes[i] for i in order]
    if genelist_df is not None:
        genelist_df = genelist_df.iloc[order].reset_index(drop=True)

    # Resort if requested
    if resort_by is not None:
        resort_order = _get_sort_order(resort_by, f_stat, f_p_val, ave_expr, np.arange(len(f_stat)))
        coefficients = coefficients[resort_order]
        f_stat = f_stat[resort_order]
        f_p_val = f_p_val[resort_order]
        adj_p_val = adj_p_val[resort_order]
        ave_expr = ave_expr[resort_order]
        genes = [genes[i] for i in resort_order]
        if genelist_df is not None:
            genelist_df = genelist_df.iloc[resort_order].reset_index(drop=True)

    # Build DataFrame with coefficient columns
    # Use contrast_names if available, otherwise generate default names
    stored_names = fit.get("contrast_names") or fit.get("coef_names")
    # R topTableF.R:13: default colnames are `Coef1, Coef2, ...`
    # (1-based, no separator) when fit$coefficients has none.
    if stored_names is not None:
        coef_names = [stored_names[i] for i in coef_idx]
    else:
        coef_names = [f"Coef{i + 1}" for i in coef_idx]

    # R topTableF.R:88 builds the table with genelist FIRST, then
    # coefficient columns: `tab <- data.frame(genelist[o,,drop=FALSE],
    # M[o,,drop=FALSE])`. Mirror that column order.
    cols: dict[str, np.ndarray] = {}
    if genelist_df is not None:
        for col in genelist_df.columns:
            cols[col] = genelist_df[col].values
    for j, name in enumerate(coef_names):
        cols[name] = coefficients[:, j]
    if has_amean:
        cols["ave_expr"] = ave_expr
    cols["F"] = f_stat
    cols["p_value"] = f_p_val
    cols["adj_p_value"] = adj_p_val
    df = pd.DataFrame(cols)

    df.index = genes
    df.index.name = "gene"

    return df