Differential Expression Analysis of Mouse Mammary RNA-seq with pylimma

This tutorial walks through a standard limma-style RNA-seq differential-expression workflow using pylimma. It mirrors the equivalent analysis path from the R limma User’s Guide.

R-parity validation: the sibling notebook `gse60450_R_vs_Python.ipynb <gse60450_R_vs_Python.ipynb>`__ runs R limma 3.66.0 on the same input and compares the outputs numerically.

Dataset

Fu et al. 2015, Nature Cell Biology 17:365-375: paired-end RNA-seq of FACS-sorted mouse mammary epithelial populations. The shipped subset covers basal and luminal cell types across six replicates per group (12 samples total) for a ~27k-gene count matrix.

Pipeline

  1. Load counts and sample metadata

  2. Library-size sanity check

  3. Filter low-expression genes

  4. log-CPM kernel-density plot (before / after filter)

  5. Multi-dimensional scaling (coloured by group)

  6. Build the design matrix

  7. Run voom

  8. Fit linear models and contrasts

  9. Empirical Bayes moderation

  10. Extract the top table

  11. MD plot

  12. Volcano plot

  13. Heatmap of top 50 DE genes (row-ordered by signed t-statistic)

  14. Summary

[1]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

REPO = Path.cwd().resolve().parents[1]
sys.path.insert(0, str(REPO))
sys.path.insert(0, str(REPO / 'data'))

import generate_data as gd

import pylimma

pd.set_option('display.width', 120)
pd.set_option('display.max_columns', 10)
np.set_printoptions(precision=4, suppress=True)

# Shared plotting helpers used by every tutorial.

def _log_cpm(mat, lib_size):
    cpm = mat.div(lib_size, axis=1) * 1e6 if hasattr(mat, 'div') \
          else (mat / lib_size) * 1e6
    return np.log2(cpm + 1e-2)


def plot_log_cpm_density(mat, ax, title=""):
    '''Kernel-smoothed log-CPM density, one line per sample.

    Matches edgeR/limma's standard density plot (limma User's Guide
    Figure 15.1): a Gaussian KDE per sample drawn on a shared grid.
    '''
    from scipy.stats import gaussian_kde
    arr = mat.values if hasattr(mat, 'values') else np.asarray(mat)
    # Shared evaluation grid spanning the combined range.
    lo, hi = np.nanmin(arr), np.nanmax(arr)
    grid = np.linspace(lo, hi, 200)
    for j in range(arr.shape[1]):
        col = arr[:, j]
        col = col[np.isfinite(col)]
        if col.size < 2:
            continue
        kde = gaussian_kde(col)
        ax.plot(grid, kde(grid), linewidth=0.8, alpha=0.7)
    ax.set_xlabel('log2 CPM')
    ax.set_ylabel('Density')
    ax.set_title(title)


def mds_coords(mat, top=500):
    '''Pairwise leading-logFC MDS, matching R limma's plotMDS.default.

    Returns the (n_samples, ndim) coordinate matrix and the percent of
    variance explained per dimension. We reuse pylimma's private
    _mds_coordinates for consistency with plot_mds().
    '''
    from pylimma.plotting import _mds_coordinates
    r = _mds_coordinates(np.asarray(mat, dtype=np.float64), top=top,
                         gene_selection='pairwise', ndim=2)
    lam = np.maximum(r['eigen_values'], 0.0)
    coords = r['eigen_vectors'] * np.sqrt(lam)
    return coords, r['var_explained']


def plot_mds_coloured(mat, groups, ax, top=500, title='MDS'):
    '''MDS scatter plot coloured by sample group.'''
    coords, var_exp = mds_coords(mat, top=top)
    pct = np.round(var_exp * 100).astype(int)
    groups = np.asarray(groups)
    levels = sorted(pd.unique(groups))
    palette = plt.get_cmap('tab10').colors
    for k, level in enumerate(levels):
        m = groups == level
        ax.scatter(coords[m, 0], coords[m, 1],
                   s=55, color=palette[k % len(palette)],
                   label=str(level), edgecolor='k', linewidth=0.3)
    ax.axhline(0, color='grey', linewidth=0.4, linestyle=':')
    ax.axvline(0, color='grey', linewidth=0.4, linestyle=':')
    ax.set_xlabel(f'Leading logFC dim 1 ({pct[0]}%)')
    ax.set_ylabel(f'Leading logFC dim 2 ({pct[1]}%)')
    ax.set_title(title)
    ax.legend(fontsize=8, frameon=False)


def plot_heatmap(E, groups, fit, n_top=50, ax=None):
    '''Heatmap of top-n DE rows showing BOTH directions.

    Picks the top n/2 genes with the most positive t and the top n/2
    with the most negative t (by smallest p within each side), then
    stacks them - up-regulated in the contrast at top, down-regulated
    at bottom. Row z-scored.'''
    t = np.asarray(fit['t']).ravel()
    p = np.asarray(fit['p_value']).ravel()
    half = n_top // 2
    up_pool   = np.where(t > 0)[0]
    down_pool = np.where(t < 0)[0]
    top_up    = up_pool[np.argsort(p[up_pool])[:half]]
    top_down  = down_pool[np.argsort(p[down_pool])[:n_top - half]]
    # Sort each block by signed t descending so most-up is top and
    # most-down is bottom.
    top_up   = top_up[np.argsort(-t[top_up])]
    top_down = top_down[np.argsort(-t[top_down])]
    ordered  = np.concatenate([top_up, top_down])

    mat = E[ordered]
    col_order = np.argsort(groups)
    mat_sorted = mat[:, col_order]
    z = (mat_sorted - mat_sorted.mean(axis=1, keepdims=True)) / \
        (mat_sorted.std(axis=1, keepdims=True) + 1e-8)

    if ax is None:
        fig, ax = plt.subplots(figsize=(9, 8))
    im = ax.imshow(z, aspect='auto', cmap='RdBu_r', vmin=-2, vmax=2)
    ax.set_xticks(range(len(col_order)))
    ax.set_xticklabels(np.asarray(groups)[col_order], rotation=90, fontsize=6)
    ax.set_yticks([])
    ax.set_title(f'Top {n_top} DE genes (top {half} up + top {n_top - half} down; row z-scored)')
    return ax, im, ordered

1. Load counts and sample metadata

[2]:
data = gd.load_gse60450()
counts  = data['counts']
targets = data['targets']

print(f"counts shape: {counts.shape}  (genes x samples)")
group = targets["group"].astype(str).str.split(".").str[0]
print("\ncelltype breakdown:")
print(group.value_counts())
targets.head()
counts shape: (27179, 12)  (genes x samples)

celltype breakdown:
group
basal      6
luminal    6
Name: count, dtype: int64
[2]:
immunophenotype developmental_stage group
MCL1-DG basal virgin basal.virgin
MCL1-DH basal virgin basal.virgin
MCL1-DI basal pregnant basal.pregnant
MCL1-DJ basal pregnant basal.pregnant
MCL1-DK basal lactate basal.lactate

2. Library sizes

[3]:
lib_size = counts.sum(axis=0)

fig, ax = plt.subplots(figsize=(8, 3))
ax.bar(range(len(lib_size)), lib_size.values / 1e6, color='steelblue')
ax.set_xticks(range(len(lib_size)))
ax.set_xticklabels(counts.columns, rotation=45, ha='right', fontsize=7)
ax.set_ylabel('Library size (M reads)')
ax.set_title('Library sizes per sample')
fig.tight_layout()
plt.show()

print(f"min: {lib_size.min() / 1e6:.1f}M,  max: {lib_size.max() / 1e6:.1f}M")
../_images/tutorials_gse60450_5_0.png
min: 20.0M,  max: 24.7M

3. Filter low-expression genes

Keep genes with CPM > (10 / median library size in millions) in at least as many samples as the smallest group. This rule of thumb, from the limma User’s Guide, removes ~half the low-count noise without sacrificing real DE signal.

[4]:
cpm = counts.div(lib_size, axis=1) * 1e6
n_per_group = group.value_counts().min()
cutoff = 10.0 / (lib_size.median() / 1e6)
keep = (cpm > cutoff).sum(axis=1) >= n_per_group
counts_f = counts.loc[keep]
print(f"filter kept {keep.sum():,} / {len(keep):,} genes "
      f"({100 * keep.mean():.1f}%)")
filter kept 14,453 / 27,179 genes (53.2%)

4. log-CPM density before / after filter

Kernel density estimate per sample on a shared grid. Before filtering there is a big mass near zero-expression; filtering removes it and the sample densities overlap cleanly.

[5]:
lcpm_before = _log_cpm(counts,   lib_size)
lcpm_after  = _log_cpm(counts_f, lib_size)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 3.5), sharex=True)
plot_log_cpm_density(lcpm_before, ax1,
                     f"Before filter ({len(counts):,} genes)")
plot_log_cpm_density(lcpm_after,  ax2,
                     f"After filter ({len(counts_f):,} genes)")
fig.tight_layout()
plt.show()
../_images/tutorials_gse60450_9_0.png

5. MDS (coloured by group)

Samples projected into 2D via the top-500 leading log-FCs. Tight clusters separated along the biological axis of interest indicate a well-structured experiment.

[6]:
fig, ax = plt.subplots(figsize=(5.5, 5))
plot_mds_coloured(lcpm_after.values, group.values, ax,
                  top=500, title='MDS (celltype)')
fig.tight_layout()
plt.show()
../_images/tutorials_gse60450_11_0.png

6. Design matrix

Two-group cell-means design; contrast = luminal - basal.

[7]:
design, C = gd.build_two_group_design(group)
design_df = pd.DataFrame(
    design, index=counts_f.columns,
    columns=sorted(group.unique()),
)
print('Design matrix:')
print(design_df)
print(f"\nContrast (luminal - basal): {C.ravel()}")
Design matrix:
         basal  luminal
MCL1-DG    1.0      0.0
MCL1-DH    1.0      0.0
MCL1-DI    1.0      0.0
MCL1-DJ    1.0      0.0
MCL1-DK    1.0      0.0
MCL1-DL    1.0      0.0
MCL1-LA    0.0      1.0
MCL1-LB    0.0      1.0
MCL1-LC    0.0      1.0
MCL1-LD    0.0      1.0
MCL1-LE    0.0      1.0
MCL1-LF    0.0      1.0

Contrast (luminal - basal): [-1.  1.]

7. voom

voom models the mean-variance trend in log-CPM space and returns per-observation precision weights. The plot below is the trend fit; the scatter behind it is per-gene variance vs mean expression.

[8]:
v = pylimma.voom(counts_f.values.astype(float), design, plot=True)
plt.show()

print(f"voom$E shape:       {v['E'].shape}")
print(f"voom$weights shape: {v['weights'].shape}")
/Users/John/miniconda3/lib/python3.11/site-packages/h5py/__init__.py:36: UserWarning: h5py is running against HDF5 1.14.3 when it was built against 1.14.2, this may cause problems
  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "
../_images/tutorials_gse60450_15_1.png
voom$E shape:       (14453, 12)
voom$weights shape: (14453, 12)

8. Fit linear models and contrasts

[9]:
fit = pylimma.lm_fit(v['E'], design, weights=v['weights'])
fit = pylimma.contrasts_fit(fit, contrasts=C)
print(f"fit$coefficients shape: {fit['coefficients'].shape}")
fit$coefficients shape: (14453, 1)

9. Empirical Bayes moderation

[10]:
fit = pylimma.e_bayes(fit)
print(f"s2_prior: {fit['s2_prior']:.4f}")
print(f"df_prior: {fit['df_prior']:.2f}")
s2_prior: 0.7709
df_prior: 3.70

10. Top table

[11]:
tt = pylimma.top_table(fit, coef=0, number=np.inf, sort_by='p')
tt.index = counts_f.index.astype(str)[np.argsort(np.asarray(fit['p_value']).ravel())]
print(f"DE calls at adj_p_value < 0.05: "
      f"{(tt['adj_p_value'] < 0.05).sum():,} genes")
tt.head(10)
DE calls at adj_p_value < 0.05: 6,503 genes
[11]:
log_fc ave_expr t p_value adj_p_value b
EntrezGeneID
110308 -9.275588 6.724141 -36.574445 4.701459e-15 6.795019e-11 22.520253
12293 -8.677084 3.587313 -29.336405 9.294360e-14 4.675699e-10 19.794948
12821 -8.009916 6.069408 -28.673062 1.265345e-13 4.675699e-10 20.384824
22061 -8.385342 4.191759 -28.625416 1.294042e-13 4.675699e-10 19.803047
20692 -6.564264 7.640396 -26.468699 3.716121e-13 1.074182e-09 19.927955
12560 -7.487898 4.908416 -25.759656 5.353172e-13 1.164151e-09 19.136431
50706 -8.053250 7.948807 -25.660343 5.638313e-13 1.164151e-09 19.514670
77220 -8.435911 1.182273 -23.765867 1.577334e-12 2.535524e-09 17.351796
22004 -8.967683 6.988385 -23.744705 1.596263e-12 2.535524e-09 18.482607
59012 -7.891619 3.569024 -23.480766 1.853937e-12 2.535524e-09 17.870704

11. MD plot

x = average log-expression, y = log fold-change (luminal - basal). DE calls coloured by direction.

[12]:
logFC = fit['coefficients'][:, 0]
AveExpr = fit.get('Amean') if fit.get('Amean') is not None \
          else v['E'].mean(axis=1)
adj_sig = pylimma.decide_tests(fit, p_value=0.05).ravel() != 0

fig, ax = plt.subplots(figsize=(6.5, 4.5))
ax.scatter(AveExpr[~adj_sig], logFC[~adj_sig], s=2, c='lightgrey',
           label=f"n.s.  ({(~adj_sig).sum():,})")
ax.scatter(AveExpr[adj_sig & (logFC > 0)],
           logFC[adj_sig & (logFC > 0)], s=4, c='firebrick',
           label=f"up    ({(adj_sig & (logFC > 0)).sum():,})")
ax.scatter(AveExpr[adj_sig & (logFC < 0)],
           logFC[adj_sig & (logFC < 0)], s=4, c='steelblue',
           label=f"down  ({(adj_sig & (logFC < 0)).sum():,})")
ax.axhline(0, color='k', linewidth=0.5)
ax.set_xlabel('Average log-CPM')
ax.set_ylabel('log2 fold-change (luminal - basal)')
ax.set_title('MD plot')
ax.legend(fontsize=8, frameon=False)
fig.tight_layout()
plt.show()
../_images/tutorials_gse60450_23_0.png

12. Volcano plot

[13]:
p = np.asarray(fit['p_value']).ravel()
neglogp = -np.log10(np.maximum(p, 1e-300))

fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(logFC[~adj_sig], neglogp[~adj_sig], s=2, c='lightgrey')
ax.scatter(logFC[adj_sig & (logFC > 0)],
           neglogp[adj_sig & (logFC > 0)], s=4, c='firebrick')
ax.scatter(logFC[adj_sig & (logFC < 0)],
           neglogp[adj_sig & (logFC < 0)], s=4, c='steelblue')
ax.axhline(-np.log10(0.05), color='k', linewidth=0.5, linestyle='--')
ax.axvline(0, color='k', linewidth=0.5)
ax.set_xlabel('log2 fold-change (luminal - basal)')
ax.set_ylabel('-log10 p-value')
ax.set_title('Volcano plot')
fig.tight_layout()
plt.show()
../_images/tutorials_gse60450_25_0.png

13. Heatmap of top 50 DE genes

Rows = genes ordered by signed t-statistic (up-regulated at top, down-regulated at bottom). Columns grouped by cell type. Per-gene z-scored log-CPM.

[14]:
fig, ax = plt.subplots(figsize=(9, 8))
ax, im, ordered = plot_heatmap(v['E'], group.values, fit,
                                n_top=50, ax=ax)
gene_ids = counts_f.index.astype(str)[ordered]
ax.set_yticks(range(len(ordered)))
ax.set_yticklabels(gene_ids, fontsize=5)
fig.colorbar(im, ax=ax, shrink=0.5, label='z-score')
fig.tight_layout()
plt.show()
../_images/tutorials_gse60450_27_0.png