import numpy as np
import matplotlib.pyplot as plt
from beam.cards import properties_for
from beam.mcda import (
aggregate_across_datasets,
critical_difference,
leave_one_metric_out,
run,
run_from_registry,
smaa,
smallest_weight_perturbation,
)
from beam.scenarios import (
all_scenarios,
normalization_failure_scenarios,
)
def polarity_for(scenario):
    """Collect the ``polarity`` of every metric property of ``scenario``.

    The properties are looked up with ``properties_for`` from the scenario's
    ``metric_ids``; one polarity is returned per property object.
    """
    metric_ids = list(scenario.metric_ids)
    polarities = []
    for props in properties_for(metric_ids):
        polarities.append(props.polarity)
    return polarities
def bounds_for(scenario):
    """Collect ``(range_lower, range_upper)`` for every metric property of ``scenario``.

    The properties are looked up with ``properties_for`` from the scenario's
    ``metric_ids``; one pair is returned per property object.
    """
    metric_ids = list(scenario.metric_ids)
    bounds = []
    for props in properties_for(metric_ids):
        bounds.append((props.range_lower, props.range_upper))
    return bounds
def unguarded_minmax(scenario):
    """Run the scenario with plain min-max normalization forced on every column.

    This is the behaviour the card defaults improve on. Polarities and bounds
    come from ``polarity_for`` and ``bounds_for``; the result of ``run`` is
    returned unchanged.
    """
    polarity = polarity_for(scenario)
    bounds = bounds_for(scenario)
    metric_ids = list(scenario.metric_ids)
    return run(
        scenario.scores,
        polarity,
        normalization="min_max",
        bounds=bounds,
        metric_ids=metric_ids,
    )
def print_section(label):
print()
print("=" * 64)
print(label)
print("=" * 64)Canonical simulated scenarios
Goal
This vignette runs the simulated scenarios. Four scenarios with documented ground truth feed through the same MCDA report layout as the Duo vignette, to verify at a glance that the pipeline reports what each scenario was built to encode.
- random: anti-correlated scores so no method Pareto-dominates. Which method comes out on top depends entirely on weights; SMAA confidence is spread across the Pareto frontier; the top rank is fragile under weight perturbation.
- dominant: one method dominates on every metric. It is rank 1 under any weighting; SMAA confidence factor for it is 1; no single-criterion weight perturbation can move it off the top rank.
- ties: two methods produce identical scores. They share a rank under any weighting.
- odd dataset: one method is best on most datasets, another is best on one odd dataset. The cross-dataset aggregation rule declared on each metric card picks the method best overall.
Set-up
The aggregation methods used below are wrapped from pymcdm. beam normalizes by metric card, then calls pymcdm on the normalized matrix and keeps the higher-is-better convention. The weighting schemes are beam’s own.
Per-scenario reports
For each scenario the report prints:
- Description and ground truth from the generator.
- Pooled tool by metric scores.
- Ranks under SAW with equal weights using run_from_registry.
- Leave-one-metric-out per-tool stability.
- SMAA rank acceptability index over 500 weight samples.
- Triantaphyllou-Sanchez weight perturbation: most fragile pair and the fragility flag on the top rank.
# Print the full MCDA report for every canonical scenario.
# NOTE(review): indentation was reconstructed from the flattened source; the
# blank-line print() after the tied-pair line is assumed to sit at loop level.
for s in all_scenarios():
    print_section(f"{s.name} ({s.kind})")
    print(s.description)
    print()

    # Ground truth documented on the scenario.
    expected_top = s.expectation.expected_top_ranked
    if expected_top is None:
        expected_label = "none"
    else:
        expected_label = s.method_names[expected_top]
    print("expected top-ranked method:", expected_label)
    tied_pair = s.expectation.tied_pair
    if tied_pair is not None:
        first, second = tied_pair
        print("tied pair:", s.method_names[first], "==", s.method_names[second])
    print()

    # Raw score table: one row per method, one column per metric.
    print("scores:")
    header_cells = ' '.join(f'{m:>9s}' for m in s.metric_ids)
    print(f" method {header_cells}")
    for name, row in zip(s.method_names, s.scores):
        row_cells = ' '.join(f'{v:9.3f}' for v in row)
        print(f" {name:8s} {row_cells}")

    polarity = polarity_for(s)

    # Baseline ranking: SAW with equal weights.
    base = run_from_registry(s.scores, s.metric_ids, weights="equal", method="saw")
    print()
    print("ranks (SAW + equal weights):")
    for name, composite, rank in zip(s.method_names, base.composite, base.ranks):
        print(f" {name:8s} composite={composite:.3f} rank={int(rank)}")

    # Leave-one-metric-out is only attempted when there are at least two metrics.
    if s.scores.shape[1] >= 2:
        loo = leave_one_metric_out(
            s.scores,
            polarity,
            metric_ids=list(s.metric_ids),
            weights="equal",
            method="saw",
        )
        print()
        print("leave-one-metric-out stability:")
        for name, stability in zip(s.method_names, loo.rank_stability):
            print(f" {name:8s} stability={stability:.2f}")
        print(f" most influential metric: {s.metric_ids[loo.most_influential_metric]!r}")

    # SMAA over 500 sampled weightings with a fixed seed.
    sm = smaa(s.scores, polarity, n_samples=500, method="saw", seed=0)
    print()
    print("SMAA rank acceptability (rows = methods, columns = ranks 1..n):")
    for name, row in zip(s.method_names, sm.rank_acceptability_index):
        formatted = [f'{v:.2f}' for v in row]
        print(f" {name:8s} {formatted}")
    print("confidence factor:")
    for name, confidence in zip(s.method_names, sm.confidence_factor):
        print(f" {name:8s} {confidence:.2f}")

    # Smallest single-criterion weight change that flips a pair.
    ts = smallest_weight_perturbation(s.scores, polarity, weights="equal", method="saw")
    print()
    print("Triantaphyllou-Sanchez weight perturbation:")
    pair = ts.most_fragile_pair
    if pair is None:
        print(" no single-criterion weight change can flip any pair")
    else:
        print(
            f" most fragile pair: {s.method_names[pair.higher]} > {s.method_names[pair.lower]} "
            f"flips by delta={pair.delta:+.3f} on {s.metric_ids[pair.criterion]!r}"
        )
    print(f" top rank is fragile (threshold 0.05): {ts.top_rank_is_fragile}")
================================================================
random (no_signal)
================================================================
ARI and runtime are drawn iid then re-paired so the two metrics are anti-correlated. No method Pareto-dominates the rest; the ranking depends entirely on the weighting. SMAA confidence is spread across the Pareto-frontier methods rather than concentrated on one.
expected top-ranked method: none
scores:
method ari runtime
m0 0.368 87.635
m1 0.185 69.337
m2 0.058 36.168
m3 0.070 45.279
m4 0.457 101.083
ranks (SAW + equal weights):
m0 composite=0.254 rank=4
m1 composite=0.276 rank=3
m2 composite=0.529 rank=1
m3 composite=0.426 rank=2
m4 composite=0.228 rank=5
leave-one-metric-out stability:
m0 stability=0.00
m1 stability=0.00
m2 stability=0.50
m3 stability=0.50
m4 stability=0.50
most influential metric: 'ari'
SMAA rank acceptability (rows = methods, columns = ranks 1..n):
m0 ['0.00', '0.52', '0.13', '0.35', '0.00']
m1 ['0.00', '0.00', '0.74', '0.11', '0.15']
m2 ['0.49', '0.00', '0.12', '0.22', '0.17']
m3 ['0.00', '0.46', '0.00', '0.26', '0.27']
m4 ['0.51', '0.02', '0.01', '0.06', '0.40']
confidence factor:
m0 0.00
m1 0.00
m2 0.49
m3 0.00
m4 0.51
Triantaphyllou-Sanchez weight perturbation:
most fragile pair: m2 > m0 flips by delta=-0.009 on 'runtime'
top rank is fragile (threshold 0.05): True
================================================================
dominant (dominant)
================================================================
Method 0 dominates on every metric. Under any positive weighting it ranks first; no single-criterion weight perturbation can move it off the top rank.
expected top-ranked method: m0
scores:
method ari runtime
m0 0.900 20.000
m1 0.131 155.756
m2 0.062 145.025
m3 0.055 104.245
m4 0.294 93.168
ranks (SAW + equal weights):
m0 composite=0.950 rank=1
m1 composite=0.065 rank=4
m2 composite=0.049 rank=5
m3 composite=0.125 rank=3
m4 composite=0.272 rank=2
leave-one-metric-out stability:
m0 stability=1.00
m1 stability=0.00
m2 stability=0.00
m3 stability=0.50
m4 stability=1.00
most influential metric: 'runtime'
SMAA rank acceptability (rows = methods, columns = ranks 1..n):
m0 ['1.00', '0.00', '0.00', '0.00', '0.00']
m1 ['0.00', '0.00', '0.18', '0.33', '0.48']
m2 ['0.00', '0.00', '0.00', '0.51', '0.49']
m3 ['0.00', '0.00', '0.82', '0.16', '0.03']
m4 ['0.00', '1.00', '0.00', '0.00', '0.00']
confidence factor:
m0 1.00
m1 0.00
m2 0.00
m3 0.00
m4 0.00
Triantaphyllou-Sanchez weight perturbation:
most fragile pair: m1 > m2 flips by delta=-0.013 on 'ari'
top rank is fragile (threshold 0.05): False
================================================================
ties (ties)
================================================================
Methods 0 and 1 have identical scores, set above the rest on both metrics, so they tie for the top rank under any weighting.
expected top-ranked method: none
tied pair: m0 == m1
scores:
method ari runtime
m0 0.900 15.000
m1 0.900 15.000
m2 0.070 87.635
m3 0.058 45.279
m4 0.457 36.168
ranks (SAW + equal weights):
m0 composite=0.950 rank=1
m1 composite=0.950 rank=1
m2 composite=0.035 rank=5
m3 composite=0.216 rank=4
m4 composite=0.479 rank=3
leave-one-metric-out stability:
m0 stability=1.00
m1 stability=1.00
m2 stability=0.50
m3 stability=0.50
m4 stability=1.00
most influential metric: 'runtime'
SMAA rank acceptability (rows = methods, columns = ranks 1..n):
m0 ['1.00', '0.00', '0.00', '0.00', '0.00']
m1 ['1.00', '0.00', '0.00', '0.00', '0.00']
m2 ['0.00', '0.00', '0.00', '0.02', '0.98']
m3 ['0.00', '0.00', '0.00', '0.98', '0.02']
m4 ['0.00', '0.00', '1.00', '0.00', '0.00']
confidence factor:
m0 1.00
m1 1.00
m2 0.00
m3 0.00
m4 0.00
Triantaphyllou-Sanchez weight perturbation:
most fragile pair: m3 > m2 flips by delta=-0.488 on 'runtime'
top rank is fragile (threshold 0.05): False
================================================================
odd_dataset (odd_dataset)
================================================================
Method 0 is best on 4 out of 5 datasets; method 1 is best on the last (odd) dataset. After cross-dataset aggregation with the per-metric recommended rule, the pooled ranking still has method 0 first.
expected top-ranked method: m0
scores:
method ari runtime
m0 0.608 38.622
m1 0.322 60.108
m2 0.149 91.515
m3 0.173 84.922
ranks (SAW + equal weights):
m0 composite=0.804 rank=1
m1 composite=0.404 rank=2
m2 composite=0.075 rank=4
m3 composite=0.130 rank=3
leave-one-metric-out stability:
m0 stability=1.00
m1 stability=1.00
m2 stability=1.00
m3 stability=1.00
most influential metric: 'ari'
SMAA rank acceptability (rows = methods, columns = ranks 1..n):
m0 ['1.00', '0.00', '0.00', '0.00']
m1 ['0.00', '1.00', '0.00', '0.00']
m2 ['0.00', '0.00', '0.00', '1.00']
m3 ['0.00', '0.00', '1.00', '0.00']
confidence factor:
m0 1.00
m1 0.00
m2 0.00
m3 0.00
Triantaphyllou-Sanchez weight perturbation:
no single-criterion weight change can flip any pair
top rank is fragile (threshold 0.05): False
SMAA confidence factor per scenario
Bar plot to check the share of weight samples in which each method is top-ranked, side by side across scenarios. The gold bar marks the method documented to rank first, and the green bars mark a documented tied pair. The random scenario has no single method expected to rank first, so all its bars are blue.
# One bar panel per scenario showing the SMAA confidence factor of each method.
scenarios = all_scenarios()
n_scen = len(scenarios)
# squeeze=False makes subplots() return a 2-D array of Axes even for a single
# panel; without it a single scenario yields a bare Axes object that zip()
# cannot iterate. ravel() restores the flat sequence the loop expects.
fig, axes = plt.subplots(
    1, n_scen, figsize=(3.0 * n_scen, 3.2), sharey=True, squeeze=False
)
axes = axes.ravel()
for ax, s in zip(axes, scenarios):
    pol = polarity_for(s)
    rep = smaa(s.scores, pol, n_samples=500, method="saw", seed=0)
    top = s.expectation.expected_top_ranked
    tied = s.expectation.tied_pair
    positions = range(len(s.method_names))

    # Gold: documented top-ranked method; green: documented tied pair;
    # blue: everything else.
    colors = [
        "#d9a521" if top is not None and i == top
        else "#55a868" if tied is not None and i in tied
        else "#3a7ca5"
        for i in positions
    ]
    ax.bar(positions, rep.confidence_factor, color=colors)

    if top is not None:
        truth = f"truth: {s.method_names[top]} ranks first"
    elif tied is not None:
        truth = f"truth: tie {s.method_names[tied[0]]}=={s.method_names[tied[1]]}"
    else:
        truth = "truth: no single top-ranked method"
    ax.set_title(f"{s.name}\n({s.kind})\n{truth}")
    ax.set_xticks(positions)
    ax.set_xticklabels(s.method_names, rotation=45, ha="right")
    ax.set_ylim(0, 1.05)
    ax.set_xlabel("method")
    # NOTE(review): the flattened source does not show whether this call sat
    # inside the loop (label on every panel) or after it (label on the last
    # panel only); it is assumed to be per panel -- confirm.
    ax.set_ylabel("SMAA confidence factor")
fig.tight_layout()
plt.show()
Funky heatmap with rank robustness
The funky heatmap shows a scenario as a glyph table with robustness panels next to the order. The canonical scenarios are single-dataset, so there is no leave-one-dataset-out span here; the panels shown are the rank span across the five aggregations and the SMAA rank-acceptability bar. The random scenario has the two metrics trading off against each other, so the order changes when the aggregation or the weighting changes.
# Rank the random scenario; the resulting run is what the funky heatmap draws.
import beam
from beam.reporting import funky_heatmap_from_run
from beam.scenarios import random_scenario

s = random_scenario()
funky_run = beam.rank(
    s.scores,
    metric_ids=s.metric_ids,
    tool_names=s.method_names,
)
funky_heatmap_from_run(funky_run, title="Random scenario: scores and rank robustness")
The glyph grid is small (five methods, two metrics). The aggregation panel and the SMAA bar show methods changing rank across the aggregations and across the sampled weightings, so the row order is not firm.
Per-dataset ranks for the odd-dataset scenario
Plot to check the contribution of the odd dataset to the pooled ranking.
# Rank the methods on ARI within each dataset of the odd-dataset scenario and
# draw the ranks as an annotated heatmap.
odd = next(s for s in scenarios if s.kind == "odd_dataset")
tensor = odd.scores_per_dataset
n_methods = len(odd.method_names)
n_datasets = tensor.shape[1]
ari_per_dataset_ranks = np.zeros((n_methods, n_datasets), dtype=int)
for d in range(n_datasets):
    # Metric index 0 is treated as ARI; negating sorts the largest value first,
    # and the stable sort keeps the original method order among equal scores.
    order = np.argsort(-tensor[:, d, 0], kind="stable")
    # The method at sorted position k receives rank k + 1.
    ari_per_dataset_ranks[order, d] = np.arange(1, len(order) + 1)

fig, ax = plt.subplots(figsize=(5.5, 3.0))
im = ax.imshow(
    ari_per_dataset_ranks,
    cmap="RdYlGn_r",
    aspect="auto",
    vmin=1,
    vmax=n_methods,
)
ax.set_xticks(range(n_datasets))
ax.set_xticklabels([f"d{i}" for i in range(n_datasets)])
ax.set_yticks(range(n_methods))
ax.set_yticklabels(odd.method_names)
ax.set_xlabel("dataset")
ax.set_ylabel("method")
truth_top = odd.method_names[odd.expectation.expected_top_ranked]
ax.set_title(f"truth: {truth_top} ranks first overall; one odd dataset flips locally")

# Write each rank into its cell (x is the dataset column, y is the method row).
n_rows, n_cols = ari_per_dataset_ranks.shape
for row_idx in range(n_rows):
    for col_idx in range(n_cols):
        ax.text(
            col_idx,
            row_idx,
            ari_per_dataset_ranks[row_idx, col_idx],
            ha="center",
            va="center",
            fontweight="bold",
            color="black",
        )
cbar = fig.colorbar(im, ax=ax, ticks=range(1, n_methods + 1))
cbar.set_label("ARI rank (1 = best)")
fig.tight_layout()
plt.show()
Normalization strategy matters
Plain min-max scaling is the obvious default, and it is wrong for some metrics. Two scenarios make the failure concrete. In each one the top-ranked method under unguarded all-min_max differs from the one under the card defaults. See the explanation page on normalization and scales for the reasoning.
# For each normalization-failure scenario, compare the ranks under forced
# min-max with the ranks under the card defaults.
for s in normalization_failure_scenarios():
    print_section(f"{s.name} ({s.kind})")
    print(s.description)
    print()

    unguarded = unguarded_minmax(s)
    guarded = run_from_registry(s.scores, s.metric_ids, weights="equal", method="saw")

    print("ranks: unguarded all-min_max vs card defaults")
    print(f" method {'min_max':>10s} {'card':>10s}")
    for name, rank_minmax, rank_card in zip(s.method_names, unguarded.ranks, guarded.ranks):
        # Mark every method whose rank differs between the two runs.
        if rank_minmax != rank_card:
            flag = " <- flips"
        else:
            flag = ""
        print(f" {name:8s} {int(rank_minmax):>10d} {int(rank_card):>10d}{flag}")

    print()
    print("card normalization strategy per metric:")
    for metric_id, strategy in zip(s.metric_ids, guarded.normalization):
        print(f" {metric_id:10s} {strategy}")

    # The warnings section is printed only when the unguarded run raised any.
    if unguarded.warnings:
        print()
        print("guard warnings raised under unguarded min-max:")
        for warning in unguarded.warnings:
            print(f" - {warning}")
================================================================
outlier_runtime (minmax_heavy_tail)
================================================================
Five methods, metrics ari and runtime. Methods m0 to m3 sit on a 10, 20, 40, 80 second ladder; m4 is a 5000 second outlier. Under unguarded all-min_max the outlier compresses the good methods to near 0.99 on runtime, so a tiny ARI difference decides the order and m1 comes out on top. Under the card default log_min_max the runtime ladder survives and m0 ranks first.
ranks: unguarded all-min_max vs card defaults
method min_max card
m0 2 1 <- flips
m1 1 2 <- flips
m2 3 3
m3 4 4
m4 5 5
card normalization strategy per metric:
ari baseline_relative
runtime log_min_max
guard warnings raised under unguarded min-max:
- metric runtime: min_max used an empirical upper bound; normalized values are not comparable across different method sets
- metric runtime: min_max on a heavy-tailed column (max/median 125); one outlier compresses the rest. Consider log_min_max or rank.
================================================================
chance_baseline (minmax_chance_baseline)
================================================================
Three methods, metrics ari and runtime. m0 is at chance (ARI 0), m1 is modestly better (ARI 0.20) but slower, m2 is strong overall. Unguarded min-max scores the chance ARI as 0.5 and ranks m0 above m1. The card default baseline_relative scores chance as 0 and ranks m1 above m0. m2 ranks first either way.
ranks: unguarded all-min_max vs card defaults
method min_max card
m0 2 3 <- flips
m1 3 2 <- flips
m2 1 1
card normalization strategy per metric:
ari baseline_relative
runtime log_min_max
guard warnings raised under unguarded min-max:
- metric runtime: min_max used an empirical upper bound; normalized values are not comparable across different method sets
The outlier-runtime scenario shows how a single slow method flattens the runtime column under min-max. The plot puts the normalized runtime under min-max next to the same column under the card default log_min_max.
# Grouped bars: normalized runtime under forced min-max next to the same column
# under the registry defaults, for the heavy-tail scenario.
outlier = next(
    s for s in normalization_failure_scenarios() if s.kind == "minmax_heavy_tail"
)
runtime_col = list(outlier.metric_ids).index("runtime")
mm = unguarded_minmax(outlier).normalized[:, runtime_col]
lg = run_from_registry(outlier.scores, outlier.metric_ids).normalized[:, runtime_col]

x = np.arange(len(outlier.method_names))
width = 0.38
half_width = width / 2
fig, ax = plt.subplots(figsize=(5.5, 3.2))
ax.bar(x - half_width, mm, width, label="min_max", color="#c44e52")
ax.bar(x + half_width, lg, width, label="log_min_max", color="#3a7ca5")
ax.set_xticks(x)
ax.set_xticklabels(outlier.method_names)
ax.set_xlabel("method")
ax.set_ylabel("normalized runtime (higher = faster)")
ax.set_title("Runtime outlier crushes the min-max scale")

# Overlay the ground truth on a second y-axis: the actual runtime in seconds,
# on a log scale.
true_runtime = outlier.scores[:, runtime_col]
ax2 = ax.twinx()
ax2.plot(x, true_runtime, "k--o", label="true runtime (s)")
ax2.set_yscale("log")
ax2.set_ylabel("true runtime (s, log scale)")

# Merge the handles of both axes so a single legend covers bars and line.
bar_handles, bar_labels = ax.get_legend_handles_labels()
line_handles, line_labels = ax2.get_legend_handles_labels()
ax.legend(bar_handles + line_handles, bar_labels + line_labels, loc="center right")
fig.tight_layout()
plt.show()
The dashed line is the actual runtime in seconds. Under min-max the four good methods (m0 to m3) sit at almost the same height, so their real speed differences vanish and a tiny ARI difference decides the order. Under log_min_max the bars track the true ordering, so the genuinely fastest good method ranks first.
Are the methods separable across datasets?
The MCDA composite gives one ranking, but it does not say whether that ranking is statistically real across the datasets. The Demšar (2006) Friedman test and Nemenyi post-hoc test answer that. Here they run on the odd-dataset scenario, using the per-dataset ARI as the tool-by-dataset matrix. See the explanation page on comparing methods across datasets for the reasoning.
# Friedman / critical-difference report on the per-dataset ARI of the
# odd-dataset scenario.
odd = next(s for s in scenarios if s.kind == "odd_dataset")
# Metric index 0 is taken as ARI: tool by dataset, higher is better.
ari_by_dataset = odd.scores_per_dataset[:, :, 0]
cd = critical_difference(
    ari_by_dataset,
    higher_is_better=True,
    tool_names=odd.method_names,
)
print(f"Friedman statistic = {cd.friedman_statistic:.3f}, p = {cd.friedman_pvalue:.3f}")
print(f"critical difference (alpha={cd.alpha}) = {cd.critical_difference:.3f}")
print("average ranks (1 = best):")
for method_idx in cd.order:
    print(f" {odd.method_names[method_idx]:8s} {cd.average_ranks[method_idx]:.2f}")

# Translate each clique of method indices into a tuple of method names.
groups = []
for clique in cd.cliques:
    groups.append(tuple(odd.method_names[i] for i in clique))
print("cliques (not significantly different):", groups or "none")
Friedman statistic = 5.400, p = 0.145
critical difference (alpha=0.05) = 2.098
average ranks (1 = best):
m0 1.60
m1 2.20
m3 2.80
m2 3.40
cliques (not significantly different): [('m0', 'm1', 'm3', 'm2')]
# Horizontal bars of the average ranks, with a band one critical difference
# wide starting at the best average rank.
order = cd.order
names_sorted = []
ranks_sorted = []
for method_idx in order:
    names_sorted.append(odd.method_names[method_idx])
    ranks_sorted.append(cd.average_ranks[method_idx])
ypos = list(range(len(order)))
best = min(ranks_sorted)
band_end = best + cd.critical_difference

fig, ax = plt.subplots(figsize=(5.5, 3.0))
ax.barh(ypos, ranks_sorted, color="#3a7ca5")
ax.axvspan(best, band_end, color="#c44e52", alpha=0.15)
ax.axvline(band_end, color="#c44e52", linestyle="--")
ax.set_yticks(ypos)
ax.set_yticklabels(names_sorted)
# Flip the y-axis so the first entry of cd.order is drawn at the top.
ax.invert_yaxis()
ax.set_xlabel("average rank (1 = best)")
ax.set_ylabel("method (best at top)")
ax.set_title(
    f"Friedman p = {cd.friedman_pvalue:.2f}; methods within the shaded band are not separable"
)
fig.tight_layout()
plt.show()
With only five datasets the critical difference is wide. Every method falls inside the shaded band, and the Friedman test does not reject at alpha 0.05, so the data cannot separate the methods even though the MCDA composite puts m0 first. The recommendation holds under the stated weighting, but a claim that one method is better across datasets would need more datasets to support it.
Heterogeneity diagnostics on a synthetic interaction
The canonical scenarios above are single-dataset (a method by metric matrix), so they exercise the MCDA layer but not the heterogeneity layer, which needs a dataset axis. To show the heterogeneity diagnostics with a known ground truth, here is a small synthetic method by dataset by metric case built to contain a feature-driven ranking reversal: method A leads on the “easy” datasets and method C leads on the “hard” ones, with method B in the middle. The datasets carry a regime feature that marks which group they belong to.
rng = np.random.default_rng(0)
het_methods = ["method_A", "method_B", "method_C"]
het_datasets = [f"easy_{i}" for i in range(4)] + [f"hard_{i}" for i in range(4)]
regime = ["easy"] * 4 + ["hard"] * 4
het = np.empty((3, 8))
for j in range(8):
means = [0.9, 0.6, 0.4] if regime[j] == "easy" else [0.4, 0.6, 0.9]
het[:, j] = np.clip(np.array(means) + rng.normal(0, 0.05, 3), 0, 1)A mixed-effects model on this metric reports a small dataset intercept and a large residual. With one observation per cell the residual is the method-by-dataset interaction confounded with noise, and here it dominates because the methods reorder between the two regimes rather than the datasets differing in difficulty. The fits need the R toolchain, so the chunks run only when it is available.
from beam.heterogeneity import (
bradley_terry_tree,
bttree_available,
mixed_effects_from_matrix,
r_available,
)
if r_available():
me = mixed_effects_from_matrix(het, het_methods, het_datasets)
print(f"dataset shift (ICC): {me.icc_dataset:.2f}")
print(f"residual share: {me.residual_share:.2f} (interaction upper bound)")
else:
print("R with lme4 not available; skipping the mixed-effects fit.")R with lme4 not available; skipping the mixed-effects fit.
The Bradley-Terry tree finds the feature behind the interaction. Unlike the small real benchmarks, this synthetic case has enough datasets and a clean signal, so the parameter-stability test splits the datasets on regime and assigns a different leading method to each leaf, recovering exactly the reversal built into the construction.
if bttree_available():
bt = bradley_terry_tree(
het,
het_methods,
het_datasets,
categorical_features={"regime": regime},
polarity="higher_is_better",
minsize=3,
)
print(f"split found: {bt.did_split}")
print(bt.summary())
else:
print("R with psychotree not available; skipping the Bradley-Terry tree.")R with psychotree not available; skipping the Bradley-Terry tree.
This is the positive control for the diagnostics. The Duo, M4 and transportation vignettes show the small-sample case where no split is found; the OpenProblems spatial vignette shows a split on real data. Here the split is recovered from a known construction.
A cross-domain example: transportation
The MCDA core is not specific to bioinformatics. A worked cross-domain example scores transport modes (the methods) across terrains (the datasets) on speed, cost, and CO2, with infeasible mode-terrain pairs left as NaN. It shows the method-by-dataset interaction (no mode is fastest on every terrain) and the partial-coverage problem (no mode runs on every terrain) more sharply than the bio scenarios do. It lives in its own vignette so the content is not duplicated here: see examples/transportation/transportation.qmd.
What is missing
This vignette uses two metrics (ARI and runtime) and short tool lists for legibility. The next milestones widen it.
- Add cards for Shannon entropy difference and cluster-count deviation; rerun the canonical scenarios with four metrics so the SMAA simplex has higher dimension.
- Done: the Bradley-Terry tree and the mixed-effects model are demonstrated above on a synthetic multi-dataset interaction with a known regime feature; extend the canonical scenarios with a dataset axis so the odd-dataset scenario itself carries the split.
- Add a noise-floor field on the metric card and report whether the weight-perturbation flip would survive a noise-floor-sized change.