From 7f3dce34de5d9d26eab3ccb4402a6cd5bd0bd336 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Sat, 23 May 2026 10:34:15 +0200 Subject: [PATCH 1/2] Add lifted nhood construction functionality --- MIGRATION_GUIDE.md | 35 +++ .../lifted_from_node_labels.hxx | 144 +++++++++++ src/bindings/graph.cxx | 97 +++++++ .../graph/lifted_multicut/__init__.py | 108 ++++++++ .../test_lifted_edges_from_node_labels.py | 240 ++++++++++++++++++ 5 files changed, 624 insertions(+) create mode 100644 include/bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx create mode 100644 tests/graph/lifted_multicut/test_lifted_edges_from_node_labels.py diff --git a/MIGRATION_GUIDE.md b/MIGRATION_GUIDE.md index c88ca9d..70c0e43 100644 --- a/MIGRATION_GUIDE.md +++ b/MIGRATION_GUIDE.md @@ -1146,6 +1146,41 @@ lifted_features = bic.graph.features.lifted_affinity_features_complex(...) The output column conventions match the local-edge variants (`SIMPLE_EDGE_FEATURE_NAMES`, `COMPLEX_EDGE_FEATURE_NAMES`). +#### Building lifted edges from per-node labels + +When the lifted edges come from semantic / class labels per RAG node rather +than from long-range affinities, nifty offers +`nifty.distributed.liftedNeighborhoodFromNodeLabels`. The bioimage-cpp +equivalent lives under `bic.graph.lifted_multicut`: + +```python +# nifty +lifted_uvs = nifty.distributed.liftedNeighborhoodFromNodeLabels( + graph, node_labels, graphDepth=2, numberOfThreads=4, + mode='all', ignoreLabel=0, +) + +# bioimage-cpp +lifted_uvs = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, node_labels, graph_depth=2, + mode='all', ignore_label=0, number_of_threads=4, +) +``` + +Both functions return an `(n_lifted, 2)` `uint64` array of `(u, v)` pairs +with `u < v`, sorted lexicographically. The BFS hop distance is restricted +to `[2, graph_depth]`, so base-graph edges are excluded. `mode='same'` / +`'different'` filter by whether `node_labels[u] == node_labels[v]`; +`ignore_label` drops every pair where either endpoint label matches. + +Intentional differences vs. nifty: + +- snake_case parameter names (`graph_depth`, `ignore_label`, + `number_of_threads`); +- `ignore_label` defaults to `None` (no filtering) instead of `0`; +- node `0` is iterated as a source (nifty's distributed variant has an + off-by-one that silently skips it). + End-to-end pipeline (also in `examples/segmentation/lifted_multicut_from_affinities.py`): ```python diff --git a/include/bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx b/include/bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx new file mode 100644 index 0000000..438a57c --- /dev/null +++ b/include/bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx @@ -0,0 +1,144 @@ +#pragma once + +#include "bioimage_cpp/array_view.hxx" +#include "bioimage_cpp/detail/edge_hash.hxx" +#include "bioimage_cpp/detail/threading.hxx" +#include "bioimage_cpp/graph/breadth_first_search.hxx" +#include "bioimage_cpp/graph/undirected_graph.hxx" + +#include +#include +#include +#include +#include +#include +#include + +namespace bioimage_cpp::graph::lifted_multicut { + +enum class LiftedNodeLabelMode { all, same, different }; + +// Discover lifted edges from per-node labels by BFS-neighborhood expansion. +// +// For every source node `u` the BFS reports each reachable node `v` together +// with the hop distance. A pair `(u, v)` with `u < v` becomes a lifted edge +// iff: +// - distance is in [2, graph_depth] (distance 1 corresponds to base edges +// and is excluded); +// - neither labels[u] nor labels[v] equals `ignore_label` (when set); +// - the `mode` predicate matches: `all` keeps every pair, `same` keeps +// pairs with labels[u] == labels[v], `different` keeps the complement. +// +// Returns the deduplicated set sorted lexicographically with `u < v`. +template +std::vector lifted_edges_from_node_labels( + const UndirectedGraph &graph, + const ConstArrayView &node_labels, + const std::uint64_t graph_depth, + const LiftedNodeLabelMode mode, + const std::optional ignore_label, + const std::size_t number_of_threads +) { + if (node_labels.ndim() != 1) { + throw std::invalid_argument( + "node_labels must be a 1D array" + ); + } + if (static_cast(node_labels.shape[0]) != graph.number_of_nodes()) { + throw std::invalid_argument( + "node_labels length must match graph number_of_nodes" + ); + } + if (graph_depth < 1) { + throw std::invalid_argument( + "graph_depth must be >= 1" + ); + } + + const auto n_nodes = static_cast(graph.number_of_nodes()); + if (n_nodes == 0) { + return {}; + } + + const auto n_threads = bioimage_cpp::detail::normalize_thread_count( + number_of_threads, n_nodes + ); + + const auto *labels = node_labels.data; + + const auto label_pair_passes = + [&](const LabelT label_u, const LabelT label_v) -> bool { + if (ignore_label.has_value()) { + if (label_u == *ignore_label || label_v == *ignore_label) { + return false; + } + } + switch (mode) { + case LiftedNodeLabelMode::all: + return true; + case LiftedNodeLabelMode::same: + return label_u == label_v; + case LiftedNodeLabelMode::different: + return label_u != label_v; + } + return false; + }; + + using EdgeSet = std::unordered_set< + bioimage_cpp::detail::Edge, bioimage_cpp::detail::EdgeHash + >; + std::vector per_thread(n_threads); + + bioimage_cpp::detail::parallel_for_chunks( + n_threads, + n_nodes, + [&](const std::size_t thread_id, const std::size_t begin, const std::size_t end) { + auto &out = per_thread[thread_id]; + BfsWorkspace workspace; + for (std::size_t source = begin; source < end; ++source) { + const auto label_u = labels[source]; + if (ignore_label.has_value() && label_u == *ignore_label) { + continue; + } + const auto entries = breadth_first_search( + graph, + static_cast(source), + graph_depth, + /*include_source=*/false, + workspace + ); + for (const auto &entry : entries) { + if (entry.distance < 2) { + continue; + } + if (entry.node <= source) { + continue; + } + const auto label_v = labels[static_cast(entry.node)]; + if (!label_pair_passes(label_u, label_v)) { + continue; + } + out.insert(bioimage_cpp::detail::edge_key( + static_cast(source), entry.node + )); + } + } + } + ); + + EdgeSet merged; + std::size_t total = 0; + for (const auto &set : per_thread) { + total += set.size(); + } + merged.reserve(total); + for (auto &set : per_thread) { + merged.insert(set.begin(), set.end()); + } + + std::vector result(merged.begin(), merged.end()); + std::sort(result.begin(), result.end()); + return result; +} + +} // namespace bioimage_cpp::graph::lifted_multicut diff --git a/src/bindings/graph.cxx b/src/bindings/graph.cxx index be65bdf..541b088 100644 --- a/src/bindings/graph.cxx +++ b/src/bindings/graph.cxx @@ -10,6 +10,7 @@ #include "bioimage_cpp/graph/lifted_from_affinities.hxx" #include "bioimage_cpp/graph/lifted_multicut.hxx" #include "bioimage_cpp/graph/lifted_multicut/fusion_move.hxx" +#include "bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx" #include "bioimage_cpp/graph/multicut.hxx" #include "bioimage_cpp/graph/mutex_watershed.hxx" #include "bioimage_cpp/graph/multicut/fusion_move.hxx" @@ -24,7 +25,9 @@ #include "bioimage_cpp/graph/undirected_graph.hxx" #include +#include #include +#include #include #include @@ -32,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -1146,6 +1150,58 @@ UInt64Array lifted_edges_from_affinities_t( return result; } +template +UInt64Array lifted_edges_from_node_labels_t( + const Graph &graph, + LabelArray node_labels, + const std::uint64_t graph_depth, + const std::string &mode, + std::optional ignore_label, + const std::size_t number_of_threads +) { + if (node_labels.ndim() != 1) { + throw std::invalid_argument("node_labels must be a 1D array"); + } + if (node_labels.shape(0) != graph.number_of_nodes()) { + throw std::invalid_argument( + "node_labels length must match graph number_of_nodes" + ); + } + graph::lifted_multicut::LiftedNodeLabelMode mode_enum; + if (mode == "all") { + mode_enum = graph::lifted_multicut::LiftedNodeLabelMode::all; + } else if (mode == "same") { + mode_enum = graph::lifted_multicut::LiftedNodeLabelMode::same; + } else if (mode == "different") { + mode_enum = graph::lifted_multicut::LiftedNodeLabelMode::different; + } else { + throw std::invalid_argument( + "mode must be one of 'all', 'same', 'different', got '" + mode + "'" + ); + } + + ConstArrayView labels_view{ + node_labels.data(), + {static_cast(node_labels.shape(0))}, + {}, + }; + + std::vector lifted_edges; + { + nb::gil_scoped_release release; + lifted_edges = graph::lifted_multicut::lifted_edges_from_node_labels( + graph, labels_view, graph_depth, mode_enum, ignore_label, number_of_threads + ); + } + auto result = make_uint64_array({lifted_edges.size(), 2}); + auto *data = result.data(); + for (std::size_t index = 0; index < lifted_edges.size(); ++index) { + data[2 * index] = lifted_edges[index].first; + data[2 * index + 1] = lifted_edges[index].second; + } + return result; +} + template DoubleArray accumulate_lifted_affinity_features_t( LabelArray labels, @@ -1816,6 +1872,47 @@ void bind_graph(nb::module_ &m) { nb::arg("number_of_threads") ); + m.def( + "_lifted_edges_from_node_labels_uint32", + &lifted_edges_from_node_labels_t, + nb::arg("graph"), + nb::arg("node_labels"), + nb::arg("graph_depth"), + nb::arg("mode"), + nb::arg("ignore_label"), + nb::arg("number_of_threads") + ); + m.def( + "_lifted_edges_from_node_labels_uint64", + &lifted_edges_from_node_labels_t, + nb::arg("graph"), + nb::arg("node_labels"), + nb::arg("graph_depth"), + nb::arg("mode"), + nb::arg("ignore_label"), + nb::arg("number_of_threads") + ); + m.def( + "_lifted_edges_from_node_labels_int32", + &lifted_edges_from_node_labels_t, + nb::arg("graph"), + nb::arg("node_labels"), + nb::arg("graph_depth"), + nb::arg("mode"), + nb::arg("ignore_label"), + nb::arg("number_of_threads") + ); + m.def( + "_lifted_edges_from_node_labels_int64", + &lifted_edges_from_node_labels_t, + nb::arg("graph"), + nb::arg("node_labels"), + nb::arg("graph_depth"), + nb::arg("mode"), + nb::arg("ignore_label"), + nb::arg("number_of_threads") + ); + m.def( "_accumulate_lifted_affinity_features_uint32", &accumulate_lifted_affinity_features_t, diff --git a/src/bioimage_cpp/graph/lifted_multicut/__init__.py b/src/bioimage_cpp/graph/lifted_multicut/__init__.py index 0b52145..e7d30df 100644 --- a/src/bioimage_cpp/graph/lifted_multicut/__init__.py +++ b/src/bioimage_cpp/graph/lifted_multicut/__init__.py @@ -15,6 +15,9 @@ :class:`GreedyAdditiveProposalGenerator` re-exported from :mod:`bioimage_cpp.graph.multicut` (the lifted fusion-move solver consumes them). +- :func:`lifted_edges_from_node_labels` — discover lifted edges by combining + a BFS neighborhood on the base graph with a per-node label predicate + (port of ``nifty.distributed.liftedNeighborhoodFromNodeLabels``). """ from __future__ import annotations @@ -33,7 +36,111 @@ _as_edge_costs, _as_node_labels, _as_uv_array, + _normalize_number_of_threads, ) + + +_LIFTED_EDGES_FROM_NODE_LABELS_BY_DTYPE = { + np.dtype("uint32"): _core._lifted_edges_from_node_labels_uint32, + np.dtype("uint64"): _core._lifted_edges_from_node_labels_uint64, + np.dtype("int32"): _core._lifted_edges_from_node_labels_int32, + np.dtype("int64"): _core._lifted_edges_from_node_labels_int64, +} + + +def lifted_edges_from_node_labels( + graph, + node_labels, + graph_depth: int, + *, + mode: str = "all", + ignore_label: int | None = None, + number_of_threads: int = 0, +) -> np.ndarray: + """Discover lifted edges from a BFS neighborhood plus per-node labels. + + For every source node ``u`` the BFS reports each node ``v`` reached within + ``graph_depth`` hops. The pair ``(u, v)`` (with ``u < v``) becomes a lifted + edge iff: + + - the BFS hop distance is in ``[2, graph_depth]`` — base-graph edges + (distance 1) are always excluded; + - neither ``node_labels[u]`` nor ``node_labels[v]`` equals ``ignore_label`` + (when ``ignore_label`` is not ``None``); + - the ``mode`` predicate is satisfied: ``'all'`` keeps every pair, + ``'same'`` keeps pairs with matching labels, ``'different'`` keeps the + complement. + + Mirrors ``nifty.distributed.liftedNeighborhoodFromNodeLabels`` with the + following intentional differences: snake-case parameter names, + ``ignore_label`` defaults to ``None`` (no filtering), and node ``0`` is + iterated as a source (nifty's distributed variant skips it). + + Parameters + ---------- + graph: + :class:`bioimage_cpp.graph.UndirectedGraph` or + :class:`bioimage_cpp.graph.RegionAdjacencyGraph`. + node_labels: + 1D array of length ``graph.number_of_nodes``. Supported dtypes: + ``uint32``, ``uint64``, ``int32``, ``int64``. + graph_depth: + Maximum BFS hop distance (inclusive). Must be ``>= 1``; + ``graph_depth == 1`` returns an empty array because base edges are + excluded by construction. + mode: + ``'all'``, ``'same'``, or ``'different'``. + ignore_label: + If set, drop every pair where either endpoint label equals this value. + number_of_threads: + ``0`` (default) selects the bioimage-cpp default thread count. + + Returns + ------- + np.ndarray + ``(n_lifted, 2)`` ``uint64`` array of ``(u, v)`` pairs with + ``u < v``, sorted lexicographically. + """ + if mode not in ("all", "same", "different"): + raise ValueError( + f"mode must be one of 'all', 'same', 'different', got {mode!r}" + ) + depth = int(graph_depth) + if depth < 1: + raise ValueError(f"graph_depth must be >= 1, got {depth}") + + label_array = np.ascontiguousarray(np.asarray(node_labels)) + if label_array.ndim != 1: + raise ValueError( + f"node_labels must be a 1D array, got ndim={label_array.ndim}" + ) + if label_array.shape[0] != int(graph.number_of_nodes): + raise ValueError( + "node_labels length must match graph number_of_nodes, got " + f"node_labels length={label_array.shape[0]}, " + f"number_of_nodes={int(graph.number_of_nodes)}" + ) + + try: + run = _LIFTED_EDGES_FROM_NODE_LABELS_BY_DTYPE[label_array.dtype] + except KeyError as error: + supported = ", ".join( + str(dtype) for dtype in _LIFTED_EDGES_FROM_NODE_LABELS_BY_DTYPE + ) + raise TypeError( + f"node_labels must have one of dtypes ({supported}), got " + f"dtype={label_array.dtype}" + ) from error + + ignore_arg = None if ignore_label is None else int(ignore_label) + return run( + graph, + label_array, + depth, + mode, + ignore_arg, + _normalize_number_of_threads(number_of_threads), + ) from ..multicut import ( GreedyAdditiveProposalGenerator, ProposalGenerator, @@ -555,6 +662,7 @@ def optimize(self, objective: LiftedMulticutObjective) -> np.ndarray: "LiftedMulticutSolver", "ProposalGenerator", "WatershedProposalGenerator", + "lifted_edges_from_node_labels", "lifted_multicut_problem_path", "load_lifted_multicut_problem", ] diff --git a/tests/graph/lifted_multicut/test_lifted_edges_from_node_labels.py b/tests/graph/lifted_multicut/test_lifted_edges_from_node_labels.py new file mode 100644 index 0000000..957da34 --- /dev/null +++ b/tests/graph/lifted_multicut/test_lifted_edges_from_node_labels.py @@ -0,0 +1,240 @@ +import numpy as np +import pytest + +import bioimage_cpp as bic + + +def _make_chain(n: int): + edges = np.array([[i, i + 1] for i in range(n - 1)], dtype=np.uint64) + return bic.graph.UndirectedGraph.from_edges(n, edges) + + +def _as_pair_set(uvs): + return set(map(tuple, uvs.tolist())) + + +def test_chain_depth_1_returns_empty(): + graph = _make_chain(6) + labels = np.array([0, 1, 2, 3, 4, 5], dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=1, mode="all" + ) + assert out.shape == (0, 2) + assert out.dtype == np.uint64 + + +def test_chain_depth_2_pairs_at_distance_two(): + graph = _make_chain(6) + labels = np.arange(6, dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + assert _as_pair_set(out) == {(0, 2), (1, 3), (2, 4), (3, 5)} + # Sorted lexicographically. + assert out.tolist() == sorted(out.tolist()) + + +def test_chain_depth_3_includes_distance_three(): + graph = _make_chain(6) + labels = np.arange(6, dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=3, mode="all" + ) + assert _as_pair_set(out) == { + (0, 2), (1, 3), (2, 4), (3, 5), # distance 2 + (0, 3), (1, 4), (2, 5), # distance 3 + } + + +def test_mode_same_and_different(): + graph = _make_chain(6) + # Two label-blocks: nodes 0..2 share label 1, nodes 3..5 share label 2. + # At depth=2 the only pairs are at distance 2: + # (0,2): (1,1) same; (1,3): (1,2) different; + # (2,4): (1,2) different; (3,5): (2,2) same. + labels = np.array([1, 1, 1, 2, 2, 2], dtype=np.uint64) + same = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="same" + ) + diff = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="different" + ) + all_pairs = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + + assert _as_pair_set(same) == {(0, 2), (3, 5)} + assert _as_pair_set(diff) == {(1, 3), (2, 4)} + # 'same' + 'different' must partition 'all'. + assert _as_pair_set(same).isdisjoint(_as_pair_set(diff)) + assert _as_pair_set(same) | _as_pair_set(diff) == _as_pair_set(all_pairs) + + +def test_ignore_label_drops_pairs_with_that_label(): + graph = _make_chain(6) + labels = np.array([1, 1, 0, 2, 3, 3], dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all", ignore_label=0 + ) + # Node 2 has the ignore label, so every pair containing it is dropped: + # (0,2), (2,4) are gone; (1,3) and (3,5) remain. + assert _as_pair_set(out) == {(1, 3), (3, 5)} + + +def test_star_graph_emits_all_leaf_leaf_pairs(): + # Node 0 is the center; nodes 1..4 are leaves connected only to 0. + edges = np.array([[0, 1], [0, 2], [0, 3], [0, 4]], dtype=np.uint64) + graph = bic.graph.UndirectedGraph.from_edges(5, edges) + labels = np.arange(5, dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + # Every pair of leaves is at distance 2 via the center. No base edges. + assert _as_pair_set(out) == {(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)} + + +def test_node_zero_is_iterated_as_source(): + # Regression guard: nifty.distributed.liftedNeighborhoodFromNodeLabels + # silently skips node 0 as a source (off-by-one). bic should not. + graph = _make_chain(4) + labels = np.arange(4, dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + pairs = _as_pair_set(out) + assert (0, 2) in pairs + + +def test_disconnected_components(): + edges = np.array([[0, 1], [2, 3]], dtype=np.uint64) + graph = bic.graph.UndirectedGraph.from_edges(4, edges) + labels = np.arange(4, dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=5, mode="all" + ) + # Nothing is at distance >= 2 in either two-node component. + assert out.shape == (0, 2) + + +def test_rag_input_accepted(): + # Build a tiny 2D labeled image and use its RAG directly. + labels_img = np.array( + [ + [0, 0, 1, 1, 2, 2], + [0, 0, 1, 1, 2, 2], + [3, 3, 4, 4, 5, 5], + [3, 3, 4, 4, 5, 5], + ], + dtype=np.uint32, + ) + rag = bic.graph.region_adjacency_graph(labels_img) + node_labels = np.array([10, 10, 20, 10, 10, 20], dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + rag, node_labels, graph_depth=2, mode="all" + ) + assert out.dtype == np.uint64 + assert out.ndim == 2 and out.shape[1] == 2 + # Sanity: every pair is a valid (u < v) and not a base edge. + for u, v in out.tolist(): + assert u < v + assert rag.find_edge(int(u), int(v)) == -1 + + +@pytest.mark.parametrize( + "dtype", [np.uint32, np.uint64, np.int32, np.int64] +) +def test_dtype_variants_match(dtype): + graph = _make_chain(6) + labels = np.array([1, 1, 2, 2, 3, 3], dtype=dtype) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all", ignore_label=0 + ) + # No node has the ignore label; result must match the no-ignore call. + out_noignore = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + assert _as_pair_set(out) == _as_pair_set(out_noignore) + assert out.dtype == np.uint64 + + +def test_round_trip_into_lifted_multicut_objective(): + # The output should plug straight into LiftedMulticutObjective. + graph = _make_chain(6) + base_costs = np.ones(5, dtype=np.float64) + labels = np.array([1, 1, 2, 2, 3, 3], dtype=np.uint64) + lifted_uvs = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="different" + ) + lifted_costs = -np.ones(lifted_uvs.shape[0], dtype=np.float64) + objective = bic.graph.lifted_multicut.LiftedMulticutObjective( + graph, base_costs, + lifted_uvs=lifted_uvs, lifted_costs=lifted_costs, + ) + assert objective.number_of_lifted_edges == lifted_uvs.shape[0] + + +def test_error_on_unknown_mode(): + graph = _make_chain(3) + labels = np.zeros(3, dtype=np.uint64) + with pytest.raises(ValueError, match="mode"): + bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="not-a-mode" + ) + + +def test_error_on_zero_graph_depth(): + graph = _make_chain(3) + labels = np.zeros(3, dtype=np.uint64) + with pytest.raises(ValueError, match="graph_depth"): + bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=0, mode="all" + ) + + +def test_error_on_wrong_ndim(): + graph = _make_chain(3) + labels = np.zeros((3, 1), dtype=np.uint64) + with pytest.raises(ValueError, match="1D"): + bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + + +def test_error_on_length_mismatch(): + graph = _make_chain(3) + labels = np.zeros(5, dtype=np.uint64) + with pytest.raises(ValueError, match="number_of_nodes"): + bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + + +def test_error_on_unsupported_dtype(): + graph = _make_chain(3) + labels = np.zeros(3, dtype=np.float32) + with pytest.raises(TypeError, match="dtype"): + bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + + +def test_threading_produces_same_result(): + graph = _make_chain(10) + labels = np.arange(10, dtype=np.uint64) + single = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=3, mode="all", number_of_threads=1 + ) + multi = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=3, mode="all", number_of_threads=4 + ) + assert _as_pair_set(single) == _as_pair_set(multi) + assert single.tolist() == multi.tolist() # sorted output is deterministic + + +def test_empty_graph(): + graph = bic.graph.UndirectedGraph(0) + labels = np.zeros(0, dtype=np.uint64) + out = bic.graph.lifted_multicut.lifted_edges_from_node_labels( + graph, labels, graph_depth=2, mode="all" + ) + assert out.shape == (0, 2) From 43c2a35718f828a5ba9909636ebde83a48e1c04f Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Wed, 27 May 2026 14:34:32 -0700 Subject: [PATCH 2/2] Add local maxima benchmarking script --- development/local_maxima/benchmark.py | 513 ++++++++++++++++++++++++++ 1 file changed, 513 insertions(+) create mode 100644 development/local_maxima/benchmark.py diff --git a/development/local_maxima/benchmark.py b/development/local_maxima/benchmark.py new file mode 100644 index 0000000..a13f796 --- /dev/null +++ b/development/local_maxima/benchmark.py @@ -0,0 +1,513 @@ +"""Benchmark local-maxima detection: vigra vs. skimage on smoothed 2D/3D data. + +This script exists to answer a single question: does bioimage-cpp need to +reimplement local-maxima / peak detection at all? If +``skimage.feature.peak_local_max`` performs comparably to +``vigra.analysis.localMaxima`` there is no reason to reimplement it — users can +just call skimage, and bioimage-cpp focuses on algorithms that are genuinely +hard to install. + +The two APIs differ in shape but agree on the underlying detection: + +* ``vigra.analysis.localMaxima`` (2D) / ``localMaxima3D`` (3D) return a *marked + image* of the input shape, with maxima set to ``marker``. Plain + neighborhood comparison (4/8 connectivity in 2D, 6/26 in 3D); no native + minimum-distance suppression. +* ``skimage.feature.peak_local_max`` returns a *coordinate list* and additionally + supports ``min_distance`` suppression (peaks separated by at least + ``min_distance``). + +With matched settings the two detect identical maxima at ``min_distance=1``: + +* vigra 2D: ``localMaxima(img, neighborhood=8, allowAtBorder=True, allowPlateaus=True)`` +* vigra 3D: ``localMaxima3D(vol, neighborhood=26, allowAtBorder=True, allowPlateaus=True)`` +* skimage : ``peak_local_max(img, min_distance=1, exclude_border=False, threshold_abs=None)`` + +Both libraries are fed the same Gaussian-smoothed ``float32`` image, allocated +once outside the timing loop. ``min_distance == 1`` is the apples-to-apples +case (both libraries timed; identical maxima). ``min_distance > 1`` is the +realistic seed-detection case (skimage only, since vigra has no native +distance suppression). + +Run:: + + python development/local_maxima/benchmark.py --small --repeats 3 + python development/local_maxima/benchmark.py --repeats 5 --csv local_maxima.csv + python development/local_maxima/benchmark.py --min-distance 3,5,10 + +Findings (sigma=2.0, repeats=5; vigra 1.11, skimage 0.25.2; single run, your +numbers will vary with hardware): + + dim shape min_dist n_maxima vigra ms skimage ms skimage/vigra + 2D (512, 512) 1 1228 12.0 4.8 0.40 + 2D (512, 512) 5 572 n/a 12.0 n/a + 2D (512, 512) 10 208 n/a 9.9 n/a + 3D (32, 256, 256) 1 857 284.6 55.0 0.19 + 3D (32, 256, 256) 5 271 n/a 69.1 n/a + 3D (32, 256, 256) 10 79 n/a 72.0 n/a + +* At ``min_distance=1`` (apples-to-apples) the two libraries detect *identical* + maxima (the agreement check reports 0 mismatches in both 2D and 3D), and + ``skimage.peak_local_max`` is ~2.5x faster in 2D and ~5x faster in 3D than + ``vigra.localMaxima`` (geomean skimage/vigra ~= 0.28). +* With realistic ``min_distance > 1`` suppression — which vigra has no native + equivalent for — skimage stays at or below vigra's ``min_distance=1`` time. + +Conclusion: ``peak_local_max`` is not just comparable to vigra but faster, and +additionally supports ``min_distance`` separation that vigra lacks. There is no +performance case for reimplementing local-maxima detection in bioimage-cpp; +users should call ``skimage.feature.peak_local_max`` directly. +""" + +from __future__ import annotations + +import argparse +import csv +import importlib.util +import math +import os +import sys +import tempfile +from collections.abc import Callable, Sequence +from dataclasses import dataclass +from statistics import geometric_mean, median +from time import perf_counter + +import numpy as np + + +LIBRARIES = ("vigra", "skimage") + + +@dataclass(frozen=True) +class DataSpec: + label: str + shape: tuple[int, ...] + + +@dataclass +class Adapter: + fn: Callable[[np.ndarray], np.ndarray] + image: np.ndarray + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Benchmark vigra.analysis.localMaxima vs skimage.feature.peak_local_max." + ) + parser.add_argument("--repeats", type=int, default=5) + parser.add_argument("--small", action="store_true", help="Fast smoke benchmark.") + parser.add_argument("--large", action="store_true", help="Larger benchmark sizes.") + parser.add_argument("--no-2d", action="store_true") + parser.add_argument("--no-3d", action="store_true") + parser.add_argument( + "--sigma", + type=float, + default=2.0, + help=( + "Gaussian smoothing sigma applied to the source image/volume. " + "Smoothing yields realistic, well-separated peaks (0 = no smoothing)." + ), + ) + parser.add_argument( + "--min-distance", + default="1,5", + help=( + "Comma-separated skimage min_distance values to sweep. vigra is only " + "comparable (and timed) at min_distance=1; larger values are skimage-only." + ), + ) + parser.add_argument( + "--csv", + default=None, + help="Optional path to write per-(dim, min_distance, library) timings.", + ) + parser.add_argument( + "--skip-checks", + action="store_true", + help="Skip the one-shot vigra/skimage agreement check at min_distance=1.", + ) + return parser.parse_args() + + +def _parse_min_distances(text: str) -> list[int]: + values = [] + for part in text.split(","): + part = part.strip() + if not part: + continue + value = int(part) + if value < 1: + raise ValueError(f"min_distance values must be >= 1, got {value}") + values.append(value) + if not values: + raise ValueError("--min-distance must contain at least one value") + # Preserve order, drop duplicates. + seen = set() + unique = [] + for value in values: + if value not in seen: + seen.add(value) + unique.append(value) + return unique + + +def _import_available(name: str) -> bool: + return importlib.util.find_spec(name) is not None + + +def _quiet_vigra_matplotlib_cache() -> None: + # VIGRA imports matplotlib in some environments. Point matplotlib's cache + # at /tmp to avoid noisy warnings when the user's config dir is read-only. + path = os.path.join(tempfile.gettempdir(), "bioimage_cpp_matplotlib_cache") + os.makedirs(path, exist_ok=True) + os.environ.setdefault("MPLCONFIGDIR", path) + + +# --------------------------------------------------------------------------- +# skimage-derived benchmark data +# --------------------------------------------------------------------------- + +def _smooth(image: np.ndarray, sigma: float) -> np.ndarray: + if sigma <= 0.0: + return np.ascontiguousarray(image, dtype=np.float32) + from scipy import ndimage + + smoothed = ndimage.gaussian_filter(image.astype(np.float32), sigma) + return np.ascontiguousarray(smoothed, dtype=np.float32) + + +def load_2d(spec: DataSpec, sigma: float) -> np.ndarray: + from skimage import data + + image = data.camera().astype(np.float32) + h, w = spec.shape + if h > image.shape[0] or w > image.shape[1]: + reps_h = math.ceil(h / image.shape[0]) + reps_w = math.ceil(w / image.shape[1]) + image = np.tile(image, (reps_h, reps_w)) + image = image[:h, :w] + return _smooth(image, sigma) + + +def load_3d(spec: DataSpec, sigma: float) -> np.ndarray: + from skimage import data + + volume = data.cells3d()[:, 1].astype(np.float32) + z, h, w = spec.shape + reps_z = math.ceil(z / volume.shape[0]) if z > volume.shape[0] else 1 + reps_h = math.ceil(h / volume.shape[1]) if h > volume.shape[1] else 1 + reps_w = math.ceil(w / volume.shape[2]) if w > volume.shape[2] else 1 + if reps_z * reps_h * reps_w > 1: + volume = np.tile(volume, (reps_z, reps_h, reps_w)) + volume = volume[:z, :h, :w] + return _smooth(volume, sigma) + + +def build_specs(args: argparse.Namespace) -> list[DataSpec]: + if args.small and args.large: + raise ValueError("--small and --large are mutually exclusive") + + if args.small: + shape_2d = (256, 256) + shape_3d = (16, 64, 64) + elif args.large: + shape_2d = (2048, 2048) + shape_3d = (64, 512, 512) + else: + shape_2d = (512, 512) + shape_3d = (32, 256, 256) + + specs = [] + if not args.no_2d: + specs.append(DataSpec("2D", shape_2d)) + if not args.no_3d: + specs.append(DataSpec("3D", shape_3d)) + return specs + + +# --------------------------------------------------------------------------- +# Library adapters +# --------------------------------------------------------------------------- + +def _skimage_peaks(min_distance: int): + from skimage.feature import peak_local_max + + def fn(image: np.ndarray) -> np.ndarray: + return peak_local_max( + image, + min_distance=min_distance, + exclude_border=False, + threshold_abs=None, + ) + + return fn + + +def _vigra_peaks(): + _quiet_vigra_matplotlib_cache() + import vigra.analysis as va + + def fn(image: np.ndarray) -> np.ndarray: + if image.ndim == 2: + marked = va.localMaxima( + image, neighborhood=8, allowAtBorder=True, allowPlateaus=True + ) + elif image.ndim == 3: + marked = va.localMaxima3D( + image, neighborhood=26, allowAtBorder=True, allowPlateaus=True + ) + else: + raise ValueError(f"vigra local maxima supports 2D/3D, got ndim={image.ndim}") + return np.asarray(marked) + + return fn + + +def _vigra_marked_to_coords(marked: np.ndarray) -> np.ndarray: + return np.argwhere(marked > 0) + + +def build_adapters(min_distance: int, image: np.ndarray) -> dict[str, Adapter]: + adapters: dict[str, Adapter] = {} + # vigra has no native min_distance suppression, so it is only timed in the + # apples-to-apples case (min_distance == 1). + if min_distance == 1 and _import_available("vigra"): + adapters["vigra"] = Adapter(fn=_vigra_peaks(), image=image) + if _import_available("skimage"): + adapters["skimage"] = Adapter(fn=_skimage_peaks(min_distance), image=image) + return adapters + + +# --------------------------------------------------------------------------- +# Timing and checks +# --------------------------------------------------------------------------- + +def _count_maxima(library: str, result: np.ndarray) -> int: + # vigra returns a marked image; skimage returns a coordinate list. + if library == "vigra": + return int(np.count_nonzero(result > 0)) + return int(result.shape[0]) + + +def time_interleaved( + adapters: dict[str, Adapter], + repeats: int, +) -> dict[str, dict]: + libs = list(adapters) + for adapter in adapters.values(): + adapter.fn(adapter.image) + + timings = {lib: [] for lib in libs} + last_result = {} + for repeat in range(repeats): + rotation = repeat % len(libs) + order = libs[rotation:] + libs[:rotation] + for lib in order: + adapter = adapters[lib] + t0 = perf_counter() + result = adapter.fn(adapter.image) + timings[lib].append(perf_counter() - t0) + last_result[lib] = np.asarray(result) + + return { + lib: { + "timings": timings[lib], + "median": median(timings[lib]), + "min": min(timings[lib]), + "n_maxima": _count_maxima(lib, last_result[lib]), + "result": last_result[lib], + } + for lib in libs + } + + +def check_agreement(adapters: dict[str, Adapter]) -> dict | None: + """Confirm vigra and skimage detect the same maxima (min_distance == 1).""" + if "vigra" not in adapters or "skimage" not in adapters: + return None + + marked = np.asarray(adapters["vigra"].fn(adapters["vigra"].image)) + vigra_coords = _vigra_marked_to_coords(marked) + skimage_coords = np.asarray(adapters["skimage"].fn(adapters["skimage"].image)) + + vigra_set = {tuple(int(c) for c in row) for row in vigra_coords} + skimage_set = {tuple(int(c) for c in row) for row in skimage_coords} + return { + "n_vigra": len(vigra_set), + "n_skimage": len(skimage_set), + "mismatched": len(vigra_set ^ skimage_set), + } + + +def format_results_table(rows: list[dict]) -> str: + headers = [ + "dim", "shape", "sigma", "min_dist", "n_maxima", + "vigra ms", "skimage ms", "skimage/vigra", + ] + + table_rows = [] + for row in rows: + vigra = row["results"].get("vigra") + skimage = row["results"].get("skimage") + n_maxima = (skimage or vigra)["n_maxima"] if (skimage or vigra) else 0 + vigra_ms = f"{vigra['median'] * 1e3:.3f}" if vigra else "n/a" + skimage_ms = f"{skimage['median'] * 1e3:.3f}" if skimage else "n/a" + if vigra and skimage and vigra["median"] > 0: + ratio = f"{skimage['median'] / vigra['median']:.2f}" + else: + ratio = "n/a" + table_rows.append([ + row["dim"], + row["shape"], + f"{row['sigma']:.1f}", + str(row["min_distance"]), + str(n_maxima), + vigra_ms, + skimage_ms, + ratio, + ]) + + widths = [len(header) for header in headers] + for row in table_rows: + for idx, value in enumerate(row): + widths[idx] = max(widths[idx], len(value)) + + def render(values: Sequence[str]) -> str: + return " ".join(value.ljust(widths[idx]) for idx, value in enumerate(values)) + + lines = [render(headers), render(["-" * width for width in widths])] + lines.extend(render(row) for row in table_rows) + return "\n".join(lines) + + +def print_headline_ratio(rows: list[dict]) -> None: + ratios = [] + for row in rows: + if row["min_distance"] != 1: + continue + vigra = row["results"].get("vigra") + skimage = row["results"].get("skimage") + if vigra and skimage and vigra["median"] > 0: + ratios.append(skimage["median"] / vigra["median"]) + print() + if ratios: + print( + "speedup summary (geomean of skimage.median / vigra.median at " + "min_distance=1; >1.0 means skimage slower):" + ) + print(f" skimage / vigra geomean = {geometric_mean(ratios):.3f} (n={len(ratios)})") + else: + print("no comparable (min_distance=1) rows with both libraries available.") + + +def main() -> int: + args = parse_args() + if args.repeats < 1: + print("--repeats must be >= 1", file=sys.stderr) + return 2 + if args.sigma < 0.0: + print("--sigma must be >= 0.0", file=sys.stderr) + return 2 + + try: + min_distances = _parse_min_distances(args.min_distance) + specs = build_specs(args) + except ValueError as error: + print(str(error), file=sys.stderr) + return 2 + + if not _import_available("skimage"): + print("skimage is required for benchmark data and the skimage baseline", file=sys.stderr) + return 2 + if args.sigma > 0.0 and not _import_available("scipy"): + print("scipy is required for Gaussian smoothing (--sigma > 0)", file=sys.stderr) + return 2 + if not _import_available("vigra"): + print("warning: vigra not available; timing skimage only", file=sys.stderr) + + print( + f"repeats={args.repeats}, sigma={args.sigma}, " + f"min_distance={','.join(str(d) for d in min_distances)}" + ) + print("data: Gaussian-smoothed skimage camera/cells3d (tiled if smaller than the requested shape)") + + rows = [] + csv_rows = [] + for spec in specs: + image = load_2d(spec, args.sigma) if spec.label == "2D" else load_3d(spec, args.sigma) + print(f"{spec.label}: shape={image.shape}, dtype={image.dtype}") + for min_distance in min_distances: + adapters = build_adapters(min_distance, image) + if not adapters: + continue + agreement = None + if not args.skip_checks and min_distance == 1: + agreement = check_agreement(adapters) + results = time_interleaved(adapters, args.repeats) + full_results = {lib: results.get(lib) for lib in LIBRARIES} + rows.append({ + "dim": spec.label, + "shape": str(tuple(image.shape)), + "sigma": args.sigma, + "min_distance": min_distance, + "results": full_results, + "agreement": agreement, + }) + for lib, result in full_results.items(): + if result is None: + continue + csv_rows.append({ + "dim": spec.label, + "shape": tuple(image.shape), + "sigma": args.sigma, + "min_distance": min_distance, + "library": lib, + "n_maxima": result["n_maxima"], + "median_s": result["median"], + "min_s": result["min"], + "repeats": args.repeats, + }) + + print() + print(format_results_table(rows)) + + if not args.skip_checks: + agreement_rows = [row for row in rows if row.get("agreement")] + if agreement_rows: + print() + print("vigra/skimage agreement at min_distance=1 (mismatched should be 0):") + for row in agreement_rows: + a = row["agreement"] + print( + f" {row['dim']}: n_vigra={a['n_vigra']}, " + f"n_skimage={a['n_skimage']}, mismatched={a['mismatched']}" + ) + + print_headline_ratio(rows) + + if args.csv is not None: + with open(args.csv, "w", newline="") as handle: + writer = csv.DictWriter( + handle, + fieldnames=[ + "dim", + "shape", + "sigma", + "min_distance", + "library", + "n_maxima", + "median_s", + "min_s", + "repeats", + ], + ) + writer.writeheader() + writer.writerows(csv_rows) + print(f"wrote {args.csv}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main())