diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..3a9afc8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,26 @@ +# Build artifacts (out-of-source, copied per Containerfile) +build/ +build-*/ +target/ + +# pos2-chip is FetchContent-cloned at CMake configure time inside the +# container; no need to ship a host-side copy. +third_party/ + +# Generated plot files left over from local benchmarks. +*.plot2 + +# Editor / tooling +.vscode/ +.idea/ +.cache/ +compile_commands.json + +# Profiling artifacts +*.nsys-rep +*.qdrep +*.qdstrm +*.ncu-rep + +# git history is irrelevant to the build itself. +.git/ diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..2b96933 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,21 @@ +version: 2 + +# Dependabot bumps deps via PR. Two ecosystems: +# - cargo: the keygen-rs subcrate's BLS / sha2 / address-codec stack. +# The build.rs at repo root only references env state and has no +# runtime crate deps, so it doesn't need its own entry. +# - github-actions: action versions in .github/workflows/. +# Weekly cadence keeps PR volume low; bump to daily if security +# advisories pile up. +updates: + - package-ecosystem: cargo + directory: /keygen-rs + schedule: + interval: weekly + open-pull-requests-limit: 5 + + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 5 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0553fdf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,148 @@ +name: CI + +on: + pull_request: + push: + branches: [main] + +permissions: + contents: read + +jobs: + shell: + name: ShellCheck + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Install shellcheck + run: sudo apt-get update && sudo apt-get install -y shellcheck + - name: Lint scripts/ + # Recurse so scripts/test/install-container-deps/run.sh and any + # future helpers under scripts/ stay covered. + run: find scripts -name '*.sh' -print0 | xargs -0 shellcheck + + actions: + name: actionlint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: reviewdog/action-actionlint@v1 + with: + fail_level: error + + rust: + name: Rust (keygen-rs) + runs-on: ubuntu-latest + defaults: + run: + working-directory: keygen-rs + steps: + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy, rustfmt + - uses: Swatinem/rust-cache@v2 + with: + workspaces: keygen-rs + - name: cargo fmt --check + run: cargo fmt --all --check + - name: cargo check + run: cargo check --all-targets --locked || cargo check --all-targets + - name: cargo clippy (advisory) + run: cargo clippy --all-targets -- -W clippy::all + continue-on-error: true + - name: cargo test + run: cargo test --all-targets + + hadolint: + name: hadolint Containerfile + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: hadolint/hadolint-action@v3.3.0 + with: + dockerfile: Containerfile + # CUDA / ROCm base images make version-pinning warnings (DL3008, + # DL3009) impractical — package versions shift between base image + # rolls and the toolkit pin lives in BASE_DEVEL. Same for the + # `set -o pipefail` warnings on RUN-with-pipe (DL4006) — those + # pipes are bootstrap-time noise, not runtime data paths. Filter + # to errors so we still catch real bugs (root, ADD vs COPY, + # missing && \, COPY --chown typos, etc.). 
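+          # Local repro sketch (not part of CI; assumes the upstream
+          # docker.io/hadolint/hadolint image is available) — same threshold,
+          # Containerfile piped in on stdin:
+          #   podman run --rm -i docker.io/hadolint/hadolint \
+          #     hadolint --failure-threshold error - < Containerfile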
+ failure-threshold: error + + compose-config: + name: docker compose config validate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: docker compose config --quiet + # Catches typos in service names / build-arg keys / unresolvable + # ${VAR} placeholders without ever pulling a base image. ~5s. + run: docker compose -f compose.yaml config --quiet + + typos: + name: typos + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: crate-ci/typos@master + + markdownlint: + name: markdownlint README + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: DavidAnson/markdownlint-cli2-action@v23 + with: + globs: README.md + + install-container-deps-dryrun: + name: install-container-deps.sh — dry-run fixtures + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Diff --dry-run output against fixtures + # Runs --dry-run for every (distro × engine × gpu) tuple in + # arch / ubuntu / fedora containers and diffs against the + # checked-in fixtures under scripts/test/install-container-deps/. + # No mutating sudo calls — completes in ~60s. + run: scripts/test/install-container-deps/run.sh + + install-container-deps-smoke: + name: install-container-deps.sh smoke (${{ matrix.engine }} ${{ matrix.gpu }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - engine: podman + gpu: cpu + - engine: podman + gpu: amd + - engine: docker + gpu: cpu + # NVIDIA smoke is intentionally skipped: nvidia-ctk cdi generate + # needs a real GPU + driver to populate the spec, and the dry-run + # fixtures already cover the planning logic for that path. + steps: + - uses: actions/checkout@v6 + - name: Real install in ubuntu:24.04 + assert idempotent re-run + env: + ENGINE: ${{ matrix.engine }} + GPU: ${{ matrix.gpu }} + # Validates that engine + GPU-runtime packages actually install + # from the real apt repos (catches package-name drift / repo + # availability), and that re-running the script is a no-op. + run: | + docker run --rm \ + -e ENGINE -e GPU \ + -v "$PWD/scripts:/s:ro" \ + docker.io/ubuntu:24.04 \ + bash -ec ' + apt-get update -qq + apt-get install -y -qq sudo curl ca-certificates gnupg >/dev/null + /s/install-container-deps.sh --engine "$ENGINE" --gpu "$GPU" + # Idempotence: a clean second run must still exit 0. + /s/install-container-deps.sh --engine "$ENGINE" --gpu "$GPU" + ' diff --git a/.gitignore b/.gitignore index 89e01ed..43f3299 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build/ +build-*/ *.plot2 .cache/ compile_commands.json @@ -18,3 +19,4 @@ target/ # pos2-chip is fetched here automatically by CMake at configure time. # See CMakeLists.txt → FetchContent_Declare(pos2_chip). third_party/ +docs/ diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..8b6d3d9 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,12 @@ +{ + "_comment": "README is prose-heavy and includes terminal output, wide tables, and mixed list markers. Disable rules that produce noise without catching real issues. MD051 is also disabled because markdownlint's link-fragment slug algorithm differs from GitHub's (e.g. 
`### Multi-GPU: --devices` slugs differently between the two).", + "MD004": false, + "MD013": false, + "MD026": false, + "MD028": false, + "MD031": false, + "MD032": false, + "MD040": false, + "MD051": false, + "MD060": false +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 25b5313..5f562e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,18 +1,140 @@ cmake_minimum_required(VERSION 3.24) -project(pos2-gpu LANGUAGES C CXX CUDA) +project(pos2-gpu VERSION 0.6.0 LANGUAGES C CXX) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CUDA_STANDARD 20) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) +# Every static library here is linked into both the standalone xchplot2 +# executable and the top-level Rust crate's PIE binary (via build.rs + +# cargo install). rust-lld (the default linker on some distros) rejects +# non-PIC objects in a PIE output — seen in the wild as "relocation +# R_X86_64_32 cannot be used against local symbol; recompile with +# -fPIC" on Cancel.cpp, BatchPlotter.cpp, etc. Setting this globally +# ensures pos2_gpu, pos2_gpu_host, fse, and any other transitively- +# compiled object is built with -fPIC, so the linker choice doesn't +# matter. The per-target POSITION_INDEPENDENT_CODE ON below stay as +# explicit markers for the public-interface static libraries. +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# CUDA toolchain is conditional in slice 15. The CUDA path provides: +# - SortCuda.cu (CUB radix sort — best perf on NVIDIA) +# - AesGpu.cu (T-tables in __constant__ memory + cudaMemcpyToSymbol init) +# - AesGpuBitsliced.cu (bench-only bitsliced AES; needs nvcc) +# - The cuda-flavoured parity tests in tools/parity/ +# The non-CUDA path uses SortSycl.cpp + AesStub.cpp — runs on AMD/Intel via +# AdaptiveCpp's HIP / Level Zero backends. Default ON to preserve the +# existing NVIDIA workflow. +# +# CAVEAT: with XCHPLOT2_BUILD_CUDA=OFF the build still needs the CUDA +# Toolkit *headers* on the include path (the SYCL TUs reference cudaError_t +# / cudaStream_t / cuda_fp16.h via the kernel-wrapper headers). Lifting +# those CUDA-type dependencies out of the public SYCL API is a follow-up +# refactor (see slice 17 in docs/gpu-portability-sketch.md). nvcc itself is +# NOT required when XCHPLOT2_BUILD_CUDA=OFF — only the headers. +option(XCHPLOT2_BUILD_CUDA "Compile CUDA-only TUs (CUB sort, __constant__ AES init, bench tests)" ON) + +# On dual-toolchain hosts (CUDA Toolkit + ROCm both installed), the SYCL +# TUs pull in CUDA's via CudaHalfShim.hpp AND ROCm's +# via AdaptiveCpp's HIP backend. Their vector_types +# headers declare conflicting typedefs for char1 / int2 / etc., which +# breaks the compile. CudaHalfShim respects XCHPLOT2_SKIP_CUDA_RUNTIME / +# _FP16 — turn them on when we're (a) NOT building CUDA TUs and (b) ROCm +# is present, so the shim falls back to its opaque stubs instead. +if(NOT XCHPLOT2_BUILD_CUDA) + find_path(XCHPLOT2_HIP_RUNTIME_H hip/hip_runtime.h + PATHS /opt/rocm/include /usr/include /usr/local/include + NO_DEFAULT_PATH) + if(XCHPLOT2_HIP_RUNTIME_H) + add_compile_definitions( + XCHPLOT2_SKIP_CUDA_RUNTIME + XCHPLOT2_SKIP_CUDA_FP16) + message(STATUS "xchplot2: ROCm at ${XCHPLOT2_HIP_RUNTIME_H} — " + "skipping CUDA runtime/fp16 includes (CudaHalfShim stubs)") + endif() +endif() -# Default arch: sm_89 (RTX 4090). Override via -DCMAKE_CUDA_ARCHITECTURES=... 
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 89) +if(XCHPLOT2_BUILD_CUDA) + # Default arch: sm_89 (RTX 4090). Override via -DCMAKE_CUDA_ARCHITECTURES=... + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 89) + endif() + + # Preflight nvcc-vs-arch compatibility BEFORE enable_language(CUDA), + # which is what triggers the cryptic "Unsupported gpu architecture + # 'compute_61'" TryCompile failure when Pascal/Volta meets CUDA 13.x. + # CUDA 13.0 dropped codegen for sm_50/52/53/60/61/62/70/72 entirely. + # Skip the check if nvcc isn't findable yet — enable_language(CUDA) + # below will surface its own missing-toolchain message in that case. + find_program(_xchplot2_nvcc nvcc + HINTS ENV CUDA_PATH ENV CUDA_HOME /opt/cuda /usr/local/cuda + PATH_SUFFIXES bin + DOC "nvcc for arch-compat preflight") + if(_xchplot2_nvcc) + execute_process( + COMMAND "${_xchplot2_nvcc}" --version + OUTPUT_VARIABLE _nvcc_version_out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + # Parse "Cuda compilation tools, release 13.0, V13.0.48" → 13 + if(_nvcc_version_out MATCHES "release ([0-9]+)") + set(_nvcc_major "${CMAKE_MATCH_1}") + set(_min_arch 9999) + foreach(_a IN LISTS CMAKE_CUDA_ARCHITECTURES) + # Strip sm_ / compute_ prefixes some users pass through + string(REGEX REPLACE "^(sm_|compute_)" "" _a "${_a}") + if(_a MATCHES "^[0-9]+$" AND _a LESS _min_arch) + set(_min_arch ${_a}) + endif() + endforeach() + if(_nvcc_major GREATER_EQUAL 13 AND _min_arch LESS 75) + # Container detection: Docker writes /.dockerenv, Podman writes + # /run/.containerenv. Either presence means the host-side fixes + # don't apply — the user needs to rebuild the image with a + # different BASE_DEVEL. + if(EXISTS "/.dockerenv" OR EXISTS "/run/.containerenv") + set(_fix_block + "You're building inside a container — the toolkit comes from\n" + "the base image, not the host. Rebuild with a CUDA 12.x base:\n" + " - Recommended: rerun scripts/build-container.sh on the host;\n" + " it auto-pins nvidia/cuda:12.9.1 when CUDA_ARCH < 75.\n" + " - Or pass --build-arg explicitly:\n" + " podman build -t xchplot2:cuda \\\n" + " --build-arg BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n" + " --build-arg BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n" + " --build-arg CUDA_ARCH=${_min_arch} \\\n" + " .\n") + else() + set(_fix_block + "Fix one of:\n" + " - Install CUDA 12.9 (last toolkit with Pascal/Volta support) and re-run cmake:\n" + " sudo apt install cuda-toolkit-12-9 (Ubuntu/Debian)\n" + " Then point cmake at it:\n" + " cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.9/bin/nvcc -B build -S . [...]\n" + " - Or override the target arch (only valid if you actually have a Turing+ card):\n" + " cmake -DCMAKE_CUDA_ARCHITECTURES=75 -B build -S . 
[...]\n" + " - Or use the container path — scripts/build-container.sh auto-pins\n" + " the 12.9 base image when it detects a pre-Turing GPU.\n") + endif() + message(FATAL_ERROR + "xchplot2: CUDA Toolkit ${_nvcc_major}.x dropped codegen for " + "sm_${_min_arch} (Pascal / Volta / pre-Turing).\n" + "\n" + "Detected:\n" + " nvcc ${_nvcc_major}.x at ${_xchplot2_nvcc}\n" + " target arch: sm_${_min_arch} (from CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES})\n" + "\n" + ${_fix_block}) + endif() + endif() + endif() + unset(_xchplot2_nvcc CACHE) + + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 20) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) endif() # Optional: compile in clock64 instrumentation for T3 match_all_buckets. @@ -20,6 +142,195 @@ endif() # call. Off by default — enable with -DXCHPLOT2_INSTRUMENT_MATCH=ON. option(XCHPLOT2_INSTRUMENT_MATCH "Instrument T3 match_all_buckets with clock64 breakdown" OFF) +# SYCL kernels via AdaptiveCpp are the only backend; the previous +# XCHPLOT2_BACKEND={cuda,sycl} toggle was retired in slice 9 once the +# CUDA-native wrapper TUs (T*OffsetsCuda.cu, PipelineKernelsCuda.cu) +# were deleted. AdaptiveCpp is now a hard build dependency. + +# AdaptiveCpp target autodetect — must run BEFORE find_package(AdaptiveCpp) +# so the package config sees a non-empty target list. acpp errors on an +# empty -DACPP_TARGETS= (which we'd otherwise pass through unchanged from +# the Containerfile's default build-arg). +# 1. NVIDIA: stay on "generic" (LLVM SSCP). Empirically a few percent +# faster than cuda:sm_XX on our kernels at k=28 — SSCP's runtime +# specialization beats the CUDA-AOT path for this workload. +# 2. AMD: rocminfo Name: gfxXXXX → hip:gfxXXXX. SSCP's HIP path is +# less mature, so AOT-compiling for the actual gfx target is the +# safer pick on AMD. +# 3. Fallback: generic (works everywhere; JITs on first use). +# Override with -DACPP_TARGETS=... on the cmake command line. +if(NOT ACPP_TARGETS) + execute_process( + COMMAND nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits + OUTPUT_VARIABLE _xchplot2_cuda_cap + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE _xchplot2_nvsmi_rc + ERROR_QUIET) + if(_xchplot2_nvsmi_rc EQUAL 0 AND _xchplot2_cuda_cap) + set(ACPP_TARGETS "generic" CACHE STRING "AdaptiveCpp target list" FORCE) + message(STATUS "xchplot2: NVIDIA GPU detected; using ACPP_TARGETS=generic (SSCP)") + else() + execute_process( + COMMAND rocminfo + OUTPUT_VARIABLE _xchplot2_rocm_out + RESULT_VARIABLE _xchplot2_rocminfo_rc + ERROR_QUIET) + if(_xchplot2_rocminfo_rc EQUAL 0) + string(REGEX MATCH "Name:[ \t]+gfx[0-9a-f]+" _xchplot2_gfx_match "${_xchplot2_rocm_out}") + string(REGEX REPLACE "Name:[ \t]+" "" _xchplot2_gfx "${_xchplot2_gfx_match}") + if(_xchplot2_gfx) + set(ACPP_TARGETS "hip:${_xchplot2_gfx}" CACHE STRING "AdaptiveCpp target list" FORCE) + message(STATUS "xchplot2: ACPP_TARGETS auto-detected via rocminfo: ${ACPP_TARGETS}") + endif() + endif() + endif() + if(NOT ACPP_TARGETS) + set(ACPP_TARGETS "generic" CACHE STRING "AdaptiveCpp target list" FORCE) + message(STATUS "xchplot2: ACPP_TARGETS fell back to generic (no nvidia-smi/rocminfo)") + endif() +endif() +message(STATUS "xchplot2: ACPP_TARGETS=${ACPP_TARGETS}") + +# Lookup precedence: +# 1. find_package(AdaptiveCpp) — system or local install (e.g. /opt/adaptivecpp). +# This is what scripts/install-deps.sh and the Containerfile produce. +# 2. FetchContent fallback — clones AdaptiveCpp at v25.10.0 and adds it as +# a CMake subproject. 
Slow first build (LLVM compilation, ~15-30 min) but +# removes the manual install step. Opt out with -DXCHPLOT2_FETCH_ADAPTIVECPP=OFF. +option(XCHPLOT2_FETCH_ADAPTIVECPP "Fall back to FetchContent if AdaptiveCpp not found" ON) + +# HINTS /opt/adaptivecpp matches scripts/install-deps.sh's default install +# prefix, and ENV ACPP_PREFIX honours users who installed to a custom +# location with `ACPP_PREFIX=/elsewhere ./scripts/install-deps.sh`. Without +# these, find_package wouldn't search /opt (not a standard CMake path), the +# user would have to remember to `export CMAKE_PREFIX_PATH=/opt/adaptivecpp` +# between running install-deps.sh and the build (the script can't set env +# vars in the parent shell), and FetchContent would fire pointlessly. +find_package(AdaptiveCpp QUIET HINTS /opt/adaptivecpp ENV ACPP_PREFIX) +if(NOT AdaptiveCpp_FOUND) + if(XCHPLOT2_FETCH_ADAPTIVECPP) + message(STATUS "xchplot2: AdaptiveCpp not found — fetching v25.10.0 via FetchContent") + message(STATUS "xchplot2: first build will take ~15-30 min while AdaptiveCpp compiles") + message(STATUS "xchplot2: pre-install via scripts/install-deps.sh to skip this") + + # AdaptiveCpp's compiler/CMakeLists requires ld.lld at configure + # time and aborts with "Cannot find ld.lld. Please provide path + # via -DACPP_LLD_PATH=…" otherwise. Auto-probe the conventional + # LLVM-{16..20} prefixes and pass the path through so users on a + # FetchContent build don't have to know that detail. If the + # binary isn't installed at all, fail loud with a copy-paste + # install command — far less confusing than AdaptiveCpp's own + # message. + find_program(_xchplot2_ld_lld + NAMES ld.lld + HINTS + /usr/lib/llvm-20/bin /usr/lib/llvm-19/bin /usr/lib/llvm-18/bin + /usr/lib/llvm-17/bin /usr/lib/llvm-16/bin + /usr/lib/llvm20/bin /usr/lib/llvm19/bin /usr/lib/llvm18/bin + /usr/lib64/llvm20/bin /usr/lib64/llvm19/bin /usr/lib64/llvm18/bin + /opt/llvm-20/bin /opt/llvm-19/bin /opt/llvm-18/bin + /opt/llvm20/bin /opt/llvm19/bin /opt/llvm18/bin + DOC "ld.lld required by AdaptiveCpp's compiler/CMakeLists") + if(_xchplot2_ld_lld) + set(ACPP_LLD_PATH "${_xchplot2_ld_lld}" CACHE FILEPATH + "Path to ld.lld for AdaptiveCpp's compiler/CMakeLists" FORCE) + message(STATUS "xchplot2: auto-probed ld.lld at ${_xchplot2_ld_lld}") + else() + message(FATAL_ERROR + "xchplot2: AdaptiveCpp's FetchContent build needs ld.lld " + "but it isn't installed at any of the standard LLVM-16..20 " + "prefixes. Install it:\n" + " Ubuntu/Debian: sudo apt install lld-18\n" + " Fedora/RHEL: sudo dnf install lld\n" + " Arch/CachyOS: sudo pacman -S lld\n" + "Or pre-install AdaptiveCpp via scripts/install-deps.sh " + "(also installs ld.lld and builds AdaptiveCpp at " + "/opt/adaptivecpp). Override the probe with " + "-DACPP_LLD_PATH=/path/to/ld.lld.") + endif() + + include(FetchContent) + FetchContent_Declare( + adaptivecpp + GIT_REPOSITORY https://github.com/AdaptiveCpp/AdaptiveCpp.git + GIT_TAG v25.10.0 + ) + FetchContent_MakeAvailable(adaptivecpp) + if(NOT COMMAND add_sycl_to_target) + message(FATAL_ERROR + "xchplot2: FetchContent built AdaptiveCpp but add_sycl_to_target " + "wasn't exported. Install AdaptiveCpp via scripts/install-deps.sh " + "or use the Containerfile.") + endif() + else() + message(FATAL_ERROR + "xchplot2: AdaptiveCpp not found. 
Install it via scripts/install-deps.sh, " + "use the Containerfile, or re-run with -DXCHPLOT2_FETCH_ADAPTIVECPP=ON.") + endif() +endif() + +# Export the AdaptiveCpp lib directory to a file so build.rs knows where +# to add -L for libacpp-rt / libacpp-common at link time. Without this, +# the Rust binary fails to link on machines where AdaptiveCpp lives +# anywhere other than /opt/adaptivecpp or /usr/local (and on FetchContent +# builds, which leave the artifacts in CMake's _deps/ build tree). +set(_xchplot2_acpp_lib_dir "") +if(TARGET acpp-rt) + # FetchContent-built target: ask CMake where it'll land. + set(_xchplot2_acpp_lib_dir "$") +elseif(AdaptiveCpp_DIR) + # Installed AdaptiveCpp: AdaptiveCpp_DIR is /lib/cmake/AdaptiveCpp, + # so two parent dirs up gives /lib. + get_filename_component(_xchplot2_acpp_cmake_root "${AdaptiveCpp_DIR}" DIRECTORY) + get_filename_component(_xchplot2_acpp_lib_dir "${_xchplot2_acpp_cmake_root}" DIRECTORY) +endif() +if(_xchplot2_acpp_lib_dir) + file(GENERATE OUTPUT "${CMAKE_BINARY_DIR}/acpp-prefix.txt" + CONTENT "${_xchplot2_acpp_lib_dir}\n") + message(STATUS "xchplot2: AdaptiveCpp lib dir = ${_xchplot2_acpp_lib_dir}") +endif() + +# Embed runtime library paths so binaries built via plain `cmake` (parity +# tests, dev rebuilds, anything not invoked through cargo+build.rs) can +# locate AdaptiveCpp's runtime lib + ROCm's libamdhip64.so without an +# external LD_LIBRARY_PATH. build.rs sets the same rpaths via +# rustc-link-arg for the cargo path, so this is idempotent for the +# production binary. Without this, a fresh `cmake -B build && cmake +# --build build --target sycl_t1_parity` produces a binary that throws +# "No matching device" at SYCL queue construction because +# librt-backend-hip.so can't dynamically link libamdhip64.so. +# +# The FetchContent path leaves _xchplot2_acpp_lib_dir as a generator +# expression ("$") which can't go into the +# RPATH variables at config time — CMake's BUILD_WITH_INSTALL_RPATH=OFF +# default already handles in-tree targets in that case. +if(_xchplot2_acpp_lib_dir AND NOT _xchplot2_acpp_lib_dir MATCHES "\\$<") + list(APPEND CMAKE_BUILD_RPATH "${_xchplot2_acpp_lib_dir}") + list(APPEND CMAKE_INSTALL_RPATH "${_xchplot2_acpp_lib_dir}") +endif() +if(XCHPLOT2_HIP_RUNTIME_H) + get_filename_component(_xchplot2_rocm_root "${XCHPLOT2_HIP_RUNTIME_H}/.." ABSOLUTE) + list(APPEND CMAKE_BUILD_RPATH "${_xchplot2_rocm_root}/lib") + list(APPEND CMAKE_INSTALL_RPATH "${_xchplot2_rocm_root}/lib") + message(STATUS "xchplot2: embedded rpath includes ${_xchplot2_rocm_root}/lib") + + # Direct-link libamdhip64 so AdaptiveCpp's runtime-dlopen'd HIP + # backend (librt-backend-hip.so) finds the library already loaded + # in the process address space. dlopen of a backend's transitive + # deps doesn't consult the calling binary's RUNPATH on glibc — + # without this explicit link, ROCm silently fails to initialise + # and AdaptiveCpp's default selector falls through to its OpenMP + # host device. The fall-through makes hellosycl / sycl_t1_parity + # report "ALL OK" while having executed entirely on CPU. Mirrors + # build.rs:631 (cargo:rustc-link-lib=amdhip64) for the cargo + # build path. + if(EXISTS "${_xchplot2_rocm_root}/lib/libamdhip64.so") + link_libraries("${_xchplot2_rocm_root}/lib/libamdhip64.so") + message(STATUS "xchplot2: link_libraries(libamdhip64.so) — " + "AdaptiveCpp HIP backend will find ROCm at runtime") + endif() +endif() + # pos2-chip dependency. 
# # Default behavior: FetchContent auto-clones Chia-Network/pos2-chip into @@ -74,15 +385,87 @@ endif() # Shared GPU support library (kernels). AesGpu.cu MUST come first — it # owns the constant-memory T-tables that all later kernels reference. +# All backend-dispatched wrapper TUs (T*OffsetsSycl.cpp, PipelineKernelsSycl.cpp) +# go through AdaptiveCpp via add_sycl_to_target below. +set(POS2_GPU_SYCL_SRC + src/gpu/T1OffsetsSycl.cpp + src/gpu/T2OffsetsSycl.cpp + src/gpu/T3OffsetsSycl.cpp + src/gpu/PipelineKernelsSycl.cpp + src/gpu/XsKernel.cpp + src/gpu/XsKernelsSycl.cpp + src/gpu/T1Kernel.cpp + src/gpu/T2Kernel.cpp + src/gpu/T3Kernel.cpp + src/host/GpuBufferPool.cpp + src/host/GpuPipeline.cpp) + +# Sort path: SortSycl.cpp (hand-rolled LSD radix in pure SYCL) is now +# always compiled — it's the runtime fallback for non-CUDA backends on +# dual-toolchain builds, and the only path on AMD-only / Intel-only / +# CPU builds. SortDispatch.cpp picks at runtime based on the queue's +# device backend (sycl::backend::cuda → _cub variant; everything else → +# _sycl variant). When BUILD_CUDA=OFF, the dispatcher's CUB branch is +# compiled out and reduces to a single tail call into SortSycl.cpp. +list(APPEND POS2_GPU_SYCL_SRC + src/gpu/SortSycl.cpp + src/gpu/SortDispatch.cpp + src/gpu/SyclDeviceList.cpp) + +if(XCHPLOT2_BUILD_CUDA) + set(POS2_GPU_CUDA_SRC + src/gpu/AesGpu.cu + src/gpu/AesGpuBitsliced.cu + src/gpu/SortCuda.cu) + # SortSyclCub.cpp is the SYCL-typed adapter that bridges + # sycl::queue → CUB. SortCuda.cu used to provide the SYCL-typed + # entry points itself, but mixing nvcc + in one + # TU drags AdaptiveCpp's libkernel half.hpp into the legacy CUDA + # arm of __acpp_backend_switch — a path AdaptiveCpp doesn't + # support. Splitting the SYCL surface into this acpp-compiled + # adapter (does q.wait()) and a pure-CUDA cub_sort_* in + # SortCuda.cu (does the work + cudaStreamSync) keeps each + # compiler in its lane. + list(APPEND POS2_GPU_SYCL_SRC + src/gpu/SortSyclCub.cpp) +else() + # AesStub.cpp: no-op initialize_aes_tables on builds without the + # CUDA AOT path. AesGpu.cu provides the real implementation when + # BUILD_CUDA=ON; SYCL workers ignore initialize_aes_tables anyway + # (they upload AES T-tables lazily via SyclBackend.hpp's + # aes_tables_device(q)). + set(POS2_GPU_CUDA_SRC) + list(APPEND POS2_GPU_SYCL_SRC + src/gpu/AesStub.cpp) +endif() + +# CUDA OBJECT library: compiled once, referenced via $ +# from each consuming target EXACTLY ONCE. The earlier design tried to +# put the .o files in BOTH pos2_gpu (STATIC) AND xchplot2_cli for hash +# matching, but nvlink's device-link step at xchplot2_cli archive +# creation refuses the duplicate kAesT0..3 / kernel definitions: +# +# nvlink error : Multiple definition of '_ZN7pos2gpu6kAesT0E' in +# 'libpos2_gpu.a:AesGpu.cu.o', first defined in +# 'CMakeFiles/pos2_gpu_cuda_obj.dir/src/gpu/AesGpu.cu.o' +# +# (--allow-multiple-definition is a host-linker flag — nvlink doesn't +# honour it.) So the .o files now live exclusively in xchplot2_cli for +# the cargo install path, and each parity test adds them explicitly +# below — pos2_gpu STATIC carries only the SYCL .cpp sources. 
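+# Consumption pattern, for orientation (illustrative only — the real leaf
+# targets further down spell this out individually; "some_leaf_tool" is a
+# placeholder, not a target in this tree):
+#   add_executable(some_leaf_tool main.cu $<TARGET_OBJECTS:pos2_gpu_cuda_obj>)
+#   target_link_libraries(some_leaf_tool PRIVATE pos2_gpu_host)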
+if(XCHPLOT2_BUILD_CUDA) + add_library(pos2_gpu_cuda_obj OBJECT ${POS2_GPU_CUDA_SRC}) + target_include_directories(pos2_gpu_cuda_obj PRIVATE src) + target_link_libraries(pos2_gpu_cuda_obj PRIVATE pos2_chip_headers) + target_compile_features(pos2_gpu_cuda_obj PRIVATE cxx_std_20) + set_target_properties(pos2_gpu_cuda_obj PROPERTIES POSITION_INDEPENDENT_CODE ON) + if(XCHPLOT2_INSTRUMENT_MATCH) + target_compile_definitions(pos2_gpu_cuda_obj PRIVATE XCHPLOT2_INSTRUMENT_MATCH=1) + endif() +endif() + add_library(pos2_gpu STATIC - src/gpu/AesGpu.cu - src/gpu/AesGpuBitsliced.cu - src/gpu/XsKernel.cu - src/gpu/T1Kernel.cu - src/gpu/T2Kernel.cu - src/gpu/T3Kernel.cu - src/host/GpuBufferPool.cu - src/host/GpuPipeline.cu + ${POS2_GPU_SYCL_SRC} ) target_include_directories(pos2_gpu PUBLIC src @@ -92,10 +475,98 @@ target_compile_features(pos2_gpu PUBLIC cxx_std_20) if(XCHPLOT2_INSTRUMENT_MATCH) target_compile_definitions(pos2_gpu PUBLIC XCHPLOT2_INSTRUMENT_MATCH=1) endif() +# Marker for SortDispatch.cpp: gates whether the runtime backend +# dispatcher includes the CUB branch. Defined when SortSyclCub.cpp + +# SortCuda.cu are linked (BUILD_CUDA=ON); undefined on AMD-only / +# Intel-only / CPU builds, in which case the dispatcher reduces to a +# single tail call into SortSycl.cpp. +if(XCHPLOT2_BUILD_CUDA) + target_compile_definitions(pos2_gpu PUBLIC XCHPLOT2_HAVE_CUB=1) +endif() +add_sycl_to_target(TARGET pos2_gpu SOURCES ${POS2_GPU_SYCL_SRC}) + +# AdaptiveCpp's acpp driver doesn't auto-propagate CMake's standard +# CMAKE_CXX_FLAGS_RELEASE (-O3 -DNDEBUG) into the SYCL compile step. +# Without an explicit -O flag, acpp warns "No optimization flag was +# given, optimizations are disabled by default" and the AES-heavy SYCL +# kernels (Xs gen, T*match) compile at -O0, which is dramatically +# slower on amdgcn (Xs gen alone was 200 ms / ~25% of wall on RX 6700 +# XT before this fix). +# +# An earlier attempt at -O3 was reverted because parity tests appeared +# to fail with it — but that diagnosis was confounded by an unrelated +# build-time bug (compose.yaml's silent ACPP_GFX default to gfx1100 +# made every "broken" rebuild produce kernels for the wrong amdgcn +# ISA, which executed as no-ops regardless of opt level). With +# ACPP_GFX now enforced via ${VAR:?} in compose.yaml, -O3 should be +# testable cleanly. Drop to -O2 here if it actually does fail at -O3 +# under correct gfx targeting. +target_compile_options(pos2_gpu PRIVATE + $<$:-O3> + $<$:-O2> + $<$:-Os>) +# The SYCL TUs include CUDA headers (cuda_fp16.h, transitively cuda_runtime.h +# from the kernel-wrapper headers) on both the CUDA and non-CUDA paths +# (slice 17 will lift the CUDA-type dependencies out of the public API). +# On the CUDA build we already have CMAKE_CUDA_COMPILER. On the non-CUDA +# build we need to locate the CUDA Toolkit headers via find_package +# (CUDAToolkit) — which does NOT require enable_language(CUDA). +if(XCHPLOT2_BUILD_CUDA) + get_filename_component(_xchplot2_cuda_bin ${CMAKE_CUDA_COMPILER} DIRECTORY) + get_filename_component(_xchplot2_cuda_root ${_xchplot2_cuda_bin} DIRECTORY) + set(_xchplot2_cuda_include "${_xchplot2_cuda_root}/include") +else() + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_INCLUDE_DIRS) + set(_xchplot2_cuda_include ${CUDAToolkit_INCLUDE_DIRS}) + else() + # Last-resort guess; matches Arch / CachyOS layout. 
+ set(_xchplot2_cuda_include "/opt/cuda/include") + endif() +endif() +target_include_directories(pos2_gpu PRIVATE ${_xchplot2_cuda_include}) +if(XCHPLOT2_BUILD_CUDA) + # OBJECT lib doesn't inherit pos2_gpu's PUBLIC includes via + # $ (only the .o files travel), so propagate the + # CUDA include path explicitly. Mirrors the line above for pos2_gpu. + target_include_directories(pos2_gpu_cuda_obj PRIVATE ${_xchplot2_cuda_include}) +endif() + +# Slice 17 removed the last SYCL-TU reference to a cudart *function* — only +# cuda* types survive (used for API compatibility), and types don't require +# a link against libcudart.so. On the NVIDIA build path the nvcc-compiled +# TUs (AesGpu.cu, SortCuda.cu, AesGpuBitsliced.cu) bring in cudart +# automatically. On non-NVIDIA builds cudart isn't needed at all. +# Now that the kernel-wrapper headers (T*Offsets.cuh, PipelineKernels.cuh, +# T*Kernel.cuh, XsKernel.cuh) take sycl::queue&, every TU that includes them +# needs sycl/sycl.hpp on its include path — including the parity tests +# compiled by nvcc. Make AdaptiveCpp's include dir PUBLIC so it propagates. +get_filename_component(_xchplot2_acpp_cmake_dir + "${AdaptiveCpp_DIR}" DIRECTORY) # /opt/adaptivecpp/lib/cmake/AdaptiveCpp/.. = /opt/adaptivecpp/lib/cmake +get_filename_component(_xchplot2_acpp_lib_dir + "${_xchplot2_acpp_cmake_dir}" DIRECTORY) # /opt/adaptivecpp/lib +get_filename_component(_xchplot2_acpp_root + "${_xchplot2_acpp_lib_dir}" DIRECTORY) # /opt/adaptivecpp +target_include_directories(pos2_gpu PUBLIC + ${_xchplot2_acpp_root}/include + ${_xchplot2_acpp_root}/include/AdaptiveCpp) +if(XCHPLOT2_BUILD_CUDA) + # Same reasoning as the CUDA include above — propagate AdaptiveCpp's + # include dir to the OBJECT lib explicitly so its .cu TUs see the + # kernel-wrapper headers (T*Offsets.cuh / PipelineKernels.cuh / ...) + # that pull in sycl/sycl.hpp. + target_include_directories(pos2_gpu_cuda_obj PRIVATE + ${_xchplot2_acpp_root}/include + ${_xchplot2_acpp_root}/include/AdaptiveCpp) +endif() + set_target_properties(pos2_gpu PROPERTIES POSITION_INDEPENDENT_CODE ON - # Do NOT pre-resolve device symbols — consumers (e.g. aes_parity.cu) - # reference kAesT* directly and need them visible at final device link. + # No CUDA .o files in this archive (they live in pos2_gpu_cuda_obj + # OBJECT lib and are added explicitly to each leaf consumer), so + # device-symbol resolution doesn't apply here. CUDA_RESOLVE_DEVICE_SYMBOLS + # is left explicitly OFF for clarity and to defend against any future + # CUDA TU getting added to pos2_gpu's source list. CUDA_RESOLVE_DEVICE_SYMBOLS OFF ) @@ -107,6 +578,8 @@ add_library(pos2_gpu_host STATIC src/host/GpuPlotter.cpp src/host/PlotFileWriterParallel.cpp src/host/BatchPlotter.cpp + src/host/CpuPlotter.cpp + src/host/Cancel.cpp ) target_include_directories(pos2_gpu_host PUBLIC src) target_link_libraries(pos2_gpu_host PUBLIC pos2_chip_headers pos2_gpu) @@ -170,55 +643,168 @@ endif() add_library(xchplot2_cli STATIC tools/xchplot2/cli.cpp) target_include_directories(xchplot2_cli PUBLIC tools/xchplot2) target_link_libraries(xchplot2_cli PUBLIC pos2_gpu_host pos2_keygen) +# CUDA_RESOLVE_DEVICE_SYMBOLS=ON triggers an nvcc --device-link step at +# archive creation, producing a host-side dlink.o that defines the +# `__cudaRegisterLinkedBinary_*` symbols every `__sti____cudaRegisterAll()` +# constructor references. 
cli_devlink.cu is the marker that flips +# xchplot2_cli to a CUDA-language target so the device-link actually +# fires (it's a silent no-op on pure-C++ targets — see cli_devlink.cu). +# +# Just adding cli_devlink.cu isn't enough: the dlink.o it produces only +# resolves symbols for .cu objects directly compiled into xchplot2_cli. +# Pulling pos2_gpu's CUDA .o files in via $ +# brings them into xchplot2_cli's archive-time device-link scope so the +# resulting dlink.o covers them too. See the pos2_gpu_cuda_obj OBJECT-lib +# comment above for why we share the .o files instead of recompiling. +if(XCHPLOT2_BUILD_CUDA) + target_sources(xchplot2_cli PRIVATE + tools/xchplot2/cli_devlink.cu + $) +endif() set_target_properties(xchplot2_cli PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_RESOLVE_DEVICE_SYMBOLS ON ) # CLI: xchplot2 (the standalone plotter binary, formerly gpu_plotter) +# +# LINK_GROUP RESCAN wraps xchplot2_cli + pos2_gpu_host so the linker +# rescans them as a unit. xchplot2_cli holds the CUDA OBJECT files +# (initialize_aes_tables, cub_sort_*); pos2_gpu_host's BatchPlotter.cpp +# and SortSyclCub.cpp reference those symbols. With single-pass static- +# archive scanning the references would land after xchplot2_cli was +# already processed — rescan resolves the back-edge. add_executable(xchplot2 tools/xchplot2/main.cpp) -target_link_libraries(xchplot2 PRIVATE xchplot2_cli) - -# Parity tests -add_executable(aes_parity tools/parity/aes_parity.cu) -target_link_libraries(aes_parity PRIVATE pos2_gpu_host) - -add_executable(aes_bs_parity tools/parity/aes_bs_parity.cu) -target_link_libraries(aes_bs_parity PRIVATE pos2_gpu_host) - -add_executable(aes_bs_bench tools/parity/aes_bs_bench.cu) -target_link_libraries(aes_bs_bench PRIVATE pos2_gpu_host) - -add_executable(aes_tezcan_bench tools/parity/aes_tezcan_bench.cu) -target_link_libraries(aes_tezcan_bench PRIVATE pos2_gpu_host) - -add_executable(xs_parity tools/parity/xs_parity.cu) -target_link_libraries(xs_parity PRIVATE pos2_gpu_host) - -add_executable(xs_bench tools/parity/xs_bench.cu) -target_link_libraries(xs_bench PRIVATE pos2_gpu_host) - -add_executable(t1_parity tools/parity/t1_parity.cu) -target_link_libraries(t1_parity PRIVATE pos2_gpu_host) - -add_executable(t1_debug tools/parity/t1_debug.cu) -target_link_libraries(t1_debug PRIVATE pos2_gpu_host) -set_target_properties(t1_debug PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") - -add_executable(t2_parity tools/parity/t2_parity.cu) -target_link_libraries(t2_parity PRIVATE pos2_gpu_host) - -add_executable(t3_parity tools/parity/t3_parity.cu) -target_link_libraries(t3_parity PRIVATE pos2_gpu_host) +target_link_libraries(xchplot2 PRIVATE + "$") +# pos2-chip headers define non-inline soft_aesenc/soft_aesdec, which now +# end up in two TUs (PlotFileWriterParallel.cpp and CpuPlotter.cpp) inside +# pos2_gpu_host. Tolerate the duplicates at host link. +target_link_options(xchplot2 PRIVATE LINKER:--allow-multiple-definition) + +# Parity tests are nvcc-compiled (.cu) and reference __global__ kernels +# from the bench-specific bitsliced AES path. They build only on the CUDA +# target. The two SYCL-native parity tests below (sycl_*_parity) stay +# unconditional so AMD/Intel builds still have correctness coverage. 
+# +# Each test gets $ explicitly: +# pos2_gpu (STATIC) doesn't carry the CUDA .o files anymore — putting +# them in both pos2_gpu and xchplot2_cli triggered nvlink's "Multiple +# definition" error at xchplot2_cli's archive-time device-link, which +# host-only --allow-multiple-definition can't suppress. So leaf +# executables that need kernel symbols (kAesT0..3, host-side +# kernel-wrapper functions in pos2_gpu_host) pull them in directly, +# making the .o files appear exactly once in each link line. +if(XCHPLOT2_BUILD_CUDA) + foreach(t IN ITEMS aes_parity aes_bs_parity aes_bs_bench aes_tezcan_bench + xs_parity xs_bench t1_parity t1_debug t2_parity t3_parity) + add_executable(${t} tools/parity/${t}.cu $) + target_link_libraries(${t} PRIVATE pos2_gpu_host) + set_target_properties(${t} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + endforeach() + + message(STATUS "pos2-gpu configured for CUDA arch(es): ${CMAKE_CUDA_ARCHITECTURES}") +endif() -add_executable(plot_file_parity tools/parity/plot_file_parity.cpp) +# plot_file_parity is a pure .cpp harness — reads a .plot file via +# pos2_gpu_host's file-format code and checks the header / table offsets. +# Builds on all backends (CUDA, HIP, SYCL-only). On the CUDA build it +# transitively needs pos2_gpu_host's kernel-wrapper symbols, which now +# live in the OBJECT lib rather than pos2_gpu.a — pull them in here. +if(XCHPLOT2_BUILD_CUDA) + add_executable(plot_file_parity tools/parity/plot_file_parity.cpp + $) +else() + add_executable(plot_file_parity tools/parity/plot_file_parity.cpp) +endif() target_link_libraries(plot_file_parity PRIVATE pos2_gpu_host) set_target_properties(plot_file_parity PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") # Group binaries under build/tools/... set_target_properties(xchplot2 PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/xchplot2") -foreach(t aes_parity aes_bs_parity aes_bs_bench aes_tezcan_bench xs_parity xs_bench t1_parity t2_parity t3_parity) - set_target_properties(${t} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") -endforeach() -message(STATUS "pos2-gpu configured for CUDA arch(es): ${CMAKE_CUDA_ARCHITECTURES}") +# Slice-1 standalone SYCL parity test: exercises compute_bucket_offsets in +# isolation against a CPU reference on synthetic input — orthogonal to the +# t1_parity full-pipeline test, useful for narrowing any divergence to the +# SYCL kernel itself. +add_executable(sycl_bucket_offsets_parity tools/parity/sycl_bucket_offsets_parity.cpp) +add_sycl_to_target(TARGET sycl_bucket_offsets_parity + SOURCES tools/parity/sycl_bucket_offsets_parity.cpp) +target_compile_features(sycl_bucket_offsets_parity PRIVATE cxx_std_20) +set_target_properties(sycl_bucket_offsets_parity PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + +# Slice-4 standalone: validates the SYCL-compiled AES g_x_smem against the +# same function run on the host. Pulls the AES headers (now portable behind +# PortableAttrs.hpp) directly, so a host-vs-device divergence in the AES +# math isolates here without t1_parity scaffolding. 
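+# Typical invocation after a native build (illustrative — assumes the
+# conventional build/ binary dir; see RUNTIME_OUTPUT_DIRECTORY below):
+#   ./build/tools/parity/sycl_g_x_parity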
+add_executable(sycl_g_x_parity tools/parity/sycl_g_x_parity.cpp) +add_sycl_to_target(TARGET sycl_g_x_parity + SOURCES tools/parity/sycl_g_x_parity.cpp) +target_include_directories(sycl_g_x_parity PRIVATE src) +target_compile_features(sycl_g_x_parity PRIVATE cxx_std_20) +set_target_properties(sycl_g_x_parity PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + +# Slice-18 standalone: exercises launch_sort_pairs_u32_u32 and +# launch_sort_keys_u64 against a std::sort reference. Built always — runs +# the CUB-backed wrappers when XCHPLOT2_BUILD_CUDA=ON, the hand-rolled +# SYCL radix when OFF. Lets the SYCL sort path be validated on NVIDIA +# hardware without needing AMD/Intel access. +add_executable(sycl_sort_parity tools/parity/sycl_sort_parity.cpp) +add_sycl_to_target(TARGET sycl_sort_parity + SOURCES tools/parity/sycl_sort_parity.cpp) +target_link_libraries(sycl_sort_parity PRIVATE pos2_gpu) +# On the CUDA build path, pos2_gpu's SortSyclCub.cpp (the SYCL→CUB +# adapter) calls cub_sort_* defined in SortCuda.cu — now in +# pos2_gpu_cuda_obj OBJECT lib instead of pos2_gpu STATIC. Pull the +# OBJECT lib's .o files in directly so the CUB symbols resolve. +# AMD/Intel builds use SortSycl.cpp (pure SYCL) instead and don't +# need this. +if(XCHPLOT2_BUILD_CUDA) + target_sources(sycl_sort_parity PRIVATE $) +endif() +# cuda_fp16.h transitively required by SyclBackend.hpp → sycl/sycl.hpp +# (AdaptiveCpp's half.hpp uses cuda_fp16 intrinsics on the CUDA backend). +target_include_directories(sycl_sort_parity PRIVATE ${_xchplot2_cuda_include}) +target_compile_features(sycl_sort_parity PRIVATE cxx_std_20) +set_target_properties(sycl_sort_parity PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + +# SYCL-native sibling of t1_parity.cu. The .cu version is nvcc-only, so on +# AMD/Intel hosts the T1 matcher had no end-to-end CPU-vs-GPU coverage — +# this binary closes that gap. Same comparison semantics as t1_parity.cu +# (sorted-set equality of T1Pairings against pos2-chip's Table1Constructor), +# but uses sycl::malloc_device + q.memcpy in place of cudaMalloc / +# cudaMemcpy so it builds on the SYCL-only path too. +if(XCHPLOT2_BUILD_CUDA) + add_executable(sycl_t1_parity tools/parity/sycl_t1_parity.cpp + $) +else() + add_executable(sycl_t1_parity tools/parity/sycl_t1_parity.cpp) +endif() +add_sycl_to_target(TARGET sycl_t1_parity + SOURCES tools/parity/sycl_t1_parity.cpp) +target_link_libraries(sycl_t1_parity PRIVATE pos2_gpu_host) +target_include_directories(sycl_t1_parity PRIVATE ${_xchplot2_cuda_include}) +target_compile_features(sycl_t1_parity PRIVATE cxx_std_20) +# pos2-chip's plot/PlotLayout.hpp + plot/TableConstructorGeneric.hpp pull +# in non-inline soft_aesenc/soft_aesdec, which already exist in pos2_gpu_host +# via PlotFileWriterParallel.cpp + CpuPlotter.cpp. Same mitigation as the +# xchplot2 CLI link line — see the --allow-multiple-definition note above. +target_link_options(sycl_t1_parity PRIVATE LINKER:--allow-multiple-definition) +set_target_properties(sycl_t1_parity PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + +# Lowest-level diagnostic: a hello-world SYCL kernel that proves +# AdaptiveCpp's HIP / CUDA backend can dispatch *anything* on the +# detected device. No pos2_gpu / pos2_gpu_host link — purely the SYCL +# runtime + a 16-element parallel_for. 
Use it as the first step when +# sycl_t1_parity or the production CLI silently produces no output: if +# hellosycl FAILs, no xchplot2-level fix can recover and the issue is +# below our level (driver mismatch, JIT no-op stubs, etc.). +add_executable(hellosycl tools/sanity/hellosycl.cpp) +add_sycl_to_target(TARGET hellosycl SOURCES tools/sanity/hellosycl.cpp) +target_compile_features(hellosycl PRIVATE cxx_std_20) +set_target_properties(hellosycl PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/sanity") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..b565621 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,69 @@ +# Contributing to xchplot2 + +Thanks for taking the time. A few notes to keep review loops short. + +## Building + running the tests + +Build and run the parity tests following the +[Build](https://github.com/Jsewill/xchplot2#build) section of the +README. The parity binaries under `tools/parity/` are the correctness +gate: + +- `aes_parity`, `xs_parity`, `t1_parity`, `t2_parity`, `t3_parity` — + bit-exact CPU vs GPU per-phase agreement with pos2-chip's reference. +- `sycl_sort_parity`, `sycl_g_x_parity`, `sycl_bucket_offsets_parity` — + the SYCL/AdaptiveCpp backends vs the CUDA reference, so AMD/Intel + breakage is caught on NVIDIA hardware too. +- `plot_file_parity` — writer + reader round-trip on the final + `.plot2`. + +Any change that touches a kernel, the sort path, or the plot file +format **must** keep the parity tests passing at k=22 (quick) and at +k=28 (slow — the realistic production k). Output bytes are specified +to be identical to the pos2-chip CPU reference; this is the hard +invariant. + +After a functional change, spot-check one real batch end-to-end with +`xchplot2 verify ` — zero proofs over 100 random challenges is +a regression even if all parity tests pass. + +## Commit style + +Short imperative subjects, lowercase scope prefix, no trailing period: + +``` +gpu: split xs-sort keys_a to d_storage tail — drops pool VRAM min ~1.3 GB +docs: tighten streaming peak (~7.3 GB measured), add AMD row +CMakeLists: re-enable -O3 for SYCL TUs +``` + +Body paragraphs explain *why* (what invariant was wrong, what the +measurement was, what alternative was considered and why it was +rejected). The *what* is in the diff. + +## Scope of changes + +- Keep unrelated refactors out of correctness or performance commits. +- Performance changes should cite before/after numbers on a named GPU + at a specified `k`. +- New runtime knobs go in `README.md`'s + [Environment variables](https://github.com/Jsewill/xchplot2#environment-variables) + table so users can discover them. + +## PRs + +The `main` branch carries the SYCL/AdaptiveCpp port; the +[`cuda-only`](https://github.com/Jsewill/xchplot2/tree/cuda-only) +branch is the original CUDA-only path, preserved as the most-tested +NVIDIA configuration. A PR that only helps NVIDIA may still land on +`main`, but don't regress parity on AMD (`gfx1031`) along the way. + +## Reporting bugs + +Open an issue with: + +- Exact command line and the full stderr output. +- GPU vendor + model + VRAM (`nvidia-smi -L` / `rocminfo | grep gfx`). +- Build flavor: container (service name + `ACPP_GFX` / `CUDA_ARCH`), + native `scripts/install-deps.sh`, or `cargo install`. +- Whether parity tests pass on your build. 
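+
+A quick way to capture the GPU line (sketch — run whichever half matches
+your vendor; these are the same commands named in the list above):
+
+```bash
+nvidia-smi -L              # NVIDIA
+rocminfo | grep -m1 gfx    # AMD — prints the gfx target, e.g. gfx1031
+```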
diff --git a/Cargo.lock b/Cargo.lock index 04951f4..8b9667a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,4 +4,4 @@ version = 4 [[package]] name = "xchplot2" -version = "0.1.0" +version = "0.5.2" diff --git a/Cargo.toml b/Cargo.toml index 2147f53..50e3694 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,11 @@ [package] name = "xchplot2" -version = "0.1.0" +version = "0.6.0" edition = "2021" authors = ["Abraham Sewill "] license = "MIT" description = "GPU plotter for Chia v2 proofs of space (CHIP-48)" -repository = "https://github.com/Chia-Network/xchplot2" +repository = "https://github.com/Jsewill/xchplot2" readme = "README.md" build = "build.rs" diff --git a/Containerfile b/Containerfile new file mode 100644 index 0000000..7d97b2d --- /dev/null +++ b/Containerfile @@ -0,0 +1,229 @@ +# syntax=docker/dockerfile:1 +# +# Containerfile for xchplot2 — podman-first (works with docker too). +# Supports NVIDIA (default), AMD ROCm, and Intel oneAPI via build args. +# +# ── NVIDIA (default; CUB sort) ─────────────────────────────────────────────── +# podman build -t xchplot2:cuda . +# podman run --rm --device nvidia.com/gpu=all -v $PWD/plots:/out \ +# xchplot2:cuda plot -k 28 -n 10 -f -c -o /out +# (Requires nvidia-container-toolkit + CDI on the host.) +# +# The default base image is CUDA 13.x, which only supports sm_75+ (Turing +# and newer). Pascal (sm_61) and Volta (sm_70) builds need a 12.x base — +# pass it explicitly: +# podman build -t xchplot2:cuda \ +# --build-arg BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \ +# --build-arg BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \ +# --build-arg CUDA_ARCH=61 \ +# . +# scripts/build-container.sh handles this automatically by probing +# nvidia-smi and pinning the 12.x base when CUDA_ARCH < 75. +# +# ── AMD ROCm (hand-rolled SYCL radix; XCHPLOT2_BUILD_CUDA=OFF) ─────────────── +# podman build -t xchplot2:rocm \ +# --build-arg BASE_DEVEL=docker.io/rocm/dev-ubuntu-24.04:latest \ +# --build-arg BASE_RUNTIME=docker.io/rocm/dev-ubuntu-24.04:latest \ +# --build-arg ACPP_TARGETS=hip:gfx1100 \ +# --build-arg XCHPLOT2_BUILD_CUDA=OFF \ +# --build-arg INSTALL_CUDA_HEADERS=1 \ +# . +# podman run --rm --device /dev/kfd --device /dev/dri --group-add video \ +# -v $PWD/plots:/out xchplot2:rocm plot -k 28 -n 10 ... -o /out +# (Adjust ACPP_TARGETS for your card: rocminfo | grep gfx.) +# +# ── Intel oneAPI (experimental, untested) ──────────────────────────────────── +# podman build -t xchplot2:intel \ +# --build-arg BASE_DEVEL=docker.io/intel/oneapi-basekit:latest \ +# --build-arg BASE_RUNTIME=docker.io/intel/oneapi-runtime:latest \ +# --build-arg ACPP_TARGETS=generic \ +# --build-arg XCHPLOT2_BUILD_CUDA=OFF \ +# --build-arg INSTALL_CUDA_HEADERS=1 \ +# . +# +# ── CPU-only (AdaptiveCpp OpenMP backend; slow plotting) ───────────────────── +# podman build -t xchplot2:cpu \ +# --build-arg BASE_DEVEL=docker.io/ubuntu:24.04 \ +# --build-arg BASE_RUNTIME=docker.io/ubuntu:24.04 \ +# --build-arg ACPP_TARGETS=omp \ +# --build-arg XCHPLOT2_BUILD_CUDA=OFF \ +# --build-arg INSTALL_CUDA_HEADERS=1 \ +# . +# podman run --rm -v $PWD/plots:/out xchplot2:cpu plot -k 28 -n 1 ... +# No GPU needed at build or runtime. Plotting is 1-2 orders of magnitude +# slower than GPU — useful for headless CI / dev machines without a GPU. +# +# First build pulls + builds AdaptiveCpp from source — expect 10-30 min. +# Subsequent rebuilds reuse the cached AdaptiveCpp layer. 
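+#
+# Smoke-test a freshly built image by running one of the bundled parity
+# binaries instead of the plotter (illustrative, shown with no arguments;
+# ROCm device flags from the AMD example above — for the cuda image use the
+# `--device nvidia.com/gpu=all` CDI flag instead):
+#   podman run --rm --device /dev/kfd --device /dev/dri --group-add video \
+#     --entrypoint /usr/local/bin/sycl_sort_parity xchplot2:rocm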
+ +# BASE_RUNTIME defaults to the devel image because AdaptiveCpp's SSCP +# (LLVM "generic" target) JIT-assembles PTX at runtime via ptxas, which +# only ships in the CUDA *devel* image. The slim runtime image lacks it +# and produces "Code object construction failed". Override with a slim +# image only if you've switched ACPP_TARGETS to AOT (e.g. cuda:sm_89). +ARG BASE_DEVEL=docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04 +ARG BASE_RUNTIME=docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04 +ARG ACPP_REF=v25.10.0 +ARG ACPP_TARGETS= +ARG XCHPLOT2_BUILD_CUDA=ON +ARG INSTALL_CUDA_HEADERS=0 +ARG CUDA_ARCH=89 +# LLVM/clang root used to build AdaptiveCpp. Pinned to Ubuntu's llvm-18 +# for every compose service (cuda / rocm / intel / cpu) — none of them +# override these args. The HIP-backend version match-up happens at +# *runtime*, not build-time: ROCm 6.2's bundled clang at /opt/rocm/llvm +# ships LLVM 18.0git, so its device bitcode (ocml.bc, ockl.bc) is +# ABI-compatible with the libacpp-rt that AdaptiveCpp linked against +# Ubuntu's llvm-18. ROCm 7.x dropped LLVMConfig.cmake from its rocm-llvm +# package, which is why compose.yaml's rocm service pins BASE to 6.2. +# LLVM_CMAKE_DIR points at the dir containing LLVMConfig.cmake. +ARG LLVM_ROOT=/usr/lib/llvm-18 +ARG LLVM_CMAKE_DIR=/usr/lib/llvm-18/cmake + +# ─── builder ──────────────────────────────────────────────────────────────── +FROM ${BASE_DEVEL} AS builder + +ARG ACPP_REF +ARG ACPP_TARGETS +ARG XCHPLOT2_BUILD_CUDA +ARG INSTALL_CUDA_HEADERS +ARG CUDA_ARCH +ARG LLVM_ROOT +ARG LLVM_CMAKE_DIR + +ENV DEBIAN_FRONTEND=noninteractive + +# Common toolchain. AdaptiveCpp 25.10 wants LLVM ≥ 16 + clang + libclang; +# Ubuntu 24.04 ships llvm-18. Boost.Context, libnuma, libomp are AdaptiveCpp +# runtime deps. INSTALL_CUDA_HEADERS=1 pulls the CUDA Toolkit *headers* on +# non-NVIDIA bases — required because AdaptiveCpp's libkernel/half.hpp +# transitively includes cuda_fp16.h on every build path. +RUN apt-get update && apt-get install -y --no-install-recommends \ + cmake git ninja-build build-essential python3 pkg-config \ + curl ca-certificates \ + libboost-context-dev libnuma-dev \ + && if [ "${LLVM_ROOT}" = "/usr/lib/llvm-18" ]; then \ + apt-get install -y --no-install-recommends \ + llvm-18 llvm-18-dev clang-18 libclang-18-dev libclang-cpp18-dev \ + lld-18 libomp-18-dev; \ + fi \ + && if [ "${INSTALL_CUDA_HEADERS}" = "1" ]; then \ + apt-get install -y --no-install-recommends nvidia-cuda-toolkit-headers \ + || apt-get install -y --no-install-recommends nvidia-cuda-toolkit; \ + fi \ + && rm -rf /var/lib/apt/lists/* + +# AdaptiveCpp's HIP backend invokes a clang driver that expects +# clang-offload-bundler in its own bin dir (clang looks for helper tools +# next to itself). On ROCm 6.2-complete images /opt/rocm/llvm/bin is +# missing that one binary even though clang-18 itself is there. Ubuntu's +# llvm-18 ships the bundler; both LLVMs are 18-series so the format is +# compatible. +# +# Because we don't know up-front which clang++ AdaptiveCpp will pick +# (ROCm's /opt/rocm/llvm/bin/clang++, Ubuntu's /usr/lib/llvm-18/bin/ +# clang++, or the /usr/bin shim), symlink the bundler into every clang +# bin dir we can find. Cheap, belt-and-braces, no per-base-image logic. 
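+# (Verification sketch, not run here: once this layer exists,
+#  `command -v clang-offload-bundler` in the builder stage should resolve
+#  regardless of which clang ends up first on PATH.)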
+RUN set -eux; \ + echo "=== clang-offload-bundler discovery ==="; \ + find / -xdev -name 'clang-offload-bundler*' -executable -type f 2>/dev/null | head -20 || true; \ + BUNDLER=""; \ + for c in /usr/lib/llvm-18/bin/clang-offload-bundler \ + /opt/rocm/llvm/bin/clang-offload-bundler \ + /usr/bin/clang-offload-bundler-18 \ + /usr/bin/clang-offload-bundler; do \ + if [ -x "$c" ]; then BUNDLER="$c"; break; fi; \ + done; \ + if [ -z "$BUNDLER" ]; then \ + BUNDLER=$(find / -xdev -name clang-offload-bundler -executable -type f 2>/dev/null | head -1 || true); \ + fi; \ + echo "=== bundler resolved to: ${BUNDLER:-} ==="; \ + if [ -n "$BUNDLER" ]; then \ + for d in /opt/rocm/llvm/bin /opt/rocm/bin /usr/lib/llvm-18/bin /usr/bin; do \ + [ -d "$d" ] || continue; \ + [ -e "$d/clang-offload-bundler" ] && continue; \ + ln -sf "$BUNDLER" "$d/clang-offload-bundler"; \ + echo "linked -> $d/clang-offload-bundler"; \ + done; \ + fi + +# Rust toolchain (for keygen-rs and the `cargo install` entry point). +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --default-toolchain stable --profile minimal +ENV PATH=/root/.cargo/bin:${PATH} + +# AdaptiveCpp from source, pinned. Installs to /opt/adaptivecpp. +RUN git clone --depth 1 --branch ${ACPP_REF} \ + https://github.com/AdaptiveCpp/AdaptiveCpp.git /tmp/acpp-src \ + && cmake -S /tmp/acpp-src -B /tmp/acpp-build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/opt/adaptivecpp \ + -DCMAKE_C_COMPILER=${LLVM_ROOT}/bin/clang \ + -DCMAKE_CXX_COMPILER=${LLVM_ROOT}/bin/clang++ \ + -DLLVM_DIR=${LLVM_CMAKE_DIR} \ + -DACPP_LLD_PATH=${LLVM_ROOT}/bin/ld.lld \ + && cmake --build /tmp/acpp-build --parallel \ + && cmake --install /tmp/acpp-build \ + && echo "=== AdaptiveCpp LLVM linkage ===" \ + && (ldd /opt/adaptivecpp/lib/libacpp-rt.so | grep -iE "llvm|libomp" || true) \ + && (ldd /opt/adaptivecpp/lib/libacpp-common.so | grep -iE "llvm|libomp" || true) \ + && rm -rf /tmp/acpp-src /tmp/acpp-build + +ENV CMAKE_PREFIX_PATH=/opt/adaptivecpp:${CMAKE_PREFIX_PATH} +ENV PATH=/opt/adaptivecpp/bin:${PATH} + +WORKDIR /xchplot2 +COPY . . + +# Build xchplot2. CUDA_ARCHITECTURES + ACPP_TARGETS + XCHPLOT2_BUILD_CUDA +# get picked up by build.rs; the latter switches the CMake source set +# between the CUB-using TUs (.cu files via nvcc) and the SYCL-only path. +RUN CUDA_ARCHITECTURES=${CUDA_ARCH} \ + ACPP_TARGETS=${ACPP_TARGETS} \ + XCHPLOT2_BUILD_CUDA=${XCHPLOT2_BUILD_CUDA} \ + cargo install --path . --root /usr/local --locked + +# Also build the parity tests via plain CMake so they're available +# inside the container for first-port validation on new GPUs (especially +# AMD/Intel). Reuses the static libs cargo install just built. +RUN cmake -S . 
-B build-tests -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} \ + -DACPP_TARGETS=${ACPP_TARGETS} \ + -DXCHPLOT2_BUILD_CUDA=${XCHPLOT2_BUILD_CUDA} \ + && cmake --build build-tests --parallel --target sycl_sort_parity \ + sycl_bucket_offsets_parity \ + sycl_g_x_parity \ + plot_file_parity \ + && install -m 0755 build-tests/tools/parity/sycl_sort_parity /usr/local/bin/ \ + && install -m 0755 build-tests/tools/parity/sycl_bucket_offsets_parity /usr/local/bin/ \ + && install -m 0755 build-tests/tools/parity/sycl_g_x_parity /usr/local/bin/ \ + && install -m 0755 build-tests/tools/parity/plot_file_parity /usr/local/bin/ \ + && rm -rf build-tests target + +# ─── runtime ──────────────────────────────────────────────────────────────── +FROM ${BASE_RUNTIME} + +ENV DEBIAN_FRONTEND=noninteractive + +# AdaptiveCpp's runtime backend loaders dlopen libLLVM (for SSCP runtime +# specialization), libnuma (OMP backend), libomp, and Boost.Context. +# SSCP also shells out to LLVM's `opt` and `llc` binaries at runtime to +# generate PTX from the SSCP bitcode — install the full llvm-18 package +# (binaries + lib), not just libllvm18. +RUN apt-get update && apt-get install -y --no-install-recommends \ + llvm-18 lld-18 libnuma1 libomp5-18 libboost-context1.83.0 \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/local/bin/xchplot2 /usr/local/bin/xchplot2 +COPY --from=builder /usr/local/bin/sycl_sort_parity /usr/local/bin/sycl_sort_parity +COPY --from=builder /usr/local/bin/sycl_bucket_offsets_parity /usr/local/bin/sycl_bucket_offsets_parity +COPY --from=builder /usr/local/bin/sycl_g_x_parity /usr/local/bin/sycl_g_x_parity +COPY --from=builder /usr/local/bin/plot_file_parity /usr/local/bin/plot_file_parity +COPY --from=builder /opt/adaptivecpp /opt/adaptivecpp + +ENV LD_LIBRARY_PATH=/opt/adaptivecpp/lib:${LD_LIBRARY_PATH} +ENV PATH=/opt/adaptivecpp/bin:${PATH} + +ENTRYPOINT ["/usr/local/bin/xchplot2"] +CMD ["--help"] diff --git a/NOTICE b/NOTICE index c203f35..3ffbead 100644 --- a/NOTICE +++ b/NOTICE @@ -49,11 +49,40 @@ FSE (Finite State Entropy) Vendored upstream by pos2-chip at lib/fse/ and statically linked into xchplot2. Provides the entropy-coding step of v2 plot file compression. ================================================================================ +AdaptiveCpp (formerly hipSYCL) + https://github.com/AdaptiveCpp/AdaptiveCpp + Copyright (c) The AdaptiveCpp Contributors + Licensed under the BSD 2-Clause "Simplified" License. + + SYCL implementation. Statically linked at build time (libacpp-rt and + friends) for the cross-vendor SYCL kernel path. Pulled in via + find_package(AdaptiveCpp) from /opt/adaptivecpp (the install-deps.sh + default) or via CMake FetchContent at v25.10.0. +================================================================================ NVIDIA CUDA Toolkit (runtime + CUB) Used at build time and dynamically at run time. Subject to the NVIDIA CUDA Toolkit End User License Agreement (https://docs.nvidia.com/cuda/eula/). ================================================================================ +AMD ROCm / HIP + https://github.com/ROCm/ROCm + Copyright (c) Advanced Micro Devices, Inc. + + Used at build time (HIP toolchain) and dynamically at run time on + AMD builds. Components are licensed per-package — primarily MIT and + University of Illinois/NCSA Open Source — see the per-component + LICENSE files in each ROCm subproject. 
+================================================================================ +Intel oneAPI / Level Zero + https://github.com/oneapi-src + Copyright (c) Intel Corporation + + Used at build time and dynamically at run time on Intel SYCL builds + (currently wired up but untested — no Intel GPU in our test matrix). + Components are licensed per-package: Apache-2.0 with LLVM exception + for the DPC++ compiler, MIT for the Level Zero loader, and the Intel + oneAPI End User License Agreement for the proprietary toolkit pieces. +================================================================================ Full license texts for each Apache-2.0 component are reproduced in their respective upstream source trees, which CMake FetchContent / cargo will diff --git a/README.md b/README.md index 7f73683..ff7d7a1 100644 --- a/README.md +++ b/README.md @@ -4,36 +4,394 @@ GPU plotter for Chia v2 proofs of space (CHIP-48). Produces farmable `.plot2` files byte-identical to the [pos2-chip](https://github.com/Chia-Network/pos2-chip) CPU reference. -## Performance +> **Status — work in progress.** Plots are byte-identical to the +> pos2-chip CPU reference and deterministic across runs; performance, +> AMD/Intel support, and the install/CI story are still evolving. Use +> [`cuda-only`](https://github.com/Jsewill/xchplot2/tree/cuda-only) for +> the most-tested path. -k=28, strength=2, RTX 4090 (sm_89), PCIe Gen4 x16: +> **Branches:** `main` — SYCL/AdaptiveCpp port, runs on NVIDIA + +> AMD + Intel (CUB fast path preserved on NVIDIA). +> [`cuda-only`](https://github.com/Jsewill/xchplot2/tree/cuda-only) — +> original pure-CUDA path, pick it if you only target NVIDIA. See +> [Performance](#performance) for the tradeoff. -| Mode | Per plot | -|---|---| -| pos2-chip CPU baseline | ~50 s | -| `xchplot2 batch` steady-state wall | **2.06 s** | -| Producer GPU time, steady-state | 1.96 s | -| Device-kernel floor (single-plot nsys) | 1.91 s | +## Quick start + +```bash +# Install — needs CUDA Toolkit 12+ (or AdaptiveCpp for AMD/Intel), +# CMake ≥ 3.24, a C++20 compiler, and Rust. See Build for alternatives. +cargo install --git https://github.com/Jsewill/xchplot2 -A physically narrower PCIe slot (e.g. Gen4 x4) adds ~240 ms per plot to -the final fragment D2H copy. Check `cat /sys/bus/pci/devices/*/current_link_width` -under load if numbers look off by that much. +# Plot — 10 × k=28 files, keys derived internally from your BLS pair. +xchplot2 plot -k 28 -n 10 \ + -f \ + -c \ + -o /mnt/plots + +# Multi-GPU — one worker per GPU, round-robin partition. +# (`--devices all` adds a CPU worker too; `--devices gpu` sticks to GPUs.) +xchplot2 plot ... --devices gpu +``` + +See [Hardware compatibility](#hardware-compatibility) for GPU / VRAM +/ OS requirements, [Build](#build) for container / native / CMake +paths, and [Use](#use) for every flag. +**Windows users**: this `cargo install` line works under WSL2; for +native Windows or a non-WSL setup, jump to [Windows](#windows). + +## Hardware compatibility + +- **GPU:** + - **NVIDIA**, compute capability ≥ 5.0 (Maxwell / GTX 750-class + and newer) via the CUDA fast path. Builds auto-detect the + installed GPU's `compute_cap` via `nvidia-smi`; override with + `$CUDA_ARCHITECTURES` for fat or cross-target builds (see + [Build](#build)). Pre-sm_53 cards lack native FP16 ALUs, but + `cuda_fp16.h` falls back to fp32 emulation for the half-precision + intrinsics — kernels work correctly with the emulation cost. + On dual-vendor hosts (e.g. 
AMD primary + secondary NVIDIA), + `build.rs` also routes around CUDA 13.x + sm < 75 (the toolkit + dropped Maxwell-Volta codegen) so an old NVIDIA card next to a + working AMD GPU no longer derails the build. + - **AMD ROCm** via the SYCL / AdaptiveCpp path. Validated on RDNA2 + (`gfx1031`, RX 6700 XT, 12 GB) — bit-exact parity with the CUDA + backend across the sort / bucket-offsets / g_x kernels, and + farmable plots end-to-end. ROCm 6.2 required (newer ROCm versions + have LLVM packaging breakage — see [`compose.yaml`](compose.yaml) + rocm-service comments). Build picks `ACPP_TARGETS=hip:gfxXXXX` + from `rocminfo` automatically for RDNA2+. Other gfx targets + (`gfx1030` / `gfx1100`) build cleanly but are untested on real + hardware. **RDNA1 cards (`gfx1010`/`gfx1011`/`gfx1012`, e.g. + Radeon Pro W5700, RX 5700 / 5700 XT)** default to + `ACPP_TARGETS=generic` (SSCP JIT) — a previous community + workaround AOT-spoofed them as `gfx1013`, but that has been + observed to silently produce no-op kernel stubs on at least one + W5700 + ROCm 6 + AdaptiveCpp 25.10 setup. Generic SSCP works + end-to-end through k=24 parity tests. Two opt-in escape hatches + preserved: `XCHPLOT2_FORCE_GFX_SPOOF=1` to restore the legacy + AOT spoof, `XCHPLOT2_NO_GFX_SPOOF=1` to AOT-target the actual + ISA natively (build will fail clearly if AdaptiveCpp doesn't + accept it). + - **Intel oneAPI** is wired up but untested. + - **CPU** (no GPU) via AdaptiveCpp's OpenMP backend. Opt-in with + `--cpu` (or `--devices cpu`) — never the default. Plotting is + 1-2 orders of magnitude slower than a real GPU; intended for + headless CI, GPU-less dev machines, or as an extra worker + alongside GPUs (`--devices all` runs every visible GPU plus a + CPU worker on the same batch; `--devices gpu` sticks to GPUs). Build the container with + `scripts/build-container.sh --gpu cpu` for the standalone CPU + image (`xchplot2:cpu`, ~400 MB; no CUDA / ROCm in the image). +- **VRAM:** four tiers, picked automatically based on free device + VRAM at k=28. All four produce byte-identical plots. + - **Pool** (~11 GB device + ~4 GB pinned host): fastest steady-state, + used on 12 GB+ cards. + - **Plain streaming** (~7.3 GB peak + 128 MB margin): per-plot + allocations, no pinned-host parks, single-pass T2 match. ~400 ms/ + plot faster than compact. Used on 10-11 GB cards that can't fit + the pool but have headroom above compact. + - **Compact streaming** (~5.2 GB peak + 128 MB margin): full + park/rehydrate + N=2 T2 match tiling. Used on 6-8 GB cards where + plain won't fit. 6 GB cards (RTX 2060, RX 6600) are on the edge; + 8 GB cards (3070, 2070 Super) comfortably fit. + - **Minimal streaming** (~3.76 GB peak + 128 MB margin): six layered + cuts on top of compact — N=8 T2 match staging, tiled gathers in + T1/T2 sort, sliced T1 match (per section_l), sliced T3 match + (T2 inputs parked on host, slice H2D'd per section pair), + per-tile CUB outputs in T1/T2/T3 sort with USM-host merges, and + tiled Xs gen+sort+pack with host-pinned accumulation. Bottleneck + moves from compact's T1 sort (5200 MB) to T3 match (3754 MB). + Targets 5 GiB+ cards (RTX 2060, RX 6600 XT, RX 7600) comfortably; + 4 GiB cards (GTX 1050 Ti, RTX 3050 4GB, MX450) are an edge case + since real 4 GiB hardware reports ~3.5 GiB free post-CUDA-context. + Trade-off: ~6 extra cap-sized PCIe round-trips per plot. k=28 + wall on sm_89: ~34 s/plot vs ~13 s for compact. Detailed + breakdown in [VRAM](#vram). 
+ + With [`--devices`](#multi-gpu---devices), each worker picks its own + tier from its own GPU's free VRAM — heterogeneous rigs (e.g. one + 12 GB + one 8 GB card) plot concurrently with each device on its + matching tier. +- **PCIe:** Gen4 x16 or wider recommended. A physically narrower slot + (e.g. Gen4 x4) adds ~240 ms per plot to the final fragment D2H + copy; check `cat /sys/bus/pci/devices/*/current_link_width` + under load if throughput looks off. +- **Host RAM:** ≥ 16 GB recommended; `batch` mode pins ~4 GB of host + memory for D2H double-buffering (pool or streaming). +- **CUDA Toolkit:** 12+ required for the NVIDIA build path (tested on + 13.x). Skipped automatically on AMD/Intel builds where `nvcc` isn't + available — `build.rs` runs `nvcc --version` and flips + `XCHPLOT2_BUILD_CUDA=OFF` when missing. The toolkit-vs-arch matrix: + - `sm_50` – `sm_72` (Maxwell / Pascal / Volta): need CUDA **12.9** + (last toolkit with codegen for these arches — 13.x dropped them + entirely). `build.rs` catches the 13.x + old-arch pairing in a + preflight and points at the fix path. + - `sm_75` – `sm_90` (Turing / Ampere / Hopper): 12.x or 13.x both + work. + - `sm_120` (RTX 50-series Blackwell): need 12.8+; earlier toolkits + lack Blackwell codegen. +- **OS:** Linux (tested on modern glibc distributions) is the supported + path. Windows users route through either the `cuda-only` branch + natively (NVIDIA + MSVC + CUDA) or WSL2 (any vendor WSL2 supports) + — see [Windows](#windows) below. macOS is not supported (no CUDA, + no modern SYCL runtime). ## Build -Requires CUDA Toolkit 12+ (tested on 13.x), C++20 host compiler, CMake -≥ 3.24, and a Rust toolchain (for `keygen-rs`). +### Which path should I use? + +- **"I just want to plot, Linux host"** → **container (path 1)**. Smallest + host install (just `podman` + `podman-compose` + the GPU passthrough + bits — `scripts/install-container-deps.sh` installs all of it). All + toolchain lives inside the image. Auto-detects your GPU and pins the + right CUDA / ROCm base. +- **"NVIDIA only, native binary, no SYCL/AdaptiveCpp"** → **`cuda-only` + branch (path 2)**. Three host packages — `cmake` + `build-essential` + + the CUDA Toolkit. No LLVM/lld/AdaptiveCpp install. Smaller dep + surface than main; same end result for NVIDIA users. +- **"Full build — AMD / Intel / CPU support, parity tests on the host"** + → **`install-deps.sh` (path 3)**. Auto-installs cmake, lld, LLVM 18, + AdaptiveCpp from source. ~30-45 min first-time setup. + +Three ways to get the dependencies in place, easiest first: + +### 1. Container (`podman compose` or `docker compose`) + +Easiest path — `scripts/build-container.sh` does host-side GPU +probing and feeds the right env vars to `compose build`. If you're +starting from a fresh host, `scripts/install-container-deps.sh` +installs the engine + GPU passthrough bits first (podman + GPU probe ++ `nvidia-container-toolkit` / video-render groups, as appropriate; +no native CUDA / ROCm / LLVM / AdaptiveCpp on the host): + +```bash +./scripts/install-container-deps.sh # one-time: engine + GPU passthrough +./scripts/build-container.sh # auto: nvidia-smi → cuda, rocminfo → rocm +podman compose run --rm cuda plot -k 28 -n 10 -f -c -o /out +``` + +**The script handles a handful of host-side decisions that bare +`podman compose build` can't:** + +- **Vendor pick** (cuda / rocm / intel / cpu) from nvidia-smi / + rocminfo, or `--gpu cpu` to force CPU. +- **Multi-GPU fat binary** (e.g. 
`CUDA_ARCH="61;86"` on a + 1070+3060 rig) — compose alone defaults to a single arch. +- **Pascal/Volta auto-pin** to `nvidia/cuda:12.9.1-devel-ubuntu24.04` + when min arch < 75. CUDA 13 dropped sub-Turing codegen, so a Pascal + user without this pin hits a build-time `Unsupported gpu + architecture 'compute_61'` error inside the container. +- **AMD `ACPP_GFX` extract** from rocminfo + the RDNA1 (gfx1010 → + gfx1013) workaround for Radeon Pro W5700. +- **`--no-cache`** pass-through to force a clean rebuild after a + toolchain bump. + +You CAN run `podman compose build` directly — it just means setting +those env vars yourself. The compose YAML's defaults are conservative +(CUDA 13.0, sm_89, no AMD target without `ACPP_GFX`), so plain +`podman compose build cuda` only "just works" on Turing-or-newer +NVIDIA hosts. Anything else needs the script or the equivalent +manual env: + +[`compose.yaml`](compose.yaml) defines four vendor-specific services +sharing one [`Containerfile`](Containerfile); the script just runs +`compose build` against whichever matches your hardware. Override +manually if you prefer: + +```bash +# NVIDIA (default sm_89; override via $CUDA_ARCH=120 etc.) +podman compose build cuda + +# AMD ROCm — set $ACPP_GFX from `rocminfo | grep gfx`. +ACPP_GFX=gfx1031 podman compose build rocm # Navi 22 +ACPP_GFX=gfx1100 podman compose build rocm # Navi 31 (default) + +# Intel oneAPI (experimental, untested). +podman compose build intel + +# CPU-only (no GPU; AdaptiveCpp OpenMP backend; ~400 MB image). +# Plotting is 1-2 orders of magnitude slower than GPU — see CPU bullet +# under Hardware compatibility for the use case. +podman compose build cpu +``` + +Plot files land in `./plots/` on the host. The container also bundles +the parity tests (`sycl_sort_parity`, `sycl_g_x_parity`, etc.) under +`/usr/local/bin/` for quick first-port validation on a new GPU: + +```bash +podman compose run --rm --entrypoint /usr/local/bin/sycl_sort_parity rocm +``` + +First build is ~15-30 min (AdaptiveCpp + LLVM 18 compile from source); +subsequent rebuilds reuse the cached layers. GPU performance inside +the container is identical to native — kernels run on real hardware +via the engine's GPU pass-through: + +- **NVIDIA**: requires `nvidia-container-toolkit` on the host. For + Docker users, also run once after install: + ```bash + sudo apt install nvidia-container-toolkit + sudo nvidia-ctk runtime configure --runtime=docker + sudo systemctl restart docker + ``` + Podman 5.x with CDI works without the runtime-configure step. +- **AMD**: `/dev/kfd` + `/dev/dri` device files. The compose `rocm` + service handles this automatically; for bare `podman/docker run` + pass `--device /dev/kfd --device /dev/dri --group-add video`. + +#### AMD container — sudo, `--privileged`, and `ACPP_GFX` + +AMD GPUs need three pieces of friction handled correctly. None are +optional on most hosts, and getting any one wrong tends to fail +silently or in confusing ways: + +1. **`ACPP_GFX` must be set** to your GPU's gfx target. The kernels + are AOT-compiled for a specific amdgcn ISA at build time. If the + wrong arch is baked in, HIP loads the fatbinary without complaint + but the kernels execute as silent no-ops at runtime — sort returns + input unchanged, AES match finds zero matches, plots look valid + but contain non-canonical proofs that won't qualify against real + challenges. 
`compose.yaml` defaults `ACPP_GFX` to a placeholder + string that AdaptiveCpp's HIP backend rejects loudly at build + time, so an unset value fails fast with the placeholder visible + in the error rather than silently using a default like `gfx1100`. + Common values (`rocminfo | grep gfx` to confirm yours): + + - `gfx1030` — RDNA2 Navi 21 (RX 6800 / 6800 XT / 6900 XT) + - `gfx1031` — RDNA2 Navi 22 (RX 6700 XT / 6700 / 6800M) + - `gfx1100` — RDNA3 Navi 31 (RX 7900 XTX / XT) + - `gfx1101` — RDNA3 Navi 32 (RX 7800 XT / 7700 XT) + +2. **Rootful `--privileged` for runs.** Rootless podman's default + seccomp filter + capability set blocks some of the KFD ioctls + `libhsa-runtime64` needs during DMA setup. Without them you get + a segfault deep inside the HSA runtime on the very first + host→device copy, even though `rocminfo` works fine. Builds don't + need GPU access and can stay rootless if you prefer. + +3. **`sudo` strips environment variables by default**, including + the `ACPP_GFX` you set in your shell. So a bare + `sudo podman compose build rocm` loses it. Either invoke the + build script (it sets the var inside the sudo'd shell where + compose can see it) or pass the var through explicitly. + +The recommended invocation pair, in order of how short each one is: + +```bash +# Build (autodetects ACPP_GFX from rocminfo — works under sudo too): +sudo ./scripts/build-container.sh + +# Run a single test plot at k=22: +sudo podman run --rm --privileged \ + --device /dev/kfd --device /dev/dri \ + -v $PWD/plots:/out xchplot2:rocm \ + test 22 2 0 0 -G -o /out + +# Run real plotting: +sudo podman run --rm --privileged \ + --device /dev/kfd --device /dev/dri \ + -v $PWD/plots:/out xchplot2:rocm \ + plot -k 28 -n 10 -f -c -o /out +``` + +If `sudo` doesn't carry `/opt/rocm/bin` on your distro and the build +script can't find `rocminfo`, fall back to one of: + +```bash +sudo -E ./scripts/build-container.sh # preserve your shell PATH +sudo ACPP_GFX=gfx1031 ./scripts/build-container.sh # explicit, no rocminfo needed +``` + +Or skip the script entirely: + +```bash +sudo ACPP_GFX=gfx1031 podman compose build rocm +``` + +For convenience, drop a wrapper at `~/.local/bin/xchplot2-amd`: + +```bash +#!/bin/bash +exec sudo podman run --rm --privileged \ + --device /dev/kfd --device /dev/dri \ + -v "$PWD/plots:/out" xchplot2:rocm "$@" +``` + +Then `xchplot2-amd plot -k 28 -n 10 -f ... -c ... -o /out` just works. + +### 2. Native install via `scripts/install-deps.sh` + +```bash +./scripts/install-deps.sh # auto-detects distro + GPU vendor +``` + +Installs the toolchain via the system package manager (Arch, Ubuntu / +Debian, Fedora) plus AdaptiveCpp from source into `/opt/adaptivecpp`. +GPU vendor is auto-detected: `nvidia-smi` / `rocminfo` first, +`/sys/class/drm` PCI IDs as fallback (so fresh installs without driver +tools still work). On a no-GPU host (CI / build box) the script +errors out — pass `--gpu nvidia` to install the toolchain anyway. +`--gpu amd` forces the AMD path on dual-vendor hosts. Intel detection +currently errors with a hint pointing at `--gpu nvidia` (the SYCL +toolchain JITs onto Intel via AdaptiveCpp's generic SSCP target) or +the container. Pass `--no-acpp` to skip the AdaptiveCpp build and +let CMake fall back to FetchContent. + +### 3. Manual / FetchContent fallback + +If you'd rather install dependencies yourself, the toolchain is: + +| Dep | Notes | +|---|---| +| **AdaptiveCpp 25.10+** | SYCL implementation. 
CMake auto-fetches it via FetchContent if `find_package(AdaptiveCpp)` fails — first build adds ~15-30 min. Disable with `-DXCHPLOT2_FETCH_ADAPTIVECPP=OFF` if you want a hard error. | +| **CUDA Toolkit 12+** (headers) | Required on **every** build path because AdaptiveCpp's `half.hpp` includes `cuda_fp16.h`. `nvcc` itself only runs when `XCHPLOT2_BUILD_CUDA=ON`. Default is vendor-aware — `ON` for NVIDIA GPUs, `OFF` for AMD / Intel GPUs (even if `nvcc` is installed), falling through to `nvcc`-presence only when no GPU is probed (CI / container). Override with the env var. | +| **LLVM / Clang ≥ 18** | `clang`, `lld` (AdaptiveCpp's CMake requires `ld.lld`), plus the libclang dev packages. `install-deps.sh` installs all of them; manual installs need to add `lld-18` (apt) / `lld` (dnf, pacman) explicitly. | +| **C++20 compiler** | clang ≥ 18 or gcc ≥ 13. | +| **CMake ≥ 3.24**, **Ninja**, **Python 3** | build tools. | +| **Boost.Context, libnuma, libomp** | AdaptiveCpp runtime deps. | +| **Rust toolchain** (stable) | for `keygen-rs` and `cargo install`. | + +`pos2-chip` and `FSE` are auto-fetched at CMake configure time +(`FetchContent`); override `-DPOS2_CHIP_DIR=/abs/path` for a local +checkout. + +For non-NVIDIA targets, the build also probes: +- **ROCm 6+** (`rocminfo`): if found, sets `ACPP_TARGETS=hip:gfxXXXX`. +- **Intel oneAPI** (Level Zero / compute-runtime): manual `ACPP_TARGETS`. ### `cargo install` ```bash -cargo install --git https://github.com/Chia-Network/xchplot2 -# or fat build: -CUDA_ARCHITECTURES="89;120" cargo install --git https://github.com/Chia-Network/xchplot2 +cargo install --git https://github.com/Jsewill/xchplot2 ``` -`build.rs` auto-detects the local GPU's compute capability via -`nvidia-smi` (falling back to `sm_89`). Override with `$CUDA_ARCHITECTURES`. +`build.rs` auto-detects the local GPU's compute capability by querying +`nvidia-smi --query-gpu=compute_cap` and builds for only that +architecture. That keeps the binary small and the build fast when the +install and the target GPU are the same machine. + +If auto-detection fails (no `nvidia-smi` in `PATH`, or +`nvidia-smi` can't see a GPU — common when building inside a container +or on a headless build host that lacks the CUDA driver), the build +falls back to `sm_89`. Note that arch-detect picks *which CUDA arch* — +*whether* CUDA TUs build at all is a separate vendor-aware decision +(see `XCHPLOT2_BUILD_CUDA` in [Environment variables](#environment-variables)). + +If you need to target a GPU that isn't the one doing the build — or if +you want a single "fat build" binary that covers multiple +architectures — override with `$CUDA_ARCHITECTURES`: + +```bash +# Fat build for Ada (4090) and Blackwell (5090): +CUDA_ARCHITECTURES="89;120" cargo install --git https://github.com/Jsewill/xchplot2 + +# Single target (e.g. Turing 2080 Ti): +CUDA_ARCHITECTURES=75 cargo install --git https://github.com/Jsewill/xchplot2 +``` + +Common values: `61` GTX 10-series, `70` Volta, `75` Turing, `80` A100, +`86` RTX 30-series, `89` RTX 40-series, `90` H100, `120` RTX 50-series. ### CMake (also builds the parity tests) @@ -50,6 +408,188 @@ Outputs: - `build/tools/xchplot2/xchplot2` - `build/tools/parity/{aes,xs,t1,t2,t3}_parity` — bit-exact CPU/GPU tests +### Windows + +Two supported paths — native `main` doesn't work because AdaptiveCpp +has hard Linux-isms (libnuma, pthreads, LLVM SSCP) that fall apart on +Windows. 
Jump to the relevant subsection below: + +- [Native Windows build (`cuda-only` branch)](#native-windows-build-cuda-only-branch) — recommended NVIDIA path. +- [Native Windows build — SYCL path (adventurous)](#native-windows-build--sycl-path-adventurous) — AMD/Intel/cross-vendor, untested. + +**NVIDIA only** → use the +[`cuda-only`](https://github.com/Jsewill/xchplot2/tree/cuda-only) +branch. Pure MSVC + CUDA Toolkit + Rust, no SYCL runtime involved. +See that branch's README for the VS 2022 / Windows SDK / `LIB` +troubleshooting (the `LNK1181: kernel32.lib` and friends). + +**AMD or Intel, or if you just want the `main` code path** → run +under **WSL2**. WSL2 is a full Linux environment, so every install +option in this README works there unchanged — `cargo install`, +`scripts/install-deps.sh`, or the container (section 1 above). +Enable WSL2 once with `wsl --install` in an elevated PowerShell. +GPU access in WSL2: + +- **NVIDIA**: install the latest "NVIDIA GPU Driver for Windows", + nothing else — CUDA shows up inside WSL2 automatically. +- **AMD**: ROCm 6.1+ supports a limited card list on WSL2 (RX 7900 + XTX, Radeon Pro W7900, specific Instincts). Follow AMD's "Install + ROCm on WSL" guide. +- **Intel**: oneAPI on WSL2 via the Intel Linux graphics driver. + +Once the GPU is visible from a WSL2 shell (`nvidia-smi`, `rocminfo`, +or `sycl-ls`), proceed with the native Linux instructions above. + +#### Native Windows build (cuda-only branch) + +Full walkthrough for the NVIDIA native path, repeated here so you +don't have to flip between READMEs. Prerequisites: + +- Windows 10 21H2+ or Windows 11, x64 +- [Visual Studio 2022](https://visualstudio.microsoft.com/) Community + with the **"Desktop development with C++"** workload. That workload + bundles MSVC + the Windows SDK; the SDK is non-optional because it + ships `kernel32.lib` / `user32.lib` / etc. that `link.exe` + consumes. If you've trimmed the installer to "C++ build tools" + only, open **Visual Studio Installer → Modify → Individual + components** and tick the latest **Windows 11 SDK** before + retrying. +- [CUDA Toolkit 12.0+](https://developer.nvidia.com/cuda-downloads) — + install **after** Visual Studio so the CUDA installer wires up the + MSBuild integration. 12.8+ required for RTX 50-series (Blackwell, + `sm_120`). +- [Rust](https://www.rust-lang.org/tools/install) using the MSVC + toolchain (`rustup default stable-x86_64-pc-windows-msvc`). +- [CMake 3.24+](https://cmake.org/download/) and [Git for + Windows](https://gitforwindows.org/). + +Launch the **x64 Native Tools Command Prompt for VS 2022** from the +Start menu — there are several similarly-named prompts (x86 / +x86_64 / 2019 / 2022); the one that matters is the x64 for 2022. +That prompt is the one that sets `LIB`, `INCLUDE`, and `PATH` so +`cl.exe`, `link.exe`, `nvcc`, and `cmake` all see each other plus +the Windows SDK. A plain `cmd` / PowerShell / Windows Terminal tab +does **not** do this — running `cargo install` from one of those +produces `LNK1181: cannot open input file 'kernel32.lib'` at the +first link step. + +Quick sanity check in the prompt: + +```cmd +where link.exe +echo %LIB% +``` + +`%LIB%` should include a `...\Windows Kits\10\Lib\...\um\x64` +entry. If it doesn't, you're in the wrong prompt or the Windows SDK +component isn't installed. 
+ +Build: + +```cmd +set CUDA_ARCHITECTURES=89 +cargo install --git https://github.com/Jsewill/xchplot2 --branch cuda-only +``` + +Or for a local checkout you can iterate on: + +```cmd +git clone -b cuda-only https://github.com/Jsewill/xchplot2 +cd xchplot2 +set CUDA_ARCHITECTURES=89 +cargo install --path . +``` + +Set `CUDA_ARCHITECTURES` to match your card (see the list above). +PowerShell users: use `$env:CUDA_ARCHITECTURES = "89"` instead of +`set`. The CMake path (`cmake -B build -S . && cmake --build build`) +also works inside the same Native Tools prompt if you prefer that +over `cargo install`. + +#### Native Windows build — SYCL path (adventurous) + +**Strongly recommend WSL2 first** (see the top of this section). +This subsection exists because the path is in principle buildable +on native Windows; in practice it's days of build-system tinkering +without hardware the maintainers can iterate on. Not validated by +us. File an issue with your findings. + +What you're signing up for: AdaptiveCpp, built from source on +Windows, pointed at either **AMD HIP SDK for Windows** (for AMD) or +the **CUDA Toolkit** (for NVIDIA through SYCL, if you want the +`main` branch's cross-vendor code path on NVIDIA instead of +`cuda-only`'s CUB one). xchplot2's CMake then finds that install +via `find_package(AdaptiveCpp)` and builds normally. AdaptiveCpp's +FetchContent fallback is **not** viable on native Windows — its own +CMakeLists assumes Linux-isms (libnuma, pthreads) that fall apart. +Pre-install is mandatory. + +Prerequisites (on top of the cuda-only prereqs above — MSVC, +Windows SDK, Rust, CMake, Git): + +- **LLVM 16–20** with Clang + LLD + the CMake development package + (`LLVMConfig.cmake` / `ClangConfig.cmake`). Version coverage of + Windows binary installers is patchy for these components; a + self-built LLVM is usually the path of least resistance. See + [AdaptiveCpp's Windows install guide](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/installing.md) + for the currently-recommended source. +- **AMD HIP SDK for Windows** (for the AMD target) from AMD's + [HIP SDK download page](https://www.amd.com/en/developer/rocm-hub/hip-sdk.html). + AMD officially flags it as preview: limited card list, different + device-library layout vs Linux ROCm, runtime coverage varies per + GPU. +- **CUDA Toolkit 12+** (for the NVIDIA-via-SYCL target). Same + installer as the `cuda-only` path above. + +Rough build sequence from a clean **x64 Native Tools Command Prompt +for VS 2022** (paths are indicative — match your installs): + +```cmd +:: 1. Build AdaptiveCpp +git clone --branch v25.10.0 https://github.com/AdaptiveCpp/AdaptiveCpp.git +cd AdaptiveCpp +cmake -B build -S . -G Ninja ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_INSTALL_PREFIX=C:\opt\adaptivecpp ^ + -DLLVM_DIR=C:\path\to\llvm\lib\cmake\llvm ^ + -DWITH_CUDA_BACKEND=OFF ^ + -DWITH_HIP_BACKEND=ON ^ + -DROCM_PATH="C:\Program Files\AMD\ROCm\6.1" +cmake --build build --parallel +cmake --install build + +:: 2. Build xchplot2 main against the install +cd \path\to\xchplot2 +:: CMAKE_PREFIX_PATH only needed if you installed AdaptiveCpp to a +:: non-default Windows path. The build's auto-discovery only covers +:: Linux's /opt/adaptivecpp — Windows users tell CMake explicitly. +set CMAKE_PREFIX_PATH=C:\opt\adaptivecpp +set ACPP_TARGETS=hip:gfx1101 +set XCHPLOT2_BUILD_CUDA=OFF +cargo install --path . +``` + +Flip `WITH_HIP_BACKEND` ↔ `WITH_CUDA_BACKEND` and set +`ACPP_TARGETS=cuda:sm_XX` for the NVIDIA-through-SYCL variant. 
+ +Failure modes you should expect to triage: + +- **Missing LLVM CMake modules** — source-built LLVM with + `LLVM_INSTALL_UTILS=ON` and the clang / clang-tools-extra + projects enabled is the reliable recipe. +- **Generic SSCP compiler disabled** (`DEFAULT_TARGETS` warning + during AdaptiveCpp configure) — harmless if you set + `ACPP_TARGETS=hip:gfxXXXX` explicitly at xchplot2's configure. +- **`ROCM_PATH` mismatch** — AMD's Windows installer versions the + directory (`C:\Program Files\AMD\ROCm\6.1\`); match it exactly. +- **Clean build, runtime kernel failures** — the HIP SDK for + Windows preview doesn't cover every GPU the Linux ROCm path + does. Run `scripts/test-multi-gpu.sh` / `xchplot2 test 22 ...` + with a k=22 plot first and `xchplot2 verify` the result before + committing a large batch. + +Seriously, try WSL2 first. + ## Use ### Standalone (farmable plots) @@ -65,6 +605,16 @@ Pool variants: `-p ` or `--pool-ph `. Other common flags: `-s `, `-T` testnet, `-S ` for reproducible runs, `-v` verbose. Full help: `xchplot2 -h`. +For long batches, `--skip-existing` skips plots whose output file is +already a complete `.plot2` (magic bytes + non-trivial size), and +`--continue-on-error` logs per-plot failures and keeps going instead of +aborting the whole run. Both flags work for `plot` and `batch` modes. + +Plots are written to `.plot2.partial` and atomically renamed on +completion, so a crash / `SIGINT` / `ENOSPC` mid-write never leaves a +malformed plot at the destination. A first `Ctrl-C` asks the plotter to +finish the plot in flight and stop; a second hard-kills. + #### Grouping plots: `-i ` and `-g ` Both are v2 PoS fields and default to 0. @@ -84,13 +634,171 @@ decisions. When the grouped layout lands, the auto-incrementing `` above is the per-plot within-group identifier it will expect. +#### Multi-device: `--devices` and `--cpu` + +`xchplot2 devices` prints id, name, backend, VRAM, compute-unit count, +and which sort path each device will use (CUB on cuda-backend devices +when this build links CUB, SortSycl otherwise) — the printed `[N]` +index is the value `--devices N` accepts: + +``` +$ xchplot2 devices +Visible devices (2 GPU + 1 CPU): + [0] NVIDIA GeForce RTX 4090 backend=cuda vram=24076 MB CUs=128 sort:CUB + [1] AMD Radeon Pro W5700 backend=hip vram= 8176 MB CUs=36 sort:SYCL + [cpu] Host CPU plotter backend=omp threads=32 sort:SYCL (1-2 orders slower than GPU) + +Use `--devices N` (id) for a specific GPU, + `--devices gpu` for every GPU, + `--devices cpu` for the host CPU only, + `--devices all` for every GPU + CPU, + or any comma combination (e.g. `0,2,cpu`). +``` + +Both `plot` and `batch` accept `--devices ` to fan plots out +across multiple devices — one worker thread per device, each with its +own buffer pool and writer channel. Plots are partitioned round-robin, +so a batch of 10 plots on 2 GPUs sends plots 0/2/4/6/8 to the first +GPU and 1/3/5/7/9 to the second. + +```bash +# Every visible GPU — enumerated at runtime. No CPU worker. +xchplot2 plot --k 28 --num 10 -f -c \ + --out /mnt/plots --devices gpu + +# Every visible GPU PLUS a CPU worker on the same batch. +xchplot2 plot ... --devices all + +# Only these specific GPU ids (sorted, deduplicated). +xchplot2 plot ... --devices 0,2,3 + +# Explicit single id (same as omitting the flag on a single-GPU host). +xchplot2 plot ... --devices 0 + +# CPU-only: AdaptiveCpp OpenMP backend (slow). Use the `cpu` token in +# --devices, or the standalone --cpu flag (equivalent on its own). +xchplot2 plot ... 
--devices cpu +xchplot2 plot ... --cpu + +# Mix tokens: specific GPUs + CPU. +xchplot2 plot ... --devices 0,1,cpu +``` + +CPU plotting is **1-2 orders of magnitude slower than GPU** — meant for +GPU-less hosts, headless CI, or as an extra background worker. Don't +expect GPU-grade throughput from a CPU worker on a heterogeneous batch. + +Omitted flag = single device via the default SYCL / CUDA selector — +identical to pre-multi-GPU behavior, zero regression risk. + +**Caveats for v1:** + +- Static round-robin partition. If your GPUs differ in speed the + batch finishes only as fast as the slowest worker's slice; use + `--devices` to pick matched cards when that matters. +- Each worker gets its own ~4 GB pinned host pool, so host RAM scales + linearly. A 4-GPU rig pins ~16 GB — size accordingly. +- The workers share `stderr` (line-buffered, atomic per-`fprintf`) so + log lines from different GPUs may interleave. Fine for progress, + not for parsing. + +Smoke test: `scripts/test-multi-gpu.sh` exercises argument parsing +(works on any host, even single-GPU) and, when 2+ GPUs are visible, +runs a live k=22 plot across `--devices 0,1`. + ### Lower-level subcommands ```bash -xchplot2 test [strength] ... # single plot, raw inputs -xchplot2 batch [-v] # batched, raw inputs +xchplot2 test [strength] ... # single plot, raw inputs +xchplot2 batch [-v] [--skip-existing] [--continue-on-error] + [--devices ] +xchplot2 verify [--trials N] # run N random challenges +xchplot2 parity-check [--dir PATH] # CPU↔GPU regression screen ``` +`verify` opens a `.plot2` through pos2-chip's CPU prover and runs N +(default 100) random challenges. Zero proofs across a reasonable sample +strongly indicates a corrupt plot; the command exits non-zero in that +case. Intended as a quick sanity check before farming a newly built +batch — not a replacement for `chia plots check`. + +`parity-check` execs every `*_parity` binary in `--dir` (default +`./build/tools/parity`) and summarizes PASS/FAIL with per-test wall +time. Use after a refactor or driver update to confirm CPU↔GPU +agreement is still bit-exact across `aes` / `xs` / `t1` / `t2` / `t3` / +`plot_file`. Requires `cmake --build` to have produced the parity +binaries first. + +## Troubleshooting + +- **Listing visible GPUs**: `xchplot2 devices` prints id, name, backend, + VRAM, compute-unit count, and which sort path each device will use + (CUB on cuda-backend devices when this build links CUB; SortSycl + otherwise). Use the printed `[N]` index with `--devices N` for + `plot` / `batch`. + +- **Hybrid hosts (NVIDIA + AMD/Intel on the same box)**: a single + binary handles all visible GPUs. `xchplot2 plot --devices gpu` + spawns a worker per GPU (use `--devices all` to also add a CPU + worker); each worker picks the right sort backend at queue + construction (CUB on NVIDIA, hand-rolled SYCL radix on AMD/Intel) + via the runtime dispatcher in `SortDispatch.cpp`. No rebuild + required to add a second-vendor card. + +- **`[AdaptiveCpp Warning] [backend_loader] Could not load library: + /opt/adaptivecpp/lib/hipSYCL/librt-backend-cuda.so (libcudart.so.11.0: + cannot open shared object file)`**: cosmetic only — AdaptiveCpp + built with CUDA backend support but no CUDA runtime to load. Happens + when AdaptiveCpp was installed out-of-band rather than via + `scripts/install-deps.sh --gpu amd` (which sets + `-DCMAKE_DISABLE_FIND_PACKAGE_CUDA=TRUE`). To suppress without a + rebuild: `export ACPP_VISIBILITY_MASK=hip;omp` so AdaptiveCpp skips + the CUDA backend probe entirely. 
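+
+  A minimal sketch of that suppression (quote the value; an unquoted
+  `hip;omp` ends the shell command at the semicolon):
+
+  ```bash
+  export ACPP_VISIBILITY_MASK="hip;omp"   # load only the HIP + OpenMP backends
+  xchplot2 devices                        # the backend_loader warning should no longer appear
+  ```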
+ +- **`T1 match produced 0 entries`** on RDNA1 (`gfx1010` / `gfx1011` / + `gfx1012`, including the Radeon Pro W5700 / RX 5700 XT). The + community `gfx1013` AOT-spoof default was observed to silently + compile no-op kernel stubs on at least one W5700 + ROCm 6 + + AdaptiveCpp 25.10 host. Default flipped to `ACPP_TARGETS=generic` + (SSCP JIT) in recent main; `cargo install --force` past commit + `d939ee8` restores correct behavior. To restore the old spoof, + `XCHPLOT2_FORCE_GFX_SPOOF=1 cargo install ...`. The startup self- + test in `SyclBackend::queue()` catches the no-op-kernel case at + queue construction with a clear exception, so this surfaces + immediately rather than as empty pipeline output minutes in. + +- **`CUB ... invalid argument`** mid-pipeline, or + **`sycl_backend::queue: device id 0 out of range (found 0 usable + GPU device(s))`** with `--devices N` while the default selector + finds a GPU: pre-`762fde2` symptoms of CUB-only sort being + dispatched against an AMD/Intel device (or being filtered out of + the device list). The runtime sort dispatcher fixes both — `git + pull && cargo install --path . --force` to upgrade. + +- **Deep-pipeline diagnostics**: set `POS2GPU_T1_DEBUG=1` for verbose + per-stage dumps (Xs gen / sort intermediates, T1 match input/output + samples, AES T-table sanity). Useful when the symptom isn't on the + list above and you want to localize where the data goes wrong. + +## Environment variables + +| Variable | Effect | +|-------------------------------|-------------------------------------------------------------------------| +| `XCHPLOT2_BUILD_CUDA=ON\|OFF` | Override the build-time CUB / nvcc-TU switch. Default is vendor-aware (NVIDIA → ON; AMD / Intel → OFF; no GPU → `nvcc`-presence). Force `OFF` on dual-toolchain hosts (CUDA + ROCm) where you want the SYCL-only build. | +| `XCHPLOT2_STREAMING=1` | Force the low-VRAM streaming pipeline even when the pool would fit. | +| `XCHPLOT2_STREAMING_TIER=plain\|compact\|minimal` | Override the streaming-tier auto-pick (plain = ~7.3 GB peak, no parks; compact = ~5.2 GB peak, full parks + N=2 T2 match tiling; minimal = ~3.76 GB peak with full host-pinned slicing of T1/T3 match + tiled CUB outputs in all sort phases + tiled Xs gen/sort/pack — targets 5 GiB+ cards). Equivalent CLI flag: `--tier`. | +| `POS2GPU_MAX_VRAM_MB=N` | Cap the pool/streaming VRAM query to N MB (exercise streaming fallback).| +| `POS2GPU_STREAMING_STATS=1` | Log every streaming-path `malloc_device` / `free`. | +| `POS2GPU_POOL_DEBUG=1` | Log pool allocation sizes at construction. | +| `POS2GPU_PHASE_TIMING=1` | Per-phase wall-time breakdown (Xs / sort / T1 / T2 / T3) on stderr. | +| `ACPP_GFX=gfxXXXX` | AMD only — required at **build** time; sets AOT target for amdgcn ISA. | +| `ACPP_TARGETS=...` | Override AdaptiveCpp target selection (defaults: NVIDIA `generic`, AMD `hip:$ACPP_GFX`). | +| `CUDA_ARCHITECTURES=sm_XX` | Override the CUDA arch autodetected from `nvidia-smi`. | +| `CUDA_PATH=/path/to/cuda` | Override the CUDA Toolkit root for linking (default: `/opt/cuda`, `/usr/local/cuda`). Useful on JetPack / non-standard installs. | +| `CUDA_HOME=/path/to/cuda` | Fallback for `CUDA_PATH` — same effect. | +| `POS2_CHIP_DIR=/path` | Build-time: point at a local pos2-chip checkout instead of FetchContent.| +| `XCHPLOT2_TEST_GPU_COUNT=N` | Override `scripts/test-multi-gpu.sh`'s auto-detected GPU count (forces run / skip without consulting `nvidia-smi`). 
| + ## Testing farming on a testnet v2 (CHIP-48) farming in stock chia-blockchain is presently unfinished @@ -115,9 +823,13 @@ pieces any v2 plot needs for farming, regardless of who produced it. ## Architecture ``` -src/gpu/ CUDA kernels — AES, Xs, T1, T2, T3 +src/gpu/ GPU kernels — AES, Xs, T1, T2, T3. + CUDA path: .cu files via nvcc + CUB sort. + SYCL path: matching .cpp files via + AdaptiveCpp + hand-rolled LSD radix. src/host/ -├── GpuPipeline Xs → T1 → T2 → T3 device orchestration +├── GpuPipeline Xs → T1 → T2 → T3 device orchestration; +│ pool + streaming (low-VRAM) variants ├── GpuBufferPool persistent device + 2× pinned host pool ├── BatchPlotter producer / consumer batch driver └── PlotFileWriterParallel sole TU touching pos2-chip headers @@ -128,13 +840,153 @@ keygen-rs/ Rust staticlib: plot_id_v2, BLS HD, bech32m ## VRAM -PoS2 plots are k=28 by spec; the persistent buffer pool needs **~15 GB -of device VRAM**, so a 16 GB+ card is required (RTX 4080 / 4090 / -5080 / 5090, A6000, etc.). `xchplot2` queries `cudaMemGetInfo` at -startup and refuses with an actionable error if the pool won't fit. +PoS2 plots are k=28 by spec. Four code paths, dispatched automatically +based on available VRAM at batch start: + +- **Pool path (~11 GB device + ~4 GB pinned host; 12 GB+ cards + reliably).** The persistent buffer pool is sized worst-case and + reused across plots in `batch` mode for amortised allocator cost and + double-buffered D2H. Xs sort's keys_a slot aliases d_storage tail + (idle during Xs gen+sort), trimming pair_b's worst case from + `max(cap·12, 4·N·u32 + cub)` to `max(cap·12, 3·N·u32 + cub)` — + saves ~1 GiB at k=28. Targets: RTX 4090 / 5090, A6000, H100, + RTX 4080 (16 GB), and 12 GB cards like RTX 3060 / RX 6700 XT. +- **Plain streaming (~7.3 GB peak + 128 MB margin; ≥ 7.42 GiB free at + k=28).** Allocates per-phase and frees between phases, but keeps + large intermediates (`d_t1_meta`, `d_t1_keys_merged`, `d_t2_meta`, + `d_t2_xbits`, `d_t2_keys_merged`) alive across their idle windows + instead of parking them on pinned host. T2 match runs as a single + full-cap pass (N=1). Used on 10-11 GB cards that can't fit the pool + but have headroom above the compact floor. ~400 ms/plot faster than + compact at k=28 because there are no park/rehydrate PCIe round-trips. +- **Compact streaming (~5.2 GB peak + 128 MB margin; ≥ 5.33 GiB free + at k=28).** All three match phases (T1/T2/T3) are tiled N=2 across + disjoint bucket ranges with half-cap device staging and + D2H-to-pinned-host between passes. T1 + T2 sorts are tiled (N=2 and + N=4) with merge trees, and `d_t1_meta`, `d_t2_meta`, and the + `*_keys_merged` buffers are parked on pinned host across their + sort phases and JIT-H2D'd only for the next consumer. Xs is inlined + as gen → sort → pack with separate-allocation scratch so keys_a + + vals_a can be freed right after CUB sort. Peak at k=28 is + **5200 MB** (measured on sm_89); per-phase live maxes: + + | Phase | Peak (MB) | + |-----------|----------:| + | Xs | 4128 | + | T1 match | 5168 | + | T1 sort | 5200 | + | T2 match | 5200 | + | T2 sort | 5200 | + | T3 match | 5200 | + | T3 sort | 4228 | + + A BatchPlotter preflight rejects cards reporting less than + `streaming_peak_bytes(k) + 128 MB` free before any queue work, so + mid-pipeline OOM is impossible on supported configurations. 
+ Practical targets: 6 GB cards on the edge (card-dependent; RTX 2060 + typically has ~5.5 GiB free which has ~170 MB slack over the + 5328 MB requirement), 8 GB cards comfortable, 10 GB and up ample. + Log the full alloc trace with `POS2GPU_STREAMING_STATS=1`. +- **Minimal streaming (~3.76 GB peak + 128 MB margin; ≥ 3.80 GiB free + at k=28).** Layered cuts on top of compact: + - **N=8 T2 match staging.** cap/8 ≈ 570 MB vs compact's cap/2 + ≈ 2280 MB — saves ~1.5 GB on the T2-match peak. + - **Tiled gathers in T1 sort + T2 sort meta + T2 sort xbits.** + Each gather output produced in N=4 tiles, D2H'd to host pinned + (reusing the existing parking buffers) one tile at a time, then + rebuilt on device after the cap-sized inputs are freed. Drops + each gather peak from 5200 MB → ~3640 MB. + - **Sliced T1 match.** N passes (one per section_l) emit to a + cap/N device staging pair, D2H per pass to host pinned. d_xs + (2048 MB at k=28) no longer co-resides with full-cap d_t1_meta + + d_t1_mi → T1-match peak drops from 5168 MB → 3023 MB. + - **Sliced T3 match.** d_t2_meta_sorted parked on host across + T3 match; per pass H2Ds the (section_l, section_r) row slices + onto a small device buffer pair. d_t2_xbits_sorted + + d_t2_keys_merged remain full-cap on device for binary-search / + target reads. T3-match peak: 5200 MB → 3754 MB. + - **Per-tile CUB outputs in T1/T2/T3 sort sub-phases.** T1 and T2 + sort use cap/2 / cap/4 device output buffers respectively, D2H + per tile to USM-host accumulators, with the existing 2-way merge + kernel reading USM-host inputs. T2 additionally parks AB / CD + intermediates to host between tree steps so the final merge + sees only its own outputs. T3 sort uses cap/2 tile + host-side + `std::inplace_merge`. CUB sub-phase peaks: 4170-4228 MB → + 3155-3640 MB. + - **Tiled Xs gen+sort+pack.** N=2 position halves through cap/2 + ping-pong buffers + USM-host accumulator + 2-way merge, then + pack runs in cap/2 halves with D2H per tile to a host-pinned + `XsCandidateGpu` accumulator (final d_xs rehydrated H2D). + Xs phase peak: 4128 MB → 3072 MB. + + Bottleneck after all six cuts is the T3 match phase at 3754 MB. + Targets 5 GiB+ cards comfortably (RTX 2060, RX 6600 XT, RX 7600 + with ~1.7+ GiB headroom). 4 GiB cards (GTX 1050 Ti / 1650, RTX 3050 + 4GB, MX450) are an edge case — real 4 GiB physical hardware + reports ~3.5 GiB free post-CUDA-context, just under the 3.80 GiB + required floor. Trade-off: ~6 extra cap-sized PCIe round-trips per + plot push k=28 wall on sm_89 from ~13 s/plot (compact) to ~34 + s/plot (minimal). There is no smaller tier — a forced minimal on a + card below the floor throws rather than falling further. + +At pool construction `xchplot2` queries `cudaMemGetInfo` on the +CUDA-only build, or `global_mem_size` (device total) on the SYCL +path — SYCL has no portable free-memory query, so the check +effectively approximates "free == total" and lets the actual +`malloc_device` failure trigger the fallback. If the pool doesn't +fit, the streaming-tier dispatch picks the largest tier that fits +with the 128 MB margin: plain if free ≥ 7.42 GiB, else compact if +free ≥ 5.33 GiB, else minimal. `XCHPLOT2_STREAMING=1` forces +streaming even when the pool would fit; `--tier +plain|compact|minimal` (or `XCHPLOT2_STREAMING_TIER`) overrides the +auto-pick. Forced plain or compact below their floor warns and +proceeds (caller's risk); forced minimal below its floor throws +because there is no smaller tier to fall back to. 
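+
+For a quick hands-on check of those overrides (a sketch; the k=22
+`test` invocation mirrors the one in the AMD container section, and
+the output path and the capped size are illustrative):
+
+```bash
+# Force the compact tier even on a card where the pool would fit:
+XCHPLOT2_STREAMING=1 XCHPLOT2_STREAMING_TIER=compact \
+    xchplot2 test 22 2 0 0 -G -o /tmp/plots
+
+# Cap the VRAM query to exercise the streaming fallback (which tier the
+# auto-pick lands on depends on k and the cap) and log every
+# streaming-path allocation while it runs:
+POS2GPU_MAX_VRAM_MB=4096 POS2GPU_STREAMING_STATS=1 \
+    xchplot2 test 22 2 0 0 -G -o /tmp/plots
+```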
+ +Plot output is bit-identical across all four paths — streaming +reorganises memory, not algorithms. Verified at k=22 with md5sum +across pool / plain / compact / minimal. + +## Performance + +k=28, strength=2, RTX 4090 (sm_89), PCIe Gen4 x16. Steady-state per-plot +wall from `xchplot2 batch` (10-plot manifest, mean): + +| Build | Per plot | Notes | +|---|---|---| +| pos2-chip CPU baseline | ~50 s | reference | +| `cuda-only` branch | **2.15 s** | original CUDA-only path | +| `main`, `XCHPLOT2_BUILD_CUDA=ON` (CUB sort) | 2.41 s | NVIDIA fast path on the SYCL/AdaptiveCpp port | +| `main`, `XCHPLOT2_BUILD_CUDA=OFF` (hand-rolled SYCL radix) | 3.79 s | cross-vendor fallback (AMD/Intel) on AdaptiveCpp | +| plain streaming tier (10-11 GB cards) | ~5.7 s | no parks, single-pass T2 match; ~400 ms/plot faster than compact | +| compact streaming tier (6-8 GB cards) | ~7.3 s | full parks + N=2 T2 match | +| minimal streaming tier (4 GiB cards) | TBD | full parks + N=8 T2 match; smallest peak (~3.7 GB) | +| `main` on RX 6700 XT (gfx1031 / ROCm 6.2 / AdaptiveCpp HIP) | **9.97 s** | AMD batch steady-state at k=28; T-table AES near-optimal on RDNA2 via this compiler stack | + +The `main`/CUB row is +12% over `cuda-only` from extra AdaptiveCpp +scheduling overhead. The SYCL row is +57% over CUB on the same NVIDIA +hardware; ~88% of GPU compute is identical between the two paths +(`nsys` per-kernel breakdown), and the gap is dominated by host-side +runtime overhead in AdaptiveCpp's DAG manager rather than kernel +performance. AMD and Intel runtimes are untested; expect roughly the +SYCL-row latency adjusted for relative GPU throughput. + +Numbers above are single-GPU. With `--devices 0,1,...` the batch is +partitioned round-robin across N worker threads (one per device), so +wall-clock throughput is bounded by the slowest device's slice — +≈ linear scaling on matched cards, less if cards differ in speed. +Live multi-GPU plots were confirmed end-to-end on NVIDIA; per-device +numbers will vary with PCIe bandwidth sharing on the host root +complex. ## License MIT — see [LICENSE](LICENSE) and [NOTICE](NOTICE) for third-party attributions. Built collaboratively with [Claude](https://claude.ai/code). + +## Like this? Send a coin my way! + +If you appreciate this, and want to give back, feel free. + +xch1d80tfje65xy97fpxg7kl89wugnd6svlv5uag2qays0um5ay5sn0qz8vph8 diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..1b5fc68 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,52 @@ +# Security Policy + +## Reporting a vulnerability + +Email **abraham.sewill@proton.me** with a description of the issue and +steps to reproduce. Please do not open a public GitHub issue for +security-sensitive reports. + +## Scope — what counts for a plotter + +xchplot2 is a client-side plot builder. It handles: + +- Farmer and pool public keys provided on the command line. +- Optional `--seed` entropy that derives per-plot subseeds; a weak + or reused seed lets an attacker who observes plot IDs correlate + plots to the same master key. +- BLS key parsing via the + [`chia` Rust crate](https://crates.io/crates/chia) through + `keygen-rs`. +- Large file writes into caller-supplied output directories. + +Relevant threat model items we want to hear about: + +- **Key handling:** any path where farmer/pool key bytes or the + master seed leak into logs, temporary files, crash dumps, or + the plot file itself beyond the documented memo payload. 
+- **File-path handling:** any way a crafted `-o` / `out_dir` / memo + string escapes the intended output directory or overwrites files + outside it (path traversal, symlink races). The atomic + `.partial` + rename is safe by design; report if you can break it. +- **Manifest parsing:** malformed `batch` manifests that cause + out-of-bounds reads, arbitrary allocation, or unchecked sign + conversion. +- **Build-time supply chain:** tampering paths in + `scripts/install-deps.sh`, `Containerfile`, `compose.yaml`, or + the FetchContent targets (pos2-chip, AdaptiveCpp). + +## Explicitly out of scope + +- Proof-of-space soundness and the v2 PoS algorithm itself — + report those upstream in + [`pos2-chip`](https://github.com/Chia-Network/pos2-chip). +- Consensus, farming, or wallet behavior — those belong in + [`chia-blockchain`](https://github.com/Chia-Network/chia-blockchain) + and [`chia_rs`](https://github.com/Chia-Network/chia_rs). +- Performance regressions on exotic GPUs — file as a normal bug. + +## Response + +Acknowledgement within a week. Fixes for in-scope issues land on +`main` (and the `cuda-only` branch if applicable) with credit in the +commit message unless you prefer otherwise. diff --git a/_typos.toml b/_typos.toml new file mode 100644 index 0000000..d82642d --- /dev/null +++ b/_typos.toml @@ -0,0 +1,17 @@ +# _typos.toml — domain-specific allowlist for xchplot2. +# +# typos' default dictionary flags a handful of proper nouns and +# CUDA / SYCL intrinsic names that only LOOK like misspellings. The +# risk of one of these coincidentally being a real typo elsewhere in +# the tree is low, so allowlist them globally rather than per-file. + +[default.extend-words] +# AMD ROCm "Heterogeneous System Architecture" runtime. +HSA = "HSA" +# SYCL kernel range / index types: nd_range, nd_item. +nd = "nd" +# CUDA half-precision intrinsics: __hge ("greater-or-equal"), +# __hgt, __hle, __hlt; AdaptiveCpp's libkernel/half.hpp aliases. +hge = "hge" +# Yann Collet, author of LZ4 / zstd, attributed in NOTICE. +Collet = "Collet" diff --git a/build.rs b/build.rs index 6111517..319c082 100644 --- a/build.rs +++ b/build.rs @@ -36,6 +36,348 @@ fn detect_cuda_arch() -> Option { Some(arch.to_string()) } +/// Same probe as `detect_cuda_arch`, but filters out NVIDIA GPUs +/// below our README-documented minimum compute capability (sm_50, +/// Maxwell first-gen / GTX 750-class). The floor used to be sm_61 on +/// the assumption that AdaptiveCpp's `half.hpp` referenced FP16 +/// intrinsics (`__hadd` / `__hsub` / `__hmul` / `__hdiv` / `__hlt` / +/// `__hgt`) only available on sm_53+ — but those intrinsics are +/// *implemented* in `cuda_fp16.hpp` via `NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, …)` +/// with a fp32 emulation fallback for pre-sm_53 cards. CUDA 12.x +/// toolkits compile cleanly for sm_50/52/53. The real floor is the +/// toolkit's own codegen support: CUDA 12.x supports sm_50-90+, +/// CUDA 13.x dropped sm_50-72 (CMakeLists' nvcc-vs-arch preflight +/// catches that pairing with a FATAL_ERROR + fix block). +/// +/// Returns Some(arch) only when nvidia-smi reports a card at or +/// above our minimum; emits a cargo:warning and returns None +/// otherwise so callers fall through to the AMD / Intel detection. +fn usable_nvidia_arch() -> Option { + let arch = detect_cuda_arch()?; + let n: u32 = arch.parse().ok()?; + if n < 50 { + println!( + "cargo:warning=xchplot2: nvidia-smi detected sm_{arch} — below our \ + minimum supported compute capability (sm_50 / Maxwell). 
CUDA 11.x \ + was the last toolkit to compile for Kepler (sm_30-37); we don't \ + support that path. Ignoring NVIDIA for default targeting; if \ + this card is your only GPU, force the build with \ + CUDA_ARCHITECTURES={arch} + XCHPLOT2_BUILD_CUDA=ON and an \ + appropriately-old CUDA toolkit, or fall back to \ + ACPP_TARGETS=omp for AdaptiveCpp's CPU OpenMP backend."); + return None; + } + if n < 75 && detect_nvcc_major().map(|m| m >= 13).unwrap_or(false) { + println!( + "cargo:warning=xchplot2: nvidia-smi detected sm_{arch} (Maxwell / \ + Pascal / Volta) but nvcc is CUDA 13.x, which dropped codegen \ + for sm_50-72. Ignoring NVIDIA for default targeting; install \ + CUDA 12.9 (last toolkit with Maxwell-Volta support) and re-run, \ + or use scripts/build-container.sh which auto-pins the right \ + base image. CMakeLists' preflight will FATAL_ERROR with the \ + exact remediation if you force-build anyway."); + return None; + } + Some(arch) +} + +/// Check whether nvcc is on $PATH and runnable. Used as the fall-back +/// signal for XCHPLOT2_BUILD_CUDA when no GPU is enumerable (headless +/// CI / container builds). Runs `nvcc --version` rather than a simple +/// PATH lookup so stale symlinks don't pass. +fn detect_nvcc() -> bool { + Command::new("nvcc") + .arg("--version") + .output() + .map(|o| o.status.success()) + .unwrap_or(false) +} + +/// Parse nvcc's major version from `nvcc --version` output. +/// The release line looks like: +/// "Cuda compilation tools, release 13.0, V13.0.48" +/// Returns None if nvcc isn't on PATH or the line can't be parsed — +/// callers treat that as "skip the version-vs-arch compat check" +/// rather than blocking the build. +fn detect_nvcc_major() -> Option { + let out = Command::new("nvcc").arg("--version").output().ok()?; + if !out.status.success() { return None; } + let s = std::str::from_utf8(&out.stdout).ok()?; + for line in s.lines() { + let mut iter = line.split_whitespace(); + while let Some(w) = iter.next() { + if w == "release" { + let next = iter.next()?; // "13.0," + let major = next.trim_end_matches(',').split('.').next()?; + return major.parse().ok(); + } + } + } + None +} + +/// Minimum integer arch from a CMake-style CUDA_ARCHITECTURES list +/// ("61", "61;86", "61;86;120"). Tolerates "sm_61" / "compute_61" +/// prefixes that Cargo users sometimes pass through. Returns None +/// when the list parses to nothing. +fn min_arch(arch_list: &str) -> Option { + arch_list.split(';') + .filter_map(|s| { + let s = s.trim() + .trim_start_matches("sm_") + .trim_start_matches("compute_"); + s.parse().ok() + }) + .min() +} + +/// Probe /sys/class/drm for a display-class PCI device with Intel's +/// vendor ID (0x8086). Used as a heuristic to default +/// XCHPLOT2_BUILD_CUDA=OFF on Intel hosts, mirroring what rocminfo +/// already does for AMD. Returns false on non-Linux or when the sysfs +/// path isn't accessible — callers fall back to the next signal. +fn detect_intel_gpu() -> bool { + let entries = match std::fs::read_dir("/sys/class/drm") { + Ok(d) => d, + Err(_) => return false, + }; + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + // Skip connector nodes like card0-DP-1; we only want the card itself. + if !name.starts_with("card") || name.contains('-') { + continue; + } + let vendor = entry.path().join("device/vendor"); + if let Ok(v) = std::fs::read_to_string(&vendor) { + if v.trim() == "0x8086" { + return true; + } + } + } + false +} + +/// Does the host have any AMD GPU detectable by rocminfo? 
Independent +/// of which ACPP_TARGETS string we'd pick for it — `detect_amd_gfx` may +/// return None for AMD cards we choose to route through SSCP (RDNA1 +/// default), but the GPU is still present and BUILD_CUDA detection +/// should still see it as "AMD host, skip CUDA TUs". +/// +/// Falls back to /sys/class/drm vendor-ID probe (0x1002) when rocminfo +/// isn't on $PATH at build time. That happens reliably when users +/// install ROCm via /opt/rocm/bin without sourcing /etc/profile.d/rocm.sh +/// in the shell that runs `cargo install`, or run `cargo install` under +/// systemd / sudo / chroot where the parent shell's PATH is stripped. +/// Without the fallback the BUILD_CUDA selector falls through to the +/// `nvcc present → ON, "CI fallback"` arm, the build links CUB, and the +/// streaming pipeline dies on first sort dispatch against the AMD card. +fn amd_gpu_present() -> bool { + if let Ok(out) = Command::new("rocminfo").output() { + if out.status.success() { + if let Ok(s) = std::str::from_utf8(&out.stdout) { + if s.lines().any(|l| { + l.trim().strip_prefix("Name:") + .map(|rest| rest.trim().starts_with("gfx")) + .unwrap_or(false) + }) { + return true; + } + } + } + } + // PCI fallback — same pattern as detect_intel_gpu(). Doesn't need any + // user-space tools, only readable sysfs (true on every Linux host + // with the amdgpu / radeon kernel module loaded). + let entries = match std::fs::read_dir("/sys/class/drm") { + Ok(d) => d, + Err(_) => return false, + }; + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if !name.starts_with("card") || name.contains('-') { + continue; + } + let vendor = entry.path().join("device/vendor"); + if let Ok(v) = std::fs::read_to_string(&vendor) { + if v.trim() == "0x1002" { + return true; + } + } + } + false +} + +/// Ask `rocminfo` for the first AMD GPU's architecture, e.g. "gfx1100" for +/// an RX 7900 XTX. Returns None when rocminfo is missing or there's no AMD +/// GPU, AND ALSO when we deliberately want the caller to fall through to +/// ACPP_TARGETS=generic (currently for RDNA1 gfx1010/1011/1012). Use +/// amd_gpu_present() to distinguish "no AMD GPU at all" from "AMD GPU +/// present but routed through generic SSCP". +fn detect_amd_gfx() -> Option { + let out = Command::new("rocminfo").output().ok()?; + if !out.status.success() { + return None; + } + let s = std::str::from_utf8(&out.stdout).ok()?; + for line in s.lines() { + if let Some(rest) = line.trim().strip_prefix("Name:") { + let name = rest.trim(); + if name.starts_with("gfx") { + // RDNA1 (gfx1010/1011/1012) isn't a direct AdaptiveCpp + // HIP AOT target. We previously defaulted to a community + // workaround that AOT-compiled for gfx1013 (close-ISA), + // but it has been observed to silently produce no-op + // kernels on at least one W5700 / ROCm 6 / AdaptiveCpp + // 25.10 setup — every kernel dispatch completes without + // writing, surfacing far downstream as "T1 match + // produced 0 entries". A separate-build experiment on + // the same host with ACPP_TARGETS=generic (SSCP JIT) + // dispatched and produced correct output through k=24. + // + // Default for RDNA1 is now ACPP_TARGETS=generic (signal + // by returning None — caller's None branch picks + // generic). 
Two opt-in escape hatches preserved for + // users who've validated their stack on the legacy + // path: + // XCHPLOT2_FORCE_GFX_SPOOF=1 — gfx1013 AOT spoof + // XCHPLOT2_NO_GFX_SPOOF=1 — native gfx1010 AOT + // (may fail to compile + // if AdaptiveCpp doesn't + // advertise it as a HIP + // target). + let spoofed = match name { + "gfx1010" | "gfx1011" | "gfx1012" => { + let force_spoof = env::var("XCHPLOT2_FORCE_GFX_SPOOF") + .map(|v| !v.is_empty() && v != "0") + .unwrap_or(false); + let no_spoof = env::var("XCHPLOT2_NO_GFX_SPOOF") + .map(|v| !v.is_empty() && v != "0") + .unwrap_or(false); + if force_spoof { + println!( + "cargo:warning=xchplot2: RDNA1 {name} detected, \ + XCHPLOT2_FORCE_GFX_SPOOF set — building for \ + gfx1013 (legacy community workaround). The \ + default switched to ACPP_TARGETS=generic (SSCP \ + JIT) after the spoof was observed to silently \ + produce no-op kernels on some W5700 setups; \ + unset XCHPLOT2_FORCE_GFX_SPOOF if your plots \ + fail with 'T1 match produced 0 entries'."); + "gfx1013".to_string() + } else if no_spoof { + println!( + "cargo:warning=xchplot2: RDNA1 {name} detected, \ + XCHPLOT2_NO_GFX_SPOOF set — AOT-targeting {name} \ + natively. If AdaptiveCpp doesn't advertise {name} \ + as a HIP target on your toolchain, the build will \ + fail; unset XCHPLOT2_NO_GFX_SPOOF to fall back to \ + the (working-on-most-cards) generic SSCP JIT."); + name.to_string() + } else { + println!( + "cargo:warning=xchplot2: RDNA1 {name} detected — \ + defaulting to ACPP_TARGETS=generic (SSCP JIT). \ + The previous gfx1013 community workaround was \ + observed to silently produce no-op kernels on \ + at least one W5700 / ROCm 6 setup. Override: \ + XCHPLOT2_FORCE_GFX_SPOOF=1 (back to gfx1013 AOT) \ + or XCHPLOT2_NO_GFX_SPOOF=1 (try native {name})." + ); + return None; + } + } + other => other.to_string(), + }; + return Some(spoofed); + } + } + } + None +} + +/// Probe whether `cmd` is on PATH and runnable. Used by preflight() +/// to detect missing toolchain pieces before cmake gets to fail with +/// a cryptic message. +fn command_runs(cmd: &str) -> bool { + Command::new(cmd) + .arg("--version") + .output() + .map(|o| o.status.success()) + .unwrap_or(false) +} + +/// Locate `ld.lld` either on PATH or in the conventional LLVM-{16..20} +/// install prefixes. Mirrors the find_program HINTS list in +/// CMakeLists.txt's FetchContent block. AdaptiveCpp's CMake aborts +/// with "Cannot find ld.lld" without it. +fn ld_lld_findable() -> bool { + if command_runs("ld.lld") { return true; } + for p in &[ + "/usr/lib/llvm-20/bin/ld.lld", "/usr/lib/llvm-19/bin/ld.lld", + "/usr/lib/llvm-18/bin/ld.lld", "/usr/lib/llvm-17/bin/ld.lld", + "/usr/lib/llvm-16/bin/ld.lld", + "/usr/lib/llvm20/bin/ld.lld", "/usr/lib/llvm19/bin/ld.lld", + "/usr/lib/llvm18/bin/ld.lld", + "/usr/lib64/llvm20/bin/ld.lld", "/usr/lib64/llvm19/bin/ld.lld", + "/usr/lib64/llvm18/bin/ld.lld", + "/opt/llvm-20/bin/ld.lld", "/opt/llvm-19/bin/ld.lld", + "/opt/llvm-18/bin/ld.lld", + ] { + if std::path::Path::new(p).exists() { return true; } + } + false +} + +/// True when AdaptiveCpp is already installed — at $ACPP_PREFIX if +/// set, otherwise the install-deps.sh default of /opt/adaptivecpp. +/// When this is true the FetchContent fallback won't fire and +/// AdaptiveCpp's own build-time deps (notably ld.lld) aren't needed +/// for our build. 
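+/// A pre-built AdaptiveCpp in a non-default prefix can be pointed at with the
+/// same variable this probe reads, e.g. (prefix path is illustrative):
+/// `ACPP_PREFIX=$HOME/opt/adaptivecpp cargo install --git https://github.com/Jsewill/xchplot2 --force`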
+fn adaptivecpp_installed() -> bool { + let prefix = env::var("ACPP_PREFIX") + .unwrap_or_else(|_| "/opt/adaptivecpp".to_string()); + std::path::Path::new(&format!( + "{prefix}/lib/cmake/AdaptiveCpp/AdaptiveCppConfig.cmake" + )).exists() +} + +/// Detect a container engine on PATH, preferring podman (matches +/// scripts/build-container.sh's default). Used to phrase the preflight +/// panic differently when the user already has tooling that lets them +/// skip the host-side install entirely. +fn detect_container_engine() -> Option<&'static str> { + if command_runs("podman") { return Some("podman"); } + if command_runs("docker") { return Some("docker"); } + None +} + +/// Walk critical build-time prerequisites and return human-readable +/// names of anything missing. Cargo install users in particular don't +/// read the Build section of README.md (and don't expect to need to), +/// so a friendly preflight is much better than letting CMake or +/// AdaptiveCpp fail with cryptic errors deep into a build. +fn preflight(build_cuda_on: bool) -> Vec<String> { + let mut missing: Vec<String> = vec![]; + if !command_runs("cmake") { + missing.push("cmake (3.24+) — apt install cmake / dnf install cmake / pacman -S cmake".into()); + } + if !command_runs("c++") && !command_runs("g++") && !command_runs("clang++") { + missing.push("C++20 compiler (g++ ≥ 13 or clang++ ≥ 18) — apt install build-essential, dnf install gcc-c++, or pacman -S base-devel".into()); + } + // ld.lld is only required when FetchContent will rebuild + // AdaptiveCpp; a pre-installed AdaptiveCpp was linked against ld.lld + // at its own install time, so consumers don't need it again. + if !adaptivecpp_installed() && !ld_lld_findable() { + missing.push("ld.lld (apt: lld-18, dnf/pacman: lld) — required by AdaptiveCpp's FetchContent build".into()); + } + if build_cuda_on && !detect_nvcc() { + missing.push("nvcc (CUDA Toolkit 12+) — XCHPLOT2_BUILD_CUDA=ON requested but no nvcc on PATH".into()); + } + missing +} + fn main() { let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); @@ -56,6 +398,180 @@ fn main() { }; println!("cargo:warning=xchplot2: building for CUDA arch {cuda_arch} ({source})"); + // AdaptiveCpp target precedence: + // 1. $ACPP_TARGETS if set. + // 2. NVIDIA: "generic" (LLVM SSCP). Empirically a few percent + // faster than cuda:sm_<arch> on our kernels. + // 3. AMD: hip:gfx<...> via rocminfo. SSCP's HIP path is less + // mature, so AOT-compile for the gfx target. + // 4. generic (LLVM SSCP, JITs on first use). + let (acpp_targets, acpp_source) = match env::var("ACPP_TARGETS") { + // Treat an empty env var the same as unset — Containerfile build + // args propagate as `ACPP_TARGETS=` when the user doesn't override + // them, and acpp rejects an empty target string. + Ok(v) if !v.is_empty() => (v, "$ACPP_TARGETS"), + Ok(_) | Err(_) => { + // Prefer a USABLE NVIDIA GPU (sm_61+) over AMD, otherwise fall + // through to AMD / fallback. `detect_cuda_arch` alone would + // trigger on an ancient secondary NVIDIA card even when AMD is + // the real plotting target (see usable_nvidia_arch).
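+ // Illustrative outcomes of that precedence (hypothetical hosts; the
+ // card-to-arch mappings are the ones documented in compose.yaml):
+ //   RTX 4090 + W5700 in one box → "generic" (usable NVIDIA wins)
+ //   RX 7900 XTX only (gfx1100) → "hip:gfx1100" (rocminfo probe)
+ //   W5700 only (RDNA1) → "generic" (detect_amd_gfx returns None)
+ //   headless CI, no GPU → "generic" (fallback)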
+ if usable_nvidia_arch().is_some() { + ("generic".to_string(), "NVIDIA detected — using SSCP") + } else if let Some(gfx) = detect_amd_gfx() { + (format!("hip:{gfx}"), "rocminfo probe") + } else { + ("generic".to_string(), "fallback (LLVM SSCP)") + } + } + }; + println!("cargo:warning=xchplot2: ACPP_TARGETS={acpp_targets} ({acpp_source})"); + + // XCHPLOT2_BUILD_CUDA toggles whether the CUB sort + nvcc-compiled + // CUDA TUs (AesGpu.cu, SortCuda.cu, AesGpuBitsliced.cu) are built. + // Autodetect prefers actual GPU vendor over toolchain availability: + // dual-toolchain hosts (AMD / Intel GPU, CUDA Toolkit also installed) + // would otherwise try to compile SortCuda.cu through nvcc + AdaptiveCpp + // — which has triggered upstream `half.hpp` compile errors for at + // least one Radeon Pro W5700 user. Priority order: + // NVIDIA GPU → ON (CUB is the fast path) + // AMD GPU → OFF (SYCL/HIP path; CUB unused anyway) + // Intel GPU → OFF (SYCL/L0 path) + // no GPU, nvcc present → ON (CI / container build) + // no GPU, no nvcc → OFF + let (build_cuda, bc_source) = match env::var("XCHPLOT2_BUILD_CUDA") { + Ok(v) if !v.is_empty() => (v, "$XCHPLOT2_BUILD_CUDA"), + _ => { + // Same usable-arch gate as the ACPP_TARGETS block: an + // ancient secondary NVIDIA card (e.g. sm_52 alongside an + // AMD W5700) must NOT claim the CUB path, because + // AdaptiveCpp half.hpp references sm_53+ FP16 intrinsics + // that the old card's cuda_fp16.h guards out. + let nvidia_gpu = usable_nvidia_arch().is_some(); + // amd_gpu_present, NOT detect_amd_gfx().is_some() — the + // latter returns None for RDNA1 (we route those through + // SSCP instead of an AOT hip:* target), but the GPU is + // there and we MUST skip CUDA TUs to avoid running + // SortCuda.cu's CUB calls against AMD silicon. + let amd_gpu = amd_gpu_present(); + let intel_gpu = detect_intel_gpu(); + if nvidia_gpu { + ("ON".to_string(), "NVIDIA GPU detected") + } else if amd_gpu { + ("OFF".to_string(), "AMD GPU detected — skipping CUDA TUs") + } else if intel_gpu { + ("OFF".to_string(), "Intel GPU detected — skipping CUDA TUs") + } else if detect_nvcc() { + ("ON".to_string(), "no GPU probe, nvcc present — assuming CI/container") + } else { + ("OFF".to_string(), "no GPU, no nvcc — skipping CUDA TUs") + } + }, + }; + println!("cargo:warning=xchplot2: XCHPLOT2_BUILD_CUDA={build_cuda} ({bc_source})"); + + // Preflight critical system deps BEFORE invoking cmake. Cargo + // install users land here without reading README.md's Build + // section; without preflight, missing deps surface as cryptic + // CMake / AdaptiveCpp errors deep in the configure / build. + let missing = preflight(build_cuda == "ON"); + if !missing.is_empty() { + let bullets = missing.iter() + .map(|m| format!(" - {m}")) + .collect::<Vec<_>>() + .join("\n"); + // Surface the container path proactively when we can already + // see podman/docker — for many users that's the smoothest fix + // because the toolchain stays bundled in the image. + let next_steps = match detect_container_engine() { + Some(engine) => format!( + "Two ways forward, pick whichever fits:\n\n \ + - Install those packages on the host:\n \ + ./scripts/install-deps.sh --gpu nvidia # auto-detects vendor + AdaptiveCpp\n\n \ + - Or, since you have {engine} installed, build inside a container —\n \ + toolchain stays in the image, no host changes needed:\n \ + ./scripts/build-container.sh\n \ + {engine} compose run --rm cuda plot ...
# or rocm / intel / cpu\n\n\ + If install-deps.sh just ran and you're still seeing this, check\n\ + its tail output — it names the failed package before exiting." + ), + None => format!( + "Two ways forward, pick whichever fits:\n\n \ + - Install those packages on the host:\n \ + ./scripts/install-deps.sh --gpu nvidia # auto-detects vendor + AdaptiveCpp\n\n \ + - Or build inside a container (no host toolchain needed beyond\n \ + podman or docker — install whichever you prefer first):\n \ + ./scripts/build-container.sh\n\n\ + If install-deps.sh just ran and you're still seeing this, check\n\ + its tail output — it names the failed package before exiting." + ), + }; + panic!("\nxchplot2: build prerequisites missing:\n{bullets}\n\n{next_steps}\n"); + } + + // CUDA 13.0 dropped codegen for sm_50/52/53/60/61/62/70/72 entirely + // — its nvcc fails the CMake TryCompile probe with "Unsupported gpu + // architecture 'compute_61'" on Pascal, "compute_70" on Volta, etc. + // Catch that mismatch HERE so the failure surfaces with a clear fix + // path, not buried in a CMakeError.log 40 lines into a TryCompile. + // Skipped when nvcc version or arch list can't be parsed (treat as + // "preflight not actionable, let cmake try" — preserves prior + // behaviour for unusual setups). + if build_cuda == "ON" { + if let (Some(nvcc_major), Some(min)) = (detect_nvcc_major(), min_arch(&cuda_arch)) { + if nvcc_major >= 13 && min < 75 { + // Container detection: Docker writes /.dockerenv, Podman writes + // /run/.containerenv. Either presence means the host-side fixes + // (apt install cuda-toolkit, set CUDA_PATH) are not actionable + // from inside this build — the user needs to rebuild the image + // with a different BASE_DEVEL. + let in_container = std::path::Path::new("/.dockerenv").exists() + || std::path::Path::new("/run/.containerenv").exists(); + let fix_block = if in_container { + format!( + "You're building inside a container — the toolkit comes from the\n\ + base image, not the host. 
Rebuild the image with a CUDA 12.x base:\n \ + - Recommended: rerun scripts/build-container.sh on the host;\n \ + it auto-pins nvidia/cuda:12.9.1 when CUDA_ARCH < 75.\n \ + - Or pass --build-arg explicitly:\n \ + podman build -t xchplot2:cuda \\\n \ + --build-arg BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n \ + --build-arg BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n \ + --build-arg CUDA_ARCH={min} \\\n \ + .\n \ + - Or via compose with env vars:\n \ + CUDA_ARCH={min} \\\n \ + BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n \ + BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n \ + podman compose build cuda\n" + ) + } else { + "Fix one of:\n \ + - Install CUDA 12.9 (last toolkit with Pascal/Volta support):\n \ + Ubuntu/Debian: sudo apt install cuda-toolkit-12-9\n \ + Arch: pacman -S cuda (or pin to a 12.x channel)\n \ + then point the build at it:\n \ + CUDA_PATH=/usr/local/cuda-12.9 cargo install \\\n \ + --git https://github.com/Jsewill/xchplot2 --force\n \ + - Or override the arch (only valid if you actually have a Turing+ card):\n \ + CUDA_ARCHITECTURES=75 cargo install \\\n \ + --git https://github.com/Jsewill/xchplot2 --force\n \ + - Or use the container path — scripts/build-container.sh auto-pins\n \ + the 12.9 base image when it detects a pre-Turing GPU.\n".to_string() + }; + panic!( + "\nxchplot2: CUDA Toolkit {nvcc_major}.x dropped codegen for sm_{min} \ + (Pascal / Volta / pre-Turing).\n\ + \n\ + Detected:\n \ + nvcc {nvcc_major}.x\n \ + target arch: sm_{min} (from CUDA_ARCHITECTURES={cuda_arch})\n\ + \n\ + {fix_block}" + ); + } + } + } + // ---- configure ---- let status = Command::new("cmake") .args([ @@ -64,6 +580,8 @@ fn main() { "-DCMAKE_BUILD_TYPE=Release", ]) .arg(format!("-DCMAKE_CUDA_ARCHITECTURES={cuda_arch}")) + .arg(format!("-DACPP_TARGETS={acpp_targets}")) + .arg(format!("-DXCHPLOT2_BUILD_CUDA={build_cuda}")) .status() .expect("failed to invoke cmake — is it installed?"); if !status.success() { @@ -111,21 +629,136 @@ fn main() { println!("cargo:rustc-link-lib=static=fse"); println!("cargo:rustc-link-arg=-Wl,--end-group"); - // ---- CUDA runtime ---- - // Honour $CUDA_PATH / $CUDA_HOME if set, else fall back to /opt/cuda - // (Arch / CachyOS) then /usr/local/cuda (Debian-ish). - let cuda_root = env::var("CUDA_PATH") - .or_else(|_| env::var("CUDA_HOME")) - .unwrap_or_else(|_| { - for guess in ["/opt/cuda", "/usr/local/cuda"] { - if std::path::Path::new(guess).exists() { return guess.to_string(); } + // ---- AdaptiveCpp runtime ---- + // The static archives produced by CMake reference hipsycl::rt::* symbols + // that live in libacpp-rt + libacpp-common (shared). CMake writes the + // exact lib directory to $cmake_build/acpp-prefix.txt during configure; + // honour that, then $ACPP_PREFIX / standard locations as fallbacks. 
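+ // Concrete (illustrative) example: a default install-deps.sh or container
+ // build leaves "/opt/adaptivecpp/lib" in acpp-prefix.txt, so the directives
+ // emitted below use that directory for both the link search path and rpath.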
+ let acpp_lib_dir = std::fs::read_to_string(cmake_build.join("acpp-prefix.txt")) + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .or_else(|| env::var("ACPP_PREFIX").ok().map(|p| format!("{p}/lib"))) + .or_else(|| env::var("AdaptiveCpp_ROOT").ok().map(|p| format!("{p}/lib"))) + .unwrap_or_else(|| { + for guess in ["/opt/adaptivecpp/lib", "/usr/local/lib", + "/usr/lib/x86_64-linux-gnu", "/usr/lib"] { + if std::path::Path::new(&format!("{guess}/libacpp-rt.so")).exists() { + return guess.to_string(); + } } - "/opt/cuda".to_string() + "/opt/adaptivecpp/lib".to_string() }); - println!("cargo:rustc-link-search=native={cuda_root}/lib64"); - println!("cargo:rustc-link-search=native={cuda_root}/lib"); - println!("cargo:rustc-link-lib=cudart"); - println!("cargo:rustc-link-lib=cudadevrt"); + println!("cargo:rustc-link-search=native={acpp_lib_dir}"); + println!("cargo:rustc-link-arg=-Wl,-rpath,{acpp_lib_dir}"); + println!("cargo:rustc-link-lib=acpp-rt"); + println!("cargo:rustc-link-lib=acpp-common"); + + // ---- LLVM OpenMP runtime (SYCL→OMP backend) ---- + // AdaptiveCpp's OMP backend lowers SYCL nd_range kernels to OpenMP + // parallel loops. The compiled .o files reference libomp's runtime + // symbols (__kmpc_fork_call, __kmpc_global_thread_num, __kmpc_barrier, + // __kmpc_for_static_init_8u / _fini). cc / rust-lld don't auto-link + // libomp — pos2_gpu's SYCL TUs would then fail to link with + // + // rust-lld: error: undefined symbol: __kmpc_fork_call + // + // Only fire on builds where ACPP_TARGETS includes "omp"; HIP and + // SSCP-with-CUDA backends translate to their own runtimes and don't + // need libomp at link time. + // + // Locations: + // Ubuntu/Debian (apt libomp-18-dev): /usr/lib/llvm-18/lib/libomp.so + // Arch (pacman openmp): /usr/lib/libomp.so + // AdaptiveCpp install (bundled): $ACPP_PREFIX/lib/libomp.so + if acpp_targets.split(';').any(|t| t.trim() == "omp") { + for guess in ["/usr/lib/llvm-18/lib", "/usr/lib/llvm-19/lib", + "/usr/lib/llvm-20/lib", "/usr/lib"] { + if std::path::Path::new(&format!("{guess}/libomp.so")).exists() + || std::path::Path::new(&format!("{guess}/libomp.so.5")).exists() { + println!("cargo:rustc-link-search=native={guess}"); + println!("cargo:rustc-link-arg=-Wl,-rpath,{guess}"); + break; + } + } + println!("cargo:rustc-link-lib=omp"); + } + + // ---- CUDA runtime ---- + // Only needed when XCHPLOT2_BUILD_CUDA=ON — then the nvcc-compiled + // TUs (SortCuda, AesGpu, AesGpuBitsliced) pull in cudart / cudadevrt. + // On the AMD/Intel OFF path there's no CUDA Toolkit on the image and + // nothing in the static archives references cudart, so emitting + // `-lcudart` would make rust-lld fail with "unable to find library". + if build_cuda == "ON" { + // Honour $CUDA_PATH / $CUDA_HOME if set, else fall back to + // /opt/cuda (Arch / CachyOS) then /usr/local/cuda (Debian-ish). 
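+ // Usage note (path illustrative): a host with a side-installed 12.x toolkit
+ // can steer this lookup by exporting CUDA_PATH=/usr/local/cuda-12.9 (or
+ // CUDA_HOME) in the environment that runs `cargo build` / `cargo install`.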
+ let cuda_root = env::var("CUDA_PATH") + .or_else(|_| env::var("CUDA_HOME")) + .unwrap_or_else(|_| { + for guess in ["/opt/cuda", "/usr/local/cuda"] { + if std::path::Path::new(guess).exists() { return guess.to_string(); } + } + "/opt/cuda".to_string() + }); + println!("cargo:rustc-link-search=native={cuda_root}/lib64"); + println!("cargo:rustc-link-search=native={cuda_root}/lib"); + println!("cargo:rustc-link-lib=cudart"); + println!("cargo:rustc-link-lib=cudadevrt"); + } + + // ---- HIP runtime ---- + // When ACPP_TARGETS is "hip:gfxXXXX", AdaptiveCpp's HIP backend + // compiles SYCL kernels into HIP fat binaries whose host-side + // launcher stubs reference __hipPushCallConfiguration / + // __hipRegisterFatBinary / hipLaunchKernel from libamdhip64. Without + // -lamdhip64 rust-lld fails with "undefined symbol: __hip*". + // Honour $ROCM_PATH if set, else fall back to /opt/rocm (standard + // bare-metal + all official ROCm container images). + // Link libamdhip64 whenever ROCm is reachable, not just when + // ACPP_TARGETS is hip-prefixed. ACPP_TARGETS=generic (SSCP JIT) on + // an AMD host still needs the HIP runtime at load time — + // librt-backend-hip.so dlopens libamdhip64, but glibc doesn't walk + // the binary's RUNPATH for transitive backend deps. By making + // libamdhip64 a direct dependency of the binary, the loader pulls + // it in at startup via RUNPATH, and AdaptiveCpp's runtime dlopen + // finds the already-loaded handle. Without this, an AMD-host + // build with the new RDNA1 default (generic instead of the + // gfx1013 spoof) fails at first queue construction with + // "No matching device" because HIP can't initialise. + // + // We pass the full .so path (rather than `cargo:rustc-link-lib=amdhip64` + // which becomes `-lamdhip64`) because the SSCP path emits no host- + // side HIP symbol references, and the linker's default --as-needed + // would drop a name-only -l flag from NEEDED. A positional path + // argument bypasses --as-needed and keeps the library in the link. + // Same approach as CMakeLists.txt's `link_libraries(.../libamdhip64.so)`. + let rocm_root = env::var("ROCM_PATH") + .unwrap_or_else(|_| "/opt/rocm".to_string()); + let amdhip_lib = format!("{rocm_root}/lib/libamdhip64.so"); + if acpp_targets.starts_with("hip:") || std::path::Path::new(&amdhip_lib).exists() { + println!("cargo:rustc-link-search=native={rocm_root}/lib"); + println!("cargo:rustc-link-search=native={rocm_root}/hip/lib"); + println!("cargo:rustc-link-arg=-Wl,-rpath,{rocm_root}/lib"); + if std::path::Path::new(&amdhip_lib).exists() { + // Wrap with --no-as-needed/--as-needed: even a positional + // .so path gets dropped from NEEDED by ld's --as-needed + // when no symbol references it (true for the SSCP path + // that has zero host-side HIP symbol refs). The library + // itself must end up in DT_NEEDED so AdaptiveCpp's runtime + // dlopen finds it already loaded; otherwise HIP backend + // never initialises and we throw "No matching device". + println!("cargo:rustc-link-arg=-Wl,--no-as-needed"); + println!("cargo:rustc-link-arg={amdhip_lib}"); + println!("cargo:rustc-link-arg=-Wl,--as-needed"); + } else { + // Fallback: ROCm not at /opt/rocm/lib but the user set + // ACPP_TARGETS=hip:* explicitly. AOT HIP fat binaries + // reference HIP symbols directly, so --as-needed keeps + // -lamdhip64 in NEEDED on that path. 
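+ // Either branch can be sanity-checked after a build with something like
+ // `readelf -d target/release/xchplot2 | grep amdhip` (binary path
+ // illustrative); libamdhip64 should show up as a NEEDED entry.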
+ println!("cargo:rustc-link-lib=amdhip64"); + } + } // C++ stdlib + POSIX bits the static libs (Rust std + pthread inside // pos2_keygen, std::async + std::thread in pos2_gpu_host) reach for. diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..b297cd1 --- /dev/null +++ b/compose.yaml @@ -0,0 +1,184 @@ +# compose.yaml — podman-first (also works with docker compose). +# +# Three vendor-specific services share one Containerfile, parameterized +# via build args. Pick one based on your GPU; the build context is the +# same so the AdaptiveCpp + xchplot2 build layers cache across services. +# +# Build & run examples: +# +# # NVIDIA (default sm_89 / RTX 4090; override via $CUDA_ARCH=120 etc.) +# podman compose build cuda +# podman compose run --rm cuda test 22 2 0 0 -G -o /out +# +# # NVIDIA Pascal/Volta (sm_61 / GTX 10-series, sm_70 / V100): CUDA 13.x +# # dropped codegen for pre-Turing archs, so pin to a 12.x base image. +# # scripts/build-container.sh does this automatically when it detects +# # CUDA_ARCH < 75; if invoking compose directly, set the base manually: +# CUDA_ARCH=61 \ +# BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \ +# BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \ +# podman compose build cuda +# +# # AMD ROCm — set $ACPP_GFX to your card's gfx target (rocminfo | grep gfx). +# # gfx1031 = Navi 22 (RX 6700/6700 XT/6800M) +# # gfx1100 = Navi 31 (RX 7900 XTX/XT) ← default +# # gfx900 = Vega 10 (RX Vega 56/64, MI25) +# ACPP_GFX=gfx1031 podman compose build rocm +# podman compose run --rm rocm test 22 2 0 0 -G -o /out +# +# # Intel oneAPI (experimental, untested). +# podman compose build intel +# +# Plot files land in ./plots/ on the host (mounted at /out in the +# container). + +services: + cuda: + build: + context: . + dockerfile: Containerfile + args: + # BASE_DEVEL / BASE_RUNTIME default to CUDA 13.x (latest, sm_75+). + # scripts/build-container.sh overrides both to nvidia/cuda:12.9.1 + # when it detects a pre-Turing GPU (Pascal/Volta, CUDA_ARCH < 75) + # — CUDA 13.0 dropped codegen for those archs. Set BASE_DEVEL + # explicitly to bypass the auto-pick (e.g. for cross-targeting an + # arch the host doesn't have). + BASE_DEVEL: "${BASE_DEVEL:-docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04}" + BASE_RUNTIME: "${BASE_RUNTIME:-docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04}" + ACPP_TARGETS: "generic" + XCHPLOT2_BUILD_CUDA: "ON" + INSTALL_CUDA_HEADERS: "0" + CUDA_ARCH: "${CUDA_ARCH:-89}" + image: xchplot2:cuda + # GPU pass-through. Works on both engines: + # - Docker (with nvidia-container-toolkit + `nvidia-ctk runtime + # configure --runtime=docker && systemctl restart docker`) + # - Podman 5.x (with podman-compose 1.x+; equivalent to + # `--device nvidia.com/gpu=all` via CDI) + # The previous `devices: nvidia.com/gpu=all` shorthand worked on + # podman but Docker silently ignored it as an unknown device path, + # leaving the container without libcuda.so.1 and producing a + # confusing "No matching device" failure mid-plot. + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + volumes: + - ./plots:/out + + rocm: + build: + context: . + dockerfile: Containerfile + args: + # Pinned to ROCm 6.2.x for two reasons: + # 1. ROCm 7.x's rocm-llvm package no longer ships LLVMConfig.cmake, + # so AdaptiveCpp's find_package(LLVM) can't run. + # 2. 
ROCm 6.2 ships LLVM 18.0git, matching Ubuntu's llvm-18 so the + # device bitcode (ocml.bc, ockl.bc) is readable by AdaptiveCpp + # built against Ubuntu's LLVM. No "Unknown attribute kind" + # mismatch. + # AdaptiveCpp is therefore built against Ubuntu's /usr/lib/llvm-18 + # (the Containerfile default), and ROCm provides its own clang + + # device libs at /opt/rocm/llvm for the HIP backend at runtime. + BASE_DEVEL: docker.io/rocm/dev-ubuntu-24.04:6.2-complete + BASE_RUNTIME: docker.io/rocm/dev-ubuntu-24.04:6.2-complete + # IMPORTANT: ACPP_GFX is intentionally *required* — no silent default. + # If it's unset the SYCL kernels are AOT-compiled for the wrong amdgcn + # ISA, which HIP loads without error but the kernels execute as silent + # no-ops at runtime (sort returns input, AES match finds zero results, + # plot content diverges from the canonical reference). That failure + # mode is extremely confusing to diagnose — it looks like a correctness + # bug in the kernels rather than a build-time config error. + # + # Set ACPP_GFX explicitly. If you sudo compose, pass the var through + # (sudo strips env by default): + # ACPP_GFX=gfx1031 sudo -E podman compose build rocm + # sudo ACPP_GFX=gfx1031 podman compose build rocm + # + # Common gfx targets (see `rocminfo | grep gfx`): + # gfx1030 = RDNA2 Navi 21 (RX 6800/6800 XT/6900 XT) + # gfx1031 = RDNA2 Navi 22 (RX 6700/6700 XT/6800M) + # gfx1100 = RDNA3 Navi 31 (RX 7900 XTX/XT) + # gfx1101 = RDNA3 Navi 32 (RX 7800 XT/7700 XT) + # gfx906 = Vega 20 (Radeon VII, MI50) + # gfx900 = Vega 10 (RX Vega 56/64, MI25) + # Use ${VAR:-default} (NOT ${VAR:?error}) so that building cuda + # / intel / cpu services without ACPP_GFX set doesn't trip a + # parse-time error — podman-compose evaluates :? across ALL + # services during YAML parse, not just the one being built. + # The placeholder value is intentionally invalid as a gfx + # target so AdaptiveCpp's HIP backend fails loudly with the + # placeholder string in its error message — much better than + # silently building wrong-arch amdgcn ISA from a default like + # gfx1100 (kernels would then execute as runtime no-ops, see + # the IMPORTANT block above). + ACPP_TARGETS: "hip:${ACPP_GFX:-MISSING-set-ACPP_GFX-or-use-scripts-build-container-sh}" + XCHPLOT2_BUILD_CUDA: "OFF" + # No CUDA headers on the AMD path — they conflict with HIP's + # uchar1/etc. typedefs. CudaHalfShim.hpp's __has_include guard + # handles the absence cleanly. + INSTALL_CUDA_HEADERS: "0" + image: xchplot2:rocm + devices: + - /dev/kfd + - /dev/dri + group_add: + - video + # Rootless podman's default seccomp filter + capability set blocks + # some of the KFD IOCTLs libhsa-runtime64 issues during DMA setup, + # which surfaces as a segfault inside the HSA runtime on the first + # host→device copy (rocminfo-level queries still work, so the + # failure is subtle and confusing). Loosen the sandbox just enough + # for HSA's DMA path. If rootless still fails on your host, run + # rootful + privileged instead: + # sudo podman run --rm --privileged --device /dev/kfd \ + # --device /dev/dri -v $PWD/plots:/out xchplot2:rocm \ + # plot -k 28 -n 10 -f -c -o /out + security_opt: + - seccomp=unconfined + cap_add: + - SYS_ADMIN + volumes: + - ./plots:/out + + intel: + build: + context: . 
+ dockerfile: Containerfile + args: + BASE_DEVEL: docker.io/intel/oneapi-basekit:latest + BASE_RUNTIME: docker.io/intel/oneapi-runtime:latest + ACPP_TARGETS: "generic" + XCHPLOT2_BUILD_CUDA: "OFF" + INSTALL_CUDA_HEADERS: "1" + image: xchplot2:intel + devices: + - /dev/dri + volumes: + - ./plots:/out + + cpu: + # CPU-only image: AdaptiveCpp's OpenMP backend compiles the SYCL + # kernels for the host CPU. No GPU runtime needed. Plotting is + # 1-2 orders of magnitude slower than GPU; useful for headless CI, + # dev machines without a GPU, or as an extra worker on a + # heterogeneous `--devices` list. See README's CPU section. + build: + context: . + dockerfile: Containerfile + args: + BASE_DEVEL: docker.io/ubuntu:24.04 + BASE_RUNTIME: docker.io/ubuntu:24.04 + ACPP_TARGETS: "omp" + XCHPLOT2_BUILD_CUDA: "OFF" + # AdaptiveCpp's libkernel/half.hpp includes cuda_fp16.h on every + # build path; pull the headers (no libcudart link, just headers). + INSTALL_CUDA_HEADERS: "1" + image: xchplot2:cpu + volumes: + - ./plots:/out diff --git a/keygen-rs/Cargo.lock b/keygen-rs/Cargo.lock index 6ed82bb..795af9a 100644 --- a/keygen-rs/Cargo.lock +++ b/keygen-rs/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + [[package]] name = "asn1-rs" version = "0.6.2" @@ -53,6 +59,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" +[[package]] +name = "base16ct" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd307490d624467aa6f74b0eabb77633d1f758a7b25f12bceb0b22e08d9726f6" + [[package]] name = "base64" version = "0.22.1" @@ -98,6 +110,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +dependencies = [ + "hybrid-array", +] + [[package]] name = "blst" version = "0.3.16" @@ -148,9 +169,9 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "chia" -version = "0.42.0" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff1f2c3905a718d77dd48a4f4653e1b29c9e39cd599c2de8fccb10970c563049" +checksum = "5fb7c121855983543518ab67cb1ebea7e52badc965e547f98d90ee6f728d6c06" dependencies = [ "chia-bls 0.42.0", "chia-client", @@ -170,17 +191,17 @@ dependencies = [ [[package]] name = "chia-bls" -version = "0.36.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f02cbfd038d9050d45edbe8f38e09391c73479c0cca5b37925daf48c4d4fcd4" +checksum = "a70dfe8540688eaed5bdecffd51c26df489b8bc610890b613b81461411f90cc9" dependencies = [ "blst", - "chia-sha2 0.36.1", - "chia-traits 0.36.1", + "chia-sha2 0.38.2", + "chia-traits 0.38.2", "hex", "hkdf", "linked-hash-map", - "sha2", + "sha2 0.10.9", "thiserror 1.0.69", ] @@ -198,7 +219,7 @@ dependencies = [ "hkdf", "linked-hash-map", "serde", - "sha2", + "sha2 0.10.9", "thiserror 1.0.69", ] @@ -335,8 +356,8 @@ checksum = "82c0c0303a91f6190b26ba8778f7b38438e79df02a5631b80269d3aa36372a76" dependencies = [ "chia-sha2 0.42.0", "hex", - "k256", - "p256", + "k256 0.13.4", + "p256 0.13.2", ] 
[[package]] @@ -351,11 +372,11 @@ dependencies = [ [[package]] name = "chia-sha2" -version = "0.36.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0934b0d6b878f29ba6c958e56e4b7158f9e687c200ffdca141dbc408a5cce42e" +checksum = "5a57be484b5abb4481a3ea8b2e6fc0404f41222e0cfb35b81269c2404b64107a" dependencies = [ - "sha2", + "sha2 0.10.9", ] [[package]] @@ -364,7 +385,7 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6636ca8bba852fc516eacf01b2c3964b6b290359e7d1e89b950e6754e2a1082" dependencies = [ - "sha2", + "sha2 0.10.9", ] [[package]] @@ -382,12 +403,12 @@ dependencies = [ [[package]] name = "chia-traits" -version = "0.36.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4922b447b2d8418213948af1a448c3ca7b84e149b51b2c87a2e00e80bb19b0" +checksum = "b13ea36e3ae5ede1d015d873fdfa91ea4d7a8790c6859c78b6b74065c7ddbbbd" dependencies = [ - "chia-sha2 0.36.1", - "chia_streamable_macro 0.36.1", + "chia-sha2 0.38.2", + "chia_streamable_macro 0.38.2", "thiserror 1.0.69", ] @@ -404,9 +425,9 @@ dependencies = [ [[package]] name = "chia_streamable_macro" -version = "0.36.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b60cefc5fe39f695816d42a327cbefad3d6d6a8ecadad1b58d7507067c25da8" +checksum = "4450a65b83cd89f8ccad2b4d5f8dc23e89ab0b6ae86d8c535ffde9fdc9d9c6c5" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -466,36 +487,54 @@ dependencies = [ [[package]] name = "clvmr" -version = "0.17.5" +version = "0.17.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56b333963b083468df9a15602fcc3a24fa3f8c3964569fb9d2415ac70c0820e9" +checksum = "3060bcd64cb8cf2b32fe6ee3a82698835c03361c8e1da446d2e9d058fbfffd5f" dependencies = [ "bitflags", "bitvec", "bumpalo", - "chia-bls 0.36.1", - "chia-sha2 0.36.1", + "chia-bls 0.38.2", + "chia-sha2 0.38.2", "hex", "hex-literal", - "k256", + "k256 0.14.0-rc.9", "lazy_static", "malachite-bigint", "num-bigint", "num-integer", "num-traits", - "p256", - "rand 0.8.6", + "p256 0.14.0-rc.9", + "rand 0.9.4", "sha1", "sha3", - "thiserror 1.0.69", + "thiserror 2.0.18", ] +[[package]] +name = "cmov" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f88a43d011fc4a6876cb7344703e297c71dda42494fee094d5f7c76bf13f746" + [[package]] name = "const-oid" version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const-oid" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + +[[package]] +name = "cpubits" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15b85f9c39137c3a891689859392b1bd49812121d0d61c9caf00d46ed5ce06ae" + [[package]] name = "cpufeatures" version = "0.2.17" @@ -505,6 +544,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -542,6 +590,22 @@ dependencies = [ "zeroize", ] +[[package]] +name = "crypto-bigint" +version = "0.7.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "42a0d26b245348befa0c121944541476763dcc46ede886c88f9d12e1697d27c3" +dependencies = [ + "cpubits", + "ctutils", + "getrandom 0.4.2", + "hybrid-array", + "num-traits", + "rand_core 0.10.1", + "subtle", + "zeroize", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -552,6 +616,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "crypto-common" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710" +dependencies = [ + "getrandom 0.4.2", + "hybrid-array", + "rand_core 0.10.1", +] + +[[package]] +name = "ctutils" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e" +dependencies = [ + "cmov", + "subtle", +] + [[package]] name = "data-encoding" version = "2.10.0" @@ -564,8 +649,19 @@ version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" dependencies = [ - "const-oid", - "pem-rfc7468", + "const-oid 0.9.6", + "pem-rfc7468 0.7.0", + "zeroize", +] + +[[package]] +name = "der" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b" +dependencies = [ + "const-oid 0.10.2", + "pem-rfc7468 1.0.0", "zeroize", ] @@ -598,12 +694,24 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", - "const-oid", - "crypto-common", + "block-buffer 0.10.4", + "const-oid 0.9.6", + "crypto-common 0.1.6", "subtle", ] +[[package]] +name = "digest" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" +dependencies = [ + "block-buffer 0.12.0", + "const-oid 0.10.2", + "crypto-common 0.2.1", + "ctutils", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -621,12 +729,27 @@ version = "0.16.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" dependencies = [ - "der", - "digest", - "elliptic-curve", - "rfc6979", - "signature", - "spki", + "der 0.7.10", + "digest 0.10.7", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + "signature 2.2.0", + "spki 0.7.3", +] + +[[package]] +name = "ecdsa" +version = "0.17.0-rc.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54fb064faabbee66e1fc8e5c5a9458d4269dc2d8b638fe86a425adb2510d1a96" +dependencies = [ + "der 0.8.0", + "digest 0.11.2", + "elliptic-curve 0.14.0-rc.32", + "rfc6979 0.5.0-rc.5", + "signature 3.0.0", + "spki 0.8.0", + "zeroize", ] [[package]] @@ -641,16 +764,38 @@ version = "0.13.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" dependencies = [ - "base16ct", - "crypto-bigint", - "digest", + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest 0.10.7", "ff", "generic-array", "group", - "pem-rfc7468", - "pkcs8", + "pem-rfc7468 0.7.0", + "pkcs8 0.10.2", "rand_core 0.6.4", - "sec1", + "sec1 0.7.3", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.14.0-rc.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda94f31325c4275e9706adecbb6f0650dee2f904c915a98e3d81adaaaa757aa" +dependencies = [ + "base16ct 1.0.0", + "crypto-bigint 0.7.3", + "crypto-common 0.2.1", + "digest 0.11.2", + "hybrid-array", + "once_cell", + "pem-rfc7468 1.0.0", + "pkcs8 0.11.0", + "rand_core 0.10.1", + "rustcrypto-ff", + "rustcrypto-group", + "sec1 0.8.1", "subtle", "zeroize", ] @@ -677,6 +822,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foldhash" version = "0.2.0" @@ -762,10 +913,24 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", + "wasip2", + "wasip3", +] + [[package]] name = "glob" version = "0.3.3" @@ -783,13 +948,22 @@ dependencies = [ "subtle", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + [[package]] name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" dependencies = [ - "foldhash", + "foldhash 0.2.0", ] [[package]] @@ -798,6 +972,12 @@ version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.5.2" @@ -822,7 +1002,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" dependencies = [ - "hmac", + "hmac 0.12.1", ] [[package]] @@ -831,7 +1011,16 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest", + "digest 0.10.7", +] + +[[package]] +name = "hmac" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f" +dependencies = [ + "digest 0.11.2", ] [[package]] @@ -850,6 +1039,23 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "hybrid-array" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5" +dependencies = [ + "subtle", + "typenum", + "zeroize", +] + +[[package]] +name = "id-arena" 
+version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "indexmap" version = "2.14.0" @@ -858,6 +1064,8 @@ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", "hashbrown 0.17.0", + "serde", + "serde_core", ] [[package]] @@ -892,11 +1100,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b" dependencies = [ "cfg-if", - "ecdsa", - "elliptic-curve", + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", "once_cell", - "sha2", - "signature", + "sha2 0.10.9", + "signature 2.2.0", +] + +[[package]] +name = "k256" +version = "0.14.0-rc.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b382cbfd43caf55991a93850ce538aa1aa67bb264af367d22dfe7937c4e997d" +dependencies = [ + "cpubits", + "ecdsa 0.17.0-rc.18", + "elliptic-curve 0.14.0-rc.32", + "sha2 0.11.0", + "signature 3.0.0", ] [[package]] @@ -905,7 +1126,7 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" dependencies = [ - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -917,6 +1138,12 @@ dependencies = [ "spin", ] +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.185" @@ -1104,10 +1331,23 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" dependencies = [ - "ecdsa", - "elliptic-curve", - "primeorder", - "sha2", + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder 0.13.6", + "sha2 0.10.9", +] + +[[package]] +name = "p256" +version = "0.14.0-rc.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b97e3bf0465157ae90975ff52dbeb1362ba618924878c9f74c25baa27a65f9a" +dependencies = [ + "ecdsa 0.17.0-rc.18", + "elliptic-curve 0.14.0-rc.32", + "primefield", + "primeorder 0.14.0-rc.9", + "sha2 0.11.0", ] [[package]] @@ -1135,6 +1375,15 @@ dependencies = [ "base64ct", ] +[[package]] +name = "pem-rfc7468" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9" +dependencies = [ + "base64ct", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -1147,9 +1396,9 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" dependencies = [ - "der", - "pkcs8", - "spki", + "der 0.7.10", + "pkcs8 0.10.2", + "spki 0.7.3", ] [[package]] @@ -1158,8 +1407,18 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" dependencies = [ - "der", - "spki", + "der 0.7.10", + "spki 0.7.3", +] + +[[package]] +name = "pkcs8" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "451913da69c775a56034ea8d9003d27ee8948e12443eae7c038ba100a4f21cb7" +dependencies = [ + "der 0.8.0", + "spki 0.8.0", ] [[package]] @@ -1175,7 +1434,7 @@ dependencies = [ "bech32", "chia", 
"hex", - "sha2", + "sha2 0.11.0", ] [[package]] @@ -1193,13 +1452,46 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "primefield" +version = "0.14.0-rc.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b52e6ee42db392378a95622b463c9740631171d1efce43fa445a569c1600cb6" +dependencies = [ + "crypto-bigint 0.7.3", + "crypto-common 0.2.1", + "rand_core 0.10.1", + "rustcrypto-ff", + "subtle", + "zeroize", +] + [[package]] name = "primeorder" version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" dependencies = [ - "elliptic-curve", + "elliptic-curve 0.13.8", +] + +[[package]] +name = "primeorder" +version = "0.14.0-rc.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0556580e42c19833f5d232aca11a7687a503ee41f937b54f5ae1d50fc2a6a36a" +dependencies = [ + "elliptic-curve 0.14.0-rc.32", ] [[package]] @@ -1236,6 +1528,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "radium" version = "0.7.0" @@ -1301,6 +1599,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "rayon" version = "1.12.0" @@ -1341,7 +1645,17 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" dependencies = [ - "hmac", + "hmac 0.12.1", + "subtle", +] + +[[package]] +name = "rfc6979" +version = "0.5.0-rc.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23a3127ee32baec36af75b4107082d9bd823501ec14a4e016be4b6b37faa74ae" +dependencies = [ + "hmac 0.13.0", "subtle", ] @@ -1365,20 +1679,41 @@ version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8573f03f5883dcaebdfcf4725caa1ecb9c15b2ef50c43a07b816e06799bb12d" dependencies = [ - "const-oid", - "digest", + "const-oid 0.9.6", + "digest 0.10.7", "num-bigint-dig", "num-integer", "num-traits", "pkcs1", - "pkcs8", + "pkcs8 0.10.2", "rand_core 0.6.4", - "signature", - "spki", + "signature 2.2.0", + "spki 0.7.3", "subtle", "zeroize", ] +[[package]] +name = "rustcrypto-ff" +version = "0.14.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd2a8adb347447693cd2ba0d218c4b66c62da9b0a5672b17b981e4291ec65ff6" +dependencies = [ + "rand_core 0.10.1", + "subtle", +] + +[[package]] +name = "rustcrypto-group" +version = "0.14.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "369f9b61aa45933c062c9f6b5c3c50ab710687eca83dd3802653b140b43f85ed" +dependencies = [ + "rand_core 0.10.1", + "rustcrypto-ff", + "subtle", +] + [[package]] name = "rusticata-macros" version = "4.1.0" @@ -1418,14 +1753,34 @@ version = 
"0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" dependencies = [ - "base16ct", - "der", + "base16ct 0.2.0", + "der 0.7.10", "generic-array", - "pkcs8", + "pkcs8 0.10.2", + "subtle", + "zeroize", +] + +[[package]] +name = "sec1" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56d437c2f19203ce5f7122e507831de96f3d2d4d3be5af44a0b0a09d8a80e4d" +dependencies = [ + "base16ct 1.0.0", + "ctutils", + "der 0.8.0", + "hybrid-array", "subtle", "zeroize", ] +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + [[package]] name = "serde" version = "1.0.228" @@ -1474,6 +1829,19 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + [[package]] name = "sha1" version = "0.10.6" @@ -1481,8 +1849,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", ] [[package]] @@ -1492,8 +1860,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.2", ] [[package]] @@ -1502,7 +1881,7 @@ version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60" dependencies = [ - "digest", + "digest 0.10.7", "keccak", ] @@ -1518,10 +1897,20 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ - "digest", + "digest 0.10.7", "rand_core 0.6.4", ] +[[package]] +name = "signature" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d567dcbaf0049cb8ac2608a76cd95ff9e4412e1899d389ee400918ca7537f5" +dependencies = [ + "digest 0.11.2", + "rand_core 0.10.1", +] + [[package]] name = "slab" version = "0.4.12" @@ -1557,7 +1946,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" dependencies = [ "base64ct", - "der", + "der 0.7.10", +] + +[[package]] +name = "spki" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d9efca8738c78ee9484207732f728b1ef517bbb1833d6fc0879ca898a522f6f" +dependencies = [ + "base64ct", + "der 0.8.0", ] [[package]] @@ -1736,9 +2135,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" [[package]] name = "unicode-ident" @@ -1746,6 +2145,12 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "untrusted" version = "0.9.0" @@ -1779,6 +2184,49 @@ dependencies = [ "wit-bindgen", ] +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "wide" version = "1.3.0" @@ -1891,6 +2339,88 @@ name = "wit-bindgen" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] [[package]] name = "wyz" @@ -1968,6 +2498,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + [[package]] name = "zstd" version = "0.13.3" diff --git a/keygen-rs/Cargo.toml b/keygen-rs/Cargo.toml index 0365b3d..02c4349 100644 --- a/keygen-rs/Cargo.toml +++ b/keygen-rs/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["staticlib"] [dependencies] chia = "0.42" bech32 = "0.11" -sha2 = "0.10" +sha2 = "0.11" [dev-dependencies] hex = "0.4" diff --git a/keygen-rs/src/lib.rs b/keygen-rs/src/lib.rs index 2f9e1b3..9126907 100644 --- a/keygen-rs/src/lib.rs +++ b/keygen-rs/src/lib.rs @@ -10,20 +10,20 @@ // byte-identical to `chia plots create --v2`. use chia::bls::{PublicKey, SecretKey}; -use chia::protocol::{Bytes32, compute_plot_id_v2}; +use chia::protocol::{compute_plot_id_v2, Bytes32}; use chia::sha2::Sha256; // --------------------------------------------------------------------------- // Result codes returned across the FFI boundary. // --------------------------------------------------------------------------- -pub const POS2_OK: i32 = 0; -pub const POS2_BAD_FARMER_PK: i32 = -1; -pub const POS2_BAD_POOL_KEY: i32 = -2; -pub const POS2_BAD_POOL_KIND: i32 = -3; +pub const POS2_OK: i32 = 0; +pub const POS2_BAD_FARMER_PK: i32 = -1; +pub const POS2_BAD_POOL_KEY: i32 = -2; +pub const POS2_BAD_POOL_KIND: i32 = -3; pub const POS2_MEMO_BUF_TOO_SMALL: i32 = -4; -pub const POS2_BAD_SEED: i32 = -5; -pub const POS2_BAD_ADDRESS: i32 = -6; -pub const POS2_BAD_HRP: i32 = -7; +pub const POS2_BAD_SEED: i32 = -5; +pub const POS2_BAD_ADDRESS: i32 = -6; +pub const POS2_BAD_HRP: i32 = -7; // pool_kind values. 
pub const POS2_POOL_PK: i32 = 0; // pool_key_or_ph points to 48 bytes (G1) @@ -108,8 +108,8 @@ pub unsafe extern "C" fn pos2_keygen_derive_plot( strength: u8, plot_index: u16, meta_group: u8, - out_plot_id: *mut u8, // 32 bytes written - out_memo_buf: *mut u8, // caller-owned buffer + out_plot_id: *mut u8, // 32 bytes written + out_memo_buf: *mut u8, // caller-owned buffer inout_memo_len: *mut usize, // in: capacity; out: bytes written ) -> i32 { if seed_len < 32 { @@ -117,48 +117,42 @@ pub unsafe extern "C" fn pos2_keygen_derive_plot( } let seed: &[u8] = unsafe { std::slice::from_raw_parts(seed_ptr, seed_len) }; - let farmer_pk_bytes: &[u8; 48] = - match unsafe { (farmer_pk_ptr as *const [u8; 48]).as_ref() } { - Some(b) => b, - None => return POS2_BAD_FARMER_PK, - }; + let farmer_pk_bytes: &[u8; 48] = match unsafe { (farmer_pk_ptr as *const [u8; 48]).as_ref() } { + Some(b) => b, + None => return POS2_BAD_FARMER_PK, + }; let farmer_pk = match PublicKey::from_bytes(farmer_pk_bytes) { Ok(pk) => pk, Err(_) => return POS2_BAD_FARMER_PK, }; - let (pool_pk_opt, pool_ph_opt, pool_key_slice): ( - Option, - Option, - &[u8], - ) = match pool_kind { - x if x == POS2_POOL_PK => { - let bytes: &[u8; 48] = - match unsafe { (pool_key_ptr as *const [u8; 48]).as_ref() } { + let (pool_pk_opt, pool_ph_opt, pool_key_slice): (Option, Option, &[u8]) = + match pool_kind { + x if x == POS2_POOL_PK => { + let bytes: &[u8; 48] = match unsafe { (pool_key_ptr as *const [u8; 48]).as_ref() } { Some(b) => b, None => return POS2_BAD_POOL_KEY, }; - let pk = match PublicKey::from_bytes(bytes) { - Ok(pk) => pk, - Err(_) => return POS2_BAD_POOL_KEY, - }; - (Some(pk), None, &bytes[..]) - } - x if x == POS2_POOL_PH => { - let bytes: &[u8; 32] = - match unsafe { (pool_key_ptr as *const [u8; 32]).as_ref() } { + let pk = match PublicKey::from_bytes(bytes) { + Ok(pk) => pk, + Err(_) => return POS2_BAD_POOL_KEY, + }; + (Some(pk), None, &bytes[..]) + } + x if x == POS2_POOL_PH => { + let bytes: &[u8; 32] = match unsafe { (pool_key_ptr as *const [u8; 32]).as_ref() } { Some(b) => b, None => return POS2_BAD_POOL_KEY, }; - let ph: Bytes32 = (*bytes).into(); - (None, Some(ph), &bytes[..]) - } - _ => return POS2_BAD_POOL_KIND, - }; + let ph: Bytes32 = (*bytes).into(); + (None, Some(ph), &bytes[..]) + } + _ => return POS2_BAD_POOL_KIND, + }; let master_sk = SecretKey::from_seed(seed); - let local_sk = master_sk_to_local_sk(&master_sk); - let local_pk = local_sk.public_key(); + let local_sk = master_sk_to_local_sk(&master_sk); + let local_pk = local_sk.public_key(); let include_taproot = pool_ph_opt.is_some(); let plot_pk = generate_plot_public_key(&local_pk, &farmer_pk, include_taproot); @@ -185,11 +179,7 @@ pub unsafe extern "C" fn pos2_keygen_derive_plot( std::ptr::copy_nonoverlapping(plot_id.as_ref().as_ptr(), out_plot_id, 32); let dst = out_memo_buf; std::ptr::copy_nonoverlapping(pool_key_slice.as_ptr(), dst, pool_key_slice.len()); - std::ptr::copy_nonoverlapping( - farmer_pk_bytes.as_ptr(), - dst.add(pool_key_slice.len()), - 48, - ); + std::ptr::copy_nonoverlapping(farmer_pk_bytes.as_ptr(), dst.add(pool_key_slice.len()), 48); std::ptr::copy_nonoverlapping( master_sk_bytes.as_ptr(), dst.add(pool_key_slice.len() + 48), @@ -223,7 +213,7 @@ pub unsafe extern "C" fn pos2_keygen_decode_address( // bech32 0.11: decode returns (Hrp, Vec) with the 8-bit payload. 
let (hrp, data) = match bech32::decode(s) { - Ok(x) => x, + Ok(x) => x, Err(_) => return POS2_BAD_ADDRESS, }; let h = hrp.as_str(); @@ -251,7 +241,7 @@ pub unsafe extern "C" fn pos2_keygen_decode_address( pub unsafe extern "C" fn pos2_keygen_derive_subseed( base_seed: *const u8, // 32 bytes idx: u64, - out_seed: *mut u8, // 32 bytes + out_seed: *mut u8, // 32 bytes ) -> i32 { use sha2::{Digest, Sha256}; if base_seed.is_null() || out_seed.is_null() { @@ -275,19 +265,23 @@ mod tests { // Same inputs must produce identical plot_id + memo. #[test] fn deterministic_same_seed() { - let seed = [0xAA_u8; 32]; + let seed = [0xAA_u8; 32]; let farmer_pk = SecretKey::from_seed(&[0xBB_u8; 32]).public_key().to_bytes(); - let pool_ph = [0xCC_u8; 32]; + let pool_ph = [0xCC_u8; 32]; let mut pid1 = [0u8; 32]; let mut memo1 = vec![0u8; 128]; let mut mlen1: usize = memo1.len(); let rc1 = unsafe { pos2_keygen_derive_plot( - seed.as_ptr(), seed.len(), + seed.as_ptr(), + seed.len(), farmer_pk.as_ptr(), - pool_ph.as_ptr(), POS2_POOL_PH, - 2, 0, 0, + pool_ph.as_ptr(), + POS2_POOL_PH, + 2, + 0, + 0, pid1.as_mut_ptr(), memo1.as_mut_ptr(), &mut mlen1, @@ -301,10 +295,14 @@ mod tests { let mut mlen2: usize = memo2.len(); let rc2 = unsafe { pos2_keygen_derive_plot( - seed.as_ptr(), seed.len(), + seed.as_ptr(), + seed.len(), farmer_pk.as_ptr(), - pool_ph.as_ptr(), POS2_POOL_PH, - 2, 0, 0, + pool_ph.as_ptr(), + POS2_POOL_PH, + 2, + 0, + 0, pid2.as_mut_ptr(), memo2.as_mut_ptr(), &mut mlen2, diff --git a/scripts/build-container.sh b/scripts/build-container.sh new file mode 100755 index 0000000..439699d --- /dev/null +++ b/scripts/build-container.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +# +# build-container.sh — auto-detect GPU vendor on the host and run the +# matching `podman compose build ` with the right env vars. +# +# Container builds can't probe the GPU themselves (no device access), +# so this script does it from the host before invoking compose. +# +# Usage: +# ./scripts/build-container.sh # auto-detect +# ./scripts/build-container.sh --gpu nvidia # force NVIDIA +# ./scripts/build-container.sh --gpu amd # force AMD +# ./scripts/build-container.sh --gpu intel # force Intel +# ./scripts/build-container.sh --gpu cpu # CPU-only (AdaptiveCpp OpenMP) +# ./scripts/build-container.sh --no-cache # force clean rebuild +# ./scripts/build-container.sh --engine docker # use docker compose instead + +set -euo pipefail + +ENGINE=podman +GPU="" +declare -a EXTRA_BUILD_ARGS=() + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu) GPU="$2"; shift 2 ;; + --engine) ENGINE="$2"; shift 2 ;; + # Force a clean rebuild (ignore podman/docker layer cache). Useful + # after a host upgrade (new nvcc / new AdaptiveCpp release / etc.) + # where the cached layers reference stale toolchain versions. + --no-cache) EXTRA_BUILD_ARGS+=("--no-cache"); shift 1 ;; + -h|--help) sed -n '2,/^$/p' "$0" | sed 's/^# \?//'; exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +# ── Detect vendor ─────────────────────────────────────────────────────────── +# Capture output first so `set -o pipefail` doesn't bite us — rocminfo and +# some nvidia-smi configurations exit non-zero even when they print useful +# information, and the pipefail bash setting then makes the entire pipeline +# return non-zero regardless of grep's match status. 
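For illustration (not part of this patch), the failure mode the comment above describes, next to the capture-first pattern the script actually uses:

    # Fragile: some rocminfo / nvidia-smi setups exit non-zero even after
    # printing a usable line, so under `set -o pipefail` the pipeline reports
    # failure regardless of what grep matched, i.e. a false negative.
    if rocminfo | grep -q gfx; then GPU=amd; fi

    # Capture-first: tolerate the probe's exit status, then match the text.
    rocm_out=$(rocminfo 2>/dev/null || true)
    if [[ "$rocm_out" == *gfx* ]]; then GPU=amd; fi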
+if [[ -z "$GPU" ]]; then + nvidia_out="" + rocm_out="" + if command -v nvidia-smi >/dev/null; then + nvidia_out=$(nvidia-smi -L 2>/dev/null || true) + fi + if command -v rocminfo >/dev/null; then + rocm_out=$(rocminfo 2>/dev/null || true) + fi + + if [[ "$nvidia_out" == *GPU* ]]; then + GPU=nvidia + elif [[ "$rocm_out" == *gfx* ]]; then + GPU=amd + else + echo "[build-container] No GPU detected via nvidia-smi or rocminfo." >&2 + echo "[build-container]" >&2 + echo "[build-container] Either:" >&2 + echo "[build-container] 1. Run scripts/install-container-deps.sh, which installs the" >&2 + echo "[build-container] discovery tool (nvidia-smi / rocminfo) along with the" >&2 + echo "[build-container] container engine + GPU runtime." >&2 + echo "[build-container] 2. Install the discovery tool manually:" >&2 + echo "[build-container] Arch: sudo pacman -S nvidia-utils (NVIDIA)" >&2 + echo "[build-container] sudo pacman -S rocminfo (AMD)" >&2 + echo "[build-container] Ubuntu: sudo apt install nvidia-utils-XXX (NVIDIA)" >&2 + echo "[build-container] sudo apt install rocminfo (AMD)" >&2 + echo "[build-container] 3. Force a service explicitly:" >&2 + echo "[build-container] $0 --gpu nvidia | amd | intel" >&2 + echo "[build-container] 4. Or build a CPU-only image (slow plotting, no GPU needed):" >&2 + echo "[build-container] $0 --gpu cpu" >&2 + exit 1 + fi +fi + +# ── Map vendor → compose service + env ────────────────────────────────────── +case "$GPU" in + nvidia) + SERVICE=cuda + # Enumerate ALL GPUs and build a fat binary (CMake's "61;86" + # list syntax) so heterogeneous rigs (e.g. 1070 + 3060) get + # native sm_NN codegen for each card, not just whichever one + # nvidia-smi happened to list first. Single-card hosts produce + # a single-arch list ("89") — same end result as the prior + # head -1 path. Skip the probe entirely if the user pre-set + # CUDA_ARCH (single arch or "61;86" list) so cross-targeting + # an absent GPU still works. + if [[ -z "${CUDA_ARCH:-}" ]] && command -v nvidia-smi >/dev/null; then + # sed first (strip the dot), then sort -un (numeric dedup). + # Without the numeric sort, 1070+5090 would emit "120;61" + # because sort -u defaults to lexicographic. + caps=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null \ + | sed 's/\.//' | sort -un) + if [[ -n "$caps" ]]; then + # Split assignment from export so a non-zero exit from the + # subshell pipeline propagates instead of being masked by + # `export`'s own success (shellcheck SC2155). + CUDA_ARCH=$(echo "$caps" | paste -sd';') + export CUDA_ARCH + fi + fi + : "${CUDA_ARCH:=89}" + export CUDA_ARCH + # Min arch drives the toolkit choice: a 1070+3060 mix needs a + # toolchain that targets sm_61, not just sm_86. Works for + # single-arch CUDA_ARCH=89 (min=89) and for user-set lists + # like "61;86" (min=61). + min_arch=$(echo "$CUDA_ARCH" | tr ';' '\n' | sort -n | head -1) + # CUDA 13.0 dropped codegen for sm_50/52/53/60/61/62/70/72 entirely + # — its nvcc fails the CMake TryCompile probe with "Unsupported gpu + # architecture 'compute_61'" on Pascal, "compute_70" on Volta, etc. + # Pin builds with ANY pre-Turing card to the last 12.x dev image, + # which still covers sm_50 (Maxwell) through sm_120 (Blackwell), so + # a mixed 1070+3060 (or 1070+5090) rig gets one toolchain that + # handles every arch in the list. Honour an explicit BASE_DEVEL / + # BASE_RUNTIME override from the env so users can pin to a + # different toolkit if they need to. 
+ if (( min_arch < 75 )) && [[ -z "${BASE_DEVEL:-}" ]]; then + export BASE_DEVEL="docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04" + export BASE_RUNTIME="${BASE_RUNTIME:-$BASE_DEVEL}" + echo "[build-container] sm_${min_arch} (pre-Turing) detected → pinning CUDA 12.9 base (CUDA 13.x dropped sub-Turing codegen)" + fi + echo "[build-container] vendor=nvidia service=$SERVICE CUDA_ARCH=$CUDA_ARCH" + ;; + amd) + SERVICE=rocm + # Reuse the rocminfo output captured during vendor detection (or + # capture it now if --gpu amd was forced and rocm_out is empty). + # Avoid `rocminfo | awk '...; exit'` because awk's early exit + # SIGPIPEs rocminfo, and pipefail + set -e then kills the script. + if [[ -z "${rocm_out:-}" ]] && command -v rocminfo >/dev/null; then + rocm_out=$(rocminfo 2>/dev/null || true) + fi + # Honour an explicit ACPP_GFX from the env first (lets the user + # cross-target a different GPU than the host one), else autodetect. + if [[ -z "${ACPP_GFX:-}" ]]; then + if [[ -n "${rocm_out:-}" && "$rocm_out" =~ (gfx[0-9a-f]+) ]]; then + detected_gfx="${BASH_REMATCH[1]}" + # RDNA1 workaround: gfx1010/1011/1012 aren't direct + # AdaptiveCpp HIP targets. Community-tested (Radeon Pro + # W5700) that gfx1013 is ISA-close enough to run on + # gfx1010 silicon. Not parity-validated. + case "$detected_gfx" in + gfx1010|gfx1011|gfx1012) + echo "[build-container] RDNA1 $detected_gfx detected — " \ + "using gfx1013 spoof (community workaround, not " \ + "parity-validated; verify plots with \`xchplot2 " \ + "verify\` before farming)" >&2 + export ACPP_GFX=gfx1013 + ;; + *) + export ACPP_GFX="$detected_gfx" + ;; + esac + fi + fi + if [[ -z "${ACPP_GFX:-}" ]]; then + # No silent fallback: a wrong gfx target produces an image that + # builds clean and runs without errors, but the AOT amdgcn ISA + # is for the wrong arch and the SYCL kernels execute as silent + # no-ops at runtime (sort returns input unchanged, AES match + # finds zero results, plot output diverges from reference). + # Fail loud here instead. + echo "[build-container] ERROR: couldn't detect AMD gfx target." >&2 + echo "[build-container] Either install rocminfo so the host probe finds it," >&2 + echo "[build-container] or set ACPP_GFX explicitly to your card's arch:" >&2 + echo "[build-container] ACPP_GFX=gfx1012 $0 --gpu amd # RX 5500 XT 4GB (RDNA1 — auto-spoofed to gfx1013)" >&2 + echo "[build-container] ACPP_GFX=gfx1030 $0 --gpu amd # RX 6800 / 6800 XT / 6900 XT" >&2 + echo "[build-container] ACPP_GFX=gfx1031 $0 --gpu amd # RX 6700 XT / 6700 / 6800M" >&2 + echo "[build-container] ACPP_GFX=gfx1034 $0 --gpu amd # RX 6500 XT / 6400 (4 GiB → minimal tier)" >&2 + echo "[build-container] ACPP_GFX=gfx1100 $0 --gpu amd # RX 7900 XTX / XT" >&2 + echo "[build-container] (run \"rocminfo | grep gfx\" if available)" >&2 + exit 1 + fi + echo "[build-container] vendor=amd service=$SERVICE ACPP_GFX=$ACPP_GFX" + ;; + intel) + SERVICE=intel + echo "[build-container] vendor=intel service=$SERVICE (experimental, untested)" + ;; + cpu) + # CPU-only build: AdaptiveCpp's OpenMP backend, no GPU at runtime. + # Useful for headless CI, dev machines without a GPU, or as a + # secondary worker on a `--devices` list alongside real GPUs. + # Plotting throughput will be 1-2 orders of magnitude lower than + # GPU — see README's CPU section for the perf expectations. 
+ SERVICE=cpu + echo "[build-container] vendor=cpu service=$SERVICE (AdaptiveCpp OpenMP backend; slow plotting, see README)" + ;; + *) + echo "unknown --gpu value: $GPU (expected nvidia|amd|intel|cpu)" >&2 + exit 1 + ;; +esac + +# ── Invoke compose ────────────────────────────────────────────────────────── +case "$ENGINE" in + podman) COMPOSE=(podman compose) ;; + docker) COMPOSE=(docker compose) ;; + *) echo "unknown --engine: $ENGINE (expected podman|docker)" >&2; exit 1 ;; +esac + +set -x +"${COMPOSE[@]}" build "${EXTRA_BUILD_ARGS[@]}" "$SERVICE" diff --git a/scripts/install-container-deps.sh b/scripts/install-container-deps.sh new file mode 100755 index 0000000..edb60a5 --- /dev/null +++ b/scripts/install-container-deps.sh @@ -0,0 +1,489 @@ +#!/usr/bin/env bash +# +# install-container-deps.sh — bootstrap the host packages required to +# build & run xchplot2's container images via scripts/build-container.sh. +# +# Native build deps (CUDA Toolkit, ROCm SDK, LLVM 18+, AdaptiveCpp, +# Boost.Context, libnuma, libomp, Rust) all live INSIDE the container +# image — the host does not need any of them. This script only +# installs: +# 1. A container engine + compose plugin: `podman` + `podman-compose` +# (default), or `docker` + the `docker compose` v2 plugin via +# `--engine docker`. +# 2. The GPU discovery tool used by build-container.sh's autodetect +# (`nvidia-smi` for NVIDIA, `rocminfo` for AMD). build-container.sh +# *errors* on AMD if ACPP_GFX can't be resolved, so rocminfo isn't +# strictly optional unless you pass ACPP_GFX through the env. +# 3. The GPU container runtime: `nvidia-container-toolkit` + a CDI +# spec at /etc/cdi/nvidia.yaml (podman) or the docker runtime hook +# (docker) for NVIDIA. AMD and Intel only need /dev/kfd | /dev/dri +# access via the `video` and `render` groups; this script adds +# the invoking user to both. +# +# For NATIVE host builds (no container) use scripts/install-deps.sh +# instead — that path needs the full CUDA / ROCm / LLVM / AdaptiveCpp +# stack on the host and takes 30-45 min on a first run. +# +# Usage: +# scripts/install-container-deps.sh # auto-detect distro + GPU +# scripts/install-container-deps.sh --gpu nvidia +# scripts/install-container-deps.sh --gpu amd +# scripts/install-container-deps.sh --gpu intel +# scripts/install-container-deps.sh --gpu cpu # engine only, no GPU runtime +# scripts/install-container-deps.sh --engine docker # docker instead of podman +# scripts/install-container-deps.sh --no-nvidia-repo # skip adding NVIDIA's apt/dnf repo +# scripts/install-container-deps.sh --dry-run # print the plan, change nothing +# +# Supported distros: Arch family, Ubuntu/Debian, Fedora/RHEL. + +set -euo pipefail + +ENGINE=podman +GPU="" +ADD_NVIDIA_REPO=1 +DRY_RUN=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu) GPU="$2"; shift 2 ;; + --engine) ENGINE="$2"; shift 2 ;; + --no-nvidia-repo) ADD_NVIDIA_REPO=0; shift ;; + --dry-run) DRY_RUN=1; shift ;; + -h|--help) sed -n '2,/^$/p' "$0" | sed 's/^# \?//'; exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +case "$ENGINE" in + podman|docker) ;; + *) echo "[install-container-deps] unknown --engine: $ENGINE (expected podman|docker)" >&2; exit 1 ;; +esac + +# ── Helpers ───────────────────────────────────────────────────────────────── +# In dry-run mode every mutating call is replaced with a `+ sudo …` stub; +# probes (`command -v`, `[[ -f ]]`, etc.) still run as normal because they +# don't change host state and the planning logic depends on them. 
The `+ ` +# prefix mirrors `set -x`'s syntax so dry-run output reads as an executable +# trace. +sudo_or_dry() { + if (( DRY_RUN )); then + printf '+ sudo %s\n' "$*" + else + sudo "$@" + fi +} + +apt_update_or_dry() { + if (( DRY_RUN )); then + printf '+ sudo apt-get update\n' + else + sudo apt-get update + fi +} + +# Curl-piped-to-(sudo tee | sudo gpg --dearmor) write. Records "+ write +# DEST (from URL)" in dry-run mode. `mode=dearmor` covers the apt +# gpgkey path; default mode is plain tee. +write_url_or_dry() { + local url="$1" dest="$2" mode="${3:-cat}" + if (( DRY_RUN )); then + case "$mode" in + dearmor) printf '+ write %s (gpg --dearmor from %s)\n' "$dest" "$url" ;; + *) printf '+ write %s (from %s)\n' "$dest" "$url" ;; + esac + return + fi + case "$mode" in + dearmor) + curl -fsSL "$url" \ + | sudo gpg --batch --yes --dearmor -o "$dest" + ;; + *) + curl -fsSL "$url" | sudo tee "$dest" >/dev/null + ;; + esac +} + +# ── Detect distro ─────────────────────────────────────────────────────────── +if [[ ! -f /etc/os-release ]]; then + echo "[install-container-deps] Cannot detect distro: /etc/os-release missing" >&2 + exit 1 +fi +# shellcheck source=/dev/null +. /etc/os-release +DISTRO=$ID +DISTRO_LIKE=${ID_LIKE:-} + +# ── Detect GPU vendor ─────────────────────────────────────────────────────── +# Two-tier strategy mirroring install-deps.sh: tool-based first (authoritative +# when the driver is loaded), PCI vendor-ID fallback (works pre-driver). The +# driver tools cannot be a hard prerequisite because installing them is one +# of the things this script is supposed to do. +detect_gpu_via_pci() { + local found="" entry name vendor + for entry in /sys/class/drm/card*; do + name=$(basename "$entry") + # Skip connector entries like card0-DP-1; only the bare cardN + # nodes carry a `device/vendor` attribute we can read. + [[ "$name" =~ ^card[0-9]+$ ]] || continue + [[ -r "$entry/device/vendor" ]] || continue + vendor=$(cat "$entry/device/vendor" 2>/dev/null) + case "$vendor" in + 0x10de) found="nvidia"; break ;; # highest precedence + 0x1002) found="amd" ;; # overrides intel + 0x8086) [[ -z "$found" ]] && found="intel" ;; # only if nothing else + esac + done + echo "$found" +} + +# Skip autodetect under --dry-run — CI containers have no GPU, and tests +# always pass --gpu explicitly. Avoids "could not auto-detect" exit on +# headless runners. +if [[ -z "$GPU" ]] && (( ! DRY_RUN )); then + if command -v nvidia-smi >/dev/null && nvidia-smi -L 2>/dev/null | grep -q GPU; then + GPU=nvidia + echo "[install-container-deps] Detected NVIDIA GPU (nvidia-smi)." + elif command -v rocminfo >/dev/null && rocminfo 2>/dev/null | grep -q gfx; then + GPU=amd + echo "[install-container-deps] Detected AMD GPU (rocminfo)." + else + GPU=$(detect_gpu_via_pci) + if [[ -n "$GPU" ]]; then + echo "[install-container-deps] Detected $GPU GPU via /sys/class/drm (PCI vendor ID); driver tools not yet installed." + fi + fi +fi + +if [[ -z "$GPU" ]]; then + if (( DRY_RUN )); then + echo "[install-container-deps] --dry-run requires --gpu to be set explicitly" >&2 + else + echo "[install-container-deps] Could not auto-detect a GPU. Pass" >&2 + echo "[install-container-deps] --gpu nvidia | amd | intel | cpu" >&2 + echo "[install-container-deps] explicitly. Use --gpu cpu for a GPU-less host" >&2 + echo "[install-container-deps] (CPU-only image; slow plotting, see README)." 
>&2 + fi + exit 1 +fi + +case "$GPU" in + nvidia|amd|intel|cpu) ;; + *) echo "[install-container-deps] unknown --gpu: $GPU (expected nvidia|amd|intel|cpu)" >&2; exit 1 ;; +esac + +echo "[install-container-deps] distro=$DISTRO, gpu=$GPU, engine=$ENGINE" + +# ── Per-distro packages ───────────────────────────────────────────────────── +install_arch() { + local pkgs=() + case "$ENGINE" in + podman) pkgs+=(podman podman-compose) ;; + docker) pkgs+=(docker docker-compose docker-buildx) ;; + esac + case "$GPU" in + # nvidia-utils provides nvidia-smi (used by build-container.sh's + # CUDA_ARCH probe). nvidia-container-toolkit provides nvidia-ctk + + # the CDI / runtime hook libraries for GPU pass-through. + nvidia) pkgs+=(nvidia-utils nvidia-container-toolkit) ;; + # rocminfo: build-container.sh fails fast on AMD if ACPP_GFX can't + # be resolved from rocminfo (compose.yaml's ACPP_TARGETS default + # is a deliberately invalid placeholder so wrong-arch builds fail + # loudly instead of silently producing no-op kernels). + # No ROCm SDK on the host — that lives inside the container. + amd) pkgs+=(rocminfo) ;; + esac + sudo_or_dry pacman -S --needed --noconfirm "${pkgs[@]}" +} + +install_apt() { + apt_update_or_dry + + local pkgs=() + case "$ENGINE" in + # podman-compose lags upstream on LTS but covers what + # build-container.sh exercises (build/run, no fancy flags). + podman) pkgs+=(podman podman-compose) ;; + # docker.io = Ubuntu's stock dockerd. The compose v2 plugin name + # varies (24.04: docker-compose-v2 in universe; via Docker's + # official repo: docker-compose-plugin). Resolved below. + docker) pkgs+=(docker.io docker-buildx) ;; + esac + case "$GPU" in + nvidia) + # nvidia-utils-XXX is suffixed with the loaded driver branch. + # If a driver is already loaded, pin the matching utils branch + # via /proc/driver/nvidia/version. If no driver is loaded, skip + # — nvidia-container-toolkit still works without nvidia-smi, + # it just means build-container.sh can't autodetect CUDA_ARCH. + local drv_major="" + if (( DRY_RUN )); then + # Use a placeholder so dry-run output stays deterministic + # regardless of whether the runner has a driver loaded. + drv_major="" + elif [[ -r /proc/driver/nvidia/version ]]; then + drv_major=$(grep -oE '[0-9]+\.[0-9]+' /proc/driver/nvidia/version 2>/dev/null \ + | head -1 | cut -d. -f1) + fi + if [[ -n "$drv_major" ]]; then + pkgs+=("nvidia-utils-$drv_major") + else + echo "[install-container-deps] No loaded NVIDIA driver detected via" >&2 + echo "[install-container-deps] /proc/driver/nvidia/version. Skipping" >&2 + echo "[install-container-deps] nvidia-utils-* — install your driver" >&2 + echo "[install-container-deps] first, or pass --gpu nvidia + CUDA_ARCH" >&2 + echo "[install-container-deps] manually to build-container.sh." >&2 + fi + ;; + amd) pkgs+=(rocminfo) ;; + esac + sudo_or_dry apt-get install -y --no-install-recommends "${pkgs[@]}" + + # Docker compose v2 plugin: the package name varies by source. + # `docker-compose-v2` ships in 24.04+ universe; `docker-compose-plugin` + # ships in Docker's official deb repo. Both install the same binary at + # /usr/libexec/docker/cli-plugins/docker-compose. build-container.sh + # uses the v2 `docker compose ` syntax, so we MUST install one + # of these two — the legacy v1 `docker-compose` (Python) won't work. + if [[ "$ENGINE" == docker ]]; then + local compose_pkg="docker-compose-v2" + if (( ! 
DRY_RUN )); then + compose_pkg="" + for cand in docker-compose-v2 docker-compose-plugin; do + if apt-cache show "$cand" >/dev/null 2>&1; then + compose_pkg="$cand"; break + fi + done + if [[ -z "$compose_pkg" ]]; then + echo "[install-container-deps] No compose v2 package available in apt." >&2 + echo "[install-container-deps] Add Docker's official repo for docker-compose-plugin:" >&2 + echo "[install-container-deps] https://docs.docker.com/engine/install/ubuntu/" >&2 + echo "[install-container-deps] Or use --engine podman (default; tested with compose.yaml)." >&2 + exit 1 + fi + fi + sudo_or_dry apt-get install -y --no-install-recommends "$compose_pkg" + fi + + # nvidia-container-toolkit isn't in stock Ubuntu/Debian repos. Pull it + # from NVIDIA's official apt repo (the path NVIDIA's own docs use). + if [[ "$GPU" == nvidia ]]; then + if [[ $ADD_NVIDIA_REPO -eq 1 ]] \ + && { (( DRY_RUN )) || [[ ! -f /etc/apt/sources.list.d/nvidia-container-toolkit.list ]]; }; then + echo "[install-container-deps] Adding NVIDIA's container-toolkit apt repo to /etc/apt/sources.list.d/." + sudo_or_dry install -m 0755 -d /usr/share/keyrings + write_url_or_dry \ + https://nvidia.github.io/libnvidia-container/gpgkey \ + /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + dearmor + # The repo file gets a sed transform to inject signed-by= ; + # in dry-run we record the URL → dest, which is the bit + # users actually care about. + if (( DRY_RUN )); then + write_url_or_dry \ + https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + /etc/apt/sources.list.d/nvidia-container-toolkit.list + else + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list >/dev/null + fi + apt_update_or_dry + fi + sudo_or_dry apt-get install -y --no-install-recommends nvidia-container-toolkit + fi +} + +install_dnf() { + local pkgs=() + case "$ENGINE" in + podman) + # Fedora's first-class engine — both packages are in the stock + # repos (podman is the default container tool on Fedora 36+). + pkgs+=(podman podman-compose) + ;; + docker) + # docker isn't in Fedora/RHEL stock repos; the user has to add + # docker-ce.repo per Docker's docs first. Bail rather than + # silently fail mid-install. Skip the precondition check in + # dry-run so the planning output stays useful even in CI + # containers that haven't added the repo. + if (( ! DRY_RUN )); then + if ! sudo dnf list --installed docker-ce >/dev/null 2>&1 \ + && ! sudo dnf list --installed docker >/dev/null 2>&1; then + echo "[install-container-deps] Docker is not in Fedora/RHEL stock repos." >&2 + echo "[install-container-deps] Add docker-ce.repo per Docker's docs first," >&2 + echo "[install-container-deps] then re-run this script. Or use --engine podman" >&2 + echo "[install-container-deps] (default; Fedora's first-class engine)." >&2 + exit 1 + fi + fi + pkgs+=(docker-compose-plugin docker-buildx-plugin) + ;; + esac + case "$GPU" in + nvidia) + # Hint only — Fedora's nvidia driver lives in RPMFusion and + # auto-enabling third-party repos behind the user's back is + # rude. nvidia-container-toolkit (added below) comes from + # NVIDIA's own repo, which is already a precedent set by + # NVIDIA's docs. + if (( ! DRY_RUN )) && ! 
command -v nvidia-smi >/dev/null; then + echo "[install-container-deps] WARNING: nvidia-smi not on PATH." >&2 + echo "[install-container-deps] Enable RPMFusion + install akmod-nvidia (or" >&2 + echo "[install-container-deps] akmod-nvidia-open) for the host driver, or" >&2 + echo "[install-container-deps] pass --gpu nvidia + CUDA_ARCH manually." >&2 + fi + ;; + amd) pkgs+=(rocminfo) ;; + esac + if [[ ${#pkgs[@]} -gt 0 ]]; then + sudo_or_dry dnf install -y "${pkgs[@]}" + fi + + if [[ "$GPU" == nvidia ]]; then + if [[ $ADD_NVIDIA_REPO -eq 1 ]] \ + && { (( DRY_RUN )) || [[ ! -f /etc/yum.repos.d/nvidia-container-toolkit.repo ]]; }; then + echo "[install-container-deps] Adding NVIDIA's container-toolkit dnf repo to /etc/yum.repos.d/." + write_url_or_dry \ + https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ + /etc/yum.repos.d/nvidia-container-toolkit.repo + fi + sudo_or_dry dnf install -y nvidia-container-toolkit + fi +} + +# ── Distro-agnostic post-install (NVIDIA only) ────────────────────────────── +configure_nvidia_runtime() { + if (( ! DRY_RUN )) && ! command -v nvidia-ctk >/dev/null; then + echo "[install-container-deps] WARNING: nvidia-ctk not on PATH — skipping CDI / runtime setup." >&2 + return + fi + case "$ENGINE" in + podman) + # CDI spec at /etc/cdi/nvidia.yaml lets `--device nvidia.com/gpu=all` + # (and the `deploy.resources.reservations.devices` shorthand in + # compose.yaml's cuda service) resolve to real GPUs. Re-run after + # driver upgrades — the spec hard-codes device file paths. + sudo_or_dry install -m 0755 -d /etc/cdi + sudo_or_dry nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml + echo "[install-container-deps] Generated CDI spec at /etc/cdi/nvidia.yaml." + # nvidia-ctk's "discoverer" enumerates every NVIDIA-related path + # the driver could expose — Vulkan ICDs, X11 configs, the + # fabric-manager / MPS / IMEX sockets, etc. — and prints WARN + # lines for ones it can't find. On any non-server, headless + # GPU host most of these won't be present; the spec gracefully + # omits them. Tell the user up front so the WARN volume on the + # next line doesn't look like a failure. + echo "[install-container-deps] (WARNings about libnvidia-vulkan-producer / X11 configs /" + echo "[install-container-deps] fabric-manager / MPS / IMEX from nvidia-ctk are expected on" + echo "[install-container-deps] non-server hosts — those are optional features the spec" + echo "[install-container-deps] gracefully omits when not present.)" + ;; + docker) + # Writes /etc/docker/daemon.json's `runtimes.nvidia` entry + + # restarts dockerd so the change takes effect. + sudo_or_dry nvidia-ctk runtime configure --runtime=docker + if (( DRY_RUN )); then + printf '+ sudo systemctl restart docker\n' + else + sudo systemctl restart docker || true + fi + echo "[install-container-deps] Configured docker NVIDIA runtime + restarted dockerd." + ;; + esac +} + +# ── Distro-agnostic post-install (AMD / Intel) ────────────────────────────── +# /dev/kfd (AMD) and /dev/dri (AMD + Intel) are group-owned by `video` (and +# `render` on newer udev/systemd setups). Add the invoking user to both so +# rootless containers can pass the device through. Effective on next login. +add_user_to_video_render_groups() { + local target_user + if (( DRY_RUN )); then + # Stable placeholder so the fixture doesn't depend on $USER. 
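As an aside on the membership test further down in this function: it splits the `id -nG` output into lines and uses `grep -qx` because a plain substring grep can false-positive on group names that merely contain the target (the group names below are made up for illustration):

    groups_out="wheel video-extra docker"                 # hypothetical `id -nG` output
    echo "$groups_out" | grep -q  video                   # matches: wrong, user is not in `video`
    echo "$groups_out" | tr ' ' '\n' | grep -qx video     # no match: correct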
+ target_user="" + else + target_user="${SUDO_USER:-${USER:-}}" + if [[ -z "$target_user" || "$target_user" == root ]]; then + echo "[install-container-deps] Skipping group membership (no non-root user detected)." + return + fi + fi + for grp in video render; do + if (( ! DRY_RUN )); then + getent group "$grp" >/dev/null 2>&1 || continue + if id -nG "$target_user" | tr ' ' '\n' | grep -qx "$grp"; then + continue + fi + fi + sudo_or_dry usermod -aG "$grp" "$target_user" + echo "[install-container-deps] Added $target_user to group $grp (re-login to apply)." + done +} + +# ── Enable docker daemon when applicable ──────────────────────────────────── +enable_docker_service() { + [[ "$ENGINE" == docker ]] || return 0 + if (( ! DRY_RUN )); then + command -v systemctl >/dev/null || return 0 + fi + if (( DRY_RUN )); then + printf '+ sudo systemctl enable --now docker.service\n' + else + sudo systemctl enable --now docker.service || true + fi +} + +# ── Distro dispatch ───────────────────────────────────────────────────────── +case "$DISTRO" in + arch|cachyos|manjaro|endeavouros) install_arch ;; + ubuntu|debian|pop|linuxmint) install_apt ;; + fedora|rhel|centos|rocky|almalinux) install_dnf ;; + *) + case "$DISTRO_LIKE" in + *arch*) install_arch ;; + *debian*) install_apt ;; + *rhel*|*fedora*) install_dnf ;; + *) + echo "[install-container-deps] Unknown distro '$DISTRO'. Install equivalents of:" + if [[ "$ENGINE" == podman ]]; then + echo " podman + podman-compose" + else + echo " docker + docker-compose-v2 (or docker-compose-plugin) + docker-buildx" + fi + case "$GPU" in + nvidia) echo " nvidia-container-toolkit (from NVIDIA's repo: https://nvidia.github.io/libnvidia-container/)" ;; + amd) echo " rocminfo (only used by build-container.sh's ACPP_GFX autodetect)" ;; + esac + exit 1 + ;; + esac + ;; +esac + +enable_docker_service + +case "$GPU" in + nvidia) configure_nvidia_runtime ;; + amd|intel) add_user_to_video_render_groups ;; + cpu) : ;; +esac + +# ── Final notes ───────────────────────────────────────────────────────────── +echo +echo "[install-container-deps] Done." +echo " Build the image:" +echo " ./scripts/build-container.sh --engine $ENGINE${GPU:+ --gpu $GPU}" +case "$GPU" in + amd|intel) + echo " If this run added you to the video / render groups, log out" + echo " and back in before running plots — group changes only take" + echo " effect for fresh login sessions." + ;; + nvidia) + echo " After future NVIDIA driver upgrades, re-run this script (or" + echo " re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure" + echo " manually) so the CDI spec / docker runtime hook stays current." + ;; +esac diff --git a/scripts/install-deps.sh b/scripts/install-deps.sh new file mode 100755 index 0000000..8d98085 --- /dev/null +++ b/scripts/install-deps.sh @@ -0,0 +1,309 @@ +#!/usr/bin/env bash +# +# install-deps.sh — bootstrap xchplot2's native build dependencies. +# +# Installs CUDA Toolkit on NVIDIA, ROCm HIP SDK on AMD, LLVM 18+, +# AdaptiveCpp 25.10, and a Rust toolchain via rustup. After this completes, +# you can build with either: +# cargo install --git https://github.com/Jsewill/xchplot2 +# # or: +# cmake -B build -S . 
&& cmake --build build -j +# +# Usage: +# scripts/install-deps.sh # auto-detect distro + GPU +# scripts/install-deps.sh --no-acpp # skip AdaptiveCpp build (use FetchContent) +# scripts/install-deps.sh --gpu amd # force AMD path (CUDA headers only) +# scripts/install-deps.sh --gpu nvidia # force NVIDIA path (full CUDA Toolkit) +# +# Supported distros: Arch family, Ubuntu/Debian, Fedora/RHEL. +# For anything else, install the equivalents listed at the bottom and +# build AdaptiveCpp from source manually. + +set -euo pipefail + +ACPP_REF=${ACPP_REF:-v25.10.0} +ACPP_PREFIX=${ACPP_PREFIX:-/opt/adaptivecpp} +SKIP_ACPP=0 +GPU="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --no-acpp) SKIP_ACPP=1; shift ;; + --gpu) GPU="$2"; shift 2 ;; + -h|--help) sed -n '2,/^$/p' "$0" | sed 's/^# \?//'; exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +# ── Detect distro ─────────────────────────────────────────────────────────── +if [[ ! -f /etc/os-release ]]; then + echo "Cannot detect distro: /etc/os-release missing" >&2 + exit 1 +fi +# shellcheck source=/dev/null +. /etc/os-release +DISTRO=$ID +DISTRO_LIKE=${ID_LIKE:-} + +# ── Detect GPU vendor (NVIDIA / AMD / Intel) ──────────────────────────────── +# Two-tier detection so a fresh OS install (no driver tools yet) still works: +# 1. Tool-based (nvidia-smi / rocminfo) — authoritative when available, +# because it confirms the driver+runtime is functional, not just that +# a card is plugged in. +# 2. PCI vendor ID via /sys/class/drm — works pre-driver. The whole point +# of running install-deps.sh is to install the driver/toolkit, so we +# can't require the driver tools as a prerequisite for detection. +# +# Precedence (when multiple GPUs are present): NVIDIA > AMD > Intel. +# Matches the build.rs vendor-precedence logic. +detect_gpu_via_pci() { + local found="" entry name vendor + for entry in /sys/class/drm/card*; do + name=$(basename "$entry") + # Skip connector entries like card0-DP-1 — only the bare cardN + # nodes have a `device/vendor` attribute we care about. + [[ "$name" =~ ^card[0-9]+$ ]] || continue + [[ -r "$entry/device/vendor" ]] || continue + vendor=$(cat "$entry/device/vendor" 2>/dev/null) + case "$vendor" in + 0x10de) found="nvidia"; break ;; # highest precedence + 0x1002) found="amd" ;; # overrides intel + 0x8086) [[ -z "$found" ]] && found="intel" ;; # only if nothing else + esac + done + echo "$found" +} + +if [[ -z "$GPU" ]]; then + if command -v nvidia-smi >/dev/null && nvidia-smi -L 2>/dev/null | grep -q GPU; then + GPU=nvidia + echo "[install-deps] Detected NVIDIA GPU (nvidia-smi)." + elif command -v rocminfo >/dev/null && rocminfo 2>/dev/null | grep -q gfx; then + GPU=amd + echo "[install-deps] Detected AMD GPU (rocminfo)." + else + GPU=$(detect_gpu_via_pci) + if [[ -n "$GPU" ]]; then + echo "[install-deps] Detected $GPU GPU via /sys/class/drm (PCI vendor ID); driver tools not yet installed." + fi + fi +fi + +if [[ -z "$GPU" ]]; then + echo "[install-deps] Could not auto-detect a GPU (no nvidia-smi / rocminfo," >&2 + echo "[install-deps] no usable PCI device under /sys/class/drm)." >&2 + echo "[install-deps] Pass --gpu nvidia or --gpu amd explicitly to override." >&2 + echo "[install-deps] Headless / CI builds: --gpu nvidia installs the LLVM" >&2 + echo "[install-deps] toolchain + CUDA Toolkit headers used by the SYCL path." >&2 + exit 1 +fi + +if [[ "$GPU" == "intel" ]]; then + echo "[install-deps] Intel GPU detected, but install-deps.sh has no Intel-" >&2 + echo "[install-deps] specific package path yet. 
Options:" >&2 + echo "[install-deps] --gpu nvidia install LLVM + CUDA headers (the SYCL" >&2 + echo "[install-deps] path JITs onto Intel via AdaptiveCpp's" >&2 + echo "[install-deps] generic SSCP target at runtime)" >&2 + echo "[install-deps] ./scripts/build-container.sh container with Intel oneAPI" >&2 + exit 1 +fi +echo "[install-deps] distro=$DISTRO, gpu=$GPU, acpp=${ACPP_REF}, prefix=${ACPP_PREFIX}" + +# ── Per-distro packages ───────────────────────────────────────────────────── +install_arch() { + local pkgs=(cmake git base-devel python ninja + llvm clang lld + boost numactl curl) + case "$GPU" in + nvidia) pkgs+=(cuda) ;; + # rocminfo: needed by build-container.sh + scripts/install-deps.sh + # autodetection (rocm-hip-sdk doesn't pull it transitively). + # No CUDA pkg on the AMD path — CudaHalfShim.hpp guards the CUDA + # headers via __has_include, and pulling CUDA alongside HIP causes + # uchar1/char1 typedef redefinitions. + amd) pkgs+=(rocm-hip-sdk rocm-device-libs rocminfo) ;; + esac + sudo pacman -S --needed --noconfirm "${pkgs[@]}" +} + +install_apt() { + local pkgs=(cmake git ninja-build build-essential python3 pkg-config + llvm-18 llvm-18-dev clang-18 lld-18 libclang-18-dev libclang-cpp18-dev + libboost-context-dev libnuma-dev libomp-18-dev curl ca-certificates) + case "$GPU" in + nvidia) pkgs+=(nvidia-cuda-toolkit) ;; + amd) pkgs+=(rocm-hip-sdk rocm-libs rocminfo) + # rocminfo is the discovery tool build-container.sh probes; + # not pulled in transitively by rocm-hip-sdk. + # No nvidia-cuda-toolkit-headers on the AMD path — + # CudaHalfShim.hpp guards the CUDA headers via + # __has_include, and pulling CUDA alongside HIP causes + # uchar1/char1 typedef redefinitions. + ;; + esac + sudo apt-get update + sudo apt-get install -y --no-install-recommends "${pkgs[@]}" +} + +install_dnf() { + local pkgs=(cmake git ninja-build gcc-c++ python3 pkg-config + llvm llvm-devel clang clang-devel lld + boost-devel numactl-devel libomp-devel curl) + case "$GPU" in + nvidia) pkgs+=(cuda-toolkit) ;; + # No cuda-toolkit on the AMD path — CudaHalfShim.hpp guards the + # CUDA headers via __has_include, and pulling CUDA alongside HIP + # causes uchar1/char1 typedef redefinitions. + amd) pkgs+=(rocm-hip-devel rocminfo) ;; + esac + sudo dnf install -y "${pkgs[@]}" +} + +case "$DISTRO" in + arch|cachyos|manjaro|endeavouros) install_arch ;; + ubuntu|debian|pop|linuxmint) install_apt ;; + fedora|rhel|centos|rocky|almalinux) install_dnf ;; + *) + case "$DISTRO_LIKE" in + *arch*) install_arch ;; + *debian*) install_apt ;; + *rhel*|*fedora*) install_dnf ;; + *) + echo "[install-deps] Unknown distro '$DISTRO'. Install equivalents of:" + echo " CMake ≥ 3.24, Ninja, LLVM 18+, clang 18+, libclang dev," + echo " Boost.Context, libnuma, libomp, Python 3, git," + if [[ "$GPU" == "nvidia" ]]; then + echo " CUDA Toolkit 12+ (with nvcc)" + else + echo " ROCm 6+ HIP SDK (rocm-hip-sdk / rocm-hip-devel)" + fi + echo "Then re-run with --no-acpp to skip pkg install and only build AdaptiveCpp." + exit 1 + ;; + esac + ;; +esac + +# ── Rust toolchain via rustup ─────────────────────────────────────────────── +if ! 
command -v cargo >/dev/null; then + echo "[install-deps] Installing Rust toolchain via rustup" + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --default-toolchain stable --profile minimal + export PATH=$HOME/.cargo/bin:$PATH +fi + +# ── AdaptiveCpp ───────────────────────────────────────────────────────────── +if [[ $SKIP_ACPP -eq 1 ]]; then + echo "[install-deps] Skipping AdaptiveCpp build per --no-acpp." + echo "[install-deps] CMakeLists will FetchContent it automatically (slow first build)." + exit 0 +fi + +if [[ -d "$ACPP_PREFIX" ]] && [[ -f "$ACPP_PREFIX/lib/cmake/AdaptiveCpp/AdaptiveCppConfig.cmake" ]]; then + echo "[install-deps] AdaptiveCpp already installed at $ACPP_PREFIX. Skipping." + exit 0 +fi + +ACPP_BUILD_DIR=$(mktemp -d -t xchplot2-acpp-XXXXXX) +trap 'rm -rf "$ACPP_BUILD_DIR"' EXIT + +# ── Find a compatible LLVM ────────────────────────────────────────────────── +# AdaptiveCpp 25.10 only supports LLVM 16-20. On rolling distros (Arch, +# Fedora rawhide) the system LLVM is often 21+, which AdaptiveCpp rejects +# with "LLVM versions greater than 20 are not yet tested/supported". Probe +# the conventional install prefixes for the newest usable LLVM and pin +# AdaptiveCpp to it explicitly. Fail fast with a distro-specific install +# hint rather than letting AdaptiveCpp's CMake fail mid-configure. +LLVM_ROOT="" +for cand in \ + /usr/lib/llvm-20 /usr/lib/llvm-19 /usr/lib/llvm-18 \ + /usr/lib/llvm-17 /usr/lib/llvm-16 \ + /usr/lib/llvm20 /usr/lib/llvm19 /usr/lib/llvm18 \ + /usr/lib64/llvm20 /usr/lib64/llvm19 /usr/lib64/llvm18 \ + /opt/llvm20 /opt/llvm-20 /opt/llvm19 /opt/llvm-19 \ + /opt/llvm18 /opt/llvm-18; do + if [[ -x "$cand/bin/clang" ]] && [[ -x "$cand/bin/ld.lld" ]]; then + ver=$("$cand/bin/clang" --version 2>/dev/null \ + | head -1 | grep -oE 'version [0-9]+' | grep -oE '[0-9]+') + if [[ -n "$ver" ]] && (( ver >= 16 && ver <= 20 )); then + LLVM_ROOT="$cand" + break + fi + fi +done + +if [[ -z "$LLVM_ROOT" ]]; then + echo "[install-deps] No compatible LLVM (16-20) with ld.lld found." >&2 + echo "[install-deps] AdaptiveCpp $ACPP_REF only supports LLVM 16-20." >&2 + echo "[install-deps] Install one and re-run, or use the container path:" >&2 + case "$DISTRO" in + arch|cachyos|manjaro|endeavouros) + echo " yay -S llvm18-bin lld18-bin # or paru -S, or any AUR helper" >&2 ;; + ubuntu|debian|pop|linuxmint) + echo " sudo apt install llvm-18 llvm-18-dev clang-18 lld-18 libomp-18-dev" >&2 ;; + fedora|rhel|centos|rocky|almalinux) + echo " sudo dnf install llvm18 llvm18-devel clang18 lld18-devel" >&2 ;; + *) + echo " install LLVM 16-20 + clang + ld.lld for your distro" >&2 ;; + esac + echo " ./scripts/build-container.sh # container has LLVM 18 pinned" >&2 + exit 1 +fi +echo "[install-deps] Using LLVM at $LLVM_ROOT for AdaptiveCpp build." + +# ── ROCm device libs path (AMD only) ──────────────────────────────────────── +# AdaptiveCpp's HIP backend needs ockl.bc / ocml.bc to compile kernels for +# amdgcn. The bitcode location moved between ROCm versions; probe the +# common spots. CMake will warn if the path's missing on AMD; without a +# match here, the build fails with "ROCm device library path not found". 
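If the probe described above comes up empty on a nonstandard ROCm layout, the path can be located by hand and fed through the same CMake variable the loop below populates (the `find` invocation here is an illustrative assumption, not something the script runs):

    bc=$(find /opt/rocm* -name ockl.bc 2>/dev/null | head -1)
    if [[ -n "$bc" ]]; then
        echo "configure AdaptiveCpp with -DROCM_DEVICE_LIBS_PATH=$(dirname "$bc")"
    fi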
+ACPP_ROCM_FLAGS=() +if [[ "$GPU" == "amd" ]]; then + for d in \ + /opt/rocm/amdgcn/bitcode \ + /opt/rocm/lib/llvm-amdgpu/amdgcn/bitcode \ + /opt/rocm/share/amdgcn/bitcode; do + if [[ -f "$d/ockl.bc" ]]; then + ACPP_ROCM_FLAGS+=(-DROCM_DEVICE_LIBS_PATH="$d") + echo "[install-deps] ROCm device libs: $d" + break + fi + done +fi + +echo "[install-deps] Building AdaptiveCpp $ACPP_REF in $ACPP_BUILD_DIR" +git clone --depth 1 --branch "$ACPP_REF" \ + https://github.com/AdaptiveCpp/AdaptiveCpp.git "$ACPP_BUILD_DIR/src" + +# AMD-only builds don't need AdaptiveCpp's CUDA backend. Skip the +# `find_package(CUDA)` probe that AdaptiveCpp's CMakeLists runs at +# line ~122: on hosts where a CUDA headers subset is installed (distro +# `cuda` package, JetPack fragments, /usr/lib from some wrappers), the +# probe finds a partial install and AdaptiveCpp's own `FindCUDA.cmake` +# emits `CUDAToolkit_LIBRARY_ROOT /usr/lib does not point to the +# correct directory, try setting it manually`. The warning is cosmetic +# (AdaptiveCpp continues without CUDA), but it looks like an error to +# users skimming the install log. +ACPP_CUDA_DISABLE=() +if [[ "$GPU" == "amd" ]]; then + ACPP_CUDA_DISABLE+=(-DCMAKE_DISABLE_FIND_PACKAGE_CUDA=TRUE) +fi + +cmake -S "$ACPP_BUILD_DIR/src" -B "$ACPP_BUILD_DIR/build" -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="$ACPP_PREFIX" \ + -DCMAKE_C_COMPILER="$LLVM_ROOT/bin/clang" \ + -DCMAKE_CXX_COMPILER="$LLVM_ROOT/bin/clang++" \ + -DLLVM_DIR="$LLVM_ROOT/lib/cmake/llvm" \ + -DACPP_LLD_PATH="$LLVM_ROOT/bin/ld.lld" \ + "${ACPP_CUDA_DISABLE[@]}" \ + "${ACPP_ROCM_FLAGS[@]}" +cmake --build "$ACPP_BUILD_DIR/build" --parallel +sudo cmake --install "$ACPP_BUILD_DIR/build" + +echo +echo "[install-deps] Done." +echo " AdaptiveCpp: $ACPP_PREFIX" +echo " Build xchplot2:" +echo " export CMAKE_PREFIX_PATH=$ACPP_PREFIX:\$CMAKE_PREFIX_PATH" +echo " cargo install --path . # or:" +echo " cmake -B build -S . && cmake --build build -j" diff --git a/scripts/test-multi-gpu.sh b/scripts/test-multi-gpu.sh new file mode 100755 index 0000000..6bb7fb2 --- /dev/null +++ b/scripts/test-multi-gpu.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +# +# test-multi-gpu.sh — smoke test for the --devices flag. +# +# Two passes: +# +# 1. Argument-parsing checks. Runs xchplot2 against an empty manifest +# (run_batch returns before touching the GPU, so these work on any +# host including CI with no GPU visible). +# +# 2. Live multi-device plot, runtime-gated. Skipped automatically when +# < 2 GPUs are enumerable — so single-GPU dev boxes just see the +# parse checks run green, and a 2+ GPU rig exercises the fan-out. +# +# Usage: +# scripts/test-multi-gpu.sh [path/to/xchplot2] +# +# If the path is omitted, falls back to `xchplot2` on PATH (so +# `cargo install --path .` followed by this script works out of the +# box). + +set -u +XCHPLOT2="${1:-$(command -v xchplot2 || true)}" +if [[ -z "$XCHPLOT2" || ! -x "$XCHPLOT2" ]]; then + echo "ERROR: xchplot2 not found. Pass path as \$1 or put it on \$PATH." 
>&2 + exit 1 +fi + +PASS=0; FAIL=0; SKIP=0 +pass() { printf ' \e[32mPASS\e[0m: %s\n' "$1"; PASS=$((PASS+1)); } +fail() { printf ' \e[31mFAIL\e[0m: %s\n' "$1"; FAIL=$((FAIL+1)); } +skip() { printf ' \e[33mSKIP\e[0m: %s\n' "$1"; SKIP=$((SKIP+1)); } + +EMPTY_TSV=$(mktemp -t xchplot2-empty-XXXXXX.tsv) +TMP_OUT=$(mktemp -d -t xchplot2-multigpu-out-XXXXXX) +trap 'rm -rf "$EMPTY_TSV" "$TMP_OUT"' EXIT + +check_accept() { + local desc="$1"; shift + if "$XCHPLOT2" batch "$EMPTY_TSV" "$@" >/dev/null 2>&1; then + pass "accepts $desc" + else + fail "accepts $desc (exit $?)" + fi +} +check_reject() { + local desc="$1"; shift + if ! "$XCHPLOT2" batch "$EMPTY_TSV" "$@" >/dev/null 2>&1; then + pass "rejects $desc" + else + fail "rejects $desc (should have exited nonzero)" + fi +} + +echo "==> --devices argument parsing ($XCHPLOT2)" +check_accept "'all'" --devices all +check_accept "single id '0'" --devices 0 +check_accept "explicit list" --devices 0,1,2 +check_reject "garbage spec" --devices badspec +check_reject "negative id" --devices -1 +check_reject "empty value" --devices "" + +# --- Live multi-GPU plot (runtime-gated) --- +echo "==> multi-device plot" + +# GPU_COUNT source of truth: +# - Explicit override lets a CI / test runner force-skip or force-run. +# - nvidia-smi works on both the main (SYCL+CUDA) and cuda-only branches +# whenever the target GPUs are NVIDIA, which covers every multi-GPU +# rig we realistically expect to hit. AMD-only multi-GPU can use +# `XCHPLOT2_TEST_GPU_COUNT=N scripts/test-multi-gpu.sh`. +GPU_COUNT="${XCHPLOT2_TEST_GPU_COUNT:-}" +if [[ -z "$GPU_COUNT" ]]; then + if command -v nvidia-smi >/dev/null 2>&1; then + GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits 2>/dev/null \ + | head -1 | tr -d ' ' || echo 0) + fi + GPU_COUNT="${GPU_COUNT:-0}" +fi + +if [[ "$GPU_COUNT" -lt 2 ]]; then + skip "need >=2 GPUs (got $GPU_COUNT); set XCHPLOT2_TEST_GPU_COUNT=N to override" +else + # k=22 is the smallest k the pipeline supports; two plots give each + # worker one entry to process under round-robin partition. + # + # We build a MANIFEST with pre-computed plot_id_hex + memo_hex (the + # `batch` subcommand feeds these straight to run_gpu_pipeline) rather + # than invoking `plot` with synthetic BLS keys — pos2_keygen rejects + # anything that isn't a real G1 public key with rc=-1 before the + # pipeline ever sees it. + LIVE_TSV="$TMP_OUT/live.tsv" + printf '22\t2\t0\t0\t0\tabababababababababababababababababababababababababababababababab\t00\t%s\tm1.plot2\n22\t2\t1\t0\t0\tcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd\t00\t%s\tm2.plot2\n' \ + "$TMP_OUT" "$TMP_OUT" > "$LIVE_TSV" + + if "$XCHPLOT2" batch "$LIVE_TSV" --devices 0,1 >"$TMP_OUT/log" 2>&1 + then + # Two output files expected, each starting with the 'pos2' magic. + local_ok=1 + shopt -s nullglob + plots=("$TMP_OUT"/m?.plot2) + if [[ "${#plots[@]}" -ne 2 ]]; then + fail "expected 2 plots, got ${#plots[@]}" + local_ok=0 + else + for p in "${plots[@]}"; do + magic=$(head -c 4 "$p" | tr -d '\0') + if [[ "$magic" != "pos2" ]]; then + fail "bad magic in $(basename "$p"): '$magic'" + local_ok=0 + fi + done + fi + if (( local_ok )); then + pass "wrote 2 k=22 plots across devices 0,1" + fi + else + fail "batch --devices 0,1 failed (see $TMP_OUT/log)" + sed 's/^/ /' "$TMP_OUT/log" + fi + + echo "==> cross-device byte-stability" + # 4-entry manifest exercises round-robin (2 plots per worker on a + # 2-GPU rig). 
Plot output must be byte-identical regardless of + # which worker ran it; if --devices 0 and --devices 0,1 produce + # different SHAs for the same plot_id, the multi-device path has + # introduced non-determinism we shouldn't ship. + SD_DIR="$TMP_OUT/sd" + MD_DIR="$TMP_OUT/md" + mkdir -p "$SD_DIR" "$MD_DIR" + SD_TSV="$TMP_OUT/parity_sd.tsv" + MD_TSV="$TMP_OUT/parity_md.tsv" + { + a64=$(printf '%64s' '' | tr ' ' a) + b64=$(printf '%64s' '' | tr ' ' b) + c64=$(printf '%64s' '' | tr ' ' c) + d64=$(printf '%64s' '' | tr ' ' d) + printf '22\t2\t0\t0\t0\t%s\t00\t%s\tp0.plot2\n' "$a64" "$SD_DIR" + printf '22\t2\t1\t0\t0\t%s\t00\t%s\tp1.plot2\n' "$b64" "$SD_DIR" + printf '22\t2\t2\t0\t0\t%s\t00\t%s\tp2.plot2\n' "$c64" "$SD_DIR" + printf '22\t2\t3\t0\t0\t%s\t00\t%s\tp3.plot2\n' "$d64" "$SD_DIR" + } > "$SD_TSV" + sed "s|$SD_DIR|$MD_DIR|g" "$SD_TSV" > "$MD_TSV" + + if "$XCHPLOT2" batch "$SD_TSV" --devices 0 >"$TMP_OUT/sd.log" 2>&1 \ + && "$XCHPLOT2" batch "$MD_TSV" --devices 0,1 >"$TMP_OUT/md.log" 2>&1 + then + parity_ok=1 + for f in "$SD_DIR"/p?.plot2; do + name=$(basename "$f") + sd_sha=$(sha256sum "$f" | awk '{print $1}') + md_sha=$(sha256sum "$MD_DIR/$name" | awk '{print $1}') + if [[ "$sd_sha" != "$md_sha" ]]; then + fail "byte mismatch on $name (sd=${sd_sha:0:12} md=${md_sha:0:12})" + parity_ok=0 + fi + done + if (( parity_ok )); then + pass "single-device and multi-device produced byte-identical plots" + fi + else + fail "cross-device parity batches failed (logs in $TMP_OUT/sd.log, md.log)" + fi +fi + +echo +printf '==> %d passed, %d failed, %d skipped\n' "$PASS" "$FAIL" "$SKIP" +exit $(( FAIL > 0 ? 1 : 0 )) diff --git a/scripts/test/install-container-deps/arch.txt b/scripts/test/install-container-deps/arch.txt new file mode 100644 index 0000000..058ac4d --- /dev/null +++ b/scripts/test/install-container-deps/arch.txt @@ -0,0 +1,112 @@ +=== engine=podman gpu=nvidia === +[install-container-deps] distro=arch, gpu=nvidia, engine=podman ++ sudo pacman -S --needed --noconfirm podman podman-compose nvidia-utils nvidia-container-toolkit ++ sudo install -m 0755 -d /etc/cdi ++ sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml +[install-container-deps] Generated CDI spec at /etc/cdi/nvidia.yaml. +[install-container-deps] (WARNings about libnvidia-vulkan-producer / X11 configs / +[install-container-deps] fabric-manager / MPS / IMEX from nvidia-ctk are expected on +[install-container-deps] non-server hosts — those are optional features the spec +[install-container-deps] gracefully omits when not present.) + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=podman gpu=amd === +[install-container-deps] distro=arch, gpu=amd, engine=podman ++ sudo pacman -S --needed --noconfirm podman podman-compose rocminfo ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. 
+ +=== engine=podman gpu=intel === +[install-container-deps] distro=arch, gpu=intel, engine=podman ++ sudo pacman -S --needed --noconfirm podman podman-compose ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=cpu === +[install-container-deps] distro=arch, gpu=cpu, engine=podman ++ sudo pacman -S --needed --noconfirm podman podman-compose + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu cpu + +=== engine=docker gpu=nvidia === +[install-container-deps] distro=arch, gpu=nvidia, engine=docker ++ sudo pacman -S --needed --noconfirm docker docker-compose docker-buildx nvidia-utils nvidia-container-toolkit ++ sudo systemctl enable --now docker.service ++ sudo nvidia-ctk runtime configure --runtime=docker ++ sudo systemctl restart docker +[install-container-deps] Configured docker NVIDIA runtime + restarted dockerd. + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=docker gpu=amd === +[install-container-deps] distro=arch, gpu=amd, engine=docker ++ sudo pacman -S --needed --noconfirm docker docker-compose docker-buildx rocminfo ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=intel === +[install-container-deps] distro=arch, gpu=intel, engine=docker ++ sudo pacman -S --needed --noconfirm docker docker-compose docker-buildx ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=cpu === +[install-container-deps] distro=arch, gpu=cpu, engine=docker ++ sudo pacman -S --needed --noconfirm docker docker-compose docker-buildx ++ sudo systemctl enable --now docker.service + +[install-container-deps] Done. 
+ Build the image: + ./scripts/build-container.sh --engine docker --gpu cpu + diff --git a/scripts/test/install-container-deps/fedora.txt b/scripts/test/install-container-deps/fedora.txt new file mode 100644 index 0000000..9fb1a7c --- /dev/null +++ b/scripts/test/install-container-deps/fedora.txt @@ -0,0 +1,118 @@ +=== engine=podman gpu=nvidia === +[install-container-deps] distro=fedora, gpu=nvidia, engine=podman ++ sudo dnf install -y podman podman-compose +[install-container-deps] Adding NVIDIA's container-toolkit dnf repo to /etc/yum.repos.d/. ++ write /etc/yum.repos.d/nvidia-container-toolkit.repo (from https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo) ++ sudo dnf install -y nvidia-container-toolkit ++ sudo install -m 0755 -d /etc/cdi ++ sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml +[install-container-deps] Generated CDI spec at /etc/cdi/nvidia.yaml. +[install-container-deps] (WARNings about libnvidia-vulkan-producer / X11 configs / +[install-container-deps] fabric-manager / MPS / IMEX from nvidia-ctk are expected on +[install-container-deps] non-server hosts — those are optional features the spec +[install-container-deps] gracefully omits when not present.) + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=podman gpu=amd === +[install-container-deps] distro=fedora, gpu=amd, engine=podman ++ sudo dnf install -y podman podman-compose rocminfo ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=intel === +[install-container-deps] distro=fedora, gpu=intel, engine=podman ++ sudo dnf install -y podman podman-compose ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=cpu === +[install-container-deps] distro=fedora, gpu=cpu, engine=podman ++ sudo dnf install -y podman podman-compose + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu cpu + +=== engine=docker gpu=nvidia === +[install-container-deps] distro=fedora, gpu=nvidia, engine=docker ++ sudo dnf install -y docker-compose-plugin docker-buildx-plugin +[install-container-deps] Adding NVIDIA's container-toolkit dnf repo to /etc/yum.repos.d/. 
++ write /etc/yum.repos.d/nvidia-container-toolkit.repo (from https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo) ++ sudo dnf install -y nvidia-container-toolkit ++ sudo systemctl enable --now docker.service ++ sudo nvidia-ctk runtime configure --runtime=docker ++ sudo systemctl restart docker +[install-container-deps] Configured docker NVIDIA runtime + restarted dockerd. + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=docker gpu=amd === +[install-container-deps] distro=fedora, gpu=amd, engine=docker ++ sudo dnf install -y docker-compose-plugin docker-buildx-plugin rocminfo ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=intel === +[install-container-deps] distro=fedora, gpu=intel, engine=docker ++ sudo dnf install -y docker-compose-plugin docker-buildx-plugin ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=cpu === +[install-container-deps] distro=fedora, gpu=cpu, engine=docker ++ sudo dnf install -y docker-compose-plugin docker-buildx-plugin ++ sudo systemctl enable --now docker.service + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu cpu + diff --git a/scripts/test/install-container-deps/run.sh b/scripts/test/install-container-deps/run.sh new file mode 100755 index 0000000..eee753a --- /dev/null +++ b/scripts/test/install-container-deps/run.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# +# run.sh — verify install-container-deps.sh's --dry-run output matches +# checked-in fixtures across (distro × engine × gpu) combinations. +# +# Each distro's full (engine × gpu) matrix runs inside a single +# arch/ubuntu/fedora container, so the cost is three image pulls + three +# container startups regardless of how many tuples the matrix expands to. +# +# Usage: +# scripts/test/install-container-deps/run.sh # diff mode (CI default) +# scripts/test/install-container-deps/run.sh --update # regenerate fixtures +# +# Honours $XCHPLOT2_CONTAINER_RUNTIME (podman|docker); auto-detects +# otherwise, preferring podman. + +set -euo pipefail + +# Derive ROOT from this script's own path so the harness works no +# matter what CWD it runs from. The previous `git rev-parse` form +# resolved against the *outer* CWD, so running this script from +# another repo's directory wrote fixtures into the wrong tree. 
+ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd) +FIXTURE_DIR="$ROOT/scripts/test/install-container-deps" + +UPDATE=0 +[[ "${1:-}" == --update ]] && UPDATE=1 + +if [[ -n "${XCHPLOT2_CONTAINER_RUNTIME:-}" ]]; then + RUNTIME="$XCHPLOT2_CONTAINER_RUNTIME" +elif command -v podman >/dev/null; then + RUNTIME=podman +elif command -v docker >/dev/null; then + RUNTIME=docker +else + echo "run.sh: neither podman nor docker on PATH" >&2 + exit 1 +fi + +declare -A IMAGES=( + [arch]=docker.io/archlinux:latest + [ubuntu]=docker.io/ubuntu:24.04 + [fedora]=docker.io/fedora:40 +) + +# `XCHPLOT2_DRY_DISTRO_FILTER=arch` runs only one distro — handy when +# regenerating a single fixture without re-pulling all three images. +FILTER="${XCHPLOT2_DRY_DISTRO_FILTER:-}" + +failed=0 +for distro in arch ubuntu fedora; do + [[ -z "$FILTER" || "$FILTER" == "$distro" ]] || continue + + img="${IMAGES[$distro]}" + fixture="$FIXTURE_DIR/$distro.txt" + tmp=$(mktemp) + # shellcheck disable=SC2064 # intentional early expansion + trap "rm -f '$tmp'" EXIT + + # All (engine × gpu) combos for this distro run in one container. + # Each combo gets a `=== engine=X gpu=Y ===` header so the fixture + # diffs cleanly when one tuple drifts. + # shellcheck disable=SC2016 # $engine/$gpu intentionally evaluated inside the container shell + "$RUNTIME" run --rm -v "$ROOT/scripts:/s:ro" "$img" bash -c ' + for engine in podman docker; do + for gpu in nvidia amd intel cpu; do + printf "=== engine=%s gpu=%s ===\n" "$engine" "$gpu" + /s/install-container-deps.sh --dry-run \ + --engine "$engine" --gpu "$gpu" 2>&1 \ + || printf "[exit=%d]\n" $? + printf "\n" + done + done + ' > "$tmp" + + if (( UPDATE )); then + cp "$tmp" "$fixture" + echo "updated: $fixture" + elif ! diff -u "$fixture" "$tmp"; then + echo "::error::fixture mismatch for distro=$distro" + failed=1 + else + echo "ok: $distro" + fi +done + +exit $failed diff --git a/scripts/test/install-container-deps/ubuntu.txt b/scripts/test/install-container-deps/ubuntu.txt new file mode 100644 index 0000000..c4666a4 --- /dev/null +++ b/scripts/test/install-container-deps/ubuntu.txt @@ -0,0 +1,136 @@ +=== engine=podman gpu=nvidia === +[install-container-deps] distro=ubuntu, gpu=nvidia, engine=podman ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends podman podman-compose nvidia-utils- +[install-container-deps] Adding NVIDIA's container-toolkit apt repo to /etc/apt/sources.list.d/. ++ sudo install -m 0755 -d /usr/share/keyrings ++ write /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg (gpg --dearmor from https://nvidia.github.io/libnvidia-container/gpgkey) ++ write /etc/apt/sources.list.d/nvidia-container-toolkit.list (from https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list) ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends nvidia-container-toolkit ++ sudo install -m 0755 -d /etc/cdi ++ sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml +[install-container-deps] Generated CDI spec at /etc/cdi/nvidia.yaml. +[install-container-deps] (WARNings about libnvidia-vulkan-producer / X11 configs / +[install-container-deps] fabric-manager / MPS / IMEX from nvidia-ctk are expected on +[install-container-deps] non-server hosts — those are optional features the spec +[install-container-deps] gracefully omits when not present.) + +[install-container-deps] Done. 
+ Build the image: + ./scripts/build-container.sh --engine podman --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=podman gpu=amd === +[install-container-deps] distro=ubuntu, gpu=amd, engine=podman ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends podman podman-compose rocminfo ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=intel === +[install-container-deps] distro=ubuntu, gpu=intel, engine=podman ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends podman podman-compose ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=cpu === +[install-container-deps] distro=ubuntu, gpu=cpu, engine=podman ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends podman podman-compose + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu cpu + +=== engine=docker gpu=nvidia === +[install-container-deps] distro=ubuntu, gpu=nvidia, engine=docker ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends docker.io docker-buildx nvidia-utils- ++ sudo apt-get install -y --no-install-recommends docker-compose-v2 +[install-container-deps] Adding NVIDIA's container-toolkit apt repo to /etc/apt/sources.list.d/. ++ sudo install -m 0755 -d /usr/share/keyrings ++ write /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg (gpg --dearmor from https://nvidia.github.io/libnvidia-container/gpgkey) ++ write /etc/apt/sources.list.d/nvidia-container-toolkit.list (from https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list) ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends nvidia-container-toolkit ++ sudo systemctl enable --now docker.service ++ sudo nvidia-ctk runtime configure --runtime=docker ++ sudo systemctl restart docker +[install-container-deps] Configured docker NVIDIA runtime + restarted dockerd. + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. 
+ +=== engine=docker gpu=amd === +[install-container-deps] distro=ubuntu, gpu=amd, engine=docker ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends docker.io docker-buildx rocminfo ++ sudo apt-get install -y --no-install-recommends docker-compose-v2 ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=intel === +[install-container-deps] distro=ubuntu, gpu=intel, engine=docker ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends docker.io docker-buildx ++ sudo apt-get install -y --no-install-recommends docker-compose-v2 ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=cpu === +[install-container-deps] distro=ubuntu, gpu=cpu, engine=docker ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends docker.io docker-buildx ++ sudo apt-get install -y --no-install-recommends docker-compose-v2 ++ sudo systemctl enable --now docker.service + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu cpu + diff --git a/src/gpu/AesGpu.cu b/src/gpu/AesGpu.cu index 88625a9..37297c8 100644 --- a/src/gpu/AesGpu.cu +++ b/src/gpu/AesGpu.cu @@ -1,8 +1,9 @@ -// AesGpu.cu — T-table initialisation. Tables are computed on the host -// (small, deterministic) and copied to constant memory. +// AesGpu.cu — T-table initialisation. Tables are computed at compile +// time in AesTables.inl (shared with the SYCL backend) and copied here +// into __constant__ memory for the CUDA path. #include "gpu/AesGpu.cuh" -#include +#include "gpu/AesTables.inl" namespace pos2gpu { @@ -11,70 +12,12 @@ __device__ __constant__ uint32_t kAesT1[256]; __device__ __constant__ uint32_t kAesT2[256]; __device__ __constant__ uint32_t kAesT3[256]; -namespace { - -// Rijndael S-box. 
-constexpr uint8_t kSBox[256] = { - 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76, - 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0, - 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15, - 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75, - 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84, - 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf, - 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8, - 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2, - 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73, - 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb, - 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79, - 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08, - 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a, - 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e, - 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf, - 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 -}; - -// xtime() — multiplication by x (i.e. 0x02) in GF(2^8) with the AES polynomial. -constexpr uint8_t xtime(uint8_t x) { - return static_cast((x << 1) ^ ((x & 0x80) ? 0x1B : 0)); -} - -// MixColumns row [02 03 01 01]. T0[a] = (2·S[a], 1·S[a], 1·S[a], 3·S[a]) -// little-endian bytes are: byte0=2S, byte1=S, byte2=S, byte3=3S. -constexpr uint32_t te_word(uint8_t a, int rotate) -{ - uint8_t s = kSBox[a]; - uint8_t s2 = xtime(s); - uint8_t s3 = static_cast(s2 ^ s); - uint8_t b[4] = { s2, s, s, s3 }; - uint32_t v = 0; - for (int i = 0; i < 4; ++i) { - v |= uint32_t(b[(i + rotate) & 3]) << (8 * i); - } - return v; -} - -constexpr std::array build_table(int rotate) -{ - std::array t{}; - for (int i = 0; i < 256; ++i) { - t[i] = te_word(static_cast(i), rotate); - } - return t; -} - -constexpr auto T0 = build_table(0); -constexpr auto T1 = build_table(3); -constexpr auto T2 = build_table(2); -constexpr auto T3 = build_table(1); - -} // namespace - void initialize_aes_tables() { - cudaMemcpyToSymbol(kAesT0, T0.data(), sizeof(uint32_t) * 256); - cudaMemcpyToSymbol(kAesT1, T1.data(), sizeof(uint32_t) * 256); - cudaMemcpyToSymbol(kAesT2, T2.data(), sizeof(uint32_t) * 256); - cudaMemcpyToSymbol(kAesT3, T3.data(), sizeof(uint32_t) * 256); + cudaMemcpyToSymbol(kAesT0, aes_tables::T0.data(), sizeof(uint32_t) * 256); + cudaMemcpyToSymbol(kAesT1, aes_tables::T1.data(), sizeof(uint32_t) * 256); + cudaMemcpyToSymbol(kAesT2, aes_tables::T2.data(), sizeof(uint32_t) * 256); + cudaMemcpyToSymbol(kAesT3, aes_tables::T3.data(), sizeof(uint32_t) * 256); } } // namespace pos2gpu diff --git a/src/gpu/AesGpu.cuh b/src/gpu/AesGpu.cuh index 46a566f..42cf2d7 100644 --- a/src/gpu/AesGpu.cuh +++ b/src/gpu/AesGpu.cuh @@ -20,26 +20,44 @@ // // Cross-check against pos2-chip/src/pos/aes/intrin_portable.h which // defines `rx_aesenc_vec_i128 _mm_aesenc_si128`. +// +// Backend portability: +// +// The SYCL path (compiled by acpp/clang in non-CUDA mode) cannot see +// __constant__ memory, threadIdx, or __device__ markup. 
The pieces it +// needs — aesenc_round_smem, set_int_vec_i128, load_state_le, and the +// AesState struct itself — are decorated with the portable macros from +// PortableAttrs.hpp and stay outside the __CUDACC__ gate. The constant- +// memory T-tables, the aesenc_round variant that reads them, and +// load_aes_tables_smem (uses threadIdx) are CUDA-only. #pragma once -#include +#include "gpu/PortableAttrs.hpp" + #include +#if defined(__CUDACC__) + #include +#endif + namespace pos2gpu { -// AES S-box (Rijndael forward S-box). +#if defined(__CUDACC__) +// AES T-tables in constant memory. Defined in AesGpu.cu, populated by +// initialize_aes_tables() at startup. __device__ __constant__ extern uint32_t kAesT0[256]; __device__ __constant__ extern uint32_t kAesT1[256]; __device__ __constant__ extern uint32_t kAesT2[256]; __device__ __constant__ extern uint32_t kAesT3[256]; +#endif struct AesState { uint32_t w[4]; }; // Load 16 bytes (little-endian) into an AesState. -__host__ __device__ inline AesState load_state_le(uint8_t const* bytes) +POS2_HOST_DEVICE_INLINE AesState load_state_le(uint8_t const* bytes) { AesState s; #pragma unroll @@ -52,12 +70,11 @@ __host__ __device__ inline AesState load_state_le(uint8_t const* bytes) return s; } -// One AES round equivalent to _mm_aesenc_si128(state, key). -// Implemented with T-tables. ShiftRows is folded into the byte-extraction -// indices, then SubBytes+MixColumns is the table lookup. -// -// AESENC operates per-column. For column c (0..3), the output column is: -// T0[s[c, 0]] ^ T1[s[(c+1) mod 4, 1]] ^ T2[s[(c+2) mod 4, 2]] ^ T3[s[(c+3) mod 4, 3]] ^ key[c] +#if defined(__CUDACC__) +// One AES round equivalent to _mm_aesenc_si128(state, key), reading the +// T-tables from constant memory. CUDA-only because __constant__ has no +// SYCL equivalent — the SYCL path uses aesenc_round_smem with tables +// preloaded into local memory. __device__ __forceinline__ AesState aesenc_round(AesState s, AesState const& key) { auto byte = [](uint32_t w, int n) -> uint32_t { @@ -75,10 +92,11 @@ __device__ __forceinline__ AesState aesenc_round(AesState s, AesState const& key } return out; } +#endif // Convenience: load an i128 from four little-endian 32-bit ints, matching // rx_set_int_vec_i128(i3, i2, i1, i0). -__host__ __device__ inline AesState set_int_vec_i128(int32_t i3, int32_t i2, int32_t i1, int32_t i0) +POS2_HOST_DEVICE_INLINE AesState set_int_vec_i128(int32_t i3, int32_t i2, int32_t i1, int32_t i0) { AesState s; s.w[0] = static_cast(i0); @@ -90,6 +108,7 @@ __host__ __device__ inline AesState set_int_vec_i128(int32_t i3, int32_t i2, int // Initialize the constant-memory T-tables on first use. Must be called once // per program from host code before any kernel that touches AesGpu runs. +// Implemented in AesGpu.cu (CUDA TU only). void initialize_aes_tables(); // ========================================================================= @@ -106,8 +125,14 @@ void initialize_aes_tables(); // __syncthreads(); // AesState state = ...; // state = aesenc_round_smem(state, round_key, sT); +// +// The SYCL path uses the same aesenc_round_smem (pointer-based, fully +// portable) but provides its own loader — local_accessor + nd_item barrier +// in place of __shared__ + __syncthreads — and supplies the table data +// from a USM buffer initialised from AesTables.inl on the host side. 
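+//
+// Illustrative only: a rough sketch of what that SYCL-side setup could
+// look like (aes_tables_device(q) is the USM table buffer mentioned in
+// AesStub.cpp / SyclBackend.hpp; `ndr`, `key` and the surrounding kernel
+// are placeholders, not code from this repo):
+//
+//   uint32_t const* gT = aes_tables_device(q);     // T0..T3, 4*256 words
+//   q.submit([&](sycl::handler& h) {
+//       sycl::local_accessor<uint32_t, 1> sT(4 * 256, h);
+//       h.parallel_for(ndr, [=](sycl::nd_item<1> it) {
+//           for (size_t i = it.get_local_id(0); i < 4 * 256;
+//                i += it.get_local_range(0))
+//               sT[i] = gT[i];
+//           sycl::group_barrier(it.get_group());
+//           AesState s = load_state_le(/* ... */);
+//           s = aesenc_round_smem(s, key, &sT[0]);
+//       });
+//   });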
// ========================================================================= +#if defined(__CUDACC__) __device__ __forceinline__ void load_aes_tables_smem(uint32_t* sT) { // sT layout: [T0|T1|T2|T3], 256 entries each (4096 entries total). @@ -121,8 +146,9 @@ __device__ __forceinline__ void load_aes_tables_smem(uint32_t* sT) sT[3 * 256 + i] = kAesT3[i]; } } +#endif -__device__ __forceinline__ AesState aesenc_round_smem( +POS2_DEVICE_INLINE AesState aesenc_round_smem( AesState s, AesState const& key, uint32_t const* __restrict__ sT) { auto byte = [](uint32_t w, int n) -> uint32_t { diff --git a/src/gpu/AesHashBsSycl.hpp b/src/gpu/AesHashBsSycl.hpp new file mode 100644 index 0000000..e1176ea --- /dev/null +++ b/src/gpu/AesHashBsSycl.hpp @@ -0,0 +1,376 @@ +// AesHashBsSycl.hpp — sub_group-cooperative bit-sliced AES hash for SYCL. +// +// Cross-reference: +// src/gpu/AesGpuBitsliced.cuh (CUDA original, 32-lane warp-coop) +// src/gpu/AesHashGpu.cuh (CUDA T-table API; _smem family) +// src/gpu/AesSBoxBP.cuh (Boyar-Peralta S-box circuit, shared) +// +// Exports sub_group-cooperative equivalents of g_x_smem / pairing_smem / +// matching_target_smem. Each kernel thread holds one state; 32 threads in +// a sub_group cooperate on 32 parallel AES computations, using only bit +// ops + sub_group shuffles — no T-table LDS lookups, which is what makes +// the bitsliced path win on amdgcn under AdaptiveCpp's HIP backend. +// +// Preconditions for callers: +// - Kernel MUST be launched with reqd_sub_group_size(32) (wave32 on +// RDNA2, warp32 on NVIDIA; both native). The shuffle/ballot math is +// hard-coded for 32 lanes. +// - ALL 32 lanes of the sub_group must participate in every call. +// Lanes with no real work should pass dummy inputs, do the call, +// then return afterwards. + +#pragma once + +#include "gpu/AesGpu.cuh" +#include "gpu/AesHashGpu.cuh" +#include "gpu/AesSBoxBP.cuh" + +#include + +#include + +namespace pos2gpu { + +// ---------- low-level sub_group primitives ---------- + +inline uint32_t bs_shfl(sycl::sub_group const& sg, uint32_t x, int lane) +{ + return sycl::select_from_group(sg, x, lane); +} + +// Ballot: 32 lanes each contribute one bit, collected into a single +// uint32 mask (bit l of the result == lane l's predicate). +// +// Fast path on AdaptiveCpp's HIP target: __builtin_amdgcn_ballot_w32 +// lowers to a single v_cmp + s_mov on RDNA2/3 — one native amdgcn +// instruction instead of the log-n reduction the portable fallback +// compiles to. This is the critical piece for bitsliced AES to win +// on amdgcn: bs32_pack calls ballot 128× per hash, so a 5× speedup +// per call is the difference between a +23 % regression (the first +// attempt with reduce_over_group) and a net win. +// +// Dispatch MUST go through AdaptiveCpp's __acpp_if_target_hip(stmts) +// macro, not a raw `#if defined(__HIP_DEVICE_COMPILE__)`. AdaptiveCpp +// compiles each kernel body for every backend target it's configured +// for (including the OMP host-CPU fallback), so on the OMP pass the +// preprocessor branch is chosen per-TU but the kernel body is also +// evaluated as a __host__ function — clang then rejects the +// __device__-only `__builtin_amdgcn_ballot_w32` with "reference to +// __device__ function in __host__ function" even though the #if +// would have eliminated it on the non-HIP backend. __acpp_if_target_hip +// expands to `stmts` during the HIP device code-gen pass only, and +// to nothing on all other passes — so the intrinsic truly never +// appears in a __host__ context. 
+// +// Wave-size caveat: we hard-code _w32 because gfx1031 (RDNA2) is +// wave32 and the entire bitsliced scheme is wave32-only (reqd_sub_ +// group_size(32) on the kernels, 32-way pack/unpack layout). Using +// _w64 on a wave32 target miscompiles — LLVM issue #62477. +// +// Recipe source: AdaptiveCpp doc/hip-source-interop.md. +inline uint32_t bs_ballot(sycl::sub_group const& sg, bool pred) +{ + __acpp_if_target_hip( + return static_cast(__builtin_amdgcn_ballot_w32(pred)); + ); + // Portable fallback — reachable on every non-HIP target (OMP host, + // CUDA, Intel Level Zero, SSCP). The HIP device pass early-returns + // above so this branch is dead on amdgcn. + uint32_t lane = sg.get_local_linear_id(); + uint32_t bit = pred ? (1u << lane) : 0u; + return sycl::reduce_over_group(sg, bit, sycl::bit_or{}); +} + +// ---------- 32-way pack / unpack ---------- +// +// Bit-plane layout matches AesGpuBitsliced.cuh: +// plane p (0..127) has bit l = bit p of lane l's scalar state. +// thread t owns planes { 4t, 4t+1, 4t+2, 4t+3 }. + +inline void bs32_pack(sycl::sub_group const& sg, + AesState const& my, uint32_t out[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + for (int p = 0; p < 128; ++p) { + int byte_idx = p >> 3; + int bit_in_byte = p & 7; + int word_idx = byte_idx >> 2; + int byte_in_w = byte_idx & 3; + uint32_t bit = (my.w[word_idx] >> (8 * byte_in_w + bit_in_byte)) & 1u; + uint32_t plane = bs_ballot(sg, bit != 0u); + if (lane == uint32_t(p >> 2)) { + out[p & 3] = plane; + } + } +} + +inline void bs32_unpack(sycl::sub_group const& sg, + uint32_t const in[4], AesState& my) +{ + uint32_t lane = sg.get_local_linear_id(); + my.w[0] = my.w[1] = my.w[2] = my.w[3] = 0u; + for (int p = 0; p < 128; ++p) { + int owner = p >> 2; + int slot = p & 3; + uint32_t plane = bs_shfl(sg, in[slot], owner); + uint32_t bit = (plane >> lane) & 1u; + int byte_idx = p >> 3; + int bit_in_byte = p & 7; + int word_idx = byte_idx >> 2; + int byte_in_w = byte_idx & 3; + my.w[word_idx] |= bit << (8 * byte_in_w + bit_in_byte); + } +} + +// ---------- round key materialisation ---------- +// +// All 32 states share the same key, so each bit-plane of a bit-sliced +// key is either all-ones or all-zeros. No cross-lane communication. + +inline void make_bs32_round_key(sycl::sub_group const& sg, + AesState const& key, uint32_t key_bs[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + #pragma unroll + for (int i = 0; i < 4; ++i) { + int p = 4 * int(lane) + i; + int byte_idx = p >> 3; + int bit_in_byte = p & 7; + int word_idx = byte_idx >> 2; + int byte_in_w = byte_idx & 3; + uint32_t bit = (key.w[word_idx] >> (8 * byte_in_w + bit_in_byte)) & 1u; + key_bs[i] = bit ? 0xFFFFFFFFu : 0u; + } +} + +inline void add_round_key_bs32(uint32_t bs[4], uint32_t const key_bs[4]) +{ + bs[0] ^= key_bs[0]; bs[1] ^= key_bs[1]; + bs[2] ^= key_bs[2]; bs[3] ^= key_bs[3]; +} + +// ---------- ShiftRows ---------- +// +// Each lane fetches its own output byte from a single source lane. The +// permutation preserves bit-within-byte index, so one shuffle per plane. 
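+//
+// Worked example (just tracing the index math below, no extra logic):
+// lane 6 has is_hi=0 and b=3, so it holds the low-nibble planes of
+// state byte 3 (column 0, row 3). ShiftRows moves row 3 left by 3, so
+// that byte comes from column (0+3)&3 = 3, i.e. b_old = 3*4+3 = 15,
+// whose low-nibble planes live on lane owner = 2*15+0 = 30. Lane 6
+// therefore shuffles each of its four planes from lane 30.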
+ +inline void shift_rows_bs32(sycl::sub_group const& sg, uint32_t bs[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + int is_hi = int(lane) & 1; + int b = int(lane) >> 1; + int c = b >> 2; + int r = b & 3; + int b_old = ((c + r) & 3) * 4 + r; + int owner = 2 * b_old + is_hi; + uint32_t n0 = bs_shfl(sg, bs[0], owner); + uint32_t n1 = bs_shfl(sg, bs[1], owner); + uint32_t n2 = bs_shfl(sg, bs[2], owner); + uint32_t n3 = bs_shfl(sg, bs[3], owner); + bs[0] = n0; bs[1] = n1; bs[2] = n2; bs[3] = n3; +} + +// ---------- MixColumns ---------- +// +// See AesGpuBitsliced.cuh for the algebraic derivation. 14 shuffles per +// lane (12 same-half column mates + 2 cross-half boundary bits). + +inline void mix_columns_bs32(sycl::sub_group const& sg, uint32_t bs[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + int is_hi = int(lane) & 1; + int b = int(lane) >> 1; + int c = b >> 2; + int r = b & 3; + int partner = int(lane) ^ 1; + int col_base = 8 * c; + int r1 = (r + 1) & 3; + int r2 = (r + 2) & 3; + int r3 = (r + 3) & 3; + int L1 = col_base + 2 * r1 + is_hi; + int L2 = col_base + 2 * r2 + is_hi; + int L3 = col_base + 2 * r3 + is_hi; + int L1_other = col_base + 2 * r1 + (is_hi ^ 1); + + uint32_t r1_0 = bs_shfl(sg, bs[0], L1); + uint32_t r1_1 = bs_shfl(sg, bs[1], L1); + uint32_t r1_2 = bs_shfl(sg, bs[2], L1); + uint32_t r1_3 = bs_shfl(sg, bs[3], L1); + uint32_t r2_0 = bs_shfl(sg, bs[0], L2); + uint32_t r2_1 = bs_shfl(sg, bs[1], L2); + uint32_t r2_2 = bs_shfl(sg, bs[2], L2); + uint32_t r2_3 = bs_shfl(sg, bs[3], L2); + uint32_t r3_0 = bs_shfl(sg, bs[0], L3); + uint32_t r3_1 = bs_shfl(sg, bs[1], L3); + uint32_t r3_2 = bs_shfl(sg, bs[2], L3); + uint32_t r3_3 = bs_shfl(sg, bs[3], L3); + + uint32_t t_0 = bs[0] ^ r1_0; + uint32_t t_1 = bs[1] ^ r1_1; + uint32_t t_2 = bs[2] ^ r1_2; + uint32_t t_3 = bs[3] ^ r1_3; + + uint32_t t_boundary = bs_shfl(sg, bs[3], partner) + ^ bs_shfl(sg, bs[3], L1_other); + + uint32_t xt_0, xt_1, xt_2, xt_3; + if (is_hi) { + xt_0 = t_boundary ^ t_3; + xt_1 = t_0; + xt_2 = t_1; + xt_3 = t_2; + } else { + xt_0 = t_boundary; + xt_1 = t_0 ^ t_boundary; + xt_2 = t_1; + xt_3 = t_2 ^ t_boundary; + } + + bs[0] = xt_0 ^ r1_0 ^ r2_0 ^ r3_0; + bs[1] = xt_1 ^ r1_1 ^ r2_1 ^ r3_1; + bs[2] = xt_2 ^ r1_2 ^ r2_2 ^ r3_2; + bs[3] = xt_3 ^ r1_3 ^ r2_3 ^ r3_3; +} + +// ---------- SubBytes via Boyar-Peralta bitsliced S-box ---------- +// +// Threads 2b and 2b+1 cooperate on byte b: they swap their four planes +// once, run the 113-gate BP circuit redundantly, then keep the four +// outputs for their own half of the byte. 
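+//
+// Concretely (reading the swap below): for byte b, the odd lane's
+// bs[3..0] are bit-planes 7..4 of the byte and map to U0..U3, while the
+// even lane's bs[3..0] are bit-planes 3..0 and map to U4..U7. Both lanes
+// evaluate the identical circuit; the odd lane keeps S0..S3, the even
+// lane keeps S4..S7. One redundant circuit evaluation per byte is the
+// price for avoiding a second shuffle pass to redistribute the outputs.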
+ +inline void sub_bytes_bs32(sycl::sub_group const& sg, uint32_t bs[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + int is_hi = int(lane) & 1; + int partner = int(lane) ^ 1; + + uint32_t peer0 = bs_shfl(sg, bs[0], partner); + uint32_t peer1 = bs_shfl(sg, bs[1], partner); + uint32_t peer2 = bs_shfl(sg, bs[2], partner); + uint32_t peer3 = bs_shfl(sg, bs[3], partner); + + uint32_t U0, U1, U2, U3, U4, U5, U6, U7; + if (is_hi) { + U0 = bs[3]; U1 = bs[2]; U2 = bs[1]; U3 = bs[0]; + U4 = peer3; U5 = peer2; U6 = peer1; U7 = peer0; + } else { + U0 = peer3; U1 = peer2; U2 = peer1; U3 = peer0; + U4 = bs[3]; U5 = bs[2]; U6 = bs[1]; U7 = bs[0]; + } + + uint32_t S0, S1, S2, S3, S4, S5, S6, S7; + bp_sbox_circuit(U0, U1, U2, U3, U4, U5, U6, U7, + S0, S1, S2, S3, S4, S5, S6, S7, + 0xFFFFFFFFu); + + if (is_hi) { + bs[3] = S0; bs[2] = S1; bs[1] = S2; bs[0] = S3; + } else { + bs[3] = S4; bs[2] = S5; bs[1] = S6; bs[0] = S7; + } +} + +// ---------- full round + round loop ---------- + +inline void aesenc_round_bs32(sycl::sub_group const& sg, + uint32_t bs[4], uint32_t const key_bs[4]) +{ + shift_rows_bs32(sg, bs); + sub_bytes_bs32(sg, bs); + mix_columns_bs32(sg, bs); + add_round_key_bs32(bs, key_bs); +} + +inline void run_rounds_bs32(sycl::sub_group const& sg, + uint32_t bs[4], + uint32_t const k1_bs[4], + uint32_t const k2_bs[4], + int rounds) +{ + #pragma unroll 2 + for (int r = 0; r < rounds; ++r) { + aesenc_round_bs32(sg, bs, k1_bs); + aesenc_round_bs32(sg, bs, k2_bs); + } +} + +// ---------- high-level wrappers matching AesHashGpu.cuh ---------- +// +// Each wrapper must be called uniformly across the sub_group. The return +// value is per-lane (this lane's result); callers collect per-lane values +// into their own output buffers as usual. + +// g_x_bs32 — bitsliced equivalent of g_x_smem(keys, x, k). Each lane +// contributes its own `x`, returns bottom k bits of state.w[0] for this +// lane's x. +inline uint32_t g_x_bs32(sycl::sub_group const& sg, + AesHashKeys const& keys, uint32_t x, int k, + int rounds = kAesGRounds) +{ + AesState in = set_int_vec_i128(0, 0, 0, static_cast(x)); + uint32_t bs[4], k1_bs[4], k2_bs[4]; + bs32_pack(sg, in, bs); + make_bs32_round_key(sg, keys.round_key_1, k1_bs); + make_bs32_round_key(sg, keys.round_key_2, k2_bs); + run_rounds_bs32(sg, bs, k1_bs, k2_bs, rounds); + AesState out; + bs32_unpack(sg, bs, out); + return out.w[0] & ((1u << k) - 1u); +} + +// matching_target_bs32 — bitsliced equivalent of matching_target_smem. +// (table_id, match_key) are typically sub_group-uniform in the match +// kernels; only `meta` varies per lane. That's fine — bitslicing doesn't +// require per-lane inputs to differ. +inline uint32_t matching_target_bs32(sycl::sub_group const& sg, + AesHashKeys const& keys, + uint32_t table_id, uint32_t match_key, + uint64_t meta, + int extra_rounds_bits = 0) +{ + int32_t i0 = static_cast(table_id); + int32_t i1 = static_cast(match_key); + int32_t i2 = static_cast(meta & 0xFFFFFFFFu); + int32_t i3 = static_cast((meta >> 32) & 0xFFFFFFFFu); + AesState in = set_int_vec_i128(i3, i2, i1, i0); + uint32_t bs[4], k1_bs[4], k2_bs[4]; + bs32_pack(sg, in, bs); + make_bs32_round_key(sg, keys.round_key_1, k1_bs); + make_bs32_round_key(sg, keys.round_key_2, k2_bs); + int rounds = kAesMatchingTargetRounds << extra_rounds_bits; + run_rounds_bs32(sg, bs, k1_bs, k2_bs, rounds); + AesState out; + bs32_unpack(sg, bs, out); + return out.w[0]; +} + +// pairing_bs32 — bitsliced equivalent of pairing_smem. 
Kept for +// completeness / future use; the current match kernels keep the inner +// loop on T-table pairing because the inner trip count is data-dependent +// (per-lane window size varies), which is awkward to bit-slice without +// a batch-collect prepass. +inline Result128 pairing_bs32(sycl::sub_group const& sg, + AesHashKeys const& keys, + uint64_t meta_l, uint64_t meta_r, + int extra_rounds_bits = 0) +{ + int32_t i0 = static_cast(meta_l & 0xFFFFFFFFu); + int32_t i1 = static_cast((meta_l >> 32) & 0xFFFFFFFFu); + int32_t i2 = static_cast(meta_r & 0xFFFFFFFFu); + int32_t i3 = static_cast((meta_r >> 32) & 0xFFFFFFFFu); + AesState in = set_int_vec_i128(i3, i2, i1, i0); + uint32_t bs[4], k1_bs[4], k2_bs[4]; + bs32_pack(sg, in, bs); + make_bs32_round_key(sg, keys.round_key_1, k1_bs); + make_bs32_round_key(sg, keys.round_key_2, k2_bs); + int rounds = kAesPairingRounds << extra_rounds_bits; + run_rounds_bs32(sg, bs, k1_bs, k2_bs, rounds); + AesState out; + bs32_unpack(sg, bs, out); + Result128 r{}; + r.r[0] = out.w[0]; r.r[1] = out.w[1]; + r.r[2] = out.w[2]; r.r[3] = out.w[3]; + return r; +} + +} // namespace pos2gpu diff --git a/src/gpu/AesHashGpu.cuh b/src/gpu/AesHashGpu.cuh index 29aa895..36453ff 100644 --- a/src/gpu/AesHashGpu.cuh +++ b/src/gpu/AesHashGpu.cuh @@ -8,10 +8,21 @@ // The CPU code uses 16 alternating rounds (round_key_1, round_key_2). We // keep the same round count constants here so a single binary can be a // drop-in for the CPU code. +// +// Backend portability: +// +// The `_smem` family (run_rounds_smem, g_x_smem, pairing_smem, +// matching_target_smem, chain_smem) is fully pointer-driven (table +// pointer passed as an argument) and decorated with portable macros, so +// it compiles under both nvcc and acpp/clang. The non-smem family reads +// the constant-memory T-tables directly via aesenc_round and is +// therefore CUDA-only. #pragma once #include "gpu/AesGpu.cuh" +#include "gpu/PortableAttrs.hpp" + #include namespace pos2gpu { @@ -28,7 +39,7 @@ struct AesHashKeys { // Build the two round keys from a 32-byte plot_id, matching // load_plot_id_as_aes_key in AesHash.hpp. -__host__ __device__ inline AesHashKeys make_keys(uint8_t const* plot_id_bytes) +POS2_HOST_DEVICE inline AesHashKeys make_keys(uint8_t const* plot_id_bytes) { AesHashKeys k; k.round_key_1 = load_state_le(plot_id_bytes + 0); @@ -36,8 +47,10 @@ __host__ __device__ inline AesHashKeys make_keys(uint8_t const* plot_id_bytes) return k; } +#if defined(__CUDACC__) // One full alternating round-pair. The CPU loop is: // for r in 0..Rounds: state = aesenc(state, k1); state = aesenc(state, k2); +// CUDA-only: calls aesenc_round which reads constant-memory T-tables. __device__ __forceinline__ AesState run_rounds(AesState state, AesHashKeys const& keys, int rounds) { #pragma unroll 2 @@ -56,12 +69,14 @@ __device__ __forceinline__ uint32_t g_x(AesHashKeys const& keys, uint32_t x, int s = run_rounds(s, keys, rounds); return s.w[0] & ((1u << k) - 1u); } +#endif // pairing: load (meta_l_lo, meta_l_hi, meta_r_lo, meta_r_hi) into i0..i3, // run AES_PAIRING_ROUNDS << extra_rounds_bits, return all 4 u32s. // Mirrors AesHash::pairing. 
struct Result128 { uint32_t r[4]; }; +#if defined(__CUDACC__) __device__ __forceinline__ Result128 pairing( AesHashKeys const& keys, uint64_t meta_l, uint64_t meta_r, @@ -110,14 +125,17 @@ __device__ __forceinline__ uint64_t chain(AesHashKeys const& keys, uint64_t inpu s = run_rounds(s, keys, kAesChainingRounds); return uint64_t(s.w[0]) | (uint64_t(s.w[1]) << 32); } +#endif // __CUDACC__ // ========================================================================= // Shared-memory T-table variants. Use after load_aes_tables_smem(sT) + -// __syncthreads(). All four functions mirror their constant-memory peers -// above; only the inner aesenc_round call changes. +// __syncthreads() in CUDA, or after a SYCL local_accessor + barrier in +// SYCL. All five functions mirror their constant-memory peers above; +// only the inner aesenc_round_smem call (and the table pointer arg) +// differ. Fully portable — compile under both backends. // ========================================================================= -__device__ __forceinline__ AesState run_rounds_smem( +POS2_DEVICE_INLINE AesState run_rounds_smem( AesState state, AesHashKeys const& keys, int rounds, uint32_t const* __restrict__ sT) { #pragma unroll 2 @@ -128,7 +146,7 @@ __device__ __forceinline__ AesState run_rounds_smem( return state; } -__device__ __forceinline__ uint32_t g_x_smem( +POS2_DEVICE_INLINE uint32_t g_x_smem( AesHashKeys const& keys, uint32_t x, int k, uint32_t const* __restrict__ sT, int rounds = kAesGRounds) { @@ -137,7 +155,7 @@ __device__ __forceinline__ uint32_t g_x_smem( return s.w[0] & ((1u << k) - 1u); } -__device__ __forceinline__ Result128 pairing_smem( +POS2_DEVICE_INLINE Result128 pairing_smem( AesHashKeys const& keys, uint64_t meta_l, uint64_t meta_r, uint32_t const* __restrict__ sT, @@ -156,7 +174,7 @@ __device__ __forceinline__ Result128 pairing_smem( return out; } -__device__ __forceinline__ uint32_t matching_target_smem( +POS2_DEVICE_INLINE uint32_t matching_target_smem( AesHashKeys const& keys, uint32_t table_id, uint32_t match_key, uint64_t meta, uint32_t const* __restrict__ sT, @@ -172,7 +190,7 @@ __device__ __forceinline__ uint32_t matching_target_smem( return s.w[0]; } -__device__ __forceinline__ uint64_t chain_smem( +POS2_DEVICE_INLINE uint64_t chain_smem( AesHashKeys const& keys, uint64_t input, uint32_t const* __restrict__ sT) { diff --git a/src/gpu/AesSBoxBP.cuh b/src/gpu/AesSBoxBP.cuh index 6b8b57e..3a56a0c 100644 --- a/src/gpu/AesSBoxBP.cuh +++ b/src/gpu/AesSBoxBP.cuh @@ -20,12 +20,21 @@ #pragma once +#include "gpu/PortableAttrs.hpp" + #include namespace pos2gpu { +// Portable markup: POS2_HOST_DEVICE_INLINE expands to +// __host__ __device__ __forceinline__ under nvcc (CUDA TU) and to +// inline __attribute__((always_inline)) under acpp/clang (SYCL TU). +// Raw __host__ / __device__ tokens would fail to parse under +// AdaptiveCpp's SYCL-to-HIP compilation path (they're not defined +// outside nvcc/hipcc source-to-source front-ends), which would +// cascade to "no matching function" errors at every call site. 
template -__host__ __device__ __forceinline__ +POS2_HOST_DEVICE_INLINE void bp_sbox_circuit(T U0, T U1, T U2, T U3, T U4, T U5, T U6, T U7, T& S0, T& S1, T& S2, T& S3, T& S4, T& S5, T& S6, T& S7, @@ -154,7 +163,7 @@ void bp_sbox_circuit(T U0, T U1, T U2, T U3, T U4, T U5, T U6, T U7, S5 = tc21 ^ tc17; } -__host__ __device__ __forceinline__ +POS2_HOST_DEVICE_INLINE uint8_t bp_sbox(uint8_t x) { uint8_t U0 = uint8_t((x >> 7) & 1u); diff --git a/src/gpu/AesStub.cpp b/src/gpu/AesStub.cpp new file mode 100644 index 0000000..afe271a --- /dev/null +++ b/src/gpu/AesStub.cpp @@ -0,0 +1,15 @@ +// AesStub.cpp — provides the symbols defined by AesGpu.cu when the build +// excludes the CUDA AOT path (XCHPLOT2_BUILD_CUDA=OFF). The CUDA path +// uploads AES T-tables into __constant__ memory; the SYCL path keeps them +// in a USM device buffer (SyclBackend.hpp's aes_tables_device(q)) which +// is initialised lazily on first kernel call. So this stub simply makes +// initialize_aes_tables a no-op — the SYCL kernels don't depend on it. + +namespace pos2gpu { + +void initialize_aes_tables() { + // No-op on non-CUDA builds. AES T-tables are uploaded by + // SyclBackend.hpp's aes_tables_device(q) on first use. +} + +} // namespace pos2gpu diff --git a/src/gpu/AesTables.inl b/src/gpu/AesTables.inl new file mode 100644 index 0000000..c186470 --- /dev/null +++ b/src/gpu/AesTables.inl @@ -0,0 +1,70 @@ +// AesTables.inl — AES T-table values shared between the CUDA path +// (uploaded into __constant__ memory by initialize_aes_tables in +// AesGpu.cu) and the SYCL path (uploaded once into a USM device +// buffer at first use). +// +// The four tables are constexpr — built at compile time from kSBox + +// xtime via the standard 4-table T-box construction. Sourced from +// AesGpu.cu lines 17-68; behaviour unchanged. + +#pragma once + +#include +#include + +namespace pos2gpu::aes_tables { + +// Rijndael S-box. +constexpr uint8_t kSBox[256] = { + 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76, + 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0, + 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15, + 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75, + 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84, + 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf, + 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8, + 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2, + 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73, + 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb, + 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79, + 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08, + 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a, + 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e, + 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf, + 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +}; + +constexpr uint8_t xtime(uint8_t x) { + return static_cast((x << 1) ^ ((x & 0x80) ? 0x1B : 0)); +} + +// MixColumns row [02 03 01 01]. T0[a] = (2·S[a], 1·S[a], 1·S[a], 3·S[a]) +// little-endian bytes are: byte0=2S, byte1=S, byte2=S, byte3=3S. 
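+// As a quick spot check of the construction (derived from kSBox and
+// xtime above, not from an external reference): for a = 0x00, S[a] =
+// 0x63, 2*S = 0xC6 and 3*S = 0xA5, so with rotate = 0 the packed word
+// is byte0 = 0xC6, byte1 = 0x63, byte2 = 0x63, byte3 = 0xA5, i.e.
+// T0[0x00] should come out as 0xA56363C6.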
+constexpr uint32_t te_word(uint8_t a, int rotate) +{ + uint8_t s = kSBox[a]; + uint8_t s2 = xtime(s); + uint8_t s3 = static_cast(s2 ^ s); + uint8_t b[4] = { s2, s, s, s3 }; + uint32_t v = 0; + for (int i = 0; i < 4; ++i) { + v |= uint32_t(b[(i + rotate) & 3]) << (8 * i); + } + return v; +} + +constexpr std::array build_table(int rotate) +{ + std::array t{}; + for (int i = 0; i < 256; ++i) { + t[i] = te_word(static_cast(i), rotate); + } + return t; +} + +constexpr auto T0 = build_table(0); +constexpr auto T1 = build_table(3); +constexpr auto T2 = build_table(2); +constexpr auto T3 = build_table(1); + +} // namespace pos2gpu::aes_tables diff --git a/src/gpu/CudaHalfShim.hpp b/src/gpu/CudaHalfShim.hpp new file mode 100644 index 0000000..424e2ae --- /dev/null +++ b/src/gpu/CudaHalfShim.hpp @@ -0,0 +1,59 @@ +// CudaHalfShim.hpp — conditionally pulls in the CUDA Toolkit headers +// consumed by AdaptiveCpp-compatible SYCL TUs: +// - cuda_fp16.h (AdaptiveCpp's libkernel/half_representation.hpp +// references __half whenever the CUDA backend is +// in scope) +// - cuda_runtime.h (our .cuh signatures reference cudaEvent_t / +// cudaError_t for signature-only interop) +// +// On NVIDIA builds these headers are on the include path and everything +// "just works". On AMD/ROCm builds they're absent — ROCm's HIP headers +// redefine vector types like uchar1 that CUDA's headers also define, so +// pulling both in blows up with typedef redefinition errors. +// +// Uses __has_include so the CUDA Toolkit is only pulled in when actually +// available. For HIP/Intel backends we provide minimal type stubs — just +// enough for function signatures carrying cudaEvent_t / cudaError_t to +// parse. Those parameters are always nullptr / ignored on non-CUDA paths, +// so the stubs are purely compile-time bookkeeping. +// +// Define XCHPLOT2_SKIP_CUDA_FP16 or XCHPLOT2_SKIP_CUDA_RUNTIME to opt out +// of either include unconditionally (useful when CUDA headers are present +// for an unrelated reason but you want to test the stub path). + +#pragma once + +#include + +#if !defined(XCHPLOT2_SKIP_CUDA_RUNTIME) && __has_include() + #include +#else + // Opaque stubs for signature-only CUDA types. These only appear in + // launch_*_profiled parameter lists where non-CUDA callers pass nullptr. + using cudaEvent_t = void*; + using cudaError_t = int; + #ifndef cudaSuccess + #define cudaSuccess 0 + #endif + #ifndef cudaErrorInvalidValue + #define cudaErrorInvalidValue 1 + #endif +#endif + +// __half / __half2: AdaptiveCpp's libkernel/half_representation can +// reference these by name even when the codegen target is HIP, not CUDA. +// Earlier the SKIP path simply didn't include cuda_fp16.h and provided +// nothing in its place — silent on most hosts, but on at least one +// W5700 / gfx1010 / gfx1013-spoof + ROCm + AdaptiveCpp combination, the +// missing types caused JIT to emit no-op kernel stubs (every kernel +// dispatch completed cleanly with zero device-side writes). Fall back +// to ROCm's when available, then to opaque struct +// stubs as a last resort. 
+#if !defined(XCHPLOT2_SKIP_CUDA_FP16) && __has_include() + #include +#elif __has_include() + #include +#else + struct __half { uint16_t x; }; + struct __half2 { uint16_t x; uint16_t y; }; +#endif diff --git a/src/gpu/DeviceIds.hpp b/src/gpu/DeviceIds.hpp new file mode 100644 index 0000000..27ec6b0 --- /dev/null +++ b/src/gpu/DeviceIds.hpp @@ -0,0 +1,26 @@ +// DeviceIds.hpp — synthetic device-id sentinels shared between the +// CLI / BatchPlotter (host code) and SyclBackend (per-thread queue +// routing). Real GPU ids are 0..N-1; negative values are reserved +// for selectors that don't correspond to a numbered device. +// +// Lives in src/gpu/ rather than src/host/ because SyclBackend.hpp +// (which can't include host-side headers) is the authoritative +// consumer; BatchPlotter / cli.cpp pull the same constants from +// here so the two sides agree on the encoding. + +#pragma once + +namespace pos2gpu { + +// Default thread-local value of sycl_backend::current_device_id_ref(). +// queue() picks sycl::gpu_selector_v in this case — the single-device +// zero-config path users see when --devices is not passed. +inline constexpr int kDefaultGpuId = -1; + +// Routes queue() to sycl::cpu_selector_v — AdaptiveCpp's OMP backend +// on the CPU build path (ACPP_TARGETS=omp). BatchPlotter pushes this +// into device_ids when --cpu (or `cpu` in --devices) is requested, +// so the multi-device fan-out treats CPU like just-another-device. +inline constexpr int kCpuDeviceId = -2; + +} // namespace pos2gpu diff --git a/src/gpu/FeistelCipherGpu.cuh b/src/gpu/FeistelCipherGpu.cuh index 28ee6d5..1afb256 100644 --- a/src/gpu/FeistelCipherGpu.cuh +++ b/src/gpu/FeistelCipherGpu.cuh @@ -5,7 +5,8 @@ #pragma once -#include +#include "gpu/PortableAttrs.hpp" + #include namespace pos2gpu { @@ -16,7 +17,7 @@ struct FeistelKey { int rounds; }; -__host__ __device__ inline FeistelKey make_feistel_key(uint8_t const* plot_id, int k, int rounds = 4) +POS2_HOST_DEVICE_INLINE FeistelKey make_feistel_key(uint8_t const* plot_id, int k, int rounds = 4) { FeistelKey fk; fk.k = k; @@ -26,14 +27,14 @@ __host__ __device__ inline FeistelKey make_feistel_key(uint8_t const* plot_id, i return fk; } -__host__ __device__ inline uint64_t feistel_rotate_left(uint64_t value, uint64_t shift, uint64_t bit_length) +POS2_HOST_DEVICE_INLINE uint64_t feistel_rotate_left(uint64_t value, uint64_t shift, uint64_t bit_length) { if (shift > bit_length) shift = bit_length; uint64_t mask = (bit_length == 64 ? 
~0ULL : ((1ULL << bit_length) - 1)); return ((value << shift) & mask) | (value >> (bit_length - shift)); } -__host__ __device__ inline uint64_t feistel_slice_key(FeistelKey const& fk, int start_bit, int num_bits) +POS2_HOST_DEVICE_INLINE uint64_t feistel_slice_key(FeistelKey const& fk, int start_bit, int num_bits) { int start_byte = start_bit / 8; int bit_offset = start_bit % 8; @@ -49,7 +50,7 @@ __host__ __device__ inline uint64_t feistel_slice_key(FeistelKey const& fk, int return (key_segment >> shift_amount) & mask; } -__host__ __device__ inline uint64_t feistel_round_key(FeistelKey const& fk, int round_num) +POS2_HOST_DEVICE_INLINE uint64_t feistel_round_key(FeistelKey const& fk, int round_num) { int half_length = fk.k; int bits_for_round = 3 * half_length; @@ -61,7 +62,7 @@ __host__ __device__ inline uint64_t feistel_round_key(FeistelKey const& fk, int struct FeistelResultGpu { uint64_t left, right; }; -__host__ __device__ inline FeistelResultGpu feistel_round( +POS2_HOST_DEVICE_INLINE FeistelResultGpu feistel_round( FeistelKey const& fk, uint64_t left, uint64_t right, uint64_t round_key) { int k = fk.k; @@ -87,7 +88,7 @@ __host__ __device__ inline FeistelResultGpu feistel_round( return res; } -__host__ __device__ inline uint64_t feistel_encrypt(FeistelKey const& fk, uint64_t input_value) +POS2_HOST_DEVICE_INLINE uint64_t feistel_encrypt(FeistelKey const& fk, uint64_t input_value) { int k = fk.k; uint64_t bitmask = (k == 64 ? ~0ULL : ((1ULL << k) - 1)); diff --git a/src/gpu/PipelineKernels.cuh b/src/gpu/PipelineKernels.cuh new file mode 100644 index 0000000..37f4a7f --- /dev/null +++ b/src/gpu/PipelineKernels.cuh @@ -0,0 +1,64 @@ +// PipelineKernels.cuh — backend-dispatched wrappers for the simple +// orchestration kernels in src/host/GpuPipeline.cu (init, gather, +// permute, merge). All five are pure grid-stride compute — no AES, no +// shared memory, no atomics — so the SYCL ports are mechanical. +// +// Selection at configure time via XCHPLOT2_BACKEND, same shape as +// T1Offsets / T2Offsets / T3Offsets. + +#pragma once + +#include + +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +// vals[i] = i for i in [0, count). Used to seed the index stream that +// the subsequent radix sort permutes. +void launch_init_u32_identity( + uint32_t* d_vals, + uint64_t count, + sycl::queue& q); + +// dst[p] = src[indices[p]] for p in [0, count). Two width specialisations. +void launch_gather_u64( + uint64_t const* d_src, + uint32_t const* d_indices, + uint64_t* d_dst, + uint64_t count, + sycl::queue& q); + +void launch_gather_u32( + uint32_t const* d_src, + uint32_t const* d_indices, + uint32_t* d_dst, + uint64_t count, + sycl::queue& q); + +// dst_meta[idx] = src_meta [indices[idx]] +// dst_xbits[idx] = src_xbits[indices[idx]] +// for idx in [0, count). T2's two-stream gather, fused. +void launch_permute_t2( + uint64_t const* d_src_meta, + uint32_t const* d_src_xbits, + uint32_t const* d_indices, + uint64_t* d_dst_meta, + uint32_t* d_dst_xbits, + uint64_t count, + sycl::queue& q); + +// Stable 2-way merge of two sorted (key, value) runs via per-thread +// merge-path binary search. A wins on ties (load-bearing for parity +// with the pool path's CUB radix sort). Only the (uint32, uint32) +// instantiation is currently used — both T1 and T2 streaming-merge +// paths sort uint32 keys (match_info) by uint32 indices. 
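+//
+// Tiny example of the tie rule (illustrative values only): merging
+// A = [(5,a0),(7,a1)] with B = [(5,b0),(9,b1)] yields
+// [(5,a0),(5,b0),(7,a1),(9,b1)] — the A element precedes the equal-key
+// B element, which is what a stable sort over A-then-B concatenated
+// would also produce.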
+void launch_merge_pairs_stable_2way_u32_u32( + uint32_t const* d_A_keys, uint32_t const* d_A_vals, uint64_t nA, + uint32_t const* d_B_keys, uint32_t const* d_B_vals, uint64_t nB, + uint32_t* d_out_keys, uint32_t* d_out_vals, + uint64_t total, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/PipelineKernelsSycl.cpp b/src/gpu/PipelineKernelsSycl.cpp new file mode 100644 index 0000000..bf665ae --- /dev/null +++ b/src/gpu/PipelineKernelsSycl.cpp @@ -0,0 +1,123 @@ +// PipelineKernelsSycl.cpp — SYCL implementation of the simple pipeline +// kernels. Mirrors PipelineKernelsCuda.cu; reuses the shared queue from +// SyclBackend.hpp. None of these touch AES so no T-table buffer is +// needed. + +#include "gpu/PipelineKernels.cuh" +#include "gpu/SyclBackend.hpp" + +#include + +namespace pos2gpu { + +namespace { + +constexpr size_t kThreads = 256; + +inline size_t global_for(uint64_t count) +{ + size_t groups = static_cast((count + kThreads - 1) / kThreads); + return groups * kThreads; +} + +} // namespace + +void launch_init_u32_identity( + uint32_t* d_vals, uint64_t count, sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(count), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t idx = it.get_global_id(0); + if (idx >= count) return; + d_vals[idx] = uint32_t(idx); + }).wait(); +} + +void launch_gather_u64( + uint64_t const* d_src, uint32_t const* d_indices, + uint64_t* d_dst, uint64_t count, sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(count), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t p = it.get_global_id(0); + if (p >= count) return; + d_dst[p] = d_src[d_indices[p]]; + }).wait(); +} + +void launch_gather_u32( + uint32_t const* d_src, uint32_t const* d_indices, + uint32_t* d_dst, uint64_t count, sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(count), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t p = it.get_global_id(0); + if (p >= count) return; + d_dst[p] = d_src[d_indices[p]]; + }).wait(); +} + +void launch_permute_t2( + uint64_t const* d_src_meta, uint32_t const* d_src_xbits, + uint32_t const* d_indices, + uint64_t* d_dst_meta, uint32_t* d_dst_xbits, + uint64_t count, sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(count), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t idx = it.get_global_id(0); + if (idx >= count) return; + uint32_t i = d_indices[idx]; + d_dst_meta[idx] = d_src_meta[i]; + d_dst_xbits[idx] = d_src_xbits[i]; + }).wait(); +} + +void launch_merge_pairs_stable_2way_u32_u32( + uint32_t const* d_A_keys, uint32_t const* d_A_vals, uint64_t nA, + uint32_t const* d_B_keys, uint32_t const* d_B_vals, uint64_t nB, + uint32_t* d_out_keys, uint32_t* d_out_vals, uint64_t total, + sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(total), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t p = it.get_global_id(0); + if (p >= total) return; + + uint64_t lo = (p > nB) ? (p - nB) : 0; + uint64_t hi = (p < nA) ? p : nA; + while (lo < hi) { + uint64_t i = lo + (hi - lo + 1) / 2; + uint64_t j = p - i; + uint32_t a_prev = d_A_keys[i - 1]; + uint32_t b_here = (j < nB) ? 
d_B_keys[j] : 0xFFFFFFFFu; + if (a_prev > b_here) { + hi = i - 1; + } else { + lo = i; + } + } + uint64_t i = lo; + uint64_t j = p - i; + + bool take_a; + if (i >= nA) take_a = false; + else if (j >= nB) take_a = true; + else take_a = d_A_keys[i] <= d_B_keys[j]; + + if (take_a) { + d_out_keys[p] = d_A_keys[i]; + d_out_vals[p] = d_A_vals[i]; + } else { + d_out_keys[p] = d_B_keys[j]; + d_out_vals[p] = d_B_vals[j]; + } + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/gpu/PortableAttrs.hpp b/src/gpu/PortableAttrs.hpp new file mode 100644 index 0000000..c959657 --- /dev/null +++ b/src/gpu/PortableAttrs.hpp @@ -0,0 +1,21 @@ +// PortableAttrs.hpp — backend-portable function attribute macros so the +// AES helpers in AesGpu.cuh / AesHashGpu.cuh compile under both nvcc +// (CUDA TU) and acpp/clang (SYCL TU). +// +// Under CUDA the macros expand to the usual __device__ / __host__ / etc. +// markup. Under non-CUDA the markup is dropped and we fall back to plain +// inline (with a force-inline hint where appropriate). The functions +// then compile as ordinary C++ that can be called from a SYCL kernel +// lambda by ADL with no special decoration. + +#pragma once + +#if defined(__CUDACC__) + #define POS2_DEVICE_INLINE __device__ __forceinline__ + #define POS2_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ + #define POS2_HOST_DEVICE __host__ __device__ +#else + #define POS2_DEVICE_INLINE inline __attribute__((always_inline)) + #define POS2_HOST_DEVICE_INLINE inline __attribute__((always_inline)) + #define POS2_HOST_DEVICE +#endif diff --git a/src/gpu/Sort.cuh b/src/gpu/Sort.cuh new file mode 100644 index 0000000..85b5d37 --- /dev/null +++ b/src/gpu/Sort.cuh @@ -0,0 +1,59 @@ +// Sort.cuh — backend-dispatched radix sort wrappers. +// +// Two implementations: +// SortCuda.cu — CUB-backed, compiled by nvcc. NVIDIA-only target. The +// wrapper takes sycl::queue& q and bridges by draining q +// with q.wait(), calling CUB on the default stream, then +// cudaStreamSynchronize(nullptr). CUB and the SYCL backend +// share the same primary CUDA context (libcuda underneath +// both), so device pointers interop natively. ~2 host +// fences per sort call (~50µs each, well under 1ms/plot). +// SortSycl.cpp — TODO: oneDPL-backed for AMD/Intel targets. Slower than +// CUB on NVIDIA but the only path on non-NVIDIA hardware. +// +// CMake selects between them based on the target. For now (NVIDIA-only) +// SortCuda.cu is always built. +// +// API mirrors CUB's two-mode contract: pass d_temp_storage=nullptr to +// query the required temp_bytes; pass real storage to perform the sort. + +#pragma once + +#include +#include + +#include + +namespace pos2gpu { + +// Sort (key, value) pairs by uint32 key over [begin_bit, end_bit) bits. +// Stable. Used for T1 / T2 / Xs sorts (key=match_info, value=index or x). +// +// Both keys_in/vals_in AND keys_out/vals_out are writable: the SYCL +// implementation uses them as a ping-pong pair across radix passes to +// avoid allocating its own (8 × N bytes) alt buffers. Caller treats +// keys_in/vals_in as scratch on input — they get clobbered. The result +// always lands in keys_out/vals_out (the wrapper does a final memcpy +// internally if the pass count is odd). The CUB backend ignores the +// non-constness — it still treats keys_in/vals_in as read-only. 
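A typical call against this contract follows the usual CUB-style two-phase pattern: query the scratch size with a null temp pointer, allocate, then sort. A minimal sketch (the helper name and the choice of end_bit = k are assumptions, not part of this patch):

inline void sort_pairs_by_match_info_example(
    sycl::queue& q,
    uint32_t* d_keys_in, uint32_t* d_keys_out,
    uint32_t* d_vals_in, uint32_t* d_vals_out,
    uint64_t count, int k)
{
    size_t temp_bytes = 0;
    pos2gpu::launch_sort_pairs_u32_u32(
        nullptr, temp_bytes,                      // sizing query only
        d_keys_in, d_keys_out, d_vals_in, d_vals_out,
        count, /*begin_bit=*/0, /*end_bit=*/k, q);

    void* d_temp = sycl::malloc_device<uint8_t>(temp_bytes, q);
    pos2gpu::launch_sort_pairs_u32_u32(
        d_temp, temp_bytes,
        d_keys_in, d_keys_out, d_vals_in, d_vals_out,
        count, 0, k, q);
    // Sorted result is now in d_keys_out / d_vals_out; d_keys_in and
    // d_vals_in have been used as ping-pong scratch and are clobbered.
    sycl::free(d_temp, q);
}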
+void launch_sort_pairs_u32_u32( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +// Sort uint64 keys over [begin_bit, end_bit) bits. Used for the final +// T3 fragment sort (sort by proof_fragment's low 2k bits). +// Same in/out ping-pong contract as launch_sort_pairs_u32_u32. +void launch_sort_keys_u64( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/SortCubInternal.cuh b/src/gpu/SortCubInternal.cuh new file mode 100644 index 0000000..322fd02 --- /dev/null +++ b/src/gpu/SortCubInternal.cuh @@ -0,0 +1,57 @@ +// SortCubInternal.cuh — pure-CUDA, SYCL-free declarations of the +// CUB-backed radix sort. This header is the only entry point that +// SortCuda.cu (compiled by nvcc) needs to see — it deliberately +// does NOT include so the nvcc translation unit +// never reaches into AdaptiveCpp's libkernel headers. +// +// AdaptiveCpp's expected consumer pattern is "compile through acpp, +// or stay out of the SYCL header tree." Pulling +// into a .cu file hits the legacy CUDA branch of half.hpp's +// __acpp_backend_switch and tries to reference __hadd / __hsub / +// etc. that aren't in scope without cuda_fp16.h. Keeping nvcc TUs +// SYCL-free removes that whole class of bug. +// +// The SYCL-typed public API stays in Sort.cuh; SortSyclCub.cpp +// (compiled by acpp) bridges by draining the SYCL queue, calling +// these CUB symbols, and the cudaStreamSynchronize at the end is +// already done inside the CUB body — see comments below. + +#pragma once + +#include +#include + +namespace pos2gpu { + +// Pure-CUDA CUB radix sort. Caller responsibilities: +// - Inputs (keys_in / vals_in) must be ready on the device — the +// SYCL adapter handles this by draining the producing queue +// with q.wait() before calling. +// - Output is on the default CUDA stream and is fully drained +// before the function returns (we cudaStreamSynchronize(nullptr) +// internally so the caller can immediately consume keys_out / +// vals_out without further fences). +// +// Sizing-query mode: pass d_temp_storage = nullptr; *temp_bytes is +// filled with the required scratch size and the function returns +// immediately without doing any work or any sync. +// +// Same in/out ping-pong contract as the SYCL-typed public API in +// Sort.cuh: keys_in/vals_in are clobbered, the result lands in +// keys_out/vals_out (memcpy from the CUB-chosen buffer if needed). +void cub_sort_pairs_u32_u32( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit); + +void cub_sort_keys_u64( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit); + +} // namespace pos2gpu diff --git a/src/gpu/SortCuda.cu b/src/gpu/SortCuda.cu new file mode 100644 index 0000000..3ea4c36 --- /dev/null +++ b/src/gpu/SortCuda.cu @@ -0,0 +1,130 @@ +// SortCuda.cu — CUB-backed implementation of the Sort.cuh wrappers. +// Compiled by nvcc; required when targeting NVIDIA. 
CUB's radix sort is +// state-of-the-art, so on NVIDIA we lean on it directly even from the +// SYCL host code by bridging the queue↔CUDA-stream boundary: drain the +// SYCL queue with q.wait(), run CUB on the default CUDA stream, then +// cudaStreamSynchronize(nullptr). Both backends share the same primary +// CUDA context (libcuda underneath both), so device pointers interop +// natively. Two host fences per sort call (~50µs each, well under +// 1ms/plot at the typical 3 sorts/plot rate). + +// Pure-CUDA TU — never include here, directly or +// transitively. AdaptiveCpp's libkernel reaches into nvcc's CUDA +// device pass via __acpp_backend_switch when the SYCL header is in +// scope, and that path was never intended to be used from +// nvcc-driver-compiled consumer TUs (per the AdaptiveCpp dev's +// guidance: stick to --acpp-targets=generic, or stay out of the +// SYCL header tree from non-acpp compilers). The SYCL-typed entry +// points live in SortSyclCub.cpp (compiled by acpp) and call into +// the cub_sort_* declarations below. +#include "gpu/SortCubInternal.cuh" + +#include +#include + +#include +#include + +namespace pos2gpu { + +namespace { + +inline void cuda_check_or_throw(cudaError_t err, char const* what) +{ + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUB ") + what + ": " + + cudaGetErrorString(err)); + } +} + +} // namespace + +// CUB DoubleBuffer mode: caller passes both buffers as a ping-pong pair, +// CUB picks which one the result lands in (db.Current()), and CUB's own +// scratch shrinks to ~MB of histograms instead of ~2 GB of internal +// temp keys/vals buffers it would otherwise allocate. We then memcpy +// db.Current() to keys_out if needed so the public API contract holds. +// +// Caller (SortSyclCub.cpp) drains the producing SYCL queue with q.wait() +// before this is called. This function syncs the default CUDA stream +// internally before returning so the caller can hand keys_out / vals_out +// straight back to SYCL without another fence. +void cub_sort_pairs_u32_u32( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit) +{ + if (d_temp_storage == nullptr) { + cub::DoubleBuffer d_keys(keys_in, keys_out); + cub::DoubleBuffer d_vals(vals_in, vals_out); + cuda_check_or_throw(cub::DeviceRadixSort::SortPairs( + nullptr, temp_bytes, + d_keys, d_vals, + static_cast(count), begin_bit, end_bit, /*stream=*/nullptr), + "SortPairs (sizing)"); + return; + } + + cub::DoubleBuffer d_keys(keys_in, keys_out); + cub::DoubleBuffer d_vals(vals_in, vals_out); + cuda_check_or_throw(cub::DeviceRadixSort::SortPairs( + d_temp_storage, temp_bytes, + d_keys, d_vals, + static_cast(count), begin_bit, end_bit, /*stream=*/nullptr), + "SortPairs"); + + // CUB picks the output buffer; copy to keys_out/vals_out if it landed + // in keys_in/vals_in instead. 
+ if (d_keys.Current() != keys_out) { + cuda_check_or_throw(cudaMemcpyAsync(keys_out, d_keys.Current(), + count * sizeof(uint32_t), cudaMemcpyDeviceToDevice, nullptr), + "memcpy keys_out"); + } + if (d_vals.Current() != vals_out) { + cuda_check_or_throw(cudaMemcpyAsync(vals_out, d_vals.Current(), + count * sizeof(uint32_t), cudaMemcpyDeviceToDevice, nullptr), + "memcpy vals_out"); + } + + cuda_check_or_throw(cudaStreamSynchronize(nullptr), + "cudaStreamSynchronize after SortPairs"); +} + +void cub_sort_keys_u64( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit) +{ + if (d_temp_storage == nullptr) { + cub::DoubleBuffer d_keys(keys_in, keys_out); + cuda_check_or_throw(cub::DeviceRadixSort::SortKeys( + nullptr, temp_bytes, + d_keys, + static_cast(count), begin_bit, end_bit, /*stream=*/nullptr), + "SortKeys (sizing)"); + return; + } + + cub::DoubleBuffer d_keys(keys_in, keys_out); + cuda_check_or_throw(cub::DeviceRadixSort::SortKeys( + d_temp_storage, temp_bytes, + d_keys, + static_cast(count), begin_bit, end_bit, /*stream=*/nullptr), + "SortKeys"); + + if (d_keys.Current() != keys_out) { + cuda_check_or_throw(cudaMemcpyAsync(keys_out, d_keys.Current(), + count * sizeof(uint64_t), cudaMemcpyDeviceToDevice, nullptr), + "memcpy keys_out"); + } + + cuda_check_or_throw(cudaStreamSynchronize(nullptr), + "cudaStreamSynchronize after SortKeys"); +} + +} // namespace pos2gpu diff --git a/src/gpu/SortDispatch.cpp b/src/gpu/SortDispatch.cpp new file mode 100644 index 0000000..f0d8d3f --- /dev/null +++ b/src/gpu/SortDispatch.cpp @@ -0,0 +1,104 @@ +// SortDispatch.cpp — runtime backend dispatch for the radix sort wrappers. +// +// Two implementations can coexist in the same binary on dual-toolchain +// builds: +// +// launch_sort_*_cub — CUB-backed (SortSyclCub.cpp + SortCuda.cu); +// present only when XCHPLOT2_HAVE_CUB defined. +// launch_sort_*_sycl — pure-SYCL hand-rolled radix (SortSycl.cpp); +// always present. +// +// The dispatcher picks based on the queue's device backend, so a hybrid +// host (NVIDIA + AMD on the same box) runs CUB on the NVIDIA worker and +// SYCL radix on the AMD worker without rebuilding. Single-vendor builds +// (BUILD_CUDA=OFF) compile out the CUB branch entirely; the dispatcher +// reduces to a single tail call. 
+ +#include "gpu/Sort.cuh" + +namespace pos2gpu { + +#if defined(XCHPLOT2_HAVE_CUB) +void launch_sort_pairs_u32_u32_cub( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +void launch_sort_keys_u64_cub( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); +#endif + +void launch_sort_pairs_u32_u32_sycl( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +void launch_sort_keys_u64_sycl( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +void launch_sort_pairs_u32_u32( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ +#if defined(XCHPLOT2_HAVE_CUB) + if (q.get_device().get_backend() == sycl::backend::cuda) { + launch_sort_pairs_u32_u32_cub( + d_temp_storage, temp_bytes, + keys_in, keys_out, vals_in, vals_out, + count, begin_bit, end_bit, q); + return; + } +#endif + launch_sort_pairs_u32_u32_sycl( + d_temp_storage, temp_bytes, + keys_in, keys_out, vals_in, vals_out, + count, begin_bit, end_bit, q); +} + +void launch_sort_keys_u64( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ +#if defined(XCHPLOT2_HAVE_CUB) + if (q.get_device().get_backend() == sycl::backend::cuda) { + launch_sort_keys_u64_cub( + d_temp_storage, temp_bytes, + keys_in, keys_out, + count, begin_bit, end_bit, q); + return; + } +#endif + launch_sort_keys_u64_sycl( + d_temp_storage, temp_bytes, + keys_in, keys_out, + count, begin_bit, end_bit, q); +} + +} // namespace pos2gpu diff --git a/src/gpu/SortSycl.cpp b/src/gpu/SortSycl.cpp new file mode 100644 index 0000000..1984b35 --- /dev/null +++ b/src/gpu/SortSycl.cpp @@ -0,0 +1,391 @@ +// SortSycl.cpp — stable LSD radix sort in SYCL with parallel scan + +// per-tile parallel-across-tiles scatter. Used when XCHPLOT2_BUILD_CUDA=OFF; +// the CUDA build uses SortCuda.cu (CUB). +// +// Why hand-rolled? oneDPL's sort_by_key segfaults on AdaptiveCpp's CUDA +// backend, and AdaptiveCpp's bitonic_sort is O(N log² N) and unstable +// (we need stability for LSD radix). This implementation runs on every +// AdaptiveCpp backend (CUDA, HIP, Level Zero, OpenCL). +// +// Design (per 4-bit pass; RADIX=16; TILE_SIZE=1024 items per workgroup): +// Phase 1 — parallel per-tile count: each WG reduces its tile into a +// local 16-bucket histogram, then writes those 16 counts (no atomics) +// into a bucket-major device array tile_hist[d * num_tiles + t]. The +// bucket-major layout is what makes phase 2 a single 1-D scan. +// Phase 2 — global exclusive scan over the entire tile_hist via +// AdaptiveCpp's scanning::scan (decoupled-lookback, multi-WG, parallel). +// The scan output, tile_offsets[d * num_tiles + t], is exactly the +// starting position in the output where tile t's bucket-d items go, +// because the bucket-major layout means the scan accumulates each +// bucket's tiles in order, then rolls over to the next bucket. 
Stable +// by construction: tile t < t' always lands earlier within bucket d. +// Phase 3 — parallel-across-tiles scatter: each WG loads its tile into +// local memory, then thread 0 sequentially walks the tile and emits +// each item to out[tile_offsets[d * num_tiles + t] + pos[d]++]. Stable +// within each tile (sequential walk preserves input order). +// +// Performance vs CUB: significantly slower (single-thread scatter per WG +// is ~32× under-utilized vs CUB's warp-cooperative scatter), but parallel +// across tiles. Future work: cooperative intra-tile scatter using per-WG +// per-bucket prefix scans. For now, correct and parallel beats fast and +// wrong. + +#include "gpu/Sort.cuh" + +#include + +#include "hipSYCL/algorithms/scan/scan.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" + +#include +#include + +namespace pos2gpu { + +namespace { + +constexpr int RADIX_BITS = 4; +constexpr int RADIX = 1 << RADIX_BITS; +constexpr int RADIX_MASK = RADIX - 1; +constexpr int WG_SIZE = 256; +constexpr int ITEMS_PER_THREAD = 4; +constexpr int TILE_SIZE = WG_SIZE * ITEMS_PER_THREAD; // 1024 + +using local_atomic_u32 = sycl::atomic_ref< + uint32_t, + sycl::memory_order::relaxed, + sycl::memory_scope::work_group, + sycl::access::address_space::local_space>; + +// Per-process scratch cache for AdaptiveCpp's scan algorithm. Lives for +// the program's lifetime; allocations are pooled and reused across calls. +hipsycl::algorithms::util::allocation_cache& scan_alloc_cache() +{ + static hipsycl::algorithms::util::allocation_cache cache( + hipsycl::algorithms::util::allocation_type::device); + return cache; +} + +uint64_t tile_count_for(uint64_t count) +{ + return (count + TILE_SIZE - 1) / TILE_SIZE; +} + +void radix_pass_pairs_u32( + sycl::queue& q, + uint32_t const* in_keys, uint32_t const* in_vals, + uint32_t* out_keys, uint32_t* out_vals, + uint32_t* tile_hist, uint32_t* tile_offsets, + uint64_t count, int bit) +{ + uint64_t const num_tiles = tile_count_for(count); + uint64_t const grid = num_tiles * WG_SIZE; + + // Phase 1: per-tile histogram → tile_hist[d * num_tiles + t]. + q.submit([&](sycl::handler& h) { + sycl::local_accessor local_hist(sycl::range<1>(RADIX), h); + h.parallel_for(sycl::nd_range<1>(grid, WG_SIZE), + [=](sycl::nd_item<1> it) { + int const tid = static_cast(it.get_local_id(0)); + uint64_t const tile = it.get_group(0); + + if (tid < RADIX) local_hist[tid] = 0; + it.barrier(sycl::access::fence_space::local_space); + + uint64_t const base = tile * TILE_SIZE; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + uint64_t const idx = base + static_cast(i) * WG_SIZE + tid; + if (idx < count) { + uint32_t const d = (in_keys[idx] >> bit) & RADIX_MASK; + local_atomic_u32(local_hist[d]).fetch_add(1u); + } + } + it.barrier(sycl::access::fence_space::local_space); + + if (tid < RADIX) { + tile_hist[static_cast(tid) * num_tiles + tile] = local_hist[tid]; + } + }); + }); + q.wait(); + + // Phase 2: parallel exclusive scan over the entire tile_hist. + { + hipsycl::algorithms::util::allocation_group scratch_alloc( + &scan_alloc_cache(), q.get_device()); + size_t const scan_size = static_cast(RADIX) * static_cast(num_tiles); + hipsycl::algorithms::scanning::scan( + q, scratch_alloc, + tile_hist, tile_hist + scan_size, + tile_offsets, + sycl::plus{}, + uint32_t{0}).wait(); + } + + // Phase 3: per-tile stable scatter, cooperative across the WG. 
+ // Items are laid out in local memory CONTIGUOUSLY-PER-THREAD so that + // the per-digit prefix scan (one per bucket; 16 iterations) yields + // ranks in input order, preserving stability. Each iteration: + // 1. Each thread counts its items that match the current digit. + // 2. exclusive_scan_over_group turns those counts into per-thread + // offsets within the bucket. + // 3. Each thread scatters its matching items to local_bases[d] + + // offset, advancing one position per matching item. + q.submit([&](sycl::handler& h) { + sycl::local_accessor local_keys (sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_vals (sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_digits(sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_bases (sycl::range<1>(RADIX), h); + h.parallel_for(sycl::nd_range<1>(grid, WG_SIZE), + [=](sycl::nd_item<1> it) { + int const tid = static_cast(it.get_local_id(0)); + uint64_t const tile = it.get_group(0); + auto const grp = it.get_group(); + + uint64_t const base = tile * TILE_SIZE; + int const items_in_tile = static_cast( + sycl::min(TILE_SIZE, count - base)); + + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile) { + uint32_t const k = in_keys[base + local_pos]; + local_keys [local_pos] = k; + local_vals [local_pos] = in_vals[base + local_pos]; + local_digits[local_pos] = static_cast((k >> bit) & RADIX_MASK); + } + } + + if (tid < RADIX) { + local_bases[tid] = tile_offsets[ + static_cast(tid) * num_tiles + tile]; + } + it.barrier(sycl::access::fence_space::local_space); + + for (int d = 0; d < RADIX; ++d) { + uint32_t my_count = 0; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile && local_digits[local_pos] == d) { + ++my_count; + } + } + + uint32_t const my_offset = sycl::exclusive_scan_over_group( + grp, my_count, sycl::plus()); + + uint32_t pos_in_bucket = my_offset; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile && local_digits[local_pos] == d) { + uint32_t const target = local_bases[d] + pos_in_bucket; + out_keys[target] = local_keys[local_pos]; + out_vals[target] = local_vals[local_pos]; + ++pos_in_bucket; + } + } + it.barrier(sycl::access::fence_space::local_space); + } + }); + }); + q.wait(); +} + +void radix_pass_keys_u64( + sycl::queue& q, + uint64_t const* in_keys, + uint64_t* out_keys, + uint32_t* tile_hist, uint32_t* tile_offsets, + uint64_t count, int bit) +{ + uint64_t const num_tiles = tile_count_for(count); + uint64_t const grid = num_tiles * WG_SIZE; + + q.submit([&](sycl::handler& h) { + sycl::local_accessor local_hist(sycl::range<1>(RADIX), h); + h.parallel_for(sycl::nd_range<1>(grid, WG_SIZE), + [=](sycl::nd_item<1> it) { + int const tid = static_cast(it.get_local_id(0)); + uint64_t const tile = it.get_group(0); + + if (tid < RADIX) local_hist[tid] = 0; + it.barrier(sycl::access::fence_space::local_space); + + uint64_t const base = tile * TILE_SIZE; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + uint64_t const idx = base + static_cast(i) * WG_SIZE + tid; + if (idx < count) { + uint32_t const d = + static_cast((in_keys[idx] >> bit) & uint64_t{RADIX_MASK}); + local_atomic_u32(local_hist[d]).fetch_add(1u); + } + } + it.barrier(sycl::access::fence_space::local_space); + + if (tid < RADIX) { + tile_hist[static_cast(tid) * num_tiles + tile] = local_hist[tid]; + } + }); + }); + 
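    // Toy illustration of the bucket-major layout the scan below relies
    // on (made-up sizes, not real values): with RADIX = 2 and
    // num_tiles = 2, where tile 0 holds 3 digit-0 items and 1 digit-1
    // item and tile 1 holds 2 digit-0 items and 2 digit-1 items:
    //
    //   tile_hist      = [ t0/d0=3, t1/d0=2, t0/d1=1, t1/d1=2 ]  (bucket-major)
    //   exclusive scan = [ 0,       3,       5,       6        ]  = tile_offsets
    //
    // Tile 0's digit-0 items start at output position 0, tile 1's
    // digit-0 items at 3, tile 0's digit-1 items at 5, tile 1's at 6:
    // every digit-0 item precedes every digit-1 item, and within a
    // digit lower-numbered tiles land first, which is exactly the
    // ordering the stable scatter phase depends on.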
q.wait(); + + { + hipsycl::algorithms::util::allocation_group scratch_alloc( + &scan_alloc_cache(), q.get_device()); + size_t const scan_size = static_cast(RADIX) * static_cast(num_tiles); + hipsycl::algorithms::scanning::scan( + q, scratch_alloc, + tile_hist, tile_hist + scan_size, + tile_offsets, + sycl::plus{}, + uint32_t{0}).wait(); + } + + q.submit([&](sycl::handler& h) { + sycl::local_accessor local_keys (sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_digits(sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_bases (sycl::range<1>(RADIX), h); + h.parallel_for(sycl::nd_range<1>(grid, WG_SIZE), + [=](sycl::nd_item<1> it) { + int const tid = static_cast(it.get_local_id(0)); + uint64_t const tile = it.get_group(0); + auto const grp = it.get_group(); + + uint64_t const base = tile * TILE_SIZE; + int const items_in_tile = static_cast( + sycl::min(TILE_SIZE, count - base)); + + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile) { + uint64_t const k = in_keys[base + local_pos]; + local_keys [local_pos] = k; + local_digits[local_pos] = + static_cast((k >> bit) & uint64_t{RADIX_MASK}); + } + } + + if (tid < RADIX) { + local_bases[tid] = tile_offsets[ + static_cast(tid) * num_tiles + tile]; + } + it.barrier(sycl::access::fence_space::local_space); + + for (int d = 0; d < RADIX; ++d) { + uint32_t my_count = 0; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile && local_digits[local_pos] == d) { + ++my_count; + } + } + + uint32_t const my_offset = sycl::exclusive_scan_over_group( + grp, my_count, sycl::plus()); + + uint32_t pos_in_bucket = my_offset; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile && local_digits[local_pos] == d) { + uint32_t const target = local_bases[d] + pos_in_bucket; + out_keys[target] = local_keys[local_pos]; + ++pos_in_bucket; + } + } + it.barrier(sycl::access::fence_space::local_space); + } + }); + }); + q.wait(); +} + +} // namespace + +// DoubleBuffer-style ping-pong over caller's buffers — no internal alt +// allocation. Scratch is just tile_hist + tile_offsets (a few MB at k=28 +// vs the ~6 GB the old keys_alt/vals_alt cost there). The result lands +// in keys_out; if the pass count is odd we do one final memcpy from +// keys_in (which holds the result after the last swap). +// Renamed _sycl in 2026-05; the canonical launch_sort_pairs_u32_u32 lives +// in SortDispatch.cpp and routes to this implementation for non-CUDA +// devices (and for everything when XCHPLOT2_HAVE_CUB isn't defined). +void launch_sort_pairs_u32_u32_sycl( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ + uint64_t const num_tiles = tile_count_for(count); + size_t const bytes = sizeof(uint32_t) * RADIX * num_tiles * 2; + if (d_temp_storage == nullptr) { + temp_bytes = bytes; + return; + } + + uint8_t* p = static_cast(d_temp_storage); + uint32_t* tile_hist = reinterpret_cast(p); p += sizeof(uint32_t) * RADIX * num_tiles; + uint32_t* tile_offsets = reinterpret_cast(p); + + // First pass reads from keys_in (caller's input). Subsequent passes + // ping-pong between keys_in and keys_out — we treat keys_in as + // scratch from here on, which the public API documents. 
+ uint32_t* cur_keys = keys_in; + uint32_t* cur_vals = vals_in; + uint32_t* dst_keys = keys_out; + uint32_t* dst_vals = vals_out; + + for (int bit = begin_bit; bit < end_bit; bit += RADIX_BITS) { + radix_pass_pairs_u32(q, cur_keys, cur_vals, dst_keys, dst_vals, + tile_hist, tile_offsets, count, bit); + std::swap(cur_keys, dst_keys); + std::swap(cur_vals, dst_vals); + } + q.wait(); + + // After the loop, cur_keys/cur_vals point to the buffer holding the + // sorted result (because radix_pass writes to dst, then we swap so + // dst becomes the input for the next pass). If that's not keys_out, + // copy the result over. + if (cur_keys != keys_out) { + q.memcpy(keys_out, cur_keys, sizeof(uint32_t) * count); + q.memcpy(vals_out, cur_vals, sizeof(uint32_t) * count).wait(); + } +} + +void launch_sort_keys_u64_sycl( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ + uint64_t const num_tiles = tile_count_for(count); + size_t const bytes = sizeof(uint32_t) * RADIX * num_tiles * 2; + if (d_temp_storage == nullptr) { + temp_bytes = bytes; + return; + } + + uint8_t* p = static_cast(d_temp_storage); + uint32_t* tile_hist = reinterpret_cast(p); p += sizeof(uint32_t) * RADIX * num_tiles; + uint32_t* tile_offsets = reinterpret_cast(p); + + uint64_t* cur = keys_in; + uint64_t* dst = keys_out; + + for (int bit = begin_bit; bit < end_bit; bit += RADIX_BITS) { + radix_pass_keys_u64(q, cur, dst, tile_hist, tile_offsets, count, bit); + std::swap(cur, dst); + } + q.wait(); + + if (cur != keys_out) { + q.memcpy(keys_out, cur, sizeof(uint64_t) * count).wait(); + } +} + +} // namespace pos2gpu diff --git a/src/gpu/SortSyclCub.cpp b/src/gpu/SortSyclCub.cpp new file mode 100644 index 0000000..f1c47bf --- /dev/null +++ b/src/gpu/SortSyclCub.cpp @@ -0,0 +1,61 @@ +// SortSyclCub.cpp — SYCL-typed entry points for the CUB-backed sort. +// +// Compiled by acpp (the AdaptiveCpp compiler), so +// is in scope here. SortCuda.cu (compiled by nvcc) used to provide +// these directly with a `sycl::queue&` parameter, but that meant +// nvcc was reaching into AdaptiveCpp's libkernel headers — a path +// AdaptiveCpp doesn't intend to support. We now keep nvcc's view +// SYCL-free (see SortCubInternal.cuh) and bridge here: +// +// q.wait() — drain the producing SYCL +// queue so CUB sees the +// right inputs. +// cub_sort_*(...) — pure-CUDA CUB kernel + +// internal cudaStreamSync. +// +// This file is only built when XCHPLOT2_BUILD_CUDA=ON. The dispatcher +// in SortDispatch.cpp routes here for CUDA-backend queues; non-CUDA +// queues (HIP / Level Zero / OpenMP host) flow to SortSycl.cpp's +// launch_sort_*_sycl variants instead. AMD-only / Intel-only / CPU +// builds skip this file entirely (BUILD_CUDA=OFF). + +#include "gpu/Sort.cuh" +#include "gpu/SortCubInternal.cuh" + +namespace pos2gpu { + +void launch_sort_pairs_u32_u32_cub( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ + // The sizing-query path (d_temp_storage == nullptr) never touches + // device memory — no need to fence the SYCL queue. 
+ if (d_temp_storage != nullptr) { + q.wait(); + } + cub_sort_pairs_u32_u32(d_temp_storage, temp_bytes, + keys_in, keys_out, vals_in, vals_out, + count, begin_bit, end_bit); +} + +void launch_sort_keys_u64_cub( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ + if (d_temp_storage != nullptr) { + q.wait(); + } + cub_sort_keys_u64(d_temp_storage, temp_bytes, + keys_in, keys_out, count, begin_bit, end_bit); +} + +} // namespace pos2gpu diff --git a/src/gpu/SyclBackend.hpp b/src/gpu/SyclBackend.hpp new file mode 100644 index 0000000..6ad762a --- /dev/null +++ b/src/gpu/SyclBackend.hpp @@ -0,0 +1,268 @@ +// SyclBackend.hpp — shared SYCL infrastructure for the cross-backend +// kernel implementations in T*OffsetsSycl.cpp. +// +// Both helpers are header-only inline so multiple SYCL TUs (T1OffsetsSycl, +// T2OffsetsSycl, T3OffsetsSycl) share a single queue and a single AES +// T-table USM buffer per process — function-local statics inside inline +// functions have unique-instance semantics under ISO C++17+. +// +// This file is consumed only by the SYCL backend; CUDA TUs never include +// it. It depends on PortableAttrs.hpp solely for the AesTables namespace +// dependency through AesTables.inl, which has no CUDA-specific content. + +#pragma once + +#include "gpu/AesTables.inl" +#include "gpu/DeviceIds.hpp" + +// cuda_fp16.h must precede sycl/sycl.hpp when this header is consumed +// from an nvcc TU — AdaptiveCpp's libkernel/detail/half_representation.hpp +// references __half, which only exists once cuda_fp16 has been seen. +#include "gpu/CudaHalfShim.hpp" +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace pos2gpu::sycl_backend { + +// Async-exception handler for the persistent queue. AdaptiveCpp's +// default policy for unhandled async errors is to call std::terminate() +// via its `throw_result` path, which is what caused the observed +// "Aborted (core dumped)" after a synchronous malloc_device failure +// threw a clean std::runtime_error — secondary async errors (e.g. a +// CUDA:2 from in-flight work on the now-starved context) hit the +// default handler and killed the process before the CLI could exit +// normally. Logging and swallowing here keeps the synchronous +// std::runtime_error as the primary signal. +inline void async_error_handler(sycl::exception_list exns) noexcept +{ + for (std::exception_ptr const& ep : exns) { + try { std::rethrow_exception(ep); } + catch (sycl::exception const& e) { + std::fprintf(stderr, "[sycl async] %s\n", e.what()); + } + catch (std::exception const& e) { + std::fprintf(stderr, "[sycl async] %s\n", e.what()); + } + catch (...) { + std::fprintf(stderr, "[sycl async] (unknown exception type)\n"); + } + } +} + +// Per-thread target device id. A worker thread sets this once at startup +// via set_current_device_id() so that its subsequent queue() call returns +// a queue bound to the requested device. 
Sentinel values: +// kDefaultGpuId (-1) : sycl::gpu_selector_v (single-device default, +// pre-multi-GPU zero-config path) +// kCpuDeviceId (-2) : sycl::cpu_selector_v (latent — kept so a future +// SYCL-on-CPU benchmark path can compare against +// pos2-chip's hand-tuned CPU plotter; production +// --cpu / --devices cpu plotting bypasses this +// and dispatches directly to run_one_plot_cpu() +// in BatchPlotter, see CpuPlotter.cpp) +// 0..N-1 : explicit GPU index from +// sycl::device::get_devices(gpu) +// +// Thread-local, not global: the multi-device fan-out in BatchPlotter runs +// N worker threads, each binding to a distinct device. The main thread +// stays at kDefaultGpuId and sees the default selector. +inline int& current_device_id_ref() +{ + thread_local int id = kDefaultGpuId; + return id; +} + +inline void set_current_device_id(int id) +{ + current_device_id_ref() = id; +} + +inline int current_device_id() +{ + return current_device_id_ref(); +} + +// Every SYCL GPU device this process can see. Used by --devices N to +// translate the user's index into a sycl::device, and by --devices all +// to spawn a worker per device. +// +// Used to filter non-CUDA backends out when the CUB sort path was +// linked, on the theory that a worker landing on an AMD device with +// CUB-only sort would just die mid-pipeline. The runtime backend +// dispatch in SortDispatch.cpp made that filter unnecessary — a hybrid +// host (NVIDIA + AMD) can now run a worker per device, with each +// worker picking the right sort backend at queue construction time. +inline std::vector usable_gpu_devices() +{ + auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); + return devs; +} + +// Per-thread SYCL queue. Bound to the thread's current device id (see +// the kDefaultGpuId / kCpuDeviceId sentinels above). A unique_ptr wrapper +// lets us defer construction until the thread has had a chance to set +// its device id. +// +// gpu_selector_v ensures the CUDA-backed GPU (or whichever AdaptiveCpp +// was configured for) is picked over the OpenMP host device. cpu_selector_v +// bypasses GPU enumeration entirely and lands on AdaptiveCpp's OMP backend +// (CPU build path, ACPP_TARGETS=omp). +// +// Runs a one-shot dispatch sanity check on first construction (see +// validate_kernel_dispatch below). If AdaptiveCpp's HIP / CUDA backend +// on this host produces a no-op kernel stub at JIT/AOT time, the throw +// surfaces here — at the first GPU work request — instead of much later +// as a confusing "T1 match produced 0 entries" / streaming-tier error. +// Set POS2GPU_SKIP_SELFTEST=1 to bypass; useful when you've already +// validated the device this session and want lower startup overhead +// across many short-lived processes. +inline void validate_kernel_dispatch(sycl::queue& q) +{ + if (char const* v = std::getenv("POS2GPU_SKIP_SELFTEST"); v && v[0] == '1') { + return; + } + + constexpr std::size_t N = 16; + constexpr std::uint32_t kPattern = 0xDEADBEEFu; + + std::uint32_t* d = sycl::malloc_device(N, q); + if (!d) { + throw std::runtime_error( + "[selftest] sycl::malloc_device(16 * u32) returned null. " + "The SYCL runtime can't allocate even tiny device buffers — " + "device discovery probably failed (check rocminfo / nvidia-smi, " + "ACPP_VISIBILITY_MASK)."); + } + + // Sentinel-fill: a "no kernel writes landed" outcome shows the + // sentinel, not random uninitialised bytes that might happen to + // match the expected pattern by coincidence. 
+ q.memset(d, 0xCD, N * sizeof(std::uint32_t)).wait(); + q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> it) { + std::size_t idx = it.get_global_id(0); + d[idx] = kPattern + static_cast(idx); + }).wait(); + + std::uint32_t host[N] = {}; + q.memcpy(host, d, N * sizeof(std::uint32_t)).wait(); + sycl::free(d, q); + + int fails = 0; + for (std::size_t i = 0; i < N; ++i) { + if (host[i] != kPattern + static_cast(i)) ++fails; + } + if (fails == 0) return; + + char head[64]; + std::snprintf(head, sizeof(head), "0x%08x (expected 0x%08x)", + host[0], kPattern); + std::string msg = + "[selftest] SYCL kernel writes are not landing on the device. " + "A trivial parallel_for(16) writing a known pattern produced " + "host[0]="; + msg += head; + msg += ".\n "; + if (host[0] == 0xCDCDCDCDu) { + msg += "The pre-launch sentinel (0xCDCDCDCD) is intact, so the " + "kernel completed without writing anything. "; + } else { + msg += "The sentinel was overwritten but with a wrong value — " + "the kernel is dispatching but its output is corrupted. "; + } + msg += "Most likely AdaptiveCpp's HIP / CUDA backend on this host is " + "producing a no-op or miscompiled kernel stub at JIT/AOT time. " + "Diagnose with:\n" + " - ACPP_DEBUG_LEVEL=2 ./xchplot2 ... (shows the JIT log)\n" + " - rocminfo / nvidia-smi (confirm the actual ISA " + "matches the AOT target — see cargo:warning lines from your " + "last `cargo install`)\n" + " - try ACPP_TARGETS=generic (forces SSCP JIT instead " + "of an AOT spoof)\n" + "Bypass the self-test with POS2GPU_SKIP_SELFTEST=1 if you've " + "already validated this device this session."; + throw std::runtime_error(msg); +} + +inline sycl::queue& queue() +{ + thread_local std::unique_ptr q; + if (!q) { + int const id = current_device_id(); + if (id == kCpuDeviceId) { + // AdaptiveCpp's OpenMP backend exposes its host device as + // `info::device_type::host`, which SYCL 2020's + // `cpu_selector_v` *can* reject (host-device is deprecated + // in 2020). And a custom selector lambda does too on the + // 25.10 headers. Bypass selectors and take the first device + // visible under whatever ACPP_VISIBILITY_MASK is in effect — + // when limited to omp, that's the OMP host device by + // construction. When CPU + GPU are both visible, set the + // mask to "omp" before invoking to disambiguate. + auto devs = sycl::device::get_devices(); + if (devs.empty()) { + throw std::runtime_error( + "sycl_backend::queue (CPU): no SYCL devices visible. " + "Set ACPP_VISIBILITY_MASK=omp to expose AdaptiveCpp's " + "OpenMP backend."); + } + q = std::make_unique(devs.front(), + async_error_handler); + } else if (id < 0) { + q = std::make_unique(sycl::gpu_selector_v, + async_error_handler); + } else { + auto devices = usable_gpu_devices(); + if (id >= static_cast(devices.size())) { + throw std::runtime_error( + "sycl_backend::queue: device id " + std::to_string(id) + + " out of range (found " + std::to_string(devices.size()) + + " usable GPU device(s))"); + } + q = std::make_unique(devices[id], async_error_handler); + } + validate_kernel_dispatch(*q); + } + return *q; +} + +// Return the number of SYCL GPU devices visible to the process AND +// usable by this build. Used by BatchOptions::use_all_devices to expand +// "all" into an explicit list. See usable_gpu_devices() for the filter. 
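Taken together, the per-thread device-id sentinel and the lazily constructed thread_local queue give the multi-GPU fan-out a simple shape. A minimal caller sketch (the worker body and function name are assumptions, not the real BatchPlotter code; presumes <thread> and <vector>):

inline void fan_out_one_worker_per_gpu_example()
{
    int const n = pos2gpu::sycl_backend::get_gpu_device_count();
    std::vector<std::thread> workers;
    workers.reserve(static_cast<std::size_t>(n));
    for (int id = 0; id < n; ++id) {
        workers.emplace_back([id] {
            // Must run before the first queue() call on this thread:
            // queue() builds its thread_local queue lazily and binds it
            // to whatever device id is current at that moment.
            pos2gpu::sycl_backend::set_current_device_id(id);
            sycl::queue& q = pos2gpu::sycl_backend::queue();
            (void)q; // ... per-device plotting work would run on q ...
        });
    }
    for (auto& t : workers) t.join();
}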
+inline int get_gpu_device_count() +{ + return static_cast(usable_gpu_devices().size()); +} + +// AES T-tables uploaded into a USM device buffer on first use, kept +// alive for the thread's queue lifetime — mirrors the CUDA path's +// __constant__ T-tables. Thread-local because each worker thread's queue +// is on a different device; the table upload must happen once per device, +// not once per process. +// +// Pointer layout matches what the _smem family expects: [T0|T1|T2|T3], +// 256 entries each. +inline uint32_t* aes_tables_device(sycl::queue& q) +{ + thread_local uint32_t* d_tables = nullptr; + if (d_tables) return d_tables; + + std::vector sT_host(4 * 256); + for (int i = 0; i < 256; ++i) { + sT_host[0 * 256 + i] = pos2gpu::aes_tables::T0[i]; + sT_host[1 * 256 + i] = pos2gpu::aes_tables::T1[i]; + sT_host[2 * 256 + i] = pos2gpu::aes_tables::T2[i]; + sT_host[3 * 256 + i] = pos2gpu::aes_tables::T3[i]; + } + d_tables = sycl::malloc_device(4 * 256, q); + q.memcpy(d_tables, sT_host.data(), sizeof(uint32_t) * 4 * 256).wait(); + return d_tables; +} + +} // namespace pos2gpu::sycl_backend diff --git a/src/gpu/SyclDeviceList.cpp b/src/gpu/SyclDeviceList.cpp new file mode 100644 index 0000000..6993db4 --- /dev/null +++ b/src/gpu/SyclDeviceList.cpp @@ -0,0 +1,45 @@ +// SyclDeviceList.cpp — implementation of list_gpu_devices(). +// Compiled by acpp via add_sycl_to_target so the SYCL headers are in +// scope here; the public-facing header (SyclDeviceList.hpp) carries +// only plain types for non-acpp consumers like cli.cpp. + +#include "gpu/SyclDeviceList.hpp" +#include "gpu/SyclBackend.hpp" + +namespace pos2gpu { + +std::vector list_gpu_devices() +{ + std::vector out; + auto devs = sycl_backend::usable_gpu_devices(); + out.reserve(devs.size()); + for (std::size_t i = 0; i < devs.size(); ++i) { + auto const& d = devs[i]; + GpuDeviceInfo info{}; + info.id = i; + info.name = d.get_info(); + info.vram_bytes = d.get_info(); + info.cu_count = static_cast( + d.get_info()); + info.is_cuda_backend = false; + switch (d.get_backend()) { + case sycl::backend::cuda: + info.backend = "cuda"; + info.is_cuda_backend = true; + break; + case sycl::backend::hip: + info.backend = "hip"; + break; + case sycl::backend::level_zero: + info.backend = "level_zero"; + break; + default: + info.backend = "?"; + break; + } + out.push_back(std::move(info)); + } + return out; +} + +} // namespace pos2gpu diff --git a/src/gpu/SyclDeviceList.hpp b/src/gpu/SyclDeviceList.hpp new file mode 100644 index 0000000..0b35b99 --- /dev/null +++ b/src/gpu/SyclDeviceList.hpp @@ -0,0 +1,34 @@ +// SyclDeviceList.hpp — plain-types declaration for `xchplot2 devices` +// (and any other consumer that needs to enumerate GPU devices without +// pulling into its TU). +// +// cli.cpp is compiled by g++ with -Werror, and including SyclBackend.hpp +// drags in AdaptiveCpp's libkernel/host/builtins.hpp which has a +// narrowing-conversion warning that gets escalated to an error. Keeping +// this header SYCL-free lets non-acpp TUs query the device list via the +// implementation in SyclDeviceList.cpp (compiled by acpp). + +#pragma once + +#include +#include +#include +#include + +namespace pos2gpu { + +struct GpuDeviceInfo { + std::size_t id; + std::string name; + std::string backend; // "cuda" / "hip" / "level_zero" / "opencl" / "?" + bool is_cuda_backend; // true iff backend == sycl::backend::cuda + std::uint64_t vram_bytes; + unsigned cu_count; // max_compute_units +}; + +// Enumerate every visible SYCL GPU device. 
Order matches what +// `--devices N` uses for index lookup, so the printed `[N]` is a +// drop-in for that flag. +std::vector list_gpu_devices(); + +} // namespace pos2gpu diff --git a/src/gpu/T1Kernel.cpp b/src/gpu/T1Kernel.cpp new file mode 100644 index 0000000..75a43bf --- /dev/null +++ b/src/gpu/T1Kernel.cpp @@ -0,0 +1,202 @@ +// T1Kernel.cu — port of pos2-chip Table1Constructor. +// +// Algorithm (mirrors pos2-chip/src/plot/TableConstructorGeneric.hpp): +// +// For each section_l in {0,1,2,3} (order doesn't affect the *set* of +// T1Pairings produced; CPU iterates 3,0,2,1 but the post-construct +// sort by match_info collapses ordering): +// section_r = matching_section(section_l) +// For each match_key_r in [0, num_match_keys): +// L = sorted_xs[section_l..section_l+1) (entire section) +// R = sorted_xs in (section_r, match_key_r) bucket +// For each L candidate (one thread): +// target_l = matching_target(1, match_key_r, x_l) & target_mask +// binary-search R for first entry with match_target == target_l +// walk forward while still equal; for each: +// pairing_t1(x_l, x_r); if test_result == 0, emit T1Pairing +// { meta = (x_l << k) | x_r, match_info = pair.r[0] mask k } + +#include "host/PoolSizing.hpp" + +#include "gpu/AesGpu.cuh" +#include "gpu/AesHashGpu.cuh" +#include "gpu/T1Kernel.cuh" +#include "gpu/T1Offsets.cuh" + +#include +#include + +namespace pos2gpu { + +T1MatchParams make_t1_params(int k, int strength) +{ + T1MatchParams p{}; + p.k = k; + p.strength = strength; + p.num_section_bits = (k < 28) ? 2 : (k - 26); + p.num_match_key_bits = 2; // table_id == 1 + p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; + return p; +} + +// All T1 kernels (compute_bucket_offsets, compute_fine_bucket_offsets, +// match_all_buckets) and the previously-unused matching_section helper +// have moved to T1Offsets.cuh / T1OffsetsSycl.cpp on the cross-backend path. + +namespace { + +constexpr int kT1FineBits = 8; + +struct T1Derived { + uint32_t num_sections; + uint32_t num_match_keys; + uint32_t num_buckets; + uint64_t fine_entries; + size_t bucket_bytes; + size_t fine_bytes; + size_t temp_needed; + uint32_t target_mask; + uint64_t l_count_max; +}; + +T1Derived derive_t1(T1MatchParams const& params) +{ + T1Derived d{}; + d.num_sections = 1u << params.num_section_bits; + d.num_match_keys = 1u << params.num_match_key_bits; + d.num_buckets = d.num_sections * d.num_match_keys; + uint64_t const fine_count = 1ull << kT1FineBits; + d.fine_entries = uint64_t(d.num_buckets) * fine_count + 1; + d.bucket_bytes = sizeof(uint64_t) * (d.num_buckets + 1); + d.fine_bytes = sizeof(uint64_t) * d.fine_entries; + d.temp_needed = d.bucket_bytes + d.fine_bytes; + d.target_mask = (params.num_match_target_bits >= 32) + ? 
0xFFFFFFFFu + : ((1u << params.num_match_target_bits) - 1u); + d.l_count_max = + static_cast(max_pairs_per_section(params.k, params.num_section_bits)); + return d; +} + +} // namespace + +void launch_t1_match_prepare( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + if (!plot_id_bytes || !temp_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + + T1Derived const d = derive_t1(params); + + if (d_temp_storage == nullptr) { + *temp_bytes = d.temp_needed; + return; + } + if (*temp_bytes < d.temp_needed) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_xs || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.num_match_target_bits <= kT1FineBits) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto* d_offsets = reinterpret_cast(d_temp_storage); + auto* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + launch_compute_bucket_offsets( + d_sorted_xs, total, + params.num_match_target_bits, + d.num_buckets, d_offsets, q); + launch_compute_fine_bucket_offsets( + d_sorted_xs, d_offsets, + params.num_match_target_bits, kT1FineBits, + d.num_buckets, d_fine_offsets, q); + q.memset(d_out_count, 0, sizeof(uint64_t)).wait(); +} + +void launch_t1_match_range( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)total; + if (!plot_id_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_temp_storage) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_xs || !d_out_meta || !d_out_mi || !d_out_count) + throw std::invalid_argument("invalid argument to launch wrapper"); + + T1Derived const d = derive_t1(params); + if (bucket_end > d.num_buckets) throw std::invalid_argument("invalid argument to launch wrapper"); + if (bucket_end <= bucket_begin) return; + + constexpr int kThreads = 256; + uint64_t const blocks_x_u64 = (d.l_count_max + kThreads - 1) / kThreads; + if (blocks_x_u64 > UINT_MAX) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto const* d_offsets = reinterpret_cast(d_temp_storage); + auto const* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + AesHashKeys keys = make_keys(plot_id_bytes); + + int const extra_rounds_bits = params.strength - 2; + int const num_test_bits = params.num_match_key_bits; + int const num_info_bits = params.k; + + launch_t1_match_all_buckets( + keys, d_sorted_xs, + const_cast(d_offsets), + const_cast(d_fine_offsets), + d.num_match_keys, d.num_buckets, + params.k, params.num_section_bits, + params.num_match_target_bits, kT1FineBits, + extra_rounds_bits, d.target_mask, + num_test_bits, num_info_bits, + d_out_meta, d_out_mi, d_out_count, + capacity, 
d.l_count_max, + bucket_begin, bucket_end, q); +} + +void launch_t1_match( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t capacity, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + // Single-shot wrapper: prepare + one full-range match. Preserves + // the original API for pool path, test mode, and parity tests. + launch_t1_match_prepare( + plot_id_bytes, params, d_sorted_xs, total, + d_out_count, d_temp_storage, temp_bytes, q); + if (d_temp_storage == nullptr) return; // size-query path + + T1Derived const d = derive_t1(params); + launch_t1_match_range( + plot_id_bytes, params, d_sorted_xs, total, + d_out_meta, d_out_mi, d_out_count, + capacity, d_temp_storage, + /*bucket_begin=*/0, /*bucket_end=*/d.num_buckets, q); +} + +} // namespace pos2gpu diff --git a/src/gpu/T1Kernel.cu b/src/gpu/T1Kernel.cu deleted file mode 100644 index 43ef516..0000000 --- a/src/gpu/T1Kernel.cu +++ /dev/null @@ -1,328 +0,0 @@ -// T1Kernel.cu — port of pos2-chip Table1Constructor. -// -// Algorithm (mirrors pos2-chip/src/plot/TableConstructorGeneric.hpp): -// -// For each section_l in {0,1,2,3} (order doesn't affect the *set* of -// T1Pairings produced; CPU iterates 3,0,2,1 but the post-construct -// sort by match_info collapses ordering): -// section_r = matching_section(section_l) -// For each match_key_r in [0, num_match_keys): -// L = sorted_xs[section_l..section_l+1) (entire section) -// R = sorted_xs in (section_r, match_key_r) bucket -// For each L candidate (one thread): -// target_l = matching_target(1, match_key_r, x_l) & target_mask -// binary-search R for first entry with match_target == target_l -// walk forward while still equal; for each: -// pairing_t1(x_l, x_r); if test_result == 0, emit T1Pairing -// { meta = (x_l << k) | x_r, match_info = pair.r[0] mask k } - -#include "gpu/AesGpu.cuh" -#include "gpu/AesHashGpu.cuh" -#include "gpu/T1Kernel.cuh" - -#include -#include -#include -#include - -namespace pos2gpu { - -T1MatchParams make_t1_params(int k, int strength) -{ - T1MatchParams p{}; - p.k = k; - p.strength = strength; - p.num_section_bits = (k < 28) ? 2 : (k - 26); - p.num_match_key_bits = 2; // table_id == 1 - p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; - return p; -} - -namespace { - -// Mirrors pos2-chip/src/pos/ProofCore.hpp:198 matching_section. 
-__host__ __device__ inline uint32_t matching_section(uint32_t section, int num_section_bits) -{ - uint32_t num_sections = 1u << num_section_bits; - uint32_t mask = num_sections - 1u; - uint32_t rotated_left = ((section << 1) | (section >> (num_section_bits - 1))) & mask; - uint32_t rotated_left_plus_1 = (rotated_left + 1) & mask; - uint32_t section_new = ((rotated_left_plus_1 >> 1) - | (rotated_left_plus_1 << (num_section_bits - 1))) & mask; - return section_new; -} - -__global__ void compute_bucket_offsets( - XsCandidateGpu const* __restrict__ sorted, - uint64_t total, - int num_match_target_bits, // bucket id = match_info >> num_match_target_bits - uint32_t num_buckets, // num_sections * num_match_keys - uint64_t* __restrict__ offsets) // offsets[num_buckets + 1] -{ - if (threadIdx.x != 0 || blockIdx.x != 0) return; - uint32_t bucket_shift = static_cast(num_match_target_bits); - - uint64_t pos = 0; - for (uint32_t b = 0; b < num_buckets; ++b) { - uint64_t lo = pos, hi = total; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t bucket_mid = sorted[mid].match_info >> bucket_shift; - if (bucket_mid < b) lo = mid + 1; - else hi = mid; - } - offsets[b] = lo; - pos = lo; - } - offsets[num_buckets] = total; -} - -// See T3Kernel.cu for the rationale. T1's sorted stream is -// XsCandidateGpu AoS; we read match_info directly from the struct. -__global__ void compute_fine_bucket_offsets( - XsCandidateGpu const* __restrict__ sorted, - uint64_t const* __restrict__ bucket_offsets, - int num_match_target_bits, - int fine_bits, - uint32_t num_buckets, - uint64_t* __restrict__ fine_offsets) -{ - uint32_t const fine_count = 1u << fine_bits; - uint32_t const total = num_buckets * fine_count; - uint32_t const tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total) return; - - uint32_t const r_bucket = tid / fine_count; - uint32_t const fine_key = tid % fine_count; - - uint64_t const r_start = bucket_offsets[r_bucket]; - uint64_t const r_end = bucket_offsets[r_bucket + 1]; - - uint32_t const target_mask = (num_match_target_bits >= 32) - ? 0xFFFFFFFFu - : ((1u << num_match_target_bits) - 1u); - uint32_t const shift = static_cast(num_match_target_bits - fine_bits); - - uint64_t lo = r_start, hi = r_end; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t t = (sorted[mid].match_info & target_mask) >> shift; - if (t < fine_key) lo = mid + 1; - else hi = mid; - } - fine_offsets[tid] = lo; - - if (tid == total - 1) { - fine_offsets[total] = bucket_offsets[num_buckets]; - } -} - -// Fused match kernel: handles all (section_l, match_key_r) buckets in a -// single launch. blockIdx.y identifies the bucket, blockIdx.x slices L. -// Loads AES T-tables into shared memory once per block. 
-__global__ __launch_bounds__(256, 4) void match_all_buckets( - AesHashKeys keys, - XsCandidateGpu const* __restrict__ sorted_xs, - uint64_t const* __restrict__ d_offsets, // [num_buckets+1] - uint64_t const* __restrict__ d_fine_offsets, - uint32_t num_match_keys, - int k, - int num_section_bits, - int num_match_target_bits, - int fine_bits, - int extra_rounds_bits, - uint32_t target_mask, - int num_test_bits, - int num_match_info_bits, - T1PairingGpu* __restrict__ out, - unsigned long long* __restrict__ out_count, - uint64_t out_capacity) -{ - __shared__ uint32_t sT[4 * 256]; - load_aes_tables_smem(sT); - __syncthreads(); - - uint32_t bucket_id = blockIdx.y; // 0..num_buckets - uint32_t section_l = bucket_id / num_match_keys; - uint32_t match_key_r = bucket_id % num_match_keys; - - uint32_t section_r; - { - uint32_t mask = (1u << num_section_bits) - 1u; - uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; - uint32_t rl1 = (rl + 1) & mask; - section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; - } - - uint64_t l_start = d_offsets[section_l * num_match_keys]; - uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; - uint32_t r_bucket = section_r * num_match_keys + match_key_r; - - uint64_t l = l_start + blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (l >= l_end) return; - - uint32_t x_l = sorted_xs[l].x; - - // Per pos2-chip/src/pos/ProofHashing.hpp:160, T1's matching_target uses - // extra_rounds_bits = strength - 2 (only T1, not T2/T3). The kernel arg - // already carries that value; we were passing 0 here, producing wrong - // target_l values at strength > 2. - uint32_t target_l = matching_target_smem(keys, 1u, match_key_r, uint64_t(x_l), - sT, extra_rounds_bits) - & target_mask; - - // Fine-bucket pre-index; see T3Kernel.cu for rationale. - uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); - uint32_t fine_key = target_l >> fine_shift; - uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; - uint64_t lo = d_fine_offsets[fine_idx]; - uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; - uint64_t hi = fine_hi; - - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t target_mid = sorted_xs[mid].match_info & target_mask; - if (target_mid < target_l) lo = mid + 1; - else hi = mid; - } - - uint32_t test_mask = (num_test_bits >= 32) ? 0xFFFFFFFFu - : ((1u << num_test_bits) - 1u); - uint32_t info_mask = (num_match_info_bits >= 32) ? 
0xFFFFFFFFu - : ((1u << num_match_info_bits) - 1u); - - for (uint64_t r = lo; r < fine_hi; ++r) { - uint32_t target_r = sorted_xs[r].match_info & target_mask; - if (target_r != target_l) break; - - uint32_t x_r = sorted_xs[r].x; - Result128 res = pairing_smem(keys, uint64_t(x_l), uint64_t(x_r), sT, extra_rounds_bits); - - uint32_t test_result = res.r[3] & test_mask; - if (test_result != 0) continue; - - uint32_t match_info_result = res.r[0] & info_mask; - - unsigned long long out_idx = atomicAdd(out_count, 1ULL); - if (out_idx >= out_capacity) return; - - uint64_t meta = (uint64_t(x_l) << k) | uint64_t(x_r); - T1PairingGpu p; - p.meta_lo = uint32_t(meta); - p.meta_hi = uint32_t(meta >> 32); - p.match_info = match_info_result; - out[out_idx] = p; - } -} - -} // namespace - -cudaError_t launch_t1_match( - uint8_t const* plot_id_bytes, - T1MatchParams const& params, - XsCandidateGpu const* d_sorted_xs, - uint64_t total, - T1PairingGpu* d_out_pairings, - uint64_t* d_out_count, - uint64_t capacity, - void* d_temp_storage, - size_t* temp_bytes, - cudaStream_t stream) -{ - if (!plot_id_bytes || !temp_bytes) return cudaErrorInvalidValue; - if (params.k < 18 || params.k > 32) return cudaErrorInvalidValue; - if (params.strength < 2) return cudaErrorInvalidValue; - - uint32_t num_sections = 1u << params.num_section_bits; - uint32_t num_match_keys = 1u << params.num_match_key_bits; - uint32_t num_buckets = num_sections * num_match_keys; - - // temp layout: offsets[num_buckets + 1] uint64 || fine_offsets[num_buckets * 2^FINE_BITS + 1] - constexpr int FINE_BITS = 8; - uint64_t const fine_count = 1ull << FINE_BITS; - uint64_t const fine_entries = uint64_t(num_buckets) * fine_count + 1; - - size_t const bucket_bytes = sizeof(uint64_t) * (num_buckets + 1); - size_t const fine_bytes = sizeof(uint64_t) * fine_entries; - size_t const needed = bucket_bytes + fine_bytes; - - if (d_temp_storage == nullptr) { - *temp_bytes = needed; - return cudaSuccess; - } - if (*temp_bytes < needed) return cudaErrorInvalidValue; - if (!d_sorted_xs || !d_out_pairings || !d_out_count) return cudaErrorInvalidValue; - if (params.num_match_target_bits <= FINE_BITS) return cudaErrorInvalidValue; - - auto* d_offsets = reinterpret_cast(d_temp_storage); - auto* d_fine_offsets = d_offsets + (num_buckets + 1); - - AesHashKeys keys = make_keys(plot_id_bytes); - - // 1) Bucket offsets. - compute_bucket_offsets<<<1, 1, 0, stream>>>( - d_sorted_xs, total, - params.num_match_target_bits, - num_buckets, - d_offsets); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - // 1b) Fine-bucket offsets: one thread per (r_bucket, fine_key). - uint32_t fine_threads_total = num_buckets * uint32_t(fine_count); - unsigned fine_blocks = (fine_threads_total + 255) / 256; - compute_fine_bucket_offsets<<>>( - d_sorted_xs, d_offsets, - params.num_match_target_bits, FINE_BITS, - num_buckets, d_fine_offsets); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - // Reset out_count to 0. - err = cudaMemsetAsync(d_out_count, 0, sizeof(uint64_t), stream); - if (err != cudaSuccess) return err; - - // 2) Compute max L-count across sections (small H2D copy only for sizing). 
- std::vector h_offsets(num_buckets + 1); - err = cudaMemcpyAsync(h_offsets.data(), d_offsets, - sizeof(uint64_t) * (num_buckets + 1), - cudaMemcpyDeviceToHost, stream); - if (err != cudaSuccess) return err; - err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) return err; - - uint64_t l_count_max = 0; - for (uint32_t s = 0; s < num_sections; ++s) { - uint64_t l_count = h_offsets[(s + 1) * num_match_keys] - - h_offsets[s * num_match_keys]; - if (l_count > l_count_max) l_count_max = l_count; - } - - uint32_t target_mask = (params.num_match_target_bits >= 32) - ? 0xFFFFFFFFu - : ((1u << params.num_match_target_bits) - 1u); - int extra_rounds_bits = params.strength - 2; - int num_test_bits = params.num_match_key_bits; - int num_info_bits = params.k; - - constexpr int kThreads = 256; - uint64_t blocks_x_u64 = (l_count_max + kThreads - 1) / kThreads; - if (blocks_x_u64 > UINT_MAX) return cudaErrorInvalidValue; - dim3 grid(static_cast(blocks_x_u64), num_buckets, 1); - - match_all_buckets<<>>( - keys, d_sorted_xs, d_offsets, d_fine_offsets, - num_match_keys, - params.k, params.num_section_bits, - params.num_match_target_bits, FINE_BITS, - extra_rounds_bits, target_mask, - num_test_bits, num_info_bits, - d_out_pairings, - reinterpret_cast(d_out_count), - capacity); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - return cudaSuccess; -} - -} // namespace pos2gpu diff --git a/src/gpu/T1Kernel.cuh b/src/gpu/T1Kernel.cuh index 05a4aa3..71abf0a 100644 --- a/src/gpu/T1Kernel.cuh +++ b/src/gpu/T1Kernel.cuh @@ -9,7 +9,8 @@ #include "gpu/AesHashGpu.cuh" #include "gpu/XsKernel.cuh" -#include +#include "gpu/CudaHalfShim.hpp" +#include #include #include @@ -37,21 +38,66 @@ T1MatchParams make_t1_params(int k, int strength); // Run the full T1 phase. // d_sorted_xs : output of launch_construct_xs (sorted by match_info) // total : 1 << k -// d_out_pairings : caller-allocated, capacity entries +// d_out_meta : caller-allocated, capacity entries (uint64 meta). +// d_out_mi : caller-allocated, capacity entries (uint32 match_info). // d_out_count : single uint64_t, will hold actual emitted count -// capacity : max number of T1Pairings d_out_pairings can hold +// capacity : max number of T1Pairings the output arrays can hold // d_temp_storage : nullptr to query *temp_bytes; otherwise must be // at least *temp_bytes large -cudaError_t launch_t1_match( +// +// Output is SoA (two parallel streams) rather than an AoS T1PairingGpu +// array so the streaming pipeline can feed d_out_mi straight into CUB +// as the sort-key input and free it as soon as CUB consumes it, without +// touching the meta stream. Saves ~1 GB at k=28 during the T1 sort +// phase. t1_parity and other consumers rebuild the AoS form locally if +// they need it. +void launch_t1_match( uint8_t const* plot_id_bytes, T1MatchParams const& params, XsCandidateGpu const* d_sorted_xs, uint64_t total, - T1PairingGpu* d_out_pairings, + uint64_t* d_out_meta, + uint32_t* d_out_mi, uint64_t* d_out_count, uint64_t capacity, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream = nullptr); + sycl::queue& q); + +// Two-step entry point for callers that want to run T1 match in +// multiple bucket-range passes (parallel to T3's prepare/range plumbing). +// +// launch_t1_match_prepare: computes bucket + fine-bucket offsets into +// d_temp_storage and zeroes d_out_count. Same sizing protocol as +// launch_t1_match (d_temp_storage==nullptr fills *temp_bytes). 
+// +// launch_t1_match_range: runs the match kernel for bucket range +// [bucket_begin, bucket_end). Multiple calls sharing the same +// d_out_meta / d_out_mi / d_out_count produce a concatenated output +// via atomic append, byte-equivalent to a single full-range call +// after the subsequent T1 sort. +void launch_t1_match_prepare( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q); + +void launch_t1_match_range( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); } // namespace pos2gpu diff --git a/src/gpu/T1Offsets.cuh b/src/gpu/T1Offsets.cuh new file mode 100644 index 0000000..79ba482 --- /dev/null +++ b/src/gpu/T1Offsets.cuh @@ -0,0 +1,95 @@ +// T1Offsets.cuh — backend-dispatched wrapper for compute_bucket_offsets. +// +// One-thread-per-bucket binary search that emits offsets[num_buckets+1] +// for T1's sorted XsCandidateGpu stream. Two implementations live in +// sibling TUs and are selected at configure time: +// +// XCHPLOT2_BACKEND=cuda → T1OffsetsCuda.cu (default; existing __global__) +// XCHPLOT2_BACKEND=sycl → T1OffsetsSycl.cpp (AdaptiveCpp parallel_for) +// +// The CUDA stream parameter is honoured by both: the CUDA path launches +// directly on it; the SYCL path syncs the stream before its own launch +// and waits for the SYCL queue to complete before returning, so the +// caller can chain subsequent CUDA work on `stream` unchanged. + +#pragma once + +#include "gpu/AesHashGpu.cuh" +#include "gpu/XsCandidateGpu.hpp" + +#include + +// Forward-declare cudaStream_t instead of including , so the +// SYCL backend implementation (compiled by acpp/clang in non-CUDA mode) can +// include this header without dragging in nvcc-only intrinsics from the +// transitive AesGpu.cuh chain. CUDA-side TUs include +// themselves; the typedef redeclaration to the same type is permitted. +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +void launch_compute_bucket_offsets( + XsCandidateGpu const* d_sorted, + uint64_t total, + int num_match_target_bits, + uint32_t num_buckets, + uint64_t* d_offsets, + sycl::queue& q); + +// Per-fine-key offsets: for each (r_bucket, fine_key) in +// [0, num_buckets) × [0, 2^fine_bits), find the lowest index i in +// `sorted[bucket_offsets[r_bucket] .. bucket_offsets[r_bucket+1])` such +// that ((sorted[i].match_info & target_mask) >> shift) >= fine_key, where +// target_mask = (1<= l_end`. +// +// Across multiple calls sharing the same d_out_meta / d_out_mi / +// d_out_count, results append via the atomic counter — same pattern +// as T3 match's bucket-range plumbing. Used by minimal tier to split +// T1 match into N passes with smaller per-pass staging output, keeping +// d_t1_meta + d_t1_mi off-device until after T1 match completes. 
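[Editor's note] The multi-pass splitting described in the comment above is easiest to see in a host-side driver. The sketch below is hypothetical (the function name t1_match_two_pass, the halfway split point, and the buffer names d_meta / d_mi / d_count are mine); the launch_t1_match_prepare / launch_t1_match_range signatures and the num_buckets derivation are taken from this header, and sycl::malloc_device / sycl::free are standard SYCL 2020 USM calls. It assumes <sycl/sycl.hpp> and T1Kernel.cuh are in scope and all pointers are caller-owned device (USM) buffers.

    // Hypothetical driver (not in the tree): run T1 match in two bucket-range
    // passes that append into the same output streams via the atomic cursor.
    void t1_match_two_pass(uint8_t const* plot_id, T1MatchParams const& params,
                           XsCandidateGpu const* d_sorted_xs, uint64_t total,
                           uint64_t* d_meta, uint32_t* d_mi, uint64_t* d_count,
                           uint64_t capacity, sycl::queue& q)
    {
        size_t temp_bytes = 0;                                        // 1) size query
        launch_t1_match_prepare(plot_id, params, d_sorted_xs, total,
                                d_count, nullptr, &temp_bytes, q);
        void* d_temp = sycl::malloc_device<std::byte>(temp_bytes, q);

        launch_t1_match_prepare(plot_id, params, d_sorted_xs, total,  // 2) offsets + count := 0
                                d_count, d_temp, &temp_bytes, q);

        uint32_t const num_buckets =
            (1u << params.num_section_bits) * (1u << params.num_match_key_bits);
        uint32_t const mid = num_buckets / 2;                         // illustrative split point

        launch_t1_match_range(plot_id, params, d_sorted_xs, total,    // 3) pass A: [0, mid)
                              d_meta, d_mi, d_count, capacity, d_temp, 0, mid, q);
        launch_t1_match_range(plot_id, params, d_sorted_xs, total,    //    pass B: [mid, end)
                              d_meta, d_mi, d_count, capacity, d_temp, mid, num_buckets, q);

        sycl::free(d_temp, q);
        // *d_count now holds the combined pair count; the output order differs
        // from a single full-range call, but the subsequent T1 sort equalizes it.
    }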
+void launch_t1_match_all_buckets( + AesHashKeys keys, + XsCandidateGpu const* d_sorted_xs, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + int extra_rounds_bits, + uint32_t target_mask, + int num_test_bits, + int num_match_info_bits, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/T1OffsetsSycl.cpp b/src/gpu/T1OffsetsSycl.cpp new file mode 100644 index 0000000..c7708e4 --- /dev/null +++ b/src/gpu/T1OffsetsSycl.cpp @@ -0,0 +1,234 @@ +// T1OffsetsSycl.cpp — SYCL/AdaptiveCpp implementation of +// launch_compute_bucket_offsets, selected when XCHPLOT2_BACKEND=sycl. +// +// Same algorithm and output layout as T1OffsetsCuda.cu. The SYCL queue +// uses AdaptiveCpp's CUDA backend (gpu_selector picks the RTX 4090 in +// our test bench), which uses libcuda directly and shares the primary +// CUDA context with the rest of the pipeline — so raw CUDA device +// pointers from cudaMalloc are valid USM device pointers in the SYCL +// kernel without any copy or remap. +// +// Synchronisation: the function syncs `stream` before launching SYCL +// (so prior CUDA writes to d_sorted are visible) and waits for the +// SYCL queue after (so subsequent CUDA reads of d_offsets see the +// SYCL writes). Two extra host syncs vs. the pure-CUDA path; not +// perf-relevant for slice 2. + +#include "gpu/SyclBackend.hpp" +#include "gpu/T1Offsets.cuh" + +#include + +namespace pos2gpu { + + +void launch_compute_bucket_offsets( + XsCandidateGpu const* d_sorted, + uint64_t total, + int num_match_target_bits, + uint32_t num_buckets, + uint64_t* d_offsets, + sycl::queue& q) +{ + constexpr size_t threads = 256; + size_t const out_count = static_cast(num_buckets) + 1; + size_t const groups = (out_count + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t b = static_cast(it.get_global_id(0)); + if (b > num_buckets) return; + if (b == num_buckets) { d_offsets[num_buckets] = total; return; } + + uint32_t bucket_shift = static_cast(num_match_target_bits); + uint64_t lo = 0, hi = total; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t v = d_sorted[mid].match_info >> bucket_shift; + if (v < b) lo = mid + 1; + else hi = mid; + } + d_offsets[b] = lo; + }).wait(); +} + +void launch_compute_fine_bucket_offsets( + XsCandidateGpu const* d_sorted, + uint64_t const* d_bucket_offsets, + int num_match_target_bits, + int fine_bits, + uint32_t num_buckets, + uint64_t* d_fine_offsets, + sycl::queue& q) +{ + constexpr size_t threads = 256; + uint32_t const fine_count = 1u << fine_bits; + uint32_t const total = num_buckets * fine_count; + size_t const groups = (total + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t tid = static_cast(it.get_global_id(0)); + if (tid >= total) return; + + uint32_t r_bucket = tid / fine_count; + uint32_t fine_key = tid % fine_count; + + uint64_t r_start = d_bucket_offsets[r_bucket]; + uint64_t r_end = d_bucket_offsets[r_bucket + 1]; + + uint32_t target_mask = (num_match_target_bits >= 32) + ? 
0xFFFFFFFFu + : ((1u << num_match_target_bits) - 1u); + uint32_t shift = static_cast(num_match_target_bits - fine_bits); + + uint64_t lo = r_start, hi = r_end; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t t = (d_sorted[mid].match_info & target_mask) >> shift; + if (t < fine_key) lo = mid + 1; + else hi = mid; + } + d_fine_offsets[tid] = lo; + + if (tid == total - 1) { + d_fine_offsets[total] = d_bucket_offsets[num_buckets]; + } + }).wait(); +} + +void launch_t1_match_all_buckets( + AesHashKeys keys, + XsCandidateGpu const* d_sorted_xs, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + int extra_rounds_bits, + uint32_t target_mask, + int num_test_bits, + int num_match_info_bits, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)num_buckets; + if (bucket_end <= bucket_begin) return; + uint32_t const num_buckets_in_range = bucket_end - bucket_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + uint64_t blocks_x_u64 = (l_count_max + threads - 1) / threads; + size_t const blocks_x = static_cast(blocks_x_u64); + + auto* d_out_count_ull = + reinterpret_cast(d_out_count); + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<2>{ + sycl::range<2>{ static_cast(num_buckets_in_range), + blocks_x * threads }, + sycl::range<2>{ 1, threads } + }, + [=, keys_copy = keys](sycl::nd_item<2> it) { + // Cooperative load of AES T-tables into local memory. + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(1); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint32_t bucket_id = bucket_begin + static_cast(it.get_group(0)); + uint32_t section_l = bucket_id / num_match_keys; + uint32_t match_key_r = bucket_id % num_match_keys; + + uint32_t section_r; + { + uint32_t mask = (1u << num_section_bits) - 1u; + uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; + uint32_t rl1 = (rl + 1) & mask; + section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; + } + + uint64_t l_start = d_offsets[section_l * num_match_keys]; + uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; + uint32_t r_bucket = section_r * num_match_keys + match_key_r; + + uint64_t l = l_start + + it.get_group(1) * uint64_t(threads) + + local_id; + if (l >= l_end) return; + + uint32_t x_l = d_sorted_xs[l].x; + + uint32_t target_l = pos2gpu::matching_target_smem( + keys_copy, 1u, match_key_r, uint64_t(x_l), + sT, extra_rounds_bits) + & target_mask; + + uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); + uint32_t fine_key = target_l >> fine_shift; + uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; + uint64_t lo = d_fine_offsets[fine_idx]; + uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; + uint64_t hi = fine_hi; + + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t target_mid = d_sorted_xs[mid].match_info & target_mask; + if (target_mid < target_l) lo = mid + 1; + else hi = mid; + } + + uint32_t test_mask = (num_test_bits >= 32) ? 
0xFFFFFFFFu + : ((1u << num_test_bits) - 1u); + uint32_t info_mask = (num_match_info_bits >= 32) ? 0xFFFFFFFFu + : ((1u << num_match_info_bits) - 1u); + + for (uint64_t r = lo; r < fine_hi; ++r) { + uint32_t target_r = d_sorted_xs[r].match_info & target_mask; + if (target_r != target_l) break; + + uint32_t x_r = d_sorted_xs[r].x; + pos2gpu::Result128 res = pos2gpu::pairing_smem( + keys_copy, uint64_t(x_l), uint64_t(x_r), sT, extra_rounds_bits); + + uint32_t test_result = res.r[3] & test_mask; + if (test_result != 0) continue; + + uint32_t match_info_result = res.r[0] & info_mask; + + sycl::atomic_ref + out_count_atomic{ *d_out_count_ull }; + unsigned long long out_idx = out_count_atomic.fetch_add(1ULL); + if (out_idx >= out_capacity) return; + + uint64_t meta = (uint64_t(x_l) << k) | uint64_t(x_r); + d_out_meta[out_idx] = meta; + d_out_mi [out_idx] = match_info_result; + } + }); + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/gpu/T2Kernel.cpp b/src/gpu/T2Kernel.cpp new file mode 100644 index 0000000..e86bb1a --- /dev/null +++ b/src/gpu/T2Kernel.cpp @@ -0,0 +1,213 @@ +// T2Kernel.cu — port of pos2-chip Table2Constructor. +// +// Differences from T1 (see T1Kernel.cu): +// - Input is T1Pairing (12 bytes, has 64-bit meta accessor), not Xs_Candidate. +// - matching_target uses table_id=2 and meta=T1Pairing.meta() (64-bit). +// ProofHashing::matching_target sets extra_rounds_bits=0 for table_id != 1. +// - pairing_t2 calls AesHash::pairing without extra_rounds_bits (always 0). +// - num_match_key_bits = strength (not hard-coded 2 like T1). +// - Output T2Pairing has the AES pair.meta_result (64-bit) + x_bits derived +// from upper-k bits of meta_l/meta_r. + +#include "gpu/AesGpu.cuh" +#include "gpu/AesHashGpu.cuh" +#include "gpu/T2Kernel.cuh" +#include "gpu/T2Offsets.cuh" +#include "host/PoolSizing.hpp" + +#include +#include + +namespace pos2gpu { + +T2MatchParams make_t2_params(int k, int strength) +{ + T2MatchParams p{}; + p.k = k; + p.strength = strength; + p.num_section_bits = (k < 28) ? 2 : (k - 26); + p.num_match_key_bits = strength; // T2 uses strength match_key bits + p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; + return p; +} + +// T2's three kernels — compute_bucket_offsets, compute_fine_bucket_offsets, +// match_all_buckets — have moved to T2Offsets.cuh / T2OffsetsCuda.cu / +// T2OffsetsSycl.cpp on the cross-backend path. The previously-unused +// matching_section helper went with them. + +namespace { + +// Fine-bucket pre-index; see T3Kernel.cu for the scheme. +constexpr int kT2FineBits = 8; + +// Shared parameter derivation so launch_t2_match, launch_t2_match_prepare, +// and launch_t2_match_range all agree on bucket counts, offset layout, +// and temp_storage sizing. 
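[Editor's note] As a sanity check on the shared derivation this comment introduces (make_t2_params above plus derive_t2 below), the numbers for k = 28, strength = 2 work out as follows. They are hand-computed from the formulas in this file, not taken from any test in the tree:

    num_section_bits      = k - 26          = 2      (k is not < 28)
    num_match_key_bits    = strength        = 2
    num_match_target_bits = 28 - 2 - 2      = 24  →  target_mask = 0x00FFFFFF
    num_buckets           = (1<<2) * (1<<2) = 16
    fine_entries          = 16 * 256 + 1    = 4097   (kT2FineBits = 8)
    temp_needed           = 17*8 + 4097*8   = 32,912 bytes  (~32 KiB of d_temp_storage)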
+struct T2Derived { + uint32_t num_sections; + uint32_t num_match_keys; + uint32_t num_buckets; + uint64_t fine_entries; + size_t bucket_bytes; + size_t fine_bytes; + size_t temp_needed; + uint32_t target_mask; + int num_test_bits; + int num_info_bits; + int half_k; + uint64_t l_count_max; +}; + +T2Derived derive_t2(T2MatchParams const& params) +{ + T2Derived d{}; + d.num_sections = 1u << params.num_section_bits; + d.num_match_keys = 1u << params.num_match_key_bits; + d.num_buckets = d.num_sections * d.num_match_keys; + uint64_t const fine_count = 1ull << kT2FineBits; + d.fine_entries = uint64_t(d.num_buckets) * fine_count + 1; + d.bucket_bytes = sizeof(uint64_t) * (d.num_buckets + 1); + d.fine_bytes = sizeof(uint64_t) * d.fine_entries; + d.temp_needed = d.bucket_bytes + d.fine_bytes; + d.target_mask = (params.num_match_target_bits >= 32) + ? 0xFFFFFFFFu + : ((1u << params.num_match_target_bits) - 1u); + d.num_test_bits = params.num_match_key_bits; + d.num_info_bits = params.k; + d.half_k = params.k / 2; + d.l_count_max = + static_cast(max_pairs_per_section(params.k, params.num_section_bits)); + return d; +} + +} // namespace + +void launch_t2_match_prepare( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + if (!plot_id_bytes || !temp_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + + T2Derived const d = derive_t2(params); + + if (d_temp_storage == nullptr) { + *temp_bytes = d.temp_needed; + return; + } + if (*temp_bytes < d.temp_needed) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_mi || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.num_match_target_bits <= kT2FineBits) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto* d_offsets = reinterpret_cast(d_temp_storage); + auto* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + launch_t2_compute_bucket_offsets( + d_sorted_mi, t1_count, + params.num_match_target_bits, + d.num_buckets, d_offsets, q); + launch_t2_compute_fine_bucket_offsets( + d_sorted_mi, d_offsets, + params.num_match_target_bits, kT2FineBits, + d.num_buckets, d_fine_offsets, q); + q.memset(d_out_count, 0, sizeof(uint64_t)).wait(); +} + +void launch_t2_match_range( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)t1_count; + if (!plot_id_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_temp_storage) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_meta || !d_sorted_mi || + !d_out_meta || !d_out_mi || !d_out_xbits || !d_out_count) + { + throw std::invalid_argument("invalid argument to launch wrapper"); + } + + 
T2Derived const d = derive_t2(params); + + if (bucket_end > d.num_buckets) throw std::invalid_argument("invalid argument to launch wrapper"); + if (bucket_end <= bucket_begin) return; // empty range is a no-op + + constexpr int kThreads = 256; + uint64_t const blocks_x_u64 = (d.l_count_max + kThreads - 1) / kThreads; + if (blocks_x_u64 > UINT_MAX) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto const* d_offsets = reinterpret_cast(d_temp_storage); + auto const* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + AesHashKeys keys = make_keys(plot_id_bytes); + + launch_t2_match_all_buckets( + keys, d_sorted_meta, d_sorted_mi, + // launch_t2_match_all_buckets takes mutable pointers to the + // offset arrays (historical — they're treated as const inside + // the kernel). Cast away const at the ABI boundary only. + const_cast(d_offsets), + const_cast(d_fine_offsets), + d.num_match_keys, d.num_buckets, + params.k, params.num_section_bits, + params.num_match_target_bits, kT2FineBits, + d.target_mask, d.num_test_bits, d.num_info_bits, d.half_k, + d_out_meta, d_out_mi, d_out_xbits, d_out_count, + capacity, d.l_count_max, + bucket_begin, bucket_end, + q); +} + +void launch_t2_match( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t capacity, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + // Single-shot wrapper: prepare + one full-range match. Preserves the + // original API for test-mode, the pool path, and parity-test callers. + launch_t2_match_prepare( + plot_id_bytes, params, d_sorted_mi, t1_count, + d_out_count, d_temp_storage, temp_bytes, q); + if (d_temp_storage == nullptr) return; // size-query path + + T2Derived const d = derive_t2(params); + launch_t2_match_range( + plot_id_bytes, params, + d_sorted_meta, d_sorted_mi, t1_count, + d_out_meta, d_out_mi, d_out_xbits, d_out_count, + capacity, d_temp_storage, + /*bucket_begin=*/0, /*bucket_end=*/d.num_buckets, q); +} + +} // namespace pos2gpu diff --git a/src/gpu/T2Kernel.cu b/src/gpu/T2Kernel.cu deleted file mode 100644 index 691d18b..0000000 --- a/src/gpu/T2Kernel.cu +++ /dev/null @@ -1,320 +0,0 @@ -// T2Kernel.cu — port of pos2-chip Table2Constructor. -// -// Differences from T1 (see T1Kernel.cu): -// - Input is T1Pairing (12 bytes, has 64-bit meta accessor), not Xs_Candidate. -// - matching_target uses table_id=2 and meta=T1Pairing.meta() (64-bit). -// ProofHashing::matching_target sets extra_rounds_bits=0 for table_id != 1. -// - pairing_t2 calls AesHash::pairing without extra_rounds_bits (always 0). -// - num_match_key_bits = strength (not hard-coded 2 like T1). -// - Output T2Pairing has the AES pair.meta_result (64-bit) + x_bits derived -// from upper-k bits of meta_l/meta_r. - -#include "gpu/AesGpu.cuh" -#include "gpu/AesHashGpu.cuh" -#include "gpu/T2Kernel.cuh" - -#include -#include -#include -#include - -namespace pos2gpu { - -T2MatchParams make_t2_params(int k, int strength) -{ - T2MatchParams p{}; - p.k = k; - p.strength = strength; - p.num_section_bits = (k < 28) ? 
2 : (k - 26); - p.num_match_key_bits = strength; // T2 uses strength match_key bits - p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; - return p; -} - -namespace { - -__host__ __device__ inline uint32_t matching_section(uint32_t section, int num_section_bits) -{ - uint32_t num_sections = 1u << num_section_bits; - uint32_t mask = num_sections - 1u; - uint32_t rotated_left = ((section << 1) | (section >> (num_section_bits - 1))) & mask; - uint32_t rotated_left_plus_1 = (rotated_left + 1) & mask; - uint32_t section_new = ((rotated_left_plus_1 >> 1) - | (rotated_left_plus_1 << (num_section_bits - 1))) & mask; - return section_new; -} - -__global__ void compute_bucket_offsets( - uint32_t const* __restrict__ sorted_mi, - uint64_t total, - int num_match_target_bits, - uint32_t num_buckets, - uint64_t* __restrict__ offsets) -{ - if (threadIdx.x != 0 || blockIdx.x != 0) return; - uint32_t bucket_shift = static_cast(num_match_target_bits); - - uint64_t pos = 0; - for (uint32_t b = 0; b < num_buckets; ++b) { - uint64_t lo = pos, hi = total; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t bucket_mid = sorted_mi[mid] >> bucket_shift; - if (bucket_mid < b) lo = mid + 1; - else hi = mid; - } - offsets[b] = lo; - pos = lo; - } - offsets[num_buckets] = total; -} - -// See T3Kernel.cu for the rationale — one offset per (r_bucket, top -// fine_bits of target) cuts the match-kernel bsearch window 256× at -// fine_bits=8. -__global__ void compute_fine_bucket_offsets( - uint32_t const* __restrict__ sorted_mi, - uint64_t const* __restrict__ bucket_offsets, - int num_match_target_bits, - int fine_bits, - uint32_t num_buckets, - uint64_t* __restrict__ fine_offsets) -{ - uint32_t const fine_count = 1u << fine_bits; - uint32_t const total = num_buckets * fine_count; - uint32_t const tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total) return; - - uint32_t const r_bucket = tid / fine_count; - uint32_t const fine_key = tid % fine_count; - - uint64_t const r_start = bucket_offsets[r_bucket]; - uint64_t const r_end = bucket_offsets[r_bucket + 1]; - - uint32_t const target_mask = (num_match_target_bits >= 32) - ? 
0xFFFFFFFFu - : ((1u << num_match_target_bits) - 1u); - uint32_t const shift = static_cast(num_match_target_bits - fine_bits); - - uint64_t lo = r_start, hi = r_end; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t t = (sorted_mi[mid] & target_mask) >> shift; - if (t < fine_key) lo = mid + 1; - else hi = mid; - } - fine_offsets[tid] = lo; - - if (tid == total - 1) { - fine_offsets[total] = bucket_offsets[num_buckets]; - } -} - -__global__ __launch_bounds__(256, 4) void match_all_buckets( - AesHashKeys keys, - uint64_t const* __restrict__ sorted_meta, - uint32_t const* __restrict__ sorted_mi, - uint64_t const* __restrict__ d_offsets, - uint64_t const* __restrict__ d_fine_offsets, - uint32_t num_match_keys, - int k, - int num_section_bits, - int num_match_target_bits, - int fine_bits, - uint32_t target_mask, - int num_test_bits, - int num_match_info_bits, - int half_k, - T2PairingGpu* __restrict__ out, - unsigned long long* __restrict__ out_count, - uint64_t out_capacity) -{ - __shared__ uint32_t sT[4 * 256]; - load_aes_tables_smem(sT); - __syncthreads(); - - uint32_t bucket_id = blockIdx.y; - uint32_t section_l = bucket_id / num_match_keys; - uint32_t match_key_r = bucket_id % num_match_keys; - - uint32_t section_r; - { - uint32_t mask = (1u << num_section_bits) - 1u; - uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; - uint32_t rl1 = (rl + 1) & mask; - section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; - } - - uint64_t l_start = d_offsets[section_l * num_match_keys]; - uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; - uint32_t r_bucket = section_r * num_match_keys + match_key_r; - - uint64_t l = l_start + blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (l >= l_end) return; - - uint64_t meta_l = sorted_meta[l]; - - uint32_t target_l = matching_target_smem(keys, 2u, match_key_r, meta_l, sT, 0) - & target_mask; - - // Fine-bucket pre-index; see T3Kernel.cu for rationale. - uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); - uint32_t fine_key = target_l >> fine_shift; - uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; - uint64_t lo = d_fine_offsets[fine_idx]; - uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; - uint64_t hi = fine_hi; - - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t target_mid = sorted_mi[mid] & target_mask; - if (target_mid < target_l) lo = mid + 1; - else hi = mid; - } - - uint32_t test_mask = (num_test_bits >= 32) ? 0xFFFFFFFFu - : ((1u << num_test_bits) - 1u); - uint32_t info_mask = (num_match_info_bits >= 32) ? 0xFFFFFFFFu - : ((1u << num_match_info_bits) - 1u); - int meta_bits = 2 * k; - - for (uint64_t r = lo; r < fine_hi; ++r) { - uint32_t target_r = sorted_mi[r] & target_mask; - if (target_r != target_l) break; - - uint64_t meta_r = sorted_meta[r]; - - Result128 res = pairing_smem(keys, meta_l, meta_r, sT, 0); - - uint32_t test_result = res.r[3] & test_mask; - if (test_result != 0) continue; - - uint32_t match_info_result = res.r[0] & info_mask; - uint64_t meta_result_full = uint64_t(res.r[1]) | (uint64_t(res.r[2]) << 32); - uint64_t meta_result = (meta_bits == 64) - ? 
meta_result_full - : (meta_result_full & ((1ULL << meta_bits) - 1ULL)); - - uint32_t x_bits_l = static_cast((meta_l >> k) >> half_k); - uint32_t x_bits_r = static_cast((meta_r >> k) >> half_k); - uint32_t x_bits = (x_bits_l << half_k) | x_bits_r; - - unsigned long long out_idx = atomicAdd(out_count, 1ULL); - if (out_idx >= out_capacity) return; - - T2PairingGpu p; - p.meta = meta_result; - p.match_info = match_info_result; - p.x_bits = x_bits; - out[out_idx] = p; - } -} - -} // namespace - -cudaError_t launch_t2_match( - uint8_t const* plot_id_bytes, - T2MatchParams const& params, - uint64_t const* d_sorted_meta, - uint32_t const* d_sorted_mi, - uint64_t t1_count, - T2PairingGpu* d_out_pairings, - uint64_t* d_out_count, - uint64_t capacity, - void* d_temp_storage, - size_t* temp_bytes, - cudaStream_t stream) -{ - if (!plot_id_bytes || !temp_bytes) return cudaErrorInvalidValue; - if (params.k < 18 || params.k > 32) return cudaErrorInvalidValue; - if (params.strength < 2) return cudaErrorInvalidValue; - - uint32_t num_sections = 1u << params.num_section_bits; - uint32_t num_match_keys = 1u << params.num_match_key_bits; - uint32_t num_buckets = num_sections * num_match_keys; - - // Fine-bucket pre-index; see T3Kernel.cu for the scheme. - constexpr int FINE_BITS = 8; - uint64_t const fine_count = 1ull << FINE_BITS; - uint64_t const fine_entries = uint64_t(num_buckets) * fine_count + 1; - - size_t const bucket_bytes = sizeof(uint64_t) * (num_buckets + 1); - size_t const fine_bytes = sizeof(uint64_t) * fine_entries; - size_t const needed = bucket_bytes + fine_bytes; - - if (d_temp_storage == nullptr) { - *temp_bytes = needed; - return cudaSuccess; - } - if (*temp_bytes < needed) return cudaErrorInvalidValue; - if (!d_sorted_meta || !d_sorted_mi || !d_out_pairings || !d_out_count) return cudaErrorInvalidValue; - if (params.num_match_target_bits <= FINE_BITS) return cudaErrorInvalidValue; - - auto* d_offsets = reinterpret_cast(d_temp_storage); - auto* d_fine_offsets = d_offsets + (num_buckets + 1); - - AesHashKeys keys = make_keys(plot_id_bytes); - - compute_bucket_offsets<<<1, 1, 0, stream>>>( - d_sorted_mi, t1_count, - params.num_match_target_bits, - num_buckets, - d_offsets); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - uint32_t fine_threads_total = num_buckets * uint32_t(fine_count); - unsigned fine_blocks = (fine_threads_total + 255) / 256; - compute_fine_bucket_offsets<<>>( - d_sorted_mi, d_offsets, - params.num_match_target_bits, FINE_BITS, - num_buckets, d_fine_offsets); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - err = cudaMemsetAsync(d_out_count, 0, sizeof(uint64_t), stream); - if (err != cudaSuccess) return err; - - std::vector h_offsets(num_buckets + 1); - err = cudaMemcpyAsync(h_offsets.data(), d_offsets, - sizeof(uint64_t) * (num_buckets + 1), - cudaMemcpyDeviceToHost, stream); - if (err != cudaSuccess) return err; - err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) return err; - - uint64_t l_count_max = 0; - for (uint32_t s = 0; s < num_sections; ++s) { - uint64_t l_count = h_offsets[(s + 1) * num_match_keys] - - h_offsets[s * num_match_keys]; - if (l_count > l_count_max) l_count_max = l_count; - } - - uint32_t target_mask = (params.num_match_target_bits >= 32) - ? 
0xFFFFFFFFu - : ((1u << params.num_match_target_bits) - 1u); - int num_test_bits = params.num_match_key_bits; - int num_info_bits = params.k; - int half_k = params.k / 2; - - constexpr int kThreads = 256; - uint64_t blocks_x_u64 = (l_count_max + kThreads - 1) / kThreads; - if (blocks_x_u64 > UINT_MAX) return cudaErrorInvalidValue; - dim3 grid(static_cast(blocks_x_u64), num_buckets, 1); - - match_all_buckets<<>>( - keys, d_sorted_meta, d_sorted_mi, - d_offsets, d_fine_offsets, - num_match_keys, - params.k, params.num_section_bits, - params.num_match_target_bits, FINE_BITS, - target_mask, num_test_bits, num_info_bits, half_k, - d_out_pairings, - reinterpret_cast(d_out_count), - capacity); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - return cudaSuccess; -} - -} // namespace pos2gpu diff --git a/src/gpu/T2Kernel.cuh b/src/gpu/T2Kernel.cuh index b311e66..d41b351 100644 --- a/src/gpu/T2Kernel.cuh +++ b/src/gpu/T2Kernel.cuh @@ -9,7 +9,8 @@ #include "gpu/AesHashGpu.cuh" #include "gpu/T1Kernel.cuh" -#include +#include "gpu/CudaHalfShim.hpp" +#include #include #include @@ -45,17 +46,67 @@ T2MatchParams make_t2_params(int k, int strength); // Dropping the 4-byte match_info from the permuted stream trims the sorted-T1 // footprint 12 B → 8 B per entry and removes wasted bandwidth on the match // kernel's hot meta loads. -cudaError_t launch_t2_match( +// +// Output is also SoA: three parallel streams instead of a packed +// T2PairingGpu array. This lets the streaming pipeline free the mi +// stream early (after it's consumed by the subsequent CUB sort as the +// key input) without touching the meta/xbits streams, shaving ~1 GB +// off the k=28 T2-sort peak. The matching-parity tool rebuilds +// T2PairingGpu locally when it needs the AoS form. +void launch_t2_match( uint8_t const* plot_id_bytes, T2MatchParams const& params, uint64_t const* d_sorted_meta, // meta, sorted by match_info ascending uint32_t const* d_sorted_mi, // parallel match_info stream uint64_t t1_count, - T2PairingGpu* d_out_pairings, + uint64_t* d_out_meta, // uint64 meta per emitted pair + uint32_t* d_out_mi, // uint32 match_info per emitted pair + uint32_t* d_out_xbits, // uint32 x_bits per emitted pair uint64_t* d_out_count, uint64_t capacity, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream = nullptr); + sycl::queue& q); + +// Two-step entry point for callers that want to run the match kernel +// in multiple bucket-range passes (e.g. the streaming pipeline's N=2 +// tiling — see docs/t2-match-tiling-plan.md). Equivalent to calling +// launch_t2_match with (0, num_buckets) when the range covers the +// whole bucket space. +// +// launch_t2_match_prepare: computes bucket + fine-bucket offsets into +// d_temp_storage and zeroes d_out_count. Same sizing protocol as +// launch_t2_match (d_temp_storage==nullptr fills *temp_bytes). +// +// launch_t2_match_range: runs the match kernel for bucket-id range +// [bucket_begin, bucket_end). Multiple calls sharing the same +// d_temp_storage / d_out_* buffers / d_out_count produce a single +// concatenated output (atomic counter), byte-equivalent to a single +// full-range call after the subsequent T2 sort. 
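[Editor's note] To put a number on the "~1 GB off the k=28 T2-sort peak" figure in the SoA paragraph above: the pair count below is my own rough estimate (roughly 2^28 surviving pairs at k=28), and sort_t2_by_match_info is a placeholder name for whatever sort consumes the match_info stream as its key input; sycl::free is the standard USM release call.

    // Rough arithmetic, not a measurement:
    //   2^28 pairs × (8 B meta + 4 B match_info + 4 B x_bits) = 4 GiB packed AoS.
    //   As three SoA streams: meta 2 GiB, mi 1 GiB, xbits 1 GiB — same total,
    //   but the 1 GiB mi stream is independently freeable once sorted:
    sort_t2_by_match_info(d_out_mi, d_out_meta, d_out_xbits, count, q);  // hypothetical sort step
    sycl::free(d_out_mi, q);  // ~1 GiB returned before the next phase's allocations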
+void launch_t2_match_prepare( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q); + +void launch_t2_match_range( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); } // namespace pos2gpu diff --git a/src/gpu/T2Offsets.cuh b/src/gpu/T2Offsets.cuh new file mode 100644 index 0000000..f5f2a30 --- /dev/null +++ b/src/gpu/T2Offsets.cuh @@ -0,0 +1,81 @@ +// T2Offsets.cuh — backend-dispatched wrappers for T2's three kernels. +// Parallel to T1Offsets.cuh; selected at configure time via XCHPLOT2_BACKEND +// (T2OffsetsCuda.cu vs T2OffsetsSycl.cpp). +// +// T2's input stream is SoA (uint64 meta + uint32 match_info) rather than +// T1's AoS XsCandidateGpu, so the bucket/fine-offset wrappers take the +// match_info array directly. The match kernel emits three output streams +// (meta, match_info, x_bits) instead of T1's two. + +#pragma once + +#include "gpu/AesHashGpu.cuh" + +#include + +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +void launch_t2_compute_bucket_offsets( + uint32_t const* d_sorted_mi, + uint64_t total, + int num_match_target_bits, + uint32_t num_buckets, + uint64_t* d_offsets, + sycl::queue& q); + +void launch_t2_compute_fine_bucket_offsets( + uint32_t const* d_sorted_mi, + uint64_t const* d_bucket_offsets, + int num_match_target_bits, + int fine_bits, + uint32_t num_buckets, + uint64_t* d_fine_offsets, + sycl::queue& q); + +// Fused T2 match. table_id=2, no strength scaling on AES rounds. Emits +// (meta, match_info, x_bits) triples via an atomic cursor; x_bits packs +// the upper-half-k bits of meta_l and meta_r per Table2Constructor. +// +// bucket_begin / bucket_end select which bucket-id range to process +// (inclusive / exclusive). Passing (0, num_buckets) preserves the +// original full-pass behavior. Smaller ranges let callers split T2 +// match into temporally-separated passes so downstream memory does +// not need to hold the full T2 output at once (see +// docs/t2-match-tiling-plan.md). +// +// Across all passes that share the same d_out_{meta,mi,xbits} + +// d_out_count, results append starting at the current value of +// d_out_count (atomic). Callers that want pass-disjoint output should +// sum counts themselves; callers that want the concatenation as a +// single array should simply leave d_out_count and the buffers untouched +// between passes. 
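[Editor's note] A caller that wants the pass-disjoint counts mentioned above can snapshot the atomic cursor between passes. Host variable names below are illustrative, the elided range calls follow the two-pass pattern sketched earlier, and q.memcpy / event::wait are standard SYCL queue operations:

    uint64_t after_pass1 = 0, after_pass2 = 0;
    // ... pass 1: launch_t2_match_range(..., /*bucket_begin=*/0,   /*bucket_end=*/mid, q);
    q.memcpy(&after_pass1, d_out_count, sizeof(uint64_t)).wait();
    // ... pass 2: launch_t2_match_range(..., /*bucket_begin=*/mid, /*bucket_end=*/num_buckets, q);
    q.memcpy(&after_pass2, d_out_count, sizeof(uint64_t)).wait();
    uint64_t emitted_pass1 = after_pass1;
    uint64_t emitted_pass2 = after_pass2 - after_pass1;   // pass 2 appends on top of pass 1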
+void launch_t2_match_all_buckets( + AesHashKeys keys, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + int num_match_info_bits, + int half_k, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/T2OffsetsSycl.cpp b/src/gpu/T2OffsetsSycl.cpp new file mode 100644 index 0000000..2887b5c --- /dev/null +++ b/src/gpu/T2OffsetsSycl.cpp @@ -0,0 +1,231 @@ +// T2OffsetsSycl.cpp — SYCL implementation of T2's three backend-dispatched +// kernels. Pattern mirrors T1OffsetsSycl.cpp; reuses the shared SYCL +// queue + AES-table USM buffer from SyclBackend.hpp. + +#include "gpu/SyclBackend.hpp" +#include "gpu/T2Offsets.cuh" + +#include + +namespace pos2gpu { + +void launch_t2_compute_bucket_offsets( + uint32_t const* d_sorted_mi, + uint64_t total, + int num_match_target_bits, + uint32_t num_buckets, + uint64_t* d_offsets, + sycl::queue& q) +{ + constexpr size_t threads = 256; + size_t const out_count = static_cast(num_buckets) + 1; + size_t const groups = (out_count + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t b = static_cast(it.get_global_id(0)); + if (b > num_buckets) return; + if (b == num_buckets) { d_offsets[num_buckets] = total; return; } + + uint32_t bucket_shift = static_cast(num_match_target_bits); + uint64_t lo = 0, hi = total; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t v = d_sorted_mi[mid] >> bucket_shift; + if (v < b) lo = mid + 1; + else hi = mid; + } + d_offsets[b] = lo; + }).wait(); +} + +void launch_t2_compute_fine_bucket_offsets( + uint32_t const* d_sorted_mi, + uint64_t const* d_bucket_offsets, + int num_match_target_bits, + int fine_bits, + uint32_t num_buckets, + uint64_t* d_fine_offsets, + sycl::queue& q) +{ + constexpr size_t threads = 256; + uint32_t const fine_count = 1u << fine_bits; + uint32_t const total = num_buckets * fine_count; + size_t const groups = (total + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t tid = static_cast(it.get_global_id(0)); + if (tid >= total) return; + + uint32_t r_bucket = tid / fine_count; + uint32_t fine_key = tid % fine_count; + + uint64_t r_start = d_bucket_offsets[r_bucket]; + uint64_t r_end = d_bucket_offsets[r_bucket + 1]; + + uint32_t target_mask = (num_match_target_bits >= 32) + ? 
0xFFFFFFFFu + : ((1u << num_match_target_bits) - 1u); + uint32_t shift = static_cast(num_match_target_bits - fine_bits); + + uint64_t lo = r_start, hi = r_end; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t t = (d_sorted_mi[mid] & target_mask) >> shift; + if (t < fine_key) lo = mid + 1; + else hi = mid; + } + d_fine_offsets[tid] = lo; + + if (tid == total - 1) { + d_fine_offsets[total] = d_bucket_offsets[num_buckets]; + } + }).wait(); +} + +void launch_t2_match_all_buckets( + AesHashKeys keys, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + int num_match_info_bits, + int half_k, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)num_buckets; // only the [begin, end) sub-range is iterated + if (bucket_end <= bucket_begin) return; + uint32_t const num_buckets_in_range = bucket_end - bucket_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + uint64_t blocks_x_u64 = (l_count_max + threads - 1) / threads; + size_t const blocks_x = static_cast(blocks_x_u64); + + auto* d_out_count_ull = + reinterpret_cast(d_out_count); + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<2>{ + sycl::range<2>{ static_cast(num_buckets_in_range), + blocks_x * threads }, + sycl::range<2>{ 1, threads } + }, + [=, keys_copy = keys](sycl::nd_item<2> it) { + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(1); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint32_t bucket_id = bucket_begin + static_cast(it.get_group(0)); + uint32_t section_l = bucket_id / num_match_keys; + uint32_t match_key_r = bucket_id % num_match_keys; + + uint32_t section_r; + { + uint32_t mask = (1u << num_section_bits) - 1u; + uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; + uint32_t rl1 = (rl + 1) & mask; + section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; + } + + uint64_t l_start = d_offsets[section_l * num_match_keys]; + uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; + uint32_t r_bucket = section_r * num_match_keys + match_key_r; + + uint64_t l = l_start + + it.get_group(1) * uint64_t(threads) + + local_id; + if (l >= l_end) return; + + uint64_t meta_l = d_sorted_meta[l]; + + uint32_t target_l = pos2gpu::matching_target_smem( + keys_copy, 2u, match_key_r, meta_l, sT, 0) + & target_mask; + + uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); + uint32_t fine_key = target_l >> fine_shift; + uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; + uint64_t lo = d_fine_offsets[fine_idx]; + uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; + uint64_t hi = fine_hi; + + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t target_mid = d_sorted_mi[mid] & target_mask; + if (target_mid < target_l) lo = mid + 1; + else hi = mid; + } + + uint32_t test_mask = (num_test_bits >= 32) ? 
0xFFFFFFFFu + : ((1u << num_test_bits) - 1u); + uint32_t info_mask = (num_match_info_bits >= 32) ? 0xFFFFFFFFu + : ((1u << num_match_info_bits) - 1u); + int meta_bits = 2 * k; + + for (uint64_t r = lo; r < fine_hi; ++r) { + uint32_t target_r = d_sorted_mi[r] & target_mask; + if (target_r != target_l) break; + + uint64_t meta_r = d_sorted_meta[r]; + + pos2gpu::Result128 res = pos2gpu::pairing_smem( + keys_copy, meta_l, meta_r, sT, 0); + + uint32_t test_result = res.r[3] & test_mask; + if (test_result != 0) continue; + + uint32_t match_info_result = res.r[0] & info_mask; + uint64_t meta_result_full = uint64_t(res.r[1]) | (uint64_t(res.r[2]) << 32); + uint64_t meta_result = (meta_bits == 64) + ? meta_result_full + : (meta_result_full & ((1ULL << meta_bits) - 1ULL)); + + uint32_t x_bits_l = static_cast((meta_l >> k) >> half_k); + uint32_t x_bits_r = static_cast((meta_r >> k) >> half_k); + uint32_t x_bits = (x_bits_l << half_k) | x_bits_r; + + sycl::atomic_ref + out_count_atomic{ *d_out_count_ull }; + unsigned long long out_idx = out_count_atomic.fetch_add(1ULL); + if (out_idx >= out_capacity) return; + + d_out_meta [out_idx] = meta_result; + d_out_mi [out_idx] = match_info_result; + d_out_xbits[out_idx] = x_bits; + } + }); + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/gpu/T3Kernel.cpp b/src/gpu/T3Kernel.cpp new file mode 100644 index 0000000..a89db1a --- /dev/null +++ b/src/gpu/T3Kernel.cpp @@ -0,0 +1,268 @@ +// T3Kernel.cu — port of pos2-chip Table3Constructor. +// +// Differences from T2: +// - Input is T2Pairing { meta(64), match_info(32), x_bits(32) }. +// - matching_target uses table_id=3 and meta=T2Pairing.meta (no extra rounds). +// - pairing_t3 only consumes test_result; no match_info / meta extraction +// from the AES output. AES rounds = AES_PAIRING_ROUNDS (16), no strength +// bonus. +// - Emit T3Pairing { proof_fragment = FeistelCipher.encrypt(all_x_bits) } +// where all_x_bits = (l.x_bits << k) | r.x_bits. + +#include "gpu/AesGpu.cuh" +#include "gpu/AesHashGpu.cuh" +#include "gpu/FeistelCipherGpu.cuh" +#include "gpu/T2Offsets.cuh" +#include "gpu/T3Kernel.cuh" +#include "gpu/T3Offsets.cuh" +#include "host/PoolSizing.hpp" + +#include +#include + +namespace pos2gpu { + +// The CUDA __constant__ FeistelKey + its setup have moved to +// T3OffsetsCuda.cu, scoped to the wrapper that uses them. The SYCL +// path captures FeistelKey by value in the lambda instead. + +T3MatchParams make_t3_params(int k, int strength) +{ + T3MatchParams p{}; + p.k = k; + p.strength = strength; + p.num_section_bits = (k < 28) ? 2 : (k - 26); + p.num_match_key_bits = strength; + p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; + return p; +} + +// T3's three kernels (compute_bucket_offsets, compute_fine_bucket_offsets, +// match_all_buckets) have moved to the cross-backend path. The two offset +// kernels are bit-identical to T2's and reuse T2Offsets.cuh's wrappers; the +// match kernel — Feistel-encrypted output — has its own wrapper in +// T3Offsets.cuh. The previously-unused matching_section helper went with +// them. 
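[Editor's note] The section-rotation pairing that every match kernel inlines (and that the removed matching_section helper expressed) is easiest to sanity-check on the CPU. The reference below is a direct transcription of the in-kernel arithmetic; only the _ref suffix is mine, and the num_section_bits = 2 table was worked by hand.

    #include <cstdint>

    // Which R-section does L-section `section` pair with?  Rotate left by one,
    // add one, rotate right by one, all modulo 2^num_section_bits.
    inline uint32_t matching_section_ref(uint32_t section, int num_section_bits)
    {
        uint32_t mask = (1u << num_section_bits) - 1u;
        uint32_t rl   = ((section << 1) | (section >> (num_section_bits - 1))) & mask;
        uint32_t rl1  = (rl + 1) & mask;
        return ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask;
    }

    // For num_section_bits = 2 (k = 28 per make_t3_params) this maps
    //   0 → 2,  1 → 3,  2 → 1,  3 → 0
    // i.e. every section pairs with exactly one other section and never with itself.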
+ + +namespace { + +constexpr int kT3FineBits = 8; + +struct T3Derived { + uint32_t num_sections; + uint32_t num_match_keys; + uint32_t num_buckets; + uint64_t fine_entries; + size_t bucket_bytes; + size_t fine_bytes; + size_t temp_needed; + uint32_t target_mask; + int num_test_bits; + uint64_t l_count_max; +}; + +T3Derived derive_t3(T3MatchParams const& params) +{ + T3Derived d{}; + d.num_sections = 1u << params.num_section_bits; + d.num_match_keys = 1u << params.num_match_key_bits; + d.num_buckets = d.num_sections * d.num_match_keys; + uint64_t const fine_count = 1ull << kT3FineBits; + d.fine_entries = uint64_t(d.num_buckets) * fine_count + 1; + d.bucket_bytes = sizeof(uint64_t) * (d.num_buckets + 1); + d.fine_bytes = sizeof(uint64_t) * d.fine_entries; + d.temp_needed = d.bucket_bytes + d.fine_bytes; + d.target_mask = (params.num_match_target_bits >= 32) + ? 0xFFFFFFFFu + : ((1u << params.num_match_target_bits) - 1u); + d.num_test_bits = params.num_match_key_bits; + d.l_count_max = + static_cast(max_pairs_per_section(params.k, params.num_section_bits)); + return d; +} + +} // namespace + +void launch_t3_match_prepare( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + if (!plot_id_bytes || !temp_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + + T3Derived const d = derive_t3(params); + + if (d_temp_storage == nullptr) { + *temp_bytes = d.temp_needed; + return; + } + if (*temp_bytes < d.temp_needed) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_mi || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.num_match_target_bits <= kT3FineBits) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto* d_offsets = reinterpret_cast(d_temp_storage); + auto* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + // T3 reuses T2's offset wrappers (identical layout + algorithm). 
+ launch_t2_compute_bucket_offsets( + d_sorted_mi, t2_count, + params.num_match_target_bits, + d.num_buckets, d_offsets, q); + launch_t2_compute_fine_bucket_offsets( + d_sorted_mi, d_offsets, + params.num_match_target_bits, kT3FineBits, + d.num_buckets, d_fine_offsets, q); + q.memset(d_out_count, 0, sizeof(uint64_t)).wait(); +} + +void launch_t3_match_range( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)t2_count; + if (!plot_id_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_temp_storage) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_meta || !d_sorted_xbits || !d_sorted_mi + || !d_out_pairings || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + + T3Derived const d = derive_t3(params); + + if (bucket_end > d.num_buckets) throw std::invalid_argument("invalid argument to launch wrapper"); + if (bucket_end <= bucket_begin) return; + + constexpr int kThreads = 256; + uint64_t const blocks_x_u64 = (d.l_count_max + kThreads - 1) / kThreads; + if (blocks_x_u64 > UINT_MAX) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto const* d_offsets = reinterpret_cast(d_temp_storage); + auto const* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + AesHashKeys keys = make_keys(plot_id_bytes); + FeistelKey fk = make_feistel_key(plot_id_bytes, params.k, /*rounds=*/4); + + launch_t3_match_all_buckets( + keys, fk, + d_sorted_meta, d_sorted_xbits, d_sorted_mi, + const_cast(d_offsets), + const_cast(d_fine_offsets), + d.num_match_keys, d.num_buckets, + params.k, params.num_section_bits, + params.num_match_target_bits, kT3FineBits, + d.target_mask, d.num_test_bits, + d_out_pairings, d_out_count, + capacity, d.l_count_max, + bucket_begin, bucket_end, + q); +} + +void launch_t3_match_section_pair_range( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_meta_l_slice, + uint64_t section_l_row_start, + uint64_t const* d_meta_r_slice, + uint64_t section_r_row_start, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)t2_count; + if (!plot_id_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_temp_storage) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_meta_l_slice || !d_meta_r_slice + || !d_sorted_xbits || !d_sorted_mi + || !d_out_pairings || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + + T3Derived const d = derive_t3(params); + + if (bucket_end > d.num_buckets) throw std::invalid_argument("invalid argument to launch wrapper"); + if 
(bucket_end <= bucket_begin) return; + + constexpr int kThreads = 256; + uint64_t const blocks_x_u64 = (d.l_count_max + kThreads - 1) / kThreads; + if (blocks_x_u64 > UINT_MAX) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto const* d_offsets = reinterpret_cast(d_temp_storage); + auto const* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + AesHashKeys keys = make_keys(plot_id_bytes); + FeistelKey fk = make_feistel_key(plot_id_bytes, params.k, /*rounds=*/4); + + launch_t3_match_section_pair( + keys, fk, + d_meta_l_slice, section_l_row_start, + d_meta_r_slice, section_r_row_start, + d_sorted_xbits, d_sorted_mi, + const_cast(d_offsets), + const_cast(d_fine_offsets), + d.num_match_keys, d.num_buckets, + params.k, params.num_section_bits, + params.num_match_target_bits, kT3FineBits, + d.target_mask, d.num_test_bits, + d_out_pairings, d_out_count, + capacity, d.l_count_max, + bucket_begin, bucket_end, + q); +} + +void launch_t3_match( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + // Single-shot wrapper: prepare + one full-range match. Preserves the + // original API for pool path, test mode, and parity-test callers. + launch_t3_match_prepare( + plot_id_bytes, params, d_sorted_mi, t2_count, + d_out_count, d_temp_storage, temp_bytes, q); + if (d_temp_storage == nullptr) return; // size-query path + + T3Derived const d = derive_t3(params); + launch_t3_match_range( + plot_id_bytes, params, + d_sorted_meta, d_sorted_xbits, d_sorted_mi, t2_count, + d_out_pairings, d_out_count, + capacity, d_temp_storage, + /*bucket_begin=*/0, /*bucket_end=*/d.num_buckets, q); +} + +} // namespace pos2gpu diff --git a/src/gpu/T3Kernel.cu b/src/gpu/T3Kernel.cu deleted file mode 100644 index 6e91ba5..0000000 --- a/src/gpu/T3Kernel.cu +++ /dev/null @@ -1,337 +0,0 @@ -// T3Kernel.cu — port of pos2-chip Table3Constructor. -// -// Differences from T2: -// - Input is T2Pairing { meta(64), match_info(32), x_bits(32) }. -// - matching_target uses table_id=3 and meta=T2Pairing.meta (no extra rounds). -// - pairing_t3 only consumes test_result; no match_info / meta extraction -// from the AES output. AES rounds = AES_PAIRING_ROUNDS (16), no strength -// bonus. -// - Emit T3Pairing { proof_fragment = FeistelCipher.encrypt(all_x_bits) } -// where all_x_bits = (l.x_bits << k) | r.x_bits. - -#include "gpu/AesGpu.cuh" -#include "gpu/AesHashGpu.cuh" -#include "gpu/FeistelCipherGpu.cuh" -#include "gpu/T3Kernel.cuh" - -#include -#include -#include -#include - -namespace pos2gpu { - -// FeistelKey is 40 bytes (32-byte plot_id + 2 ints). Passed by value as -// a kernel arg, the compiler spilled it to local memory (STACK:40), so -// `fk.plot_id[i]` accesses inside feistel_encrypt became scattered LMEM -// LDGs — brutal for an L1-bound kernel. Stashing it in __constant__ -// memory makes those loads broadcast-cached across the warp instead. -__constant__ FeistelKey g_t3_fk; - -T3MatchParams make_t3_params(int k, int strength) -{ - T3MatchParams p{}; - p.k = k; - p.strength = strength; - p.num_section_bits = (k < 28) ? 
2 : (k - 26); - p.num_match_key_bits = strength; - p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; - return p; -} - -namespace { - -__host__ __device__ inline uint32_t matching_section(uint32_t section, int num_section_bits) -{ - uint32_t num_sections = 1u << num_section_bits; - uint32_t mask = num_sections - 1u; - uint32_t rotated_left = ((section << 1) | (section >> (num_section_bits - 1))) & mask; - uint32_t rotated_left_plus_1 = (rotated_left + 1) & mask; - uint32_t section_new = ((rotated_left_plus_1 >> 1) - | (rotated_left_plus_1 << (num_section_bits - 1))) & mask; - return section_new; -} - -__global__ void compute_bucket_offsets( - uint32_t const* __restrict__ sorted_mi, - uint64_t total, - int num_match_target_bits, - uint32_t num_buckets, - uint64_t* __restrict__ offsets) -{ - if (threadIdx.x != 0 || blockIdx.x != 0) return; - uint32_t bucket_shift = static_cast(num_match_target_bits); - - uint64_t pos = 0; - for (uint32_t b = 0; b < num_buckets; ++b) { - uint64_t lo = pos, hi = total; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t bucket_mid = sorted_mi[mid] >> bucket_shift; - if (bucket_mid < b) lo = mid + 1; - else hi = mid; - } - offsets[b] = lo; - pos = lo; - } - offsets[num_buckets] = total; -} - -// Compute fine-grained bucket offsets: one offset per (r_bucket, -// top-FINE_BITS-of-target) pair. Lets the match kernel replace a -// ~24-iteration bsearch on sorted_mi with a 2-LDG lookup + an ~16- -// iteration bsearch in a 256× narrower window. Each thread writes -// one fine_offsets entry via an in-range bsearch over sorted_mi -// restricted to its parent bucket. -__global__ void compute_fine_bucket_offsets( - uint32_t const* __restrict__ sorted_mi, - uint64_t const* __restrict__ bucket_offsets, - int num_match_target_bits, - int fine_bits, - uint32_t num_buckets, - uint64_t* __restrict__ fine_offsets) -{ - uint32_t const fine_count = 1u << fine_bits; - uint32_t const total = num_buckets * fine_count; - uint32_t const tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total) return; - - uint32_t const r_bucket = tid / fine_count; - uint32_t const fine_key = tid % fine_count; - - uint64_t const r_start = bucket_offsets[r_bucket]; - uint64_t const r_end = bucket_offsets[r_bucket + 1]; - - uint32_t const target_mask = (num_match_target_bits >= 32) - ? 0xFFFFFFFFu - : ((1u << num_match_target_bits) - 1u); - uint32_t const shift = static_cast(num_match_target_bits - fine_bits); - - uint64_t lo = r_start, hi = r_end; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t t = (sorted_mi[mid] & target_mask) >> shift; - if (t < fine_key) lo = mid + 1; - else hi = mid; - } - fine_offsets[tid] = lo; - - // Last thread writes the sentinel (overall end = sorted_mi length). 
- if (tid == total - 1) { - fine_offsets[total] = bucket_offsets[num_buckets]; - } -} - -__global__ __launch_bounds__(256, 4) void match_all_buckets( - AesHashKeys keys, - uint64_t const* __restrict__ sorted_meta, - uint32_t const* __restrict__ sorted_xbits, - uint32_t const* __restrict__ sorted_mi, - uint64_t const* __restrict__ d_offsets, - uint64_t const* __restrict__ d_fine_offsets, - uint32_t num_match_keys, - int k, - int num_section_bits, - int num_match_target_bits, - int fine_bits, - uint32_t target_mask, - int num_test_bits, - T3PairingGpu* __restrict__ out, - unsigned long long* __restrict__ out_count, - uint64_t out_capacity) -{ - __shared__ uint32_t sT[4 * 256]; - load_aes_tables_smem(sT); - __syncthreads(); - - uint32_t bucket_id = blockIdx.y; - uint32_t section_l = bucket_id / num_match_keys; - uint32_t match_key_r = bucket_id % num_match_keys; - - uint32_t section_r; - { - uint32_t mask = (1u << num_section_bits) - 1u; - uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; - uint32_t rl1 = (rl + 1) & mask; - section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; - } - - uint64_t l_start = d_offsets[section_l * num_match_keys]; - uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; - uint32_t r_bucket = section_r * num_match_keys + match_key_r; - - uint64_t l = l_start + blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (l >= l_end) return; - - uint64_t meta_l = sorted_meta[l]; - uint32_t xb_l = sorted_xbits[l]; - - uint32_t target_l = matching_target_smem(keys, 3u, match_key_r, meta_l, sT, 0) - & target_mask; - - // Fine-bucket pre-index: narrows the bsearch range by 2^fine_bits - // using a precomputed offset table indexed by (r_bucket, top - // fine_bits of target_l). Two cached LDGs replace the outer d_offsets - // r_start/r_end and shrink the bsearch window 256× at fine_bits=8. - uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); - uint32_t fine_key = target_l >> fine_shift; - uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; - uint64_t lo = d_fine_offsets[fine_idx]; - uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; - uint64_t hi = fine_hi; - - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t target_mid = sorted_mi[mid] & target_mask; - if (target_mid < target_l) lo = mid + 1; - else hi = mid; - } - - uint32_t test_mask = (num_test_bits >= 32) ? 
0xFFFFFFFFu - : ((1u << num_test_bits) - 1u); - - for (uint64_t r = lo; r < fine_hi; ++r) { - uint32_t target_r = sorted_mi[r] & target_mask; - if (target_r != target_l) break; - - uint64_t meta_r = sorted_meta[r]; - uint32_t xb_r = sorted_xbits[r]; - - Result128 res = pairing_smem(keys, meta_l, meta_r, sT, 0); - uint32_t test_result = res.r[3] & test_mask; - if (test_result != 0) continue; - - uint64_t all_x_bits = (uint64_t(xb_l) << k) | uint64_t(xb_r); - uint64_t fragment = feistel_encrypt(g_t3_fk, all_x_bits); - - unsigned long long out_idx = atomicAdd(out_count, 1ULL); - if (out_idx >= out_capacity) return; - - T3PairingGpu p; - p.proof_fragment = fragment; - out[out_idx] = p; - } -} - -} // namespace - -cudaError_t launch_t3_match( - uint8_t const* plot_id_bytes, - T3MatchParams const& params, - uint64_t const* d_sorted_meta, - uint32_t const* d_sorted_xbits, - uint32_t const* d_sorted_mi, - uint64_t t2_count, - T3PairingGpu* d_out_pairings, - uint64_t* d_out_count, - uint64_t capacity, - void* d_temp_storage, - size_t* temp_bytes, - cudaStream_t stream) -{ - if (!plot_id_bytes || !temp_bytes) return cudaErrorInvalidValue; - if (params.k < 18 || params.k > 32) return cudaErrorInvalidValue; - if (params.strength < 2) return cudaErrorInvalidValue; - - uint32_t num_sections = 1u << params.num_section_bits; - uint32_t num_match_keys = 1u << params.num_match_key_bits; - uint32_t num_buckets = num_sections * num_match_keys; - - // Fine-bucket pre-index: 2^FINE_BITS slots per bucket shrinks the - // match-kernel bsearch window by the same factor. Requires at least - // FINE_BITS+1 bits of target range; num_match_target_bits is - // k - section_bits - match_key_bits = 14..30 across the supported - // (k, strength) matrix, so 8 fine bits always leaves ≥6 for bsearch. - constexpr int FINE_BITS = 8; - uint64_t const fine_count = 1ull << FINE_BITS; - uint64_t const fine_entries = uint64_t(num_buckets) * fine_count + 1; - - size_t const bucket_bytes = sizeof(uint64_t) * (num_buckets + 1); - size_t const fine_bytes = sizeof(uint64_t) * fine_entries; - size_t const needed = bucket_bytes + fine_bytes; - - if (d_temp_storage == nullptr) { - *temp_bytes = needed; - return cudaSuccess; - } - if (*temp_bytes < needed) return cudaErrorInvalidValue; - if (!d_sorted_meta || !d_sorted_xbits || !d_sorted_mi - || !d_out_pairings || !d_out_count) return cudaErrorInvalidValue; - if (params.num_match_target_bits <= FINE_BITS) { - // Fall-back would be needed here; not expected for supported - // (k, strength) combinations, so fail loudly if we ever trip it. - return cudaErrorInvalidValue; - } - - auto* d_offsets = reinterpret_cast(d_temp_storage); - auto* d_fine_offsets = d_offsets + (num_buckets + 1); - - AesHashKeys keys = make_keys(plot_id_bytes); - FeistelKey fk = make_feistel_key(plot_id_bytes, params.k, /*rounds=*/4); - cudaError_t fk_err = cudaMemcpyToSymbolAsync(g_t3_fk, &fk, sizeof(fk), - 0, cudaMemcpyHostToDevice, stream); - if (fk_err != cudaSuccess) return fk_err; - - compute_bucket_offsets<<<1, 1, 0, stream>>>( - d_sorted_mi, t2_count, - params.num_match_target_bits, - num_buckets, - d_offsets); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - // One thread per (r_bucket, fine_key). At T3 k=28 strength=2: - // 16 × 256 = 4096 threads = 16 blocks × 256. 
- uint32_t fine_threads_total = num_buckets * uint32_t(fine_count); - unsigned fine_blocks = (fine_threads_total + 255) / 256; - compute_fine_bucket_offsets<<>>( - d_sorted_mi, d_offsets, - params.num_match_target_bits, FINE_BITS, - num_buckets, d_fine_offsets); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - err = cudaMemsetAsync(d_out_count, 0, sizeof(uint64_t), stream); - if (err != cudaSuccess) return err; - - std::vector h_offsets(num_buckets + 1); - err = cudaMemcpyAsync(h_offsets.data(), d_offsets, - sizeof(uint64_t) * (num_buckets + 1), - cudaMemcpyDeviceToHost, stream); - if (err != cudaSuccess) return err; - err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) return err; - - uint64_t l_count_max = 0; - for (uint32_t s = 0; s < num_sections; ++s) { - uint64_t l_count = h_offsets[(s + 1) * num_match_keys] - - h_offsets[s * num_match_keys]; - if (l_count > l_count_max) l_count_max = l_count; - } - - uint32_t target_mask = (params.num_match_target_bits >= 32) - ? 0xFFFFFFFFu - : ((1u << params.num_match_target_bits) - 1u); - int num_test_bits = params.num_match_key_bits; - - constexpr int kThreads = 256; - uint64_t blocks_x_u64 = (l_count_max + kThreads - 1) / kThreads; - if (blocks_x_u64 > UINT_MAX) return cudaErrorInvalidValue; - dim3 grid(static_cast(blocks_x_u64), num_buckets, 1); - - match_all_buckets<<>>( - keys, d_sorted_meta, d_sorted_xbits, d_sorted_mi, - d_offsets, d_fine_offsets, - num_match_keys, - params.k, params.num_section_bits, - params.num_match_target_bits, FINE_BITS, - target_mask, num_test_bits, - d_out_pairings, - reinterpret_cast(d_out_count), - capacity); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - return cudaSuccess; -} - -} // namespace pos2gpu diff --git a/src/gpu/T3Kernel.cuh b/src/gpu/T3Kernel.cuh index 46295b9..2711d06 100644 --- a/src/gpu/T3Kernel.cuh +++ b/src/gpu/T3Kernel.cuh @@ -10,7 +10,8 @@ #include "gpu/AesHashGpu.cuh" #include "gpu/T2Kernel.cuh" -#include +#include "gpu/CudaHalfShim.hpp" +#include #include #include @@ -35,7 +36,7 @@ T3MatchParams make_t3_params(int k, int strength); // sorted_t2 input is SoA-split: d_sorted_meta[i] is T2Pairing.meta and // d_sorted_xbits[i] is T2Pairing.x_bits after the T2 sort. match_info is // carried in the parallel d_sorted_mi stream. -cudaError_t launch_t3_match( +void launch_t3_match( uint8_t const* plot_id_bytes, T3MatchParams const& params, uint64_t const* d_sorted_meta, // cap entries, uint64 meta @@ -47,6 +48,72 @@ cudaError_t launch_t3_match( uint64_t capacity, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream = nullptr); + sycl::queue& q); + +// Two-step entry point for callers that want to run T3 match in multiple +// bucket-range passes (stage 4d — parallel to the T2 prepare/range split). +// Equivalent to calling launch_t3_match with (0, num_buckets) when the +// range covers the whole bucket space. +// +// launch_t3_match_prepare: computes bucket + fine-bucket offsets into +// d_temp_storage (reusing T2's wrappers, which T3's input is +// bit-identical to) and zeroes d_out_count. Same sizing protocol as +// launch_t3_match (d_temp_storage==nullptr fills *temp_bytes). +// +// launch_t3_match_range: runs the match kernel for bucket range +// [bucket_begin, bucket_end). Multiple calls sharing d_temp_storage / +// d_out_pairings / d_out_count produce a concatenated output via +// atomic append, byte-equivalent to a single full-range call after +// the subsequent T3 sort. 
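+//
+// Illustrative two-pass call sequence (sketch only — d_scratch / d_pairs /
+// d_count are hypothetical caller-side allocations, and num_buckets is the
+// same num_sections × num_match_keys product the single-shot wrapper
+// derives internally):
+//
+//   size_t temp_bytes = 0;
+//   launch_t3_match_prepare(plot_id, params, d_mi, t2_count,
+//                           d_count, /*d_temp_storage=*/nullptr, &temp_bytes, q);
+//   // ... allocate d_scratch of temp_bytes, then run prepare for real ...
+//   launch_t3_match_prepare(plot_id, params, d_mi, t2_count,
+//                           d_count, d_scratch, &temp_bytes, q);
+//   uint32_t const half = num_buckets / 2;
+//   launch_t3_match_range(plot_id, params, d_meta, d_xbits, d_mi, t2_count,
+//                         d_pairs, d_count, capacity, d_scratch, 0, half, q);
+//   launch_t3_match_range(plot_id, params, d_meta, d_xbits, d_mi, t2_count,
+//                         d_pairs, d_count, capacity, d_scratch, half, num_buckets, q);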
+void launch_t3_match_prepare( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q); + +void launch_t3_match_range( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +// Sliced-meta variant of launch_t3_match_range (minimal tier). Caller +// must ensure that all bucket ids in [bucket_begin, bucket_end) share +// the same section_l so that l reads always fall within section_l's +// row range and r reads always fall within section_r's row range. The +// caller pre-computes the row starts for each section (from the +// d_offsets table sitting in d_temp_storage) and H2Ds the relevant +// section slices of d_sorted_meta into d_meta_l_slice / d_meta_r_slice. +// d_sorted_xbits and d_sorted_mi are still full-cap on device. +void launch_t3_match_section_pair_range( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_meta_l_slice, + uint64_t section_l_row_start, + uint64_t const* d_meta_r_slice, + uint64_t section_r_row_start, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); } // namespace pos2gpu diff --git a/src/gpu/T3Offsets.cuh b/src/gpu/T3Offsets.cuh new file mode 100644 index 0000000..3c6b594 --- /dev/null +++ b/src/gpu/T3Offsets.cuh @@ -0,0 +1,98 @@ +// T3Offsets.cuh — backend-dispatched wrapper for T3's match kernel. +// +// T3 reuses T2's bucket / fine-bucket offset wrappers (the input is the +// same uint32_t* sorted_mi stream and the algorithm is identical), so +// only the match kernel — which differs in the Feistel-encrypted output +// — is declared here. + +#pragma once + +#include "gpu/AesHashGpu.cuh" +#include "gpu/FeistelCipherGpu.cuh" +#include "gpu/T3Kernel.cuh" // T3PairingGpu + +#include + +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +// Fused T3 match. table_id=3, no strength scaling. For each surviving +// (l, r) pair, emits T3PairingGpu{ proof_fragment = feistel_encrypt( +// (xb_l << k) | xb_r) } via an atomic cursor. +// +// bucket_begin / bucket_end select which bucket-id range to process +// (inclusive / exclusive). Passing (0, num_buckets) preserves the +// original full-pass behavior. Smaller ranges let callers split T3 +// match into temporally-separated passes so downstream memory does +// not need to hold the full T3 output at once — parallel to the T2 +// match bucket-range plumbing in T2Offsets.cuh. +// +// Across all passes sharing the same d_out_pairings / d_out_count, +// results append via the atomic counter in the kernel. 
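+//
+// Orientation note (sketch): bucket ids pack (section_l, match_key_r) as
+//   bucket_id = section_l * num_match_keys + match_key_r
+// so a caller that wants one pass per l-section — the shape the sliced
+// variant below requires — would loop roughly like:
+//
+//   for (uint32_t s = 0; s < num_sections; ++s)
+//       launch_t3_match_all_buckets(/*...unchanged args...*/,
+//                                   /*bucket_begin=*/s * num_match_keys,
+//                                   /*bucket_end=*/(s + 1) * num_match_keys, q);
+//
+// where num_sections / num_match_keys are the caller-side
+// 1u << num_section_bits and 1u << num_match_key_bits values.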
+void launch_t3_match_all_buckets( + AesHashKeys keys, + FeistelKey fk, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +// Sliced variant: same algorithm as launch_t3_match_all_buckets but with +// d_sorted_meta accessed via two per-section slices instead of a full +// cap-sized device buffer. The kernel reads: +// meta_l = d_meta_l_slice[l - section_l_row_start] +// meta_r = d_meta_r_slice[r - section_r_row_start] +// Caller MUST ensure that all bucket ids in [bucket_begin, bucket_end) +// share the same section_l (i.e., the range is contained in +// [section_l*num_match_keys, (section_l+1)*num_match_keys)) so that +// every l read falls in section_l's row range and every r read falls in +// the (uniquely-determined) section_r's row range. d_sorted_xbits and +// d_sorted_mi remain full-cap on device (no slicing). Used by minimal +// tier to keep d_t2_meta_sorted parked on host pinned across T3 match; +// drops T3 match peak from ~5200 MB to ~3380 MB at k=28. +void launch_t3_match_section_pair( + AesHashKeys keys, + FeistelKey fk, + uint64_t const* d_meta_l_slice, + uint64_t section_l_row_start, + uint64_t const* d_meta_r_slice, + uint64_t section_r_row_start, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/T3OffsetsSycl.cpp b/src/gpu/T3OffsetsSycl.cpp new file mode 100644 index 0000000..ab764e8 --- /dev/null +++ b/src/gpu/T3OffsetsSycl.cpp @@ -0,0 +1,282 @@ +// T3OffsetsSycl.cpp — SYCL implementation of T3's match kernel. Mirrors +// the CUDA path; FeistelKey (40 B) is captured by value in the parallel_for +// lambda instead of going through CUDA constant memory. AdaptiveCpp's +// SSCP backend handles the capture via the kernel-arg mechanism, which is +// fine at this size — if local-memory spills ever bite, switch to a USM +// upload analogous to the CUDA cudaMemcpyToSymbolAsync path. 
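+//
+// For reference, the USM route mentioned above would look roughly like the
+// sketch below (d_fk is a hypothetical device-side copy, not something this
+// file currently allocates):
+//
+//   FeistelKey* d_fk = sycl::malloc_device<FeistelKey>(1, q);
+//   q.memcpy(d_fk, &fk, sizeof(FeistelKey)).wait();
+//   ... kernels dereference d_fk instead of capturing fk by value ...
+//   sycl::free(d_fk, q);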
+ +#include "gpu/SyclBackend.hpp" +#include "gpu/T3Offsets.cuh" + +#include + +namespace pos2gpu { + +void launch_t3_match_all_buckets( + AesHashKeys keys, + FeistelKey fk, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)num_buckets; // only the [begin, end) sub-range is iterated + if (bucket_end <= bucket_begin) return; + uint32_t const num_buckets_in_range = bucket_end - bucket_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + uint64_t blocks_x_u64 = (l_count_max + threads - 1) / threads; + size_t const blocks_x = static_cast(blocks_x_u64); + + auto* d_out_count_ull = + reinterpret_cast(d_out_count); + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<2>{ + sycl::range<2>{ static_cast(num_buckets_in_range), + blocks_x * threads }, + sycl::range<2>{ 1, threads } + }, + [=, keys_copy = keys, fk_copy = fk](sycl::nd_item<2> it) { + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(1); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint32_t bucket_id = bucket_begin + static_cast(it.get_group(0)); + uint32_t section_l = bucket_id / num_match_keys; + uint32_t match_key_r = bucket_id % num_match_keys; + + uint32_t section_r; + { + uint32_t mask = (1u << num_section_bits) - 1u; + uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; + uint32_t rl1 = (rl + 1) & mask; + section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; + } + + uint64_t l_start = d_offsets[section_l * num_match_keys]; + uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; + uint32_t r_bucket = section_r * num_match_keys + match_key_r; + + uint64_t l = l_start + + it.get_group(1) * uint64_t(threads) + + local_id; + if (l >= l_end) return; + + uint64_t meta_l = d_sorted_meta[l]; + uint32_t xb_l = d_sorted_xbits[l]; + + uint32_t target_l = pos2gpu::matching_target_smem( + keys_copy, 3u, match_key_r, meta_l, sT, 0) + & target_mask; + + uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); + uint32_t fine_key = target_l >> fine_shift; + uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; + uint64_t lo = d_fine_offsets[fine_idx]; + uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; + uint64_t hi = fine_hi; + + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t target_mid = d_sorted_mi[mid] & target_mask; + if (target_mid < target_l) lo = mid + 1; + else hi = mid; + } + + uint32_t test_mask = (num_test_bits >= 32) ? 
0xFFFFFFFFu + : ((1u << num_test_bits) - 1u); + + for (uint64_t r = lo; r < fine_hi; ++r) { + uint32_t target_r = d_sorted_mi[r] & target_mask; + if (target_r != target_l) break; + + uint64_t meta_r = d_sorted_meta[r]; + uint32_t xb_r = d_sorted_xbits[r]; + + pos2gpu::Result128 res = pos2gpu::pairing_smem( + keys_copy, meta_l, meta_r, sT, 0); + uint32_t test_result = res.r[3] & test_mask; + if (test_result != 0) continue; + + uint64_t all_x_bits = (uint64_t(xb_l) << k) | uint64_t(xb_r); + uint64_t fragment = pos2gpu::feistel_encrypt(fk_copy, all_x_bits); + + sycl::atomic_ref + out_count_atomic{ *d_out_count_ull }; + unsigned long long out_idx = out_count_atomic.fetch_add(1ULL); + if (out_idx >= out_capacity) return; + + T3PairingGpu p; + p.proof_fragment = fragment; + d_out_pairings[out_idx] = p; + } + }); + }).wait(); +} + +void launch_t3_match_section_pair( + AesHashKeys keys, + FeistelKey fk, + uint64_t const* d_meta_l_slice, + uint64_t section_l_row_start, + uint64_t const* d_meta_r_slice, + uint64_t section_r_row_start, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)num_buckets; + if (bucket_end <= bucket_begin) return; + uint32_t const num_buckets_in_range = bucket_end - bucket_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + uint64_t blocks_x_u64 = (l_count_max + threads - 1) / threads; + size_t const blocks_x = static_cast(blocks_x_u64); + + auto* d_out_count_ull = + reinterpret_cast(d_out_count); + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<2>{ + sycl::range<2>{ static_cast(num_buckets_in_range), + blocks_x * threads }, + sycl::range<2>{ 1, threads } + }, + [=, keys_copy = keys, fk_copy = fk](sycl::nd_item<2> it) { + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(1); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint32_t bucket_id = bucket_begin + static_cast(it.get_group(0)); + uint32_t section_l = bucket_id / num_match_keys; + uint32_t match_key_r = bucket_id % num_match_keys; + + uint32_t section_r; + { + uint32_t mask = (1u << num_section_bits) - 1u; + uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; + uint32_t rl1 = (rl + 1) & mask; + section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; + } + + uint64_t l_start = d_offsets[section_l * num_match_keys]; + uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; + uint32_t r_bucket = section_r * num_match_keys + match_key_r; + + uint64_t l = l_start + + it.get_group(1) * uint64_t(threads) + + local_id; + if (l >= l_end) return; + + // Sliced read: caller guarantees l ∈ [section_l_row_start, ...). 
+ uint64_t meta_l = d_meta_l_slice[l - section_l_row_start]; + uint32_t xb_l = d_sorted_xbits[l]; + + uint32_t target_l = pos2gpu::matching_target_smem( + keys_copy, 3u, match_key_r, meta_l, sT, 0) + & target_mask; + + uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); + uint32_t fine_key = target_l >> fine_shift; + uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; + uint64_t lo = d_fine_offsets[fine_idx]; + uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; + uint64_t hi = fine_hi; + + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t target_mid = d_sorted_mi[mid] & target_mask; + if (target_mid < target_l) lo = mid + 1; + else hi = mid; + } + + uint32_t test_mask = (num_test_bits >= 32) ? 0xFFFFFFFFu + : ((1u << num_test_bits) - 1u); + + for (uint64_t r = lo; r < fine_hi; ++r) { + uint32_t target_r = d_sorted_mi[r] & target_mask; + if (target_r != target_l) break; + + // Sliced read: caller guarantees r ∈ [section_r_row_start, ...). + uint64_t meta_r = d_meta_r_slice[r - section_r_row_start]; + uint32_t xb_r = d_sorted_xbits[r]; + + pos2gpu::Result128 res = pos2gpu::pairing_smem( + keys_copy, meta_l, meta_r, sT, 0); + uint32_t test_result = res.r[3] & test_mask; + if (test_result != 0) continue; + + uint64_t all_x_bits = (uint64_t(xb_l) << k) | uint64_t(xb_r); + uint64_t fragment = pos2gpu::feistel_encrypt(fk_copy, all_x_bits); + + sycl::atomic_ref + out_count_atomic{ *d_out_count_ull }; + unsigned long long out_idx = out_count_atomic.fetch_add(1ULL); + if (out_idx >= out_capacity) return; + + T3PairingGpu p; + p.proof_fragment = fragment; + d_out_pairings[out_idx] = p; + } + }); + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/gpu/XsCandidateGpu.hpp b/src/gpu/XsCandidateGpu.hpp new file mode 100644 index 0000000..a42fef3 --- /dev/null +++ b/src/gpu/XsCandidateGpu.hpp @@ -0,0 +1,22 @@ +// XsCandidateGpu.hpp — minimal header carrying just the Xs_Candidate POD. +// +// Split out from XsKernel.cuh so the type can be referenced from non-CUDA +// translation units (notably the SYCL backend implementations), which can't +// pull in the CUDA-laden XsKernel.cuh → AesHashGpu.cuh → AesGpu.cuh chain. +// +// Layout mirrors pos2-chip/src/plot/TableConstructorGeneric.hpp:496 so a +// host-side reinterpret_cast to the pos2-chip type is safe. + +#pragma once + +#include + +namespace pos2gpu { + +struct XsCandidateGpu { + uint32_t match_info; + uint32_t x; +}; +static_assert(sizeof(XsCandidateGpu) == 8, "must match pos2-chip Xs_Candidate layout"); + +} // namespace pos2gpu diff --git a/src/gpu/XsKernel.cpp b/src/gpu/XsKernel.cpp new file mode 100644 index 0000000..162e92b --- /dev/null +++ b/src/gpu/XsKernel.cpp @@ -0,0 +1,185 @@ +// XsKernel.cpp — orchestrates Xs construction on a SYCL queue. +// +// Pipeline: +// 1. launch_xs_gen: writes (g(x⊕xor_const), x) into (keys_a, vals_a). +// 2. launch_sort_pairs_u32_u32: stable radix sort by the bottom k bits. +// 3. launch_xs_pack: fold sorted (keys, vals) into XsCandidateGpu[total]. +// +// All scratch is allocated by the caller; on the first call with +// d_temp_storage == nullptr the function only writes the required +// *temp_bytes and returns without launching anything. 
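+//
+// Typical call sequence (sketch — d_out / d_scratch stand in for the
+// caller's USM allocations and only illustrate the sizing protocol):
+//
+//   size_t temp_bytes = 0;
+//   launch_construct_xs(plot_id, k, testnet, /*d_out=*/nullptr,
+//                       /*d_temp_storage=*/nullptr, &temp_bytes, q);  // size query
+//   void* d_scratch = sycl::malloc_device<uint8_t>(temp_bytes, q);
+//   launch_construct_xs(plot_id, k, testnet, d_out, d_scratch, &temp_bytes, q);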
+ +#include "gpu/AesHashGpu.cuh" +#include "gpu/Sort.cuh" +#include "gpu/XsKernel.cuh" +#include "gpu/XsKernels.cuh" + +#include + +#include +#include +#include +#include +#include + +namespace pos2gpu { + +namespace { + +// Mirrors pos2-chip/src/pos/ProofConstants.hpp:14 +constexpr uint32_t kTestnetGXorConst = 0xA3B1C4D7u; + +// Layout of caller-provided d_temp_storage: +// [0 .. cub_bytes) CUB sort scratch +// [keys_a_off .. keys_a_off + N*4) keys_a (uint32) (*) +// [keys_b_off .. keys_b_off + N*4) keys_b (uint32) +// [vals_a_off .. vals_a_off + N*4) vals_a (uint32) +// [vals_b_off .. vals_b_off + N*4) vals_b (uint32) +// (*) In split mode (split_keys_a != nullptr) the keys_a slot is OMITTED +// from d_temp_storage — keys_a_off is set to SIZE_MAX as a sentinel and +// keys_b_off follows directly after cub_scratch. Total bytes drop by +// one aligned (N*u32) block (~1 GiB at k=28). +struct ScratchLayout { + size_t cub_bytes; + size_t keys_a_off; + size_t keys_b_off; + size_t vals_a_off; + size_t vals_b_off; + size_t total_bytes; +}; + +inline size_t align_up(size_t v, size_t a) { return (v + a - 1) / a * a; } + +ScratchLayout layout_for(uint64_t total, size_t cub_bytes, bool split_keys_a) +{ + ScratchLayout s{}; + s.cub_bytes = cub_bytes; + size_t cur = align_up(s.cub_bytes, 256); + if (split_keys_a) { + s.keys_a_off = ~size_t{0}; // sentinel: keys_a lives externally + } else { + s.keys_a_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); + } + s.keys_b_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); + s.vals_a_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); + s.vals_b_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); + s.total_bytes = cur; + return s; +} + +} // namespace + +void launch_construct_xs( + uint8_t const* plot_id_bytes, int k, bool testnet, + XsCandidateGpu* d_out, void* d_temp_storage, size_t* temp_bytes, + sycl::queue& q, void* split_keys_a) +{ + return launch_construct_xs_profiled(plot_id_bytes, k, testnet, + d_out, d_temp_storage, temp_bytes, + nullptr, nullptr, q, split_keys_a); +} + +void launch_construct_xs_profiled( + uint8_t const* plot_id_bytes, + int k, + bool testnet, + XsCandidateGpu* d_out, + void* d_temp_storage, + size_t* temp_bytes, + cudaEvent_t /*after_gen*/, + cudaEvent_t /*after_sort*/, + sycl::queue& q, + void* split_keys_a) +{ + // NOTE: the cudaEvent_t after_gen / after_sort parameters are kept + // for API compatibility but no longer recorded. xs_bench's per-phase + // timing is therefore zero through this call; use chrono on the host + // around launch_construct_xs to measure end-to-end wall time. A + // sycl::event-based profiling overload is the natural follow-up. + + if (k < 18 || k > 32 || (k & 1) != 0) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!plot_id_bytes || !temp_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + + uint64_t const total = 1ULL << k; + + // Query CUB temp size via the wrapper (sizing mode: null storage). 
+ size_t cub_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, cub_bytes, + nullptr, nullptr, + nullptr, nullptr, + total, /*begin_bit=*/0, /*end_bit=*/k, q); + + bool const split = (split_keys_a != nullptr); + auto sl = layout_for(total, cub_bytes, split); + + if (d_temp_storage == nullptr) { + *temp_bytes = sl.total_bytes; + + return; + } + if (*temp_bytes < sl.total_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_out) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto* base = static_cast(d_temp_storage); + auto* cub_scratch = base; // first cub_bytes + auto* keys_a = split + ? static_cast(split_keys_a) + : reinterpret_cast(base + sl.keys_a_off); + auto* keys_b = reinterpret_cast(base + sl.keys_b_off); + auto* vals_a = reinterpret_cast(base + sl.vals_a_off); + auto* vals_b = reinterpret_cast(base + sl.vals_b_off); + + AesHashKeys keys = make_keys(plot_id_bytes); + uint32_t xor_const = testnet ? kTestnetGXorConst : 0u; + + // Sub-phase wall-time breakdown — useful when GpuPipeline's outer + // "Xs gen+sort" phase dominates total wall (notably on the SYCL/HIP + // backend, where the Xs phase has been observed at ~40% on RDNA2 vs + // ~6% on NVIDIA). Gated on POS2GPU_PHASE_TIMING=1 so the q.wait()s + // don't perturb production runs. + bool const xs_timing = [] { + char const* v = std::getenv("POS2GPU_PHASE_TIMING"); + return v && v[0] == '1'; + }(); + using xs_clock = std::chrono::steady_clock; + auto xs_now = [&] { return xs_clock::now(); }; + auto xs_elapsed_ms = [&](xs_clock::time_point t0) { + return std::chrono::duration(xs_now() - t0).count(); + }; + auto xs_t0 = xs_now(); + if (xs_timing) q.wait(); + + // Phase 1: generate (match_info, x) into keys_a / vals_a + launch_xs_gen(keys, keys_a, vals_a, total, k, xor_const, q); + double t_gen = 0.0; + if (xs_timing) { q.wait(); t_gen = xs_elapsed_ms(xs_t0); xs_t0 = xs_now(); } + + // Phase 2: stable radix sort by (key low k bits) — keys_a → keys_b, + // vals_a → vals_b. (We give up CUB's DoubleBuffer optimisation here, + // costing one extra pass at most; pack reads from the b side.) + launch_sort_pairs_u32_u32( + cub_scratch, cub_bytes, + keys_a, keys_b, + vals_a, vals_b, + total, /*begin_bit=*/0, /*end_bit=*/k, q); + double t_sort = 0.0; + if (xs_timing) { q.wait(); t_sort = xs_elapsed_ms(xs_t0); xs_t0 = xs_now(); } + + // Phase 3: pack the sorted side into AoS XsCandidateGpu in d_out. + launch_xs_pack(keys_b, vals_b, d_out, total, q); + double t_pack = 0.0; + if (xs_timing) { q.wait(); t_pack = xs_elapsed_ms(xs_t0); } + + if (xs_timing) { + double const total_ms = t_gen + t_sort + t_pack; + std::fprintf(stderr, + "[xs-timing] gen=%.1fms(%.0f%%) sort=%.1fms(%.0f%%) pack=%.1fms(%.0f%%) total=%.1fms\n", + t_gen, total_ms > 0.0 ? 100.0 * t_gen / total_ms : 0.0, + t_sort, total_ms > 0.0 ? 100.0 * t_sort / total_ms : 0.0, + t_pack, total_ms > 0.0 ? 100.0 * t_pack / total_ms : 0.0, + total_ms); + } +} + +} // namespace pos2gpu diff --git a/src/gpu/XsKernel.cu b/src/gpu/XsKernel.cu deleted file mode 100644 index 133504e..0000000 --- a/src/gpu/XsKernel.cu +++ /dev/null @@ -1,181 +0,0 @@ -// XsKernel.cu — implementation of launch_construct_xs. -// -// Pipeline: -// 1. Phase 1 kernel writes XsCandidateGpu[x] = { g(x), x } for x in [0, 2^k). -// 2. Pack into (key=match_info, value=x) and call cub::DeviceRadixSort:: -// SortPairs over the bottom k bits. 
CUB's radix sort is stable -// (preserves relative order for equal keys), matching pos2-chip's -// RadixSort which is multi-pass LSD radix. -// 3. Repack sorted (key, value) back into XsCandidateGpu in d_out. -// -// All scratch is allocated by the caller; on first call with d_temp_storage -// == nullptr the function only writes the required *temp_bytes and returns -// without launching anything. - -#include "gpu/AesGpu.cuh" -#include "gpu/AesHashGpu.cuh" -#include "gpu/XsKernel.cuh" - -#include -#include -#include - -namespace pos2gpu { - -namespace { - -// Mirrors pos2-chip/src/pos/ProofConstants.hpp:14 -constexpr uint32_t kTestnetGXorConst = 0xA3B1C4D7u; - -__global__ void gen_kernel( - AesHashKeys keys, - uint32_t* __restrict__ keys_out, // match_info - uint32_t* __restrict__ vals_out, // x - uint64_t total, - int k, - uint32_t xor_const) -{ - __shared__ uint32_t sT[4 * 256]; - load_aes_tables_smem(sT); - __syncthreads(); - - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= total) return; - uint32_t x = static_cast(idx); - uint32_t mixed = x ^ xor_const; - keys_out[idx] = g_x_smem(keys, mixed, k, sT, kAesGRounds); - vals_out[idx] = x; -} - -__global__ void pack_kernel( - uint32_t const* __restrict__ keys_in, - uint32_t const* __restrict__ vals_in, - XsCandidateGpu* __restrict__ out, - uint64_t total) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= total) return; - out[idx] = XsCandidateGpu{ keys_in[idx], vals_in[idx] }; -} - -// Layout of caller-provided d_temp_storage (single arena): -// -// [0 .. keys_in_off) reserved for CUB scratch -// [keys_in_off .. keys_in_off + N*4) keys_in (uint32) -// [keys_out_off .. keys_out_off + N*4) keys_out (uint32) -// [vals_in_off .. vals_in_off + N*4) vals_in (uint32) -// [vals_out_off .. vals_out_off + N*4) vals_out (uint32) -// -// CUB SortPairs alternates ping-pong between in/out; we use the -// `DoubleBuffer` API to let CUB pick which side ends up holding the -// sorted result. 
- -struct ScratchLayout { - size_t cub_bytes; // bytes for CUB's own scratch - size_t keys_a_off; // offset to keys buffer A - size_t keys_b_off; // offset to keys buffer B - size_t vals_a_off; // offset to vals buffer A - size_t vals_b_off; // offset to vals buffer B - size_t total_bytes; -}; - -constexpr size_t align_up(size_t v, size_t a) { return (v + a - 1) / a * a; } - -ScratchLayout layout_for(uint64_t total, size_t cub_bytes) -{ - ScratchLayout s{}; - s.cub_bytes = cub_bytes; - size_t cur = align_up(s.cub_bytes, 256); - s.keys_a_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); - s.keys_b_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); - s.vals_a_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); - s.vals_b_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); - s.total_bytes = cur; - return s; -} - -} // namespace - -cudaError_t launch_construct_xs( - uint8_t const* plot_id_bytes, int k, bool testnet, - XsCandidateGpu* d_out, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream) -{ - return launch_construct_xs_profiled(plot_id_bytes, k, testnet, - d_out, d_temp_storage, temp_bytes, - nullptr, nullptr, stream); -} - -cudaError_t launch_construct_xs_profiled( - uint8_t const* plot_id_bytes, - int k, - bool testnet, - XsCandidateGpu* d_out, - void* d_temp_storage, - size_t* temp_bytes, - cudaEvent_t after_gen, - cudaEvent_t after_sort, - cudaStream_t stream) -{ - if (k < 18 || k > 32 || (k & 1) != 0) return cudaErrorInvalidValue; - if (!plot_id_bytes || !temp_bytes) return cudaErrorInvalidValue; - - uint64_t const total = 1ULL << k; - - // Query CUB temp size once (depends only on N). - cub::DoubleBuffer probe_keys(nullptr, nullptr); - cub::DoubleBuffer probe_vals(nullptr, nullptr); - size_t cub_bytes = 0; - cudaError_t err = cub::DeviceRadixSort::SortPairs( - nullptr, cub_bytes, - probe_keys, probe_vals, - total, /*begin_bit=*/0, /*end_bit=*/k, stream); - if (err != cudaSuccess) return err; - - auto sl = layout_for(total, cub_bytes); - - if (d_temp_storage == nullptr) { - *temp_bytes = sl.total_bytes; - return cudaSuccess; - } - if (*temp_bytes < sl.total_bytes) return cudaErrorInvalidValue; - if (!d_out) return cudaErrorInvalidValue; - - auto* base = static_cast(d_temp_storage); - auto* cub_scratch = base; // first cub_bytes - auto* keys_a = reinterpret_cast(base + sl.keys_a_off); - auto* keys_b = reinterpret_cast(base + sl.keys_b_off); - auto* vals_a = reinterpret_cast(base + sl.vals_a_off); - auto* vals_b = reinterpret_cast(base + sl.vals_b_off); - - AesHashKeys keys = make_keys(plot_id_bytes); - uint32_t xor_const = testnet ? 
kTestnetGXorConst : 0u; - - constexpr int kThreads = 256; - uint64_t blocks_u64 = (total + kThreads - 1) / kThreads; - if (blocks_u64 > UINT_MAX) return cudaErrorInvalidValue; - unsigned blocks = static_cast(blocks_u64); - - // Phase 1: generate (match_info, x) into keys_a / vals_a - gen_kernel<<>>(keys, keys_a, vals_a, total, k, xor_const); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - if (after_gen) cudaEventRecord(after_gen, stream); - - // Phase 2: stable radix sort by (key low k bits) - cub::DoubleBuffer keys_buf(keys_a, keys_b); - cub::DoubleBuffer vals_buf(vals_a, vals_b); - err = cub::DeviceRadixSort::SortPairs( - cub_scratch, cub_bytes, - keys_buf, vals_buf, - total, /*begin_bit=*/0, /*end_bit=*/k, stream); - if (err != cudaSuccess) return err; - - // Phase 3: pack the side CUB ended up writing into d_out - pack_kernel<<>>( - keys_buf.Current(), vals_buf.Current(), d_out, total); - if (after_sort) cudaEventRecord(after_sort, stream); - return cudaGetLastError(); -} - -} // namespace pos2gpu diff --git a/src/gpu/XsKernel.cuh b/src/gpu/XsKernel.cuh index b43d11c..8ea924e 100644 --- a/src/gpu/XsKernel.cuh +++ b/src/gpu/XsKernel.cuh @@ -9,19 +9,15 @@ #pragma once #include "gpu/AesHashGpu.cuh" +#include "gpu/XsCandidateGpu.hpp" -#include +#include "gpu/CudaHalfShim.hpp" +#include #include #include namespace pos2gpu { -struct XsCandidateGpu { - uint32_t match_info; - uint32_t x; -}; -static_assert(sizeof(XsCandidateGpu) == 8, "must match pos2-chip Xs_Candidate layout"); - // Generate Xs_Candidate[2^k], sorted by match_info (low k bits, stable). // Caller must have called initialize_aes_tables() once before invocation. // @@ -32,22 +28,31 @@ static_assert(sizeof(XsCandidateGpu) == 8, "must match pos2-chip Xs_Candidate la // d_out : device buffer of at least (1ULL << k) XsCandidateGpu // d_temp_storage : device scratch; pass nullptr first to query size // temp_bytes : in/out — when d_temp_storage is null, set to required size -// stream : optional CUDA stream +// split_keys_a : optional device pointer of at least total*sizeof(uint32_t) +// bytes. When non-null, the sort's keys_a slot is placed +// there instead of inside d_temp_storage, and *temp_bytes +// correspondingly shrinks by total*u32 (plus alignment). +// Intended for the pool path, which aliases keys_a into +// d_storage's tail (idle during Xs gen+sort) to drop +// ~1 GiB off the pair_b xs-scratch region at k=28. The +// non-null-ness is the flag in sizing mode (the actual +// pointer is read only when d_temp_storage != nullptr). // // Returns cudaSuccess on launch success. The sort is asynchronous on the // stream — synchronize before reading d_out on the host. -cudaError_t launch_construct_xs( +void launch_construct_xs( uint8_t const* plot_id_bytes, int k, bool testnet, XsCandidateGpu* d_out, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream = nullptr); + sycl::queue& q, + void* split_keys_a = nullptr); // Optional callback fired between the gen kernel and the sort, useful for // per-stage cudaEvent timing. Pass nullptr to skip. 
-cudaError_t launch_construct_xs_profiled( +void launch_construct_xs_profiled( uint8_t const* plot_id_bytes, int k, bool testnet, @@ -56,6 +61,7 @@ cudaError_t launch_construct_xs_profiled( size_t* temp_bytes, cudaEvent_t after_gen, // nullable; recorded after gen kernel queued cudaEvent_t after_sort, // nullable; recorded after sort queued - cudaStream_t stream = nullptr); + sycl::queue& q, + void* split_keys_a = nullptr); } // namespace pos2gpu diff --git a/src/gpu/XsKernels.cuh b/src/gpu/XsKernels.cuh new file mode 100644 index 0000000..35ac27f --- /dev/null +++ b/src/gpu/XsKernels.cuh @@ -0,0 +1,66 @@ +// XsKernels.cuh — backend-dispatched wrappers for the two non-sort phases +// of Xs construction. The orchestration (sizing query, sort, fold-into-AoS) +// lives in XsKernel.cpp and chains these via a sycl::queue. +// +// Phase 1: launch_xs_gen — fill (keys_out[x], vals_out[x]) = (g_x(x⊕xor_const), x) +// for x in [0, total). Loads AES T-tables into local memory once +// per workgroup, mirroring the CUDA gen_kernel pattern. +// +// Phase 3: launch_xs_pack — pack sorted (keys_in, vals_in) back into AoS +// XsCandidateGpu[total]. Pure grid-stride; no AES. + +#pragma once + +#include "gpu/AesHashGpu.cuh" +#include "gpu/XsCandidateGpu.hpp" + +#include + +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +void launch_xs_gen( + AesHashKeys keys, + uint32_t* keys_out, + uint32_t* vals_out, + uint64_t total, + int k, + uint32_t xor_const, + sycl::queue& q); + +// Position-range variant of launch_xs_gen. Generates Xs candidates for +// positions x ∈ [pos_begin, pos_end) and writes to keys_out[i] / +// vals_out[i] where i = x - pos_begin (relative indexing). keys_out / +// vals_out must be sized for at least (pos_end - pos_begin) elements. +// Used by minimal tier to tile the Xs gen + sort phase below the +// 4 GiB-cap peak. +void launch_xs_gen_range( + AesHashKeys keys, + uint32_t* keys_out, + uint32_t* vals_out, + uint64_t pos_begin, + uint64_t pos_end, + int k, + uint32_t xor_const, + sycl::queue& q); + +void launch_xs_pack( + uint32_t const* keys_in, + uint32_t const* vals_in, + XsCandidateGpu* d_out, + uint64_t total, + sycl::queue& q); + +// Position-range variant of launch_xs_pack. Reads keys_in[i] / vals_in[i] +// for i ∈ [0, count) and writes XsCandidateGpu{keys_in[i], vals_in[i]} +// to d_out[i + dst_begin]. Lets the caller pack incrementally. +void launch_xs_pack_range( + uint32_t const* keys_in, + uint32_t const* vals_in, + XsCandidateGpu* d_out, + uint64_t count, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/XsKernelsSycl.cpp b/src/gpu/XsKernelsSycl.cpp new file mode 100644 index 0000000..9ae3589 --- /dev/null +++ b/src/gpu/XsKernelsSycl.cpp @@ -0,0 +1,136 @@ +// XsKernelsSycl.cpp — SYCL implementation of Xs gen/pack kernels. +// Same shape as the T1/T2/T3 SYCL impls; gen reuses the AES T-table USM +// buffer from SyclBackend.hpp, pack is a pure grid-stride lambda. 
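+//
+// Tiled-use sketch for the _range variants (hypothetical tile loop — the
+// real minimal-tier caller lives in the pipeline, not here). The pack step
+// offsets d_out itself, matching the relative indexing these kernels use:
+//
+//   for (uint64_t begin = 0; begin < total; begin += tile) {
+//       uint64_t const end = std::min(begin + tile, total);
+//       launch_xs_gen_range(keys, d_keys_tile, d_vals_tile,
+//                           begin, end, k, xor_const, q);
+//       // ... sort the (end - begin)-element tile ...
+//       launch_xs_pack_range(d_keys_tile, d_vals_tile,
+//                            d_out + begin, end - begin, q);
+//   }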
+ +#include "gpu/SyclBackend.hpp" +#include "gpu/XsKernels.cuh" + +#include + +namespace pos2gpu { + +void launch_xs_gen( + AesHashKeys keys, + uint32_t* keys_out, + uint32_t* vals_out, + uint64_t total, + int k, + uint32_t xor_const, + sycl::queue& q) +{ + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + size_t const groups = (total + threads - 1) / threads; + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=, keys_copy = keys](sycl::nd_item<1> it) { + // Cooperative load of AES T-tables into local memory. + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(0); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint64_t idx = it.get_global_id(0); + if (idx >= total) return; + uint32_t x = static_cast(idx); + uint32_t mixed = x ^ xor_const; + keys_out[idx] = pos2gpu::g_x_smem(keys_copy, mixed, k, sT); + vals_out[idx] = x; + }); + }).wait(); +} + +void launch_xs_gen_range( + AesHashKeys keys, + uint32_t* keys_out, + uint32_t* vals_out, + uint64_t pos_begin, + uint64_t pos_end, + int k, + uint32_t xor_const, + sycl::queue& q) +{ + if (pos_end <= pos_begin) return; + uint64_t const range_n = pos_end - pos_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + size_t const groups = (range_n + threads - 1) / threads; + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=, keys_copy = keys](sycl::nd_item<1> it) { + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(0); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint64_t local_idx = it.get_global_id(0); + if (local_idx >= range_n) return; + uint32_t x = static_cast(pos_begin + local_idx); + uint32_t mixed = x ^ xor_const; + keys_out[local_idx] = pos2gpu::g_x_smem(keys_copy, mixed, k, sT); + vals_out[local_idx] = x; + }); + }).wait(); +} + +void launch_xs_pack( + uint32_t const* keys_in, + uint32_t const* vals_in, + XsCandidateGpu* d_out, + uint64_t total, + sycl::queue& q) +{ + constexpr size_t threads = 256; + size_t const groups = (total + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint64_t idx = it.get_global_id(0); + if (idx >= total) return; + d_out[idx] = XsCandidateGpu{ keys_in[idx], vals_in[idx] }; + }).wait(); +} + +void launch_xs_pack_range( + uint32_t const* keys_in, + uint32_t const* vals_in, + XsCandidateGpu* d_out, + uint64_t count, + sycl::queue& q) +{ + // Same body as launch_xs_pack — caller passes already-offset pointers + // (keys_in, vals_in, d_out) and the slice count. 
+ if (count == 0) return; + constexpr size_t threads = 256; + size_t const groups = (count + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint64_t idx = it.get_global_id(0); + if (idx >= count) return; + d_out[idx] = XsCandidateGpu{ keys_in[idx], vals_in[idx] }; + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/host/BatchPlotter.cpp b/src/host/BatchPlotter.cpp index ccb3949..4d53434 100644 --- a/src/host/BatchPlotter.cpp +++ b/src/host/BatchPlotter.cpp @@ -1,13 +1,17 @@ // BatchPlotter.cu — implementation of staggered multi-plot pipeline. #include "host/BatchPlotter.hpp" +#include "host/Cancel.hpp" +#include "host/CpuPlotter.hpp" // run_one_plot_cpu — pos2-chip CPU pipeline #include "host/GpuBufferPool.hpp" #include "host/GpuPipeline.hpp" #include "host/PlotFileWriterParallel.hpp" +#include "gpu/DeviceIds.hpp" // kCpuDeviceId for the --cpu device-list mixin // Deliberately no pos2-chip includes here — see PlotFileWriterParallel.cpp. #include +#include #include #include #include @@ -15,11 +19,14 @@ #include #include #include +#include +#include #include #include #include #include #include +#include #include namespace pos2gpu { @@ -100,24 +107,111 @@ struct WorkItem { size_t index = 0; }; -// Bounded SPSC queue of depth 1 plus end-of-stream signal. +// Rough per-plot upper-bound estimate for the disk preflight. The actual +// compressed .plot2 is smaller (FSE over proof-fragment stubs); this +// uncompressed ceiling is deliberately pessimistic so we only WARN when +// the disk is genuinely too small, not for boundary cases. +// +// Formula: 2^k fragments × (proof_fragment_bits) / 8, where +// proof_fragment_bits ≈ k + (k - MINUS_STUB_BITS) + overhead, ≈ 2k bytes*bits. +uint64_t approx_plot_bytes_upper_bound(int k) +{ + if (k <= 0 || k > 32) return 0; + uint64_t const fragments = uint64_t(1) << k; + uint64_t const bits_per = uint64_t(2 * k); // k stub + k-2 xbits, rounded up + return (fragments * bits_per) / 8; +} + +// Check `.plot2` is present at path AND looks like a valid plot file +// (magic bytes "pos2" + nonzero size). Used for --skip-existing so we +// don't silently skip a zero-byte or crash-truncated leftover. +bool looks_like_complete_plot(std::filesystem::path const& path) +{ + std::error_code ec; + auto const sz = std::filesystem::file_size(path, ec); + if (ec || sz < 64) return false; // header alone is >64 B + + std::ifstream in(path, std::ios::binary); + if (!in) return false; + char magic[4]{}; + in.read(magic, 4); + return in.good() && magic[0] == 'p' && magic[1] == 'o' + && magic[2] == 's' && magic[3] == '2'; +} + +// Print a warning if the available free space on each unique output +// directory looks insufficient for the plots targeted there. Purely +// advisory — the atomic .partial write handles actual ENOSPC cleanly. +void preflight_disk_space(std::vector const& entries, + BatchOptions const& opts) +{ + if (entries.empty()) return; + + std::map> per_dir; // dir -> (count, bytes) + for (auto const& e : entries) { + uint64_t const est = approx_plot_bytes_upper_bound(e.k); + auto& slot = per_dir[e.out_dir.empty() ? 
std::string(".") : e.out_dir]; + slot.first += 1; + slot.second += est; + } + + constexpr double GB = 1.0 / (1024.0 * 1024.0 * 1024.0); + for (auto const& [dir, tally] : per_dir) { + std::error_code ec; + std::filesystem::create_directories(dir, ec); // space() needs it to exist + auto const info = std::filesystem::space(dir, ec); + if (ec) { + if (opts.verbose) { + std::fprintf(stderr, + "[batch] preflight: cannot stat free space on %s (%s) — " + "skipping check\n", dir.c_str(), ec.message().c_str()); + } + continue; + } + double const need_gb = tally.second * GB; + double const free_gb = info.available * GB; + if (info.available < tally.second) { + std::fprintf(stderr, + "[batch] WARNING: %s has %.1f GB free but %zu plot(s) may need " + "up to ~%.1f GB (uncompressed upper bound). The batch will " + "still run; .partial writes are atomic so mid-plot ENOSPC is " + "recoverable, but consider freeing space or reducing count.\n", + dir.c_str(), free_gb, tally.first, need_gb); + } else if (opts.verbose) { + std::fprintf(stderr, + "[batch] preflight: %s has %.1f GB free, %zu plot(s) need " + "up to ~%.1f GB\n", + dir.c_str(), free_gb, tally.first, need_gb); + } + } +} + +// Bounded SPSC queue + end-of-stream signal. +// +// Depth = kNumPinnedBuffers - 1 so the producer never overtakes the +// consumer by more than (num_pinned - 1) plots. The pinned slot the +// producer writes is slot (i % kNumPinnedBuffers); with depth-(N-1) +// the consumer is guaranteed to have popped plot (i - N) before the +// producer overwrites its slot. class Channel { public: + explicit Channel(std::size_t capacity) : capacity_(capacity) {} + void push(WorkItem item) { std::unique_lock lock(mu_); - cv_.wait(lock, [&]{ return !item_.has_value() && !closed_; }); + cv_not_full_.wait(lock, [&]{ return q_.size() < capacity_ || closed_; }); if (closed_) return; - item_ = std::move(item); - cv_.notify_all(); + q_.push(std::move(item)); + cv_not_empty_.notify_one(); } - // Returns false when channel is closed AND empty. + // Returns false when the channel is closed AND empty. bool pop(WorkItem& out) { std::unique_lock lock(mu_); - cv_.wait(lock, [&]{ return item_.has_value() || closed_; }); - if (item_.has_value()) { - out = std::move(*item_); - item_.reset(); - cv_.notify_all(); + cv_not_empty_.wait(lock, [&]{ return !q_.empty() || closed_; }); + if (!q_.empty()) { + out = std::move(q_.front()); + q_.pop(); + cv_not_full_.notify_one(); return true; } return false; @@ -125,93 +219,397 @@ class Channel { void close() { std::lock_guard lock(mu_); closed_ = true; - cv_.notify_all(); + cv_not_empty_.notify_all(); + cv_not_full_.notify_all(); } private: std::mutex mu_; - std::condition_variable cv_; - std::optional item_; + std::condition_variable cv_not_empty_, cv_not_full_; + std::queue q_; + std::size_t capacity_; bool closed_ = false; }; } // namespace -BatchResult run_batch(std::vector const& entries, bool verbose) +namespace { + +// Per-worker pipeline. Extracted from run_batch so the multi-device +// fan-out can spawn N of these concurrently — one thread per device, +// each with its own pool / channel / consumer. The outer run_batch +// validates homogeneity and runs the disk-space preflight once; this +// helper assumes both have already been done on `entries`. +// +// device_id sentinels (see src/gpu/DeviceIds.hpp): +// kDefaultGpuId (-1) → keep the default SYCL gpu_selector_v +// (single-device default; zero-config users +// see unchanged behavior). 
+// kCpuDeviceId (-2) → CPU worker via sycl::cpu_selector_v +// (--cpu / --devices cpu; AdaptiveCpp OMP +// backend, much slower than GPU). +// 0..N-1 → explicit GPU index from get_devices(gpu). +// worker_id < 0 → single-device path; currently unused beyond +// documenting intent but reserved for a future per- +// worker log prefix (see fprintf calls below — one +// line per call means ordering is already atomic +// per-line, so interleaving across workers is +// acceptable for v1 without prefix disambiguation). +// shared_idx (default null) lets multiple workers race for the next plot +// out of a single shared `entries` list. When set, every worker calls +// shared_idx->fetch_add(1) and exits when the result >= entries.size() — +// dynamic load balancing, so a fast GPU worker keeps pulling plots while +// a slow CPU worker handles only what it can finish in the same wall. +// When null (single-device path), the worker iterates 0..entries.size()-1 +// in order — original behaviour. +BatchResult run_batch_slice(std::vector const& entries, + BatchOptions const& opts, + int device_id, + int worker_id, + std::atomic* shared_idx = nullptr) { + (void)worker_id; + + // CPU worker: bypass the GPU pool / streaming path entirely. pos2-chip's + // Plotter manages all internal state itself, so each plot is a + // synchronous run_one_plot_cpu() call. Single-threaded internally; + // multi-core utilization comes from passing `cpu` multiple times in + // --devices (e.g. --devices cpu,cpu,cpu,cpu on a 4-core host). + // + // XCHPLOT2_SYCL_CPU_BENCH=1 routes --cpu through the SYCL pipeline on + // AdaptiveCpp's CPU backend instead of pos2-chip — exposed as an env + // var purely for benchmarking the two CPU paths against each other, + // not as a supported plotting mode (pos2-chip is faster + leaner). + bool const sycl_cpu_bench = [] { + char const* v = std::getenv("XCHPLOT2_SYCL_CPU_BENCH"); + return v && v[0] == '1'; + }(); + if (device_id == kCpuDeviceId && !sycl_cpu_bench) { + BatchResult res; + if (entries.empty()) return res; + auto const t_start = std::chrono::steady_clock::now(); + std::size_t local_idx = 0; + while (true) { + std::size_t const i = shared_idx + ? 
shared_idx->fetch_add(1, std::memory_order_relaxed) + : local_idx++; + if (i >= entries.size()) break; + if (opts.skip_existing) { + auto out_path = std::filesystem::path(entries[i].out_dir) + / entries[i].out_name; + if (looks_like_complete_plot(out_path)) { + if (opts.verbose) { + std::fprintf(stderr, + "[batch:cpu] skipping plot %zu: %s (already exists)\n", + i, out_path.string().c_str()); + } + ++res.plots_skipped; + continue; + } + } + try { + run_one_plot_cpu(entries[i], opts); + ++res.plots_written; + if (opts.verbose) { + std::fprintf(stderr, + "[batch:cpu] plot %zu done: %s\n", + i, entries[i].out_name.c_str()); + } + } catch (std::exception const& ex) { + std::fprintf(stderr, + "[batch:cpu] plot %zu FAILED: %s\n", i, ex.what()); + ++res.plots_failed; + if (!opts.continue_on_error) { + res.total_wall_seconds = std::chrono::duration( + std::chrono::steady_clock::now() - t_start).count(); + return res; + } + } + if (cancel_requested()) break; + } + res.total_wall_seconds = std::chrono::duration( + std::chrono::steady_clock::now() - t_start).count(); + return res; + } + + if (device_id >= 0 || device_id == kCpuDeviceId) bind_current_device(device_id); initialize_aes_tables(); + bool const verbose = opts.verbose; + BatchResult res; if (entries.empty()) return res; - // All entries in a batch must share (k, strength, testnet) so one pool - // fits all plots. Mixed-shape batches could be supported by splitting - // into homogeneous sub-batches; not needed in practice. + // Pool shape from the first entry. Homogeneity (all entries share + // k/strength/testnet) was checked by the outer run_batch. int pool_k = entries[0].k; int pool_strength = entries[0].strength; bool pool_testnet = entries[0].testnet; - for (size_t i = 1; i < entries.size(); ++i) { - if (entries[i].k != pool_k - || entries[i].strength != pool_strength - || entries[i].testnet != pool_testnet) - { - throw std::runtime_error( - "run_batch: all entries must share (k, strength, testnet)"); - } - } // Allocate the pool once; destructor frees at function exit. This is // the whole point of the batch path — eliminate the per-plot ~2.4 s // allocator cost (dominated by cudaMallocHost(2 GB)). - GpuBufferPool pool(pool_k, pool_strength, pool_testnet); - if (verbose) { + // + // On insufficient device VRAM (small card), the pool ctor throws + // InsufficientVramError. Fall back to the streaming pipeline per + // plot — slower (no buffer amortisation across plots, no + // producer/consumer overlap between GPU D2H and consumer I/O on + // pinned double-buffered pool slots), but it fits inside the card's + // VRAM and is still overlapped via the Channel between the producer + // thread's streaming call and the consumer thread's FSE compression + // + plot-file write. + std::unique_ptr pool_ptr; + // Streaming-fallback pinned buffers — double-buffered the same way the + // pool does, so producer's D2H of plot N+1 can run concurrently with + // the consumer reading plot N. cudaMallocHost is ~600 ms, so doing it + // once instead of per plot is a significant win on long batches. + uint64_t* stream_pinned[GpuBufferPool::kNumPinnedBuffers] = {}; + size_t stream_pinned_cap = 0; + // Stage 4f: amortised streaming-path pinned-host scratch. Populated + // in the streaming-fallback branch below; nullptr fields when the + // pool path is active (pool_ptr != null). + StreamingPinnedScratch stream_scratch{}; + + // Force-streaming override (matches the one-shot run_gpu_pipeline + // dispatch). 
Useful for testing the streaming path on a high-VRAM + // card and for users who want the smaller peak even when the pool + // would fit. + bool const force_streaming = [] { + char const* v = std::getenv("XCHPLOT2_STREAMING"); + return v && v[0] == '1'; + }(); + + try { + if (force_streaming) { + throw InsufficientVramError("XCHPLOT2_STREAMING=1 forced"); + } + pool_ptr = std::make_unique( + pool_k, pool_strength, pool_testnet); + } catch (InsufficientVramError const& e) { + if (force_streaming) { + std::fprintf(stderr, "[batch] XCHPLOT2_STREAMING=1 — using " + "streaming pipeline per plot\n"); + } else { + std::fprintf(stderr, + "[batch] pool needs %.2f GiB, only %.2f GiB free — using " + "streaming pipeline per plot\n", + e.required_bytes / double(1ULL << 30), + e.free_bytes / double(1ULL << 30)); + } + // Streaming tier dispatch — three tiers, increasing PCIe pressure + // for decreasing peak VRAM: + // plain (~7290 MB at k=28): no parks, single-pass T2 match. + // Fastest, ~400 ms/plot over compact. + // compact (~5200 MB at k=28): all parks + N=2 T2 match staging. + // Targets 6-8 GiB cards. + // minimal (~3700 MB at k=28): compact's parks + N=8 T2 match + // staging. Targets 4 GiB cards at + // the cost of extra PCIe round-trips + // during T2 match. + // Auto-pick takes the largest tier that fits with the margin. + // 128 MB margin above measured CUDA-context + driver overhead + // on headless cards. + // + // opts.streaming_tier (--tier CLI flag) > XCHPLOT2_STREAMING_TIER + // env var > auto. Forced plain/compact below their floor warn but + // proceed (caller's risk); forced minimal below its floor throws + // because there is no smaller tier to fall back to. + { + auto const mem = query_device_memory(); + size_t const plain_peak = streaming_plain_peak_bytes(pool_k); + size_t const compact_peak = streaming_peak_bytes(pool_k); + size_t const minimal_peak = streaming_minimal_peak_bytes(pool_k); + size_t const margin = 128ULL << 20; + auto to_gib = [](size_t b) { return b / double(1ULL << 30); }; + + char const* tier_env = std::getenv("XCHPLOT2_STREAMING_TIER"); + std::string const tier_pref = + !opts.streaming_tier.empty() ? opts.streaming_tier : + (tier_env ? std::string(tier_env) : std::string()); + + enum class Tier { Plain, Compact, Minimal }; + Tier tier; + if (tier_pref == "plain") { + tier = Tier::Plain; + } else if (tier_pref == "compact") { + tier = Tier::Compact; + } else if (tier_pref == "minimal") { + tier = Tier::Minimal; + } else { + // Auto: pick the largest tier that fits with margin. + tier = (mem.free_bytes >= plain_peak + margin) ? Tier::Plain : + (mem.free_bytes >= compact_peak + margin) ? Tier::Compact : + Tier::Minimal; + } + + auto tier_name = [](Tier t) -> char const* { + return t == Tier::Plain ? "plain" + : t == Tier::Compact ? "compact" + : "minimal"; + }; + size_t const required = + tier == Tier::Plain ? plain_peak : + tier == Tier::Compact ? compact_peak : + minimal_peak; + + // Minimal is the open-ended fallback — if even minimal won't + // fit, throw. Forced higher tier below its floor warns and + // proceeds (caller asked). 
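// Worked example of the tier auto-pick and floor checks in this hunk,
// not part of the patch. The numbers are the k=28 anchors quoted above
// (plain ~7290 MB, compact ~5200 MB, minimal ~3700 MB) plus the 128 MB
// margin, and "free" means whatever query_device_memory() reports:
//   12288 MB free: 12288 >= 7290+128 = 7418              -> plain
//    6144 MB free:  6144 <  7418, 6144 >= 5200+128 = 5328 -> compact
//    4096 MB free:  4096 <  5328                          -> minimal
//                   (4096 >= 3700+128 = 3828, so it proceeds; below
//                    3828 MB the minimal floor check throws instead)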
+ if (tier == Tier::Minimal && mem.free_bytes < required + margin) { + InsufficientVramError se( + "[batch] streaming pipeline needs ~" + + std::to_string(to_gib(required + margin)).substr(0, 5) + + " GiB peak for k=" + std::to_string(pool_k) + + " (minimal tier, the smallest available), device reports " + + std::to_string(to_gib(mem.free_bytes)).substr(0, 5) + + " GiB free of " + + std::to_string(to_gib(mem.total_bytes)).substr(0, 5) + + " GiB total. Use a smaller k or a larger GPU " + "(or --cpu for pos2-chip CPU plotting)."); + se.required_bytes = required + margin; + se.free_bytes = mem.free_bytes; + se.total_bytes = mem.total_bytes; + throw se; + } + if (tier != Tier::Minimal && mem.free_bytes < required + margin) { + std::fprintf(stderr, + "[batch] streaming tier: %s forced (%.2f GiB free < %.2f GiB " + "%s floor) — proceeding, may OOM mid-plot\n", + tier_name(tier), + to_gib(mem.free_bytes), + to_gib(required + margin), + tier_name(tier)); + } + + stream_scratch.plain_mode = (tier == Tier::Plain); + if (tier == Tier::Minimal) { + stream_scratch.t2_tile_count = 8; + stream_scratch.gather_tile_count = 4; + } + + std::fprintf(stderr, + "[batch] streaming tier: %s " + "(%.2f GiB free, %.2f GiB peak, %.2f GiB plain floor)\n", + tier_name(tier), + to_gib(mem.free_bytes), + to_gib(required), + to_gib(plain_peak + margin)); + } + // Size the pinned buffers using the same cap formula as the pool. + int const num_section_bits = (pool_k < 28) ? 2 : (pool_k - 26); + int const extra_margin_bits = 8 - ((28 - pool_k) / 2); + uint64_t const per_section = + (1ULL << (pool_k - num_section_bits)) + + (1ULL << (pool_k - extra_margin_bits)); + uint64_t const cap = per_section * (1ULL << num_section_bits); + stream_pinned_cap = size_t(cap); + bool any_fail = false; + for (int s = 0; s < GpuBufferPool::kNumPinnedBuffers; ++s) { + stream_pinned[s] = streaming_alloc_pinned_uint64(stream_pinned_cap); + if (!stream_pinned[s]) { any_fail = true; break; } + } + if (any_fail) { + for (int s = 0; s < GpuBufferPool::kNumPinnedBuffers; ++s) { + if (stream_pinned[s]) streaming_free_pinned_uint64(stream_pinned[s]); + } + throw std::runtime_error( + "[batch] streaming-fallback: pinned D2H buffer allocation failed"); + } + + // Stage 4f (compact tier only): amortise streaming-path + // pinned-host scratch across all plots in the batch. Lifetime + // analysis (see StreamingPinnedScratch doc) lets four shared + // buffers cover all six internal park/staging roles. At k=28: + // h_meta 2080 MB + h_keys_merged 1040 MB + h_t2_xbits 1040 MB + // + h_t3 2080 MB = ~6.24 GB of pinned host, paid ONCE for the + // whole batch. + // + // Plain tier does not park anything, so these pinned-host + // scratch buffers are not needed. 
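// ---------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the pinned-buffer cap
// formula a few lines up, evaluated at k = 28 to show where the ~2.2 GB
// per-pinned-slot figure quoted in GpuBufferPool.hpp comes from. Pure
// host arithmetic; no GPU code or project headers involved.

#include <cstdint>
#include <cstdio>

int main() {
    int const k = 28;
    int const num_section_bits = (k < 28) ? 2 : (k - 26);   // 2 at k=28
    int const extra_margin_bits = 8 - ((28 - k) / 2);       // 8 at k=28
    std::uint64_t const per_section =
        (1ULL << (k - num_section_bits)) + (1ULL << (k - extra_margin_bits));
    std::uint64_t const cap = per_section * (1ULL << num_section_bits);
    // per_section = 2^26 + 2^20 = 68,157,440; cap = 272,629,760 entries;
    // cap * 8 B = 2,181,038,080 bytes ≈ 2.18 GB (≈ 2.03 GiB) per slot.
    std::printf("cap=%llu entries, %llu bytes per pinned uint64 slot\n",
                (unsigned long long)cap,
                (unsigned long long)(cap * sizeof(std::uint64_t)));
}
// ---------------------------------------------------------------------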
+ if (!stream_scratch.plain_mode) { + stream_scratch.h_meta = streaming_alloc_pinned_uint64(stream_pinned_cap); + stream_scratch.h_keys_merged = streaming_alloc_pinned_uint32(stream_pinned_cap); + stream_scratch.h_t2_xbits = streaming_alloc_pinned_uint32(stream_pinned_cap); + stream_scratch.h_t3 = streaming_alloc_pinned_uint64(stream_pinned_cap); + if (!stream_scratch.h_meta || !stream_scratch.h_keys_merged || + !stream_scratch.h_t2_xbits || !stream_scratch.h_t3) + { + if (stream_scratch.h_meta) streaming_free_pinned_uint64(stream_scratch.h_meta); + if (stream_scratch.h_keys_merged) streaming_free_pinned_uint32(stream_scratch.h_keys_merged); + if (stream_scratch.h_t2_xbits) streaming_free_pinned_uint32(stream_scratch.h_t2_xbits); + if (stream_scratch.h_t3) streaming_free_pinned_uint64(stream_scratch.h_t3); + for (int s = 0; s < GpuBufferPool::kNumPinnedBuffers; ++s) { + if (stream_pinned[s]) streaming_free_pinned_uint64(stream_pinned[s]); + } + throw std::runtime_error( + "[batch] streaming-fallback: pinned-host scratch allocation failed"); + } + } + } + if (verbose && pool_ptr) { double gb = 1.0 / (1024.0 * 1024.0 * 1024.0); std::fprintf(stderr, "[batch] pool: storage=%.2f GB pair_a=%.2f GB pair_b=%.2f GB " "sort_scratch=%.2f GB pinned=2x%.2f GB " "(Xs scratch aliased in pair_b)\n", - pool.storage_bytes * gb, - pool.pair_bytes * gb, - pool.pair_bytes * gb, - pool.sort_scratch_bytes * gb, - pool.pinned_bytes * gb); + pool_ptr->storage_bytes * gb, + pool_ptr->pair_a_bytes * gb, + pool_ptr->pair_b_bytes * gb, + pool_ptr->sort_scratch_bytes * gb, + pool_ptr->pinned_bytes * gb); } - Channel chan; + // Depth = kNumPinnedBuffers - 1. See Channel's comment block above. + Channel chan(static_cast(GpuBufferPool::kNumPinnedBuffers - 1)); std::atomic consumer_failed{false}; std::atomic plots_done{0}; std::exception_ptr consumer_err; auto t_start = std::chrono::steady_clock::now(); + std::atomic plots_failed_consumer{0}; + // Consumer: takes finished GpuPipelineResults and writes plot files. + // Under continue_on_error, per-plot exceptions (e.g. ENOSPC for a + // specific plot) are logged and the loop continues rather than + // tearing down the batch. The .partial + rename in + // write_plot_file_parallel guarantees failed writes leave nothing + // behind at the destination. std::thread consumer([&] { try { WorkItem item; while (chan.pop(item)) { - std::filesystem::create_directories(item.entry.out_dir); auto full_path = std::filesystem::path(item.entry.out_dir) / item.entry.out_name; - - std::vector memo_bytes = item.entry.memo; - if (memo_bytes.empty()) memo_bytes.assign(32 + 48 + 32, 0); - - // Fragments are borrowed from the pool's pinned slot; the - // producer is synchronised via the depth-1 channel so that - // slot won't be reused until we're done here. - write_plot_file_parallel( - full_path.string(), - item.result.fragments(), - item.entry.plot_id.data(), - static_cast(item.entry.k), - static_cast(item.entry.strength), - item.entry.testnet ? 
uint8_t{1} : uint8_t{0}, - static_cast(item.entry.plot_index), - static_cast(item.entry.meta_group), - std::span(memo_bytes.data(), memo_bytes.size())); - - ++plots_done; - if (verbose) { - std::fprintf(stderr, "[batch] consumer wrote plot %zu: %s\n", - item.index, full_path.string().c_str()); + try { + std::filesystem::create_directories(item.entry.out_dir); + + std::vector memo_bytes = item.entry.memo; + if (memo_bytes.empty()) memo_bytes.assign(32 + 48 + 32, 0); + + // Fragments are borrowed from the pool's pinned slot; the + // producer is synchronised via the depth-1 channel so that + // slot won't be reused until we're done here. + write_plot_file_parallel( + full_path.string(), + item.result.fragments(), + item.entry.plot_id.data(), + static_cast(item.entry.k), + static_cast(item.entry.strength), + item.entry.testnet ? uint8_t{1} : uint8_t{0}, + static_cast(item.entry.plot_index), + static_cast(item.entry.meta_group), + std::span(memo_bytes.data(), memo_bytes.size())); + + ++plots_done; + if (verbose) { + std::fprintf(stderr, "[batch] consumer wrote plot %zu: %s\n", + item.index, full_path.string().c_str()); + } + } catch (std::exception const& e) { + if (!opts.continue_on_error) throw; + ++plots_failed_consumer; + std::fprintf(stderr, + "[batch] plot %zu FAILED (write %s): %s — continuing\n", + item.index, full_path.string().c_str(), e.what()); } } } catch (...) { @@ -220,11 +618,44 @@ BatchResult run_batch(std::vector const& entries, bool verbose) } }); + size_t producer_failed = 0; + // Producer (this thread): drives the GPU pipeline, hands off to consumer. + // local_count rotates this worker's own pinned-buffer slots (channel + // depth = kNumPinnedBuffers); it must NOT use the global plot index + // when shared_idx is in play, because peer workers also hold slots in + // their own pools. try { - for (size_t i = 0; i < entries.size(); ++i) { + std::size_t local_idx = 0; + std::size_t local_count = 0; + while (true) { if (consumer_failed) break; + std::size_t const i = shared_idx + ? shared_idx->fetch_add(1, std::memory_order_relaxed) + : local_idx++; + if (i >= entries.size()) break; + + if (cancel_requested()) { + std::fprintf(stderr, + "[batch] cancel received — stopping before plot %zu\n", i); + break; + } + + if (opts.skip_existing) { + auto out_path = std::filesystem::path(entries[i].out_dir) + / entries[i].out_name; + if (looks_like_complete_plot(out_path)) { + if (verbose) { + std::fprintf(stderr, + "[batch] skipping plot %zu: %s (already exists)\n", + i, out_path.string().c_str()); + } + ++res.plots_skipped; + continue; + } + } + auto t_plot = std::chrono::steady_clock::now(); GpuPipelineConfig cfg; @@ -237,9 +668,29 @@ BatchResult run_batch(std::vector const& entries, bool verbose) WorkItem item; item.entry = entries[i]; item.index = i; - // Alternate pinned buffer per plot so the current D2H doesn't - // clobber pinned data the consumer is still reading. - item.result = run_gpu_pipeline(cfg, pool, static_cast(i % 2)); + int const slot = static_cast( + local_count % GpuBufferPool::kNumPinnedBuffers); + try { + if (pool_ptr) { + // Pool path: rotate pinned slot per plot. The channel's + // (kNumPinnedBuffers - 1) depth holds the producer back + // before it overtakes the consumer's read of that slot. + item.result = run_gpu_pipeline(cfg, *pool_ptr, slot); + } else { + // Streaming path with externally-owned pinned: same + // rotation + channel-depth invariant. 
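// Worked example of the local_count note above, not part of the patch:
// with kNumPinnedBuffers == 3 and two workers racing on the shared
// queue, one worker can happen to win plots 2, 5, 8, ... — as global
// indices those all map to slot 2 % 3 == 5 % 3 == 8 % 3 == 2, so that
// worker's consecutive plots would keep landing in the same pinned slot
// and defeat the rotation the channel depth relies on. Counting the
// worker's own plots locally (0, 1, 2, 3, ...) restores the intended
// 0 -> 1 -> 2 -> 0 slot rotation regardless of which global indices it
// wins.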
+ item.result = run_gpu_pipeline_streaming( + cfg, stream_pinned[slot], stream_pinned_cap, + stream_scratch); + } + } catch (std::exception const& e) { + if (!opts.continue_on_error) throw; + ++producer_failed; + std::fprintf(stderr, + "[batch] plot %zu FAILED (GPU): %s — continuing\n", + i, e.what()); + continue; + } if (verbose) { auto ms = std::chrono::duration( @@ -254,6 +705,7 @@ BatchResult run_batch(std::vector const& entries, bool verbose) } chan.push(std::move(item)); + ++local_count; } } catch (...) { chan.close(); @@ -266,10 +718,140 @@ BatchResult run_batch(std::vector const& entries, bool verbose) if (consumer_failed && consumer_err) std::rethrow_exception(consumer_err); + for (int s = 0; s < GpuBufferPool::kNumPinnedBuffers; ++s) { + streaming_free_pinned_uint64(stream_pinned[s]); + } + // Stage 4f: free the amortised streaming scratch (no-op if pool path + // was used — all fields stay nullptr in that case). + if (stream_scratch.h_meta) streaming_free_pinned_uint64(stream_scratch.h_meta); + if (stream_scratch.h_keys_merged) streaming_free_pinned_uint32(stream_scratch.h_keys_merged); + if (stream_scratch.h_t2_xbits) streaming_free_pinned_uint32(stream_scratch.h_t2_xbits); + if (stream_scratch.h_t3) streaming_free_pinned_uint64(stream_scratch.h_t3); + res.plots_written = plots_done.load(); + res.plots_failed = producer_failed + plots_failed_consumer.load(); res.total_wall_seconds = std::chrono::duration( std::chrono::steady_clock::now() - t_start).count(); return res; } +} // namespace + +BatchResult run_batch(std::vector const& entries, + BatchOptions const& opts) +{ + if (entries.empty()) return BatchResult{}; + + // Homogeneity check (all entries must share k/strength/testnet) — + // runs once on the full list before any per-worker dispatch so both + // the single- and multi-device paths share the same error surface. + int const pool_k = entries[0].k; + int const pool_strength = entries[0].strength; + bool const pool_testnet = entries[0].testnet; + for (size_t i = 1; i < entries.size(); ++i) { + if (entries[i].k != pool_k + || entries[i].strength != pool_strength + || entries[i].testnet != pool_testnet) + { + throw std::runtime_error( + "run_batch: all entries must share (k, strength, testnet)"); + } + } + + preflight_disk_space(entries, opts); + + // Resolve the target device list: + // use_all_devices → enumerate at runtime, one worker per GPU + // device_ids → use these explicit ids + // (neither) → empty list → single-device default selector + // include_cpu → orthogonal: also append kCpuDeviceId so the + // CPU runs as one more worker. Mixes with the + // above (--cpu alone → CPU only; --cpu --devices + // all → all GPUs + CPU; etc.). + std::vector device_ids; + if (opts.use_all_devices) { + int const n = gpu_device_count(); + if (n <= 0) { + std::fprintf(stderr, + "[batch] --devices all: runtime enumerated 0 GPUs — " + "falling back to the default SYCL selector\n"); + } else { + device_ids.reserve(static_cast(n)); + for (int i = 0; i < n; ++i) device_ids.push_back(i); + } + } else if (!opts.device_ids.empty()) { + device_ids = opts.device_ids; + } + if (opts.include_cpu && + std::find(device_ids.begin(), device_ids.end(), kCpuDeviceId) + == device_ids.end()) { + device_ids.push_back(kCpuDeviceId); + } + + auto const t_start = std::chrono::steady_clock::now(); + + // Fast path: zero-config default or one explicit id. 
Runs on the + // caller thread — identical control flow to pre-multi-GPU except + // for the optional thread-local device bind at the top of the + // slice. + if (device_ids.size() <= 1) { + int const dev = device_ids.empty() ? -1 : device_ids[0]; + BatchResult r = run_batch_slice(entries, opts, dev, -1); + r.total_wall_seconds = std::chrono::duration( + std::chrono::steady_clock::now() - t_start).count(); + return r; + } + + // Multi-device: workers race to pull plots from a single shared + // queue (atomic counter into `entries`) so a fast GPU keeps pulling + // work while a slow CPU only handles what it can finish in the same + // wall. Each worker still constructs its own GpuBufferPool / + // producer-consumer channel / writer thread on its target device — + // zero cross-worker shared state beyond `next_idx`, stderr, and + // the filesystem. + size_t const N = device_ids.size(); + std::fprintf(stderr, + "[batch] multi-device: %zu plots across %zu workers (work-queue) — devices:", + entries.size(), N); + for (size_t i = 0; i < N; ++i) { + std::fprintf(stderr, " %d", device_ids[i]); + } + std::fprintf(stderr, "\n"); + + std::atomic next_idx{0}; + std::vector per_worker(N); + std::vector per_worker_exc(N); + std::vector workers; + workers.reserve(N); + for (size_t i = 0; i < N; ++i) { + workers.emplace_back([&, i]() { + try { + per_worker[i] = run_batch_slice( + entries, opts, device_ids[i], + static_cast(i), &next_idx); + } catch (...) { + per_worker_exc[i] = std::current_exception(); + } + }); + } + for (auto& t : workers) t.join(); + + // Propagate the first worker exception after every worker has + // joined — prevents a fast failure from leaving peer workers still + // running and printing to a half-torn-down pipeline. + for (auto& ep : per_worker_exc) { + if (ep) std::rethrow_exception(ep); + } + + BatchResult agg; + for (auto const& r : per_worker) { + agg.plots_written += r.plots_written; + agg.plots_skipped += r.plots_skipped; + agg.plots_failed += r.plots_failed; + } + agg.total_wall_seconds = std::chrono::duration( + std::chrono::steady_clock::now() - t_start).count(); + return agg; +} + } // namespace pos2gpu diff --git a/src/host/BatchPlotter.hpp b/src/host/BatchPlotter.hpp index 2c1423e..e9b7c37 100644 --- a/src/host/BatchPlotter.hpp +++ b/src/host/BatchPlotter.hpp @@ -32,15 +32,75 @@ struct BatchEntry { struct BatchResult { size_t plots_written = 0; + size_t plots_skipped = 0; // present + skipped via BatchOptions::skip_existing + size_t plots_failed = 0; // raised an exception under BatchOptions::continue_on_error double total_wall_seconds = 0.0; }; +// Options controlling batch behavior. +// verbose — per-plot progress on stderr +// skip_existing — if an output .plot2 already exists (and passes a +// lightweight magic/size check), skip the plot +// instead of overwriting it +// continue_on_error — catch per-plot exceptions and log rather than +// aborting the batch; plots_failed in the result +// counts how many skipped this way +// device_ids — explicit list of GPU device ids to use. When empty +// and use_all_devices is false, run on a single +// device picked by the default SYCL gpu_selector_v +// (zero-configuration, pre-multi-GPU behavior). +// With multiple ids, the batch is partitioned +// across workers — one thread per device, each +// with its own GpuBufferPool and producer/consumer +// channel. Plots are assigned round-robin +// (entry i → worker i % N). +// use_all_devices — enumerate all SYCL GPU devices at runtime and +// use them. Overrides device_ids. 
Useful when the +// caller doesn't know the host's device count up +// front (e.g. `--devices all` on the CLI). +// include_cpu — append the CPU as a worker device alongside any +// GPUs already selected. Set by `--cpu` (orthogonal +// to --devices) or by passing `cpu` as a token in +// --devices. CPU is encoded as kCpuDeviceId (-2) in +// device_ids — see src/gpu/DeviceIds.hpp. Plotting +// on CPU is 1-2 orders of magnitude slower than on +// GPU; this is meant for headless CI / GPU-less +// hosts / heterogeneous device-list mixing. +// streaming_tier — optional manual override for the streaming +// pipeline tier (when the GPU pool doesn't fit). +// Accepted values: "plain" (~7.24 GB floor at k=28, +// ~10-15% faster), "compact" (~5.33 GB floor, fits +// on tight 8 GB cards). Empty string = auto (the +// pre-existing behavior: pick plain if it fits, +// else compact). Equivalent to XCHPLOT2_STREAMING_TIER +// env var but settable via --tier on the CLI; the +// struct field takes precedence over the env var. +struct BatchOptions { + bool verbose = false; + bool skip_existing = false; + bool continue_on_error = false; + std::vector device_ids; + bool use_all_devices = false; + bool include_cpu = false; + std::string streaming_tier; +}; + // Parse a manifest file in the format described in tools/xchplot2/main.cpp // (tab-separated, one plot per line). Throws std::runtime_error on bad input. std::vector parse_manifest(std::string const& path); // Run the staggered pipeline. Producer/consumer share a queue of depth 1. // The first plot pays the full GPU+FSE cost; subsequent plots overlap. -BatchResult run_batch(std::vector const& entries, bool verbose = false); +BatchResult run_batch(std::vector const& entries, + BatchOptions const& opts); + +// Legacy bool-verbose shim kept for source-compat with older callsites. +inline BatchResult run_batch(std::vector const& entries, + bool verbose = false) +{ + BatchOptions opts; + opts.verbose = verbose; + return run_batch(entries, opts); +} } // namespace pos2gpu diff --git a/src/host/Cancel.cpp b/src/host/Cancel.cpp new file mode 100644 index 0000000..7ba7fd6 --- /dev/null +++ b/src/host/Cancel.cpp @@ -0,0 +1,68 @@ +// Cancel.cpp — implementation of the SIGINT/SIGTERM cancel flag. + +#include "host/Cancel.hpp" + +#include + +#if defined(__unix__) || defined(__APPLE__) +# include // write(2) +#endif + +namespace pos2gpu { + +namespace { + +// sig_atomic_t is the one type C/C++ guarantee is safe to read/write from +// a signal handler without synchronization concerns. The count lets us +// turn the second same-signal receipt into a hard kill, so a user whose +// cooperative shutdown is stuck can still escape with a second Ctrl-C. +volatile std::sig_atomic_t g_cancel_count = 0; + +void write_stderr_safe(char const* msg, std::size_t len) noexcept +{ +#if defined(__unix__) || defined(__APPLE__) + // write(2) is async-signal-safe; std::fprintf is not. + ssize_t const rc = ::write(2, msg, len); + (void)rc; // nothing useful to do if stderr is gone +#else + (void)msg; + (void)len; +#endif +} + +extern "C" void cancel_handler(int sig) noexcept +{ + // On the second receipt, restore the default disposition and re-raise + // so the process dies immediately. Prevents a hung plotter from + // needing kill -9 when the user insists. + if (g_cancel_count >= 1) { + std::signal(sig, SIG_DFL); + std::raise(sig); + return; + } + g_cancel_count = 1; + static char const msg[] = + "\n[xchplot2] cancel requested — finishing current plot then " + "stopping. 
Press Ctrl-C again to abort immediately.\n"; + write_stderr_safe(msg, sizeof(msg) - 1); +} + +} // namespace + +void install_cancel_signal_handlers() +{ + std::signal(SIGINT, cancel_handler); + std::signal(SIGTERM, cancel_handler); +} + +bool cancel_requested() noexcept +{ + return g_cancel_count > 0; +} + +void reset_cancel_for_tests() noexcept +{ + g_cancel_count = 0; +} + +} // namespace pos2gpu diff --git a/src/host/Cancel.hpp b/src/host/Cancel.hpp new file mode 100644 index 0000000..cc4138e --- /dev/null +++ b/src/host/Cancel.hpp @@ -0,0 +1,26 @@ +// Cancel.hpp — SIGINT/SIGTERM handling for long-running batches. +// +// install_cancel_signal_handlers() installs handlers that set an +// async-signal-safe flag on first receipt and restore the default +// disposition on second receipt (so double-Ctrl-C kills hard). +// +// cancel_requested() is cheap enough to call from tight loops. + +#pragma once + +namespace pos2gpu { + +// Install SIGINT + SIGTERM handlers. Idempotent — safe to call more than +// once. First signal sets the cancel flag and prints a one-line notice +// via write(2) (async-signal-safe). Second signal of the same type +// re-raises with the default disposition, terminating the process. +void install_cancel_signal_handlers(); + +// True if a cancelling signal has been received since program start +// (or since reset_cancel_for_tests()). +bool cancel_requested() noexcept; + +// Testing hook — clear the flag. Not intended for production code. +void reset_cancel_for_tests() noexcept; + +} // namespace pos2gpu diff --git a/src/host/CpuPlotter.cpp b/src/host/CpuPlotter.cpp new file mode 100644 index 0000000..1e83e09 --- /dev/null +++ b/src/host/CpuPlotter.cpp @@ -0,0 +1,72 @@ +// CpuPlotter.cpp — wraps pos2-chip's Plotter + PlotFile::writeData. +// +// Isolated to one TU because pos2-chip's Plotter.hpp pulls in the full +// table-construction template stack (Table1/2/3Constructor + RadixSort +// + ChunkCompressor + ...). Including that header anywhere else in the +// build would balloon compile times for no benefit — only this TU +// actually invokes Plotter::run(). + +#include "host/CpuPlotter.hpp" +#include "host/BatchPlotter.hpp" // for BatchEntry / BatchOptions + +// pos2-chip headers — header-only, no separate compilation needed. +// pos2_chip_headers (PUBLIC dep of pos2_gpu_host) provides the +// include path + fse link. +#include "plot/Plotter.hpp" +#include "plot/PlotFile.hpp" +#include "pos/ProofParams.hpp" + +#include +#include +#include +#include +#include +#include + +namespace pos2gpu { + +void run_one_plot_cpu(BatchEntry const& entry, BatchOptions const& opts) +{ + // Build pos2-chip's ProofParams from BatchEntry's existing fields. + // ProofParams is in the global namespace (pos2-chip doesn't wrap + // its public types in a namespace). + ::ProofParams params(entry.plot_id.data(), + static_cast(entry.k), + static_cast(entry.strength), + static_cast(entry.testnet ? 1 : 0)); + + ::Plotter::Options pl_opts; + pl_opts.verbose = opts.verbose; + + ::Plotter plotter(params); + ::PlotData plot = plotter.run(pl_opts); + + // pos2-chip's PlotFile::writeData accepts the memo as a span and + // writes a 1-byte length prefix on disk, so any size in [0, 255] + // is valid. 
keygen-rs emits two layouts: + // - pool-PH mode: 32-byte pool_ph + 48-byte farmer_pk + 32-byte + // master_sk = 112 bytes + // - pool-PK mode: 48-byte pool_pk + 48-byte farmer_pk + 32-byte + // master_sk = 128 bytes + // BatchEntry.memo already holds the bytes in the on-disk layout, so + // pass them through as a span. The previous strict 112-byte check + // rejected pool-PK plots produced via `xchplot2 plot -p ...`. + if (entry.memo.size() > 255) { + throw std::runtime_error( + "CpuPlotter: memo size " + std::to_string(entry.memo.size()) + + " exceeds the 255-byte on-disk limit"); + } + + std::filesystem::path const out_path = + std::filesystem::path(entry.out_dir) / entry.out_name; + + ::PlotFile::writeData(out_path.string(), + plot, + params, + static_cast(entry.plot_index), + static_cast(entry.meta_group), + std::span(entry.memo.data(), + entry.memo.size())); +} + +} // namespace pos2gpu diff --git a/src/host/CpuPlotter.hpp b/src/host/CpuPlotter.hpp new file mode 100644 index 0000000..796034a --- /dev/null +++ b/src/host/CpuPlotter.hpp @@ -0,0 +1,28 @@ +// CpuPlotter.hpp — single-plot CPU pipeline using pos2-chip's Plotter +// directly (no SYCL / no GPU code path involved). +// +// Format-compatible with the GPU output: same plot_id derivation, same +// .plot2 file layout, byte-identical proofs. pos2-chip is the upstream +// PoS2 reference implementation, already in our build tree via +// FetchContent (third_party/pos2-chip), so we link its CPU plotter +// directly rather than routing SYCL kernels through AdaptiveCpp's +// OpenMP backend. +// +// Single-threaded internally (the Plotter constructs T1/T2/T3 in +// sequence). Multi-core utilization comes from BatchPlotter spawning +// one of these per `cpu` token in --devices, e.g. `--devices cpu,cpu` +// runs two concurrent plots on two cores. +// +// Throws std::runtime_error on plotting failure (caller decides +// whether to continue under continue_on_error). + +#pragma once + +namespace pos2gpu { + +struct BatchEntry; +struct BatchOptions; + +void run_one_plot_cpu(BatchEntry const& entry, BatchOptions const& opts); + +} // namespace pos2gpu diff --git a/src/host/GpuBufferPool.cpp b/src/host/GpuBufferPool.cpp new file mode 100644 index 0000000..d35fd53 --- /dev/null +++ b/src/host/GpuBufferPool.cpp @@ -0,0 +1,478 @@ +// GpuBufferPool.cu — queries per-phase scratch sizes once and allocates +// worst-case-sized persistent buffers. Slice 13 migrated the device and +// pinned-host allocations from the cudaMalloc / cudaMallocHost family to +// sycl::malloc_device / sycl::malloc_host on the shared SYCL queue; +// cudaMemGetInfo is left as-is because it's a context-level query that +// works regardless of which runtime is doing the allocations (SYCL + +// CUDA host code share the same primary CUDA context). + +#include "host/GpuBufferPool.hpp" +#include "gpu/Sort.cuh" +#include "gpu/SyclBackend.hpp" +#include "host/PoolSizing.hpp" + +#include "gpu/XsKernel.cuh" +#include "gpu/T1Kernel.cuh" +#include "gpu/T2Kernel.cuh" +#include "gpu/T3Kernel.cuh" + +#include + +#include +#include +#include +#include +#include +#include + +namespace pos2gpu { + +namespace { + + +// Allocate `bytes` of device memory on `q` and check for null. The cap-and- +// throw helpers in GpuPipeline.cu are streaming-pipeline specific; the pool +// just allocates worst-case sizes once at construction so a one-line wrap +// suffices. +// Format a byte count as " bytes ( MB)" for diagnostics. 
The +// raw byte count surfaces sub-MiB requests that would otherwise round +// to "0 MB"; the MB form keeps human readability for the > 1 MiB case. +inline std::string fmt_alloc_bytes(size_t bytes) +{ + char buf[64]; + std::snprintf(buf, sizeof(buf), "%zu bytes (%.2f MB)", + bytes, double(bytes) / (1024.0 * 1024.0)); + return std::string(buf); +} + +// AdaptiveCpp's CUDA allocator throws sycl::exception on cudaMalloc +// failure (e.g. "cuda_allocator: cudaMalloc() failed (error code = +// CUDA:2)" for cudaErrorMemoryAllocation). Older / non-CUDA backends +// may instead return nullptr. Cover both paths with one diagnostic +// shape so callers see "sycl::malloc_device(d_pair_a, 4690 MB) failed: +// " regardless of which branch fired. This also catches +// the throw synchronously so the async error handler doesn't log the +// same CUDA error a second time after caller cleanup. +inline void* sycl_alloc_device_or_throw(size_t bytes, sycl::queue& q, + char const* what) +{ + void* p = nullptr; + try { + p = sycl::malloc_device(bytes, q); + } catch (sycl::exception const& e) { + throw std::runtime_error( + std::string("sycl::malloc_device(") + what + ", " + + fmt_alloc_bytes(bytes) + ") failed: " + e.what() + + ". Likely transient OOM — check `nvidia-smi` for other GPU " + "consumers, or set POS2GPU_MAX_VRAM_MB lower if VRAM is " + "shared with display/compositor."); + } + if (!p) { + throw std::runtime_error( + std::string("sycl::malloc_device(") + what + ", " + + fmt_alloc_bytes(bytes) + ") returned null (out of device " + "memory). Likely transient OOM — check `nvidia-smi` for " + "other GPU consumers, or set POS2GPU_MAX_VRAM_MB lower if " + "VRAM is shared with display/compositor."); + } + return p; +} + +inline void* sycl_alloc_host_or_throw(size_t bytes, sycl::queue& q, + char const* what) +{ + void* p = nullptr; + try { + p = sycl::malloc_host(bytes, q); + } catch (sycl::exception const& e) { + throw std::runtime_error( + std::string("sycl::malloc_host(") + what + ", " + + fmt_alloc_bytes(bytes) + ") failed: " + e.what()); + } + if (!p) { + throw std::runtime_error( + std::string("sycl::malloc_host(") + what + ", " + + fmt_alloc_bytes(bytes) + ") returned null (out of host pinned memory)"); + } + return p; +} + +} // namespace + +GpuBufferPool::GpuBufferPool(int k_, int strength_, bool testnet_) + : k(k_), strength(strength_), testnet(testnet_) +{ + sycl::queue& q = sycl_backend::queue(); + + int const num_section_bits = (k < 28) ? 2 : (k - 26); + total_xs = 1ULL << k; + cap = max_pairs_per_section(k, num_section_bits) * (1ULL << num_section_bits); + + // d_storage must hold EITHER total_xs XsCandidateGpu (8 B each) OR + // THREE cap-sized uint32 key/val arrays during sort. Only three, not + // four: the sort API signature takes a (keys_in, keys_out, vals_in, + // vals_out) quad, but pool-path callers always pass the SoA match-info + // stream (d_t1_mi / d_t2_mi, living in d_pair_a) as keys_in, so the + // keys_in slot inside d_storage was never read. Dropping it saves + // cap·4 B (~1.09 GiB at k=28) — enough to close the 0.71 GiB pool + // shortfall on 12 GiB cards. + storage_bytes = std::max( + static_cast(total_xs) * sizeof(XsCandidateGpu), + static_cast(cap) * 3 * sizeof(uint32_t)); + + // d_pair_a holds the *match output* of the current phase: T1 SoA + // (meta·8 B + mi·4 B = 12 B), T2 SoA (meta·8 B + mi·4 B + xbits·4 B = + // 16 B), then T3 (T3PairingGpu, 8 B). Worst case is T2 at 16 B/entry. + // It does NOT alias the Xs construction scratch — that's d_pair_b. 
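// Worked numbers for the d_storage sizing above, not part of the patch
// (k = 28, cap = 272,629,760 from the max_pairs_per_section formula):
//   Xs phase:    total_xs * 8 B = 2^28 * 8       = 2,147,483,648 B (~2.15 GB)
//   sort phases: cap * 3 * 4 B  = 272,629,760*12 = 3,271,557,120 B (~3.27 GB)
//   storage_bytes = max of the two ≈ 3.27 GB — the "~3.3 GB" quoted in
//   GpuBufferPool.hpp. The dropped keys_in slot would have added another
//   cap * 4 B = 1,090,519,040 B (~1.09 GB, ~1.02 GiB).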
+ pair_a_bytes = std::max({ + static_cast(cap) * sizeof(T1PairingGpu), + static_cast(cap) * sizeof(T2PairingGpu), + static_cast(cap) * sizeof(T3PairingGpu), + static_cast(cap) * sizeof(uint64_t), + }); + + // d_pair_b holds the *sort output* of the current phase (sorted T1 + // meta, sorted T2 meta+xbits, T3 frags) AND the Xs construction + // scratch. Sized to the max of those. + // + // Split-keys_a optimisation: the pool places the Xs sort's keys_a + // slot (total_xs·u32 = 1 GiB at k=28) in d_storage's tail — idle + // during Xs gen+sort, and the final pack phase only writes + // d_storage[0..total_xs·8), leaving the tail region undisturbed. + // This drops xs_temp_bytes from ~4.36 GB (4·N·u32 + cub) to + // ~3.22 GB (3·N·u32 + cub). At k=28 pair_b is then bounded by + // cap·12 (sorted T2 meta+xbits = 3.27 GB) rather than xs scratch, + // saving ~1.09 GB off the pool's peak VRAM requirement vs the + // pre-split layout. + uint8_t dummy_plot_id[32] = {}; + // Non-null sentinel tells launch_construct_xs to report the + // split-layout size. The sentinel value is read only in sizing + // mode (d_temp_storage == nullptr), where only its non-null-ness + // matters. + void* const xs_split_sentinel = reinterpret_cast(uintptr_t{1}); + launch_construct_xs(dummy_plot_id, k, testnet, + nullptr, nullptr, &xs_temp_bytes, q, + xs_split_sentinel); + pair_b_bytes = std::max({ + static_cast(cap) * sizeof(uint64_t), // sorted T1 meta + static_cast(cap) * (sizeof(uint64_t) + sizeof(uint32_t)), // sorted T2 meta+xbits + static_cast(cap) * sizeof(uint64_t), // T3 frags out + xs_temp_bytes, // Xs aliased scratch (3·N·u32 + cub) + }); + + // Query CUB sort scratch sizes (largest across T1/T2/T3 sorts). + size_t s_pairs = 0; + launch_sort_pairs_u32_u32( + nullptr, s_pairs, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + cap, 0, k, q); + size_t s_keys = 0; + launch_sort_keys_u64( + nullptr, s_keys, + static_cast(nullptr), static_cast(nullptr), + cap, 0, 2 * k, q); + sort_scratch_bytes = std::max(s_pairs, s_keys); + + pinned_bytes = cap * sizeof(uint64_t); + + // Check VRAM before attempting allocation so we can give a useful + // diagnostic instead of a generic allocation failure. The margin covers + // GPU driver/context state, sort scratch, AES T-tables, and other small + // runtime allocations. + // + // SYCL has no portable free-memory query, so slice 17c approximates + // free_b == total_b. The actual sycl::malloc_device call will throw if + // VRAM is exhausted; the diagnostic message is just less precise about + // how much of the total is already consumed by other processes. + { + size_t const required_device = + storage_bytes + pair_a_bytes + pair_b_bytes + sort_scratch_bytes + sizeof(uint64_t); + // Margin covers per-context driver state + AES T-tables + the + // tiny (sizeof(uint64_t)) d_counter alloc that's not counted in + // sort_scratch. Originally 512 MB (slice 17c); trimmed to 256 MB + // after measuring actual runtime overhead on gfx1031/ROCm 6.2 + // and sm_89/CUDA 13: both land under 150 MB of non-pool device + // allocations, so a 256 MB margin leaves >100 MB headroom while + // letting cards on the threshold (e.g. 12 GiB reporting ~11.8 + // GiB free at ctor time) now succeed into the pool path. 
+ size_t const margin = 256ULL * 1024 * 1024; // 256 MB + size_t const total_b = + q.get_device().get_info(); + size_t const free_b = total_b; // approximation — see comment above + if (free_b < required_device + margin) { + auto to_gib = [](size_t b) { return b / double(1ULL << 30); }; + InsufficientVramError e( + "GpuBufferPool: insufficient device VRAM for k=" + + std::to_string(k) + " strength=" + std::to_string(strength) + + "; need ~" + std::to_string(to_gib(required_device + margin)).substr(0, 5) + + " GiB (pool " + std::to_string(to_gib(required_device)).substr(0, 5) + + " GiB + ~0.25 GiB runtime), only " + + std::to_string(to_gib(free_b)).substr(0, 5) + + " GiB free of " + std::to_string(to_gib(total_b)).substr(0, 5) + + " GiB total. Use a smaller k or a GPU with more VRAM."); + e.required_bytes = required_device + margin; + e.free_bytes = free_b; + e.total_bytes = total_b; + throw e; + } + } + + if (getenv("POS2GPU_POOL_DEBUG")) { + size_t const total_b = + q.get_device().get_info(); + std::fprintf(stderr, + "[pool] k=%d strength=%d cap=%llu total_xs=%llu " + "total=%.2fGB (free unavailable in SYCL build)\n", + k, strength, (unsigned long long)cap, (unsigned long long)total_xs, + total_b/1e9); + std::fprintf(stderr, + "[pool] sizes: storage=%.2fGB pair_a=%.2fGB pair_b=%.2fGB " + "xs_temp(alias→pair_b)=%.2fGB sort_scratch=%.2fGB pinned=%.2fGB\n", + storage_bytes/1e9, pair_a_bytes/1e9, pair_b_bytes/1e9, + xs_temp_bytes/1e9, sort_scratch_bytes/1e9, pinned_bytes/1e9); + } + + // Wrap allocations so a mid-sequence failure (e.g. d_pair_b OOM after + // d_storage + d_pair_a have already succeeded) frees the pre-allocated + // buffers instead of leaking ~10 GB of device VRAM and ~7 GB of host + // pinned memory per failed pool ctor across a batch retry loop. + auto cleanup_partial = [&]{ + if (d_storage) { sycl::free(d_storage, q); d_storage = nullptr; } + if (d_pair_a) { sycl::free(d_pair_a, q); d_pair_a = nullptr; } + if (d_pair_b) { sycl::free(d_pair_b, q); d_pair_b = nullptr; } + if (d_sort_scratch) { sycl::free(d_sort_scratch, q); d_sort_scratch = nullptr; } + if (d_counter) { sycl::free(d_counter, q); d_counter = nullptr; } + for (int i = 0; i < kNumPinnedBuffers; ++i) { + if (h_pinned_t3[i]) { sycl::free(h_pinned_t3[i], q); h_pinned_t3[i] = nullptr; } + } + }; + try { + d_storage = sycl_alloc_device_or_throw(storage_bytes, q, "d_storage"); + // d_pair_a is allocated lazily in ensure_pair_a(), called by + // run_gpu_pipeline's pool path right after submitting Xs gen + // — the malloc_device then overlaps with Xs GPU execution. + // Saves ~400-500 ms on first-plot wall vs eager alloc; batch + // plots 2+ are unaffected (fast-path pointer lookup). + d_pair_b = sycl_alloc_device_or_throw(pair_b_bytes, q, "d_pair_b"); + d_sort_scratch = sycl_alloc_device_or_throw(sort_scratch_bytes, q, "d_sort_scratch"); + d_counter = static_cast( + sycl_alloc_device_or_throw(sizeof(uint64_t), q, "d_counter")); + // h_pinned_t3[] is allocated lazily in ensure_pinned(); see + // the header comment for why. Single-plot runs only ever + // touch slot 0 so the other two 2.2 GB malloc_host calls + // aren't paid at all. + } catch (...) 
{ + cleanup_partial(); + throw; + } +} + +void* GpuBufferPool::ensure_pair_a() +{ + if (d_pair_a) return d_pair_a; + std::lock_guard lk(pair_a_mu_); + if (d_pair_a) return d_pair_a; + sycl::queue& q = sycl_backend::queue(); + d_pair_a = sycl_alloc_device_or_throw(pair_a_bytes, q, "d_pair_a"); + return d_pair_a; +} + +void GpuBufferPool::release_pair_a() +{ + std::lock_guard lk(pair_a_mu_); + if (!d_pair_a) return; + sycl::free(d_pair_a, sycl_backend::queue()); + d_pair_a = nullptr; +} + +uint64_t* GpuBufferPool::ensure_pinned(int idx) +{ + if (idx < 0 || idx >= kNumPinnedBuffers) { + throw std::runtime_error("GpuBufferPool::ensure_pinned: idx out of range"); + } + // Double-checked locking: fast path skips the mutex once the + // slot's pointer is visible. Writes inside the mutex are + // release-ordered w.r.t. the mutex release; the unlocked read + // on the fast path is an acquire (relaxed access is fine here + // because x86 and arm64 give us acquire ordering for aligned + // pointer reads; if this ever needs to be portable to weaker + // architectures, make h_pinned_t3 std::atomic[]). + if (h_pinned_t3[idx]) return h_pinned_t3[idx]; + std::lock_guard lk(pinned_mu_[idx]); + if (h_pinned_t3[idx]) return h_pinned_t3[idx]; + sycl::queue& q = sycl_backend::queue(); + h_pinned_t3[idx] = static_cast( + sycl_alloc_host_or_throw(pinned_bytes, q, "h_pinned_t3")); + return h_pinned_t3[idx]; +} + +GpuBufferPool::~GpuBufferPool() +{ + sycl::queue& q = sycl_backend::queue(); + if (d_storage) sycl::free(d_storage, q); + if (d_pair_a) sycl::free(d_pair_a, q); + if (d_pair_b) sycl::free(d_pair_b, q); + if (d_sort_scratch) sycl::free(d_sort_scratch, q); + if (d_counter) sycl::free(d_counter, q); + for (int i = 0; i < kNumPinnedBuffers; ++i) { + if (h_pinned_t3[i]) sycl::free(h_pinned_t3[i], q); + } +} + +DeviceMemInfo query_device_memory() +{ + sycl::queue& q = sycl_backend::queue(); + DeviceMemInfo info; + info.total_bytes = + q.get_device().get_info(); + // SYCL has no portable free-memory query; AdaptiveCpp's + // global_mem_size returns the device total. On the CUDA backend + // the underlying driver often subtracts active reservations + // (framebuffer, compositor) before reporting, which gets us + // closer to "free" in practice. Treat the result as an upper + // bound; sycl::malloc_device is still the source of truth. + info.free_bytes = info.total_bytes; + + if (char const* v = std::getenv("POS2GPU_MAX_VRAM_MB"); v && v[0]) { + size_t const cap = size_t(std::strtoull(v, nullptr, 10)) * (1ULL << 20); + info.free_bytes = std::min(info.free_bytes, cap); + info.total_bytes = std::min(info.total_bytes, cap); + } + return info; +} + +namespace { + +// CUB's DeviceRadixSort temp_storage_bytes at k=28 with our key/val +// shape lands around 64-128 MB on sm_89; the streaming peak anchors +// below were measured with that overhead already live, so they +// implicitly budget for it. AdaptiveCpp's HIP backend routes the +// same `launch_sort_*` calls through a hand-rolled SYCL radix in +// SortSycl.cpp that uses ping-pong buffers sized to the input — +// multi-GiB at k=28, far exceeding what CUB's in-place radix needs. +// The streaming peak prediction has to add that excess so dispatch +// in BatchPlotter doesn't pick a tier whose "predicted peak" is +// several GiB short of the actual T1-sort live, the way an 8 GiB +// W5700 (gfx1010 → gfx1013 spoof) currently does. 
+// +// Baseline set at 256 MB at k=28 (a touch over CUB's typical scratch +// on sm_89 to keep headroom on NVIDIA cards near the threshold) and +// scaled 2× per +k step (linear in cap, matching how CUB's actual +// DeviceRadixSort scratch grows). The returned adjustment is +// `max(0, runtime_sort_scratch - baseline)`, so NVIDIA hosts whose +// runtime scratch is at or below the baseline see no change in +// predicted peak. +inline size_t streaming_sort_scratch_adjustment(int k) +{ + constexpr size_t cub_baseline_at_k28_bytes = 256ULL << 20; + + sycl::queue& q = sycl_backend::queue(); + int const num_section_bits = (k < 28) ? 2 : (k - 26); + size_t const cap_for_k = + max_pairs_per_section(k, num_section_bits) * (1ULL << num_section_bits); + + size_t s_pairs = 0; + launch_sort_pairs_u32_u32( + nullptr, s_pairs, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + cap_for_k, 0, k, q); + size_t s_keys = 0; + launch_sort_keys_u64( + nullptr, s_keys, + static_cast(nullptr), static_cast(nullptr), + cap_for_k, 0, 2 * k, q); + size_t const actual = std::max(s_pairs, s_keys); + + int const dk = k - 28; + size_t baseline = cub_baseline_at_k28_bytes; + if (dk > 0) baseline <<= dk; + else if (dk < 0) baseline >>= -dk; + + return (actual > baseline) ? (actual - baseline) : 0; +} + +} // namespace + +size_t streaming_peak_bytes(int k) +{ + // Anchor: 5200 MB at k=28 (measured post-stage-4e on sm_89). + // After the full T1/T2/T3 match/sort work (stages 1-4d) + Xs + // gen+sort+pack inlining (4e), all match + sort phases cap out at + // cap·sizeof(uint64_t) × ~2.5 aliases = ~5200 MB. Xs peak is 4128, + // T3 sort 4228, all others ≤ 5200. Dominant terms scale with 2^k. + constexpr size_t anchor_mb = 5200; + size_t const adj = streaming_sort_scratch_adjustment(k); + if (k == 28) return (anchor_mb << 20) + adj; + if (k < 18) return (size_t(16) << 20) + adj; // floor for tiny test plots + if (k > 32) return (size_t(anchor_mb) << (20 + (32 - 28))) + adj; + + if (k < 28) { + int const shift = 28 - k; // cap halves per −1 in k → 2× smaller + return ((size_t(anchor_mb) << 20) >> shift) + adj; + } + int const shift = k - 28; + return ((size_t(anchor_mb) << 20) << shift) + adj; +} + +size_t streaming_plain_peak_bytes(int k) +{ + // Anchor: 7290 MB at k=28 (pre-stage-1-4 peak — d_t1_meta + + // d_t1_keys_merged + d_t2_meta + d_t2_mi + d_t2_xbits all live + // concurrently during T2 match, no parks). Plain tier skips all + // park/rehydrate round-trips for ~400 ms/plot over compact at the + // cost of this higher peak. Scales the same way as compact. + constexpr size_t anchor_mb = 7290; + size_t const adj = streaming_sort_scratch_adjustment(k); + if (k == 28) return (anchor_mb << 20) + adj; + if (k < 18) return (size_t(16) << 20) + adj; + if (k > 32) return (size_t(anchor_mb) << (20 + (32 - 28))) + adj; + + if (k < 28) { + int const shift = 28 - k; + return ((size_t(anchor_mb) << 20) >> shift) + adj; + } + int const shift = k - 28; + return ((size_t(anchor_mb) << 20) << shift) + adj; +} + +size_t streaming_minimal_peak_bytes(int k) +{ + // Anchor: 3760 MB at k=28 (measured 3754 MB on sm_89 + the + // streaming-stats trace; rounded up for safety). Bottleneck is T3 + // match where d_t2_keys_merged + d_t2_xbits_sorted + meta-l/r + // slices + d_t3_stage are co-resident. + // + // Minimal layers cumulative cuts on top of compact: + // 1. N=8 T2 match staging (cap/8 ≈ 570 MB vs compact's cap/2). + // 2. 
T1 sort gather, T2 sort meta+xbits gathers — tiled output, + // D2H per tile to host pinned, rebuild on device after free. + // 3. T3 match — d_t2_meta_sorted parked on host pinned, sliced + // device buffers H2D'd per (section_l, section_r) pass. + // 4. T1 match — sliced into N passes per section_l, output + // accumulated to host pinned. + // 5. T1, T2, T3 sort CUB sub-phases — per-tile cap/N output + // buffers, USM-host accumulation, merges with USM-host inputs. + // 6. Xs phase — gen+sort tiled in N=2 position halves with + // USM-host accumulators; pack tiled with D2H per tile. + // + // Cumulative effect at k=28: peak drops from 5200 MB (compact) → + // 3754 MB (minimal). Trade-off: ~6 extra cap-sized PCIe round- + // trips per plot (~2.5× wall on NVIDIA — 13 s/plot → 34 s/plot + // at k=28). Same k-scaling as compact / plain. + constexpr size_t anchor_mb = 3760; + size_t const adj = streaming_sort_scratch_adjustment(k); + if (k == 28) return (anchor_mb << 20) + adj; + if (k < 18) return (size_t(16) << 20) + adj; + if (k > 32) return (size_t(anchor_mb) << (20 + (32 - 28))) + adj; + + if (k < 28) { + int const shift = 28 - k; + return ((size_t(anchor_mb) << 20) >> shift) + adj; + } + int const shift = k - 28; + return ((size_t(anchor_mb) << 20) << shift) + adj; +} + +} // namespace pos2gpu diff --git a/src/host/GpuBufferPool.cu b/src/host/GpuBufferPool.cu deleted file mode 100644 index ddb3298..0000000 --- a/src/host/GpuBufferPool.cu +++ /dev/null @@ -1,151 +0,0 @@ -// GpuBufferPool.cu — queries per-phase scratch sizes once and allocates -// worst-case-sized persistent buffers. - -#include "host/GpuBufferPool.hpp" - -#include "gpu/XsKernel.cuh" -#include "gpu/T1Kernel.cuh" -#include "gpu/T2Kernel.cuh" -#include "gpu/T3Kernel.cuh" - -#include -#include - -#include -#include -#include -#include - -namespace pos2gpu { - -namespace { - -// Variadic so the preprocessor doesn't choke on template-argument commas -// in e.g. cub::DeviceRadixSort::SortPairs(...). -#define POOL_CHECK(...) do { \ - cudaError_t err = (__VA_ARGS__); \ - if (err != cudaSuccess) { \ - throw std::runtime_error(std::string("GpuBufferPool CUDA: ") + \ - cudaGetErrorString(err)); \ - } \ -} while (0) - -// Mirrors GpuPipeline.cu's max_pairs_per_section (and pos2-chip's -// TableConstructorGeneric.hpp:23). -inline size_t max_pairs_per_section(int k, int num_section_bits) { - int extra_margin_bits = 8 - ((28 - k) / 2); - return (1ULL << (k - num_section_bits)) + (1ULL << (k - extra_margin_bits)); -} - -} // namespace - -GpuBufferPool::GpuBufferPool(int k_, int strength_, bool testnet_) - : k(k_), strength(strength_), testnet(testnet_) -{ - int const num_section_bits = (k < 28) ? 2 : (k - 26); - total_xs = 1ULL << k; - cap = max_pairs_per_section(k, num_section_bits) * (1ULL << num_section_bits); - - // d_storage must hold EITHER total_xs XsCandidateGpu (8 B each) OR four - // cap-sized uint32 key/val arrays during sort. Cast everything to size_t - // so std::max's template deduction finds one common type. - storage_bytes = std::max( - static_cast(total_xs) * sizeof(XsCandidateGpu), - static_cast(cap) * 4 * sizeof(uint32_t)); - - // d_pair_*: worst case across T1 (12 B), T2 (16 B), T3 (8 B), uint64 frags (8 B). - pair_bytes = std::max({ - static_cast(cap) * sizeof(T1PairingGpu), - static_cast(cap) * sizeof(T2PairingGpu), - static_cast(cap) * sizeof(T3PairingGpu), - static_cast(cap) * sizeof(uint64_t), - }); - - // Only the Xs phase asks for kernel scratch; T1/T2/T3 match report 0. 
- // Xs wants ~4.34 GB at k=28 — we alias d_pair_b for that, so no separate - // allocation. - uint8_t dummy_plot_id[32] = {}; - POOL_CHECK(launch_construct_xs(dummy_plot_id, k, testnet, - nullptr, nullptr, &xs_temp_bytes)); - if (xs_temp_bytes > pair_bytes) { - throw std::runtime_error( - "GpuBufferPool: Xs scratch exceeds pair buffer size; aliasing " - "d_pair_b as Xs temp is no longer safe"); - } - - // Query CUB sort scratch sizes (largest across T1/T2/T3 sorts). - size_t s_pairs = 0; - POOL_CHECK(cub::DeviceRadixSort::SortPairs( - nullptr, s_pairs, - static_cast(nullptr), static_cast(nullptr), - static_cast(nullptr), static_cast(nullptr), - cap, 0, k, nullptr)); - size_t s_keys = 0; - POOL_CHECK(cub::DeviceRadixSort::SortKeys( - nullptr, s_keys, - static_cast(nullptr), static_cast(nullptr), - cap, 0, 2 * k, nullptr)); - sort_scratch_bytes = std::max(s_pairs, s_keys); - - pinned_bytes = cap * sizeof(uint64_t); - - // Check free VRAM before attempting allocation so we can give a useful - // diagnostic instead of a generic cudaErrorMemoryAllocation. The margin - // covers CUDA driver/context state, CUB internal scratch, AES T-tables, - // and other small runtime allocations. - { - size_t const required_device = - storage_bytes + 2 * pair_bytes + sort_scratch_bytes + sizeof(uint64_t); - size_t const margin = 512ULL * 1024 * 1024; // 512 MB - size_t free_b = 0, total_b = 0; - POOL_CHECK(cudaMemGetInfo(&free_b, &total_b)); - if (free_b < required_device + margin) { - auto to_gib = [](size_t b) { return b / double(1ULL << 30); }; - throw std::runtime_error( - "GpuBufferPool: insufficient device VRAM for k=" + - std::to_string(k) + " strength=" + std::to_string(strength) + - "; need ~" + std::to_string(to_gib(required_device + margin)).substr(0, 5) + - " GiB (pool " + std::to_string(to_gib(required_device)).substr(0, 5) + - " GiB + ~0.5 GiB runtime), only " + - std::to_string(to_gib(free_b)).substr(0, 5) + - " GiB free of " + std::to_string(to_gib(total_b)).substr(0, 5) + - " GiB total. 
Use a smaller k or a GPU with more VRAM."); - } - } - - if (getenv("POS2GPU_POOL_DEBUG")) { - size_t free_b = 0, total_b = 0; - cudaMemGetInfo(&free_b, &total_b); - std::fprintf(stderr, - "[pool] k=%d strength=%d cap=%llu total_xs=%llu " - "free=%.2fGB total=%.2fGB\n", - k, strength, (unsigned long long)cap, (unsigned long long)total_xs, - free_b/1e9, total_b/1e9); - std::fprintf(stderr, - "[pool] sizes: storage=%.2fGB pair=%.2fGB xs_temp(alias)=%.2fGB " - "sort_scratch=%.2fGB pinned=%.2fGB\n", - storage_bytes/1e9, pair_bytes/1e9, xs_temp_bytes/1e9, - sort_scratch_bytes/1e9, pinned_bytes/1e9); - } - - POOL_CHECK(cudaMalloc(&d_storage, storage_bytes)); - POOL_CHECK(cudaMalloc(&d_pair_a, pair_bytes)); - POOL_CHECK(cudaMalloc(&d_pair_b, pair_bytes)); - POOL_CHECK(cudaMalloc(&d_sort_scratch, sort_scratch_bytes)); - POOL_CHECK(cudaMalloc(&d_counter, sizeof(uint64_t))); - POOL_CHECK(cudaMallocHost(&h_pinned_t3[0], pinned_bytes)); - POOL_CHECK(cudaMallocHost(&h_pinned_t3[1], pinned_bytes)); -} - -GpuBufferPool::~GpuBufferPool() -{ - if (d_storage) cudaFree(d_storage); - if (d_pair_a) cudaFree(d_pair_a); - if (d_pair_b) cudaFree(d_pair_b); - if (d_sort_scratch) cudaFree(d_sort_scratch); - if (d_counter) cudaFree(d_counter); - if (h_pinned_t3[0]) cudaFreeHost(h_pinned_t3[0]); - if (h_pinned_t3[1]) cudaFreeHost(h_pinned_t3[1]); -} - -} // namespace pos2gpu diff --git a/src/host/GpuBufferPool.hpp b/src/host/GpuBufferPool.hpp index 834f520..fd404c6 100644 --- a/src/host/GpuBufferPool.hpp +++ b/src/host/GpuBufferPool.hpp @@ -7,36 +7,69 @@ // between device time (~2.75 s) and producer wall time (~5.1 s). // // Memory layout with aliasing (k=28 worst-case sizes in parens): -// d_storage (4.36 GB) — Xs candidates during Xs phase, -// then 4×uint32[cap] sort keys/vals during sorts -// d_pair_a (4.36 GB) — T1/T2/T3 match output (reused across phases); -// also serves as Xs phase scratch before T1 -// d_pair_b (4.36 GB) — *_sorted / frags_out (reused across phases); -// also serves as Xs phase scratch before T1 -// d_sort_scratch (~2.3 GB) — CUB radix-sort scratch (largest across phases) +// d_storage (~3.3 GB) — Xs candidates during Xs phase (2.1 GB), +// then 3×uint32[cap] sort keys_out/vals_in/ +// vals_out during sorts. The fourth +// (keys_in) slot the sort API would want +// is ALWAYS the SoA match-info stream +// from d_pair_a (d_t1_mi / d_t2_mi), so +// d_storage doesn't allocate for it — +// saves cap·4 B (~1.09 GiB at k=28) vs +// the old 4-slot layout. +// d_pair_a (~4.4 GB) — T1/T2/T3 match output (reused across phases). +// Sized to the largest match-output: cap·16 B +// for T2 (meta+mi+xbits SoA). Does NOT alias the +// Xs phase scratch — that lives in d_pair_b. +// d_pair_b (~4.4 GB) — *_sorted / frags_out (reused across phases), +// AND the Xs construction scratch. Sized to +// max(largest sorted-output, xs_temp_bytes); +// at k=28 xs_temp dominates. +// d_sort_scratch (~MB) — Radix sort scratch. After ping-pong refactor: +// CUB DoubleBuffer mode shrinks this from ~2 GB +// to ~MB; SortSycl already ping-pongs over the +// caller's keys_in/keys_out buffers. // d_counter (8 B) — reused uint64_t count output -// h_pinned_t3[2] (2.18 GB ea) — double-buffered final fragments DMA target. -// Producer writes plot N to buffer (N%2) while -// consumer reads plot N-1 from the other slot. -// With a depth-1 channel + producer being -// slower than consumer, this is race-free. +// h_pinned_t3[N] (~2.2 GB ea) — rotating final-fragments DMA targets. 
+// Producer writes plot K into slot K mod N +// while consumer reads earlier plots from +// the other slots; channel depth N-1 keeps +// the producer from overwriting in-flight +// reads. N defaults to 3 (see kNumPinnedBuffers). // -// Total ~15 GB device + ~4.36 GB pinned host — fits in 17 GB free VRAM on a -// 24 GB 4090. +// Total ~12 GB device + ~6.6 GB pinned host at k=28 — fits (just) in the +// 11.98 GiB free VRAM of a Navi 22 (RX 6700 XT) after the d_storage +// slot-trim above. Pre-trim the total was ~13.1 GB and overshot this +// card's budget by ~0.7 GiB, forcing a fallback to the streaming +// pipeline which costs an extra ~5 s at k=28. // // Note: T1/T2/T3 match kernels report temp_bytes = 0 (no scratch needed). -// Only the Xs phase wants ~4.34 GB of scratch, so we alias d_pair_b for that. +// Only the Xs phase wants ~4.4 GB of scratch, and we alias d_pair_b for that. #pragma once #include #include +#include +#include namespace pos2gpu { +// Typed exception for the "pool sizing exceeds available device VRAM" +// case. Callers that want to fall back to the streaming pipeline when +// the pool does not fit should catch this specifically rather than +// string-matching a generic std::runtime_error. +struct InsufficientVramError : std::runtime_error { + using std::runtime_error::runtime_error; + size_t required_bytes = 0; + size_t free_bytes = 0; + size_t total_bytes = 0; +}; + struct GpuBufferPool { - // Allocates all buffers sized for (k, strength, testnet). Throws on any - // CUDA allocation failure. + // Allocates all buffers sized for (k, strength, testnet). Throws + // InsufficientVramError when the sized pool will not fit in free + // device VRAM; throws std::runtime_error on any other CUDA + // allocation or API failure. GpuBufferPool(int k, int strength, bool testnet); ~GpuBufferPool(); @@ -52,7 +85,8 @@ struct GpuBufferPool { uint64_t total_xs = 0; uint64_t cap = 0; size_t storage_bytes = 0; - size_t pair_bytes = 0; + size_t pair_a_bytes = 0; // max(T1/T2/T3 match-output footprints) + size_t pair_b_bytes = 0; // max(*_sorted footprints, xs_temp_bytes) size_t xs_temp_bytes = 0; // scratch size the Xs phase asks for size_t sort_scratch_bytes = 0; size_t pinned_bytes = 0; // per pinned buffer @@ -65,10 +99,93 @@ struct GpuBufferPool { void* d_sort_scratch = nullptr; uint64_t* d_counter = nullptr; - // Pinned host buffers for final T3 fragment D2H. Double-buffered so the - // consumer can read plot N directly from one slot while producer writes - // plot N+1 into the other — no intermediate ~2 GB heap copy per plot. - uint64_t* h_pinned_t3[2] = {nullptr, nullptr}; + // Number of rotating pinned slots for the final T3-fragment D2H. + // Set to 3 so the channel can hold depth-2 of in-flight plots + // without the producer ever overwriting a slot the consumer is + // still reading — useful when consumer wall > producer wall + // (slow disk / FSE-heavy strengths). 2 was enough for the + // previously measured producer-slower-than-consumer case, but + // 3 costs only ~2 GB of host pinned at k=28 and widens the + // "safe" consumer/producer ratio. + // + // Pinned slots are allocated LAZILY on first use via + // ensure_pinned(idx). The ctor no longer pays ~1.8 s at k=28 + // for the 3 × 2.2 GB malloc_host calls; single-plot runs + // (plot -n 1) only ever allocate slot 0, saving ~1.2 s of + // ctor time. Batch runs (plot -n N, N ≥ 3) amortise the + // allocation cost across the first three plots' D2H phases + // instead of the ctor — identical total batch time. 
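// Worked numbers for the figures above, not part of the patch: at k = 28
// each pinned slot is cap * 8 B = 272,629,760 * 8 = 2,181,038,080 B
// (~2.18 GB), so three slots come to ~6.54 GB — the "~6.6 GB pinned
// host" quoted in the layout comment. With the lazy ensure_pinned()
// described above, a single-plot run only ever allocates slot 0
// (~2.2 GB of pinned host); a batch first touches slots 1 and 2 during
// its second and third D2H phases.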
+ static constexpr int kNumPinnedBuffers = 3; + uint64_t* h_pinned_t3[kNumPinnedBuffers] = {}; + + // Returns pool.h_pinned_t3[idx], allocating the slot if it + // hasn't been used yet. Thread-safe via a per-slot mutex + // (concurrent callers with the same idx cooperate through + // double-checked locking; different idx values proceed + // independently). Throws std::runtime_error on host alloc + // failure. + uint64_t* ensure_pinned(int idx); + + // Returns pool.d_pair_a, allocating it on first use. Deferred + // from ctor so run_gpu_pipeline can submit Xs gen *before* + // paying this 4.36 GB malloc_device. Thread-safe via double- + // checked locking on pair_a_mu_. + // + // Measured on RX 6700 XT / ROCm 6.2 / AdaptiveCpp HIP: + // sycl::malloc_device of 4.36 GB takes ~5 ms (the driver + // almost certainly just reserves virtual-address space and + // defers physical commit to first write). Overlap benefit + // vs eager alloc is therefore ~5 ms in practice, below noise. + // The lazy pattern is kept because (a) it's a drop-in + // replacement with zero regression, (b) it mirrors + // ensure_pinned, and (c) it enables release_pair_a() below. + void* ensure_pair_a(); + + // Frees d_pair_a if it's allocated, so a subsequent + // ensure_pair_a() will re-allocate. Called by the pool path + // at the end of each plot in a batch to shrink the + // inter-plot VRAM peak. With ~5 ms malloc on AMD, the + // release-and-realloc cost is below noise per plot, while + // the 4.36 GB VRAM freed during file-write / D2H-consume + // phases lets the pool path fit cards with ~7-8 GiB free + // that would otherwise hit the InsufficientVramError path + // and fall back to streaming. + // + // Thread-safe via pair_a_mu_; lock-order is + // (pair_a_mu_ → sycl::free) so release can run concurrently + // with a future ensure_pair_a from a different thread + // without deadlock. In practice run_batch is single-producer + // so contention is zero. + void release_pair_a(); + +private: + std::mutex pinned_mu_[kNumPinnedBuffers]; + std::mutex pair_a_mu_; +}; + +// Free + total device VRAM at call time. On SYCL backends without a +// portable free-memory query, free_bytes is approximated as +// total_bytes (AdaptiveCpp's global_mem_size = device total). Used as +// a preflight signal; sycl::malloc_device remains the source of +// truth. POS2GPU_MAX_VRAM_MB caps both fields when set. +struct DeviceMemInfo { + size_t free_bytes = 0; + size_t total_bytes = 0; }; +DeviceMemInfo query_device_memory(); + +// Upper bound on streaming-pipeline peak device VRAM at given k. +// streaming_peak_bytes: compact tier (anchored at 5200 MB at k=28). +// streaming_plain_peak_bytes: plain tier (anchored at 7290 MB at k=28, +// pre-park pipeline — saves ~400 ms/plot over compact via fewer PCIe +// round-trips, at the cost of the higher peak). +// streaming_minimal_peak_bytes: minimal tier (anchored at 3700 MB at +// k=28). Same parks as compact plus N=8 T2 match staging (cap/8 vs +// compact's cap/2) — targets 4 GiB cards at the cost of more PCIe +// round-trips during T2 match. +// Dominant terms scale with 2^k, so other k extrapolate linearly. 
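// Rough extrapolation example (sketch only; the anchored constants and the
// exact sizing live in the .cpp): with the dominant terms proportional to
// 2^k, a compact-tier estimate for another k is approximately
//   approx_mb(k) = 5200 * 2^(k - 28)
// i.e. ~1300 MB at k=26 and ~20800 MB at k=30. Only the k=28 figures above
// are measured anchors; other k are extrapolated linearly in 2^k, as the
// comment above notes.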
+size_t streaming_peak_bytes(int k); +size_t streaming_plain_peak_bytes(int k); +size_t streaming_minimal_peak_bytes(int k); } // namespace pos2gpu diff --git a/src/host/GpuPipeline.cpp b/src/host/GpuPipeline.cpp new file mode 100644 index 0000000..9263084 --- /dev/null +++ b/src/host/GpuPipeline.cpp @@ -0,0 +1,2461 @@ +// GpuPipeline.cu — orchestrates Xs → T1 → T2 → T3 on the device, with +// CUB radix sort between phases (each phase consumes sorted-by-match_info +// input). Final T3 output is sorted by proof_fragment (low 2k bits) to +// match pos2-chip Table3Constructor::post_construct_span. +// +// Two overloads live here: +// run_gpu_pipeline(cfg) — transient pool, one-shot. +// run_gpu_pipeline(cfg, pool) — shared pool, batch-friendly. This is the +// real implementation; the one-shot form +// just wraps it in a temporary pool. + +#include "host/GpuPipeline.hpp" +#include "host/GpuBufferPool.hpp" +#include "host/PoolSizing.hpp" + +#include "gpu/AesGpu.cuh" +#include "gpu/XsKernel.cuh" +#include "gpu/XsKernels.cuh" // launch_xs_gen / launch_xs_pack (stage 4e) +#include "gpu/T1Kernel.cuh" +#include "gpu/T2Kernel.cuh" +#include "gpu/T3Kernel.cuh" +#include "gpu/PipelineKernels.cuh" +#include "gpu/Sort.cuh" +#include "gpu/SyclBackend.hpp" + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pos2gpu { + +namespace { + + +// ===================================================================== +// T1 sort: by match_info, low k bits, stable. Uses CUB SortPairs with +// (key=match_info, value=index) then permutes T1Pairings. +// ===================================================================== +// T2 sort: same shape — sort indices by match_info. +// ===================================================================== +// Streaming allocation tracker. +// +// Wraps cudaMalloc / cudaFree so we can: (a) account for live/peak VRAM +// used by the streaming pipeline, (b) honour a soft device-memory cap +// set via POS2GPU_MAX_VRAM_MB (throws before the underlying cudaMalloc +// when an alloc would push live past the cap), and (c) emit a per-alloc +// trace under POS2GPU_STREAMING_STATS=1 for manual audits. +// +// Pinned host allocations are NOT counted — the cap is specifically for +// device VRAM, and the pinned D2H staging buffer is host-resident. +// ===================================================================== +struct StreamingStats { + size_t cap = 0; // 0 = no cap + size_t live = 0; + size_t peak = 0; + std::unordered_map sizes; + bool verbose = false; + char const* phase = "(init)"; + + // Free any allocations still alive on destruction. If the streaming + // pipeline throws partway (e.g. d_xs_temp OOM after d_xs already + // succeeded), this dtor releases the still-live device buffers + // instead of leaking them across batch iterations. + ~StreamingStats() { + if (sizes.empty()) return; + auto& q = sycl_backend::queue(); + for (auto& [ptr, _bytes] : sizes) { + if (ptr) sycl::free(ptr, q); + } + sizes.clear(); + } +}; + +inline void s_init_from_env(StreamingStats& s) +{ + if (char const* v = std::getenv("POS2GPU_MAX_VRAM_MB"); v && v[0]) { + s.cap = size_t(std::strtoull(v, nullptr, 10)) * (1ULL << 20); + } + if (char const* v = std::getenv("POS2GPU_STREAMING_STATS"); v && v[0] == '1') { + s.verbose = true; + } +} + +// Format a byte count as both raw bytes and decimal MB. 
The previous +// `bytes >> 20` form (integer right-shift = truncating divide by 1 MiB) +// rounded any sub-MiB request down to "0 MB", which masked both the +// real allocation size and any genuine zero-byte sizing bug at the +// call site. Use this helper in every error path so a future +// `requested=0` is unambiguous (raw bytes settles it). +inline std::string s_fmt_bytes(size_t bytes) { + char buf[64]; + std::snprintf(buf, sizeof(buf), + "%zu bytes (%.2f MB)", bytes, bytes / 1048576.0); + return std::string(buf); +} + +template +inline void s_malloc(StreamingStats& s, T*& out, size_t bytes, char const* reason) +{ + // Zero-byte requests come from sizing queries that returned 0, + // which downstream callers honour as "skip this alloc" only by + // accident (sycl::malloc_device(0) returns null on HIP). Surface + // the actual upstream cause instead of triggering the misleading + // "Card likely too small" path below. + if (bytes == 0) { + throw std::runtime_error( + std::string("internal: s_malloc('") + reason + "') called with " + "bytes=0 — an upstream sizing query returned 0 (count=0). On " + "AMD/HIP this most often indicates a kernel correctness issue " + "on an unvalidated device — either an AOT target outside the " + "validated set (the gfx1013/RDNA1 community spoof is the known " + "case) or AdaptiveCpp's generic SSCP JIT miscompiling a kernel " + "for the actual gfx ISA. Run the parity tests on this device " + "to localise: sycl_g_x_parity, sycl_sort_parity, " + "sycl_bucket_offsets_parity, sycl_t1_parity."); + } + if (s.cap && s.live + bytes > s.cap) { + throw std::runtime_error( + std::string("streaming VRAM cap: phase=") + s.phase + + " alloc=" + reason + + " live=" + s_fmt_bytes(s.live) + + " + new=" + s_fmt_bytes(bytes) + + " would exceed cap=" + s_fmt_bytes(s.cap)); + } + void* p = sycl::malloc_device(bytes, sycl_backend::queue()); + if (!p) { + throw std::runtime_error( + std::string("sycl::malloc_device(") + reason + "): null — phase=" + + s.phase + " requested=" + s_fmt_bytes(bytes) + + " live=" + s_fmt_bytes(s.live) + + ". Card likely too small for this k via the streaming " + "pipeline; try a smaller k or a card with more VRAM."); + } + out = static_cast(p); + s.live += bytes; + if (s.live > s.peak) s.peak = s.live; + s.sizes[p] = bytes; + if (s.verbose) { + std::fprintf(stderr, + "[stream %-8s] +%7.2f MB %-20s live=%8.2f peak=%8.2f\n", + s.phase, bytes / 1048576.0, reason, + s.live / 1048576.0, s.peak / 1048576.0); + } +} + +template +inline void s_free(StreamingStats& s, T*& ptr) +{ + if (!ptr) return; + void* raw = static_cast(ptr); + auto it = s.sizes.find(raw); + if (it != s.sizes.end()) { + s.live -= it->second; + if (s.verbose) { + std::fprintf(stderr, + "[stream %-8s] -%7.2f MB %-20s live=%8.2f peak=%8.2f\n", + s.phase, it->second / 1048576.0, "(free)", + s.live / 1048576.0, s.peak / 1048576.0); + } + s.sizes.erase(it); + } + sycl::free(raw, sycl_backend::queue()); + ptr = nullptr; +} + +// Sanity-check t1_count after T1 match. Healthy plots produce ~2^k +// entries; anything below total_xs/64 (= 2^(k-6)) — let alone literal +// zero — points at kernel correctness on the device, not a VRAM +// shortfall. Catching this here surfaces a clear diagnostic instead of +// letting downstream sort-scratch alloc fail with the misleading +// "Card likely too small" message. 
Two AMD/HIP cases produce 0 T1 +// matches at k=28: the gfx1013/RDNA1 community spoof on a W5700, and +// AdaptiveCpp's generic SSCP JIT on the same RDNA1 silicon (the JIT +// path is theoretically more compatible than the AOT spoof but has +// been observed to miscompile the matcher). Only the OOM further down +// was visible before this check. +inline void validate_t1_count(uint64_t t1_count, int k) +{ + uint64_t const min_plausible = (1ULL << k) >> 6; + if (t1_count >= min_plausible) return; + + throw std::runtime_error( + "T1 match produced " + std::to_string(t1_count) + " entries " + "(expected ~2^" + std::to_string(k) + " = " + + std::to_string(1ULL << k) + " for k=" + std::to_string(k) + + "). This indicates a kernel correctness issue on this device, " + "not a VRAM shortfall. On AMD/HIP this most often means the " + "AdaptiveCpp target produced wrong output for the actual gfx " + "ISA — either the gfx1013/RDNA1 community AOT spoof or the " + "generic SSCP JIT path on an unvalidated card. Build the " + "parity tests via cmake and verify on this device: " + "sycl_g_x_parity, sycl_sort_parity, sycl_bucket_offsets_parity, " + "sycl_t1_parity. The first three exercise individual kernels at " + "small N; sycl_t1_parity runs the full T1 matcher against the " + "pos2-chip CPU reference and is the closest reproducer of the " + "k=28 failure. README's 'Community-tested, not parity-validated' " + "caveat applies."); +} + +} // namespace + +GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg, + GpuBufferPool& pool, + int pinned_index) +{ + + sycl::queue& q = sycl_backend::queue(); + if (cfg.k < 18 || cfg.k > 32 || (cfg.k & 1) != 0) { + throw std::runtime_error("k must be even in [18, 32]"); + } + if (cfg.strength < 2) { + throw std::runtime_error("strength must be >= 2"); + } + if (pool.k != cfg.k || pool.strength != cfg.strength + || pool.testnet != cfg.testnet) + { + throw std::runtime_error( + "GpuBufferPool was sized for different (k, strength, testnet)"); + } + if (pinned_index < 0 || pinned_index >= GpuBufferPool::kNumPinnedBuffers) { + throw std::runtime_error( + "pinned_index must be in [0, GpuBufferPool::kNumPinnedBuffers)"); + } + + uint64_t const total_xs = pool.total_xs; + uint64_t const cap = pool.cap; + + constexpr int kThreads = 256; + auto blocks = [&](uint64_t n) { + return unsigned((n + kThreads - 1) / kThreads); + }; + + // ---- pool aliases ---- + // d_pair_a carries the "current phase match output": T1, then T2, then T3. + // d_pair_b carries the "current phase sort output": sorted T1, sorted T2, + // then final uint64_t fragments. Each subsequent phase's output overwrites + // the previous (consumed) contents in the same slot. + XsCandidateGpu* d_xs = static_cast(pool.d_storage); + // d_pair_a-derived aliases (d_t1_meta, d_t1_mi, d_t2_meta, d_t2_mi, + // d_t2_xbits, d_t3) are NOT declared here. They're declared inside + // the Xs phase block below, right after pool.ensure_pair_a() + // performs the lazy malloc_device for d_pair_a. Deferring that + // alloc until after Xs gen has been submitted to the queue lets + // the ~400-500 ms CPU-side malloc_device overlap with Xs's + // ~750 ms GPU execution — saves ~400-500 ms off first-plot wall; + // batch plots 2+ hit ensure_pair_a's cached-pointer fast path + // so the alloc cost is paid exactly once per pool. + // + // d_pair_b-derived aliases stay up here because d_pair_b is + // eager-allocated by the pool ctor: Xs gen needs it as scratch + // from the start of the pipeline. 
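// The lazy-alloc overlap described above, in isolation (generic SYCL
// sketch, not this function's exact calls or buffer names):
//   q.parallel_for(range, xs_kernel);           // returns with work in flight
//   void* p = sycl::malloc_device(bytes, q);    // CPU-side cost hidden behind the kernel
//   q.wait();                                   // both the kernel and the alloc are done here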
+ uint64_t* d_t1_meta_sorted = static_cast (pool.d_pair_b); + uint64_t* d_t2_meta_sorted = static_cast (pool.d_pair_b); + uint32_t* d_t2_xbits_sorted = reinterpret_cast( + static_cast(pool.d_pair_b) + pool.cap * sizeof(uint64_t)); + uint64_t* d_frags_out = static_cast (pool.d_pair_b); + + uint64_t* d_count = pool.d_counter; + // Xs phase needs ~3.22 GB scratch at k=28 in split-keys_a mode + // (3 × total_xs × u32 + cub); d_pair_b is idle through the whole + // Xs phase (not touched until T1 sort permute writes to it), so + // we alias it rather than allocating separately. + // + // Split-keys_a: the Xs sort's keys_a (total_xs · u32 = 1 GiB at + // k=28) lives in d_storage's tail — bytes [total_xs·8, storage_bytes) + // which is idle during Xs gen+sort. The final pack phase writes + // d_storage[0..total_xs·8) only, leaving keys_a's memory region + // undisturbed (and its contents unread after the sort anyway, so + // the overlap on T1/T2/T3-sort aliases in d_storage after pack is + // a pure write-without-read of stale bytes). Saves ~1 GiB off the + // pair_b xs-scratch region — see GpuBufferPool.cpp for sizing. + void* const d_xs_split_keys_a = static_cast(pool.d_storage) + + pool.total_xs * sizeof(XsCandidateGpu); + void* d_xs_temp = pool.d_pair_b; + void* d_sort_scratch = pool.d_sort_scratch; + // Lazy pinned-host alloc: skips ~600 ms × (kNumPinnedBuffers-1) + // on single-plot runs (only slot 0 gets allocated). See + // GpuBufferPool::ensure_pinned header comment for rationale. + uint64_t* h_pinned_t3 = pool.ensure_pinned(pinned_index); + // T1/T2/T3 match kernels report 0 scratch bytes, but some CUDA paths + // reject a nullptr d_temp_storage with cudaErrorInvalidArgument even + // when bytes==0. Point them at d_sort_scratch (idle during match) to + // give the kernel a valid non-null handle. + void* d_match_temp = pool.d_sort_scratch; + + // Sort key/val arrays alias d_storage. Safe because Xs is fully consumed + // by T1 match (stream-synchronised) before we enter T1 sort. + // + // Only three slots live here — keys_out, vals_in, vals_out. The + // sort's keys_input is always the SoA match-info stream from + // d_pair_a (d_t1_mi / d_t2_mi), so the fourth slot that would + // have hosted "d_keys_in" is neither allocated nor used. See + // GpuBufferPool.cpp for the matching storage_bytes shrink. + auto storage_u32 = static_cast(pool.d_storage); + uint32_t* d_keys_out = storage_u32 + 0 * cap; + uint32_t* d_vals_in = storage_u32 + 1 * cap; + uint32_t* d_vals_out = storage_u32 + 2 * cap; + + // ---- per-phase wall-time profiling ---- + // Enabled when either cfg.profile is set (xchplot2 -P / --profile) or + // POS2GPU_PHASE_TIMING=1 is in the env. Each phase's wall is measured + // around q.wait()s so launches actually drain to the device before the + // next start sample — adds a sync point but gives an honest breakdown. + // When disabled, begin/end/report are early-out and add ~zero cost. 
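// Shape of the emitted breakdown, per the fprintf format in report_phases
// below (the millisecond values here are invented for illustration):
//   [phase-timing] Xs gen+sort=751.2ms(27%) T1 match=312.0ms(11%) ... total=2748.5ms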
+ bool const phase_timing = cfg.profile || [] { + char const* v = std::getenv("POS2GPU_PHASE_TIMING"); + return v && v[0] == '1'; + }(); + using phase_clock = std::chrono::steady_clock; + std::vector> phase_starts; + std::vector> phase_records; + auto begin_phase = [&](char const* label) -> int { + if (!phase_timing) return -1; + q.wait(); + phase_starts.emplace_back(label, phase_clock::now()); + return static_cast(phase_starts.size() - 1); + }; + auto end_phase = [&](int idx) { + if (idx < 0) return; + q.wait(); + auto const t1 = phase_clock::now(); + auto const& [name, t0] = phase_starts[idx]; + double const ms = std::chrono::duration(t1 - t0).count(); + phase_records.emplace_back(name, ms); + }; + auto report_phases = [&]() { + if (!phase_timing || phase_records.empty()) return; + double total = 0.0; + for (auto const& [_n, ms] : phase_records) total += ms; + std::fprintf(stderr, "[phase-timing]"); + for (auto const& [name, ms] : phase_records) { + std::fprintf(stderr, " %s=%.1fms(%.0f%%)", + name, ms, total > 0.0 ? 100.0 * ms / total : 0.0); + } + std::fprintf(stderr, " total=%.1fms\n", total); + }; + + // ---------- Phase Xs ---------- + size_t xs_temp_bytes = 0; + launch_construct_xs(cfg.plot_id.data(), cfg.k, cfg.testnet, + nullptr, nullptr, &xs_temp_bytes, q, + d_xs_split_keys_a); + int p_xs = begin_phase("Xs gen+sort"); + // Xs phase events stubbed in slice 17b — pass nullptr for the (no-op) + // profiling event slots. The launch_construct_xs_profiled signature still + // accepts cudaEvent_t for API compatibility but ignores the values. + launch_construct_xs_profiled(cfg.plot_id.data(), cfg.k, cfg.testnet, + d_xs, d_xs_temp, &xs_temp_bytes, + nullptr, nullptr, q, + d_xs_split_keys_a); + // Overlap d_pair_a's lazy malloc_device (~400-500 ms for 4.36 GB at + // k=28) with Xs gen's GPU execution. In production + // (POS2GPU_PHASE_TIMING unset), launch_construct_xs_profiled returns + // immediately with the kernel in-flight on the queue; this CPU-side + // alloc then runs in parallel and its wall is hidden behind Xs's + // ~750 ms GPU work. In phase_timing mode xs-timing's internal + // q.waits serialise Xs first, then this alloc pays full wall — a + // diagnostic-mode trade-off. + void* const d_pair_a_raw = pool.ensure_pair_a(); + end_phase(p_xs); + + // d_pair_a-derived aliases, now that the lazy alloc has resolved. + // Same layout as the old eager version — just computed from the + // local d_pair_a_raw instead of pool.d_pair_a so there's no + // confusion about when the pointer became valid. + // + // T1 match output is SoA, carved out of d_pair_a. Layout: meta[cap] + // (cap·8 B) then mi[cap] (cap·4 B). Total cap·12 B, fits in d_pair_a's + // cap·16 B budget. + uint64_t* d_t1_meta = static_cast(d_pair_a_raw); + uint32_t* d_t1_mi = reinterpret_cast( + static_cast(d_pair_a_raw) + pool.cap * sizeof(uint64_t)); + // T2 match output is SoA, carved out of d_pair_a. Layout: meta[cap] + // (cap·8 B), then mi[cap] (cap·4 B), then xbits[cap] (cap·4 B). Total + // cap·16 B, matching d_pair_a's size. 
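// Byte offsets of that carve, in units of cap (this mirrors the pointer
// arithmetic just below):
//   meta  : [0,       cap*8 )
//   mi    : [cap*8,   cap*12)
//   xbits : [cap*12,  cap*16)   (T2 only; the T1 carve stops at cap*12)
// T3 later reinterprets offset 0 as a plain uint64_t fragment stream.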
+ uint64_t* d_t2_meta = static_cast(d_pair_a_raw); + uint32_t* d_t2_mi = reinterpret_cast( + static_cast(d_pair_a_raw) + pool.cap * sizeof(uint64_t)); + uint32_t* d_t2_xbits = reinterpret_cast( + static_cast(d_pair_a_raw) + pool.cap * (sizeof(uint64_t) + sizeof(uint32_t))); + T3PairingGpu* d_t3 = static_cast(d_pair_a_raw); + + // ---------- Phase T1 ---------- + auto t1p = make_t1_params(cfg.k, cfg.strength); + size_t t1_temp_bytes = 0; + launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, + nullptr, nullptr, d_count, cap, + nullptr, &t1_temp_bytes, q); + q.memset(d_count, 0, sizeof(uint64_t)); + int p_t1 = begin_phase("T1 match"); + launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, + d_t1_meta, d_t1_mi, d_count, cap, + d_match_temp, &t1_temp_bytes, q); + end_phase(p_t1); + + // No explicit sync: the next cudaMemcpy (non-async, default stream) + // implicitly drains prior stream work before the host reads t1_count. + uint64_t t1_count = 0; + q.memcpy(&t1_count, d_count, sizeof(uint64_t)).wait(); + if (t1_count > cap) throw std::runtime_error("T1 overflow"); + validate_t1_count(t1_count, cfg.k); + + + // Sort T1 by match_info (low k bits). d_storage is now repurposed + // as (keys_in, keys_out, vals_in, vals_out), Xs having been fully + // consumed by T1 match above. T1 match emits match_info in a SoA + // stream (d_t1_mi), so we feed that directly to CUB as the sort key + // input rather than extracting from a packed struct. + int p_t1_sort = begin_phase("T1 sort"); + { + launch_init_u32_identity(d_vals_in, t1_count, q); + size_t sort_bytes = pool.sort_scratch_bytes; + launch_sort_pairs_u32_u32( + d_sort_scratch, sort_bytes, + d_t1_mi, d_keys_out, d_vals_in, d_vals_out, + t1_count, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + + launch_gather_u64(d_t1_meta, d_vals_out, d_t1_meta_sorted, t1_count, q); + } + end_phase(p_t1_sort); + + // ---------- Phase T2 ---------- + // Sorted T1 = (d_t1_meta_sorted: uint64 meta, d_keys_out: uint32 match_info). + // No AoS struct anymore — saves 33 % of sorted-T1 bandwidth on both the + // permute write and the match-kernel hot path. + auto t2p = make_t2_params(cfg.k, cfg.strength); + size_t t2_temp_bytes = 0; + launch_t2_match(cfg.plot_id.data(), t2p, nullptr, nullptr, t1_count, + nullptr, nullptr, nullptr, d_count, cap, + nullptr, &t2_temp_bytes, q); + q.memset(d_count, 0, sizeof(uint64_t)); + int p_t2 = begin_phase("T2 match"); + launch_t2_match(cfg.plot_id.data(), t2p, d_t1_meta_sorted, d_keys_out, t1_count, + d_t2_meta, d_t2_mi, d_t2_xbits, d_count, cap, + d_match_temp, &t2_temp_bytes, q); + end_phase(p_t2); + + uint64_t t2_count = 0; + q.memcpy(&t2_count, d_count, sizeof(uint64_t)).wait(); + if (t2_count > cap) throw std::runtime_error("T2 overflow"); + + int p_t2_sort = begin_phase("T2 sort"); + { + // T2 match emitted match_info as a SoA stream (d_t2_mi) — feed + // it straight into CUB as the sort key input rather than + // re-extracting from a packed struct. vals_in just needs a + // 0..n-1 identity fill. 
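// The identity-fill -> key/value sort -> permute sequence here (and in the
// T1 sort above) is effectively an argsort: vals_in[i] = i, sorting
// (mi[i], vals_in[i]) by mi yields vals_out = argsort(mi), and the permute
// then applies out[i] = in[vals_out[i]], so the wide 8-16 B payload is
// gathered once rather than sorted directly.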
+ launch_init_u32_identity(d_vals_in, t2_count, q); + size_t sort_bytes = pool.sort_scratch_bytes; + launch_sort_pairs_u32_u32( + d_sort_scratch, sort_bytes, + d_t2_mi, d_keys_out, d_vals_in, d_vals_out, + t2_count, 0, cfg.k, q); + + launch_permute_t2(d_t2_meta, d_t2_xbits, d_vals_out, + d_t2_meta_sorted, d_t2_xbits_sorted, t2_count, q); + } + end_phase(p_t2_sort); + + // ---------- Phase T3 ---------- + // d_keys_out now holds the T2 sorted match_info (T1's was overwritten by + // the T2 sort above) — pass as the slim stream for binary search in T3. + auto t3p = make_t3_params(cfg.k, cfg.strength); + size_t t3_temp_bytes = 0; + launch_t3_match(cfg.plot_id.data(), t3p, + d_t2_meta_sorted, d_t2_xbits_sorted, + nullptr, t2_count, + d_t3, d_count, cap, + nullptr, &t3_temp_bytes, q); + q.memset(d_count, 0, sizeof(uint64_t)); + int p_t3 = begin_phase("T3 match + Feistel"); + launch_t3_match(cfg.plot_id.data(), t3p, + d_t2_meta_sorted, d_t2_xbits_sorted, + d_keys_out, t2_count, + d_t3, d_count, cap, + d_match_temp, &t3_temp_bytes, q); + end_phase(p_t3); + + uint64_t t3_count = 0; + q.memcpy(&t3_count, d_count, sizeof(uint64_t)).wait(); + if (t3_count > cap) throw std::runtime_error("T3 overflow"); + + // Sort T3 by proof_fragment (low 2k bits). T3PairingGpu is just a + // uint64_t, so reinterpret the d_pair_a slot directly. + uint64_t* d_frags_in = reinterpret_cast(d_t3); + int p_t3_sort = begin_phase("T3 sort"); + { + size_t sort_bytes = pool.sort_scratch_bytes; + launch_sort_keys_u64( + d_sort_scratch, sort_bytes, + d_frags_in, d_frags_out, + t3_count, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, q); + } + end_phase(p_t3_sort); + + // ---------- D2H ---------- + int p_d2h = begin_phase("D2H copy T3 fragments (pinned)"); + GpuPipelineResult result; + result.t1_count = t1_count; + result.t2_count = t2_count; + result.t3_count = t3_count; + + if (t3_count > 0) { + q.memcpy(h_pinned_t3, d_frags_out, sizeof(uint64_t) * t3_count); + q.wait(); + } + end_phase(p_d2h); + + if (t3_count > 0) { + // Borrow: caller (batch producer) promises to finish consuming this + // pinned slot before reusing it for another plot. + result.external_fragments_ptr = h_pinned_t3; + result.external_fragments_count = t3_count; + } + + // Xs gen / sort per-phase timings stubbed in slice 17b — see profiling + // notes above. + + // Release d_pair_a so it isn't held between plots in a batch run. + // At ~5 ms/alloc on amdgcn (sycl::malloc_device effectively just + // reserves virtual address space), the per-plot realloc cost is + // below noise, but freeing 4.36 GB during the inter-plot gap means + // the pool path is viable on cards with ~7-8 GiB free that would + // otherwise hit InsufficientVramError and fall back to streaming. + // The final q.wait() inside the D2H block above has already drained + // T3 sort so the buffer is safe to free. + pool.release_pair_a(); + + report_phases(); + return result; +} + +GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg) +{ + // Explicit override for callers that want the streaming path without + // having to rebuild anything. Handy for testing and for users who know + // their hardware won't fit the pool. + if (char const* env = std::getenv("XCHPLOT2_STREAMING"); + env && env[0] == '1') + { + return run_gpu_pipeline_streaming(cfg); + } + + // Default: build a transient pool and run through it. Pays the full + // per-call allocator overhead (~2.4 s for k=28) — batch callers should + // construct a pool once and reuse it via the 3-arg overload. 
+ // + // On insufficient device VRAM the pool ctor throws + // InsufficientVramError; catch it specifically and fall back to + // streaming so users on small-VRAM cards get a working plot with no + // flags. Other CUDA errors propagate. + try { + GpuBufferPool pool(cfg.k, cfg.strength, cfg.testnet); + GpuPipelineResult r = run_gpu_pipeline(cfg, pool, /*pinned_index=*/0); + // Pool (and its pinned buffer) is about to be destroyed, so + // materialise a self-contained copy before returning. + if (r.external_fragments_ptr && r.external_fragments_count > 0) { + r.t3_fragments_storage.resize(r.external_fragments_count); + std::memcpy(r.t3_fragments_storage.data(), + r.external_fragments_ptr, + sizeof(uint64_t) * r.external_fragments_count); + } + r.external_fragments_ptr = nullptr; + r.external_fragments_count = 0; + return r; + } catch (InsufficientVramError const& e) { + std::fprintf(stderr, + "[xchplot2] pool needs %.2f GiB, only %.2f GiB free of " + "%.2f GiB — falling back to streaming pipeline\n", + e.required_bytes / double(1ULL << 30), + e.free_bytes / double(1ULL << 30), + e.total_bytes / double(1ULL << 30)); + return run_gpu_pipeline_streaming(cfg); + } +} + +// ===================================================================== +// Streaming pipeline — per-phase cudaMalloc / cudaFree, no persistent pool. +// +// Only buffers required for the CURRENT and NEXT phase are resident at any +// point. Tiled sorts + SoA emission drive the peak down under 8 GB at +// k=28, so an 8 GB card can run this path. +// +// The implementation body below accepts an optional caller-provided +// pinned D2H buffer — used by BatchPlotter to amortise cudaMallocHost +// across plots and double-buffer the D2H with the FSE consumer. +// +// Exception safety: device allocations are routed through StreamingStats, +// whose destructor frees anything still live if the pipeline throws +// partway, so device buffers are not leaked across batch iterations. +// Host-side USM staging buffers allocated outside that tracker can still +// leak on a mid-pipeline throw; the CLI terminates on exception anyway, +// so the OS reclaims them at exit. If we later embed this in a long-lived +// process those can grow RAII owners without changing the public surface. +// ===================================================================== +namespace { // anon: shared impl, not part of the public API. 
+ +GpuPipelineResult run_gpu_pipeline_streaming_impl( + GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, // nullable + size_t pinned_capacity, // count, not bytes; ignored if pinned_dst null + StreamingPinnedScratch const& scratch); // any field nullptr → per-plot malloc_host fallback + +} // namespace + +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg) +{ + + sycl::queue& q = sycl_backend::queue(); + return run_gpu_pipeline_streaming_impl(cfg, /*pinned_dst=*/nullptr, + /*pinned_capacity=*/0, + StreamingPinnedScratch{}); +} + +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity) +{ + if (!pinned_dst || pinned_capacity == 0) { + throw std::runtime_error( + "run_gpu_pipeline_streaming(cfg, pinned, cap): pinned buffer must be non-null"); + } + return run_gpu_pipeline_streaming_impl(cfg, pinned_dst, pinned_capacity, + StreamingPinnedScratch{}); +} + +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity, + StreamingPinnedScratch const& scratch) +{ + if (!pinned_dst || pinned_capacity == 0) { + throw std::runtime_error( + "run_gpu_pipeline_streaming(cfg, pinned, cap, scratch): pinned buffer must be non-null"); + } + return run_gpu_pipeline_streaming_impl(cfg, pinned_dst, pinned_capacity, scratch); +} + +namespace { + +GpuPipelineResult run_gpu_pipeline_streaming_impl( + GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity, + StreamingPinnedScratch const& scratch) +{ + + sycl::queue& q = sycl_backend::queue(); + if (cfg.k < 18 || cfg.k > 32 || (cfg.k & 1) != 0) { + throw std::runtime_error("k must be even in [18, 32]"); + } + if (cfg.strength < 2) { + throw std::runtime_error("strength must be >= 2"); + } + + int const num_section_bits = (cfg.k < 28) ? 2 : (cfg.k - 26); + uint64_t const total_xs = 1ULL << cfg.k; + uint64_t const cap = + max_pairs_per_section(cfg.k, num_section_bits) * + (1ULL << num_section_bits); + + constexpr int kThreads = 256; + auto blocks = [&](uint64_t n) { + return unsigned((n + kThreads - 1) / kThreads); + }; + + StreamingStats stats; + s_init_from_env(stats); + + // ---- per-phase wall-time profiling ---- + // Identical shape to the pool path (run_gpu_pipeline above); the + // [phase-timing] output format matches so POS2GPU_PHASE_TIMING=1 now + // produces the same breakdown whether the pipeline runs the pool path or + // falls back to streaming. On 12 GiB cards at k=28 (where the pool + // overflows and we always stream) this is the only way to see + // which phase is eating the wall. 
+ bool const phase_timing = cfg.profile || [] { + char const* v = std::getenv("POS2GPU_PHASE_TIMING"); + return v && v[0] == '1'; + }(); + using phase_clock = std::chrono::steady_clock; + std::vector> phase_starts; + std::vector> phase_records; + auto begin_phase = [&](char const* label) -> int { + if (!phase_timing) return -1; + q.wait(); + phase_starts.emplace_back(label, phase_clock::now()); + return static_cast(phase_starts.size() - 1); + }; + auto end_phase = [&](int idx) { + if (idx < 0) return; + q.wait(); + auto const t1 = phase_clock::now(); + auto const& [name, t0] = phase_starts[idx]; + double const ms = std::chrono::duration(t1 - t0).count(); + phase_records.emplace_back(name, ms); + }; + auto report_phases = [&]() { + if (!phase_timing || phase_records.empty()) return; + double total = 0.0; + for (auto const& [_n, ms] : phase_records) total += ms; + std::fprintf(stderr, "[phase-timing]"); + for (auto const& [name, ms] : phase_records) { + std::fprintf(stderr, " %s=%.1fms(%.0f%%)", + name, ms, total > 0.0 ? 100.0 * ms / total : 0.0); + } + std::fprintf(stderr, " total=%.1fms\n", total); + }; + + // --- pipeline-wide tiny allocations --- + // d_counter: per-phase uint64 count output (reused). + // The match kernels each need their own temp-storage buffer sized via + // their size query; we allocate it per-phase rather than globally so + // that the peak VRAM is the phase's alone. + stats.phase = "init"; + uint64_t* d_counter = nullptr; + s_malloc(stats, d_counter, sizeof(uint64_t), "d_counter"); + + // ---------- Phase Xs (stage 4e: inlined gen+sort+pack) ---------- + // launch_construct_xs lumps keys_a/keys_b/vals_a/vals_b into a single + // d_xs_temp blob (~4 GB at k=28). keys_a+vals_a are dead after the + // CUB sort but can't be freed because they're interior slices of a + // single allocation. Inline the three sub-kernels so we can: + // 1. alloc cub_scratch + keys_a + vals_a + // 2. gen fills keys_a, vals_a + // 3. alloc keys_b + vals_b + // 4. CUB sort keys_a/vals_a -> keys_b/vals_b; keys_a/vals_a now dead + // 5. free cub_scratch + keys_a + vals_a <- 2078 MB freed + // 6. alloc d_xs + // 7. pack keys_b/vals_b -> d_xs + // 8. free keys_b + vals_b + // Phase peak at k=28 drops from d_xs (2048) + d_xs_temp (4128) = + // 6176 MB to max(sort 4126 MB, pack 4096 MB) = 4126 MB. + stats.phase = "Xs"; + + AesHashKeys const xs_keys = make_keys(cfg.plot_id.data()); + uint32_t const xs_xor_const = cfg.testnet ? 0xA3B1C4D7u : 0u; + + XsCandidateGpu* d_xs = nullptr; + uint32_t* d_xs_keys_b = nullptr; + uint32_t* d_xs_vals_b = nullptr; + + bool const xs_sliced = !scratch.plain_mode && scratch.gather_tile_count > 1; + + if (!xs_sliced) { + // Compact / plain — full-cap gen+sort+pack (4128 MB sort peak). + size_t xs_cub_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, xs_cub_bytes, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + total_xs, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + + void* d_xs_cub_scratch = nullptr; + uint32_t* d_xs_keys_a = nullptr; + uint32_t* d_xs_vals_a = nullptr; + s_malloc(stats, d_xs_cub_scratch, xs_cub_bytes, "d_xs_cub"); + s_malloc(stats, d_xs_keys_a, total_xs * sizeof(uint32_t), "d_xs_keys_a"); + s_malloc(stats, d_xs_vals_a, total_xs * sizeof(uint32_t), "d_xs_vals_a"); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + // Sentinel-fill keys_a / vals_a head/mid/tail with 0xCD. + uint64_t const off_mid = total_xs / 2; + uint64_t const off_tail = (total_xs >= 16ULL) ? 
total_xs - 16ULL : 0ULL; + q.memset(d_xs_keys_a, 0xCD, 64).wait(); + q.memset(d_xs_keys_a + off_mid, 0xCD, 64).wait(); + q.memset(d_xs_keys_a + off_tail, 0xCD, 64).wait(); + q.memset(d_xs_vals_a, 0xCD, 64).wait(); + q.memset(d_xs_vals_a + off_mid, 0xCD, 64).wait(); + q.memset(d_xs_vals_a + off_tail, 0xCD, 64).wait(); + + // Trivial-kernel sanity: writes 0xDEADBEEF to keys_a[0..16] + // with no LDS / no captured struct / no AES. If this + // produces 0xCDCDCDCD post-launch, AdaptiveCpp's HIP + // submission path is producing no-op stubs for ANY kernel + // — the problem is below our level. If it produces + // 0xDEADBEEF, simple kernels work and the issue is + // specific to the cooperative-LDS / AES kernel pattern. + { + uint32_t* p = d_xs_keys_a; + q.parallel_for( + sycl::nd_range<1>{256, 256}, + [=](sycl::nd_item<1> it) { + size_t idx = it.get_global_id(0); + if (idx < 16) p[idx] = 0xDEADBEEFu; + }).wait(); + uint32_t check[16] = {}; + q.memcpy(check, d_xs_keys_a, 16 * sizeof(uint32_t)).wait(); + bool const ok = (check[0] == 0xDEADBEEFu); + std::fprintf(stderr, + "[t1-debug] trivial kernel test: %s (keys_a[0]=0x%08x)\n", + ok ? "PASS — simple kernels can write" + : "FAIL — kernel writes are not landing", + check[0]); + // Restore sentinel since the trivial kernel overwrote + // the head region. + q.memset(d_xs_keys_a, 0xCD, 64).wait(); + } + + // Dump d_aes_tables[0..16]. Standard AES T0[0] = 0xC66363A5. + // If we see 0xBE / 0xCD here, the T-table USM buffer was + // never populated by aes_tables_device's q.memcpy — kernels + // would then read garbage and produce nothing useful. + { + uint32_t* d_tables = sycl_backend::aes_tables_device(q); + uint32_t aes_check[16] = {}; + q.memcpy(aes_check, d_tables, 16 * sizeof(uint32_t)).wait(); + std::fprintf(stderr, + "[t1-debug] d_aes_tables[0..16] (T0[a] = (2S[a],S[a],S[a],3S[a]) packed LE; T0[0] = 0xa56363c6):\n"); + for (int i = 0; i < 16; ++i) { + std::fprintf(stderr, " [%2d] 0x%08x\n", i, aes_check[i]); + } + } + } + + int p_xs = begin_phase("Xs gen+sort"); + launch_xs_gen(xs_keys, d_xs_keys_a, d_xs_vals_a, total_xs, + cfg.k, xs_xor_const, q); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + uint64_t const sn = (total_xs < 16ULL) ? total_xs : 16ULL; + uint64_t const off_mid = total_xs / 2; + uint64_t const off_tail = (total_xs >= 16ULL) ? 
total_xs - 16ULL : 0ULL; + uint32_t ka_h[16] = {}, va_h[16] = {}; + uint32_t ka_m[16] = {}, va_m[16] = {}; + uint32_t ka_t[16] = {}, va_t[16] = {}; + q.memcpy(ka_h, d_xs_keys_a, sn * sizeof(uint32_t)).wait(); + q.memcpy(va_h, d_xs_vals_a, sn * sizeof(uint32_t)).wait(); + q.memcpy(ka_m, d_xs_keys_a + off_mid, sn * sizeof(uint32_t)).wait(); + q.memcpy(va_m, d_xs_vals_a + off_mid, sn * sizeof(uint32_t)).wait(); + q.memcpy(ka_t, d_xs_keys_a + off_tail, sn * sizeof(uint32_t)).wait(); + q.memcpy(va_t, d_xs_vals_a + off_tail, sn * sizeof(uint32_t)).wait(); + std::fprintf(stderr, + "[t1-debug] post-xs_gen total_xs=%llu (head idx=0, mid idx=%llu, tail idx=%llu):\n", + (unsigned long long)total_xs, + (unsigned long long)off_mid, (unsigned long long)off_tail); + for (uint64_t i = 0; i < sn; ++i) { + std::fprintf(stderr, + " H[%2llu] ka=0x%08x va=0x%08x M[%2llu] ka=0x%08x va=0x%08x T[%2llu] ka=0x%08x va=0x%08x\n", + (unsigned long long)i, ka_h[i], va_h[i], + (unsigned long long)(off_mid + i), ka_m[i], va_m[i], + (unsigned long long)(off_tail + i), ka_t[i], va_t[i]); + } + } + + s_malloc(stats, d_xs_keys_b, total_xs * sizeof(uint32_t), "d_xs_keys_b"); + s_malloc(stats, d_xs_vals_b, total_xs * sizeof(uint32_t), "d_xs_vals_b"); + + launch_sort_pairs_u32_u32( + d_xs_cub_scratch, xs_cub_bytes, + d_xs_keys_a, d_xs_keys_b, + d_xs_vals_a, d_xs_vals_b, + total_xs, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + end_phase(p_xs); + + s_free(stats, d_xs_cub_scratch); + s_free(stats, d_xs_keys_a); + s_free(stats, d_xs_vals_a); + + s_malloc(stats, d_xs, total_xs * sizeof(XsCandidateGpu), "d_xs"); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + uint64_t const sn = (total_xs < 16ULL) ? total_xs : 16ULL; + uint64_t const off_mid = total_xs / 2; + uint64_t const off_tail = (total_xs >= 16ULL) ? total_xs - 16ULL : 0ULL; + uint32_t kb_h[16] = {}, vb_h[16] = {}; + uint32_t kb_m[16] = {}, vb_m[16] = {}; + uint32_t kb_t[16] = {}, vb_t[16] = {}; + q.memcpy(kb_h, d_xs_keys_b, sn * sizeof(uint32_t)).wait(); + q.memcpy(vb_h, d_xs_vals_b, sn * sizeof(uint32_t)).wait(); + q.memcpy(kb_m, d_xs_keys_b + off_mid, sn * sizeof(uint32_t)).wait(); + q.memcpy(vb_m, d_xs_vals_b + off_mid, sn * sizeof(uint32_t)).wait(); + q.memcpy(kb_t, d_xs_keys_b + off_tail, sn * sizeof(uint32_t)).wait(); + q.memcpy(vb_t, d_xs_vals_b + off_tail, sn * sizeof(uint32_t)).wait(); + std::fprintf(stderr, + "[t1-debug] post-xs_sort total_xs=%llu (head idx=0, mid idx=%llu, tail idx=%llu):\n", + (unsigned long long)total_xs, + (unsigned long long)off_mid, (unsigned long long)off_tail); + for (uint64_t i = 0; i < sn; ++i) { + std::fprintf(stderr, + " H[%2llu] kb=0x%08x vb=0x%08x M[%2llu] kb=0x%08x vb=0x%08x T[%2llu] kb=0x%08x vb=0x%08x\n", + (unsigned long long)i, kb_h[i], vb_h[i], + (unsigned long long)(off_mid + i), kb_m[i], vb_m[i], + (unsigned long long)(off_tail + i), kb_t[i], vb_t[i]); + } + } + + int p_xs_pack = begin_phase("Xs pack"); + launch_xs_pack(d_xs_keys_b, d_xs_vals_b, d_xs, total_xs, q); + end_phase(p_xs_pack); + + s_free(stats, d_xs_keys_b); + s_free(stats, d_xs_vals_b); + } else { + // Sliced (minimal). Tile gen+sort in N=2 position halves into + // cap/2 device buffers, D2H per tile to USM-host. Then merge + // host-pinned tile outputs into device d_xs_keys_b + d_xs_vals_b + // (full cap). Then pack in N=2 halves with D2H per tile to a + // host-pinned XsCandidateGpu accumulator. Finally rehydrate + // d_xs from host pinned. Drops sort peak from 4128 MB → 2056 MB + // and pack peak from 4096 MB → 3072 MB at k=28. 
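// Arithmetic behind those peak figures at k=28 (total_xs = 2^28, u32 = 4 B,
// XsCandidateGpu = 8 B; the small remainders are the implied CUB scratch):
//   full sort : 4 x total_xs*4 B                         = 4096 MB, ~4128 MB with scratch
//   tiled sort: 4 x (total_xs/2)*4 B                     = 2048 MB, ~2056 MB with scratch
//   full pack : d_xs 2048 MB + keys_b/vals_b 2048 MB     = 4096 MB
//   tiled pack: keys_b/vals_b 2048 MB + pack tile 1024 MB = 3072 MB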
+ uint64_t const xs_tile_n0 = total_xs / 2; + uint64_t const xs_tile_n1 = total_xs - xs_tile_n0; + uint64_t const xs_tile_max = (xs_tile_n0 > xs_tile_n1) ? xs_tile_n0 : xs_tile_n1; + + size_t xs_cub_tile_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, xs_cub_tile_bytes, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + xs_tile_max, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + + void* d_xs_cub_scratch = nullptr; + uint32_t* d_xs_keys_a_tile = nullptr; + uint32_t* d_xs_vals_a_tile = nullptr; + uint32_t* d_xs_keys_b_tile = nullptr; + uint32_t* d_xs_vals_b_tile = nullptr; + s_malloc(stats, d_xs_keys_a_tile, xs_tile_max * sizeof(uint32_t), "d_xs_keys_a_tile"); + s_malloc(stats, d_xs_vals_a_tile, xs_tile_max * sizeof(uint32_t), "d_xs_vals_a_tile"); + s_malloc(stats, d_xs_keys_b_tile, xs_tile_max * sizeof(uint32_t), "d_xs_keys_b_tile"); + s_malloc(stats, d_xs_vals_b_tile, xs_tile_max * sizeof(uint32_t), "d_xs_vals_b_tile"); + s_malloc(stats, d_xs_cub_scratch, xs_cub_tile_bytes, "d_xs_cub"); + + uint32_t* h_xs_keys = static_cast( + sycl::malloc_host(total_xs * sizeof(uint32_t), q)); + if (!h_xs_keys) throw std::runtime_error("sycl::malloc_host(h_xs_keys) failed"); + uint32_t* h_xs_vals = static_cast( + sycl::malloc_host(total_xs * sizeof(uint32_t), q)); + if (!h_xs_vals) throw std::runtime_error("sycl::malloc_host(h_xs_vals) failed"); + + int p_xs = begin_phase("Xs gen+sort"); + auto run_tile = [&](uint64_t pos_begin, uint64_t pos_end, uint64_t out_offset) { + uint64_t tile_n = pos_end - pos_begin; + if (tile_n == 0) return; + launch_xs_gen_range( + xs_keys, d_xs_keys_a_tile, d_xs_vals_a_tile, + pos_begin, pos_end, cfg.k, xs_xor_const, q); + launch_sort_pairs_u32_u32( + d_xs_cub_scratch, xs_cub_tile_bytes, + d_xs_keys_a_tile, d_xs_keys_b_tile, + d_xs_vals_a_tile, d_xs_vals_b_tile, + tile_n, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + q.memcpy(h_xs_keys + out_offset, d_xs_keys_b_tile, + tile_n * sizeof(uint32_t)).wait(); + q.memcpy(h_xs_vals + out_offset, d_xs_vals_b_tile, + tile_n * sizeof(uint32_t)).wait(); + }; + run_tile(0, xs_tile_n0, 0); + run_tile(xs_tile_n0, total_xs, xs_tile_n0); + end_phase(p_xs); + + s_free(stats, d_xs_cub_scratch); + s_free(stats, d_xs_vals_b_tile); + s_free(stats, d_xs_keys_b_tile); + s_free(stats, d_xs_vals_a_tile); + s_free(stats, d_xs_keys_a_tile); + + // Full-cap merge outputs on device. Merge from USM-host inputs. + s_malloc(stats, d_xs_keys_b, total_xs * sizeof(uint32_t), "d_xs_keys_b"); + s_malloc(stats, d_xs_vals_b, total_xs * sizeof(uint32_t), "d_xs_vals_b"); + launch_merge_pairs_stable_2way_u32_u32( + h_xs_keys + 0, h_xs_vals + 0, xs_tile_n0, + h_xs_keys + xs_tile_n0, h_xs_vals + xs_tile_n0, xs_tile_n1, + d_xs_keys_b, d_xs_vals_b, total_xs, q); + sycl::free(h_xs_keys, q); + sycl::free(h_xs_vals, q); + + // Tiled pack. d_xs_pack_tile (cap/2 × XsCandidate = 1024 MB + // at k=28) reuses across tiles; the packed output collects on + // host pinned h_xs (cap × XsCandidate = 2048 MB host). + uint64_t const pack_tile_n0 = total_xs / 2; + uint64_t const pack_tile_n1 = total_xs - pack_tile_n0; + uint64_t const pack_tile_max = (pack_tile_n0 > pack_tile_n1) ? 
pack_tile_n0 : pack_tile_n1; + + XsCandidateGpu* d_xs_pack_tile = nullptr; + s_malloc(stats, d_xs_pack_tile, pack_tile_max * sizeof(XsCandidateGpu), "d_xs_pack_tile"); + + XsCandidateGpu* h_xs = static_cast( + sycl::malloc_host(total_xs * sizeof(XsCandidateGpu), q)); + if (!h_xs) throw std::runtime_error("sycl::malloc_host(h_xs) failed"); + + int p_xs_pack = begin_phase("Xs pack"); + if (pack_tile_n0 > 0) { + launch_xs_pack_range(d_xs_keys_b + 0, d_xs_vals_b + 0, + d_xs_pack_tile, pack_tile_n0, q); + q.memcpy(h_xs + 0, d_xs_pack_tile, + pack_tile_n0 * sizeof(XsCandidateGpu)).wait(); + } + if (pack_tile_n1 > 0) { + launch_xs_pack_range(d_xs_keys_b + pack_tile_n0, + d_xs_vals_b + pack_tile_n0, + d_xs_pack_tile, pack_tile_n1, q); + q.memcpy(h_xs + pack_tile_n0, d_xs_pack_tile, + pack_tile_n1 * sizeof(XsCandidateGpu)).wait(); + } + end_phase(p_xs_pack); + + s_free(stats, d_xs_pack_tile); + s_free(stats, d_xs_keys_b); + s_free(stats, d_xs_vals_b); + d_xs_keys_b = nullptr; + d_xs_vals_b = nullptr; + + // Re-hydrate full d_xs on device from host pinned. + s_malloc(stats, d_xs, total_xs * sizeof(XsCandidateGpu), "d_xs"); + q.memcpy(d_xs, h_xs, total_xs * sizeof(XsCandidateGpu)).wait(); + sycl::free(h_xs, q); + } + + // ---------- Phase T1 match ---------- + // SoA output: meta (uint64) + mi (uint32). Same 12 B/pair as the old + // AoS struct, but the two streams can be freed independently — we + // drop d_t1_mi as soon as CUB consumes it in the T1 sort phase. + // + // Minimal mode (gather_tile_count > 1) splits T1 match into N= + // num_sections passes (one per section_l) with cap/N staging + // outputs that are D2H'd to host pinned per pass — keeps d_xs + + // d_t1_meta + d_t1_mi from being co-resident at full-cap. Drops + // the T1 match peak from + // d_xs (2048) + d_t1_meta (2080) + d_t1_mi (1040) = 5168 MB + // to + // d_xs (2048) + d_t1_meta_stage (cap/N × 8) + + // d_t1_mi_stage (cap/N × 4) = ~2870 MB at k=28 N=4. + // + // d_t1_meta + d_t1_mi (full cap) are then re-allocated on device + // for T1 sort, with the data H2D'd from host pinned. d_t1_meta + // stays parked on h_t1_meta across T1 sort exactly as in compact + // mode (the existing park dance is skipped — data is already on + // host). + bool const t1_match_sliced = !scratch.plain_mode && scratch.gather_tile_count > 1; + + stats.phase = "T1 match"; + auto t1p = make_t1_params(cfg.k, cfg.strength); + size_t t1_temp_bytes = 0; + launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, + nullptr, nullptr, d_counter, cap, + nullptr, &t1_temp_bytes, q); + + uint64_t* d_t1_meta = nullptr; + uint32_t* d_t1_mi = nullptr; + void* d_t1_match_temp = nullptr; + + // Lift h_t1_meta / h_t1_mi out of the T1 sort scope so the sliced + // T1 match path can populate them directly. h_t1_mi is sliced-only + // — it's freed in T1 sort once CUB has consumed the H2D'd copy. + bool const h_meta_owned = (!scratch.plain_mode && scratch.h_meta == nullptr); + uint64_t* h_t1_meta = nullptr; + bool h_t1_mi_owned = false; + uint32_t* h_t1_mi = nullptr; + + uint64_t t1_count = 0; + + if (!t1_match_sliced) { + // Single-shot path (compact / plain): d_t1_meta + d_t1_mi + // allocated full-cap on device. + s_malloc(stats, d_t1_meta, cap * sizeof(uint64_t), "d_t1_meta"); + s_malloc(stats, d_t1_mi, cap * sizeof(uint32_t), "d_t1_mi"); + s_malloc(stats, d_t1_match_temp, t1_temp_bytes, "d_t1_match_temp"); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + uint64_t const sample_n = (total_xs < 16ULL) ? 
total_xs : 16ULL; + XsCandidateGpu sample[16] = {}; + q.memcpy(sample, d_xs, sample_n * sizeof(XsCandidateGpu)).wait(); + std::fprintf(stderr, + "[t1-debug] plain pre-launch k=%d total_xs=%llu cap=%llu d_xs[0..%llu]:\n", + cfg.k, (unsigned long long)total_xs, + (unsigned long long)cap, (unsigned long long)sample_n); + for (uint64_t i = 0; i < sample_n; ++i) { + std::fprintf(stderr, + " [%2llu] match_info=0x%08x x=0x%08x\n", + (unsigned long long)i, sample[i].match_info, sample[i].x); + } + } + + int p_t1 = begin_phase("T1 match"); + q.memset(d_counter, 0, sizeof(uint64_t)); + launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, + d_t1_meta, d_t1_mi, d_counter, cap, + d_t1_match_temp, &t1_temp_bytes, q); + end_phase(p_t1); + + q.memcpy(&t1_count, d_counter, sizeof(uint64_t)).wait(); + if (t1_count > cap) throw std::runtime_error("T1 overflow"); + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + std::fprintf(stderr, + "[t1-debug] plain post-launch t1_count=%llu\n", + (unsigned long long)t1_count); + } + validate_t1_count(t1_count, cfg.k); + + s_free(stats, d_t1_match_temp); + s_free(stats, d_xs); + } else { + // Sliced path (minimal): N=num_sections passes with cap/N + // staging buffers. Output accumulates on host pinned, then + // d_t1_mi + h_t1_meta receive their final populations after + // d_xs is freed. + uint32_t const t1_num_sections = 1u << t1p.num_section_bits; + uint32_t const t1_num_match_keys = 1u << t1p.num_match_key_bits; + // 25% safety over the per-section average expected output. + uint64_t const t1_section_cap = + ((cap + t1_num_sections - 1) / t1_num_sections) * 5ULL / 4ULL; + + s_malloc(stats, d_t1_match_temp, t1_temp_bytes, "d_t1_match_temp"); + + // Compute bucket + fine-bucket offsets once; passes share them. + // Also zeros d_counter. + launch_t1_match_prepare(cfg.plot_id.data(), t1p, d_xs, total_xs, + d_counter, d_t1_match_temp, &t1_temp_bytes, q); + + // Host pinned full-cap accumulators for meta + mi. + h_t1_meta = h_meta_owned + ? static_cast(sycl::malloc_host(cap * sizeof(uint64_t), q)) + : scratch.h_meta; + if (!h_t1_meta) throw std::runtime_error("sycl::malloc_host(h_t1_meta) failed"); + h_t1_mi_owned = true; + h_t1_mi = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_t1_mi) throw std::runtime_error("sycl::malloc_host(h_t1_mi) failed"); + + // Per-pass staging device buffers (cap/N). + uint64_t* d_t1_meta_stage = nullptr; + uint32_t* d_t1_mi_stage = nullptr; + s_malloc(stats, d_t1_meta_stage, t1_section_cap * sizeof(uint64_t), "d_t1_meta_stage"); + s_malloc(stats, d_t1_mi_stage, t1_section_cap * sizeof(uint32_t), "d_t1_mi_stage"); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + uint64_t const sample_n = (total_xs < 16ULL) ? 
total_xs : 16ULL; + XsCandidateGpu sample[16] = {}; + q.memcpy(sample, d_xs, sample_n * sizeof(XsCandidateGpu)).wait(); + std::fprintf(stderr, + "[t1-debug] sliced pre-launch k=%d total_xs=%llu cap=%llu d_xs[0..%llu]:\n", + cfg.k, (unsigned long long)total_xs, + (unsigned long long)cap, (unsigned long long)sample_n); + for (uint64_t i = 0; i < sample_n; ++i) { + std::fprintf(stderr, + " [%2llu] match_info=0x%08x x=0x%08x\n", + (unsigned long long)i, sample[i].match_info, sample[i].x); + } + } + + int p_t1 = begin_phase("T1 match"); + uint64_t host_offset = 0; + for (uint32_t section_l = 0; section_l < t1_num_sections; ++section_l) { + uint32_t const bucket_begin = section_l * t1_num_match_keys; + uint32_t const bucket_end = (section_l + 1) * t1_num_match_keys; + + launch_t1_match_range( + cfg.plot_id.data(), t1p, d_xs, total_xs, + d_t1_meta_stage, d_t1_mi_stage, d_counter, t1_section_cap, + d_t1_match_temp, bucket_begin, bucket_end, q); + + uint64_t pass_count = 0; + q.memcpy(&pass_count, d_counter, sizeof(uint64_t)).wait(); + if (pass_count > t1_section_cap) { + throw std::runtime_error( + "T1 match (sliced) section_l=" + std::to_string(section_l) + + " produced " + std::to_string(pass_count) + + " pairs, staging holds " + std::to_string(t1_section_cap) + + ". Increase t1_section_cap safety factor."); + } + q.memcpy(h_t1_meta + host_offset, d_t1_meta_stage, + pass_count * sizeof(uint64_t)).wait(); + q.memcpy(h_t1_mi + host_offset, d_t1_mi_stage, + pass_count * sizeof(uint32_t)).wait(); + host_offset += pass_count; + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + } + end_phase(p_t1); + + t1_count = host_offset; + if (t1_count > cap) throw std::runtime_error("T1 overflow"); + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + std::fprintf(stderr, + "[t1-debug] sliced post-launch t1_count=%llu (sum across %u sections)\n", + (unsigned long long)t1_count, t1_num_sections); + } + validate_t1_count(t1_count, cfg.k); + + s_free(stats, d_t1_meta_stage); + s_free(stats, d_t1_mi_stage); + s_free(stats, d_t1_match_temp); + + // Xs fully consumed. + s_free(stats, d_xs); + + // Re-hydrate d_t1_mi full-cap on device for T1 sort (CUB + // sort key input). h_t1_meta stays on host across T1 sort. + s_malloc(stats, d_t1_mi, cap * sizeof(uint32_t), "d_t1_mi"); + q.memcpy(d_t1_mi, h_t1_mi, t1_count * sizeof(uint32_t)).wait(); + if (h_t1_mi_owned) sycl::free(h_t1_mi, q); + h_t1_mi = nullptr; + // d_t1_meta stays nullptr — h_t1_meta has the data; the + // existing T1-sort park block will see d_t1_meta == nullptr + // and skip the d_t1_meta → h_t1_meta memcpy. + } + + // Stage 4b (compact only): park d_t1_meta on pinned host across + // the T1 sort phase. d_t1_meta is only needed again for + // launch_gather_u64 at the end of T1 sort — holding it alive + // through CUB setup was responsible for the 6256 MB overall + // streaming peak (d_t1_meta 2080 + d_t1_mi 1040 + CUB working 3120 + // + scratch). JIT H2D before the gather below, free right after. + // Mirror of stage 4a for T2. + // + // Stage 4f: use caller-provided scratch when present (amortised + // across batch); fall back to per-plot malloc_host otherwise. Same + // pattern applied to h_t1_keys_merged, h_t2_*, h_t3 below. + // + // Plain mode skips the park entirely: d_t1_meta stays live through + // T1 sort. Costs ~2 GB peak but saves a PCIe round-trip. + // + // Sliced mode: h_t1_meta was already populated by the T1 match + // passes — d_t1_meta is nullptr and the park dance is skipped + // here. 
h_meta_owned + h_t1_meta were declared above (lifted out + // of the original T1-sort scope) so the rest of T1 sort sees the + // same variables in both paths. + if (!scratch.plain_mode && !t1_match_sliced) { + h_t1_meta = h_meta_owned + ? static_cast(sycl::malloc_host(cap * sizeof(uint64_t), q)) + : scratch.h_meta; + if (!h_t1_meta) throw std::runtime_error("sycl::malloc_host(h_t1_meta) failed"); + q.memcpy(h_t1_meta, d_t1_meta, t1_count * sizeof(uint64_t)).wait(); + s_free(stats, d_t1_meta); + d_t1_meta = nullptr; + } + + // ---------- Phase T1 sort (tiled, N=2) ---------- + // Partition T1 into two halves by index, CUB-sort each with scratch + // sized for the larger half, then stable 2-way merge the sorted runs + // back into the extract-input slot (d_keys_in / d_vals_in) — that + // slot is free because the CUB sort has already consumed it. + // + // N=2 is the minimal case that exercises the tile + merge path; a + // larger N shrinks per-tile CUB scratch further but needs a multi- + // way merge or a tree of pairwise merges. Phase 6 can bump N once + // Phase 4's k=28 VRAM measurement shows how tight the budget is. + uint64_t const t1_tile_n0 = t1_count / 2; + uint64_t const t1_tile_n1 = t1_count - t1_tile_n0; + uint64_t const t1_tile_max = (t1_tile_n0 > t1_tile_n1) ? t1_tile_n0 : t1_tile_n1; + + size_t t1_sort_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, t1_sort_bytes, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + t1_tile_max, 0, cfg.k, q); + + stats.phase = "T1 sort"; + // With T1 SoA emission, d_t1_mi IS the CUB key input. We only need + // d_keys_out (CUB sort output), d_vals_in (identity) + d_vals_out + // (sorted vals). d_t1_mi is freed as soon as CUB consumes it. + // + // Compact / plain: full-cap d_keys_out + d_vals_in + d_vals_out + // (1040 MB each at k=28); plus d_t1_mi (1040, full-cap input) + + // scratch ≈ 4176 MB peak. + // + // Minimal: per-tile cap/2 output buffers (520 each) instead of + // full-cap + USM-host h_keys/h_vals to collect tile outputs + + // launch_merge_pairs_stable_2way_u32_u32 reading USM-host inputs. + // Drops T1 sort CUB peak to: + // d_t1_mi (1040) + 3 × cap/2 u32 (1560) + scratch ≈ 2616 MB. + void* d_sort_scratch = nullptr; + uint32_t* d_keys_out = nullptr; // populated in compact path; minimal uses h_keys instead + uint32_t* d_vals_in = nullptr; // T2 sort below also uses this; declared at wider scope + uint32_t* d_vals_out = nullptr; // populated in compact path; minimal uses h_vals instead + uint32_t* h_keys = nullptr; // USM-host, sliced path only + uint32_t* h_vals = nullptr; // USM-host, sliced path only + + int p_t1_sort = begin_phase("T1 sort"); + + if (!t1_match_sliced) { + // Compact / plain — existing full-cap path. 
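// Live set in this branch at k=28, matching the ~4176 MB figure quoted
// above: d_t1_mi (1040 MB, the sort key input) plus d_keys_out, d_vals_in
// and d_vals_out (1040 MB each) plus the per-tile CUB scratch; d_vals_in
// and d_t1_mi are freed as soon as both tile sorts have consumed them.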
+ s_malloc(stats, d_keys_out, cap * sizeof(uint32_t), "d_keys_out"); + s_malloc(stats, d_vals_in, cap * sizeof(uint32_t), "d_vals_in"); + s_malloc(stats, d_vals_out, cap * sizeof(uint32_t), "d_vals_out"); + s_malloc(stats, d_sort_scratch, t1_sort_bytes, "d_sort_scratch(t1)"); + + launch_init_u32_identity(d_vals_in, t1_count, q); + if (t1_tile_n0 > 0) { + launch_sort_pairs_u32_u32( + d_sort_scratch, t1_sort_bytes, + d_t1_mi + 0, d_keys_out + 0, + d_vals_in + 0, d_vals_out + 0, + t1_tile_n0, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + } + if (t1_tile_n1 > 0) { + launch_sort_pairs_u32_u32( + d_sort_scratch, t1_sort_bytes, + d_t1_mi + t1_tile_n0, d_keys_out + t1_tile_n0, + d_vals_in + t1_tile_n0, d_vals_out + t1_tile_n0, + t1_tile_n1, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + } + + s_free(stats, d_sort_scratch); + s_free(stats, d_vals_in); + s_free(stats, d_t1_mi); + } else { + // Sliced — per-tile cap/2 output buffers, D2H to USM-host. + uint32_t* d_keys_out_tile = nullptr; + uint32_t* d_vals_in_tile = nullptr; + uint32_t* d_vals_out_tile = nullptr; + s_malloc(stats, d_keys_out_tile, t1_tile_max * sizeof(uint32_t), "d_t1_keys_out_tile"); + s_malloc(stats, d_vals_in_tile, t1_tile_max * sizeof(uint32_t), "d_t1_vals_in_tile"); + s_malloc(stats, d_vals_out_tile, t1_tile_max * sizeof(uint32_t), "d_t1_vals_out_tile"); + s_malloc(stats, d_sort_scratch, t1_sort_bytes, "d_sort_scratch(t1)"); + + h_keys = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_keys) throw std::runtime_error("sycl::malloc_host(h_keys t1) failed"); + h_vals = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_vals) throw std::runtime_error("sycl::malloc_host(h_vals t1) failed"); + + auto run_tile = [&](uint64_t tile_off, uint64_t tile_n) { + if (tile_n == 0) return; + uint32_t const off32 = static_cast(tile_off); + uint32_t* d_vals_in_tile_local = d_vals_in_tile; + q.parallel_for( + sycl::range<1>{ static_cast(tile_n) }, + [=](sycl::id<1> i) { + d_vals_in_tile_local[i] = off32 + uint32_t(i); + }).wait(); + launch_sort_pairs_u32_u32( + d_sort_scratch, t1_sort_bytes, + d_t1_mi + tile_off, d_keys_out_tile, + d_vals_in_tile, d_vals_out_tile, + tile_n, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + q.memcpy(h_keys + tile_off, d_keys_out_tile, + tile_n * sizeof(uint32_t)).wait(); + q.memcpy(h_vals + tile_off, d_vals_out_tile, + tile_n * sizeof(uint32_t)).wait(); + }; + run_tile(0, t1_tile_n0); + run_tile(t1_tile_n0, t1_tile_n1); + + s_free(stats, d_sort_scratch); + s_free(stats, d_vals_out_tile); + s_free(stats, d_vals_in_tile); + s_free(stats, d_keys_out_tile); + s_free(stats, d_t1_mi); + } + + // 3-pass post-CUB (merge → gather meta) — same shape as T2 sort, + // but T1 only has one gather stream (meta) so it's 2 passes here. + uint32_t* d_t1_keys_merged = nullptr; + uint32_t* d_t1_merged_vals = nullptr; + s_malloc(stats, d_t1_keys_merged, cap * sizeof(uint32_t), "d_t1_keys_merged"); + s_malloc(stats, d_t1_merged_vals, cap * sizeof(uint32_t), "d_t1_merged_vals"); + + if (!t1_match_sliced) { + launch_merge_pairs_stable_2way_u32_u32( + d_keys_out + 0, d_vals_out + 0, t1_tile_n0, + d_keys_out + t1_tile_n0, d_vals_out + t1_tile_n0, t1_tile_n1, + d_t1_keys_merged, d_t1_merged_vals, t1_count, q); + s_free(stats, d_keys_out); + s_free(stats, d_vals_out); + } else { + // Merge inputs are USM-host; the kernel reads via PCIe (sequential + // 2-way merge → bandwidth-bound, ~3.27 GB at k=28 / ~25 GB/s ≈ + // 130 ms). 
Live device set during merge is just the two cap-sized + // output buffers (d_t1_keys_merged + d_t1_merged_vals = 2080 MB). + launch_merge_pairs_stable_2way_u32_u32( + h_keys + 0, h_vals + 0, t1_tile_n0, + h_keys + t1_tile_n0, h_vals + t1_tile_n0, t1_tile_n1, + d_t1_keys_merged, d_t1_merged_vals, t1_count, q); + sycl::free(h_keys, q); h_keys = nullptr; + sycl::free(h_vals, q); h_vals = nullptr; + } + + // Stage 4c (compact only): d_t1_keys_merged is not used by the + // gather below (gather uses d_t1_merged_vals for indices); it is + // only consumed by T2 match as the "d_sorted_mi" input. Park it on + // pinned host across the gather peak so the 1040 MB doesn't coexist + // with d_t1_merged_vals + d_t1_meta + d_t1_meta_sorted. H2D'd back + // at T2 match entry. + // + // Plain mode keeps d_t1_keys_merged live across the gather peak. + bool const h_keys_owned = (!scratch.plain_mode && scratch.h_keys_merged == nullptr); + uint32_t* h_t1_keys_merged = nullptr; + if (!scratch.plain_mode) { + h_t1_keys_merged = h_keys_owned + ? static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)) + : scratch.h_keys_merged; + if (!h_t1_keys_merged) throw std::runtime_error("sycl::malloc_host(h_t1_keys_merged) failed"); + q.memcpy(h_t1_keys_merged, d_t1_keys_merged, t1_count * sizeof(uint32_t)).wait(); + s_free(stats, d_t1_keys_merged); + d_t1_keys_merged = nullptr; + } + + // Stage 4b (compact only): JIT H2D d_t1_meta back onto the device + // for the gather, then free it immediately. Peak during this window: + // d_t1_keys_merged (1040) + d_t1_merged_vals (1040) + // + d_t1_meta (2080 H2D) + d_t1_meta_sorted (2080 populated) + // = 6240 MB — same as T2 sort's gather peak, and no longer the + // overall bottleneck on its own. + // + // Plain mode: d_t1_meta is already live (never parked). + int const t1_gather_N = scratch.plain_mode ? 1 : scratch.gather_tile_count; + if (!scratch.plain_mode) { + s_malloc(stats, d_t1_meta, cap * sizeof(uint64_t), "d_t1_meta"); + q.memcpy(d_t1_meta, h_t1_meta, t1_count * sizeof(uint64_t)).wait(); + // With gather_tile_count > 1 we reuse h_t1_meta to stage the + // sorted output (overwriting the unsorted data we just + // rehydrated from); defer the free until after the H2D rebuild. + if (t1_gather_N <= 1) { + if (h_meta_owned) sycl::free(h_t1_meta, q); + h_t1_meta = nullptr; + } + } + + uint64_t* d_t1_meta_sorted = nullptr; + if (t1_gather_N <= 1) { + s_malloc(stats, d_t1_meta_sorted, cap * sizeof(uint64_t), "d_t1_meta_sorted"); + launch_gather_u64(d_t1_meta, d_t1_merged_vals, d_t1_meta_sorted, t1_count, q); + end_phase(p_t1_sort); + s_free(stats, d_t1_meta); + s_free(stats, d_t1_merged_vals); + } else { + // Tiled-output gather (minimal tier). Produce the sorted output + // in N tiles, D2H each tile to h_t1_meta (overwriting the + // unsorted data we just rehydrated from), then free the inputs + // and rebuild the full d_t1_meta_sorted on device. Peak during + // gather drops from + // d_t1_meta (2080) + d_t1_merged_vals (1040) + // + d_t1_meta_sorted (2080) = 5200 MB + // to + // d_t1_meta (2080) + d_t1_merged_vals (1040) + // + d_tile (cap/N × u64 = 520 at N=4) = ~3640 MB. 
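+      // tile_max is a ceil-divide so the t1_gather_N tiles always cover
+      // t1_count; the last tile may be short (t1_count - tile_off) and
+      // the loop breaks early once tile_off runs past t1_count (can
+      // happen for tiny test plots where t1_count < t1_gather_N).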
+ uint64_t const tile_max = + (t1_count + uint64_t(t1_gather_N) - 1) / uint64_t(t1_gather_N); + uint64_t* d_tile = nullptr; + s_malloc(stats, d_tile, tile_max * sizeof(uint64_t), "d_t1_meta_sorted_tile"); + for (int n = 0; n < t1_gather_N; ++n) { + uint64_t const tile_off = uint64_t(n) * tile_max; + if (tile_off >= t1_count) break; + uint64_t const tile_n = std::min(tile_max, t1_count - tile_off); + launch_gather_u64( + d_t1_meta, d_t1_merged_vals + tile_off, + d_tile, tile_n, q); + q.memcpy(h_t1_meta + tile_off, d_tile, + tile_n * sizeof(uint64_t)).wait(); + } + s_free(stats, d_tile); + s_free(stats, d_t1_meta); + s_free(stats, d_t1_merged_vals); + s_malloc(stats, d_t1_meta_sorted, cap * sizeof(uint64_t), "d_t1_meta_sorted"); + q.memcpy(d_t1_meta_sorted, h_t1_meta, t1_count * sizeof(uint64_t)).wait(); + end_phase(p_t1_sort); + if (h_meta_owned) sycl::free(h_t1_meta, q); + h_t1_meta = nullptr; + } + + // Stage 4c (compact only): H2D d_t1_keys_merged back now that T2 + // match (its consumer) is about to start. Pinned host freed after + // H2D. Plain mode: d_t1_keys_merged is already live. + if (!scratch.plain_mode) { + s_malloc(stats, d_t1_keys_merged, cap * sizeof(uint32_t), "d_t1_keys_merged"); + q.memcpy(d_t1_keys_merged, h_t1_keys_merged, t1_count * sizeof(uint32_t)).wait(); + if (h_keys_owned) sycl::free(h_t1_keys_merged, q); + h_t1_keys_merged = nullptr; + } + + // ---------- Phase T2 match ---------- + // Plain mode: single-pass full-cap N=1 match. Device live set + // during match is T1 sorted (3.07 GB at k=28) + full-cap T2 output + // (4.16 GB) ≈ 7.23 GB. No PCIe round-trips. + // + // Compact mode (tiled N=2, D2H per pass): two bucket-range passes + // through half-cap device staging + pinned host accumulators. Match + // live set drops to T1 sorted + half-cap staging ≈ 5.15 GB, at the + // cost of ~70 ms of PCIe per pass. This is stage 3 of C (see + // docs/t2-match-tiling-plan.md). Pool path uses the single-shot + // launch_t2_match — it has the VRAM and doesn't pay the staging + // round-trip cost. + // + // Per-pass compact safety: we expect each half to produce ≤ cap/2 + // pairs because the match output is roughly uniform across bucket + // ids. cap itself has a built-in safety margin (see + // extra_margin_bits in PoolSizing), and typical actual utilisation + // is well under 100 %. If a pass ever exceeds staging capacity we + // throw rather than silently dropping pairs. + stats.phase = "T2 match"; + auto t2p = make_t2_params(cfg.k, cfg.strength); + + // Shared outputs. In plain mode d_t2_meta / d_t2_xbits / d_t2_mi + // all become live full-cap buffers here; the T2 sort / gather + // sections below skip the JIT H2D re-hydrations. In compact mode + // only d_t2_mi is live here (hydrated from the per-plot h_t2_mi), + // and h_t2_meta / h_t2_xbits hold the concatenated outputs on + // pinned host until JIT H2D at the gather site. + uint64_t* d_t2_meta = nullptr; + uint32_t* d_t2_mi = nullptr; + uint32_t* d_t2_xbits = nullptr; + uint64_t t2_count = 0; + uint64_t* h_t2_meta = nullptr; + uint32_t* h_t2_xbits = nullptr; + bool h_xbits_owned = false; + + if (scratch.plain_mode) { + // Plain: one-shot launch_t2_match into full-cap device buffers. 
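+    // launch_t2_match follows the usual two-call size-query protocol:
+    // the first call below passes null in/out pointers and only fills
+    // t2_temp_bytes; the second call, after the full-cap buffers are
+    // allocated, performs the actual match.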
+ size_t t2_temp_bytes = 0; + launch_t2_match(cfg.plot_id.data(), t2p, nullptr, nullptr, t1_count, + nullptr, nullptr, nullptr, d_counter, cap, + nullptr, &t2_temp_bytes, q); + + void* d_t2_match_temp = nullptr; + s_malloc(stats, d_t2_meta, cap * sizeof(uint64_t), "d_t2_meta"); + s_malloc(stats, d_t2_mi, cap * sizeof(uint32_t), "d_t2_mi"); + s_malloc(stats, d_t2_xbits, cap * sizeof(uint32_t), "d_t2_xbits"); + s_malloc(stats, d_t2_match_temp, t2_temp_bytes, "d_t2_match_temp"); + + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + int p_t2 = begin_phase("T2 match"); + launch_t2_match(cfg.plot_id.data(), t2p, + d_t1_meta_sorted, d_t1_keys_merged, t1_count, + d_t2_meta, d_t2_mi, d_t2_xbits, + d_counter, cap, + d_t2_match_temp, &t2_temp_bytes, q); + end_phase(p_t2); + + q.memcpy(&t2_count, d_counter, sizeof(uint64_t)).wait(); + if (t2_count > cap) throw std::runtime_error("T2 overflow"); + + s_free(stats, d_t2_match_temp); + s_free(stats, d_t1_meta_sorted); + s_free(stats, d_t1_keys_merged); + } else { + // Compact: N-tile cap/N staging with pinned-host accumulators. + // N = scratch.t2_tile_count: 2 = compact (~2.3 GB staging at + // k=28); 8 = minimal (~570 MB) for 4 GiB cards. Must be a power + // of 2 ≤ t2_num_buckets so even bucket distribution is exact. + uint32_t const t2_num_buckets = + (1u << t2p.num_section_bits) * (1u << t2p.num_match_key_bits); + int const N = scratch.t2_tile_count; + if (N < 2 || (N & (N - 1)) != 0) { + throw std::runtime_error( + "scratch.t2_tile_count must be a power of 2 ≥ 2 (got " + + std::to_string(N) + ")"); + } + if (static_cast(N) > t2_num_buckets) { + throw std::runtime_error( + "scratch.t2_tile_count " + std::to_string(N) + + " exceeds t2_num_buckets " + std::to_string(t2_num_buckets)); + } + uint64_t const t2_tile_cap = (cap + uint64_t(N) - 1) / uint64_t(N); + + size_t t2_temp_bytes = 0; + launch_t2_match_prepare(cfg.plot_id.data(), t2p, nullptr, t1_count, + d_counter, nullptr, &t2_temp_bytes, q); + + // Tile-cap device staging (reused across all N passes). + uint64_t* d_t2_meta_stage = nullptr; + uint32_t* d_t2_mi_stage = nullptr; + uint32_t* d_t2_xbits_stage = nullptr; + void* d_t2_match_temp = nullptr; + s_malloc(stats, d_t2_meta_stage, t2_tile_cap * sizeof(uint64_t), "d_t2_meta_stage"); + s_malloc(stats, d_t2_mi_stage, t2_tile_cap * sizeof(uint32_t), "d_t2_mi_stage"); + s_malloc(stats, d_t2_xbits_stage, t2_tile_cap * sizeof(uint32_t), "d_t2_xbits_stage"); + s_malloc(stats, d_t2_match_temp, t2_temp_bytes, "d_t2_match_temp"); + + // Full-cap pinned host that will hold the concatenated T2 output. + // Stage 4f: reuse the caller-provided scratch for h_meta / h_xbits + // (amortised across batch). h_t2_mi is still allocated per-plot. + auto alloc_pinned_or_throw = [&](size_t bytes, char const* what) { + void* p = sycl::malloc_host(bytes, q); + if (!p) throw std::runtime_error(std::string("sycl::malloc_host(") + + what + ") failed"); + return p; + }; + h_t2_meta = h_meta_owned + ? static_cast(alloc_pinned_or_throw(cap * sizeof(uint64_t), "h_t2_meta")) + : scratch.h_meta; + uint32_t* h_t2_mi = static_cast( + alloc_pinned_or_throw(cap * sizeof(uint32_t), "h_t2_mi")); + h_xbits_owned = (scratch.h_t2_xbits == nullptr); + h_t2_xbits = h_xbits_owned + ? static_cast(alloc_pinned_or_throw(cap * sizeof(uint32_t), "h_t2_xbits")) + : scratch.h_t2_xbits; + + // Compute bucket + fine-bucket offsets once; both passes share + // them. Also zeroes d_counter. 
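+    // Prepare once, match N times: every launch_t2_match_range pass
+    // below reuses the same offsets (handed around via d_t2_match_temp),
+    // so only d_counter needs resetting between passes.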
+ launch_t2_match_prepare(cfg.plot_id.data(), t2p, + d_t1_keys_merged, t1_count, + d_counter, d_t2_match_temp, &t2_temp_bytes, q); + + auto run_pass_and_stage = [&](uint32_t bucket_begin, uint32_t bucket_end, + uint64_t host_offset) -> uint64_t + { + launch_t2_match_range(cfg.plot_id.data(), t2p, + d_t1_meta_sorted, d_t1_keys_merged, t1_count, + d_t2_meta_stage, d_t2_mi_stage, d_t2_xbits_stage, + d_counter, t2_tile_cap, d_t2_match_temp, + bucket_begin, bucket_end, q); + uint64_t pass_count = 0; + q.memcpy(&pass_count, d_counter, sizeof(uint64_t)).wait(); + if (pass_count > t2_tile_cap) { + throw std::runtime_error( + "T2 match pass overflow: bucket range [" + + std::to_string(bucket_begin) + "," + std::to_string(bucket_end) + + ") produced " + std::to_string(pass_count) + + " pairs, staging holds " + std::to_string(t2_tile_cap) + + " (consider lower N or fall back to compact tier)."); + } + q.memcpy(h_t2_meta + host_offset, d_t2_meta_stage, pass_count * sizeof(uint64_t)); + q.memcpy(h_t2_mi + host_offset, d_t2_mi_stage, pass_count * sizeof(uint32_t)); + q.memcpy(h_t2_xbits + host_offset, d_t2_xbits_stage, pass_count * sizeof(uint32_t)); + q.wait(); + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + return pass_count; + }; + + int p_t2 = begin_phase("T2 match"); + // N evenly-spaced bucket ranges. host_offset accumulates so each + // pass appends to the pinned host buffer behind the prior pass. + t2_count = 0; + for (int pass = 0; pass < N; ++pass) { + uint32_t const bucket_begin = + uint32_t(uint64_t(pass) * t2_num_buckets / uint64_t(N)); + uint32_t const bucket_end = + uint32_t(uint64_t(pass + 1) * t2_num_buckets / uint64_t(N)); + t2_count += run_pass_and_stage(bucket_begin, bucket_end, + /*host_offset=*/t2_count); + } + end_phase(p_t2); + + if (t2_count > cap) throw std::runtime_error("T2 overflow"); + + // Free device staging + T1 sorted + match temp before + // re-allocating the full-cap d_t2_mi that T2 sort expects. + s_free(stats, d_t2_match_temp); + s_free(stats, d_t2_meta_stage); + s_free(stats, d_t2_mi_stage); + s_free(stats, d_t2_xbits_stage); + s_free(stats, d_t1_meta_sorted); + s_free(stats, d_t1_keys_merged); + + // Stage 4a: hydrate full-cap d_t2_mi from h_t2_mi. d_t2_meta + // and d_t2_xbits are NOT hydrated yet — they stay on pinned + // host until their gather calls at the end of T2 sort. + s_malloc(stats, d_t2_mi, cap * sizeof(uint32_t), "d_t2_mi"); + q.memcpy(d_t2_mi, h_t2_mi, t2_count * sizeof(uint32_t)); + q.wait(); + sycl::free(h_t2_mi, q); + } + + // ---------- Phase T2 sort (tiled, N=2) ---------- + // Mirror of T1 sort above — same tile-and-merge shape, but permute + // writes a meta-xbits pair (T2 match output is 16 B, split SoA for + // T3's L1-bound read pattern) instead of plain meta. + // N=4 tiling halves the CUB scratch peak (~1044 MB → ~522 MB at + // k=28), bringing the T2 CUB-alloc peak under 8 GB. Merge is done + // as a tree of three 2-way merges: (0+1)→AB, (2+3)→CD, (AB+CD)→final. + constexpr int kNumT2Tiles = 4; + uint64_t t2_tile_n [kNumT2Tiles]; + uint64_t t2_tile_off[kNumT2Tiles + 1]; + uint64_t const t2_base_tile = t2_count / kNumT2Tiles; + uint64_t t2_rem = t2_count % kNumT2Tiles; + t2_tile_off[0] = 0; + for (int t = 0; t < kNumT2Tiles; ++t) { + t2_tile_n[t] = t2_base_tile + (t2_rem > 0 ? 
1 : 0); + if (t2_rem > 0) --t2_rem; + t2_tile_off[t+1] = t2_tile_off[t] + t2_tile_n[t]; + } + uint64_t t2_tile_max = 0; + for (int t = 0; t < kNumT2Tiles; ++t) + if (t2_tile_n[t] > t2_tile_max) t2_tile_max = t2_tile_n[t]; + + size_t t2_sort_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, t2_sort_bytes, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + t2_tile_max, 0, cfg.k, q); + + stats.phase = "T2 sort"; + // CUB sort key input = d_t2_mi (emitted SoA by T2 match); no extract + // needed, so d_keys_in only needs to hold the merged sorted-MI output + // that downstream T3 match will consume. Allocate it AFTER the CUB + // tile-sort has freed d_t2_mi to keep peak narrow. + // + // Compact / plain: full-cap d_keys_out + d_vals_in + d_vals_out + // (~4168 MB peak with d_t2_mi during tile sort). + // + // Sliced (minimal): per-tile cap/N output buffers + USM-host + // accumulators, then USM-host parking of AB / CD between merge + // tree steps so the final merge sees only its own outputs + + // USM-host inputs (live device ~2080 MB at k=28). Peaks under + // 4 GiB at every step. + + uint64_t const ab_count = t2_tile_n[0] + t2_tile_n[1]; + uint64_t const cd_count = t2_tile_n[2] + t2_tile_n[3]; + + int p_t2_sort = begin_phase("T2 sort"); + + if (!t1_match_sliced) { + // Compact / plain — existing full-cap CUB tile sort. + s_malloc(stats, d_keys_out, cap * sizeof(uint32_t), "d_keys_out"); + s_malloc(stats, d_vals_in, cap * sizeof(uint32_t), "d_vals_in"); + s_malloc(stats, d_vals_out, cap * sizeof(uint32_t), "d_vals_out"); + s_malloc(stats, d_sort_scratch, t2_sort_bytes, "d_sort_scratch(t2)"); + + launch_init_u32_identity(d_vals_in, t2_count, q); + for (int t = 0; t < kNumT2Tiles; ++t) { + if (t2_tile_n[t] == 0) continue; + uint64_t off = t2_tile_off[t]; + launch_sort_pairs_u32_u32( + d_sort_scratch, t2_sort_bytes, + d_t2_mi + off, d_keys_out + off, + d_vals_in + off, d_vals_out + off, + t2_tile_n[t], 0, cfg.k, q); + } + + s_free(stats, d_sort_scratch); + s_free(stats, d_vals_in); + s_free(stats, d_t2_mi); + } else { + // Sliced — per-tile cap/N output, D2H to USM-host h_keys/h_vals. 
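+      // The per-tile identity fill seeds d_vals_in_tile with the tile's
+      // global offset (off32 + i), so the values staged to h_vals are
+      // already global source indices and the merge tree below yields a
+      // permutation over the full (untiled) T2 match output.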
+ uint32_t* d_keys_out_tile = nullptr; + uint32_t* d_vals_in_tile = nullptr; + uint32_t* d_vals_out_tile = nullptr; + s_malloc(stats, d_keys_out_tile, t2_tile_max * sizeof(uint32_t), "d_t2_keys_out_tile"); + s_malloc(stats, d_vals_in_tile, t2_tile_max * sizeof(uint32_t), "d_t2_vals_in_tile"); + s_malloc(stats, d_vals_out_tile, t2_tile_max * sizeof(uint32_t), "d_t2_vals_out_tile"); + s_malloc(stats, d_sort_scratch, t2_sort_bytes, "d_sort_scratch(t2)"); + + h_keys = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_keys) throw std::runtime_error("sycl::malloc_host(h_keys t2) failed"); + h_vals = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_vals) throw std::runtime_error("sycl::malloc_host(h_vals t2) failed"); + + for (int t = 0; t < kNumT2Tiles; ++t) { + uint64_t const tile_n = t2_tile_n[t]; + if (tile_n == 0) continue; + uint64_t const tile_off = t2_tile_off[t]; + uint32_t const off32 = static_cast(tile_off); + uint32_t* d_vals_in_tile_local = d_vals_in_tile; + q.parallel_for( + sycl::range<1>{ static_cast(tile_n) }, + [=](sycl::id<1> i) { + d_vals_in_tile_local[i] = off32 + uint32_t(i); + }).wait(); + launch_sort_pairs_u32_u32( + d_sort_scratch, t2_sort_bytes, + d_t2_mi + tile_off, d_keys_out_tile, + d_vals_in_tile, d_vals_out_tile, + tile_n, 0, cfg.k, q); + q.memcpy(h_keys + tile_off, d_keys_out_tile, + tile_n * sizeof(uint32_t)).wait(); + q.memcpy(h_vals + tile_off, d_vals_out_tile, + tile_n * sizeof(uint32_t)).wait(); + } + + s_free(stats, d_sort_scratch); + s_free(stats, d_vals_out_tile); + s_free(stats, d_vals_in_tile); + s_free(stats, d_keys_out_tile); + s_free(stats, d_t2_mi); + } + + // Tree-of-2-way-merges: (tile 0 + tile 1) → AB, (tile 2 + tile 3) → CD, + // then (AB + CD) → final merged stream. + // + // Compact: AB + CD live across the final merge → peak ~4160 MB. + // Sliced: AB and CD parked to USM-host between tree steps so the + // final merge sees only itself + USM-host inputs (~2080 MB peak). + uint32_t* d_AB_keys = nullptr; + uint32_t* d_AB_vals = nullptr; + uint32_t* d_CD_keys = nullptr; + uint32_t* d_CD_vals = nullptr; + uint32_t* h_AB_keys = nullptr; + uint32_t* h_AB_vals = nullptr; + uint32_t* h_CD_keys = nullptr; + uint32_t* h_CD_vals = nullptr; + + if (!t1_match_sliced) { + s_malloc(stats, d_AB_keys, ab_count * sizeof(uint32_t), "d_t2_AB_keys"); + s_malloc(stats, d_AB_vals, ab_count * sizeof(uint32_t), "d_t2_AB_vals"); + s_malloc(stats, d_CD_keys, cd_count * sizeof(uint32_t), "d_t2_CD_keys"); + s_malloc(stats, d_CD_vals, cd_count * sizeof(uint32_t), "d_t2_CD_vals"); + + if (ab_count > 0) { + launch_merge_pairs_stable_2way_u32_u32( + d_keys_out + t2_tile_off[0], d_vals_out + t2_tile_off[0], t2_tile_n[0], + d_keys_out + t2_tile_off[1], d_vals_out + t2_tile_off[1], t2_tile_n[1], + d_AB_keys, d_AB_vals, ab_count, q); + } + if (cd_count > 0) { + launch_merge_pairs_stable_2way_u32_u32( + d_keys_out + t2_tile_off[2], d_vals_out + t2_tile_off[2], t2_tile_n[2], + d_keys_out + t2_tile_off[3], d_vals_out + t2_tile_off[3], t2_tile_n[3], + d_CD_keys, d_CD_vals, cd_count, q); + } + + s_free(stats, d_keys_out); + s_free(stats, d_vals_out); + } else { + // AB merge: read USM-host slices, write device d_AB. Then D2H + // to USM-host and free device. 
+ s_malloc(stats, d_AB_keys, ab_count * sizeof(uint32_t), "d_t2_AB_keys"); + s_malloc(stats, d_AB_vals, ab_count * sizeof(uint32_t), "d_t2_AB_vals"); + if (ab_count > 0) { + launch_merge_pairs_stable_2way_u32_u32( + h_keys + t2_tile_off[0], h_vals + t2_tile_off[0], t2_tile_n[0], + h_keys + t2_tile_off[1], h_vals + t2_tile_off[1], t2_tile_n[1], + d_AB_keys, d_AB_vals, ab_count, q); + } + h_AB_keys = static_cast(sycl::malloc_host(ab_count * sizeof(uint32_t), q)); + h_AB_vals = static_cast(sycl::malloc_host(ab_count * sizeof(uint32_t), q)); + if (!h_AB_keys || !h_AB_vals) throw std::runtime_error("sycl::malloc_host(h_AB) failed"); + if (ab_count > 0) { + q.memcpy(h_AB_keys, d_AB_keys, ab_count * sizeof(uint32_t)); + q.memcpy(h_AB_vals, d_AB_vals, ab_count * sizeof(uint32_t)).wait(); + } + s_free(stats, d_AB_vals); + s_free(stats, d_AB_keys); + + // CD merge: same shape. + s_malloc(stats, d_CD_keys, cd_count * sizeof(uint32_t), "d_t2_CD_keys"); + s_malloc(stats, d_CD_vals, cd_count * sizeof(uint32_t), "d_t2_CD_vals"); + if (cd_count > 0) { + launch_merge_pairs_stable_2way_u32_u32( + h_keys + t2_tile_off[2], h_vals + t2_tile_off[2], t2_tile_n[2], + h_keys + t2_tile_off[3], h_vals + t2_tile_off[3], t2_tile_n[3], + d_CD_keys, d_CD_vals, cd_count, q); + } + h_CD_keys = static_cast(sycl::malloc_host(cd_count * sizeof(uint32_t), q)); + h_CD_vals = static_cast(sycl::malloc_host(cd_count * sizeof(uint32_t), q)); + if (!h_CD_keys || !h_CD_vals) throw std::runtime_error("sycl::malloc_host(h_CD) failed"); + if (cd_count > 0) { + q.memcpy(h_CD_keys, d_CD_keys, cd_count * sizeof(uint32_t)); + q.memcpy(h_CD_vals, d_CD_vals, cd_count * sizeof(uint32_t)).wait(); + } + s_free(stats, d_CD_vals); + s_free(stats, d_CD_keys); + + // h_keys + h_vals consumed by AB/CD merges — free. + sycl::free(h_keys, q); h_keys = nullptr; + sycl::free(h_vals, q); h_vals = nullptr; + } + + uint32_t* d_t2_keys_merged = nullptr; // merged sorted MI for T3. + uint32_t* d_merged_vals = nullptr; // merged sorted src indices. + s_malloc(stats, d_t2_keys_merged, cap * sizeof(uint32_t), "d_t2_keys_merged"); + s_malloc(stats, d_merged_vals, cap * sizeof(uint32_t), "d_merged_vals"); + + if (!t1_match_sliced) { + launch_merge_pairs_stable_2way_u32_u32( + d_AB_keys, d_AB_vals, ab_count, + d_CD_keys, d_CD_vals, cd_count, + d_t2_keys_merged, d_merged_vals, t2_count, q); + s_free(stats, d_AB_keys); + s_free(stats, d_AB_vals); + s_free(stats, d_CD_keys); + s_free(stats, d_CD_vals); + } else { + // Final merge from USM-host inputs into device outputs. + launch_merge_pairs_stable_2way_u32_u32( + h_AB_keys, h_AB_vals, ab_count, + h_CD_keys, h_CD_vals, cd_count, + d_t2_keys_merged, d_merged_vals, t2_count, q); + sycl::free(h_AB_keys, q); h_AB_keys = nullptr; + sycl::free(h_AB_vals, q); h_AB_vals = nullptr; + sycl::free(h_CD_keys, q); h_CD_keys = nullptr; + sycl::free(h_CD_vals, q); h_CD_vals = nullptr; + } + + // Stage 4c (compact only): d_t2_keys_merged is not consumed by the + // gather calls below (they use d_merged_vals for indices) — it's + // only needed later by T3 match as the sorted-MI input. Park it on + // pinned host across the gather peak so the 1040 MB doesn't coexist + // with d_merged_vals + d_t2_meta + d_t2_meta_sorted. H2D'd back + // before T3 match. + // + // Plain mode keeps d_t2_keys_merged live across the gather peak. + uint32_t* h_t2_keys_merged = nullptr; + if (!scratch.plain_mode) { + h_t2_keys_merged = h_keys_owned // reuse t1_keys flag: same scratch + ? 
static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)) + : scratch.h_keys_merged; + if (!h_t2_keys_merged) throw std::runtime_error("sycl::malloc_host(h_t2_keys_merged) failed"); + q.memcpy(h_t2_keys_merged, d_t2_keys_merged, t2_count * sizeof(uint32_t)).wait(); + s_free(stats, d_t2_keys_merged); + d_t2_keys_merged = nullptr; + } + + // Stage 4a (compact only): JIT H2D the gather source buffers. + // d_t2_meta is alive only for the duration of its gather (2080 MB + // at k=28), then freed before d_t2_xbits is H2D'd. With stage 4c + // the gather peak drops to d_merged_vals (1040) + d_t2_meta (2080) + // + d_t2_meta_sorted (2080) = 5200 MB (no more d_t2_keys_merged). + // + // Plain mode: d_t2_meta and d_t2_xbits are already live from T2 + // match (never parked). Gather reads them directly and frees after. + int const t2_gather_N = scratch.plain_mode ? 1 : scratch.gather_tile_count; + uint64_t* d_t2_meta_sorted = nullptr; + uint32_t* d_t2_xbits_sorted = nullptr; + + if (t2_gather_N <= 1) { + // Single-shot path (compact / plain). + if (!scratch.plain_mode) { + s_malloc(stats, d_t2_meta, cap * sizeof(uint64_t), "d_t2_meta"); + q.memcpy(d_t2_meta, h_t2_meta, t2_count * sizeof(uint64_t)); + q.wait(); + if (h_meta_owned) sycl::free(h_t2_meta, q); + h_t2_meta = nullptr; + } + + s_malloc(stats, d_t2_meta_sorted, cap * sizeof(uint64_t), "d_t2_meta_sorted"); + launch_gather_u64(d_t2_meta, d_merged_vals, d_t2_meta_sorted, t2_count, q); + q.wait(); + s_free(stats, d_t2_meta); + + if (!scratch.plain_mode) { + s_malloc(stats, d_t2_xbits, cap * sizeof(uint32_t), "d_t2_xbits"); + q.memcpy(d_t2_xbits, h_t2_xbits, t2_count * sizeof(uint32_t)); + q.wait(); + if (h_xbits_owned) sycl::free(h_t2_xbits, q); + h_t2_xbits = nullptr; + } + + s_malloc(stats, d_t2_xbits_sorted, cap * sizeof(uint32_t), "d_t2_xbits_sorted"); + launch_gather_u32(d_t2_xbits, d_merged_vals, d_t2_xbits_sorted, t2_count, q); + end_phase(p_t2_sort); + s_free(stats, d_t2_xbits); + s_free(stats, d_merged_vals); + } else { + // Tiled-output gather (minimal tier). Both gathers stage their + // sorted outputs to host pinned (reusing h_t2_meta and + // h_t2_xbits — same buffers that just held the parked unsorted + // data) one tile at a time. Crucially, d_t2_meta_sorted is NOT + // re-allocated on device until BOTH gathers and d_merged_vals + // are done — otherwise the xbits gather peak (d_t2_meta_sorted + // 2080 + d_merged_vals 1040 + d_t2_xbits 1040 + tile 260) would + // still hit ~4420 MB. Deferring the rehydrate keeps the xbits + // gather peak at d_merged_vals (1040) + d_t2_xbits (1040) + + // tile (260 at N=4) = ~2340 MB. Final rehydrate peak: + // d_t2_meta_sorted (2080) + d_t2_xbits_sorted (1040) = 3120 MB. 
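+    // Overwriting h_t2_meta / h_t2_xbits in place is safe: each parked
+    // buffer is H2D'd into its device copy and that copy is waited on
+    // before the first sorted tile is written back over the host buffer.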
+ uint64_t const tile_max = + (t2_count + uint64_t(t2_gather_N) - 1) / uint64_t(t2_gather_N); + + // --- Meta gather (tiled output → h_t2_meta) --- + s_malloc(stats, d_t2_meta, cap * sizeof(uint64_t), "d_t2_meta"); + q.memcpy(d_t2_meta, h_t2_meta, t2_count * sizeof(uint64_t)).wait(); + { + uint64_t* d_meta_tile = nullptr; + s_malloc(stats, d_meta_tile, tile_max * sizeof(uint64_t), "d_t2_meta_sorted_tile"); + for (int n = 0; n < t2_gather_N; ++n) { + uint64_t const tile_off = uint64_t(n) * tile_max; + if (tile_off >= t2_count) break; + uint64_t const tile_n = std::min(tile_max, t2_count - tile_off); + launch_gather_u64( + d_t2_meta, d_merged_vals + tile_off, + d_meta_tile, tile_n, q); + q.memcpy(h_t2_meta + tile_off, d_meta_tile, + tile_n * sizeof(uint64_t)).wait(); + } + s_free(stats, d_meta_tile); + } + s_free(stats, d_t2_meta); + + // --- Xbits gather (tiled output → h_t2_xbits) --- + s_malloc(stats, d_t2_xbits, cap * sizeof(uint32_t), "d_t2_xbits"); + q.memcpy(d_t2_xbits, h_t2_xbits, t2_count * sizeof(uint32_t)).wait(); + { + uint32_t* d_xbits_tile = nullptr; + s_malloc(stats, d_xbits_tile, tile_max * sizeof(uint32_t), "d_t2_xbits_sorted_tile"); + for (int n = 0; n < t2_gather_N; ++n) { + uint64_t const tile_off = uint64_t(n) * tile_max; + if (tile_off >= t2_count) break; + uint64_t const tile_n = std::min(tile_max, t2_count - tile_off); + launch_gather_u32( + d_t2_xbits, d_merged_vals + tile_off, + d_xbits_tile, tile_n, q); + q.memcpy(h_t2_xbits + tile_off, d_xbits_tile, + tile_n * sizeof(uint32_t)).wait(); + } + s_free(stats, d_xbits_tile); + } + s_free(stats, d_t2_xbits); + + // d_merged_vals dead now that both gathers have produced their + // sorted outputs on host. + s_free(stats, d_merged_vals); + + // Rehydrate d_t2_xbits_sorted to device (1040 MB at k=28). The + // T3 match kernel reads d_sorted_xbits[l] / d_sorted_xbits[r] + // by index and the random-access pattern would be too slow via + // PCIe with USM-host. + s_malloc(stats, d_t2_xbits_sorted, cap * sizeof(uint32_t), "d_t2_xbits_sorted"); + q.memcpy(d_t2_xbits_sorted, h_t2_xbits, t2_count * sizeof(uint32_t)).wait(); + if (h_xbits_owned) sycl::free(h_t2_xbits, q); + h_t2_xbits = nullptr; + + // Site 4: do NOT rehydrate d_t2_meta_sorted to device. h_t2_meta + // (now containing the sorted meta) stays alive across T3 match; + // the sliced T3 match path H2Ds a section_l + section_r pair of + // slices per pass, dropping T3 match peak from + // d_t2_meta_sorted (2080) + d_t2_xbits_sorted (1040) + + // d_t2_keys_merged (1040) + d_t3_stage (1040) = 5200 MB + // to + // d_meta_l (cap/N_sections × u64 = 520) + d_meta_r (520) + + // d_t2_xbits_sorted (1040) + d_t2_keys_merged (1040) + + // d_t3_stage (cap/N_sections × u64 = 520) = ~3640 MB at k=28. + // h_t2_meta is freed inside the T3 match block once all + // section-pair passes complete. + + end_phase(p_t2_sort); + } + + // ---------- Phase T3 match ---------- + // Plain mode: one-shot launch_t3_match writing directly into + // full-cap d_t3. No pinned-host staging, no round-trips — saves + // the per-plot sycl::malloc_host(2 GB) (~500 ms on NVIDIA) plus + // the two D2H halves + H2D re-hydration. Match live set: + // d_t2_keys_merged (1040) + d_t2_meta_sorted (2080) + // + d_t2_xbits_sorted (1040) + d_t3 (2080) + temp + // = ~6240 MB — fits under plain's 7290 MB T2-match floor. + // + // Compact mode (stage 4d.3, N=2 tiled): half-cap d_t3 staging + + // D2H-to-pinned-host between passes, then full-cap d_t3 + H2D + // before T3 sort. Keeps T3 match peak at 5200 MB. 
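+  // Dispatch below mirrors the tier split: plain_mode → one-shot
+  // full-cap match; gather_tile_count > 1 (minimal) → per-section
+  // slices read from h_t2_meta; otherwise (compact) → N=2 bucket-range
+  // passes through half-cap staging.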
+ stats.phase = "T3 match"; + auto t3p = make_t3_params(cfg.k, cfg.strength); + size_t t3_temp_bytes = 0; + launch_t3_match_prepare(cfg.plot_id.data(), t3p, nullptr, t2_count, + d_counter, nullptr, &t3_temp_bytes, q); + + // Stage 4c (compact only): H2D d_t2_keys_merged back from pinned + // host now that we're about to enter T3 match (its consumer). + // Pinned host freed after H2D. Plain mode: d_t2_keys_merged is + // already live (never parked). + if (!scratch.plain_mode) { + s_malloc(stats, d_t2_keys_merged, cap * sizeof(uint32_t), "d_t2_keys_merged"); + q.memcpy(d_t2_keys_merged, h_t2_keys_merged, t2_count * sizeof(uint32_t)).wait(); + if (h_keys_owned) sycl::free(h_t2_keys_merged, q); + h_t2_keys_merged = nullptr; + } + + T3PairingGpu* d_t3 = nullptr; + uint64_t t3_count = 0; + + if (scratch.plain_mode) { + // Plain: one-shot full-cap T3 match. + void* d_t3_match_temp = nullptr; + s_malloc(stats, d_t3, cap * sizeof(T3PairingGpu), "d_t3"); + s_malloc(stats, d_t3_match_temp, t3_temp_bytes, "d_t3_match_temp"); + + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + int p_t3 = begin_phase("T3 match + Feistel"); + launch_t3_match(cfg.plot_id.data(), t3p, + d_t2_meta_sorted, d_t2_xbits_sorted, + d_t2_keys_merged, t2_count, + d_t3, d_counter, cap, + d_t3_match_temp, &t3_temp_bytes, q); + end_phase(p_t3); + + q.memcpy(&t3_count, d_counter, sizeof(uint64_t)).wait(); + if (t3_count > cap) throw std::runtime_error("T3 overflow"); + + s_free(stats, d_t3_match_temp); + s_free(stats, d_t2_meta_sorted); + s_free(stats, d_t2_xbits_sorted); + s_free(stats, d_t2_keys_merged); + } else if (scratch.gather_tile_count > 1) { + // Minimal (sliced T3 match — site 4). d_t2_meta_sorted is NOT + // on device in this path; the sorted meta is parked on + // h_t2_meta (from the T2 sort tiled gather). For each section_l + // we H2D the matching pair of sections (l + r) into small + // device slices, run the kernel against those slices, D2H the + // stage output to h_t3, then free the slices. Drops T3 match + // peak from ~5200 MB (compact) to ~3665 MB at k=28. + uint32_t const num_sections = 1u << t3p.num_section_bits; + uint32_t const num_match_keys = 1u << t3p.num_match_key_bits; + uint32_t const num_buckets_t3 = num_sections * num_match_keys; + // Per-pass output capacity sized at cap/N × 1.25 (25% safety + // margin over the expected uniform-distribution average). + uint64_t const t3_section_cap = + ((cap + num_sections - 1) / num_sections) * 5ULL / 4ULL; + + T3PairingGpu* d_t3_stage = nullptr; + void* d_t3_match_temp = nullptr; + s_malloc(stats, d_t3_stage, t3_section_cap * sizeof(T3PairingGpu), "d_t3_stage"); + s_malloc(stats, d_t3_match_temp, t3_temp_bytes, "d_t3_match_temp"); + + bool const h_t3_owned = (scratch.h_t3 == nullptr); + T3PairingGpu* h_t3 = h_t3_owned + ? static_cast(sycl::malloc_host(cap * sizeof(T3PairingGpu), q)) + : reinterpret_cast(scratch.h_t3); + if (!h_t3) throw std::runtime_error("sycl::malloc_host(h_t3) failed"); + + // Compute bucket + fine-bucket offsets in d_t3_match_temp; also + // zero d_counter. Same call shape as compact path. + launch_t3_match_prepare(cfg.plot_id.data(), t3p, + d_t2_keys_merged, t2_count, + d_counter, d_t3_match_temp, &t3_temp_bytes, q); + + // D2H the bucket-offsets table (small: 17 × u64 at k=28 + // strength=2) so we can compute each section's global row range + // host-side. 
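+      // h_t3_offsets has num_buckets_t3 + 1 entries; section s covers
+      // global sorted-T2 rows
+      //   [ h_t3_offsets[s * num_match_keys],
+      //     h_t3_offsets[(s + 1) * num_match_keys] )
+      // which is how the section_l / section_r row ranges are derived
+      // in the loop below.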
+ std::vector h_t3_offsets(num_buckets_t3 + 1); + q.memcpy(h_t3_offsets.data(), d_t3_match_temp, + (num_buckets_t3 + 1) * sizeof(uint64_t)).wait(); + + auto compute_section_r = [&](uint32_t section_l) -> uint32_t { + // Mirror the kernel's section_l → section_r permutation. + uint32_t const mask = num_sections - 1u; + uint32_t const rl = ((section_l << 1) | + (section_l >> (t3p.num_section_bits - 1))) & mask; + uint32_t const rl1 = (rl + 1u) & mask; + return ((rl1 >> 1) | + (rl1 << (t3p.num_section_bits - 1))) & mask; + }; + + int p_t3 = begin_phase("T3 match + Feistel"); + uint64_t host_offset = 0; + for (uint32_t section_l = 0; section_l < num_sections; ++section_l) { + uint32_t const section_r = compute_section_r(section_l); + uint64_t const section_l_row_start = h_t3_offsets[section_l * num_match_keys]; + uint64_t const section_l_row_end = h_t3_offsets[(section_l + 1) * num_match_keys]; + uint64_t const section_l_count = section_l_row_end - section_l_row_start; + uint64_t const section_r_row_start = h_t3_offsets[section_r * num_match_keys]; + uint64_t const section_r_row_end = h_t3_offsets[(section_r + 1) * num_match_keys]; + uint64_t const section_r_count = section_r_row_end - section_r_row_start; + + // Skip empty sections — happens for tiny test plots where + // a section has zero rows. The kernel would early-return + // anyway but the slice malloc rejects bytes==0 since f1d3c67. + if (section_l_count == 0) continue; + + uint64_t* d_meta_l_slice = nullptr; + uint64_t* d_meta_r_slice = nullptr; + s_malloc(stats, d_meta_l_slice, section_l_count * sizeof(uint64_t), "d_t3_meta_l_slice"); + if (section_r_count > 0) { + s_malloc(stats, d_meta_r_slice, section_r_count * sizeof(uint64_t), "d_t3_meta_r_slice"); + } + + q.memcpy(d_meta_l_slice, h_t2_meta + section_l_row_start, + section_l_count * sizeof(uint64_t)).wait(); + if (section_r_count > 0) { + q.memcpy(d_meta_r_slice, h_t2_meta + section_r_row_start, + section_r_count * sizeof(uint64_t)).wait(); + } + + uint32_t const bucket_begin = section_l * num_match_keys; + uint32_t const bucket_end = (section_l + 1) * num_match_keys; + launch_t3_match_section_pair_range( + cfg.plot_id.data(), t3p, + d_meta_l_slice, section_l_row_start, + d_meta_r_slice, section_r_row_start, + d_t2_xbits_sorted, d_t2_keys_merged, t2_count, + d_t3_stage, d_counter, t3_section_cap, + d_t3_match_temp, bucket_begin, bucket_end, q); + + uint64_t pass_count = 0; + q.memcpy(&pass_count, d_counter, sizeof(uint64_t)).wait(); + if (pass_count > t3_section_cap) { + throw std::runtime_error( + "T3 match (sliced) section_l=" + std::to_string(section_l) + + " produced " + std::to_string(pass_count) + + " pairs, staging holds " + std::to_string(t3_section_cap) + + ". Lower N or widen t3_section_cap safety factor."); + } + q.memcpy(h_t3 + host_offset, d_t3_stage, + pass_count * sizeof(T3PairingGpu)).wait(); + host_offset += pass_count; + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + + if (section_r_count > 0) s_free(stats, d_meta_r_slice); + s_free(stats, d_meta_l_slice); + } + end_phase(p_t3); + + t3_count = host_offset; + if (t3_count > cap) throw std::runtime_error("T3 overflow"); + + // d_t2_meta_sorted is null in this path (never allocated) — skip + // its s_free. Free everything else that was alive across T3 match. + s_free(stats, d_t3_match_temp); + s_free(stats, d_t3_stage); + s_free(stats, d_t2_xbits_sorted); + s_free(stats, d_t2_keys_merged); + + // h_t2_meta was kept alive across T3 match for slicing; free now + // that all section pairs have been H2D'd. 
+ if (h_meta_owned) sycl::free(h_t2_meta, q); + h_t2_meta = nullptr; + + // Re-hydrate full-cap d_t3 on device for T3 sort. + s_malloc(stats, d_t3, cap * sizeof(T3PairingGpu), "d_t3"); + q.memcpy(d_t3, h_t3, t3_count * sizeof(T3PairingGpu)).wait(); + if (h_t3_owned) sycl::free(h_t3, q); + } else { + // Compact: N=2 half-cap staging with pinned-host h_t3 accumulator. + uint64_t const t3_half_cap = (cap + 1) / 2; + + T3PairingGpu* d_t3_stage = nullptr; + void* d_t3_match_temp = nullptr; + s_malloc(stats, d_t3_stage, t3_half_cap * sizeof(T3PairingGpu), "d_t3_stage"); + s_malloc(stats, d_t3_match_temp, t3_temp_bytes, "d_t3_match_temp"); + + // Full-cap pinned host that will hold the concatenated T3 output. + // Stage 4f: reuse scratch.h_t3 when provided (amortised across + // batch). T3PairingGpu is just a uint64 proof_fragment, so the + // scratch buffer is declared as uint64_t* and reinterpret-cast. + bool const h_t3_owned = (scratch.h_t3 == nullptr); + T3PairingGpu* h_t3 = h_t3_owned + ? static_cast(sycl::malloc_host(cap * sizeof(T3PairingGpu), q)) + : reinterpret_cast(scratch.h_t3); + if (!h_t3) throw std::runtime_error("sycl::malloc_host(h_t3) failed"); + + // Compute bucket + fine-bucket offsets once; both match passes + // share them. Also zeroes d_counter. + launch_t3_match_prepare(cfg.plot_id.data(), t3p, + d_t2_keys_merged, t2_count, + d_counter, d_t3_match_temp, &t3_temp_bytes, q); + + uint32_t const t3_num_buckets = + (1u << t3p.num_section_bits) * (1u << t3p.num_match_key_bits); + uint32_t const t3_bucket_mid = t3_num_buckets / 2; + + auto run_t3_pass = [&](uint32_t bucket_begin, uint32_t bucket_end, + uint64_t host_offset) -> uint64_t + { + launch_t3_match_range(cfg.plot_id.data(), t3p, + d_t2_meta_sorted, d_t2_xbits_sorted, + d_t2_keys_merged, t2_count, + d_t3_stage, d_counter, t3_half_cap, + d_t3_match_temp, bucket_begin, bucket_end, q); + uint64_t pass_count = 0; + q.memcpy(&pass_count, d_counter, sizeof(uint64_t)).wait(); + if (pass_count > t3_half_cap) { + throw std::runtime_error( + "T3 match pass overflow: bucket range [" + + std::to_string(bucket_begin) + "," + std::to_string(bucket_end) + + ") produced " + std::to_string(pass_count) + + " pairs, staging holds " + std::to_string(t3_half_cap) + + ". Lower N or widen staging."); + } + q.memcpy(h_t3 + host_offset, d_t3_stage, + pass_count * sizeof(T3PairingGpu)).wait(); + // Reset counter so the next pass writes at stage index 0. + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + return pass_count; + }; + + int p_t3 = begin_phase("T3 match + Feistel"); + uint64_t const t3_count1 = run_t3_pass(0, t3_bucket_mid, /*host_offset=*/0); + uint64_t const t3_count2 = run_t3_pass(t3_bucket_mid, t3_num_buckets, /*host_offset=*/t3_count1); + end_phase(p_t3); + + t3_count = t3_count1 + t3_count2; + if (t3_count > cap) throw std::runtime_error("T3 overflow"); + + // Free everything that was alive across T3 match: staging, temp, + // sorted T2 inputs, keys_merged. + s_free(stats, d_t3_match_temp); + s_free(stats, d_t3_stage); + s_free(stats, d_t2_meta_sorted); + s_free(stats, d_t2_xbits_sorted); + s_free(stats, d_t2_keys_merged); + + // Re-hydrate full-cap d_t3 on device for T3 sort. 
+ s_malloc(stats, d_t3, cap * sizeof(T3PairingGpu), "d_t3"); + q.memcpy(d_t3, h_t3, t3_count * sizeof(T3PairingGpu)).wait(); + if (h_t3_owned) sycl::free(h_t3, q); + } + + // ---------- Phase T3 sort ---------- + // Compact / plain: full-cap CUB sort_keys with separate keys_in + // (= d_t3) and keys_out (= d_frags_out) buffers — peaks at + // 2 × cap × u64 + scratch ≈ 4228 MB at k=28. + // + // Minimal: tile the sort in halves with a single cap/2 output + // buffer, D2H each tile to host pinned, std::inplace_merge on + // host, then H2D the merged result back into the full-cap + // d_frags_out the D2H phase below expects. Drops T3 sort peak to + // ~3152 MB at k=28 (d_t3 2080 + tile output 1040 + sort scratch + // sized for cap/2 ≈ 32). Adds one cap-sized PCIe round-trip per + // plot. + stats.phase = "T3 sort"; + uint64_t* d_frags_in = reinterpret_cast(d_t3); + uint64_t* d_frags_out = nullptr; + + if (!t1_match_sliced) { + size_t t3_sort_bytes = 0; + launch_sort_keys_u64( + nullptr, t3_sort_bytes, + static_cast(nullptr), static_cast(nullptr), + cap, 0, 2 * cfg.k, q); + + s_malloc(stats, d_frags_out, cap * sizeof(uint64_t), "d_frags_out"); + s_malloc(stats, d_sort_scratch, t3_sort_bytes, "d_sort_scratch(t3)"); + + int p_t3_sort = begin_phase("T3 sort"); + launch_sort_keys_u64( + d_sort_scratch, t3_sort_bytes, + d_frags_in, d_frags_out, + t3_count, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, q); + end_phase(p_t3_sort); + + s_free(stats, d_t3); + s_free(stats, d_sort_scratch); + } else { + // Tiled sort + host merge. + uint64_t const tile_max = (cap + 1) / 2; + uint64_t const tile_n0 = t3_count / 2; + uint64_t const tile_n1 = t3_count - tile_n0; + + size_t t3_tile_sort_bytes = 0; + launch_sort_keys_u64( + nullptr, t3_tile_sort_bytes, + static_cast(nullptr), static_cast(nullptr), + tile_max, 0, 2 * cfg.k, q); + + uint64_t* d_frags_out_tile = nullptr; + void* d_sort_scratch_tile = nullptr; + s_malloc(stats, d_frags_out_tile, tile_max * sizeof(uint64_t), "d_frags_out_tile"); + s_malloc(stats, d_sort_scratch_tile, t3_tile_sort_bytes, "d_sort_scratch(t3_tile)"); + + uint64_t* h_frags = static_cast( + sycl::malloc_host(cap * sizeof(uint64_t), q)); + if (!h_frags) throw std::runtime_error("sycl::malloc_host(h_frags) failed"); + + int p_t3_sort = begin_phase("T3 sort"); + if (tile_n0 > 0) { + launch_sort_keys_u64( + d_sort_scratch_tile, t3_tile_sort_bytes, + d_frags_in, d_frags_out_tile, + tile_n0, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, q); + q.memcpy(h_frags, d_frags_out_tile, + tile_n0 * sizeof(uint64_t)).wait(); + } + if (tile_n1 > 0) { + launch_sort_keys_u64( + d_sort_scratch_tile, t3_tile_sort_bytes, + d_frags_in + tile_n0, d_frags_out_tile, + tile_n1, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, q); + q.memcpy(h_frags + tile_n0, d_frags_out_tile, + tile_n1 * sizeof(uint64_t)).wait(); + } + end_phase(p_t3_sort); + + s_free(stats, d_frags_out_tile); + s_free(stats, d_sort_scratch_tile); + s_free(stats, d_t3); + + // Stable in-place merge of [0, tile_n0) and [tile_n0, t3_count) + // — both halves are individually sorted by launch_sort_keys_u64. + std::inplace_merge(h_frags, h_frags + tile_n0, h_frags + t3_count); + + // Re-hydrate full-cap d_frags_out for the existing D2H phase. 
+ s_malloc(stats, d_frags_out, cap * sizeof(uint64_t), "d_frags_out"); + if (t3_count > 0) { + q.memcpy(d_frags_out, h_frags, t3_count * sizeof(uint64_t)).wait(); + } + sycl::free(h_frags, q); + } + + // ---------- D2H ---------- + // Two destination modes: + // caller-supplied pinned_dst (batch): copy D2H into pinned_dst and + // return a BORROWING result (external_fragments_ptr). Consumer + // must finish reading pinned_dst before the caller reuses it. + // no pinned_dst (one-shot): alloc a temp pinned region sized to + // t3_count, D2H, copy to an OWNING vector, free the temp. + stats.phase = "D2H"; + GpuPipelineResult result; + result.t1_count = t1_count; + result.t2_count = t2_count; + result.t3_count = t3_count; + + int p_d2h = begin_phase("D2H copy T3 fragments (pinned)"); + if (t3_count > 0) { + if (pinned_dst) { + if (pinned_capacity < t3_count) { + throw std::runtime_error( + "run_gpu_pipeline_streaming: pinned_capacity " + + std::to_string(pinned_capacity) + + " < t3_count " + std::to_string(t3_count)); + } + q.memcpy(pinned_dst, d_frags_out, sizeof(uint64_t) * t3_count); + q.wait(); + result.external_fragments_ptr = pinned_dst; + result.external_fragments_count = t3_count; + } else { + uint64_t* h_pinned = nullptr; + h_pinned = static_cast( + sycl::malloc_host(sizeof(uint64_t) * t3_count, sycl_backend::queue())); + if (!h_pinned) throw std::runtime_error("sycl::malloc_host(h_pinned) failed"); + q.memcpy(h_pinned, d_frags_out, sizeof(uint64_t) * t3_count); + q.wait(); + result.t3_fragments_storage.resize(t3_count); + std::memcpy(result.t3_fragments_storage.data(), h_pinned, + sizeof(uint64_t) * t3_count); + sycl::free(h_pinned, sycl_backend::queue()); + } + } + end_phase(p_d2h); + + s_free(stats, d_frags_out); + s_free(stats, d_counter); + + if (stats.verbose) { + std::fprintf(stderr, + "[streaming] k=%d strength=%d peak device VRAM = %.2f MB\n", + cfg.k, cfg.strength, stats.peak / 1048576.0); + } + report_phases(); + return result; +} + +} // namespace (anon — streaming impl) + +uint64_t* streaming_alloc_pinned_uint64(size_t count) +{ + uint64_t* p = nullptr; + p = static_cast( + sycl::malloc_host(count * sizeof(uint64_t), sycl_backend::queue())); + if (!p) return nullptr; + return p; +} + +uint32_t* streaming_alloc_pinned_uint32(size_t count) +{ + uint32_t* p = static_cast( + sycl::malloc_host(count * sizeof(uint32_t), sycl_backend::queue())); + return p; // nullptr on failure +} + +void streaming_free_pinned_uint32(uint32_t* ptr) +{ + if (ptr) sycl::free(ptr, sycl_backend::queue()); +} + +void streaming_free_pinned_uint64(uint64_t* ptr) +{ + if (ptr) sycl::free(ptr, sycl_backend::queue()); +} + +void bind_current_device(int device_id) +{ + sycl_backend::set_current_device_id(device_id); +} + +int gpu_device_count() +{ + try { + return sycl_backend::get_gpu_device_count(); + } catch (...) { + return 0; + } +} + +} // namespace pos2gpu diff --git a/src/host/GpuPipeline.cu b/src/host/GpuPipeline.cu deleted file mode 100644 index 2b28b7d..0000000 --- a/src/host/GpuPipeline.cu +++ /dev/null @@ -1,411 +0,0 @@ -// GpuPipeline.cu — orchestrates Xs → T1 → T2 → T3 on the device, with -// CUB radix sort between phases (each phase consumes sorted-by-match_info -// input). Final T3 output is sorted by proof_fragment (low 2k bits) to -// match pos2-chip Table3Constructor::post_construct_span. -// -// Two overloads live here: -// run_gpu_pipeline(cfg) — transient pool, one-shot. -// run_gpu_pipeline(cfg, pool) — shared pool, batch-friendly. 
This is the -// real implementation; the one-shot form -// just wraps it in a temporary pool. - -#include "host/GpuPipeline.hpp" -#include "host/GpuBufferPool.hpp" - -#include "gpu/AesGpu.cuh" -#include "gpu/XsKernel.cuh" -#include "gpu/T1Kernel.cuh" -#include "gpu/T2Kernel.cuh" -#include "gpu/T3Kernel.cuh" - -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace pos2gpu { - -namespace { - -#define CHECK(call) do { \ - cudaError_t err = (call); \ - if (err != cudaSuccess) { \ - throw std::runtime_error(std::string("CUDA: ") + \ - cudaGetErrorString(err)); \ - } \ -} while (0) - -// ===================================================================== -// T1 sort: by match_info, low k bits, stable. Uses CUB SortPairs with -// (key=match_info, value=index) then permutes T1Pairings. -// ===================================================================== - -// Permute the T1 match output by sort indices, writing only the 8-byte -// meta (meta_hi << 32 | meta_lo). match_info already lives in the sort's -// key-output stream so we don't rematerialise it; the T2 match kernel -// consumes (sorted_meta, sorted_mi) directly. -__global__ void permute_t1( - T1PairingGpu const* __restrict__ src, - uint32_t const* __restrict__ indices, - uint64_t* __restrict__ dst_meta, - uint64_t count) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= count) return; - T1PairingGpu s = src[indices[idx]]; - dst_meta[idx] = (uint64_t(s.meta_hi) << 32) | uint64_t(s.meta_lo); -} - -__global__ void extract_t1_keys( - T1PairingGpu const* __restrict__ src, - uint32_t* __restrict__ keys_out, - uint32_t* __restrict__ vals_out, - uint64_t count) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= count) return; - keys_out[idx] = src[idx].match_info; - vals_out[idx] = uint32_t(idx); -} - -// ===================================================================== -// T2 sort: same shape — sort indices by match_info. -// ===================================================================== - -// T3 match reads meta (8 B) and x_bits (4 B) from sorted_t2 but does not -// touch match_info (passed as the parallel sorted_mi stream). Splitting -// the sort output into meta[] and xbits[] arrays drops the per-access -// line footprint from 16 B to 12 B, cutting L1/TEX line fetches on an -// L1-throughput-bound kernel. 
-__global__ void permute_t2( - T2PairingGpu const* __restrict__ src, - uint32_t const* __restrict__ indices, - uint64_t* __restrict__ dst_meta, - uint32_t* __restrict__ dst_xbits, - uint64_t count) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= count) return; - T2PairingGpu p = src[indices[idx]]; - dst_meta[idx] = p.meta; - dst_xbits[idx] = p.x_bits; -} - -__global__ void extract_t2_keys( - T2PairingGpu const* __restrict__ src, - uint32_t* __restrict__ keys_out, - uint32_t* __restrict__ vals_out, - uint64_t count) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= count) return; - keys_out[idx] = src[idx].match_info; - vals_out[idx] = uint32_t(idx); -} - -} // namespace - -GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg, - GpuBufferPool& pool, - int pinned_index) -{ - if (cfg.k < 18 || cfg.k > 32 || (cfg.k & 1) != 0) { - throw std::runtime_error("k must be even in [18, 32]"); - } - if (cfg.strength < 2) { - throw std::runtime_error("strength must be >= 2"); - } - if (pool.k != cfg.k || pool.strength != cfg.strength - || pool.testnet != cfg.testnet) - { - throw std::runtime_error( - "GpuBufferPool was sized for different (k, strength, testnet)"); - } - if (pinned_index < 0 || pinned_index > 1) { - throw std::runtime_error("pinned_index must be 0 or 1"); - } - - uint64_t const total_xs = pool.total_xs; - uint64_t const cap = pool.cap; - - constexpr int kThreads = 256; - auto blocks = [&](uint64_t n) { - return unsigned((n + kThreads - 1) / kThreads); - }; - - cudaStream_t stream = nullptr; // default stream - - // ---- pool aliases ---- - // d_pair_a carries the "current phase match output": T1, then T2, then T3. - // d_pair_b carries the "current phase sort output": sorted T1, sorted T2, - // then final uint64_t fragments. Each subsequent phase's output overwrites - // the previous (consumed) contents in the same slot. - XsCandidateGpu* d_xs = static_cast(pool.d_storage); - T1PairingGpu* d_t1 = static_cast (pool.d_pair_a); - // Sorted T1 is now just meta (8 B/entry) — match_info comes from sort keys. - uint64_t* d_t1_meta_sorted = static_cast (pool.d_pair_b); - T2PairingGpu* d_t2 = static_cast (pool.d_pair_a); - // Sorted T2 is SoA-split across d_pair_b: meta[cap] then xbits[cap], - // 12 B total per entry (fits in d_pair_b's 16 B/entry budget). T3 - // match reads both; frags_out later reuses d_pair_b from offset 0. - uint64_t* d_t2_meta_sorted = static_cast (pool.d_pair_b); - uint32_t* d_t2_xbits_sorted = reinterpret_cast( - static_cast(pool.d_pair_b) + pool.cap * sizeof(uint64_t)); - T3PairingGpu* d_t3 = static_cast (pool.d_pair_a); - uint64_t* d_frags_out = static_cast (pool.d_pair_b); - - uint64_t* d_count = pool.d_counter; - // Xs phase needs ~4.34 GB scratch at k=28; d_pair_b is idle through - // the whole Xs phase (not touched until T1 sort permute writes to it), - // so we alias it rather than allocating separately. - void* d_xs_temp = pool.d_pair_b; - void* d_sort_scratch = pool.d_sort_scratch; - uint64_t* h_pinned_t3 = pool.h_pinned_t3[pinned_index]; - // T1/T2/T3 match kernels report 0 scratch bytes, but some CUDA paths - // reject a nullptr d_temp_storage with cudaErrorInvalidArgument even - // when bytes==0. Point them at d_sort_scratch (idle during match) to - // give the kernel a valid non-null handle. - void* d_match_temp = pool.d_sort_scratch; - - // Sort key/val arrays alias d_storage. Safe because Xs is fully consumed - // by T1 match (stream-synchronised) before we enter T1 sort. 
- auto storage_u32 = static_cast(pool.d_storage); - uint32_t* d_keys_in = storage_u32 + 0 * cap; - uint32_t* d_keys_out = storage_u32 + 1 * cap; - uint32_t* d_vals_in = storage_u32 + 2 * cap; - uint32_t* d_vals_out = storage_u32 + 3 * cap; - - // ---- profiling: cudaEvent helpers ---- - struct PhaseTimer { - cudaEvent_t start, stop; - std::string label; - }; - std::vector phases; - auto begin_phase = [&](char const* label) -> int { - if (!cfg.profile) return -1; - PhaseTimer pt; - pt.label = label; - cudaEventCreate(&pt.start); - cudaEventCreate(&pt.stop); - cudaEventRecord(pt.start, stream); - phases.push_back(pt); - return int(phases.size()) - 1; - }; - auto end_phase = [&](int idx) { - if (!cfg.profile || idx < 0) return; - cudaEventRecord(phases[idx].stop, stream); - }; - auto report_phases = [&]() { - if (!cfg.profile) return; - cudaDeviceSynchronize(); - std::fprintf(stderr, "=== gpu_pipeline phase breakdown ===\n"); - float total_ms = 0; - for (auto& pt : phases) { - float ms = 0; - cudaEventElapsedTime(&ms, pt.start, pt.stop); - std::fprintf(stderr, " %-30s %8.2f ms\n", pt.label.c_str(), ms); - total_ms += ms; - cudaEventDestroy(pt.start); - cudaEventDestroy(pt.stop); - } - std::fprintf(stderr, " %-30s %8.2f ms\n", "TOTAL device time:", total_ms); - }; - - // ---------- Phase Xs ---------- - size_t xs_temp_bytes = 0; - CHECK(launch_construct_xs(cfg.plot_id.data(), cfg.k, cfg.testnet, - nullptr, nullptr, &xs_temp_bytes)); - cudaEvent_t e_xs_start = nullptr, e_xs_gen_done = nullptr, e_xs_sort_done = nullptr; - if (cfg.profile) { - cudaEventCreate(&e_xs_start); - cudaEventCreate(&e_xs_gen_done); - cudaEventCreate(&e_xs_sort_done); - cudaEventRecord(e_xs_start, stream); - } - CHECK(launch_construct_xs_profiled(cfg.plot_id.data(), cfg.k, cfg.testnet, - d_xs, d_xs_temp, &xs_temp_bytes, - e_xs_gen_done, e_xs_sort_done, stream)); - - // ---------- Phase T1 ---------- - auto t1p = make_t1_params(cfg.k, cfg.strength); - size_t t1_temp_bytes = 0; - CHECK(launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, - d_t1, d_count, cap, - nullptr, &t1_temp_bytes)); - CHECK(cudaMemsetAsync(d_count, 0, sizeof(uint64_t), stream)); - int p_t1 = begin_phase("T1 match"); - CHECK(launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, - d_t1, d_count, cap, - d_match_temp, &t1_temp_bytes, stream)); - end_phase(p_t1); - - // No explicit sync: the next cudaMemcpy (non-async, default stream) - // implicitly drains prior stream work before the host reads t1_count. - uint64_t t1_count = 0; - CHECK(cudaMemcpy(&t1_count, d_count, sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - if (t1_count > cap) throw std::runtime_error("T1 overflow"); - - // Sort T1 by match_info (low k bits). d_storage is now repurposed - // as (keys_in, keys_out, vals_in, vals_out), Xs having been fully - // consumed by T1 match above. - int p_t1_sort = begin_phase("T1 sort"); - { - extract_t1_keys<<>>( - d_t1, d_keys_in, d_vals_in, t1_count); - CHECK(cudaGetLastError()); - - size_t sort_bytes = pool.sort_scratch_bytes; - CHECK(cub::DeviceRadixSort::SortPairs( - d_sort_scratch, sort_bytes, - d_keys_in, d_keys_out, d_vals_in, d_vals_out, - t1_count, /*begin_bit=*/0, /*end_bit=*/cfg.k, stream)); - - permute_t1<<>>( - d_t1, d_vals_out, d_t1_meta_sorted, t1_count); - CHECK(cudaGetLastError()); - } - end_phase(p_t1_sort); - - // ---------- Phase T2 ---------- - // Sorted T1 = (d_t1_meta_sorted: uint64 meta, d_keys_out: uint32 match_info). 
- // No AoS struct anymore — saves 33 % of sorted-T1 bandwidth on both the - // permute write and the match-kernel hot path. - auto t2p = make_t2_params(cfg.k, cfg.strength); - size_t t2_temp_bytes = 0; - CHECK(launch_t2_match(cfg.plot_id.data(), t2p, nullptr, nullptr, t1_count, - d_t2, d_count, cap, - nullptr, &t2_temp_bytes)); - CHECK(cudaMemsetAsync(d_count, 0, sizeof(uint64_t), stream)); - int p_t2 = begin_phase("T2 match"); - CHECK(launch_t2_match(cfg.plot_id.data(), t2p, d_t1_meta_sorted, d_keys_out, t1_count, - d_t2, d_count, cap, - d_match_temp, &t2_temp_bytes, stream)); - end_phase(p_t2); - - uint64_t t2_count = 0; - CHECK(cudaMemcpy(&t2_count, d_count, sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - if (t2_count > cap) throw std::runtime_error("T2 overflow"); - - int p_t2_sort = begin_phase("T2 sort"); - { - extract_t2_keys<<>>( - d_t2, d_keys_in, d_vals_in, t2_count); - CHECK(cudaGetLastError()); - - size_t sort_bytes = pool.sort_scratch_bytes; - CHECK(cub::DeviceRadixSort::SortPairs( - d_sort_scratch, sort_bytes, - d_keys_in, d_keys_out, d_vals_in, d_vals_out, - t2_count, 0, cfg.k, stream)); - - permute_t2<<>>( - d_t2, d_vals_out, d_t2_meta_sorted, d_t2_xbits_sorted, t2_count); - CHECK(cudaGetLastError()); - } - end_phase(p_t2_sort); - - // ---------- Phase T3 ---------- - // d_keys_out now holds the T2 sorted match_info (T1's was overwritten by - // the T2 sort above) — pass as the slim stream for binary search in T3. - auto t3p = make_t3_params(cfg.k, cfg.strength); - size_t t3_temp_bytes = 0; - CHECK(launch_t3_match(cfg.plot_id.data(), t3p, - d_t2_meta_sorted, d_t2_xbits_sorted, - nullptr, t2_count, - d_t3, d_count, cap, - nullptr, &t3_temp_bytes)); - CHECK(cudaMemsetAsync(d_count, 0, sizeof(uint64_t), stream)); - int p_t3 = begin_phase("T3 match + Feistel"); - CHECK(launch_t3_match(cfg.plot_id.data(), t3p, - d_t2_meta_sorted, d_t2_xbits_sorted, - d_keys_out, t2_count, - d_t3, d_count, cap, - d_match_temp, &t3_temp_bytes, stream)); - end_phase(p_t3); - - uint64_t t3_count = 0; - CHECK(cudaMemcpy(&t3_count, d_count, sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - if (t3_count > cap) throw std::runtime_error("T3 overflow"); - - // Sort T3 by proof_fragment (low 2k bits). T3PairingGpu is just a - // uint64_t, so reinterpret the d_pair_a slot directly. - uint64_t* d_frags_in = reinterpret_cast(d_t3); - int p_t3_sort = begin_phase("T3 sort"); - { - size_t sort_bytes = pool.sort_scratch_bytes; - CHECK(cub::DeviceRadixSort::SortKeys( - d_sort_scratch, sort_bytes, - d_frags_in, d_frags_out, - t3_count, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, stream)); - } - end_phase(p_t3_sort); - - // ---------- D2H ---------- - int p_d2h = begin_phase("D2H copy T3 fragments (pinned)"); - GpuPipelineResult result; - result.t1_count = t1_count; - result.t2_count = t2_count; - result.t3_count = t3_count; - - if (t3_count > 0) { - CHECK(cudaMemcpyAsync(h_pinned_t3, d_frags_out, - sizeof(uint64_t) * t3_count, - cudaMemcpyDeviceToHost, stream)); - CHECK(cudaStreamSynchronize(stream)); - } - end_phase(p_d2h); - - if (t3_count > 0) { - // Borrow: caller (batch producer) promises to finish consuming this - // pinned slot before reusing it for another plot. - result.external_fragments_ptr = h_pinned_t3; - result.external_fragments_count = t3_count; - } - - // Inject Xs gen / sort timings before reporting (avoids the double-event - // ownership headache by handling them out-of-band here). 
- if (cfg.profile) { - cudaDeviceSynchronize(); - float gen_ms = 0, sort_ms = 0; - cudaEventElapsedTime(&gen_ms, e_xs_start, e_xs_gen_done); - cudaEventElapsedTime(&sort_ms, e_xs_gen_done, e_xs_sort_done); - std::fprintf(stderr, " %-30s %8.2f ms\n", "Xs gen (g_x)", gen_ms); - std::fprintf(stderr, " %-30s %8.2f ms\n", "Xs sort", sort_ms); - cudaEventDestroy(e_xs_start); - cudaEventDestroy(e_xs_gen_done); - cudaEventDestroy(e_xs_sort_done); - } - - report_phases(); - return result; -} - -GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg) -{ - // One-shot convenience path: build a transient pool and run through it. - // Pays the full per-call allocator overhead (~2.4 s for k=28). Batch - // callers should construct a pool once and reuse it via the overload. - GpuBufferPool pool(cfg.k, cfg.strength, cfg.testnet); - GpuPipelineResult r = run_gpu_pipeline(cfg, pool, /*pinned_index=*/0); - // Pool (and its pinned buffer) is about to be destroyed, so materialise - // a self-contained copy before returning. - if (r.external_fragments_ptr && r.external_fragments_count > 0) { - r.t3_fragments_storage.resize(r.external_fragments_count); - std::memcpy(r.t3_fragments_storage.data(), - r.external_fragments_ptr, - sizeof(uint64_t) * r.external_fragments_count); - } - r.external_fragments_ptr = nullptr; - r.external_fragments_count = 0; - return r; -} - -} // namespace pos2gpu diff --git a/src/host/GpuPipeline.hpp b/src/host/GpuPipeline.hpp index ae8fabd..f70037e 100644 --- a/src/host/GpuPipeline.hpp +++ b/src/host/GpuPipeline.hpp @@ -62,6 +62,10 @@ struct GpuPipelineResult { // One-shot path: allocates a transient pool, runs the pipeline, then copies // the pinned T3 fragments into t3_fragments_storage so the result is // self-contained after the pool is destroyed. +// +// If XCHPLOT2_STREAMING=1 is set in the environment, this routes through +// run_gpu_pipeline_streaming() instead — useful for exercising the low-VRAM +// path from unchanged call sites. GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg); // Batch path: runs the pipeline writing D2H into pool.h_pinned_t3[pinned_index] @@ -74,4 +78,114 @@ GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg, GpuBufferPool& pool, int pinned_index); +// Streaming path: per-phase cudaMalloc / cudaFree instead of a persistent +// pool. Targets GPUs where the full pool (~15 GB at k=28) will not fit. +// +// Two overloads: +// run_gpu_pipeline_streaming(cfg) +// Allocates an internal pinned staging buffer for the final D2H, +// copies fragments into an owning std::vector, frees the pinned +// buffer. Self-contained result. Simplest for one-shot callers. +// +// run_gpu_pipeline_streaming(cfg, pinned_dst, pinned_capacity) +// Caller supplies a pinned host buffer (size ≥ cap × sizeof(uint64_t)) +// that the pipeline uses as the D2H target. Result borrows into +// pinned_dst via external_fragments_ptr; caller must not overwrite +// pinned_dst while the consumer is still reading it. Use this from +// BatchPlotter's streaming fallback to amortise the ~600 ms +// cudaMallocHost cost across plots and double-buffer D2H with the +// FSE consumer thread the same way the pool path does. +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg); +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity); + +// Caller-provided pinned-host scratch buffers for the streaming path. 
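The header comments above spell out the borrow contract for the pinned-destination overload; a caller-side sketch may make it concrete. This is an illustrative batch loop under stated assumptions — `consume_fragments` and the surrounding loop are hypothetical, not BatchPlotter's actual code, and `cap` is whatever fragment capacity the caller has computed for its k:

    #include "host/GpuPipeline.hpp"
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Hypothetical batch loop: one pinned D2H target reused for every plot,
    // so the pinned-allocation cost is paid once instead of per plot.
    void plot_batch(std::vector<pos2gpu::GpuPipelineConfig> const& plots, size_t cap)
    {
        uint64_t* pinned = pos2gpu::streaming_alloc_pinned_uint64(cap);
        if (!pinned) throw std::runtime_error("pinned alloc failed");

        for (auto const& cfg : plots) {
            auto r = pos2gpu::run_gpu_pipeline_streaming(cfg, pinned, cap);
            // r.external_fragments_ptr borrows into `pinned`: finish consuming
            // (FSE encode / write) before the next iteration reuses the slot.
            consume_fragments(r.external_fragments_ptr,   // hypothetical consumer
                              r.external_fragments_count);
        }
        pos2gpu::streaming_free_pinned_uint64(pinned);
    }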
+// Allocate once per batch in BatchPlotter, reuse across all plots — +// avoids paying the ~300–600 ms sycl::malloc_host cost per plot per +// buffer on NVIDIA (measured as the dominant per-plot overhead in +// stages 4b-4e streaming runs). Lifetime analysis shows that phases +// using these buffers do not overlap, so two pairs can share a single +// allocation each: +// h_meta (cap × u64): T1 meta park → T2 meta park +// h_keys_merged (cap × u32): T1 keys_merged park → T2 keys_merged park +// h_t2_xbits (cap × u32): T2 xbits park (distinct) +// h_t3 (cap × T3PairingGpu = u64): T3 staging (distinct) +// +// Any field left nullptr makes the streaming pipeline allocate-on- +// demand for that buffer (one-shot `test` mode). A fully-populated +// StreamingPinnedScratch saves all 6 sycl::malloc_host calls per plot. +struct StreamingPinnedScratch { + uint64_t* h_meta = nullptr; + uint32_t* h_keys_merged = nullptr; + uint32_t* h_t2_xbits = nullptr; + uint64_t* h_t3 = nullptr; // reinterpreted as T3PairingGpu* + + // Plain mode: skip all parks and use single-pass T2 match. Higher + // peak (~7.3 GB at k=28) than compact (~5.2 GB) but ~400 ms/plot + // faster because there are no PCIe round-trips for T1 meta / T1 + // keys_merged / T2 meta / T2 xbits / T2 keys_merged parks. The + // BatchPlotter picks this tier when free VRAM fits the plain peak + // but not the pool (12-14 GB cards). When true, the h_* pointers + // above are ignored — plain mode does not park anything. + bool plain_mode = false; + + // T2 match staging tile count (compact path only — ignored when + // plain_mode is true). compact uses 2 (cap/2 staging, ~2.3 GB at + // k=28); minimal sets it to 8 (cap/8 staging, ~570 MB) to fit 4 + // GiB cards at the cost of more PCIe round-trips during T2 match. + // Must be a power of 2 in [2, t2_num_buckets] — at k=28 strength=2 + // that's [2, 16]. BatchPlotter's tier selection sets it. + int t2_tile_count = 2; + + // Sort-gather tile count (compact path only — ignored when + // plain_mode is true). Each of T1-sort gather, T2-sort meta gather, + // and T2-sort xbits gather peaks at ~5200 MB at k=28 because the + // input meta + indices + output buffer are all cap-sized and live + // simultaneously. With gather_tile_count = N > 1, the gather runs + // in N tiles, D2H'ing each tile to a host pinned staging buffer + // (reusing the parking scratch h_meta / h_t2_xbits) and + // re-allocating the full sorted output afterward via H2D. Drops + // each gather peak from 5200 to ~3640 MB at N=4 (peak = full input + // 2080 + indices 1040 + tile output 520). Default 1 = no tiling + // (compact / plain). Minimal tier sets it to 4. Adds ~3 PCIe round + // trips of cap-sized data per plot. + int gather_tile_count = 1; +}; + +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity, + StreamingPinnedScratch const& scratch); + +// Allocate / free host-pinned memory — thin wrappers around +// cudaMallocHost / cudaFreeHost, exposed so plain .cpp consumers (which +// do not have cuda_runtime.h on the include path) can own the pinned +// buffers the streaming overload expects. Returns nullptr on failure. +uint64_t* streaming_alloc_pinned_uint64(size_t count); +void streaming_free_pinned_uint64(uint64_t* ptr); + +uint32_t* streaming_alloc_pinned_uint32(size_t count); +void streaming_free_pinned_uint32(uint32_t* ptr); + +// Multi-GPU device binding. 
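To make the tier knobs above concrete, here is a sketch of how a low-VRAM caller might populate the scratch struct for the minimal tier. The tier choice, `cap`, `pinned_dst`, and `cfg` are illustrative assumptions taken from the comments above — BatchPlotter's real tier-selection code is not shown in this diff:

    // Illustrative minimal-tier setup: all four pinned park buffers
    // pre-allocated once, T2 match split into 8 staging tiles and the
    // sort gathers into 4 tiles (the values the comments above quote
    // for the minimal tier).
    pos2gpu::StreamingPinnedScratch scratch;
    scratch.h_meta            = pos2gpu::streaming_alloc_pinned_uint64(cap);
    scratch.h_keys_merged     = pos2gpu::streaming_alloc_pinned_uint32(cap);
    scratch.h_t2_xbits        = pos2gpu::streaming_alloc_pinned_uint32(cap);
    scratch.h_t3              = pos2gpu::streaming_alloc_pinned_uint64(cap);
    scratch.plain_mode        = false; // minimal tier parks, so not plain
    scratch.t2_tile_count     = 8;     // cap/8 staging (~570 MB at k=28)
    scratch.gather_tile_count = 4;     // gather peak ~3640 MB at k=28

    auto r = pos2gpu::run_gpu_pipeline_streaming(cfg, pinned_dst, cap, scratch);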
bind_current_device() sets a thread-local +// target device id that sycl_backend::queue() reads when lazily +// constructing the worker thread's queue. Must be called on the worker +// thread BEFORE any kernel launch on that thread — ideally as the very +// first statement of the worker lambda. +// +// device_id < 0 → use the default SYCL gpu_selector_v (single-device, +// pre-multi-GPU behavior). Calling with -1 from the main thread is a +// no-op and is always safe. +// +// gpu_device_count() returns the number of SYCL GPU devices the runtime +// can enumerate, or 0 on error. BatchPlotter uses it to expand +// `--devices all` into an explicit id list. +// +// Declared here (instead of in SyclBackend.hpp) so plain .cpp consumers +// like BatchPlotter.cpp can call them without pulling +// onto their include path. +void bind_current_device(int device_id); +int gpu_device_count(); + } // namespace pos2gpu diff --git a/src/host/PlotFileWriterParallel.cpp b/src/host/PlotFileWriterParallel.cpp index 9f7c18f..5485888 100644 --- a/src/host/PlotFileWriterParallel.cpp +++ b/src/host/PlotFileWriterParallel.cpp @@ -18,11 +18,18 @@ #include "plot/PlotIO.hpp" #include "plot/Plotter.hpp" #include "pos/ProofParams.hpp" +#include "pos/ProofValidator.hpp" +#include "prove/Prover.hpp" #include +#include +#include +#include #include #include +#include #include +#include #include #include @@ -141,8 +148,23 @@ size_t write_plot_file_parallel( for (auto& f : tasks) f.get(); } - // Serial write phase — file I/O is sequential anyway. - std::ofstream out(filename, std::ios::binary); + // Serial write phase — file I/O is sequential anyway. Write to + // .partial and rename on success so SIGINT / crash / ENOSPC + // never leaves a malformed .plot2 at the destination. The guard + // unlinks the partial on early exit. + std::string const partial = filename + ".partial"; + struct PartialGuard { + std::string const& path; + bool committed = false; + ~PartialGuard() { + if (!committed) { + std::error_code ec; + std::filesystem::remove(path, ec); + } + } + } guard{partial}; + + std::ofstream out(partial, std::ios::binary | std::ios::trunc); if (!out) throw std::runtime_error("Failed to open " + filename); out.write("pos2", 4); @@ -191,9 +213,50 @@ size_t write_plot_file_parallel( if (!out) throw std::runtime_error("Failed to write chunk offsets to " + filename); out.seekp(0, std::ios::end); + // Close before rename so buffered writes are flushed and the destination + // sees the final byte image. + out.close(); + if (!out) throw std::runtime_error("Failed to close " + partial); + + std::error_code ec; + std::filesystem::rename(partial, filename, ec); + if (ec) { + throw std::runtime_error( + "Failed to rename " + partial + " -> " + filename + ": " + ec.message()); + } + guard.committed = true; + return bytes_written; } +VerifyResult verify_plot_file(std::string const& filename, size_t n_trials) +{ + VerifyResult res; + if (n_trials == 0) return res; + + Prover prover(filename); + + // Fresh entropy per call; the result only depends on the plot content, + // not the specific challenges, beyond being a uniform sample. 
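The multi-GPU contract described a few hunks up (bind the device on the worker thread before any kernel launch) is easiest to see as a sketch. The thread-per-device layout below is illustrative only, not BatchPlotter's actual scheduler:

    #include "host/GpuPipeline.hpp"
    #include <thread>
    #include <vector>

    // One worker thread per visible GPU; each binds its device id first,
    // so the lazily constructed per-thread queue targets the right card.
    void run_on_all_gpus()
    {
        int const n = pos2gpu::gpu_device_count();   // 0 on enumeration error
        std::vector<std::thread> workers;
        for (int dev = 0; dev < n; ++dev) {
            workers.emplace_back([dev] {
                pos2gpu::bind_current_device(dev);   // first statement, before any launch
                // ... run the pipeline for this worker's share of plots ...
            });
        }
        for (auto& w : workers) w.join();
    }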
+ std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + + for (size_t i = 0; i < n_trials; ++i) { + std::array challenge{}; + for (size_t j = 0; j < 32; j += 8) { + uint64_t const v = dist(gen); + std::memcpy(challenge.data() + j, &v, 8); + } + auto const chains = prover.prove( + std::span(challenge.data(), 32)); + res.trials++; + res.proofs_found += chains.size(); + if (!chains.empty()) res.challenges_with_proof++; + } + return res; +} + std::vector read_plot_file_fragments(std::string const& filename) { PlotFile::PlotFileContents contents = PlotFile::readAllChunkedData(filename); diff --git a/src/host/PlotFileWriterParallel.hpp b/src/host/PlotFileWriterParallel.hpp index f066ad5..70acfdb 100644 --- a/src/host/PlotFileWriterParallel.hpp +++ b/src/host/PlotFileWriterParallel.hpp @@ -64,4 +64,21 @@ std::vector run_cpu_plotter_to_fragments( // plot/PlotFile.hpp to other TUs. std::vector read_plot_file_fragments(std::string const& filename); +// Result of a `verify_plot_file` call. +// trials — how many random challenges were tried +// challenges_with_proof — challenges that produced ≥ 1 proof +// proofs_found — total proofs summed across all trials +struct VerifyResult { + size_t trials = 0; + size_t challenges_with_proof = 0; + size_t proofs_found = 0; +}; + +// Opens `filename` via pos2-chip's `Prover` and runs `n_trials` random +// challenges. Each proof is internally validated by the prover; a result +// with zero proofs across a sensible sample (>= 100) strongly suggests +// the plot is corrupt. Lives here because Prover.hpp transitively pulls +// in pos2-chip plot/pos headers (see top-of-file comment in the .cpp). +VerifyResult verify_plot_file(std::string const& filename, size_t n_trials); + } // namespace pos2gpu diff --git a/src/host/PoolSizing.hpp b/src/host/PoolSizing.hpp new file mode 100644 index 0000000..abf7054 --- /dev/null +++ b/src/host/PoolSizing.hpp @@ -0,0 +1,26 @@ +// PoolSizing.hpp — inline helpers shared by the buffer pool, the +// pipeline orchestrator, and the match-kernel wrappers. Kept here so a +// single formula change updates every consumer. + +#pragma once + +#include +#include + +namespace pos2gpu { + +// Maximum L-side rows that can fall into any single (section, match_key) +// bucket at the given (k, section_bits). Used to size the persistent +// pool AND as the safe over-launch upper bound for the match kernels' +// `blocks_x` dimension. Over-launched threads early-exit on the +// `l >= l_end` guard at the top of the match body, so slight +// over-launch is free on the GPU. +// +// Formula mirrors pos2-chip's TableConstructorGeneric.hpp:23. +inline std::size_t max_pairs_per_section(int k, int num_section_bits) noexcept +{ + int const extra_margin_bits = 8 - ((28 - k) / 2); + return (1ULL << (k - num_section_bits)) + (1ULL << (k - extra_margin_bits)); +} + +} // namespace pos2gpu diff --git a/tools/parity/ParityCommon.hpp b/tools/parity/ParityCommon.hpp new file mode 100644 index 0000000..9e0660c --- /dev/null +++ b/tools/parity/ParityCommon.hpp @@ -0,0 +1,83 @@ +// ParityCommon.hpp — shared harness helpers for the parity tests. +// +// Keeps the PRNG seed shape, mismatch-reporting format, and the CUDA +// error-check macro consistent across every `*_parity` / `*_bench` +// binary in this directory. The audit that motivated this header +// found ~170 lines of verbatim copy-paste across 7-9 files (same +// derive_plot_id, same Stats/compare shape, same CHECK macro). 
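Since max_pairs_per_section (a few hunks back) is a one-line formula that doubles as both the pool size and the match kernels' safe over-launch bound, a worked instance may help. The num_section_bits value below is an assumption for illustration — the real value comes from ProofParams — and the arithmetic is just the formula evaluated by hand:

    #include "host/PoolSizing.hpp"
    #include <cstdio>

    // Worked instances (num_section_bits assumed = 2, i.e. 4 sections):
    //   k=28: extra_margin_bits = 8 - (28-28)/2 = 8
    //         2^(28-2) + 2^(28-8) = 67,108,864 + 1,048,576 = 68,157,440 rows
    //   k=18: extra_margin_bits = 8 - (28-18)/2 = 3
    //         2^(18-2) + 2^(18-3) =     65,536 +    32,768 =     98,304 rows
    int main()
    {
        std::printf("%zu\n", pos2gpu::max_pairs_per_section(28, 2)); // 68157440
        std::printf("%zu\n", pos2gpu::max_pairs_per_section(18, 2)); // 98304
    }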
+// +// Plain-header (inline) so .cu and .cpp TUs can both include it +// without changing the existing CMake layout. No library target +// needed. + +#pragma once + +#include +#include +#include +#include + +// CUDA error-check macro. Only meaningful inside a .cu TU (where +// cuda_runtime.h is in scope). Guarded behind __CUDACC__ so the +// header can still be included from plain .cpp parity tests for +// derive_plot_id / Stats / compare without pulling in CUDA. +#ifdef __CUDACC__ +#include +#define PARITY_CHECK(call) do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::fprintf(stderr, "CUDA error at %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(err)); \ + std::exit(2); \ + } \ +} while (0) +#endif + +namespace pos2gpu::parity { + +// Deterministic mixing from a 32-bit seed to a 32-byte plot_id. Not +// cryptographic — just spreads bits so parity tests for distinct seeds +// exercise non-trivially different plot_ids. Golden-ratio + splitmix- +// style step. +inline std::array derive_plot_id(uint32_t seed) +{ + std::array id{}; + uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; + for (std::size_t i = 0; i < id.size(); ++i) { + s = s * 6364136223846793005ULL + 1442695040888963407ULL; + id[i] = static_cast(s >> 56); + } + return id; +} + +// Mismatch counter with pretty-print of the first 5 errors per +// (seed, label). Keeps test output useful when a regression lands: +// you see which labelled comparison first diverges and at what +// index, without a multi-thousand-line fault log. +struct Stats { + uint64_t total = 0; + uint64_t mismatches = 0; + bool ok() const { return mismatches == 0; } +}; + +// Cmp is any `bool(uint64_t i)` — returns true when host index i +// agrees between CPU reference and GPU result. +template +Stats compare(uint64_t n, Cmp const& cmp, char const* label, uint32_t seed) +{ + Stats s; + s.total = n; + for (uint64_t i = 0; i < n; ++i) { + if (!cmp(i)) { + if (s.mismatches < 5) { + std::printf(" [seed=%u %s] MISMATCH at i=%llu\n", + seed, label, + static_cast(i)); + } + ++s.mismatches; + } + } + return s; +} + +} // namespace pos2gpu::parity diff --git a/tools/parity/aes_parity.cu b/tools/parity/aes_parity.cu index e39cc2c..db37f6f 100644 --- a/tools/parity/aes_parity.cu +++ b/tools/parity/aes_parity.cu @@ -19,6 +19,8 @@ #include "pos/aes/AesHash.hpp" #include "pos/aes/intrin_portable.h" +#include "ParityCommon.hpp" + #include #include #include @@ -29,6 +31,10 @@ namespace { +using pos2gpu::parity::derive_plot_id; +using pos2gpu::parity::Stats; +using pos2gpu::parity::compare; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -122,40 +128,6 @@ std::vector launch_and_collect( return out; \ }() -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - // Deterministic mixing — not crypto, just spreads bits across all 32 bytes. 
- uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - -struct Stats { - uint64_t total = 0; - uint64_t mismatches = 0; - bool ok() const { return mismatches == 0; } -}; - -template -Stats compare(uint64_t n, Cmp const& cmp, char const* label, uint32_t seed) -{ - Stats s; s.total = n; - for (uint64_t i = 0; i < n; ++i) { - if (!cmp(i)) { - if (s.mismatches < 5) { - std::printf(" [seed=%u %s] MISMATCH at i=%llu\n", seed, label, - static_cast(i)); - } - ++s.mismatches; - } - } - return s; -} - // Per-plot-id full sweep. bool run_for_plot_id(uint32_t seed) { diff --git a/tools/parity/sycl_bucket_offsets_parity.cpp b/tools/parity/sycl_bucket_offsets_parity.cpp new file mode 100644 index 0000000..e48730c --- /dev/null +++ b/tools/parity/sycl_bucket_offsets_parity.cpp @@ -0,0 +1,168 @@ +// sycl_bucket_offsets_parity — SYCL port of compute_bucket_offsets +// (src/gpu/T1Kernel.cu:58) verified against a CPU reference on synthetic +// input. First slice of the SYCL backend port: proves the AdaptiveCpp +// toolchain works end-to-end before we touch the production pipeline. +// +// The kernel is "for each bucket b in [0, num_buckets), find the lowest +// index i in `sorted` such that (sorted[i].match_info >> shift) >= b" — +// one thread per bucket runs a binary search and writes offsets[b]. +// Thread num_buckets writes the sentinel offsets[num_buckets] = total. +// +// Synthetic input: a sorted random XsCandidateGpu[] with match_info +// drawn uniformly from [0, num_buckets << shift) so every bucket is +// non-trivially populated. Reference is std::lower_bound on the same +// shifted key. Pass criterion: byte-for-byte memcmp of offsets[]. + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +// Local copy of pos2gpu::XsCandidateGpu — keeps this TU free of the +// CUDA-laden gpu/XsKernel.cuh include chain. Layout-checked below. 
+struct XsCandidateGpu { + uint32_t match_info; + uint32_t x; +}; +static_assert(sizeof(XsCandidateGpu) == 8, "must match pos2-chip Xs_Candidate layout"); + +std::vector make_sorted_input(uint64_t total, uint64_t value_range, uint32_t seed) +{ + std::mt19937_64 rng(seed); + std::vector v(total); + for (uint64_t i = 0; i < total; ++i) { + v[i].match_info = static_cast(rng() % value_range); + v[i].x = static_cast(rng()); + } + std::sort(v.begin(), v.end(), + [](XsCandidateGpu const& a, XsCandidateGpu const& b) { + return a.match_info < b.match_info; + }); + return v; +} + +std::vector reference_offsets( + std::vector const& sorted, + int num_match_target_bits, + uint32_t num_buckets) +{ + std::vector offsets(num_buckets + 1); + uint32_t const shift = static_cast(num_match_target_bits); + uint64_t const total = sorted.size(); + for (uint32_t b = 0; b < num_buckets; ++b) { + uint64_t lo = 0, hi = total; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t v = sorted[mid].match_info >> shift; + if (v < b) lo = mid + 1; + else hi = mid; + } + offsets[b] = lo; + } + offsets[num_buckets] = total; + return offsets; +} + +std::vector sycl_offsets( + sycl::queue& q, + std::vector const& sorted, + int num_match_target_bits, + uint32_t num_buckets) +{ + uint64_t const total = sorted.size(); + size_t const out_count = static_cast(num_buckets) + 1; + constexpr size_t threads = 256; + size_t const groups = (out_count + threads - 1) / threads; + + XsCandidateGpu* d_sorted = sycl::malloc_device(total, q); + uint64_t* d_offsets = sycl::malloc_device(out_count, q); + + q.memcpy(d_sorted, sorted.data(), sizeof(XsCandidateGpu) * total).wait(); + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t b = static_cast(it.get_global_id(0)); + if (b > num_buckets) return; + if (b == num_buckets) { d_offsets[num_buckets] = total; return; } + + uint32_t bucket_shift = static_cast(num_match_target_bits); + uint64_t lo = 0, hi = total; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t v = d_sorted[mid].match_info >> bucket_shift; + if (v < b) lo = mid + 1; + else hi = mid; + } + d_offsets[b] = lo; + }).wait(); + + std::vector out(out_count); + q.memcpy(out.data(), d_offsets, sizeof(uint64_t) * out_count).wait(); + + sycl::free(d_sorted, q); + sycl::free(d_offsets, q); + return out; +} + +bool run_for(sycl::queue& q, uint32_t seed, uint64_t total, + int num_match_target_bits, uint32_t num_buckets) +{ + uint64_t const value_range = uint64_t(num_buckets) << num_match_target_bits; + auto sorted = make_sorted_input(total, value_range, seed); + auto reference = reference_offsets(sorted, num_match_target_bits, num_buckets); + auto actual = sycl_offsets(q, sorted, num_match_target_bits, num_buckets); + + if (std::memcmp(reference.data(), actual.data(), + sizeof(uint64_t) * reference.size()) == 0) { + std::printf("PASS seed=%u total=%llu shift=%d buckets=%u\n", + seed, (unsigned long long)total, + num_match_target_bits, num_buckets); + return true; + } + for (size_t i = 0; i < reference.size(); ++i) { + if (reference[i] != actual[i]) { + std::fprintf(stderr, + "FAIL seed=%u bucket=%zu ref=%llu actual=%llu\n", + seed, i, + (unsigned long long)reference[i], + (unsigned long long)actual[i]); + break; + } + } + return false; +} + +} // namespace + +int main() +{ + sycl::queue q{ sycl::default_selector_v }; + std::printf("device: %s\n", + q.get_device().get_info().c_str()); + + // Sizes representative of T1 at small k (slice 1 is correctness, 
not perf). + // num_buckets = num_sections (4) * num_match_keys (4) = 16 for k<28. + struct Case { uint64_t total; int shift; uint32_t buckets; }; + Case const cases[] = { + { 1ull << 18, 14, 16 }, // k=18 + { 1ull << 20, 16, 16 }, // k=20 + { 1ull << 22, 18, 16 }, // k=22 + { 1ull << 24, 20, 16 }, // k=24 + }; + + bool all_pass = true; + for (uint32_t seed : { 1u, 7u, 31u }) { + for (auto const& c : cases) { + if (!run_for(q, seed, c.total, c.shift, c.buckets)) all_pass = false; + } + } + return all_pass ? 0 : 1; +} diff --git a/tools/parity/sycl_g_x_parity.cpp b/tools/parity/sycl_g_x_parity.cpp new file mode 100644 index 0000000..1389007 --- /dev/null +++ b/tools/parity/sycl_g_x_parity.cpp @@ -0,0 +1,120 @@ +// sycl_g_x_parity — validates the SYCL-compiled AES g_x_smem against the +// same function run on the host. Both compile from the same C++ source in +// AesHashGpu.cuh (the _smem family, now fully portable behind the +// PortableAttrs macros), but one goes through acpp's SSCP backend into a +// device kernel and the other through the host C++ compiler. Any +// codegen-introduced divergence shows up byte-by-byte here. +// +// For x in [0, 1< + +#include +#include +#include +#include +#include +#include + +namespace { + +std::array derive_plot_id(uint32_t seed) +{ + std::array id{}; + uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; + for (size_t i = 0; i < id.size(); ++i) { + s = s * 6364136223846793005ULL + 1442695040888963407ULL; + id[i] = static_cast(s >> 56); + } + return id; +} + +// Build the 4×256 uint32_t sT layout the _smem AES functions expect, +// pulling the values from AesTables.inl so the same data feeds both +// the host reference and the device buffer. +std::vector build_sT() +{ + std::vector sT(4 * 256); + for (int i = 0; i < 256; ++i) { + sT[0 * 256 + i] = pos2gpu::aes_tables::T0[i]; + sT[1 * 256 + i] = pos2gpu::aes_tables::T1[i]; + sT[2 * 256 + i] = pos2gpu::aes_tables::T2[i]; + sT[3 * 256 + i] = pos2gpu::aes_tables::T3[i]; + } + return sT; +} + +bool run_for(sycl::queue& q, uint32_t seed, int k) +{ + uint64_t const N = 1ull << k; + auto plot_id = derive_plot_id(seed); + auto keys = pos2gpu::make_keys(plot_id.data()); + auto sT_host = build_sT(); + + std::vector ref(N); + for (uint64_t x = 0; x < N; ++x) { + ref[x] = pos2gpu::g_x_smem(keys, static_cast(x), k, sT_host.data()); + } + + uint32_t* d_sT = sycl::malloc_device(4 * 256, q); + uint32_t* d_out = sycl::malloc_device(N, q); + q.memcpy(d_sT, sT_host.data(), sizeof(uint32_t) * 4 * 256).wait(); + + constexpr size_t threads = 256; + size_t const groups = (N + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=, keys_copy = keys](sycl::nd_item<1> it) { + uint64_t x = it.get_global_id(0); + if (x >= N) return; + d_out[x] = pos2gpu::g_x_smem(keys_copy, static_cast(x), k, d_sT); + }).wait(); + + std::vector actual(N); + q.memcpy(actual.data(), d_out, sizeof(uint32_t) * N).wait(); + sycl::free(d_sT, q); + sycl::free(d_out, q); + + if (std::memcmp(ref.data(), actual.data(), sizeof(uint32_t) * N) == 0) { + std::printf("PASS seed=%u k=%d N=%llu\n", + seed, k, (unsigned long long)N); + return true; + } + for (uint64_t x = 0; x < N; ++x) { + if (ref[x] != actual[x]) { + std::fprintf(stderr, + "FAIL seed=%u k=%d x=%llu ref=0x%08x actual=0x%08x\n", + seed, k, (unsigned long long)x, ref[x], actual[x]); + break; + } + } + return false; +} + +} // namespace + +int main() +{ + sycl::queue q{ sycl::gpu_selector_v }; + std::printf("device: %s\n", + 
q.get_device().get_info().c_str()); + + bool all_pass = true; + for (uint32_t seed : { 1u, 7u, 31u }) { + for (int k : { 14, 16, 18 }) { + if (!run_for(q, seed, k)) all_pass = false; + } + } + return all_pass ? 0 : 1; +} diff --git a/tools/parity/sycl_sort_parity.cpp b/tools/parity/sycl_sort_parity.cpp new file mode 100644 index 0000000..ff36235 --- /dev/null +++ b/tools/parity/sycl_sort_parity.cpp @@ -0,0 +1,176 @@ +// sycl_sort_parity — exercises launch_sort_pairs_u32_u32 and +// launch_sort_keys_u64 on synthetic input and compares against a +// std::sort reference. Built always (independent of XCHPLOT2_BUILD_CUDA), +// so it validates whichever Sort backend is wired into pos2_gpu: +// CUB on the NVIDIA build, oneDPL on the SYCL/AdaptiveCpp build. +// +// Pass criterion: byte-identical sorted streams. + +#include "gpu/Sort.cuh" +#include "gpu/SyclBackend.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +bool run_pairs(uint32_t seed, uint64_t count) +{ + auto& q = pos2gpu::sycl_backend::queue(); + + // Use unique keys (shuffled 0..count-1) so stable and unstable sorts + // produce byte-identical output — lets us test both CUB (stable) and + // the hand-rolled SYCL radix (unstable within equal keys) the same way. + std::mt19937_64 rng(seed); + std::vector h_keys(count), h_vals(count); + for (uint64_t i = 0; i < count; ++i) { + h_keys[i] = static_cast(i); + h_vals[i] = static_cast(i); + } + std::shuffle(h_keys.begin(), h_keys.end(), rng); + + // Reference: std::sort over indices by key. + std::vector ref_keys = h_keys; + std::vector ref_vals = h_vals; + { + std::vector idx(count); + for (uint64_t i = 0; i < count; ++i) idx[i] = static_cast(i); + std::sort(idx.begin(), idx.end(), + [&](uint32_t a, uint32_t b) { return h_keys[a] < h_keys[b]; }); + for (uint64_t i = 0; i < count; ++i) { + ref_keys[i] = h_keys[idx[i]]; + ref_vals[i] = h_vals[idx[i]]; + } + } + + uint32_t* d_keys_in = sycl::malloc_device(count, q); + uint32_t* d_keys_out = sycl::malloc_device(count, q); + uint32_t* d_vals_in = sycl::malloc_device(count, q); + uint32_t* d_vals_out = sycl::malloc_device(count, q); + q.memcpy(d_keys_in, h_keys.data(), sizeof(uint32_t) * count); + q.memcpy(d_vals_in, h_vals.data(), sizeof(uint32_t) * count).wait(); + + size_t scratch_bytes = 0; + pos2gpu::launch_sort_pairs_u32_u32( + nullptr, scratch_bytes, + nullptr, nullptr, nullptr, nullptr, + count, 0, 32, q); + + void* d_scratch = scratch_bytes ? sycl::malloc_device(scratch_bytes, q) : nullptr; + + auto const t0 = std::chrono::steady_clock::now(); + pos2gpu::launch_sort_pairs_u32_u32( + d_scratch ? 
d_scratch : reinterpret_cast(uintptr_t{1}), // any non-null + scratch_bytes, + d_keys_in, d_keys_out, + d_vals_in, d_vals_out, + count, 0, 32, q); + q.wait(); + auto const t1 = std::chrono::steady_clock::now(); + double const ms = std::chrono::duration(t1 - t0).count(); + + std::vector h_sorted_keys(count), h_sorted_vals(count); + q.memcpy(h_sorted_keys.data(), d_keys_out, sizeof(uint32_t) * count); + q.memcpy(h_sorted_vals.data(), d_vals_out, sizeof(uint32_t) * count).wait(); + + if (d_scratch) sycl::free(d_scratch, q); + sycl::free(d_keys_in, q); + sycl::free(d_keys_out, q); + sycl::free(d_vals_in, q); + sycl::free(d_vals_out, q); + + bool const keys_ok = std::memcmp(ref_keys.data(), h_sorted_keys.data(), + sizeof(uint32_t) * count) == 0; + bool const vals_ok = std::memcmp(ref_vals.data(), h_sorted_vals.data(), + sizeof(uint32_t) * count) == 0; + bool const sorted = std::is_sorted(h_sorted_keys.begin(), + h_sorted_keys.end()); + bool const ok = keys_ok && vals_ok; + std::printf("%s pairs seed=%u count=%llu [keys=%d vals=%d sorted=%d %.2fms]\n", + ok ? "PASS" : "FAIL", seed, (unsigned long long)count, + keys_ok, vals_ok, sorted, ms); + if (!ok) { + uint64_t const show = std::min(count, 16); + std::printf(" got [0..%llu): ", (unsigned long long)show); + for (uint64_t i = 0; i < show; ++i) std::printf("%u ", h_sorted_keys[i]); + std::printf("\n ref [0..%llu): ", (unsigned long long)show); + for (uint64_t i = 0; i < show; ++i) std::printf("%u ", ref_keys[i]); + std::printf("\n got [N-%llu..N): ", (unsigned long long)show); + for (uint64_t i = count - show; i < count; ++i) std::printf("%u ", h_sorted_keys[i]); + std::printf("\n"); + } + return ok; +} + +bool run_keys(uint32_t seed, uint64_t count) +{ + auto& q = pos2gpu::sycl_backend::queue(); + + std::mt19937_64 rng(seed); + std::vector h_keys(count); + for (uint64_t i = 0; i < count; ++i) { + h_keys[i] = rng() & 0x0000FFFFFFFFFFFFull; // ~48-bit keys + } + + std::vector ref = h_keys; + std::sort(ref.begin(), ref.end()); + + uint64_t* d_in = sycl::malloc_device(count, q); + uint64_t* d_out = sycl::malloc_device(count, q); + q.memcpy(d_in, h_keys.data(), sizeof(uint64_t) * count).wait(); + + size_t scratch_bytes = 0; + pos2gpu::launch_sort_keys_u64(nullptr, scratch_bytes, nullptr, nullptr, + count, 0, 48, q); + void* d_scratch = scratch_bytes ? sycl::malloc_device(scratch_bytes, q) : nullptr; + auto const t0 = std::chrono::steady_clock::now(); + pos2gpu::launch_sort_keys_u64( + d_scratch ? d_scratch : reinterpret_cast(uintptr_t{1}), + scratch_bytes, + d_in, d_out, + count, 0, 48, q); + q.wait(); + auto const t1 = std::chrono::steady_clock::now(); + double const ms = std::chrono::duration(t1 - t0).count(); + + std::vector h_sorted(count); + q.memcpy(h_sorted.data(), d_out, sizeof(uint64_t) * count).wait(); + + if (d_scratch) sycl::free(d_scratch, q); + sycl::free(d_in, q); + sycl::free(d_out, q); + + bool const ok = std::memcmp(ref.data(), h_sorted.data(), + sizeof(uint64_t) * count) == 0; + bool const sorted = std::is_sorted(h_sorted.begin(), h_sorted.end()); + std::printf("%s keys seed=%u count=%llu [match=%d sorted=%d %.2fms]\n", + ok ? 
"PASS" : "FAIL", seed, (unsigned long long)count, + ok, sorted, ms); + return ok; +} + +} // namespace + +int main() +{ + auto& q = pos2gpu::sycl_backend::queue(); + std::printf("device: %s\n", + q.get_device().get_info().c_str()); + + bool all_pass = true; + for (uint32_t seed : { 1u, 7u, 31u }) { + for (uint64_t n : { 16ull, 1ull << 14, 1ull << 18, 1ull << 20 }) { + if (!run_pairs(seed, n)) all_pass = false; + if (!run_keys (seed, n)) all_pass = false; + } + } + return all_pass ? 0 : 1; +} diff --git a/tools/parity/sycl_t1_parity.cpp b/tools/parity/sycl_t1_parity.cpp new file mode 100644 index 0000000..9ddb4ad --- /dev/null +++ b/tools/parity/sycl_t1_parity.cpp @@ -0,0 +1,317 @@ +// sycl_t1_parity — SYCL-native sibling of t1_parity.cu. Builds on every +// backend (CUDA / HIP / Level Zero / OMP) so the T1 matcher can be +// validated against the pos2-chip CPU reference on AMD and Intel +// devices, where the .cu version isn't compiled. +// +// Same comparison semantics as t1_parity.cu: both CPU and GPU outputs +// are sorted by (match_info, meta_hi, meta_lo) and compared as a set. +// Bit-exactness of the SET is what determines correctness for the +// downstream T2/T3/proof pipeline — the post-construct sort by +// match_info collapses the order in which matches were emitted. +// +// Usage: +// ./sycl_t1_parity # default sweep +// ./sycl_t1_parity --k 20 # single-k smoke test +// ./sycl_t1_parity --k 20 --strength 4 # custom strength +// +// The default sweep stays small (k <= 18) so it fits on 8 GiB cards +// and so the CPU reference completes in seconds. --k lets a triage +// session push the matcher to the largest k that fits on the device. + +#include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" +#include "gpu/XsKernel.cuh" +#include "gpu/T1Kernel.cuh" + +#include "plot/PlotLayout.hpp" +#include "plot/TableConstructorGeneric.hpp" +#include "pos/ProofCore.hpp" +#include "pos/ProofParams.hpp" + +#include "ParityCommon.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using pos2gpu::parity::derive_plot_id; + +struct PairKey { + uint32_t mi; + uint32_t lo; + uint32_t hi; + bool operator<(PairKey const& o) const noexcept { + if (mi != o.mi) return mi < o.mi; + if (hi != o.hi) return hi < o.hi; + return lo < o.lo; + } + bool operator==(PairKey const& o) const noexcept { + return mi == o.mi && lo == o.lo && hi == o.hi; + } +}; + +template +T* sycl_alloc_device(sycl::queue& q, std::size_t n, char const* what) +{ + T* p = sycl::malloc_device(n, q); + if (!p) { + std::fprintf(stderr, " FAIL: sycl::malloc_device(%s, %zu * %zu B)\n", + what, n, sizeof(T)); + std::exit(2); + } + return p; +} + +bool run_for_id(sycl::queue& q, + std::array const& plot_id, + char const* label, + int k, + int strength) +{ + uint64_t const total = 1ULL << k; + std::printf("[%s k=%d strength=%d N=%llu]\n", + label, k, strength, static_cast(total)); + + ProofParams params(plot_id.data(), + static_cast(k), + static_cast(strength), + /*testnet=*/uint8_t{0}); + + // ---- CPU reference (XsConstructor → Table1Constructor::construct) ---- + std::size_t max_section_pairs = max_pairs_per_section_possible(params); + std::size_t num_sections = static_cast(params.get_num_sections()); + std::size_t max_pairs = max_section_pairs * num_sections; + std::size_t max_element_bytes = std::max({sizeof(Xs_Candidate), sizeof(T1Pairing), + sizeof(T2Pairing), sizeof(T3Pairing)}); + PlotLayout layout(max_section_pairs, num_sections, max_element_bytes, + 
/*minor_scratch_bytes=*/2 * 1024 * 1024); + + auto xsV = layout.xs(); + XsConstructor xs_ctor(params); + auto xs_sorted = xs_ctor.construct(xsV.out, xsV.post_sort_tmp, xsV.minor); + + // Mirror t1_parity.cu: if XsConstructor returned its output in the + // PrimaryOut slot, copy aside so T1's construct (which writes its + // output into PrimaryOut) doesn't corrupt the input. + if (xs_sorted.data() == xsV.out.data()) { + std::copy(xsV.out.begin(), xsV.out.end(), xsV.post_sort_tmp.begin()); + xs_sorted = xsV.post_sort_tmp.first(xs_sorted.size()); + } + + auto t1V = layout.t1(); + Table1Constructor t1_ctor(params, t1V.target, t1V.minor); + auto t1_pairs = t1_ctor.construct(xs_sorted, t1V.out, t1V.post_sort_tmp); + + std::vector cpu_keys; + cpu_keys.reserve(t1_pairs.size()); + for (auto const& p : t1_pairs) { + cpu_keys.push_back({p.match_info, p.meta_lo, p.meta_hi}); + } + std::sort(cpu_keys.begin(), cpu_keys.end()); + std::printf(" CPU produced %zu T1Pairings\n", cpu_keys.size()); + + // ---- GPU pipeline: launch_construct_xs, then launch_t1_match ---- + auto* d_xs = sycl_alloc_device(q, total, "d_xs"); + + std::size_t xs_temp_bytes = 0; + pos2gpu::launch_construct_xs(plot_id.data(), k, /*testnet=*/false, + nullptr, nullptr, &xs_temp_bytes, q); + void* d_xs_temp = sycl_alloc_device(q, xs_temp_bytes, "d_xs_temp"); + pos2gpu::launch_construct_xs(plot_id.data(), k, /*testnet=*/false, + d_xs, d_xs_temp, &xs_temp_bytes, q); + q.wait(); + + auto t1p = pos2gpu::make_t1_params(k, strength); + uint64_t const capacity = static_cast(max_pairs); + + auto* d_t1_meta = sycl_alloc_device(q, capacity, "d_t1_meta"); + auto* d_t1_mi = sycl_alloc_device(q, capacity, "d_t1_mi"); + auto* d_t1_count = sycl_alloc_device(q, 1, "d_t1_count"); + + // Mirror GpuPipeline.cpp: the streaming pipeline always memsets + // d_counter to 0 before the real launch_t1_match call. The size- + // query call below doesn't touch d_t1_count, but the real call's + // launch_t1_match_prepare also memsets it — keep the explicit + // pre-zero to make the test a one-shot if the prepare path ever + // changes. 
+ q.memset(d_t1_count, 0, sizeof(uint64_t)).wait(); + + std::size_t t1_temp_bytes = 0; + pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, + nullptr, nullptr, d_t1_count, capacity, + nullptr, &t1_temp_bytes, q); + void* d_t1_temp = sycl_alloc_device(q, t1_temp_bytes, "d_t1_temp"); + pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, + d_t1_meta, d_t1_mi, d_t1_count, capacity, + d_t1_temp, &t1_temp_bytes, q); + q.wait(); + + uint64_t gpu_count = 0; + q.memcpy(&gpu_count, d_t1_count, sizeof(uint64_t)).wait(); + + auto free_all = [&]() { + sycl::free(d_t1_temp, q); + sycl::free(d_t1_count, q); + sycl::free(d_t1_mi, q); + sycl::free(d_t1_meta, q); + sycl::free(d_xs_temp, q); + sycl::free(d_xs, q); + }; + + if (gpu_count > capacity) { + std::printf(" GPU OVERFLOW: emitted %llu but capacity %llu\n", + static_cast(gpu_count), + static_cast(capacity)); + free_all(); + return false; + } + + std::vector h_meta(gpu_count); + std::vector h_mi (gpu_count); + if (gpu_count > 0) { + q.memcpy(h_meta.data(), d_t1_meta, sizeof(uint64_t) * gpu_count).wait(); + q.memcpy(h_mi.data(), d_t1_mi, sizeof(uint32_t) * gpu_count).wait(); + } + free_all(); + + std::vector gpu_keys; + gpu_keys.reserve(gpu_count); + for (uint64_t i = 0; i < gpu_count; ++i) { + uint32_t meta_lo = static_cast(h_meta[i]); + uint32_t meta_hi = static_cast(h_meta[i] >> 32); + gpu_keys.push_back({h_mi[i], meta_lo, meta_hi}); + } + std::sort(gpu_keys.begin(), gpu_keys.end()); + std::printf(" GPU produced %zu T1Pairings\n", gpu_keys.size()); + + if (cpu_keys.size() != gpu_keys.size()) { + std::printf(" count mismatch (CPU %zu vs GPU %zu) — analysing overlap\n", + cpu_keys.size(), gpu_keys.size()); + std::size_t in_cpu_only = 0, in_gpu_only = 0, common = 0; + std::vector only_in_gpu; + std::size_t i = 0, j = 0; + while (i < cpu_keys.size() && j < gpu_keys.size()) { + if (cpu_keys[i] == gpu_keys[j]) { ++common; ++i; ++j; } + else if (cpu_keys[i] < gpu_keys[j]) { ++in_cpu_only; ++i; } + else { + if (only_in_gpu.size() < 5) only_in_gpu.push_back(gpu_keys[j]); + ++in_gpu_only; ++j; + } + } + in_cpu_only += cpu_keys.size() - i; + while (j < gpu_keys.size()) { + if (only_in_gpu.size() < 5) only_in_gpu.push_back(gpu_keys[j]); + ++in_gpu_only; + ++j; + } + std::printf(" common=%zu cpu_only=%zu gpu_only=%zu\n", + common, in_cpu_only, in_gpu_only); + for (auto const& p : only_in_gpu) { + uint64_t meta = (uint64_t(p.hi) << 32) | uint64_t(p.lo); + uint32_t x_l = static_cast(meta >> static_cast(k)); + uint32_t x_r = static_cast(meta & ((1ULL << k) - 1)); + std::printf(" GPU-only sample: x_l=%u x_r=%u match_info=0x%08x\n", + x_l, x_r, p.mi); + } + return false; + } + + uint64_t mismatches = 0; + for (std::size_t i = 0; i < cpu_keys.size(); ++i) { + if (!(cpu_keys[i] == gpu_keys[i])) { + if (mismatches < 5) { + std::printf(" MISMATCH at i=%zu cpu=(mi=0x%08x lo=0x%08x hi=0x%08x) " + "gpu=(mi=0x%08x lo=0x%08x hi=0x%08x)\n", + i, + cpu_keys[i].mi, cpu_keys[i].lo, cpu_keys[i].hi, + gpu_keys[i].mi, gpu_keys[i].lo, gpu_keys[i].hi); + } + ++mismatches; + } + } + if (mismatches == 0) { + std::printf(" OK %zu / %zu T1Pairings match (sorted set comparison)\n", + cpu_keys.size(), cpu_keys.size()); + return true; + } + std::printf(" FAIL %llu mismatches / %zu\n", + static_cast(mismatches), cpu_keys.size()); + return false; +} + +bool parse_int_arg(std::string_view sv, int& out) +{ + auto const* first = sv.data(); + auto const* last = sv.data() + sv.size(); + auto r = std::from_chars(first, last, out); + return r.ec == std::errc{} && r.ptr == last; +} + +} 
// namespace + +int main(int argc, char** argv) +{ + pos2gpu::initialize_aes_tables(); + + int k_override = -1; + int strength_override = -1; + for (int i = 1; i + 1 < argc; ++i) { + std::string_view a = argv[i]; + if (a == "--k") { (void)parse_int_arg(argv[++i], k_override); } + else if (a == "--strength") { (void)parse_int_arg(argv[++i], strength_override); } + } + + sycl::queue q{ sycl::gpu_selector_v }; + std::printf("device: %s\n", + q.get_device().get_info().c_str()); + + bool all_ok = true; + + if (k_override > 0) { + int const s = (strength_override > 0) ? strength_override : 2; + // Use the same fixed plot_id family as the default sweep so a + // user-driven --k 22 run is reproducible alongside the seed=1 + // baseline. + std::string label = "k=" + std::to_string(k_override) + + " strength=" + std::to_string(s); + all_ok = run_for_id(q, derive_plot_id(/*seed=*/1u), + label.c_str(), k_override, s) && all_ok; + } else { + // Default sweep — k=18 only, since launch_t1_match_prepare rejects + // k < 18 (smallest size for which num_match_target_bits exceeds the + // FINE_BITS=8 floor with sensible margin). Seed and strength + // coverage is deliberately narrower than t1_parity.cu because + // this binary is meant to be run as a quick-triage check on + // AMD/Intel hardware where the CUDA test isn't available — the + // full coverage is in t1_parity.cu on the CUDA build path. + for (uint32_t seed : { 1u, 7u, 31u, 0xCAFEBABEu, 0xDEADBEEFu }) { + std::string label = "seed=" + std::to_string(seed); + all_ok = run_for_id(q, derive_plot_id(seed), + label.c_str(), /*k=*/18, /*strength=*/2) + && all_ok; + } + // Strength sweep at k=18 — exercises the test_mask path through + // the matcher which scales with strength. strength=7 leaves + // num_match_target_bits=9, still above the FINE_BITS=8 floor. + for (int strength : { 3, 4, 5, 6, 7 }) { + std::string label = "seed=1 strength=" + std::to_string(strength); + all_ok = run_for_id(q, derive_plot_id(1u), + label.c_str(), /*k=*/18, strength) + && all_ok; + } + } + + std::printf("\n==> %s\n", all_ok ? "ALL OK" : "FAIL"); + return all_ok ? 0 : 1; +} diff --git a/tools/parity/t1_debug.cu b/tools/parity/t1_debug.cu index a44606c..01c2e04 100644 --- a/tools/parity/t1_debug.cu +++ b/tools/parity/t1_debug.cu @@ -9,6 +9,8 @@ #include "pos/ProofParams.hpp" #include "pos/ProofCore.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -19,16 +21,7 @@ namespace { -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} +using pos2gpu::parity::derive_plot_id; __global__ void test_kernel( pos2gpu::AesHashKeys keys, diff --git a/tools/parity/t1_parity.cu b/tools/parity/t1_parity.cu index 71c9652..8195ba9 100644 --- a/tools/parity/t1_parity.cu +++ b/tools/parity/t1_parity.cu @@ -7,6 +7,7 @@ // downstream T2/T3/proof pipeline. 
#include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/XsKernel.cuh" #include "gpu/T1Kernel.cuh" @@ -16,6 +17,8 @@ #include "pos/ProofCore.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -27,6 +30,8 @@ namespace { +using pos2gpu::parity::derive_plot_id; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -36,17 +41,6 @@ namespace { } \ } while (0) -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - struct PairKey { uint32_t mi; // match_info uint32_t lo; // meta_lo @@ -111,10 +105,10 @@ bool run_for_id(std::array const& plot_id, char const* label, int k pos2gpu::XsCandidateGpu* d_xs = nullptr; CHECK(cudaMalloc(&d_xs, sizeof(pos2gpu::XsCandidateGpu) * total)); size_t xs_temp_bytes = 0; - CHECK(pos2gpu::launch_construct_xs(plot_id.data(), k, false, nullptr, nullptr, &xs_temp_bytes)); + pos2gpu::launch_construct_xs(plot_id.data(), k, false, nullptr, nullptr, &xs_temp_bytes, pos2gpu::sycl_backend::queue()); void* d_xs_temp = nullptr; CHECK(cudaMalloc(&d_xs_temp, xs_temp_bytes)); - CHECK(pos2gpu::launch_construct_xs(plot_id.data(), k, false, d_xs, d_xs_temp, &xs_temp_bytes)); + pos2gpu::launch_construct_xs(plot_id.data(), k, false, d_xs, d_xs_temp, &xs_temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); auto t1p = pos2gpu::make_t1_params(k, strength); @@ -122,46 +116,55 @@ bool run_for_id(std::array const& plot_id, char const* label, int k // re-use it. uint64_t capacity = static_cast(max_pairs); - pos2gpu::T1PairingGpu* d_t1 = nullptr; - CHECK(cudaMalloc(&d_t1, sizeof(pos2gpu::T1PairingGpu) * capacity)); + // T1 match emits SoA: (uint64 meta, uint32 mi) parallel streams. 
+ uint64_t* d_t1_meta = nullptr; + uint32_t* d_t1_mi = nullptr; + CHECK(cudaMalloc(&d_t1_meta, sizeof(uint64_t) * capacity)); + CHECK(cudaMalloc(&d_t1_mi, sizeof(uint32_t) * capacity)); uint64_t* d_t1_count = nullptr; CHECK(cudaMalloc(&d_t1_count, sizeof(uint64_t))); size_t t1_temp_bytes = 0; - CHECK(pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, - d_t1, d_t1_count, capacity, - nullptr, &t1_temp_bytes)); + pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, + nullptr, nullptr, d_t1_count, capacity, + nullptr, &t1_temp_bytes, pos2gpu::sycl_backend::queue()); void* d_t1_temp = nullptr; CHECK(cudaMalloc(&d_t1_temp, t1_temp_bytes)); - CHECK(pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, - d_t1, d_t1_count, capacity, - d_t1_temp, &t1_temp_bytes)); + pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, + d_t1_meta, d_t1_mi, d_t1_count, capacity, + d_t1_temp, &t1_temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); uint64_t gpu_count = 0; CHECK(cudaMemcpy(&gpu_count, d_t1_count, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + auto free_all = [&]() { + cudaFree(d_t1_temp); cudaFree(d_t1_count); + cudaFree(d_t1_meta); cudaFree(d_t1_mi); + cudaFree(d_xs_temp); cudaFree(d_xs); + }; + if (gpu_count > capacity) { std::printf(" GPU OVERFLOW: emitted %llu but capacity %llu\n", (unsigned long long)gpu_count, (unsigned long long)capacity); - cudaFree(d_t1_temp); cudaFree(d_t1_count); cudaFree(d_t1); - cudaFree(d_xs_temp); cudaFree(d_xs); + free_all(); return false; } - std::vector gpu_pairs(gpu_count); + std::vector h_meta(gpu_count); + std::vector h_mi (gpu_count); if (gpu_count > 0) { - CHECK(cudaMemcpy(gpu_pairs.data(), d_t1, - sizeof(pos2gpu::T1PairingGpu) * gpu_count, - cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_meta.data(), d_t1_meta, sizeof(uint64_t) * gpu_count, cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_mi.data(), d_t1_mi, sizeof(uint32_t) * gpu_count, cudaMemcpyDeviceToHost)); } - cudaFree(d_t1_temp); cudaFree(d_t1_count); cudaFree(d_t1); - cudaFree(d_xs_temp); cudaFree(d_xs); + free_all(); std::vector gpu_keys; - gpu_keys.reserve(gpu_pairs.size()); - for (auto const& p : gpu_pairs) { - gpu_keys.push_back({p.match_info, p.meta_lo, p.meta_hi}); + gpu_keys.reserve(gpu_count); + for (uint64_t i = 0; i < gpu_count; ++i) { + uint32_t meta_lo = uint32_t(h_meta[i]); + uint32_t meta_hi = uint32_t(h_meta[i] >> 32); + gpu_keys.push_back({h_mi[i], meta_lo, meta_hi}); } std::sort(gpu_keys.begin(), gpu_keys.end()); diff --git a/tools/parity/t2_parity.cu b/tools/parity/t2_parity.cu index dcb8550..4d7e80e 100644 --- a/tools/parity/t2_parity.cu +++ b/tools/parity/t2_parity.cu @@ -6,6 +6,7 @@ // correctness, which is already validated by t1_parity. 
#include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/T1Kernel.cuh" #include "gpu/T2Kernel.cuh" @@ -15,6 +16,8 @@ #include "pos/ProofCore.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -26,6 +29,8 @@ namespace { +using pos2gpu::parity::derive_plot_id; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -35,17 +40,6 @@ namespace { } \ } while (0) -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - // Sort key for T2Pairing: (match_info, x_bits, meta) — fully canonicalises // the pair regardless of emission order. struct T2Key { @@ -149,44 +143,59 @@ bool run_for_id(std::array const& plot_id, char const* label, int k auto t2p = pos2gpu::make_t2_params(k, strength); uint64_t capacity = static_cast(max_pairs); - pos2gpu::T2PairingGpu* d_t2 = nullptr; - CHECK(cudaMalloc(&d_t2, sizeof(pos2gpu::T2PairingGpu) * capacity)); + // T2 match emits SoA: three parallel streams. + uint64_t* d_t2_meta = nullptr; + uint32_t* d_t2_mi = nullptr; + uint32_t* d_t2_xbits = nullptr; + CHECK(cudaMalloc(&d_t2_meta, sizeof(uint64_t) * capacity)); + CHECK(cudaMalloc(&d_t2_mi, sizeof(uint32_t) * capacity)); + CHECK(cudaMalloc(&d_t2_xbits, sizeof(uint32_t) * capacity)); uint64_t* d_t2_count = nullptr; CHECK(cudaMalloc(&d_t2_count, sizeof(uint64_t))); size_t t2_temp_bytes = 0; - CHECK(pos2gpu::launch_t2_match(plot_id.data(), t2p, nullptr, nullptr, t1_snapshot.size(), - d_t2, d_t2_count, capacity, - nullptr, &t2_temp_bytes)); + pos2gpu::launch_t2_match(plot_id.data(), t2p, nullptr, nullptr, t1_snapshot.size(), + nullptr, nullptr, nullptr, + d_t2_count, capacity, + nullptr, &t2_temp_bytes, pos2gpu::sycl_backend::queue()); void* d_t2_temp = nullptr; CHECK(cudaMalloc(&d_t2_temp, t2_temp_bytes)); - CHECK(pos2gpu::launch_t2_match(plot_id.data(), t2p, d_t1_meta, d_t1_mi, t1_snapshot.size(), - d_t2, d_t2_count, capacity, - d_t2_temp, &t2_temp_bytes)); + pos2gpu::launch_t2_match(plot_id.data(), t2p, d_t1_meta, d_t1_mi, t1_snapshot.size(), + d_t2_meta, d_t2_mi, d_t2_xbits, + d_t2_count, capacity, + d_t2_temp, &t2_temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); uint64_t gpu_count = 0; CHECK(cudaMemcpy(&gpu_count, d_t2_count, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + auto free_all = [&]() { + cudaFree(d_t2_temp); cudaFree(d_t2_count); + cudaFree(d_t2_meta); cudaFree(d_t2_mi); cudaFree(d_t2_xbits); + cudaFree(d_t1_mi); cudaFree(d_t1_meta); cudaFree(d_t1); + }; + if (gpu_count > capacity) { std::printf(" GPU OVERFLOW: %llu / %llu\n", (unsigned long long)gpu_count, (unsigned long long)capacity); - cudaFree(d_t2_temp); cudaFree(d_t2_count); cudaFree(d_t2); cudaFree(d_t1_mi); cudaFree(d_t1_meta); cudaFree(d_t1); + free_all(); return false; } - std::vector gpu_pairs(gpu_count); + std::vector h_meta (gpu_count); + std::vector h_mi (gpu_count); + std::vector h_xbits(gpu_count); if (gpu_count > 0) { - CHECK(cudaMemcpy(gpu_pairs.data(), d_t2, - sizeof(pos2gpu::T2PairingGpu) * gpu_count, - cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_meta.data(), d_t2_meta, sizeof(uint64_t) * gpu_count, cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_mi.data(), d_t2_mi, sizeof(uint32_t) * gpu_count, cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_xbits.data(), d_t2_xbits, 
sizeof(uint32_t) * gpu_count, cudaMemcpyDeviceToHost)); } - cudaFree(d_t2_temp); cudaFree(d_t2_count); cudaFree(d_t2); cudaFree(d_t1_mi); cudaFree(d_t1_meta); cudaFree(d_t1); + free_all(); std::vector gpu_keys; - gpu_keys.reserve(gpu_pairs.size()); - for (auto const& p : gpu_pairs) { - gpu_keys.push_back({p.match_info, p.x_bits, p.meta}); + gpu_keys.reserve(gpu_count); + for (uint64_t i = 0; i < gpu_count; ++i) { + gpu_keys.push_back({h_mi[i], h_xbits[i], h_meta[i]}); } std::sort(gpu_keys.begin(), gpu_keys.end()); diff --git a/tools/parity/t3_parity.cu b/tools/parity/t3_parity.cu index 3fb606b..0085dff 100644 --- a/tools/parity/t3_parity.cu +++ b/tools/parity/t3_parity.cu @@ -5,6 +5,7 @@ // from upstream phases (already validated by t1_parity / t2_parity). #include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/T2Kernel.cuh" #include "gpu/T3Kernel.cuh" @@ -14,6 +15,8 @@ #include "pos/ProofCore.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -25,6 +28,8 @@ namespace { +using pos2gpu::parity::derive_plot_id; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -34,17 +39,6 @@ namespace { } \ } while (0) -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - bool run_for_id(std::array const& plot_id, char const* label, int k, int strength) { uint64_t const total = 1ULL << k; @@ -145,18 +139,18 @@ bool run_for_id(std::array const& plot_id, char const* label, int k CHECK(cudaMalloc(&d_t3_count, sizeof(uint64_t))); size_t t3_temp_bytes = 0; - CHECK(pos2gpu::launch_t3_match(plot_id.data(), t3p, + pos2gpu::launch_t3_match(plot_id.data(), t3p, d_t2_meta, d_t2_xbits, nullptr, t2_snapshot.size(), d_t3, d_t3_count, capacity, - nullptr, &t3_temp_bytes)); + nullptr, &t3_temp_bytes, pos2gpu::sycl_backend::queue()); void* d_t3_temp = nullptr; CHECK(cudaMalloc(&d_t3_temp, t3_temp_bytes)); - CHECK(pos2gpu::launch_t3_match(plot_id.data(), t3p, + pos2gpu::launch_t3_match(plot_id.data(), t3p, d_t2_meta, d_t2_xbits, d_t2_mi, t2_snapshot.size(), d_t3, d_t3_count, capacity, - d_t3_temp, &t3_temp_bytes)); + d_t3_temp, &t3_temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); uint64_t gpu_count = 0; diff --git a/tools/parity/xs_bench.cu b/tools/parity/xs_bench.cu index b0fd563..1dad15e 100644 --- a/tools/parity/xs_bench.cu +++ b/tools/parity/xs_bench.cu @@ -4,11 +4,14 @@ // chase further down the pipeline. 
#include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/XsKernel.cuh" #include "plot/TableConstructorGeneric.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -26,16 +29,7 @@ } \ } while (0) -static std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} +using pos2gpu::parity::derive_plot_id; static double bench_cpu(uint8_t const* plot_id, int k) { @@ -62,16 +56,16 @@ static double bench_gpu(uint8_t const* plot_id, int k) CHECK(cudaMalloc(&d_out, sizeof(pos2gpu::XsCandidateGpu) * total)); size_t temp_bytes = 0; - CHECK(pos2gpu::launch_construct_xs(plot_id, k, false, nullptr, nullptr, &temp_bytes)); + pos2gpu::launch_construct_xs(plot_id, k, false, nullptr, nullptr, &temp_bytes, pos2gpu::sycl_backend::queue()); void* d_temp = nullptr; CHECK(cudaMalloc(&d_temp, temp_bytes)); // Warm up to amortise context init. - CHECK(pos2gpu::launch_construct_xs(plot_id, k, false, d_out, d_temp, &temp_bytes)); + pos2gpu::launch_construct_xs(plot_id, k, false, d_out, d_temp, &temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); auto t0 = std::chrono::steady_clock::now(); - CHECK(pos2gpu::launch_construct_xs(plot_id, k, false, d_out, d_temp, &temp_bytes)); + pos2gpu::launch_construct_xs(plot_id, k, false, d_out, d_temp, &temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); auto t1 = std::chrono::steady_clock::now(); diff --git a/tools/parity/xs_parity.cu b/tools/parity/xs_parity.cu index f743bdd..b06d922 100644 --- a/tools/parity/xs_parity.cu +++ b/tools/parity/xs_parity.cu @@ -6,12 +6,15 @@ // (match_info, x) pair matches in order. #include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/XsKernel.cuh" // pos2-chip headers for the CPU reference. 
#include "plot/TableConstructorGeneric.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -23,6 +26,8 @@ namespace { +using pos2gpu::parity::derive_plot_id; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -32,17 +37,6 @@ namespace { } \ } while (0) -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - bool run_for(uint32_t seed, int k, bool testnet) { auto plot_id = derive_plot_id(seed); @@ -84,26 +78,16 @@ bool run_for(uint32_t seed, int k, bool testnet) CHECK(cudaMalloc(&d_out, sizeof(pos2gpu::XsCandidateGpu) * total)); size_t temp_bytes = 0; - auto err = pos2gpu::launch_construct_xs( + pos2gpu::launch_construct_xs( plot_id.data(), k, testnet, /*d_out=*/nullptr, /*d_temp_storage=*/nullptr, - &temp_bytes); - if (err != cudaSuccess) { - std::fprintf(stderr, " query temp_bytes failed: %s\n", cudaGetErrorString(err)); - return false; - } - + &temp_bytes, pos2gpu::sycl_backend::queue()); void* d_temp = nullptr; CHECK(cudaMalloc(&d_temp, temp_bytes)); - err = pos2gpu::launch_construct_xs( - plot_id.data(), k, testnet, d_out, d_temp, &temp_bytes); - if (err != cudaSuccess) { - std::fprintf(stderr, " launch failed: %s\n", cudaGetErrorString(err)); - cudaFree(d_temp); cudaFree(d_out); - return false; - } + pos2gpu::launch_construct_xs( + plot_id.data(), k, testnet, d_out, d_temp, &temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); std::vector gpu_out(total); diff --git a/tools/sanity/hellosycl.cpp b/tools/sanity/hellosycl.cpp new file mode 100644 index 0000000..11cf500 --- /dev/null +++ b/tools/sanity/hellosycl.cpp @@ -0,0 +1,80 @@ +// hellosycl.cpp — minimal SYCL kernel-dispatch sanity check. +// +// Allocates 16 uint32_t on device, sentinel-fills via memset, runs a +// trivial parallel_for that writes a known pattern, copies back, prints +// pass/fail per slot. Exit 0 if all slots match expected values, else +// non-zero with a "FAIL" line for each mismatch. +// +// Used to localize "is AdaptiveCpp's HIP / CUDA backend actually +// dispatching kernels on this host?" before climbing the abstraction +// stack to sycl_t1_parity / xchplot2. If hellosycl FAILs, no +// xchplot2-level fix can recover the device — the issue is below our +// level (driver mismatch, missing libcudart / libamdhip64, AdaptiveCpp +// JIT producing no-op stubs, ACPP_TARGETS pointing at an ISA the +// installed AdaptiveCpp can't lower for, …). 
+// +// Compile via the project CMake build (rpath + includes set up +// automatically): +// +// cmake --build build --target hellosycl +// ./build/tools/sanity/hellosycl +// +// Or standalone, mirroring whatever ACPP_TARGETS the production binary +// is using (see the cargo:warning lines from `cargo install`): +// +// ACPP_TARGETS=hip:gfx1013 /opt/adaptivecpp/bin/acpp -O2 hellosycl.cpp -o hellosycl +// LD_LIBRARY_PATH=/opt/rocm/lib ./hellosycl + +#include <sycl/sycl.hpp> + +#include <cstdint> +#include <cstdio> + +int main() +{ + sycl::queue q; + std::printf("Device: %s\n", + q.get_device().get_info<sycl::info::device::name>().c_str()); + + constexpr std::size_t N = 16; + constexpr std::uint32_t kPattern = 0x12340000u; + + std::uint32_t* d = sycl::malloc_device<std::uint32_t>(N, q); + if (!d) { + std::printf("FAIL: sycl::malloc_device returned null\n"); + return 1; + } + + // Sentinel-fill (0xABABABAB): a "kernel didn't write" outcome shows + // 0xAB, distinct from "kernel wrote a wrong value" (shows something + // else) and from random uninitialised bytes that might happen to + // match the expected pattern by coincidence. + q.memset(d, 0xAB, N * sizeof(std::uint32_t)).wait(); + q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> it) { + std::size_t idx = it.get_global_id(0); + d[idx] = kPattern | static_cast<std::uint32_t>(idx); + }).wait(); + + std::uint32_t h[N]; + q.memcpy(h, d, N * sizeof(std::uint32_t)).wait(); + sycl::free(d, q); + + int fails = 0; + for (std::size_t i = 0; i < N; ++i) { + std::uint32_t want = kPattern | static_cast<std::uint32_t>(i); + std::printf("[%2zu] got=0x%08x want=0x%08x %s\n", + i, h[i], want, h[i] == want ? "OK" : "FAIL"); + if (h[i] != want) ++fails; + } + + if (fails == 0) { + std::printf("\nALL OK — AdaptiveCpp can dispatch trivial kernels on this device.\n"); + } else { + std::printf("\nFAIL — %d/%zu slot(s) wrong. Common causes:\n" + " - libcudart / libamdhip64 not in rpath (check ldd of this binary)\n" + " - AdaptiveCpp JIT producing no-op stubs (ACPP_DEBUG_LEVEL=2 to see)\n" + " - ACPP_TARGETS picks an ISA the installed AdaptiveCpp can't lower\n", + fails, N); + } + return fails == 0 ? 0 : 1; +} diff --git a/tools/xchplot2/cli.cpp b/tools/xchplot2/cli.cpp index 6cfa62f..de7a5c9 100644 --- a/tools/xchplot2/cli.cpp +++ b/tools/xchplot2/cli.cpp @@ -6,20 +6,29 @@ // BLS keys via the keygen-rs Rust shim, then dispatches through // batch internally. The "real" entrypoint for users. +#include "gpu/SyclDeviceList.hpp" // list_gpu_devices() — backs the + // `devices` subcommand below. Plain + // types only; the SYCL include lives + // in SyclDeviceList.cpp (acpp-built). #include "host/GpuPlotter.hpp" #include "host/BatchPlotter.hpp" +#include "host/Cancel.hpp" +#include "host/PlotFileWriterParallel.hpp" #include "pos2_keygen.h" // Rust shim for plot_id + memo derivation #include #include +#include #include #include #include #include +#include #include #include #include #include +#include #include namespace { @@ -32,12 +41,15 @@ void print_usage(char const* prog) << " [-T|--testnet] [-o|--out DIR] [-m|--memo HEX] [-N|--out-name NAME]\n" << " [--gpu-t1] [--gpu-t2] [--gpu-t3] [-G|--gpu-all] [-P|--profile]\n" << " " << prog << " batch <manifest> [-v|--verbose]\n" + << " [--skip-existing] [--continue-on-error]\n" + << " [--devices SPEC]\n" << " Manifest: one plot per non-empty/non-# line, whitespace-separated:\n" << " k strength plot_index meta_group testnet plot_id_hex memo_hex out_dir out_name\n" << " Runs GPU compute and CPU FSE in a producer/consumer pipeline so they overlap\n" << " across consecutive plots.
~2x throughput vs separate `test` invocations.\n" << " " << prog << " plot -k K -n N -f HEX ( -p HEX | --pool-ph HEX | -c xch1... )\n" << " [-s S] [-o DIR] [-T] [-i N] [-g N] [-S HEX] [-v]\n" + << " [--skip-existing] [--continue-on-error]\n" << " Standalone farmable plot(s): derives plot_id + memo internally\n" << " from the keys via chia-rs, then batches through the GPU pipeline.\n" << " -f, --farmer-pk HEX : 96 hex chars (48 B G1 public key).\n" @@ -57,6 +69,51 @@ void print_usage(char const* prog) << " fresh /dev/urandom per plot.\n" << " -T, --testnet : testnet proof parameters.\n" << " -v, --verbose : per-plot progress on stderr.\n" + << " --skip-existing : skip plots whose output file is already a\n" + << " complete .plot2 (magic + non-trivial size).\n" + << " --continue-on-error : log per-plot failures and keep going\n" + << " instead of aborting the batch.\n" + << " --devices SPEC : multi-device. SPEC is a comma\n" + << " list mixing any of:\n" + << " all — every GPU + CPU\n" + << " gpu — every visible GPU\n" + << " cpu — CPU worker only (slow)\n" + << " 0,1,3 — explicit GPU ids\n" + << " e.g. gpu,cpu == all.\n" + << " Omitted = single device via default\n" + << " SYCL selector (zero-config).\n" + << " --cpu : add a CPU worker alongside the\n" + << " selected GPUs (or use CPU only when\n" + << " no GPU is selected). Plotting on CPU\n" + << " is 1-2 orders of magnitude slower\n" + << " than GPU; intended for GPU-less\n" + << " hosts or as an extra worker.\n" + << " --tier plain|compact|minimal|auto : force streaming pipeline tier\n" + << " when GPU pool doesn't fit. plain =\n" + << " ~7.24 GB floor (k=28), faster.\n" + << " compact = ~5.33 GB floor, fits on\n" + << " tight 8 GB cards. minimal = ~3.83 GB\n" + << " floor, fits on 4 GiB cards (extra\n" + << " PCIe round-trips during T2 match).\n" + << " auto (default) = pick the largest\n" + << " tier that fits. Equivalent to\n" + << " XCHPLOT2_STREAMING_TIER env var;\n" + << " CLI flag wins if both set.\n" + << " " << prog << " verify [--trials N]\n" + << " Open and run N random challenges through the CPU prover.\n" + << " Zero proofs across a sensible sample (>=100) strongly indicates a\n" + << " corrupt plot. Default N=100.\n" + << " " << prog << " parity-check [--dir PATH]\n" + << " Run every *_parity binary in PATH and summarize PASS/FAIL.\n" + << " Default PATH is ./build/tools/parity. Build the tests with\n" + << " `cmake --build ` first. 
Useful for post-refactor\n" + << " regression screening.\n" + << " " << prog << " devices\n" + << " List every visible SYCL GPU device + the host CPU plotter\n" + << " with id, name, backend, capacity, and which sort path the\n" + << " runtime dispatcher will route a worker to (CUB on cuda-\n" + << " backend devices when this build links CUB, otherwise SortSycl).\n" + << " Use the printed [N] / [cpu] index with --devices in plot/batch.\n" << "\n" << " test-mode positional args:\n" << " : even integer in [18, 32]\n" @@ -72,7 +129,18 @@ void print_usage(char const* prog) << " -N, --out-name NAME: override output filename (basename only)\n" << " --gpu-tN : run phase N on GPU (T1/T2/T3); default CPU\n" << " -G, --gpu-all : run all phases on GPU (where implemented)\n" - << " -P, --profile : print per-phase device-time breakdown\n"; + << " -P, --profile : print per-phase device-time breakdown\n" + << "\n" + << " Environment variables:\n" + << " XCHPLOT2_STREAMING=1 force the low-VRAM streaming pipeline even\n" + << " when the persistent pool would fit.\n" + << " POS2GPU_MAX_VRAM_MB=N cap the pool/streaming VRAM query to N MB\n" + << " (useful for testing the streaming fallback).\n" + << " POS2GPU_STREAMING_STATS=1 log every streaming-path alloc / free.\n" + << " POS2GPU_POOL_DEBUG=1 log pool allocation sizes at construction.\n" + << " POS2GPU_PHASE_TIMING=1 per-phase wall-time breakdown on stderr.\n" + << " ACPP_GFX=gfxXXXX AMD only — required at build time to AOT\n" + << " for the right amdgcn ISA (see README).\n"; } bool parse_hex_bytes(std::string const& s, std::vector& out) @@ -124,6 +192,67 @@ void read_urandom(uint8_t* out, size_t n) } } +// Parse a --devices value into BatchOptions. +// +// Accepted forms: +// "all" → use every GPU visible at runtime (sets +// use_all_devices; device_ids stays empty). +// "0" → use only GPU id 0. +// "0,2,3" → use these specific device ids, in sorted order. +// +// Zero-configuration default (no flag) produces device_ids.empty() and +// use_all_devices=false — which triggers the single-device +// gpu_selector_v path, identical to pre-multi-GPU behavior. +// +// Returns false on malformed input (caller prints usage + exits 1). +bool parse_devices_arg(std::string const& s, pos2gpu::BatchOptions& opts) +{ + // Accept comma-separated mix of: + // "all" → every GPU + the CPU worker + // "gpu" → every visible GPU only + // "cpu" → the CPU worker only + // "" → opts.device_ids.push_back(int) (real GPU index) + // "cpu" alone is OK; otherwise at least one GPU token is required. + opts.device_ids.clear(); + bool any_token = false; + bool any_gpu_token = false; + size_t start = 0; + while (start <= s.size()) { + size_t const end = s.find(',', start); + std::string const tok = s.substr( + start, end == std::string::npos ? 
std::string::npos : end - start); + if (tok.empty()) return false; + any_token = true; + if (tok == "all") { + opts.use_all_devices = true; + opts.include_cpu = true; + any_gpu_token = true; + } else if (tok == "gpu") { + opts.use_all_devices = true; + any_gpu_token = true; + } else if (tok == "cpu") { + opts.include_cpu = true; + } else { + char* endp = nullptr; + long const v = std::strtol(tok.c_str(), &endp, 10); + if (endp == tok.c_str() || *endp != '\0' || v < 0 || v > 1023) { + return false; + } + opts.device_ids.push_back(static_cast(v)); + any_gpu_token = true; + } + if (end == std::string::npos) break; + start = end + 1; + } + if (!any_token) return false; + if (!any_gpu_token && !opts.include_cpu) return false; + std::sort(opts.device_ids.begin(), opts.device_ids.end()); + opts.device_ids.erase( + std::unique(opts.device_ids.begin(), opts.device_ids.end()), + opts.device_ids.end()); + return true; +} + std::string plot_id_to_filename(int k, std::array const& plot_id) { // Match chia plots create's v2 filename scheme: plot-k{size}-{id}.plot2 @@ -142,6 +271,8 @@ std::string plot_id_to_filename(int k, std::array const& plot_id) extern "C" int xchplot2_main(int argc, char* argv[]) { + pos2gpu::install_cancel_signal_handlers(); + if (argc < 2) { print_usage(argv[0]); return 1; @@ -149,29 +280,218 @@ extern "C" int xchplot2_main(int argc, char* argv[]) std::string mode = argv[1]; + if (mode == "devices") { + // Enumerate every visible SYCL GPU device + the CPU plotter + // (always available via AdaptiveCpp's OpenMP host backend). + // Reports id, name, backend, capacity, and which sort path + // the runtime dispatcher will route a worker on this device + // to (CUB on cuda-backend queues when this build links the + // CUB sort path; SortSycl otherwise — see SortDispatch.cpp). + // Use the printed `[N]` / `[cpu]` index with `--devices`. + auto devices = pos2gpu::list_gpu_devices(); + std::printf("Visible devices (%zu GPU + 1 CPU):\n", devices.size()); + for (auto const& d : devices) { + std::size_t vram_mb = + static_cast(d.vram_bytes / (1024ull * 1024ull)); +#ifdef XCHPLOT2_HAVE_CUB + char const* sort_hint = d.is_cuda_backend ? "CUB" : "SYCL"; +#else + char const* sort_hint = "SYCL"; +#endif + std::printf(" [%zu] %-32s backend=%-10s vram=%5zu MB CUs=%-4u sort:%s\n", + d.id, d.name.c_str(), d.backend.c_str(), + vram_mb, d.cu_count, sort_hint); + } + // CPU row. hardware_concurrency() returns 0 when it can't + // figure out the count (rare), in which case print "?". + unsigned threads = std::thread::hardware_concurrency(); + if (threads == 0) { + std::printf(" [cpu] %-32s backend=%-10s threads= ? sort:SYCL (1-2 orders slower than GPU)\n", + "Host CPU plotter", "omp"); + } else { + std::printf(" [cpu] %-32s backend=%-10s threads=%-4u sort:SYCL (1-2 orders slower than GPU)\n", + "Host CPU plotter", "omp", threads); + } + if (devices.empty()) { + std::printf("\nNo GPU devices visible to AdaptiveCpp / SYCL.\n" + "Check rocminfo / nvidia-smi, ACPP_VISIBILITY_MASK, and that the\n" + "relevant SYCL backend was built into AdaptiveCpp.\n" + "The CPU plotter is always available via `--devices cpu` or `--cpu`.\n"); + } else { + std::printf("\nUse `--devices N` (id) for a specific GPU,\n" + " `--devices gpu` for every GPU,\n" + " `--devices cpu` for the host CPU only,\n" + " `--devices all` for every GPU + CPU,\n" + " or any comma combination (e.g. 
`0,2,cpu`).\n"); + } + return 0; + } + if (mode == "batch") { if (argc < 3) { print_usage(argv[0]); return 1; } std::string manifest = argv[2]; - bool verbose = false; + pos2gpu::BatchOptions opts{}; for (int i = 3; i < argc; ++i) { std::string a = argv[i]; - if (a == "-v" || a == "--verbose") verbose = true; + if (a == "-v" || a == "--verbose") opts.verbose = true; + else if (a == "--skip-existing") opts.skip_existing = true; + else if (a == "--continue-on-error") opts.continue_on_error = true; + else if (a == "--cpu") opts.include_cpu = true; + else if (a == "--tier" && i + 1 < argc) { + std::string t = argv[++i]; + if (t != "plain" && t != "compact" && t != "minimal" && t != "auto") { + std::cerr << "Error: --tier expects 'plain', 'compact', " + "'minimal', or 'auto' (got '" << t << "')\n"; + return 1; + } + opts.streaming_tier = (t == "auto") ? "" : t; + } + else if (a == "--devices" && i + 1 < argc) { + if (!parse_devices_arg(argv[++i], opts)) { + std::cerr << "Error: --devices expects 'all', 'cpu', or a " + "comma-separated list of device ids " + "(got '" << argv[i] << "')\n"; + return 1; + } + } + else { + std::cerr << "Error: unknown argument: " << a << "\n"; + print_usage(argv[0]); + return 1; + } } try { auto entries = pos2gpu::parse_manifest(manifest); std::cerr << "[batch] " << entries.size() << " plots queued\n"; - auto res = pos2gpu::run_batch(entries, verbose); - double per = res.plots_written ? res.total_wall_seconds / res.plots_written : 0; + auto res = pos2gpu::run_batch(entries, opts); + double per = res.plots_written + ? res.total_wall_seconds / double(res.plots_written) : 0; std::cerr << "[batch] wrote " << res.plots_written << " plots in " << res.total_wall_seconds << " s (" - << per << " s/plot)\n"; - return 0; + << per << " s/plot)"; + if (res.plots_skipped) std::cerr << "; skipped " << res.plots_skipped; + if (res.plots_failed) std::cerr << "; failed " << res.plots_failed; + std::cerr << "\n"; + return (res.plots_failed > 0) ? 
3 : 0; } catch (std::exception const& e) { std::cerr << "[batch] FAILED: " << e.what() << "\n"; return 2; } } + if (mode == "verify") { + if (argc < 3) { print_usage(argv[0]); return 1; } + std::string plotfile = argv[2]; + size_t trials = 100; + for (int i = 3; i < argc; ++i) { + std::string a = argv[i]; + if ((a == "--trials" || a == "-n") && i + 1 < argc) { + long v = std::atol(argv[++i]); + if (v <= 0) { + std::cerr << "Error: --trials must be > 0\n"; + return 1; + } + trials = static_cast(v); + } else { + std::cerr << "Error: unknown argument: " << a << "\n"; + print_usage(argv[0]); + return 1; + } + } + try { + std::cerr << "[verify] " << plotfile << ": running " << trials + << " random challenges\n"; + auto res = pos2gpu::verify_plot_file(plotfile, trials); + std::cerr << "[verify] " << res.trials << " trials, " + << res.challenges_with_proof << " with >=1 proof, " + << res.proofs_found << " proofs total\n"; + if (res.proofs_found == 0) { + std::cerr << "[verify] FAIL: no proofs produced — plot is " + "likely corrupt\n"; + return 4; + } + std::cerr << "[verify] OK\n"; + return 0; + } catch (std::exception const& e) { + std::cerr << "[verify] FAILED: " << e.what() << "\n"; + return 2; + } + } + + if (mode == "parity-check") { + std::string dir = "./build/tools/parity"; + for (int i = 2; i < argc; ++i) { + std::string a = argv[i]; + if ((a == "--dir" || a == "-d") && i + 1 < argc) { + dir = argv[++i]; + } else { + std::cerr << "Error: unknown argument: " << a << "\n"; + print_usage(argv[0]); + return 1; + } + } + + // Glob every *_parity binary in `dir`. Same code path works for + // both branches — main ships sycl_*_parity extras that cuda-only + // doesn't, and the wildcard picks up whichever actually exists. + std::vector tests; + std::error_code ec; + if (std::filesystem::is_directory(dir, ec)) { + for (auto const& entry : + std::filesystem::directory_iterator(dir, ec)) + { + auto const name = entry.path().filename().string(); + constexpr char const kSuffix[] = "_parity"; + constexpr size_t kLen = sizeof(kSuffix) - 1; + bool const ends = + name.size() >= kLen && + name.compare(name.size() - kLen, kLen, kSuffix) == 0; + if (ends && entry.is_regular_file(ec)) { + tests.push_back(entry.path()); + } + } + } + if (tests.empty()) { + std::cerr << "No `*_parity` binaries found under " << dir << ".\n" + "Build them first:\n" + " cmake -B build -S . -DCMAKE_BUILD_TYPE=Release\n" + " cmake --build build --parallel\n" + "Then re-run from the repo root, or pass --dir .\n"; + return 2; + } + std::sort(tests.begin(), tests.end()); + + int pass = 0, fail = 0; + std::cerr << "==> parity tests (" << tests.size() << " found in " + << dir << ")\n"; + for (auto const& test : tests) { + auto const name = test.filename().string(); + std::string const log_path = + "/tmp/xchplot2-parity-" + name + ".log"; + // Redirecting through the shell: `test` is a path we + // generated ourselves from a directory listing — no user- + // controlled shell metachars reach this string. 
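+ // On POSIX, std::system() returns the raw wait() status rather than the + // child's exit code, so the "exit %d" reported below prints e.g. 256 for + // a test that exits with code 1; rc == 0 remains the only PASS condition.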
+ std::string const cmd = + test.string() + " >" + log_path + " 2>&1"; + auto const t0 = std::chrono::steady_clock::now(); + int const rc = std::system(cmd.c_str()); + auto const ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0).count(); + if (rc == 0) { + std::fprintf(stderr, " PASS %-32s (%.1f ms)\n", + name.c_str(), ms); + ++pass; + } else { + std::fprintf(stderr, + " FAIL %-32s (exit %d; log: %s)\n", + name.c_str(), rc, log_path.c_str()); + ++fail; + } + } + std::fprintf(stderr, "\n==> %d passed, %d failed\n", pass, fail); + return fail > 0 ? 1 : 0; + } + if (mode == "plot") { // Standalone farmable-plot path: derive plot_id + memo internally. int k = 28; @@ -181,9 +501,15 @@ extern "C" int xchplot2_main(int argc, char* argv[]) int meta_group = 0; bool testnet = false; bool verbose = false; + bool skip_existing = false; + bool continue_on_error = false; std::string out_dir = "."; std::string farmer_pk_hex, pool_pk_hex, pool_ph_hex, pool_addr; std::string seed_hex; + std::vector plot_device_ids; + bool plot_use_all_devices = false; + bool plot_include_cpu = false; + std::string plot_streaming_tier; for (int i = 2; i < argc; ++i) { std::string a = argv[i]; @@ -207,6 +533,30 @@ extern "C" int xchplot2_main(int argc, char* argv[]) else if ((a == "--seed" || a == "-S") && need(1)) seed_hex = argv[++i]; else if (a == "--testnet" || a == "-T") testnet = true; else if (a == "-v" || a == "--verbose") verbose = true; + else if (a == "--skip-existing") skip_existing = true; + else if (a == "--continue-on-error") continue_on_error = true; + else if (a == "--cpu") plot_include_cpu = true; + else if (a == "--tier" && need(1)) { + std::string t = argv[++i]; + if (t != "plain" && t != "compact" && t != "minimal" && t != "auto") { + std::cerr << "Error: --tier expects 'plain', 'compact', " + "'minimal', or 'auto' (got '" << t << "')\n"; + return 1; + } + plot_streaming_tier = (t == "auto") ? 
"" : t; + } + else if (a == "--devices" && need(1)) { + pos2gpu::BatchOptions tmp; + if (!parse_devices_arg(argv[++i], tmp)) { + std::cerr << "Error: --devices expects 'all', 'cpu', or a " + "comma-separated list of device ids " + "(got '" << argv[i] << "')\n"; + return 1; + } + plot_device_ids = std::move(tmp.device_ids); + plot_use_all_devices = tmp.use_all_devices; + if (tmp.include_cpu) plot_include_cpu = true; + } else { std::cerr << "Error: unknown argument: " << a << "\n"; print_usage(argv[0]); @@ -222,9 +572,14 @@ extern "C" int xchplot2_main(int argc, char* argv[]) int const pool_specs = int(!pool_pk_hex.empty()) + int(!pool_ph_hex.empty()) + int(!pool_addr.empty()); - if (pool_specs != 1) { - std::cerr << "Error: exactly one of --pool-pk, --pool-ph, " - "--pool-contract-address is required\n"; + if (pool_specs == 0) { + std::cerr << "Error: a pool destination is required — pick one of " + "--pool-pk, --pool-ph, --pool-contract-address\n"; + return 1; + } + if (pool_specs > 1) { + std::cerr << "Error: --pool-pk, --pool-ph, and --pool-contract-address " + "are mutually exclusive (saw " << pool_specs << ")\n"; return 1; } if (num < 1) { @@ -350,16 +705,27 @@ extern "C" int xchplot2_main(int argc, char* argv[]) } } - auto res = pos2gpu::run_batch(entries, verbose); + pos2gpu::BatchOptions opts{}; + opts.verbose = verbose; + opts.skip_existing = skip_existing; + opts.continue_on_error = continue_on_error; + opts.device_ids = plot_device_ids; + opts.use_all_devices = plot_use_all_devices; + opts.include_cpu = plot_include_cpu; + opts.streaming_tier = plot_streaming_tier; + auto res = pos2gpu::run_batch(entries, opts); double per = res.plots_written ? res.total_wall_seconds / double(res.plots_written) : 0; std::cerr << "[plot] wrote " << res.plots_written << " plots in " << res.total_wall_seconds << " s (" - << per << " s/plot)\n"; + << per << " s/plot)"; + if (res.plots_skipped) std::cerr << "; skipped " << res.plots_skipped; + if (res.plots_failed) std::cerr << "; failed " << res.plots_failed; + std::cerr << "\n"; for (auto const& e : entries) { std::cout << out_dir << "/" << e.out_name << "\n"; } - return 0; + return (res.plots_failed > 0) ? 3 : 0; } catch (std::exception const& e) { std::cerr << "[plot] FAILED: " << e.what() << "\n"; return 2; diff --git a/tools/xchplot2/cli_devlink.cu b/tools/xchplot2/cli_devlink.cu new file mode 100644 index 0000000..f5c9054 --- /dev/null +++ b/tools/xchplot2/cli_devlink.cu @@ -0,0 +1,37 @@ +// cli_devlink.cu — exists only to make xchplot2_cli a CUDA-language +// target so CMake's CUDA_RESOLVE_DEVICE_SYMBOLS=ON actually triggers +// nvcc --device-link at static-archive creation time. +// +// xchplot2_cli is the static lib that build.rs hands to Rust's +// linker (cargo install). It depends on pos2_gpu (the CUDA library +// with separable compilation) but has no CUDA sources of its own. +// Without this stub, CMake silently treats xchplot2_cli as a pure- +// C++ static lib, skips the device-link step regardless of +// CUDA_RESOLVE_DEVICE_SYMBOLS, and the resulting libxchplot2_cli.a +// has every per-TU `__sti____cudaRegisterAll()` constructor +// referencing an undefined `__cudaRegisterLinkedBinary_*` stub. +// Rust's `cc` host linker has no way to provide those — it doesn't +// know to invoke nvcc — so the final link fails. +// +// Touching this file via add_library(... 
cli_devlink.cu) flips +// xchplot2_cli to a CUDA-language target, the device-link runs at +// archive creation, the resolution stubs land inside the .a, and +// the host linker finds them with no extra work. +// +// First reported on a Debian/Ubuntu host with a real GTX 1060 + +// `CUDA_ARCHITECTURES=61 cargo install` — the symptom was a cascade +// of "undefined reference to __cudaRegisterLinkedBinary_*" on every +// .cu TU in pos2_gpu. + +namespace { + +// Anonymous-namespace `__device__` function — nvcc emits it into the +// per-TU device fatbinary, which gives the device-link step at least +// one input from this TU. Never called from anywhere; marked +// __device__ so it's compiled into the device-side fatbinary, not +// the host-side .o. +__device__ int xchplot2_cli_device_link_anchor() noexcept { + return 0; +} + +} // namespace
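The CMakeLists.txt side that consumes this stub is not shown in this patch; the mechanism described above corresponds roughly to wiring like the following (a sketch under assumptions: only the xchplot2_cli / pos2_gpu target names and the cli.cpp / cli_devlink.cu filenames come from the sources above, everything else is illustrative):

    # Sketch: make the static CLI archive a CUDA-language target so CMake
    # runs the device link (nvcc --device-link) at archive-creation time.
    add_library(xchplot2_cli STATIC
        cli.cpp
        cli_devlink.cu      # the stub TU above; flips the target to CUDA
    )
    set_target_properties(xchplot2_cli PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON
        # Embed the __cudaRegisterLinkedBinary_* resolution objects in the
        # .a so a non-nvcc host linker (Rust's cc) can consume it as-is.
        CUDA_RESOLVE_DEVICE_SYMBOLS ON
    )
    target_link_libraries(xchplot2_cli PRIVATE pos2_gpu)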