diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..3a9afc8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,26 @@ +# Build artifacts (out-of-source, copied per Containerfile) +build/ +build-*/ +target/ + +# pos2-chip is FetchContent-cloned at CMake configure time inside the +# container; no need to ship a host-side copy. +third_party/ + +# Generated plot files left over from local benchmarks. +*.plot2 + +# Editor / tooling +.vscode/ +.idea/ +.cache/ +compile_commands.json + +# Profiling artifacts +*.nsys-rep +*.qdrep +*.qdstrm +*.ncu-rep + +# git history is irrelevant to the build itself. +.git/ diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..2b96933 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,21 @@ +version: 2 + +# Dependabot bumps deps via PR. Two ecosystems: +# - cargo: the keygen-rs subcrate's BLS / sha2 / address-codec stack. +# The build.rs at repo root only references env state and has no +# runtime crate deps, so it doesn't need its own entry. +# - github-actions: action versions in .github/workflows/. +# Weekly cadence keeps PR volume low; bump to daily if security +# advisories pile up. +updates: + - package-ecosystem: cargo + directory: /keygen-rs + schedule: + interval: weekly + open-pull-requests-limit: 5 + + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 5 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0553fdf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,148 @@ +name: CI + +on: + pull_request: + push: + branches: [main] + +permissions: + contents: read + +jobs: + shell: + name: ShellCheck + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Install shellcheck + run: sudo apt-get update && sudo apt-get install -y shellcheck + - name: Lint scripts/ + # Recurse so scripts/test/install-container-deps/run.sh and any + # future helpers under scripts/ stay covered. + run: find scripts -name '*.sh' -print0 | xargs -0 shellcheck + + actions: + name: actionlint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: reviewdog/action-actionlint@v1 + with: + fail_level: error + + rust: + name: Rust (keygen-rs) + runs-on: ubuntu-latest + defaults: + run: + working-directory: keygen-rs + steps: + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy, rustfmt + - uses: Swatinem/rust-cache@v2 + with: + workspaces: keygen-rs + - name: cargo fmt --check + run: cargo fmt --all --check + - name: cargo check + run: cargo check --all-targets --locked || cargo check --all-targets + - name: cargo clippy (advisory) + run: cargo clippy --all-targets -- -W clippy::all + continue-on-error: true + - name: cargo test + run: cargo test --all-targets + + hadolint: + name: hadolint Containerfile + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: hadolint/hadolint-action@v3.3.0 + with: + dockerfile: Containerfile + # CUDA / ROCm base images make version-pinning warnings (DL3008, + # DL3009) impractical — package versions shift between base image + # rolls and the toolkit pin lives in BASE_DEVEL. Same for the + # `set -o pipefail` warnings on RUN-with-pipe (DL4006) — those + # pipes are bootstrap-time noise, not runtime data paths. Filter + # to errors so we still catch real bugs (root, ADD vs COPY, + # missing && \, COPY --chown typos, etc.). 
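+          # Local repro sketch (not part of CI; assumes the upstream
+          # docker.io/hadolint/hadolint image is available) — same threshold,
+          # Containerfile piped in on stdin:
+          #   podman run --rm -i docker.io/hadolint/hadolint \
+          #     hadolint --failure-threshold error - < Containerfile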
+ failure-threshold: error + + compose-config: + name: docker compose config validate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: docker compose config --quiet + # Catches typos in service names / build-arg keys / unresolvable + # ${VAR} placeholders without ever pulling a base image. ~5s. + run: docker compose -f compose.yaml config --quiet + + typos: + name: typos + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: crate-ci/typos@master + + markdownlint: + name: markdownlint README + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: DavidAnson/markdownlint-cli2-action@v23 + with: + globs: README.md + + install-container-deps-dryrun: + name: install-container-deps.sh — dry-run fixtures + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Diff --dry-run output against fixtures + # Runs --dry-run for every (distro × engine × gpu) tuple in + # arch / ubuntu / fedora containers and diffs against the + # checked-in fixtures under scripts/test/install-container-deps/. + # No mutating sudo calls — completes in ~60s. + run: scripts/test/install-container-deps/run.sh + + install-container-deps-smoke: + name: install-container-deps.sh smoke (${{ matrix.engine }} ${{ matrix.gpu }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - engine: podman + gpu: cpu + - engine: podman + gpu: amd + - engine: docker + gpu: cpu + # NVIDIA smoke is intentionally skipped: nvidia-ctk cdi generate + # needs a real GPU + driver to populate the spec, and the dry-run + # fixtures already cover the planning logic for that path. + steps: + - uses: actions/checkout@v6 + - name: Real install in ubuntu:24.04 + assert idempotent re-run + env: + ENGINE: ${{ matrix.engine }} + GPU: ${{ matrix.gpu }} + # Validates that engine + GPU-runtime packages actually install + # from the real apt repos (catches package-name drift / repo + # availability), and that re-running the script is a no-op. + run: | + docker run --rm \ + -e ENGINE -e GPU \ + -v "$PWD/scripts:/s:ro" \ + docker.io/ubuntu:24.04 \ + bash -ec ' + apt-get update -qq + apt-get install -y -qq sudo curl ca-certificates gnupg >/dev/null + /s/install-container-deps.sh --engine "$ENGINE" --gpu "$GPU" + # Idempotence: a clean second run must still exit 0. + /s/install-container-deps.sh --engine "$ENGINE" --gpu "$GPU" + ' diff --git a/.gitignore b/.gitignore index 89e01ed..43f3299 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build/ +build-*/ *.plot2 .cache/ compile_commands.json @@ -18,3 +19,4 @@ target/ # pos2-chip is fetched here automatically by CMake at configure time. # See CMakeLists.txt → FetchContent_Declare(pos2_chip). third_party/ +docs/ diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..8b6d3d9 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,12 @@ +{ + "_comment": "README is prose-heavy and includes terminal output, wide tables, and mixed list markers. Disable rules that produce noise without catching real issues. MD051 is also disabled because markdownlint's link-fragment slug algorithm differs from GitHub's (e.g. 
`### Multi-GPU: --devices` slugs differently between the two).", + "MD004": false, + "MD013": false, + "MD026": false, + "MD028": false, + "MD031": false, + "MD032": false, + "MD040": false, + "MD051": false, + "MD060": false +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 25b5313..5f562e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,18 +1,140 @@ cmake_minimum_required(VERSION 3.24) -project(pos2-gpu LANGUAGES C CXX CUDA) +project(pos2-gpu VERSION 0.6.0 LANGUAGES C CXX) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CUDA_STANDARD 20) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) +# Every static library here is linked into both the standalone xchplot2 +# executable and the top-level Rust crate's PIE binary (via build.rs + +# cargo install). rust-lld (the default linker on some distros) rejects +# non-PIC objects in a PIE output — seen in the wild as "relocation +# R_X86_64_32 cannot be used against local symbol; recompile with +# -fPIC" on Cancel.cpp, BatchPlotter.cpp, etc. Setting this globally +# ensures pos2_gpu, pos2_gpu_host, fse, and any other transitively- +# compiled object is built with -fPIC, so the linker choice doesn't +# matter. The per-target POSITION_INDEPENDENT_CODE ON below stay as +# explicit markers for the public-interface static libraries. +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# CUDA toolchain is conditional in slice 15. The CUDA path provides: +# - SortCuda.cu (CUB radix sort — best perf on NVIDIA) +# - AesGpu.cu (T-tables in __constant__ memory + cudaMemcpyToSymbol init) +# - AesGpuBitsliced.cu (bench-only bitsliced AES; needs nvcc) +# - The cuda-flavoured parity tests in tools/parity/ +# The non-CUDA path uses SortSycl.cpp + AesStub.cpp — runs on AMD/Intel via +# AdaptiveCpp's HIP / Level Zero backends. Default ON to preserve the +# existing NVIDIA workflow. +# +# CAVEAT: with XCHPLOT2_BUILD_CUDA=OFF the build still needs the CUDA +# Toolkit *headers* on the include path (the SYCL TUs reference cudaError_t +# / cudaStream_t / cuda_fp16.h via the kernel-wrapper headers). Lifting +# those CUDA-type dependencies out of the public SYCL API is a follow-up +# refactor (see slice 17 in docs/gpu-portability-sketch.md). nvcc itself is +# NOT required when XCHPLOT2_BUILD_CUDA=OFF — only the headers. +option(XCHPLOT2_BUILD_CUDA "Compile CUDA-only TUs (CUB sort, __constant__ AES init, bench tests)" ON) + +# On dual-toolchain hosts (CUDA Toolkit + ROCm both installed), the SYCL +# TUs pull in CUDA's via CudaHalfShim.hpp AND ROCm's +# via AdaptiveCpp's HIP backend. Their vector_types +# headers declare conflicting typedefs for char1 / int2 / etc., which +# breaks the compile. CudaHalfShim respects XCHPLOT2_SKIP_CUDA_RUNTIME / +# _FP16 — turn them on when we're (a) NOT building CUDA TUs and (b) ROCm +# is present, so the shim falls back to its opaque stubs instead. +if(NOT XCHPLOT2_BUILD_CUDA) + find_path(XCHPLOT2_HIP_RUNTIME_H hip/hip_runtime.h + PATHS /opt/rocm/include /usr/include /usr/local/include + NO_DEFAULT_PATH) + if(XCHPLOT2_HIP_RUNTIME_H) + add_compile_definitions( + XCHPLOT2_SKIP_CUDA_RUNTIME + XCHPLOT2_SKIP_CUDA_FP16) + message(STATUS "xchplot2: ROCm at ${XCHPLOT2_HIP_RUNTIME_H} — " + "skipping CUDA runtime/fp16 includes (CudaHalfShim stubs)") + endif() +endif() -# Default arch: sm_89 (RTX 4090). Override via -DCMAKE_CUDA_ARCHITECTURES=... 
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 89) +if(XCHPLOT2_BUILD_CUDA) + # Default arch: sm_89 (RTX 4090). Override via -DCMAKE_CUDA_ARCHITECTURES=... + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 89) + endif() + + # Preflight nvcc-vs-arch compatibility BEFORE enable_language(CUDA), + # which is what triggers the cryptic "Unsupported gpu architecture + # 'compute_61'" TryCompile failure when Pascal/Volta meets CUDA 13.x. + # CUDA 13.0 dropped codegen for sm_50/52/53/60/61/62/70/72 entirely. + # Skip the check if nvcc isn't findable yet — enable_language(CUDA) + # below will surface its own missing-toolchain message in that case. + find_program(_xchplot2_nvcc nvcc + HINTS ENV CUDA_PATH ENV CUDA_HOME /opt/cuda /usr/local/cuda + PATH_SUFFIXES bin + DOC "nvcc for arch-compat preflight") + if(_xchplot2_nvcc) + execute_process( + COMMAND "${_xchplot2_nvcc}" --version + OUTPUT_VARIABLE _nvcc_version_out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + # Parse "Cuda compilation tools, release 13.0, V13.0.48" → 13 + if(_nvcc_version_out MATCHES "release ([0-9]+)") + set(_nvcc_major "${CMAKE_MATCH_1}") + set(_min_arch 9999) + foreach(_a IN LISTS CMAKE_CUDA_ARCHITECTURES) + # Strip sm_ / compute_ prefixes some users pass through + string(REGEX REPLACE "^(sm_|compute_)" "" _a "${_a}") + if(_a MATCHES "^[0-9]+$" AND _a LESS _min_arch) + set(_min_arch ${_a}) + endif() + endforeach() + if(_nvcc_major GREATER_EQUAL 13 AND _min_arch LESS 75) + # Container detection: Docker writes /.dockerenv, Podman writes + # /run/.containerenv. Either presence means the host-side fixes + # don't apply — the user needs to rebuild the image with a + # different BASE_DEVEL. + if(EXISTS "/.dockerenv" OR EXISTS "/run/.containerenv") + set(_fix_block + "You're building inside a container — the toolkit comes from\n" + "the base image, not the host. Rebuild with a CUDA 12.x base:\n" + " - Recommended: rerun scripts/build-container.sh on the host;\n" + " it auto-pins nvidia/cuda:12.9.1 when CUDA_ARCH < 75.\n" + " - Or pass --build-arg explicitly:\n" + " podman build -t xchplot2:cuda \\\n" + " --build-arg BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n" + " --build-arg BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n" + " --build-arg CUDA_ARCH=${_min_arch} \\\n" + " .\n") + else() + set(_fix_block + "Fix one of:\n" + " - Install CUDA 12.9 (last toolkit with Pascal/Volta support) and re-run cmake:\n" + " sudo apt install cuda-toolkit-12-9 (Ubuntu/Debian)\n" + " Then point cmake at it:\n" + " cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.9/bin/nvcc -B build -S . [...]\n" + " - Or override the target arch (only valid if you actually have a Turing+ card):\n" + " cmake -DCMAKE_CUDA_ARCHITECTURES=75 -B build -S . 
[...]\n" + " - Or use the container path — scripts/build-container.sh auto-pins\n" + " the 12.9 base image when it detects a pre-Turing GPU.\n") + endif() + message(FATAL_ERROR + "xchplot2: CUDA Toolkit ${_nvcc_major}.x dropped codegen for " + "sm_${_min_arch} (Pascal / Volta / pre-Turing).\n" + "\n" + "Detected:\n" + " nvcc ${_nvcc_major}.x at ${_xchplot2_nvcc}\n" + " target arch: sm_${_min_arch} (from CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES})\n" + "\n" + ${_fix_block}) + endif() + endif() + endif() + unset(_xchplot2_nvcc CACHE) + + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 20) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) endif() # Optional: compile in clock64 instrumentation for T3 match_all_buckets. @@ -20,6 +142,195 @@ endif() # call. Off by default — enable with -DXCHPLOT2_INSTRUMENT_MATCH=ON. option(XCHPLOT2_INSTRUMENT_MATCH "Instrument T3 match_all_buckets with clock64 breakdown" OFF) +# SYCL kernels via AdaptiveCpp are the only backend; the previous +# XCHPLOT2_BACKEND={cuda,sycl} toggle was retired in slice 9 once the +# CUDA-native wrapper TUs (T*OffsetsCuda.cu, PipelineKernelsCuda.cu) +# were deleted. AdaptiveCpp is now a hard build dependency. + +# AdaptiveCpp target autodetect — must run BEFORE find_package(AdaptiveCpp) +# so the package config sees a non-empty target list. acpp errors on an +# empty -DACPP_TARGETS= (which we'd otherwise pass through unchanged from +# the Containerfile's default build-arg). +# 1. NVIDIA: stay on "generic" (LLVM SSCP). Empirically a few percent +# faster than cuda:sm_XX on our kernels at k=28 — SSCP's runtime +# specialization beats the CUDA-AOT path for this workload. +# 2. AMD: rocminfo Name: gfxXXXX → hip:gfxXXXX. SSCP's HIP path is +# less mature, so AOT-compiling for the actual gfx target is the +# safer pick on AMD. +# 3. Fallback: generic (works everywhere; JITs on first use). +# Override with -DACPP_TARGETS=... on the cmake command line. +if(NOT ACPP_TARGETS) + execute_process( + COMMAND nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits + OUTPUT_VARIABLE _xchplot2_cuda_cap + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE _xchplot2_nvsmi_rc + ERROR_QUIET) + if(_xchplot2_nvsmi_rc EQUAL 0 AND _xchplot2_cuda_cap) + set(ACPP_TARGETS "generic" CACHE STRING "AdaptiveCpp target list" FORCE) + message(STATUS "xchplot2: NVIDIA GPU detected; using ACPP_TARGETS=generic (SSCP)") + else() + execute_process( + COMMAND rocminfo + OUTPUT_VARIABLE _xchplot2_rocm_out + RESULT_VARIABLE _xchplot2_rocminfo_rc + ERROR_QUIET) + if(_xchplot2_rocminfo_rc EQUAL 0) + string(REGEX MATCH "Name:[ \t]+gfx[0-9a-f]+" _xchplot2_gfx_match "${_xchplot2_rocm_out}") + string(REGEX REPLACE "Name:[ \t]+" "" _xchplot2_gfx "${_xchplot2_gfx_match}") + if(_xchplot2_gfx) + set(ACPP_TARGETS "hip:${_xchplot2_gfx}" CACHE STRING "AdaptiveCpp target list" FORCE) + message(STATUS "xchplot2: ACPP_TARGETS auto-detected via rocminfo: ${ACPP_TARGETS}") + endif() + endif() + endif() + if(NOT ACPP_TARGETS) + set(ACPP_TARGETS "generic" CACHE STRING "AdaptiveCpp target list" FORCE) + message(STATUS "xchplot2: ACPP_TARGETS fell back to generic (no nvidia-smi/rocminfo)") + endif() +endif() +message(STATUS "xchplot2: ACPP_TARGETS=${ACPP_TARGETS}") + +# Lookup precedence: +# 1. find_package(AdaptiveCpp) — system or local install (e.g. /opt/adaptivecpp). +# This is what scripts/install-deps.sh and the Containerfile produce. +# 2. FetchContent fallback — clones AdaptiveCpp at v25.10.0 and adds it as +# a CMake subproject. 
Slow first build (LLVM compilation, ~15-30 min) but +# removes the manual install step. Opt out with -DXCHPLOT2_FETCH_ADAPTIVECPP=OFF. +option(XCHPLOT2_FETCH_ADAPTIVECPP "Fall back to FetchContent if AdaptiveCpp not found" ON) + +# HINTS /opt/adaptivecpp matches scripts/install-deps.sh's default install +# prefix, and ENV ACPP_PREFIX honours users who installed to a custom +# location with `ACPP_PREFIX=/elsewhere ./scripts/install-deps.sh`. Without +# these, find_package wouldn't search /opt (not a standard CMake path), the +# user would have to remember to `export CMAKE_PREFIX_PATH=/opt/adaptivecpp` +# between running install-deps.sh and the build (the script can't set env +# vars in the parent shell), and FetchContent would fire pointlessly. +find_package(AdaptiveCpp QUIET HINTS /opt/adaptivecpp ENV ACPP_PREFIX) +if(NOT AdaptiveCpp_FOUND) + if(XCHPLOT2_FETCH_ADAPTIVECPP) + message(STATUS "xchplot2: AdaptiveCpp not found — fetching v25.10.0 via FetchContent") + message(STATUS "xchplot2: first build will take ~15-30 min while AdaptiveCpp compiles") + message(STATUS "xchplot2: pre-install via scripts/install-deps.sh to skip this") + + # AdaptiveCpp's compiler/CMakeLists requires ld.lld at configure + # time and aborts with "Cannot find ld.lld. Please provide path + # via -DACPP_LLD_PATH=…" otherwise. Auto-probe the conventional + # LLVM-{16..20} prefixes and pass the path through so users on a + # FetchContent build don't have to know that detail. If the + # binary isn't installed at all, fail loud with a copy-paste + # install command — far less confusing than AdaptiveCpp's own + # message. + find_program(_xchplot2_ld_lld + NAMES ld.lld + HINTS + /usr/lib/llvm-20/bin /usr/lib/llvm-19/bin /usr/lib/llvm-18/bin + /usr/lib/llvm-17/bin /usr/lib/llvm-16/bin + /usr/lib/llvm20/bin /usr/lib/llvm19/bin /usr/lib/llvm18/bin + /usr/lib64/llvm20/bin /usr/lib64/llvm19/bin /usr/lib64/llvm18/bin + /opt/llvm-20/bin /opt/llvm-19/bin /opt/llvm-18/bin + /opt/llvm20/bin /opt/llvm19/bin /opt/llvm18/bin + DOC "ld.lld required by AdaptiveCpp's compiler/CMakeLists") + if(_xchplot2_ld_lld) + set(ACPP_LLD_PATH "${_xchplot2_ld_lld}" CACHE FILEPATH + "Path to ld.lld for AdaptiveCpp's compiler/CMakeLists" FORCE) + message(STATUS "xchplot2: auto-probed ld.lld at ${_xchplot2_ld_lld}") + else() + message(FATAL_ERROR + "xchplot2: AdaptiveCpp's FetchContent build needs ld.lld " + "but it isn't installed at any of the standard LLVM-16..20 " + "prefixes. Install it:\n" + " Ubuntu/Debian: sudo apt install lld-18\n" + " Fedora/RHEL: sudo dnf install lld\n" + " Arch/CachyOS: sudo pacman -S lld\n" + "Or pre-install AdaptiveCpp via scripts/install-deps.sh " + "(also installs ld.lld and builds AdaptiveCpp at " + "/opt/adaptivecpp). Override the probe with " + "-DACPP_LLD_PATH=/path/to/ld.lld.") + endif() + + include(FetchContent) + FetchContent_Declare( + adaptivecpp + GIT_REPOSITORY https://github.com/AdaptiveCpp/AdaptiveCpp.git + GIT_TAG v25.10.0 + ) + FetchContent_MakeAvailable(adaptivecpp) + if(NOT COMMAND add_sycl_to_target) + message(FATAL_ERROR + "xchplot2: FetchContent built AdaptiveCpp but add_sycl_to_target " + "wasn't exported. Install AdaptiveCpp via scripts/install-deps.sh " + "or use the Containerfile.") + endif() + else() + message(FATAL_ERROR + "xchplot2: AdaptiveCpp not found. 
Install it via scripts/install-deps.sh, " + "use the Containerfile, or re-run with -DXCHPLOT2_FETCH_ADAPTIVECPP=ON.") + endif() +endif() + +# Export the AdaptiveCpp lib directory to a file so build.rs knows where +# to add -L for libacpp-rt / libacpp-common at link time. Without this, +# the Rust binary fails to link on machines where AdaptiveCpp lives +# anywhere other than /opt/adaptivecpp or /usr/local (and on FetchContent +# builds, which leave the artifacts in CMake's _deps/ build tree). +set(_xchplot2_acpp_lib_dir "") +if(TARGET acpp-rt) + # FetchContent-built target: ask CMake where it'll land. + set(_xchplot2_acpp_lib_dir "$") +elseif(AdaptiveCpp_DIR) + # Installed AdaptiveCpp: AdaptiveCpp_DIR is /lib/cmake/AdaptiveCpp, + # so two parent dirs up gives /lib. + get_filename_component(_xchplot2_acpp_cmake_root "${AdaptiveCpp_DIR}" DIRECTORY) + get_filename_component(_xchplot2_acpp_lib_dir "${_xchplot2_acpp_cmake_root}" DIRECTORY) +endif() +if(_xchplot2_acpp_lib_dir) + file(GENERATE OUTPUT "${CMAKE_BINARY_DIR}/acpp-prefix.txt" + CONTENT "${_xchplot2_acpp_lib_dir}\n") + message(STATUS "xchplot2: AdaptiveCpp lib dir = ${_xchplot2_acpp_lib_dir}") +endif() + +# Embed runtime library paths so binaries built via plain `cmake` (parity +# tests, dev rebuilds, anything not invoked through cargo+build.rs) can +# locate AdaptiveCpp's runtime lib + ROCm's libamdhip64.so without an +# external LD_LIBRARY_PATH. build.rs sets the same rpaths via +# rustc-link-arg for the cargo path, so this is idempotent for the +# production binary. Without this, a fresh `cmake -B build && cmake +# --build build --target sycl_t1_parity` produces a binary that throws +# "No matching device" at SYCL queue construction because +# librt-backend-hip.so can't dynamically link libamdhip64.so. +# +# The FetchContent path leaves _xchplot2_acpp_lib_dir as a generator +# expression ("$") which can't go into the +# RPATH variables at config time — CMake's BUILD_WITH_INSTALL_RPATH=OFF +# default already handles in-tree targets in that case. +if(_xchplot2_acpp_lib_dir AND NOT _xchplot2_acpp_lib_dir MATCHES "\\$<") + list(APPEND CMAKE_BUILD_RPATH "${_xchplot2_acpp_lib_dir}") + list(APPEND CMAKE_INSTALL_RPATH "${_xchplot2_acpp_lib_dir}") +endif() +if(XCHPLOT2_HIP_RUNTIME_H) + get_filename_component(_xchplot2_rocm_root "${XCHPLOT2_HIP_RUNTIME_H}/.." ABSOLUTE) + list(APPEND CMAKE_BUILD_RPATH "${_xchplot2_rocm_root}/lib") + list(APPEND CMAKE_INSTALL_RPATH "${_xchplot2_rocm_root}/lib") + message(STATUS "xchplot2: embedded rpath includes ${_xchplot2_rocm_root}/lib") + + # Direct-link libamdhip64 so AdaptiveCpp's runtime-dlopen'd HIP + # backend (librt-backend-hip.so) finds the library already loaded + # in the process address space. dlopen of a backend's transitive + # deps doesn't consult the calling binary's RUNPATH on glibc — + # without this explicit link, ROCm silently fails to initialise + # and AdaptiveCpp's default selector falls through to its OpenMP + # host device. The fall-through makes hellosycl / sycl_t1_parity + # report "ALL OK" while having executed entirely on CPU. Mirrors + # build.rs:631 (cargo:rustc-link-lib=amdhip64) for the cargo + # build path. + if(EXISTS "${_xchplot2_rocm_root}/lib/libamdhip64.so") + link_libraries("${_xchplot2_rocm_root}/lib/libamdhip64.so") + message(STATUS "xchplot2: link_libraries(libamdhip64.so) — " + "AdaptiveCpp HIP backend will find ROCm at runtime") + endif() +endif() + # pos2-chip dependency. 
# # Default behavior: FetchContent auto-clones Chia-Network/pos2-chip into @@ -74,15 +385,87 @@ endif() # Shared GPU support library (kernels). AesGpu.cu MUST come first — it # owns the constant-memory T-tables that all later kernels reference. +# All backend-dispatched wrapper TUs (T*OffsetsSycl.cpp, PipelineKernelsSycl.cpp) +# go through AdaptiveCpp via add_sycl_to_target below. +set(POS2_GPU_SYCL_SRC + src/gpu/T1OffsetsSycl.cpp + src/gpu/T2OffsetsSycl.cpp + src/gpu/T3OffsetsSycl.cpp + src/gpu/PipelineKernelsSycl.cpp + src/gpu/XsKernel.cpp + src/gpu/XsKernelsSycl.cpp + src/gpu/T1Kernel.cpp + src/gpu/T2Kernel.cpp + src/gpu/T3Kernel.cpp + src/host/GpuBufferPool.cpp + src/host/GpuPipeline.cpp) + +# Sort path: SortSycl.cpp (hand-rolled LSD radix in pure SYCL) is now +# always compiled — it's the runtime fallback for non-CUDA backends on +# dual-toolchain builds, and the only path on AMD-only / Intel-only / +# CPU builds. SortDispatch.cpp picks at runtime based on the queue's +# device backend (sycl::backend::cuda → _cub variant; everything else → +# _sycl variant). When BUILD_CUDA=OFF, the dispatcher's CUB branch is +# compiled out and reduces to a single tail call into SortSycl.cpp. +list(APPEND POS2_GPU_SYCL_SRC + src/gpu/SortSycl.cpp + src/gpu/SortDispatch.cpp + src/gpu/SyclDeviceList.cpp) + +if(XCHPLOT2_BUILD_CUDA) + set(POS2_GPU_CUDA_SRC + src/gpu/AesGpu.cu + src/gpu/AesGpuBitsliced.cu + src/gpu/SortCuda.cu) + # SortSyclCub.cpp is the SYCL-typed adapter that bridges + # sycl::queue → CUB. SortCuda.cu used to provide the SYCL-typed + # entry points itself, but mixing nvcc + in one + # TU drags AdaptiveCpp's libkernel half.hpp into the legacy CUDA + # arm of __acpp_backend_switch — a path AdaptiveCpp doesn't + # support. Splitting the SYCL surface into this acpp-compiled + # adapter (does q.wait()) and a pure-CUDA cub_sort_* in + # SortCuda.cu (does the work + cudaStreamSync) keeps each + # compiler in its lane. + list(APPEND POS2_GPU_SYCL_SRC + src/gpu/SortSyclCub.cpp) +else() + # AesStub.cpp: no-op initialize_aes_tables on builds without the + # CUDA AOT path. AesGpu.cu provides the real implementation when + # BUILD_CUDA=ON; SYCL workers ignore initialize_aes_tables anyway + # (they upload AES T-tables lazily via SyclBackend.hpp's + # aes_tables_device(q)). + set(POS2_GPU_CUDA_SRC) + list(APPEND POS2_GPU_SYCL_SRC + src/gpu/AesStub.cpp) +endif() + +# CUDA OBJECT library: compiled once, referenced via $ +# from each consuming target EXACTLY ONCE. The earlier design tried to +# put the .o files in BOTH pos2_gpu (STATIC) AND xchplot2_cli for hash +# matching, but nvlink's device-link step at xchplot2_cli archive +# creation refuses the duplicate kAesT0..3 / kernel definitions: +# +# nvlink error : Multiple definition of '_ZN7pos2gpu6kAesT0E' in +# 'libpos2_gpu.a:AesGpu.cu.o', first defined in +# 'CMakeFiles/pos2_gpu_cuda_obj.dir/src/gpu/AesGpu.cu.o' +# +# (--allow-multiple-definition is a host-linker flag — nvlink doesn't +# honour it.) So the .o files now live exclusively in xchplot2_cli for +# the cargo install path, and each parity test adds them explicitly +# below — pos2_gpu STATIC carries only the SYCL .cpp sources. 
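+# Consumption pattern, for orientation (illustrative only — the real leaf
+# targets further down spell this out individually; "some_leaf_tool" is a
+# placeholder, not a target in this tree):
+#   add_executable(some_leaf_tool main.cu $<TARGET_OBJECTS:pos2_gpu_cuda_obj>)
+#   target_link_libraries(some_leaf_tool PRIVATE pos2_gpu_host)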
+if(XCHPLOT2_BUILD_CUDA) + add_library(pos2_gpu_cuda_obj OBJECT ${POS2_GPU_CUDA_SRC}) + target_include_directories(pos2_gpu_cuda_obj PRIVATE src) + target_link_libraries(pos2_gpu_cuda_obj PRIVATE pos2_chip_headers) + target_compile_features(pos2_gpu_cuda_obj PRIVATE cxx_std_20) + set_target_properties(pos2_gpu_cuda_obj PROPERTIES POSITION_INDEPENDENT_CODE ON) + if(XCHPLOT2_INSTRUMENT_MATCH) + target_compile_definitions(pos2_gpu_cuda_obj PRIVATE XCHPLOT2_INSTRUMENT_MATCH=1) + endif() +endif() + add_library(pos2_gpu STATIC - src/gpu/AesGpu.cu - src/gpu/AesGpuBitsliced.cu - src/gpu/XsKernel.cu - src/gpu/T1Kernel.cu - src/gpu/T2Kernel.cu - src/gpu/T3Kernel.cu - src/host/GpuBufferPool.cu - src/host/GpuPipeline.cu + ${POS2_GPU_SYCL_SRC} ) target_include_directories(pos2_gpu PUBLIC src @@ -92,10 +475,98 @@ target_compile_features(pos2_gpu PUBLIC cxx_std_20) if(XCHPLOT2_INSTRUMENT_MATCH) target_compile_definitions(pos2_gpu PUBLIC XCHPLOT2_INSTRUMENT_MATCH=1) endif() +# Marker for SortDispatch.cpp: gates whether the runtime backend +# dispatcher includes the CUB branch. Defined when SortSyclCub.cpp + +# SortCuda.cu are linked (BUILD_CUDA=ON); undefined on AMD-only / +# Intel-only / CPU builds, in which case the dispatcher reduces to a +# single tail call into SortSycl.cpp. +if(XCHPLOT2_BUILD_CUDA) + target_compile_definitions(pos2_gpu PUBLIC XCHPLOT2_HAVE_CUB=1) +endif() +add_sycl_to_target(TARGET pos2_gpu SOURCES ${POS2_GPU_SYCL_SRC}) + +# AdaptiveCpp's acpp driver doesn't auto-propagate CMake's standard +# CMAKE_CXX_FLAGS_RELEASE (-O3 -DNDEBUG) into the SYCL compile step. +# Without an explicit -O flag, acpp warns "No optimization flag was +# given, optimizations are disabled by default" and the AES-heavy SYCL +# kernels (Xs gen, T*match) compile at -O0, which is dramatically +# slower on amdgcn (Xs gen alone was 200 ms / ~25% of wall on RX 6700 +# XT before this fix). +# +# An earlier attempt at -O3 was reverted because parity tests appeared +# to fail with it — but that diagnosis was confounded by an unrelated +# build-time bug (compose.yaml's silent ACPP_GFX default to gfx1100 +# made every "broken" rebuild produce kernels for the wrong amdgcn +# ISA, which executed as no-ops regardless of opt level). With +# ACPP_GFX now enforced via ${VAR:?} in compose.yaml, -O3 should be +# testable cleanly. Drop to -O2 here if it actually does fail at -O3 +# under correct gfx targeting. +target_compile_options(pos2_gpu PRIVATE + $<$:-O3> + $<$:-O2> + $<$:-Os>) +# The SYCL TUs include CUDA headers (cuda_fp16.h, transitively cuda_runtime.h +# from the kernel-wrapper headers) on both the CUDA and non-CUDA paths +# (slice 17 will lift the CUDA-type dependencies out of the public API). +# On the CUDA build we already have CMAKE_CUDA_COMPILER. On the non-CUDA +# build we need to locate the CUDA Toolkit headers via find_package +# (CUDAToolkit) — which does NOT require enable_language(CUDA). +if(XCHPLOT2_BUILD_CUDA) + get_filename_component(_xchplot2_cuda_bin ${CMAKE_CUDA_COMPILER} DIRECTORY) + get_filename_component(_xchplot2_cuda_root ${_xchplot2_cuda_bin} DIRECTORY) + set(_xchplot2_cuda_include "${_xchplot2_cuda_root}/include") +else() + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_INCLUDE_DIRS) + set(_xchplot2_cuda_include ${CUDAToolkit_INCLUDE_DIRS}) + else() + # Last-resort guess; matches Arch / CachyOS layout. 
+ set(_xchplot2_cuda_include "/opt/cuda/include") + endif() +endif() +target_include_directories(pos2_gpu PRIVATE ${_xchplot2_cuda_include}) +if(XCHPLOT2_BUILD_CUDA) + # OBJECT lib doesn't inherit pos2_gpu's PUBLIC includes via + # $ (only the .o files travel), so propagate the + # CUDA include path explicitly. Mirrors the line above for pos2_gpu. + target_include_directories(pos2_gpu_cuda_obj PRIVATE ${_xchplot2_cuda_include}) +endif() + +# Slice 17 removed the last SYCL-TU reference to a cudart *function* — only +# cuda* types survive (used for API compatibility), and types don't require +# a link against libcudart.so. On the NVIDIA build path the nvcc-compiled +# TUs (AesGpu.cu, SortCuda.cu, AesGpuBitsliced.cu) bring in cudart +# automatically. On non-NVIDIA builds cudart isn't needed at all. +# Now that the kernel-wrapper headers (T*Offsets.cuh, PipelineKernels.cuh, +# T*Kernel.cuh, XsKernel.cuh) take sycl::queue&, every TU that includes them +# needs sycl/sycl.hpp on its include path — including the parity tests +# compiled by nvcc. Make AdaptiveCpp's include dir PUBLIC so it propagates. +get_filename_component(_xchplot2_acpp_cmake_dir + "${AdaptiveCpp_DIR}" DIRECTORY) # /opt/adaptivecpp/lib/cmake/AdaptiveCpp/.. = /opt/adaptivecpp/lib/cmake +get_filename_component(_xchplot2_acpp_lib_dir + "${_xchplot2_acpp_cmake_dir}" DIRECTORY) # /opt/adaptivecpp/lib +get_filename_component(_xchplot2_acpp_root + "${_xchplot2_acpp_lib_dir}" DIRECTORY) # /opt/adaptivecpp +target_include_directories(pos2_gpu PUBLIC + ${_xchplot2_acpp_root}/include + ${_xchplot2_acpp_root}/include/AdaptiveCpp) +if(XCHPLOT2_BUILD_CUDA) + # Same reasoning as the CUDA include above — propagate AdaptiveCpp's + # include dir to the OBJECT lib explicitly so its .cu TUs see the + # kernel-wrapper headers (T*Offsets.cuh / PipelineKernels.cuh / ...) + # that pull in sycl/sycl.hpp. + target_include_directories(pos2_gpu_cuda_obj PRIVATE + ${_xchplot2_acpp_root}/include + ${_xchplot2_acpp_root}/include/AdaptiveCpp) +endif() + set_target_properties(pos2_gpu PROPERTIES POSITION_INDEPENDENT_CODE ON - # Do NOT pre-resolve device symbols — consumers (e.g. aes_parity.cu) - # reference kAesT* directly and need them visible at final device link. + # No CUDA .o files in this archive (they live in pos2_gpu_cuda_obj + # OBJECT lib and are added explicitly to each leaf consumer), so + # device-symbol resolution doesn't apply here. CUDA_RESOLVE_DEVICE_SYMBOLS + # is left explicitly OFF for clarity and to defend against any future + # CUDA TU getting added to pos2_gpu's source list. CUDA_RESOLVE_DEVICE_SYMBOLS OFF ) @@ -107,6 +578,8 @@ add_library(pos2_gpu_host STATIC src/host/GpuPlotter.cpp src/host/PlotFileWriterParallel.cpp src/host/BatchPlotter.cpp + src/host/CpuPlotter.cpp + src/host/Cancel.cpp ) target_include_directories(pos2_gpu_host PUBLIC src) target_link_libraries(pos2_gpu_host PUBLIC pos2_chip_headers pos2_gpu) @@ -170,55 +643,168 @@ endif() add_library(xchplot2_cli STATIC tools/xchplot2/cli.cpp) target_include_directories(xchplot2_cli PUBLIC tools/xchplot2) target_link_libraries(xchplot2_cli PUBLIC pos2_gpu_host pos2_keygen) +# CUDA_RESOLVE_DEVICE_SYMBOLS=ON triggers an nvcc --device-link step at +# archive creation, producing a host-side dlink.o that defines the +# `__cudaRegisterLinkedBinary_*` symbols every `__sti____cudaRegisterAll()` +# constructor references. 
cli_devlink.cu is the marker that flips +# xchplot2_cli to a CUDA-language target so the device-link actually +# fires (it's a silent no-op on pure-C++ targets — see cli_devlink.cu). +# +# Just adding cli_devlink.cu isn't enough: the dlink.o it produces only +# resolves symbols for .cu objects directly compiled into xchplot2_cli. +# Pulling pos2_gpu's CUDA .o files in via $ +# brings them into xchplot2_cli's archive-time device-link scope so the +# resulting dlink.o covers them too. See the pos2_gpu_cuda_obj OBJECT-lib +# comment above for why we share the .o files instead of recompiling. +if(XCHPLOT2_BUILD_CUDA) + target_sources(xchplot2_cli PRIVATE + tools/xchplot2/cli_devlink.cu + $) +endif() set_target_properties(xchplot2_cli PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_RESOLVE_DEVICE_SYMBOLS ON ) # CLI: xchplot2 (the standalone plotter binary, formerly gpu_plotter) +# +# LINK_GROUP RESCAN wraps xchplot2_cli + pos2_gpu_host so the linker +# rescans them as a unit. xchplot2_cli holds the CUDA OBJECT files +# (initialize_aes_tables, cub_sort_*); pos2_gpu_host's BatchPlotter.cpp +# and SortSyclCub.cpp reference those symbols. With single-pass static- +# archive scanning the references would land after xchplot2_cli was +# already processed — rescan resolves the back-edge. add_executable(xchplot2 tools/xchplot2/main.cpp) -target_link_libraries(xchplot2 PRIVATE xchplot2_cli) - -# Parity tests -add_executable(aes_parity tools/parity/aes_parity.cu) -target_link_libraries(aes_parity PRIVATE pos2_gpu_host) - -add_executable(aes_bs_parity tools/parity/aes_bs_parity.cu) -target_link_libraries(aes_bs_parity PRIVATE pos2_gpu_host) - -add_executable(aes_bs_bench tools/parity/aes_bs_bench.cu) -target_link_libraries(aes_bs_bench PRIVATE pos2_gpu_host) - -add_executable(aes_tezcan_bench tools/parity/aes_tezcan_bench.cu) -target_link_libraries(aes_tezcan_bench PRIVATE pos2_gpu_host) - -add_executable(xs_parity tools/parity/xs_parity.cu) -target_link_libraries(xs_parity PRIVATE pos2_gpu_host) - -add_executable(xs_bench tools/parity/xs_bench.cu) -target_link_libraries(xs_bench PRIVATE pos2_gpu_host) - -add_executable(t1_parity tools/parity/t1_parity.cu) -target_link_libraries(t1_parity PRIVATE pos2_gpu_host) - -add_executable(t1_debug tools/parity/t1_debug.cu) -target_link_libraries(t1_debug PRIVATE pos2_gpu_host) -set_target_properties(t1_debug PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") - -add_executable(t2_parity tools/parity/t2_parity.cu) -target_link_libraries(t2_parity PRIVATE pos2_gpu_host) - -add_executable(t3_parity tools/parity/t3_parity.cu) -target_link_libraries(t3_parity PRIVATE pos2_gpu_host) +target_link_libraries(xchplot2 PRIVATE + "$") +# pos2-chip headers define non-inline soft_aesenc/soft_aesdec, which now +# end up in two TUs (PlotFileWriterParallel.cpp and CpuPlotter.cpp) inside +# pos2_gpu_host. Tolerate the duplicates at host link. +target_link_options(xchplot2 PRIVATE LINKER:--allow-multiple-definition) + +# Parity tests are nvcc-compiled (.cu) and reference __global__ kernels +# from the bench-specific bitsliced AES path. They build only on the CUDA +# target. The two SYCL-native parity tests below (sycl_*_parity) stay +# unconditional so AMD/Intel builds still have correctness coverage. 
+# +# Each test gets $ explicitly: +# pos2_gpu (STATIC) doesn't carry the CUDA .o files anymore — putting +# them in both pos2_gpu and xchplot2_cli triggered nvlink's "Multiple +# definition" error at xchplot2_cli's archive-time device-link, which +# host-only --allow-multiple-definition can't suppress. So leaf +# executables that need kernel symbols (kAesT0..3, host-side +# kernel-wrapper functions in pos2_gpu_host) pull them in directly, +# making the .o files appear exactly once in each link line. +if(XCHPLOT2_BUILD_CUDA) + foreach(t IN ITEMS aes_parity aes_bs_parity aes_bs_bench aes_tezcan_bench + xs_parity xs_bench t1_parity t1_debug t2_parity t3_parity) + add_executable(${t} tools/parity/${t}.cu $) + target_link_libraries(${t} PRIVATE pos2_gpu_host) + set_target_properties(${t} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + endforeach() + + message(STATUS "pos2-gpu configured for CUDA arch(es): ${CMAKE_CUDA_ARCHITECTURES}") +endif() -add_executable(plot_file_parity tools/parity/plot_file_parity.cpp) +# plot_file_parity is a pure .cpp harness — reads a .plot file via +# pos2_gpu_host's file-format code and checks the header / table offsets. +# Builds on all backends (CUDA, HIP, SYCL-only). On the CUDA build it +# transitively needs pos2_gpu_host's kernel-wrapper symbols, which now +# live in the OBJECT lib rather than pos2_gpu.a — pull them in here. +if(XCHPLOT2_BUILD_CUDA) + add_executable(plot_file_parity tools/parity/plot_file_parity.cpp + $) +else() + add_executable(plot_file_parity tools/parity/plot_file_parity.cpp) +endif() target_link_libraries(plot_file_parity PRIVATE pos2_gpu_host) set_target_properties(plot_file_parity PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") # Group binaries under build/tools/... set_target_properties(xchplot2 PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/xchplot2") -foreach(t aes_parity aes_bs_parity aes_bs_bench aes_tezcan_bench xs_parity xs_bench t1_parity t2_parity t3_parity) - set_target_properties(${t} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") -endforeach() -message(STATUS "pos2-gpu configured for CUDA arch(es): ${CMAKE_CUDA_ARCHITECTURES}") +# Slice-1 standalone SYCL parity test: exercises compute_bucket_offsets in +# isolation against a CPU reference on synthetic input — orthogonal to the +# t1_parity full-pipeline test, useful for narrowing any divergence to the +# SYCL kernel itself. +add_executable(sycl_bucket_offsets_parity tools/parity/sycl_bucket_offsets_parity.cpp) +add_sycl_to_target(TARGET sycl_bucket_offsets_parity + SOURCES tools/parity/sycl_bucket_offsets_parity.cpp) +target_compile_features(sycl_bucket_offsets_parity PRIVATE cxx_std_20) +set_target_properties(sycl_bucket_offsets_parity PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + +# Slice-4 standalone: validates the SYCL-compiled AES g_x_smem against the +# same function run on the host. Pulls the AES headers (now portable behind +# PortableAttrs.hpp) directly, so a host-vs-device divergence in the AES +# math isolates here without t1_parity scaffolding. 
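+# Typical invocation after a native build (illustrative — assumes the
+# conventional build/ binary dir; see RUNTIME_OUTPUT_DIRECTORY below):
+#   ./build/tools/parity/sycl_g_x_parity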
+add_executable(sycl_g_x_parity tools/parity/sycl_g_x_parity.cpp) +add_sycl_to_target(TARGET sycl_g_x_parity + SOURCES tools/parity/sycl_g_x_parity.cpp) +target_include_directories(sycl_g_x_parity PRIVATE src) +target_compile_features(sycl_g_x_parity PRIVATE cxx_std_20) +set_target_properties(sycl_g_x_parity PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + +# Slice-18 standalone: exercises launch_sort_pairs_u32_u32 and +# launch_sort_keys_u64 against a std::sort reference. Built always — runs +# the CUB-backed wrappers when XCHPLOT2_BUILD_CUDA=ON, the hand-rolled +# SYCL radix when OFF. Lets the SYCL sort path be validated on NVIDIA +# hardware without needing AMD/Intel access. +add_executable(sycl_sort_parity tools/parity/sycl_sort_parity.cpp) +add_sycl_to_target(TARGET sycl_sort_parity + SOURCES tools/parity/sycl_sort_parity.cpp) +target_link_libraries(sycl_sort_parity PRIVATE pos2_gpu) +# On the CUDA build path, pos2_gpu's SortSyclCub.cpp (the SYCL→CUB +# adapter) calls cub_sort_* defined in SortCuda.cu — now in +# pos2_gpu_cuda_obj OBJECT lib instead of pos2_gpu STATIC. Pull the +# OBJECT lib's .o files in directly so the CUB symbols resolve. +# AMD/Intel builds use SortSycl.cpp (pure SYCL) instead and don't +# need this. +if(XCHPLOT2_BUILD_CUDA) + target_sources(sycl_sort_parity PRIVATE $) +endif() +# cuda_fp16.h transitively required by SyclBackend.hpp → sycl/sycl.hpp +# (AdaptiveCpp's half.hpp uses cuda_fp16 intrinsics on the CUDA backend). +target_include_directories(sycl_sort_parity PRIVATE ${_xchplot2_cuda_include}) +target_compile_features(sycl_sort_parity PRIVATE cxx_std_20) +set_target_properties(sycl_sort_parity PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + +# SYCL-native sibling of t1_parity.cu. The .cu version is nvcc-only, so on +# AMD/Intel hosts the T1 matcher had no end-to-end CPU-vs-GPU coverage — +# this binary closes that gap. Same comparison semantics as t1_parity.cu +# (sorted-set equality of T1Pairings against pos2-chip's Table1Constructor), +# but uses sycl::malloc_device + q.memcpy in place of cudaMalloc / +# cudaMemcpy so it builds on the SYCL-only path too. +if(XCHPLOT2_BUILD_CUDA) + add_executable(sycl_t1_parity tools/parity/sycl_t1_parity.cpp + $) +else() + add_executable(sycl_t1_parity tools/parity/sycl_t1_parity.cpp) +endif() +add_sycl_to_target(TARGET sycl_t1_parity + SOURCES tools/parity/sycl_t1_parity.cpp) +target_link_libraries(sycl_t1_parity PRIVATE pos2_gpu_host) +target_include_directories(sycl_t1_parity PRIVATE ${_xchplot2_cuda_include}) +target_compile_features(sycl_t1_parity PRIVATE cxx_std_20) +# pos2-chip's plot/PlotLayout.hpp + plot/TableConstructorGeneric.hpp pull +# in non-inline soft_aesenc/soft_aesdec, which already exist in pos2_gpu_host +# via PlotFileWriterParallel.cpp + CpuPlotter.cpp. Same mitigation as the +# xchplot2 CLI link line — see the --allow-multiple-definition note above. +target_link_options(sycl_t1_parity PRIVATE LINKER:--allow-multiple-definition) +set_target_properties(sycl_t1_parity PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/parity") + +# Lowest-level diagnostic: a hello-world SYCL kernel that proves +# AdaptiveCpp's HIP / CUDA backend can dispatch *anything* on the +# detected device. No pos2_gpu / pos2_gpu_host link — purely the SYCL +# runtime + a 16-element parallel_for. 
Use it as the first step when +# sycl_t1_parity or the production CLI silently produces no output: if +# hellosycl FAILs, no xchplot2-level fix can recover and the issue is +# below our level (driver mismatch, JIT no-op stubs, etc.). +add_executable(hellosycl tools/sanity/hellosycl.cpp) +add_sycl_to_target(TARGET hellosycl SOURCES tools/sanity/hellosycl.cpp) +target_compile_features(hellosycl PRIVATE cxx_std_20) +set_target_properties(hellosycl PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tools/sanity") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..b565621 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,69 @@ +# Contributing to xchplot2 + +Thanks for taking the time. A few notes to keep review loops short. + +## Building + running the tests + +Build and run the parity tests following the +[Build](https://github.com/Jsewill/xchplot2#build) section of the +README. The parity binaries under `tools/parity/` are the correctness +gate: + +- `aes_parity`, `xs_parity`, `t1_parity`, `t2_parity`, `t3_parity` — + bit-exact CPU vs GPU per-phase agreement with pos2-chip's reference. +- `sycl_sort_parity`, `sycl_g_x_parity`, `sycl_bucket_offsets_parity` — + the SYCL/AdaptiveCpp backends vs the CUDA reference, so AMD/Intel + breakage is caught on NVIDIA hardware too. +- `plot_file_parity` — writer + reader round-trip on the final + `.plot2`. + +Any change that touches a kernel, the sort path, or the plot file +format **must** keep the parity tests passing at k=22 (quick) and at +k=28 (slow — the realistic production k). Output bytes are specified +to be identical to the pos2-chip CPU reference; this is the hard +invariant. + +After a functional change, spot-check one real batch end-to-end with +`xchplot2 verify ` — zero proofs over 100 random challenges is +a regression even if all parity tests pass. + +## Commit style + +Short imperative subjects, lowercase scope prefix, no trailing period: + +``` +gpu: split xs-sort keys_a to d_storage tail — drops pool VRAM min ~1.3 GB +docs: tighten streaming peak (~7.3 GB measured), add AMD row +CMakeLists: re-enable -O3 for SYCL TUs +``` + +Body paragraphs explain *why* (what invariant was wrong, what the +measurement was, what alternative was considered and why it was +rejected). The *what* is in the diff. + +## Scope of changes + +- Keep unrelated refactors out of correctness or performance commits. +- Performance changes should cite before/after numbers on a named GPU + at a specified `k`. +- New runtime knobs go in `README.md`'s + [Environment variables](https://github.com/Jsewill/xchplot2#environment-variables) + table so users can discover them. + +## PRs + +The `main` branch carries the SYCL/AdaptiveCpp port; the +[`cuda-only`](https://github.com/Jsewill/xchplot2/tree/cuda-only) +branch is the original CUDA-only path, preserved as the most-tested +NVIDIA configuration. A PR that only helps NVIDIA may still land on +`main`, but don't regress parity on AMD (`gfx1031`) along the way. + +## Reporting bugs + +Open an issue with: + +- Exact command line and the full stderr output. +- GPU vendor + model + VRAM (`nvidia-smi -L` / `rocminfo | grep gfx`). +- Build flavor: container (service name + `ACPP_GFX` / `CUDA_ARCH`), + native `scripts/install-deps.sh`, or `cargo install`. +- Whether parity tests pass on your build. 
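+
+A quick way to capture the GPU line (sketch — run whichever half matches
+your vendor; these are the same commands named in the list above):
+
+```bash
+nvidia-smi -L              # NVIDIA
+rocminfo | grep -m1 gfx    # AMD — prints the gfx target, e.g. gfx1031
+```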
diff --git a/Cargo.lock b/Cargo.lock index 04951f4..8b9667a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,4 +4,4 @@ version = 4 [[package]] name = "xchplot2" -version = "0.1.0" +version = "0.5.2" diff --git a/Cargo.toml b/Cargo.toml index 2147f53..50e3694 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,11 @@ [package] name = "xchplot2" -version = "0.1.0" +version = "0.6.0" edition = "2021" authors = ["Abraham Sewill "] license = "MIT" description = "GPU plotter for Chia v2 proofs of space (CHIP-48)" -repository = "https://github.com/Chia-Network/xchplot2" +repository = "https://github.com/Jsewill/xchplot2" readme = "README.md" build = "build.rs" diff --git a/Containerfile b/Containerfile new file mode 100644 index 0000000..7d97b2d --- /dev/null +++ b/Containerfile @@ -0,0 +1,229 @@ +# syntax=docker/dockerfile:1 +# +# Containerfile for xchplot2 — podman-first (works with docker too). +# Supports NVIDIA (default), AMD ROCm, and Intel oneAPI via build args. +# +# ── NVIDIA (default; CUB sort) ─────────────────────────────────────────────── +# podman build -t xchplot2:cuda . +# podman run --rm --device nvidia.com/gpu=all -v $PWD/plots:/out \ +# xchplot2:cuda plot -k 28 -n 10 -f -c -o /out +# (Requires nvidia-container-toolkit + CDI on the host.) +# +# The default base image is CUDA 13.x, which only supports sm_75+ (Turing +# and newer). Pascal (sm_61) and Volta (sm_70) builds need a 12.x base — +# pass it explicitly: +# podman build -t xchplot2:cuda \ +# --build-arg BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \ +# --build-arg BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \ +# --build-arg CUDA_ARCH=61 \ +# . +# scripts/build-container.sh handles this automatically by probing +# nvidia-smi and pinning the 12.x base when CUDA_ARCH < 75. +# +# ── AMD ROCm (hand-rolled SYCL radix; XCHPLOT2_BUILD_CUDA=OFF) ─────────────── +# podman build -t xchplot2:rocm \ +# --build-arg BASE_DEVEL=docker.io/rocm/dev-ubuntu-24.04:latest \ +# --build-arg BASE_RUNTIME=docker.io/rocm/dev-ubuntu-24.04:latest \ +# --build-arg ACPP_TARGETS=hip:gfx1100 \ +# --build-arg XCHPLOT2_BUILD_CUDA=OFF \ +# --build-arg INSTALL_CUDA_HEADERS=1 \ +# . +# podman run --rm --device /dev/kfd --device /dev/dri --group-add video \ +# -v $PWD/plots:/out xchplot2:rocm plot -k 28 -n 10 ... -o /out +# (Adjust ACPP_TARGETS for your card: rocminfo | grep gfx.) +# +# ── Intel oneAPI (experimental, untested) ──────────────────────────────────── +# podman build -t xchplot2:intel \ +# --build-arg BASE_DEVEL=docker.io/intel/oneapi-basekit:latest \ +# --build-arg BASE_RUNTIME=docker.io/intel/oneapi-runtime:latest \ +# --build-arg ACPP_TARGETS=generic \ +# --build-arg XCHPLOT2_BUILD_CUDA=OFF \ +# --build-arg INSTALL_CUDA_HEADERS=1 \ +# . +# +# ── CPU-only (AdaptiveCpp OpenMP backend; slow plotting) ───────────────────── +# podman build -t xchplot2:cpu \ +# --build-arg BASE_DEVEL=docker.io/ubuntu:24.04 \ +# --build-arg BASE_RUNTIME=docker.io/ubuntu:24.04 \ +# --build-arg ACPP_TARGETS=omp \ +# --build-arg XCHPLOT2_BUILD_CUDA=OFF \ +# --build-arg INSTALL_CUDA_HEADERS=1 \ +# . +# podman run --rm -v $PWD/plots:/out xchplot2:cpu plot -k 28 -n 1 ... +# No GPU needed at build or runtime. Plotting is 1-2 orders of magnitude +# slower than GPU — useful for headless CI / dev machines without a GPU. +# +# First build pulls + builds AdaptiveCpp from source — expect 10-30 min. +# Subsequent rebuilds reuse the cached AdaptiveCpp layer. 
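+#
+# Smoke-test a freshly built image by running one of the bundled parity
+# binaries instead of the plotter (illustrative, shown with no arguments;
+# ROCm device flags from the AMD example above — for the cuda image use the
+# `--device nvidia.com/gpu=all` CDI flag instead):
+#   podman run --rm --device /dev/kfd --device /dev/dri --group-add video \
+#     --entrypoint /usr/local/bin/sycl_sort_parity xchplot2:rocm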
+ +# BASE_RUNTIME defaults to the devel image because AdaptiveCpp's SSCP +# (LLVM "generic" target) JIT-assembles PTX at runtime via ptxas, which +# only ships in the CUDA *devel* image. The slim runtime image lacks it +# and produces "Code object construction failed". Override with a slim +# image only if you've switched ACPP_TARGETS to AOT (e.g. cuda:sm_89). +ARG BASE_DEVEL=docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04 +ARG BASE_RUNTIME=docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04 +ARG ACPP_REF=v25.10.0 +ARG ACPP_TARGETS= +ARG XCHPLOT2_BUILD_CUDA=ON +ARG INSTALL_CUDA_HEADERS=0 +ARG CUDA_ARCH=89 +# LLVM/clang root used to build AdaptiveCpp. Pinned to Ubuntu's llvm-18 +# for every compose service (cuda / rocm / intel / cpu) — none of them +# override these args. The HIP-backend version match-up happens at +# *runtime*, not build-time: ROCm 6.2's bundled clang at /opt/rocm/llvm +# ships LLVM 18.0git, so its device bitcode (ocml.bc, ockl.bc) is +# ABI-compatible with the libacpp-rt that AdaptiveCpp linked against +# Ubuntu's llvm-18. ROCm 7.x dropped LLVMConfig.cmake from its rocm-llvm +# package, which is why compose.yaml's rocm service pins BASE to 6.2. +# LLVM_CMAKE_DIR points at the dir containing LLVMConfig.cmake. +ARG LLVM_ROOT=/usr/lib/llvm-18 +ARG LLVM_CMAKE_DIR=/usr/lib/llvm-18/cmake + +# ─── builder ──────────────────────────────────────────────────────────────── +FROM ${BASE_DEVEL} AS builder + +ARG ACPP_REF +ARG ACPP_TARGETS +ARG XCHPLOT2_BUILD_CUDA +ARG INSTALL_CUDA_HEADERS +ARG CUDA_ARCH +ARG LLVM_ROOT +ARG LLVM_CMAKE_DIR + +ENV DEBIAN_FRONTEND=noninteractive + +# Common toolchain. AdaptiveCpp 25.10 wants LLVM ≥ 16 + clang + libclang; +# Ubuntu 24.04 ships llvm-18. Boost.Context, libnuma, libomp are AdaptiveCpp +# runtime deps. INSTALL_CUDA_HEADERS=1 pulls the CUDA Toolkit *headers* on +# non-NVIDIA bases — required because AdaptiveCpp's libkernel/half.hpp +# transitively includes cuda_fp16.h on every build path. +RUN apt-get update && apt-get install -y --no-install-recommends \ + cmake git ninja-build build-essential python3 pkg-config \ + curl ca-certificates \ + libboost-context-dev libnuma-dev \ + && if [ "${LLVM_ROOT}" = "/usr/lib/llvm-18" ]; then \ + apt-get install -y --no-install-recommends \ + llvm-18 llvm-18-dev clang-18 libclang-18-dev libclang-cpp18-dev \ + lld-18 libomp-18-dev; \ + fi \ + && if [ "${INSTALL_CUDA_HEADERS}" = "1" ]; then \ + apt-get install -y --no-install-recommends nvidia-cuda-toolkit-headers \ + || apt-get install -y --no-install-recommends nvidia-cuda-toolkit; \ + fi \ + && rm -rf /var/lib/apt/lists/* + +# AdaptiveCpp's HIP backend invokes a clang driver that expects +# clang-offload-bundler in its own bin dir (clang looks for helper tools +# next to itself). On ROCm 6.2-complete images /opt/rocm/llvm/bin is +# missing that one binary even though clang-18 itself is there. Ubuntu's +# llvm-18 ships the bundler; both LLVMs are 18-series so the format is +# compatible. +# +# Because we don't know up-front which clang++ AdaptiveCpp will pick +# (ROCm's /opt/rocm/llvm/bin/clang++, Ubuntu's /usr/lib/llvm-18/bin/ +# clang++, or the /usr/bin shim), symlink the bundler into every clang +# bin dir we can find. Cheap, belt-and-braces, no per-base-image logic. 
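+# (Verification sketch, not run here: once this layer exists,
+#  `command -v clang-offload-bundler` in the builder stage should resolve
+#  regardless of which clang ends up first on PATH.)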
+RUN set -eux; \ + echo "=== clang-offload-bundler discovery ==="; \ + find / -xdev -name 'clang-offload-bundler*' -executable -type f 2>/dev/null | head -20 || true; \ + BUNDLER=""; \ + for c in /usr/lib/llvm-18/bin/clang-offload-bundler \ + /opt/rocm/llvm/bin/clang-offload-bundler \ + /usr/bin/clang-offload-bundler-18 \ + /usr/bin/clang-offload-bundler; do \ + if [ -x "$c" ]; then BUNDLER="$c"; break; fi; \ + done; \ + if [ -z "$BUNDLER" ]; then \ + BUNDLER=$(find / -xdev -name clang-offload-bundler -executable -type f 2>/dev/null | head -1 || true); \ + fi; \ + echo "=== bundler resolved to: ${BUNDLER:-} ==="; \ + if [ -n "$BUNDLER" ]; then \ + for d in /opt/rocm/llvm/bin /opt/rocm/bin /usr/lib/llvm-18/bin /usr/bin; do \ + [ -d "$d" ] || continue; \ + [ -e "$d/clang-offload-bundler" ] && continue; \ + ln -sf "$BUNDLER" "$d/clang-offload-bundler"; \ + echo "linked -> $d/clang-offload-bundler"; \ + done; \ + fi + +# Rust toolchain (for keygen-rs and the `cargo install` entry point). +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --default-toolchain stable --profile minimal +ENV PATH=/root/.cargo/bin:${PATH} + +# AdaptiveCpp from source, pinned. Installs to /opt/adaptivecpp. +RUN git clone --depth 1 --branch ${ACPP_REF} \ + https://github.com/AdaptiveCpp/AdaptiveCpp.git /tmp/acpp-src \ + && cmake -S /tmp/acpp-src -B /tmp/acpp-build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/opt/adaptivecpp \ + -DCMAKE_C_COMPILER=${LLVM_ROOT}/bin/clang \ + -DCMAKE_CXX_COMPILER=${LLVM_ROOT}/bin/clang++ \ + -DLLVM_DIR=${LLVM_CMAKE_DIR} \ + -DACPP_LLD_PATH=${LLVM_ROOT}/bin/ld.lld \ + && cmake --build /tmp/acpp-build --parallel \ + && cmake --install /tmp/acpp-build \ + && echo "=== AdaptiveCpp LLVM linkage ===" \ + && (ldd /opt/adaptivecpp/lib/libacpp-rt.so | grep -iE "llvm|libomp" || true) \ + && (ldd /opt/adaptivecpp/lib/libacpp-common.so | grep -iE "llvm|libomp" || true) \ + && rm -rf /tmp/acpp-src /tmp/acpp-build + +ENV CMAKE_PREFIX_PATH=/opt/adaptivecpp:${CMAKE_PREFIX_PATH} +ENV PATH=/opt/adaptivecpp/bin:${PATH} + +WORKDIR /xchplot2 +COPY . . + +# Build xchplot2. CUDA_ARCHITECTURES + ACPP_TARGETS + XCHPLOT2_BUILD_CUDA +# get picked up by build.rs; the latter switches the CMake source set +# between the CUB-using TUs (.cu files via nvcc) and the SYCL-only path. +RUN CUDA_ARCHITECTURES=${CUDA_ARCH} \ + ACPP_TARGETS=${ACPP_TARGETS} \ + XCHPLOT2_BUILD_CUDA=${XCHPLOT2_BUILD_CUDA} \ + cargo install --path . --root /usr/local --locked + +# Also build the parity tests via plain CMake so they're available +# inside the container for first-port validation on new GPUs (especially +# AMD/Intel). Reuses the static libs cargo install just built. +RUN cmake -S . 
-B build-tests -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} \ + -DACPP_TARGETS=${ACPP_TARGETS} \ + -DXCHPLOT2_BUILD_CUDA=${XCHPLOT2_BUILD_CUDA} \ + && cmake --build build-tests --parallel --target sycl_sort_parity \ + sycl_bucket_offsets_parity \ + sycl_g_x_parity \ + plot_file_parity \ + && install -m 0755 build-tests/tools/parity/sycl_sort_parity /usr/local/bin/ \ + && install -m 0755 build-tests/tools/parity/sycl_bucket_offsets_parity /usr/local/bin/ \ + && install -m 0755 build-tests/tools/parity/sycl_g_x_parity /usr/local/bin/ \ + && install -m 0755 build-tests/tools/parity/plot_file_parity /usr/local/bin/ \ + && rm -rf build-tests target + +# ─── runtime ──────────────────────────────────────────────────────────────── +FROM ${BASE_RUNTIME} + +ENV DEBIAN_FRONTEND=noninteractive + +# AdaptiveCpp's runtime backend loaders dlopen libLLVM (for SSCP runtime +# specialization), libnuma (OMP backend), libomp, and Boost.Context. +# SSCP also shells out to LLVM's `opt` and `llc` binaries at runtime to +# generate PTX from the SSCP bitcode — install the full llvm-18 package +# (binaries + lib), not just libllvm18. +RUN apt-get update && apt-get install -y --no-install-recommends \ + llvm-18 lld-18 libnuma1 libomp5-18 libboost-context1.83.0 \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/local/bin/xchplot2 /usr/local/bin/xchplot2 +COPY --from=builder /usr/local/bin/sycl_sort_parity /usr/local/bin/sycl_sort_parity +COPY --from=builder /usr/local/bin/sycl_bucket_offsets_parity /usr/local/bin/sycl_bucket_offsets_parity +COPY --from=builder /usr/local/bin/sycl_g_x_parity /usr/local/bin/sycl_g_x_parity +COPY --from=builder /usr/local/bin/plot_file_parity /usr/local/bin/plot_file_parity +COPY --from=builder /opt/adaptivecpp /opt/adaptivecpp + +ENV LD_LIBRARY_PATH=/opt/adaptivecpp/lib:${LD_LIBRARY_PATH} +ENV PATH=/opt/adaptivecpp/bin:${PATH} + +ENTRYPOINT ["/usr/local/bin/xchplot2"] +CMD ["--help"] diff --git a/NOTICE b/NOTICE index c203f35..3ffbead 100644 --- a/NOTICE +++ b/NOTICE @@ -49,11 +49,40 @@ FSE (Finite State Entropy) Vendored upstream by pos2-chip at lib/fse/ and statically linked into xchplot2. Provides the entropy-coding step of v2 plot file compression. ================================================================================ +AdaptiveCpp (formerly hipSYCL) + https://github.com/AdaptiveCpp/AdaptiveCpp + Copyright (c) The AdaptiveCpp Contributors + Licensed under the BSD 2-Clause "Simplified" License. + + SYCL implementation. Statically linked at build time (libacpp-rt and + friends) for the cross-vendor SYCL kernel path. Pulled in via + find_package(AdaptiveCpp) from /opt/adaptivecpp (the install-deps.sh + default) or via CMake FetchContent at v25.10.0. +================================================================================ NVIDIA CUDA Toolkit (runtime + CUB) Used at build time and dynamically at run time. Subject to the NVIDIA CUDA Toolkit End User License Agreement (https://docs.nvidia.com/cuda/eula/). ================================================================================ +AMD ROCm / HIP + https://github.com/ROCm/ROCm + Copyright (c) Advanced Micro Devices, Inc. + + Used at build time (HIP toolchain) and dynamically at run time on + AMD builds. Components are licensed per-package — primarily MIT and + University of Illinois/NCSA Open Source — see the per-component + LICENSE files in each ROCm subproject. 
+================================================================================ +Intel oneAPI / Level Zero + https://github.com/oneapi-src + Copyright (c) Intel Corporation + + Used at build time and dynamically at run time on Intel SYCL builds + (currently wired up but untested — no Intel GPU in our test matrix). + Components are licensed per-package: Apache-2.0 with LLVM exception + for the DPC++ compiler, MIT for the Level Zero loader, and the Intel + oneAPI End User License Agreement for the proprietary toolkit pieces. +================================================================================ Full license texts for each Apache-2.0 component are reproduced in their respective upstream source trees, which CMake FetchContent / cargo will diff --git a/README.md b/README.md index 7f73683..ff7d7a1 100644 --- a/README.md +++ b/README.md @@ -4,36 +4,394 @@ GPU plotter for Chia v2 proofs of space (CHIP-48). Produces farmable `.plot2` files byte-identical to the [pos2-chip](https://github.com/Chia-Network/pos2-chip) CPU reference. -## Performance +> **Status — work in progress.** Plots are byte-identical to the +> pos2-chip CPU reference and deterministic across runs; performance, +> AMD/Intel support, and the install/CI story are still evolving. Use +> [`cuda-only`](https://github.com/Jsewill/xchplot2/tree/cuda-only) for +> the most-tested path. -k=28, strength=2, RTX 4090 (sm_89), PCIe Gen4 x16: +> **Branches:** `main` — SYCL/AdaptiveCpp port, runs on NVIDIA + +> AMD + Intel (CUB fast path preserved on NVIDIA). +> [`cuda-only`](https://github.com/Jsewill/xchplot2/tree/cuda-only) — +> original pure-CUDA path, pick it if you only target NVIDIA. See +> [Performance](#performance) for the tradeoff. -| Mode | Per plot | -|---|---| -| pos2-chip CPU baseline | ~50 s | -| `xchplot2 batch` steady-state wall | **2.06 s** | -| Producer GPU time, steady-state | 1.96 s | -| Device-kernel floor (single-plot nsys) | 1.91 s | +## Quick start + +```bash +# Install — needs CUDA Toolkit 12+ (or AdaptiveCpp for AMD/Intel), +# CMake ≥ 3.24, a C++20 compiler, and Rust. See Build for alternatives. +cargo install --git https://github.com/Jsewill/xchplot2 -A physically narrower PCIe slot (e.g. Gen4 x4) adds ~240 ms per plot to -the final fragment D2H copy. Check `cat /sys/bus/pci/devices/*/current_link_width` -under load if numbers look off by that much. +# Plot — 10 × k=28 files, keys derived internally from your BLS pair. +xchplot2 plot -k 28 -n 10 \ + -f \ + -c \ + -o /mnt/plots + +# Multi-GPU — one worker per GPU, round-robin partition. +# (`--devices all` adds a CPU worker too; `--devices gpu` sticks to GPUs.) +xchplot2 plot ... --devices gpu +``` + +See [Hardware compatibility](#hardware-compatibility) for GPU / VRAM +/ OS requirements, [Build](#build) for container / native / CMake +paths, and [Use](#use) for every flag. +**Windows users**: this `cargo install` line works under WSL2; for +native Windows or a non-WSL setup, jump to [Windows](#windows). + +## Hardware compatibility + +- **GPU:** + - **NVIDIA**, compute capability ≥ 5.0 (Maxwell / GTX 750-class + and newer) via the CUDA fast path. Builds auto-detect the + installed GPU's `compute_cap` via `nvidia-smi`; override with + `$CUDA_ARCHITECTURES` for fat or cross-target builds (see + [Build](#build)). Pre-sm_53 cards lack native FP16 ALUs, but + `cuda_fp16.h` falls back to fp32 emulation for the half-precision + intrinsics — kernels work correctly with the emulation cost. + On dual-vendor hosts (e.g. 
AMD primary + secondary NVIDIA), + `build.rs` also routes around CUDA 13.x + sm < 75 (the toolkit + dropped Maxwell-Volta codegen) so an old NVIDIA card next to a + working AMD GPU no longer derails the build. + - **AMD ROCm** via the SYCL / AdaptiveCpp path. Validated on RDNA2 + (`gfx1031`, RX 6700 XT, 12 GB) — bit-exact parity with the CUDA + backend across the sort / bucket-offsets / g_x kernels, and + farmable plots end-to-end. ROCm 6.2 required (newer ROCm versions + have LLVM packaging breakage — see [`compose.yaml`](compose.yaml) + rocm-service comments). Build picks `ACPP_TARGETS=hip:gfxXXXX` + from `rocminfo` automatically for RDNA2+. Other gfx targets + (`gfx1030` / `gfx1100`) build cleanly but are untested on real + hardware. **RDNA1 cards (`gfx1010`/`gfx1011`/`gfx1012`, e.g. + Radeon Pro W5700, RX 5700 / 5700 XT)** default to + `ACPP_TARGETS=generic` (SSCP JIT) — a previous community + workaround AOT-spoofed them as `gfx1013`, but that has been + observed to silently produce no-op kernel stubs on at least one + W5700 + ROCm 6 + AdaptiveCpp 25.10 setup. Generic SSCP works + end-to-end through k=24 parity tests. Two opt-in escape hatches + preserved: `XCHPLOT2_FORCE_GFX_SPOOF=1` to restore the legacy + AOT spoof, `XCHPLOT2_NO_GFX_SPOOF=1` to AOT-target the actual + ISA natively (build will fail clearly if AdaptiveCpp doesn't + accept it). + - **Intel oneAPI** is wired up but untested. + - **CPU** (no GPU) via AdaptiveCpp's OpenMP backend. Opt-in with + `--cpu` (or `--devices cpu`) — never the default. Plotting is + 1-2 orders of magnitude slower than a real GPU; intended for + headless CI, GPU-less dev machines, or as an extra worker + alongside GPUs (`--devices all` runs every visible GPU plus a + CPU worker on the same batch; `--devices gpu` sticks to GPUs). Build the container with + `scripts/build-container.sh --gpu cpu` for the standalone CPU + image (`xchplot2:cpu`, ~400 MB; no CUDA / ROCm in the image). +- **VRAM:** four tiers, picked automatically based on free device + VRAM at k=28. All four produce byte-identical plots. + - **Pool** (~11 GB device + ~4 GB pinned host): fastest steady-state, + used on 12 GB+ cards. + - **Plain streaming** (~7.3 GB peak + 128 MB margin): per-plot + allocations, no pinned-host parks, single-pass T2 match. ~400 ms/ + plot faster than compact. Used on 10-11 GB cards that can't fit + the pool but have headroom above compact. + - **Compact streaming** (~5.2 GB peak + 128 MB margin): full + park/rehydrate + N=2 T2 match tiling. Used on 6-8 GB cards where + plain won't fit. 6 GB cards (RTX 2060, RX 6600) are on the edge; + 8 GB cards (3070, 2070 Super) comfortably fit. + - **Minimal streaming** (~3.76 GB peak + 128 MB margin): six layered + cuts on top of compact — N=8 T2 match staging, tiled gathers in + T1/T2 sort, sliced T1 match (per section_l), sliced T3 match + (T2 inputs parked on host, slice H2D'd per section pair), + per-tile CUB outputs in T1/T2/T3 sort with USM-host merges, and + tiled Xs gen+sort+pack with host-pinned accumulation. Bottleneck + moves from compact's T1 sort (5200 MB) to T3 match (3754 MB). + Targets 5 GiB+ cards (RTX 2060, RX 6600 XT, RX 7600) comfortably; + 4 GiB cards (GTX 1050 Ti, RTX 3050 4GB, MX450) are an edge case + since real 4 GiB hardware reports ~3.5 GiB free post-CUDA-context. + Trade-off: ~6 extra cap-sized PCIe round-trips per plot. k=28 + wall on sm_89: ~34 s/plot vs ~13 s for compact. Detailed + breakdown in [VRAM](#vram). 
+ + With [`--devices`](#multi-gpu---devices), each worker picks its own + tier from its own GPU's free VRAM — heterogeneous rigs (e.g. one + 12 GB + one 8 GB card) plot concurrently with each device on its + matching tier. +- **PCIe:** Gen4 x16 or wider recommended. A physically narrower slot + (e.g. Gen4 x4) adds ~240 ms per plot to the final fragment D2H + copy; check `cat /sys/bus/pci/devices/*/current_link_width` + under load if throughput looks off. +- **Host RAM:** ≥ 16 GB recommended; `batch` mode pins ~4 GB of host + memory for D2H double-buffering (pool or streaming). +- **CUDA Toolkit:** 12+ required for the NVIDIA build path (tested on + 13.x). Skipped automatically on AMD/Intel builds where `nvcc` isn't + available — `build.rs` runs `nvcc --version` and flips + `XCHPLOT2_BUILD_CUDA=OFF` when missing. The toolkit-vs-arch matrix: + - `sm_50` – `sm_72` (Maxwell / Pascal / Volta): need CUDA **12.9** + (last toolkit with codegen for these arches — 13.x dropped them + entirely). `build.rs` catches the 13.x + old-arch pairing in a + preflight and points at the fix path. + - `sm_75` – `sm_90` (Turing / Ampere / Hopper): 12.x or 13.x both + work. + - `sm_120` (RTX 50-series Blackwell): need 12.8+; earlier toolkits + lack Blackwell codegen. +- **OS:** Linux (tested on modern glibc distributions) is the supported + path. Windows users route through either the `cuda-only` branch + natively (NVIDIA + MSVC + CUDA) or WSL2 (any vendor WSL2 supports) + — see [Windows](#windows) below. macOS is not supported (no CUDA, + no modern SYCL runtime). ## Build -Requires CUDA Toolkit 12+ (tested on 13.x), C++20 host compiler, CMake -≥ 3.24, and a Rust toolchain (for `keygen-rs`). +### Which path should I use? + +- **"I just want to plot, Linux host"** → **container (path 1)**. Smallest + host install (just `podman` + `podman-compose` + the GPU passthrough + bits — `scripts/install-container-deps.sh` installs all of it). All + toolchain lives inside the image. Auto-detects your GPU and pins the + right CUDA / ROCm base. +- **"NVIDIA only, native binary, no SYCL/AdaptiveCpp"** → **`cuda-only` + branch (path 2)**. Three host packages — `cmake` + `build-essential` + + the CUDA Toolkit. No LLVM/lld/AdaptiveCpp install. Smaller dep + surface than main; same end result for NVIDIA users. +- **"Full build — AMD / Intel / CPU support, parity tests on the host"** + → **`install-deps.sh` (path 3)**. Auto-installs cmake, lld, LLVM 18, + AdaptiveCpp from source. ~30-45 min first-time setup. + +Three ways to get the dependencies in place, easiest first: + +### 1. Container (`podman compose` or `docker compose`) + +Easiest path — `scripts/build-container.sh` does host-side GPU +probing and feeds the right env vars to `compose build`. If you're +starting from a fresh host, `scripts/install-container-deps.sh` +installs the engine + GPU passthrough bits first (podman + GPU probe ++ `nvidia-container-toolkit` / video-render groups, as appropriate; +no native CUDA / ROCm / LLVM / AdaptiveCpp on the host): + +```bash +./scripts/install-container-deps.sh # one-time: engine + GPU passthrough +./scripts/build-container.sh # auto: nvidia-smi → cuda, rocminfo → rocm +podman compose run --rm cuda plot -k 28 -n 10 -f -c -o /out +``` + +**The script handles a handful of host-side decisions that bare +`podman compose build` can't:** + +- **Vendor pick** (cuda / rocm / intel / cpu) from nvidia-smi / + rocminfo, or `--gpu cpu` to force CPU. +- **Multi-GPU fat binary** (e.g. 
`CUDA_ARCH="61;86"` on a + 1070+3060 rig) — compose alone defaults to a single arch. +- **Pascal/Volta auto-pin** to `nvidia/cuda:12.9.1-devel-ubuntu24.04` + when min arch < 75. CUDA 13 dropped sub-Turing codegen, so a Pascal + user without this pin hits a build-time `Unsupported gpu + architecture 'compute_61'` error inside the container. +- **AMD `ACPP_GFX` extract** from rocminfo + the RDNA1 (gfx1010 → + gfx1013) workaround for Radeon Pro W5700. +- **`--no-cache`** pass-through to force a clean rebuild after a + toolchain bump. + +You CAN run `podman compose build` directly — it just means setting +those env vars yourself. The compose YAML's defaults are conservative +(CUDA 13.0, sm_89, no AMD target without `ACPP_GFX`), so plain +`podman compose build cuda` only "just works" on Turing-or-newer +NVIDIA hosts. Anything else needs the script or the equivalent +manual env: + +[`compose.yaml`](compose.yaml) defines four vendor-specific services +sharing one [`Containerfile`](Containerfile); the script just runs +`compose build` against whichever matches your hardware. Override +manually if you prefer: + +```bash +# NVIDIA (default sm_89; override via $CUDA_ARCH=120 etc.) +podman compose build cuda + +# AMD ROCm — set $ACPP_GFX from `rocminfo | grep gfx`. +ACPP_GFX=gfx1031 podman compose build rocm # Navi 22 +ACPP_GFX=gfx1100 podman compose build rocm # Navi 31 (default) + +# Intel oneAPI (experimental, untested). +podman compose build intel + +# CPU-only (no GPU; AdaptiveCpp OpenMP backend; ~400 MB image). +# Plotting is 1-2 orders of magnitude slower than GPU — see CPU bullet +# under Hardware compatibility for the use case. +podman compose build cpu +``` + +Plot files land in `./plots/` on the host. The container also bundles +the parity tests (`sycl_sort_parity`, `sycl_g_x_parity`, etc.) under +`/usr/local/bin/` for quick first-port validation on a new GPU: + +```bash +podman compose run --rm --entrypoint /usr/local/bin/sycl_sort_parity rocm +``` + +First build is ~15-30 min (AdaptiveCpp + LLVM 18 compile from source); +subsequent rebuilds reuse the cached layers. GPU performance inside +the container is identical to native — kernels run on real hardware +via the engine's GPU pass-through: + +- **NVIDIA**: requires `nvidia-container-toolkit` on the host. For + Docker users, also run once after install: + ```bash + sudo apt install nvidia-container-toolkit + sudo nvidia-ctk runtime configure --runtime=docker + sudo systemctl restart docker + ``` + Podman 5.x with CDI works without the runtime-configure step. +- **AMD**: `/dev/kfd` + `/dev/dri` device files. The compose `rocm` + service handles this automatically; for bare `podman/docker run` + pass `--device /dev/kfd --device /dev/dri --group-add video`. + +#### AMD container — sudo, `--privileged`, and `ACPP_GFX` + +AMD GPUs need three pieces of friction handled correctly. None are +optional on most hosts, and getting any one wrong tends to fail +silently or in confusing ways: + +1. **`ACPP_GFX` must be set** to your GPU's gfx target. The kernels + are AOT-compiled for a specific amdgcn ISA at build time. If the + wrong arch is baked in, HIP loads the fatbinary without complaint + but the kernels execute as silent no-ops at runtime — sort returns + input unchanged, AES match finds zero matches, plots look valid + but contain non-canonical proofs that won't qualify against real + challenges. 
`compose.yaml` defaults `ACPP_GFX` to a placeholder + string that AdaptiveCpp's HIP backend rejects loudly at build + time, so an unset value fails fast with the placeholder visible + in the error rather than silently using a default like `gfx1100`. + Common values (`rocminfo | grep gfx` to confirm yours): + + - `gfx1030` — RDNA2 Navi 21 (RX 6800 / 6800 XT / 6900 XT) + - `gfx1031` — RDNA2 Navi 22 (RX 6700 XT / 6700 / 6800M) + - `gfx1100` — RDNA3 Navi 31 (RX 7900 XTX / XT) + - `gfx1101` — RDNA3 Navi 32 (RX 7800 XT / 7700 XT) + +2. **Rootful `--privileged` for runs.** Rootless podman's default + seccomp filter + capability set blocks some of the KFD ioctls + `libhsa-runtime64` needs during DMA setup. Without them you get + a segfault deep inside the HSA runtime on the very first + host→device copy, even though `rocminfo` works fine. Builds don't + need GPU access and can stay rootless if you prefer. + +3. **`sudo` strips environment variables by default**, including + the `ACPP_GFX` you set in your shell. So a bare + `sudo podman compose build rocm` loses it. Either invoke the + build script (it sets the var inside the sudo'd shell where + compose can see it) or pass the var through explicitly. + +The recommended invocation pair, in order of how short each one is: + +```bash +# Build (autodetects ACPP_GFX from rocminfo — works under sudo too): +sudo ./scripts/build-container.sh + +# Run a single test plot at k=22: +sudo podman run --rm --privileged \ + --device /dev/kfd --device /dev/dri \ + -v $PWD/plots:/out xchplot2:rocm \ + test 22 2 0 0 -G -o /out + +# Run real plotting: +sudo podman run --rm --privileged \ + --device /dev/kfd --device /dev/dri \ + -v $PWD/plots:/out xchplot2:rocm \ + plot -k 28 -n 10 -f -c -o /out +``` + +If `sudo` doesn't carry `/opt/rocm/bin` on your distro and the build +script can't find `rocminfo`, fall back to one of: + +```bash +sudo -E ./scripts/build-container.sh # preserve your shell PATH +sudo ACPP_GFX=gfx1031 ./scripts/build-container.sh # explicit, no rocminfo needed +``` + +Or skip the script entirely: + +```bash +sudo ACPP_GFX=gfx1031 podman compose build rocm +``` + +For convenience, drop a wrapper at `~/.local/bin/xchplot2-amd`: + +```bash +#!/bin/bash +exec sudo podman run --rm --privileged \ + --device /dev/kfd --device /dev/dri \ + -v "$PWD/plots:/out" xchplot2:rocm "$@" +``` + +Then `xchplot2-amd plot -k 28 -n 10 -f ... -c ... -o /out` just works. + +### 2. Native install via `scripts/install-deps.sh` + +```bash +./scripts/install-deps.sh # auto-detects distro + GPU vendor +``` + +Installs the toolchain via the system package manager (Arch, Ubuntu / +Debian, Fedora) plus AdaptiveCpp from source into `/opt/adaptivecpp`. +GPU vendor is auto-detected: `nvidia-smi` / `rocminfo` first, +`/sys/class/drm` PCI IDs as fallback (so fresh installs without driver +tools still work). On a no-GPU host (CI / build box) the script +errors out — pass `--gpu nvidia` to install the toolchain anyway. +`--gpu amd` forces the AMD path on dual-vendor hosts. Intel detection +currently errors with a hint pointing at `--gpu nvidia` (the SYCL +toolchain JITs onto Intel via AdaptiveCpp's generic SSCP target) or +the container. Pass `--no-acpp` to skip the AdaptiveCpp build and +let CMake fall back to FetchContent. + +### 3. Manual / FetchContent fallback + +If you'd rather install dependencies yourself, the toolchain is: + +| Dep | Notes | +|---|---| +| **AdaptiveCpp 25.10+** | SYCL implementation. 
CMake auto-fetches it via FetchContent if `find_package(AdaptiveCpp)` fails — first build adds ~15-30 min. Disable with `-DXCHPLOT2_FETCH_ADAPTIVECPP=OFF` if you want a hard error. | +| **CUDA Toolkit 12+** (headers) | Required on **every** build path because AdaptiveCpp's `half.hpp` includes `cuda_fp16.h`. `nvcc` itself only runs when `XCHPLOT2_BUILD_CUDA=ON`. Default is vendor-aware — `ON` for NVIDIA GPUs, `OFF` for AMD / Intel GPUs (even if `nvcc` is installed), falling through to `nvcc`-presence only when no GPU is probed (CI / container). Override with the env var. | +| **LLVM / Clang ≥ 18** | `clang`, `lld` (AdaptiveCpp's CMake requires `ld.lld`), plus the libclang dev packages. `install-deps.sh` installs all of them; manual installs need to add `lld-18` (apt) / `lld` (dnf, pacman) explicitly. | +| **C++20 compiler** | clang ≥ 18 or gcc ≥ 13. | +| **CMake ≥ 3.24**, **Ninja**, **Python 3** | build tools. | +| **Boost.Context, libnuma, libomp** | AdaptiveCpp runtime deps. | +| **Rust toolchain** (stable) | for `keygen-rs` and `cargo install`. | + +`pos2-chip` and `FSE` are auto-fetched at CMake configure time +(`FetchContent`); override `-DPOS2_CHIP_DIR=/abs/path` for a local +checkout. + +For non-NVIDIA targets, the build also probes: +- **ROCm 6+** (`rocminfo`): if found, sets `ACPP_TARGETS=hip:gfxXXXX`. +- **Intel oneAPI** (Level Zero / compute-runtime): manual `ACPP_TARGETS`. ### `cargo install` ```bash -cargo install --git https://github.com/Chia-Network/xchplot2 -# or fat build: -CUDA_ARCHITECTURES="89;120" cargo install --git https://github.com/Chia-Network/xchplot2 +cargo install --git https://github.com/Jsewill/xchplot2 ``` -`build.rs` auto-detects the local GPU's compute capability via -`nvidia-smi` (falling back to `sm_89`). Override with `$CUDA_ARCHITECTURES`. +`build.rs` auto-detects the local GPU's compute capability by querying +`nvidia-smi --query-gpu=compute_cap` and builds for only that +architecture. That keeps the binary small and the build fast when the +install and the target GPU are the same machine. + +If auto-detection fails (no `nvidia-smi` in `PATH`, or +`nvidia-smi` can't see a GPU — common when building inside a container +or on a headless build host that lacks the CUDA driver), the build +falls back to `sm_89`. Note that arch-detect picks *which CUDA arch* — +*whether* CUDA TUs build at all is a separate vendor-aware decision +(see `XCHPLOT2_BUILD_CUDA` in [Environment variables](#environment-variables)). + +If you need to target a GPU that isn't the one doing the build — or if +you want a single "fat build" binary that covers multiple +architectures — override with `$CUDA_ARCHITECTURES`: + +```bash +# Fat build for Ada (4090) and Blackwell (5090): +CUDA_ARCHITECTURES="89;120" cargo install --git https://github.com/Jsewill/xchplot2 + +# Single target (e.g. Turing 2080 Ti): +CUDA_ARCHITECTURES=75 cargo install --git https://github.com/Jsewill/xchplot2 +``` + +Common values: `61` GTX 10-series, `70` Volta, `75` Turing, `80` A100, +`86` RTX 30-series, `89` RTX 40-series, `90` H100, `120` RTX 50-series. ### CMake (also builds the parity tests) @@ -50,6 +408,188 @@ Outputs: - `build/tools/xchplot2/xchplot2` - `build/tools/parity/{aes,xs,t1,t2,t3}_parity` — bit-exact CPU/GPU tests +### Windows + +Two supported paths — native `main` doesn't work because AdaptiveCpp +has hard Linux-isms (libnuma, pthreads, LLVM SSCP) that fall apart on +Windows. 
Jump to the relevant subsection below: + +- [Native Windows build (`cuda-only` branch)](#native-windows-build-cuda-only-branch) — recommended NVIDIA path. +- [Native Windows build — SYCL path (adventurous)](#native-windows-build--sycl-path-adventurous) — AMD/Intel/cross-vendor, untested. + +**NVIDIA only** → use the +[`cuda-only`](https://github.com/Jsewill/xchplot2/tree/cuda-only) +branch. Pure MSVC + CUDA Toolkit + Rust, no SYCL runtime involved. +See that branch's README for the VS 2022 / Windows SDK / `LIB` +troubleshooting (the `LNK1181: kernel32.lib` and friends). + +**AMD or Intel, or if you just want the `main` code path** → run +under **WSL2**. WSL2 is a full Linux environment, so every install +option in this README works there unchanged — `cargo install`, +`scripts/install-deps.sh`, or the container (section 1 above). +Enable WSL2 once with `wsl --install` in an elevated PowerShell. +GPU access in WSL2: + +- **NVIDIA**: install the latest "NVIDIA GPU Driver for Windows", + nothing else — CUDA shows up inside WSL2 automatically. +- **AMD**: ROCm 6.1+ supports a limited card list on WSL2 (RX 7900 + XTX, Radeon Pro W7900, specific Instincts). Follow AMD's "Install + ROCm on WSL" guide. +- **Intel**: oneAPI on WSL2 via the Intel Linux graphics driver. + +Once the GPU is visible from a WSL2 shell (`nvidia-smi`, `rocminfo`, +or `sycl-ls`), proceed with the native Linux instructions above. + +#### Native Windows build (cuda-only branch) + +Full walkthrough for the NVIDIA native path, repeated here so you +don't have to flip between READMEs. Prerequisites: + +- Windows 10 21H2+ or Windows 11, x64 +- [Visual Studio 2022](https://visualstudio.microsoft.com/) Community + with the **"Desktop development with C++"** workload. That workload + bundles MSVC + the Windows SDK; the SDK is non-optional because it + ships `kernel32.lib` / `user32.lib` / etc. that `link.exe` + consumes. If you've trimmed the installer to "C++ build tools" + only, open **Visual Studio Installer → Modify → Individual + components** and tick the latest **Windows 11 SDK** before + retrying. +- [CUDA Toolkit 12.0+](https://developer.nvidia.com/cuda-downloads) — + install **after** Visual Studio so the CUDA installer wires up the + MSBuild integration. 12.8+ required for RTX 50-series (Blackwell, + `sm_120`). +- [Rust](https://www.rust-lang.org/tools/install) using the MSVC + toolchain (`rustup default stable-x86_64-pc-windows-msvc`). +- [CMake 3.24+](https://cmake.org/download/) and [Git for + Windows](https://gitforwindows.org/). + +Launch the **x64 Native Tools Command Prompt for VS 2022** from the +Start menu — there are several similarly-named prompts (x86 / +x86_64 / 2019 / 2022); the one that matters is the x64 for 2022. +That prompt is the one that sets `LIB`, `INCLUDE`, and `PATH` so +`cl.exe`, `link.exe`, `nvcc`, and `cmake` all see each other plus +the Windows SDK. A plain `cmd` / PowerShell / Windows Terminal tab +does **not** do this — running `cargo install` from one of those +produces `LNK1181: cannot open input file 'kernel32.lib'` at the +first link step. + +Quick sanity check in the prompt: + +```cmd +where link.exe +echo %LIB% +``` + +`%LIB%` should include a `...\Windows Kits\10\Lib\...\um\x64` +entry. If it doesn't, you're in the wrong prompt or the Windows SDK +component isn't installed. 
+ +Build: + +```cmd +set CUDA_ARCHITECTURES=89 +cargo install --git https://github.com/Jsewill/xchplot2 --branch cuda-only +``` + +Or for a local checkout you can iterate on: + +```cmd +git clone -b cuda-only https://github.com/Jsewill/xchplot2 +cd xchplot2 +set CUDA_ARCHITECTURES=89 +cargo install --path . +``` + +Set `CUDA_ARCHITECTURES` to match your card (see the list above). +PowerShell users: use `$env:CUDA_ARCHITECTURES = "89"` instead of +`set`. The CMake path (`cmake -B build -S . && cmake --build build`) +also works inside the same Native Tools prompt if you prefer that +over `cargo install`. + +#### Native Windows build — SYCL path (adventurous) + +**Strongly recommend WSL2 first** (see the top of this section). +This subsection exists because the path is in principle buildable +on native Windows; in practice it's days of build-system tinkering +without hardware the maintainers can iterate on. Not validated by +us. File an issue with your findings. + +What you're signing up for: AdaptiveCpp, built from source on +Windows, pointed at either **AMD HIP SDK for Windows** (for AMD) or +the **CUDA Toolkit** (for NVIDIA through SYCL, if you want the +`main` branch's cross-vendor code path on NVIDIA instead of +`cuda-only`'s CUB one). xchplot2's CMake then finds that install +via `find_package(AdaptiveCpp)` and builds normally. AdaptiveCpp's +FetchContent fallback is **not** viable on native Windows — its own +CMakeLists assumes Linux-isms (libnuma, pthreads) that fall apart. +Pre-install is mandatory. + +Prerequisites (on top of the cuda-only prereqs above — MSVC, +Windows SDK, Rust, CMake, Git): + +- **LLVM 16–20** with Clang + LLD + the CMake development package + (`LLVMConfig.cmake` / `ClangConfig.cmake`). Version coverage of + Windows binary installers is patchy for these components; a + self-built LLVM is usually the path of least resistance. See + [AdaptiveCpp's Windows install guide](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/installing.md) + for the currently-recommended source. +- **AMD HIP SDK for Windows** (for the AMD target) from AMD's + [HIP SDK download page](https://www.amd.com/en/developer/rocm-hub/hip-sdk.html). + AMD officially flags it as preview: limited card list, different + device-library layout vs Linux ROCm, runtime coverage varies per + GPU. +- **CUDA Toolkit 12+** (for the NVIDIA-via-SYCL target). Same + installer as the `cuda-only` path above. + +Rough build sequence from a clean **x64 Native Tools Command Prompt +for VS 2022** (paths are indicative — match your installs): + +```cmd +:: 1. Build AdaptiveCpp +git clone --branch v25.10.0 https://github.com/AdaptiveCpp/AdaptiveCpp.git +cd AdaptiveCpp +cmake -B build -S . -G Ninja ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_INSTALL_PREFIX=C:\opt\adaptivecpp ^ + -DLLVM_DIR=C:\path\to\llvm\lib\cmake\llvm ^ + -DWITH_CUDA_BACKEND=OFF ^ + -DWITH_HIP_BACKEND=ON ^ + -DROCM_PATH="C:\Program Files\AMD\ROCm\6.1" +cmake --build build --parallel +cmake --install build + +:: 2. Build xchplot2 main against the install +cd \path\to\xchplot2 +:: CMAKE_PREFIX_PATH only needed if you installed AdaptiveCpp to a +:: non-default Windows path. The build's auto-discovery only covers +:: Linux's /opt/adaptivecpp — Windows users tell CMake explicitly. +set CMAKE_PREFIX_PATH=C:\opt\adaptivecpp +set ACPP_TARGETS=hip:gfx1101 +set XCHPLOT2_BUILD_CUDA=OFF +cargo install --path . +``` + +Flip `WITH_HIP_BACKEND` ↔ `WITH_CUDA_BACKEND` and set +`ACPP_TARGETS=cuda:sm_XX` for the NVIDIA-through-SYCL variant. 
+ +Failure modes you should expect to triage: + +- **Missing LLVM CMake modules** — source-built LLVM with + `LLVM_INSTALL_UTILS=ON` and the clang / clang-tools-extra + projects enabled is the reliable recipe. +- **Generic SSCP compiler disabled** (`DEFAULT_TARGETS` warning + during AdaptiveCpp configure) — harmless if you set + `ACPP_TARGETS=hip:gfxXXXX` explicitly at xchplot2's configure. +- **`ROCM_PATH` mismatch** — AMD's Windows installer versions the + directory (`C:\Program Files\AMD\ROCm\6.1\`); match it exactly. +- **Clean build, runtime kernel failures** — the HIP SDK for + Windows preview doesn't cover every GPU the Linux ROCm path + does. Run `scripts/test-multi-gpu.sh` / `xchplot2 test 22 ...` + with a k=22 plot first and `xchplot2 verify` the result before + committing a large batch. + +Seriously, try WSL2 first. + ## Use ### Standalone (farmable plots) @@ -65,6 +605,16 @@ Pool variants: `-p ` or `--pool-ph `. Other common flags: `-s `, `-T` testnet, `-S ` for reproducible runs, `-v` verbose. Full help: `xchplot2 -h`. +For long batches, `--skip-existing` skips plots whose output file is +already a complete `.plot2` (magic bytes + non-trivial size), and +`--continue-on-error` logs per-plot failures and keeps going instead of +aborting the whole run. Both flags work for `plot` and `batch` modes. + +Plots are written to `.plot2.partial` and atomically renamed on +completion, so a crash / `SIGINT` / `ENOSPC` mid-write never leaves a +malformed plot at the destination. A first `Ctrl-C` asks the plotter to +finish the plot in flight and stop; a second hard-kills. + #### Grouping plots: `-i ` and `-g ` Both are v2 PoS fields and default to 0. @@ -84,13 +634,171 @@ decisions. When the grouped layout lands, the auto-incrementing `` above is the per-plot within-group identifier it will expect. +#### Multi-device: `--devices` and `--cpu` + +`xchplot2 devices` prints id, name, backend, VRAM, compute-unit count, +and which sort path each device will use (CUB on cuda-backend devices +when this build links CUB, SortSycl otherwise) — the printed `[N]` +index is the value `--devices N` accepts: + +``` +$ xchplot2 devices +Visible devices (2 GPU + 1 CPU): + [0] NVIDIA GeForce RTX 4090 backend=cuda vram=24076 MB CUs=128 sort:CUB + [1] AMD Radeon Pro W5700 backend=hip vram= 8176 MB CUs=36 sort:SYCL + [cpu] Host CPU plotter backend=omp threads=32 sort:SYCL (1-2 orders slower than GPU) + +Use `--devices N` (id) for a specific GPU, + `--devices gpu` for every GPU, + `--devices cpu` for the host CPU only, + `--devices all` for every GPU + CPU, + or any comma combination (e.g. `0,2,cpu`). +``` + +Both `plot` and `batch` accept `--devices ` to fan plots out +across multiple devices — one worker thread per device, each with its +own buffer pool and writer channel. Plots are partitioned round-robin, +so a batch of 10 plots on 2 GPUs sends plots 0/2/4/6/8 to the first +GPU and 1/3/5/7/9 to the second. + +```bash +# Every visible GPU — enumerated at runtime. No CPU worker. +xchplot2 plot --k 28 --num 10 -f -c \ + --out /mnt/plots --devices gpu + +# Every visible GPU PLUS a CPU worker on the same batch. +xchplot2 plot ... --devices all + +# Only these specific GPU ids (sorted, deduplicated). +xchplot2 plot ... --devices 0,2,3 + +# Explicit single id (same as omitting the flag on a single-GPU host). +xchplot2 plot ... --devices 0 + +# CPU-only: AdaptiveCpp OpenMP backend (slow). Use the `cpu` token in +# --devices, or the standalone --cpu flag (equivalent on its own). +xchplot2 plot ... 
--devices cpu +xchplot2 plot ... --cpu + +# Mix tokens: specific GPUs + CPU. +xchplot2 plot ... --devices 0,1,cpu +``` + +CPU plotting is **1-2 orders of magnitude slower than GPU** — meant for +GPU-less hosts, headless CI, or as an extra background worker. Don't +expect GPU-grade throughput from a CPU worker on a heterogeneous batch. + +Omitted flag = single device via the default SYCL / CUDA selector — +identical to pre-multi-GPU behavior, zero regression risk. + +**Caveats for v1:** + +- Static round-robin partition. If your GPUs differ in speed the + batch finishes only as fast as the slowest worker's slice; use + `--devices` to pick matched cards when that matters. +- Each worker gets its own ~4 GB pinned host pool, so host RAM scales + linearly. A 4-GPU rig pins ~16 GB — size accordingly. +- The workers share `stderr` (line-buffered, atomic per-`fprintf`) so + log lines from different GPUs may interleave. Fine for progress, + not for parsing. + +Smoke test: `scripts/test-multi-gpu.sh` exercises argument parsing +(works on any host, even single-GPU) and, when 2+ GPUs are visible, +runs a live k=22 plot across `--devices 0,1`. + ### Lower-level subcommands ```bash -xchplot2 test [strength] ... # single plot, raw inputs -xchplot2 batch [-v] # batched, raw inputs +xchplot2 test [strength] ... # single plot, raw inputs +xchplot2 batch [-v] [--skip-existing] [--continue-on-error] + [--devices ] +xchplot2 verify [--trials N] # run N random challenges +xchplot2 parity-check [--dir PATH] # CPU↔GPU regression screen ``` +`verify` opens a `.plot2` through pos2-chip's CPU prover and runs N +(default 100) random challenges. Zero proofs across a reasonable sample +strongly indicates a corrupt plot; the command exits non-zero in that +case. Intended as a quick sanity check before farming a newly built +batch — not a replacement for `chia plots check`. + +`parity-check` execs every `*_parity` binary in `--dir` (default +`./build/tools/parity`) and summarizes PASS/FAIL with per-test wall +time. Use after a refactor or driver update to confirm CPU↔GPU +agreement is still bit-exact across `aes` / `xs` / `t1` / `t2` / `t3` / +`plot_file`. Requires `cmake --build` to have produced the parity +binaries first. + +## Troubleshooting + +- **Listing visible GPUs**: `xchplot2 devices` prints id, name, backend, + VRAM, compute-unit count, and which sort path each device will use + (CUB on cuda-backend devices when this build links CUB; SortSycl + otherwise). Use the printed `[N]` index with `--devices N` for + `plot` / `batch`. + +- **Hybrid hosts (NVIDIA + AMD/Intel on the same box)**: a single + binary handles all visible GPUs. `xchplot2 plot --devices gpu` + spawns a worker per GPU (use `--devices all` to also add a CPU + worker); each worker picks the right sort backend at queue + construction (CUB on NVIDIA, hand-rolled SYCL radix on AMD/Intel) + via the runtime dispatcher in `SortDispatch.cpp`. No rebuild + required to add a second-vendor card. + +- **`[AdaptiveCpp Warning] [backend_loader] Could not load library: + /opt/adaptivecpp/lib/hipSYCL/librt-backend-cuda.so (libcudart.so.11.0: + cannot open shared object file)`**: cosmetic only — AdaptiveCpp + built with CUDA backend support but no CUDA runtime to load. Happens + when AdaptiveCpp was installed out-of-band rather than via + `scripts/install-deps.sh --gpu amd` (which sets + `-DCMAKE_DISABLE_FIND_PACKAGE_CUDA=TRUE`). To suppress without a + rebuild: `export ACPP_VISIBILITY_MASK=hip;omp` so AdaptiveCpp skips + the CUDA backend probe entirely. 
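+
+  A minimal sketch of that suppression (quote the value; an unquoted
+  `hip;omp` ends the shell command at the semicolon):
+
+  ```bash
+  export ACPP_VISIBILITY_MASK="hip;omp"   # load only the HIP + OpenMP backends
+  xchplot2 devices                        # the backend_loader warning should no longer appear
+  ```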
+ +- **`T1 match produced 0 entries`** on RDNA1 (`gfx1010` / `gfx1011` / + `gfx1012`, including the Radeon Pro W5700 / RX 5700 XT). The + community `gfx1013` AOT-spoof default was observed to silently + compile no-op kernel stubs on at least one W5700 + ROCm 6 + + AdaptiveCpp 25.10 host. Default flipped to `ACPP_TARGETS=generic` + (SSCP JIT) in recent main; `cargo install --force` past commit + `d939ee8` restores correct behavior. To restore the old spoof, + `XCHPLOT2_FORCE_GFX_SPOOF=1 cargo install ...`. The startup self- + test in `SyclBackend::queue()` catches the no-op-kernel case at + queue construction with a clear exception, so this surfaces + immediately rather than as empty pipeline output minutes in. + +- **`CUB ... invalid argument`** mid-pipeline, or + **`sycl_backend::queue: device id 0 out of range (found 0 usable + GPU device(s))`** with `--devices N` while the default selector + finds a GPU: pre-`762fde2` symptoms of CUB-only sort being + dispatched against an AMD/Intel device (or being filtered out of + the device list). The runtime sort dispatcher fixes both — `git + pull && cargo install --path . --force` to upgrade. + +- **Deep-pipeline diagnostics**: set `POS2GPU_T1_DEBUG=1` for verbose + per-stage dumps (Xs gen / sort intermediates, T1 match input/output + samples, AES T-table sanity). Useful when the symptom isn't on the + list above and you want to localize where the data goes wrong. + +## Environment variables + +| Variable | Effect | +|-------------------------------|-------------------------------------------------------------------------| +| `XCHPLOT2_BUILD_CUDA=ON\|OFF` | Override the build-time CUB / nvcc-TU switch. Default is vendor-aware (NVIDIA → ON; AMD / Intel → OFF; no GPU → `nvcc`-presence). Force `OFF` on dual-toolchain hosts (CUDA + ROCm) where you want the SYCL-only build. | +| `XCHPLOT2_STREAMING=1` | Force the low-VRAM streaming pipeline even when the pool would fit. | +| `XCHPLOT2_STREAMING_TIER=plain\|compact\|minimal` | Override the streaming-tier auto-pick (plain = ~7.3 GB peak, no parks; compact = ~5.2 GB peak, full parks + N=2 T2 match tiling; minimal = ~3.76 GB peak with full host-pinned slicing of T1/T3 match + tiled CUB outputs in all sort phases + tiled Xs gen/sort/pack — targets 5 GiB+ cards). Equivalent CLI flag: `--tier`. | +| `POS2GPU_MAX_VRAM_MB=N` | Cap the pool/streaming VRAM query to N MB (exercise streaming fallback).| +| `POS2GPU_STREAMING_STATS=1` | Log every streaming-path `malloc_device` / `free`. | +| `POS2GPU_POOL_DEBUG=1` | Log pool allocation sizes at construction. | +| `POS2GPU_PHASE_TIMING=1` | Per-phase wall-time breakdown (Xs / sort / T1 / T2 / T3) on stderr. | +| `ACPP_GFX=gfxXXXX` | AMD only — required at **build** time; sets AOT target for amdgcn ISA. | +| `ACPP_TARGETS=...` | Override AdaptiveCpp target selection (defaults: NVIDIA `generic`, AMD `hip:$ACPP_GFX`). | +| `CUDA_ARCHITECTURES=sm_XX` | Override the CUDA arch autodetected from `nvidia-smi`. | +| `CUDA_PATH=/path/to/cuda` | Override the CUDA Toolkit root for linking (default: `/opt/cuda`, `/usr/local/cuda`). Useful on JetPack / non-standard installs. | +| `CUDA_HOME=/path/to/cuda` | Fallback for `CUDA_PATH` — same effect. | +| `POS2_CHIP_DIR=/path` | Build-time: point at a local pos2-chip checkout instead of FetchContent.| +| `XCHPLOT2_TEST_GPU_COUNT=N` | Override `scripts/test-multi-gpu.sh`'s auto-detected GPU count (forces run / skip without consulting `nvidia-smi`). 
| + ## Testing farming on a testnet v2 (CHIP-48) farming in stock chia-blockchain is presently unfinished @@ -115,9 +823,13 @@ pieces any v2 plot needs for farming, regardless of who produced it. ## Architecture ``` -src/gpu/ CUDA kernels — AES, Xs, T1, T2, T3 +src/gpu/ GPU kernels — AES, Xs, T1, T2, T3. + CUDA path: .cu files via nvcc + CUB sort. + SYCL path: matching .cpp files via + AdaptiveCpp + hand-rolled LSD radix. src/host/ -├── GpuPipeline Xs → T1 → T2 → T3 device orchestration +├── GpuPipeline Xs → T1 → T2 → T3 device orchestration; +│ pool + streaming (low-VRAM) variants ├── GpuBufferPool persistent device + 2× pinned host pool ├── BatchPlotter producer / consumer batch driver └── PlotFileWriterParallel sole TU touching pos2-chip headers @@ -128,13 +840,153 @@ keygen-rs/ Rust staticlib: plot_id_v2, BLS HD, bech32m ## VRAM -PoS2 plots are k=28 by spec; the persistent buffer pool needs **~15 GB -of device VRAM**, so a 16 GB+ card is required (RTX 4080 / 4090 / -5080 / 5090, A6000, etc.). `xchplot2` queries `cudaMemGetInfo` at -startup and refuses with an actionable error if the pool won't fit. +PoS2 plots are k=28 by spec. Four code paths, dispatched automatically +based on available VRAM at batch start: + +- **Pool path (~11 GB device + ~4 GB pinned host; 12 GB+ cards + reliably).** The persistent buffer pool is sized worst-case and + reused across plots in `batch` mode for amortised allocator cost and + double-buffered D2H. Xs sort's keys_a slot aliases d_storage tail + (idle during Xs gen+sort), trimming pair_b's worst case from + `max(cap·12, 4·N·u32 + cub)` to `max(cap·12, 3·N·u32 + cub)` — + saves ~1 GiB at k=28. Targets: RTX 4090 / 5090, A6000, H100, + RTX 4080 (16 GB), and 12 GB cards like RTX 3060 / RX 6700 XT. +- **Plain streaming (~7.3 GB peak + 128 MB margin; ≥ 7.42 GiB free at + k=28).** Allocates per-phase and frees between phases, but keeps + large intermediates (`d_t1_meta`, `d_t1_keys_merged`, `d_t2_meta`, + `d_t2_xbits`, `d_t2_keys_merged`) alive across their idle windows + instead of parking them on pinned host. T2 match runs as a single + full-cap pass (N=1). Used on 10-11 GB cards that can't fit the pool + but have headroom above the compact floor. ~400 ms/plot faster than + compact at k=28 because there are no park/rehydrate PCIe round-trips. +- **Compact streaming (~5.2 GB peak + 128 MB margin; ≥ 5.33 GiB free + at k=28).** All three match phases (T1/T2/T3) are tiled N=2 across + disjoint bucket ranges with half-cap device staging and + D2H-to-pinned-host between passes. T1 + T2 sorts are tiled (N=2 and + N=4) with merge trees, and `d_t1_meta`, `d_t2_meta`, and the + `*_keys_merged` buffers are parked on pinned host across their + sort phases and JIT-H2D'd only for the next consumer. Xs is inlined + as gen → sort → pack with separate-allocation scratch so keys_a + + vals_a can be freed right after CUB sort. Peak at k=28 is + **5200 MB** (measured on sm_89); per-phase live maxes: + + | Phase | Peak (MB) | + |-----------|----------:| + | Xs | 4128 | + | T1 match | 5168 | + | T1 sort | 5200 | + | T2 match | 5200 | + | T2 sort | 5200 | + | T3 match | 5200 | + | T3 sort | 4228 | + + A BatchPlotter preflight rejects cards reporting less than + `streaming_peak_bytes(k) + 128 MB` free before any queue work, so + mid-pipeline OOM is impossible on supported configurations. 
+ Practical targets: 6 GB cards on the edge (card-dependent; RTX 2060 + typically has ~5.5 GiB free which has ~170 MB slack over the + 5328 MB requirement), 8 GB cards comfortable, 10 GB and up ample. + Log the full alloc trace with `POS2GPU_STREAMING_STATS=1`. +- **Minimal streaming (~3.76 GB peak + 128 MB margin; ≥ 3.80 GiB free + at k=28).** Layered cuts on top of compact: + - **N=8 T2 match staging.** cap/8 ≈ 570 MB vs compact's cap/2 + ≈ 2280 MB — saves ~1.5 GB on the T2-match peak. + - **Tiled gathers in T1 sort + T2 sort meta + T2 sort xbits.** + Each gather output produced in N=4 tiles, D2H'd to host pinned + (reusing the existing parking buffers) one tile at a time, then + rebuilt on device after the cap-sized inputs are freed. Drops + each gather peak from 5200 MB → ~3640 MB. + - **Sliced T1 match.** N passes (one per section_l) emit to a + cap/N device staging pair, D2H per pass to host pinned. d_xs + (2048 MB at k=28) no longer co-resides with full-cap d_t1_meta + + d_t1_mi → T1-match peak drops from 5168 MB → 3023 MB. + - **Sliced T3 match.** d_t2_meta_sorted parked on host across + T3 match; per pass H2Ds the (section_l, section_r) row slices + onto a small device buffer pair. d_t2_xbits_sorted + + d_t2_keys_merged remain full-cap on device for binary-search / + target reads. T3-match peak: 5200 MB → 3754 MB. + - **Per-tile CUB outputs in T1/T2/T3 sort sub-phases.** T1 and T2 + sort use cap/2 / cap/4 device output buffers respectively, D2H + per tile to USM-host accumulators, with the existing 2-way merge + kernel reading USM-host inputs. T2 additionally parks AB / CD + intermediates to host between tree steps so the final merge + sees only its own outputs. T3 sort uses cap/2 tile + host-side + `std::inplace_merge`. CUB sub-phase peaks: 4170-4228 MB → + 3155-3640 MB. + - **Tiled Xs gen+sort+pack.** N=2 position halves through cap/2 + ping-pong buffers + USM-host accumulator + 2-way merge, then + pack runs in cap/2 halves with D2H per tile to a host-pinned + `XsCandidateGpu` accumulator (final d_xs rehydrated H2D). + Xs phase peak: 4128 MB → 3072 MB. + + Bottleneck after all six cuts is the T3 match phase at 3754 MB. + Targets 5 GiB+ cards comfortably (RTX 2060, RX 6600 XT, RX 7600 + with ~1.7+ GiB headroom). 4 GiB cards (GTX 1050 Ti / 1650, RTX 3050 + 4GB, MX450) are an edge case — real 4 GiB physical hardware + reports ~3.5 GiB free post-CUDA-context, just under the 3.80 GiB + required floor. Trade-off: ~6 extra cap-sized PCIe round-trips per + plot push k=28 wall on sm_89 from ~13 s/plot (compact) to ~34 + s/plot (minimal). There is no smaller tier — a forced minimal on a + card below the floor throws rather than falling further. + +At pool construction `xchplot2` queries `cudaMemGetInfo` on the +CUDA-only build, or `global_mem_size` (device total) on the SYCL +path — SYCL has no portable free-memory query, so the check +effectively approximates "free == total" and lets the actual +`malloc_device` failure trigger the fallback. If the pool doesn't +fit, the streaming-tier dispatch picks the largest tier that fits +with the 128 MB margin: plain if free ≥ 7.42 GiB, else compact if +free ≥ 5.33 GiB, else minimal. `XCHPLOT2_STREAMING=1` forces +streaming even when the pool would fit; `--tier +plain|compact|minimal` (or `XCHPLOT2_STREAMING_TIER`) overrides the +auto-pick. Forced plain or compact below their floor warns and +proceeds (caller's risk); forced minimal below its floor throws +because there is no smaller tier to fall back to. 
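+
+For a quick hands-on check of those overrides (a sketch; the k=22
+`test` invocation mirrors the one in the AMD container section, and
+the output path and the capped size are illustrative):
+
+```bash
+# Force the compact tier even on a card where the pool would fit:
+XCHPLOT2_STREAMING=1 XCHPLOT2_STREAMING_TIER=compact \
+    xchplot2 test 22 2 0 0 -G -o /tmp/plots
+
+# Cap the VRAM query to exercise the streaming fallback (which tier the
+# auto-pick lands on depends on k and the cap) and log every
+# streaming-path allocation while it runs:
+POS2GPU_MAX_VRAM_MB=4096 POS2GPU_STREAMING_STATS=1 \
+    xchplot2 test 22 2 0 0 -G -o /tmp/plots
+```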
+ +Plot output is bit-identical across all four paths — streaming +reorganises memory, not algorithms. Verified at k=22 with md5sum +across pool / plain / compact / minimal. + +## Performance + +k=28, strength=2, RTX 4090 (sm_89), PCIe Gen4 x16. Steady-state per-plot +wall from `xchplot2 batch` (10-plot manifest, mean): + +| Build | Per plot | Notes | +|---|---|---| +| pos2-chip CPU baseline | ~50 s | reference | +| `cuda-only` branch | **2.15 s** | original CUDA-only path | +| `main`, `XCHPLOT2_BUILD_CUDA=ON` (CUB sort) | 2.41 s | NVIDIA fast path on the SYCL/AdaptiveCpp port | +| `main`, `XCHPLOT2_BUILD_CUDA=OFF` (hand-rolled SYCL radix) | 3.79 s | cross-vendor fallback (AMD/Intel) on AdaptiveCpp | +| plain streaming tier (10-11 GB cards) | ~5.7 s | no parks, single-pass T2 match; ~400 ms/plot faster than compact | +| compact streaming tier (6-8 GB cards) | ~7.3 s | full parks + N=2 T2 match | +| minimal streaming tier (4 GiB cards) | TBD | full parks + N=8 T2 match; smallest peak (~3.7 GB) | +| `main` on RX 6700 XT (gfx1031 / ROCm 6.2 / AdaptiveCpp HIP) | **9.97 s** | AMD batch steady-state at k=28; T-table AES near-optimal on RDNA2 via this compiler stack | + +The `main`/CUB row is +12% over `cuda-only` from extra AdaptiveCpp +scheduling overhead. The SYCL row is +57% over CUB on the same NVIDIA +hardware; ~88% of GPU compute is identical between the two paths +(`nsys` per-kernel breakdown), and the gap is dominated by host-side +runtime overhead in AdaptiveCpp's DAG manager rather than kernel +performance. AMD and Intel runtimes are untested; expect roughly the +SYCL-row latency adjusted for relative GPU throughput. + +Numbers above are single-GPU. With `--devices 0,1,...` the batch is +partitioned round-robin across N worker threads (one per device), so +wall-clock throughput is bounded by the slowest device's slice — +≈ linear scaling on matched cards, less if cards differ in speed. +Live multi-GPU plots were confirmed end-to-end on NVIDIA; per-device +numbers will vary with PCIe bandwidth sharing on the host root +complex. ## License MIT — see [LICENSE](LICENSE) and [NOTICE](NOTICE) for third-party attributions. Built collaboratively with [Claude](https://claude.ai/code). + +## Like this? Send a coin my way! + +If you appreciate this, and want to give back, feel free. + +xch1d80tfje65xy97fpxg7kl89wugnd6svlv5uag2qays0um5ay5sn0qz8vph8 diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..1b5fc68 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,52 @@ +# Security Policy + +## Reporting a vulnerability + +Email **abraham.sewill@proton.me** with a description of the issue and +steps to reproduce. Please do not open a public GitHub issue for +security-sensitive reports. + +## Scope — what counts for a plotter + +xchplot2 is a client-side plot builder. It handles: + +- Farmer and pool public keys provided on the command line. +- Optional `--seed` entropy that derives per-plot subseeds; a weak + or reused seed lets an attacker who observes plot IDs correlate + plots to the same master key. +- BLS key parsing via the + [`chia` Rust crate](https://crates.io/crates/chia) through + `keygen-rs`. +- Large file writes into caller-supplied output directories. + +Relevant threat model items we want to hear about: + +- **Key handling:** any path where farmer/pool key bytes or the + master seed leak into logs, temporary files, crash dumps, or + the plot file itself beyond the documented memo payload. 
+- **File-path handling:** any way a crafted `-o` / `out_dir` / memo + string escapes the intended output directory or overwrites files + outside it (path traversal, symlink races). The atomic + `.partial` + rename is safe by design; report if you can break it. +- **Manifest parsing:** malformed `batch` manifests that cause + out-of-bounds reads, arbitrary allocation, or unchecked sign + conversion. +- **Build-time supply chain:** tampering paths in + `scripts/install-deps.sh`, `Containerfile`, `compose.yaml`, or + the FetchContent targets (pos2-chip, AdaptiveCpp). + +## Explicitly out of scope + +- Proof-of-space soundness and the v2 PoS algorithm itself — + report those upstream in + [`pos2-chip`](https://github.com/Chia-Network/pos2-chip). +- Consensus, farming, or wallet behavior — those belong in + [`chia-blockchain`](https://github.com/Chia-Network/chia-blockchain) + and [`chia_rs`](https://github.com/Chia-Network/chia_rs). +- Performance regressions on exotic GPUs — file as a normal bug. + +## Response + +Acknowledgement within a week. Fixes for in-scope issues land on +`main` (and the `cuda-only` branch if applicable) with credit in the +commit message unless you prefer otherwise. diff --git a/_typos.toml b/_typos.toml new file mode 100644 index 0000000..d82642d --- /dev/null +++ b/_typos.toml @@ -0,0 +1,17 @@ +# _typos.toml — domain-specific allowlist for xchplot2. +# +# typos' default dictionary flags a handful of proper nouns and +# CUDA / SYCL intrinsic names that only LOOK like misspellings. The +# risk of one of these coincidentally being a real typo elsewhere in +# the tree is low, so allowlist them globally rather than per-file. + +[default.extend-words] +# AMD ROCm "Heterogeneous System Architecture" runtime. +HSA = "HSA" +# SYCL kernel range / index types: nd_range, nd_item. +nd = "nd" +# CUDA half-precision intrinsics: __hge ("greater-or-equal"), +# __hgt, __hle, __hlt; AdaptiveCpp's libkernel/half.hpp aliases. +hge = "hge" +# Yann Collet, author of LZ4 / zstd, attributed in NOTICE. +Collet = "Collet" diff --git a/build.rs b/build.rs index 6111517..319c082 100644 --- a/build.rs +++ b/build.rs @@ -36,6 +36,348 @@ fn detect_cuda_arch() -> Option { Some(arch.to_string()) } +/// Same probe as `detect_cuda_arch`, but filters out NVIDIA GPUs +/// below our README-documented minimum compute capability (sm_50, +/// Maxwell first-gen / GTX 750-class). The floor used to be sm_61 on +/// the assumption that AdaptiveCpp's `half.hpp` referenced FP16 +/// intrinsics (`__hadd` / `__hsub` / `__hmul` / `__hdiv` / `__hlt` / +/// `__hgt`) only available on sm_53+ — but those intrinsics are +/// *implemented* in `cuda_fp16.hpp` via `NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, …)` +/// with a fp32 emulation fallback for pre-sm_53 cards. CUDA 12.x +/// toolkits compile cleanly for sm_50/52/53. The real floor is the +/// toolkit's own codegen support: CUDA 12.x supports sm_50-90+, +/// CUDA 13.x dropped sm_50-72 (CMakeLists' nvcc-vs-arch preflight +/// catches that pairing with a FATAL_ERROR + fix block). +/// +/// Returns Some(arch) only when nvidia-smi reports a card at or +/// above our minimum; emits a cargo:warning and returns None +/// otherwise so callers fall through to the AMD / Intel detection. +fn usable_nvidia_arch() -> Option { + let arch = detect_cuda_arch()?; + let n: u32 = arch.parse().ok()?; + if n < 50 { + println!( + "cargo:warning=xchplot2: nvidia-smi detected sm_{arch} — below our \ + minimum supported compute capability (sm_50 / Maxwell). 
CUDA 11.x \ + was the last toolkit to compile for Kepler (sm_30-37); we don't \ + support that path. Ignoring NVIDIA for default targeting; if \ + this card is your only GPU, force the build with \ + CUDA_ARCHITECTURES={arch} + XCHPLOT2_BUILD_CUDA=ON and an \ + appropriately-old CUDA toolkit, or fall back to \ + ACPP_TARGETS=omp for AdaptiveCpp's CPU OpenMP backend."); + return None; + } + if n < 75 && detect_nvcc_major().map(|m| m >= 13).unwrap_or(false) { + println!( + "cargo:warning=xchplot2: nvidia-smi detected sm_{arch} (Maxwell / \ + Pascal / Volta) but nvcc is CUDA 13.x, which dropped codegen \ + for sm_50-72. Ignoring NVIDIA for default targeting; install \ + CUDA 12.9 (last toolkit with Maxwell-Volta support) and re-run, \ + or use scripts/build-container.sh which auto-pins the right \ + base image. CMakeLists' preflight will FATAL_ERROR with the \ + exact remediation if you force-build anyway."); + return None; + } + Some(arch) +} + +/// Check whether nvcc is on $PATH and runnable. Used as the fall-back +/// signal for XCHPLOT2_BUILD_CUDA when no GPU is enumerable (headless +/// CI / container builds). Runs `nvcc --version` rather than a simple +/// PATH lookup so stale symlinks don't pass. +fn detect_nvcc() -> bool { + Command::new("nvcc") + .arg("--version") + .output() + .map(|o| o.status.success()) + .unwrap_or(false) +} + +/// Parse nvcc's major version from `nvcc --version` output. +/// The release line looks like: +/// "Cuda compilation tools, release 13.0, V13.0.48" +/// Returns None if nvcc isn't on PATH or the line can't be parsed — +/// callers treat that as "skip the version-vs-arch compat check" +/// rather than blocking the build. +fn detect_nvcc_major() -> Option { + let out = Command::new("nvcc").arg("--version").output().ok()?; + if !out.status.success() { return None; } + let s = std::str::from_utf8(&out.stdout).ok()?; + for line in s.lines() { + let mut iter = line.split_whitespace(); + while let Some(w) = iter.next() { + if w == "release" { + let next = iter.next()?; // "13.0," + let major = next.trim_end_matches(',').split('.').next()?; + return major.parse().ok(); + } + } + } + None +} + +/// Minimum integer arch from a CMake-style CUDA_ARCHITECTURES list +/// ("61", "61;86", "61;86;120"). Tolerates "sm_61" / "compute_61" +/// prefixes that Cargo users sometimes pass through. Returns None +/// when the list parses to nothing. +fn min_arch(arch_list: &str) -> Option { + arch_list.split(';') + .filter_map(|s| { + let s = s.trim() + .trim_start_matches("sm_") + .trim_start_matches("compute_"); + s.parse().ok() + }) + .min() +} + +/// Probe /sys/class/drm for a display-class PCI device with Intel's +/// vendor ID (0x8086). Used as a heuristic to default +/// XCHPLOT2_BUILD_CUDA=OFF on Intel hosts, mirroring what rocminfo +/// already does for AMD. Returns false on non-Linux or when the sysfs +/// path isn't accessible — callers fall back to the next signal. +fn detect_intel_gpu() -> bool { + let entries = match std::fs::read_dir("/sys/class/drm") { + Ok(d) => d, + Err(_) => return false, + }; + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + // Skip connector nodes like card0-DP-1; we only want the card itself. + if !name.starts_with("card") || name.contains('-') { + continue; + } + let vendor = entry.path().join("device/vendor"); + if let Ok(v) = std::fs::read_to_string(&vendor) { + if v.trim() == "0x8086" { + return true; + } + } + } + false +} + +/// Does the host have any AMD GPU detectable by rocminfo? 
Independent +/// of which ACPP_TARGETS string we'd pick for it — `detect_amd_gfx` may +/// return None for AMD cards we choose to route through SSCP (RDNA1 +/// default), but the GPU is still present and BUILD_CUDA detection +/// should still see it as "AMD host, skip CUDA TUs". +/// +/// Falls back to /sys/class/drm vendor-ID probe (0x1002) when rocminfo +/// isn't on $PATH at build time. That happens reliably when users +/// install ROCm via /opt/rocm/bin without sourcing /etc/profile.d/rocm.sh +/// in the shell that runs `cargo install`, or run `cargo install` under +/// systemd / sudo / chroot where the parent shell's PATH is stripped. +/// Without the fallback the BUILD_CUDA selector falls through to the +/// `nvcc present → ON, "CI fallback"` arm, the build links CUB, and the +/// streaming pipeline dies on first sort dispatch against the AMD card. +fn amd_gpu_present() -> bool { + if let Ok(out) = Command::new("rocminfo").output() { + if out.status.success() { + if let Ok(s) = std::str::from_utf8(&out.stdout) { + if s.lines().any(|l| { + l.trim().strip_prefix("Name:") + .map(|rest| rest.trim().starts_with("gfx")) + .unwrap_or(false) + }) { + return true; + } + } + } + } + // PCI fallback — same pattern as detect_intel_gpu(). Doesn't need any + // user-space tools, only readable sysfs (true on every Linux host + // with the amdgpu / radeon kernel module loaded). + let entries = match std::fs::read_dir("/sys/class/drm") { + Ok(d) => d, + Err(_) => return false, + }; + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if !name.starts_with("card") || name.contains('-') { + continue; + } + let vendor = entry.path().join("device/vendor"); + if let Ok(v) = std::fs::read_to_string(&vendor) { + if v.trim() == "0x1002" { + return true; + } + } + } + false +} + +/// Ask `rocminfo` for the first AMD GPU's architecture, e.g. "gfx1100" for +/// an RX 7900 XTX. Returns None when rocminfo is missing or there's no AMD +/// GPU, AND ALSO when we deliberately want the caller to fall through to +/// ACPP_TARGETS=generic (currently for RDNA1 gfx1010/1011/1012). Use +/// amd_gpu_present() to distinguish "no AMD GPU at all" from "AMD GPU +/// present but routed through generic SSCP". +fn detect_amd_gfx() -> Option { + let out = Command::new("rocminfo").output().ok()?; + if !out.status.success() { + return None; + } + let s = std::str::from_utf8(&out.stdout).ok()?; + for line in s.lines() { + if let Some(rest) = line.trim().strip_prefix("Name:") { + let name = rest.trim(); + if name.starts_with("gfx") { + // RDNA1 (gfx1010/1011/1012) isn't a direct AdaptiveCpp + // HIP AOT target. We previously defaulted to a community + // workaround that AOT-compiled for gfx1013 (close-ISA), + // but it has been observed to silently produce no-op + // kernels on at least one W5700 / ROCm 6 / AdaptiveCpp + // 25.10 setup — every kernel dispatch completes without + // writing, surfacing far downstream as "T1 match + // produced 0 entries". A separate-build experiment on + // the same host with ACPP_TARGETS=generic (SSCP JIT) + // dispatched and produced correct output through k=24. + // + // Default for RDNA1 is now ACPP_TARGETS=generic (signal + // by returning None — caller's None branch picks + // generic). 
Two opt-in escape hatches preserved for + // users who've validated their stack on the legacy + // path: + // XCHPLOT2_FORCE_GFX_SPOOF=1 — gfx1013 AOT spoof + // XCHPLOT2_NO_GFX_SPOOF=1 — native gfx1010 AOT + // (may fail to compile + // if AdaptiveCpp doesn't + // advertise it as a HIP + // target). + let spoofed = match name { + "gfx1010" | "gfx1011" | "gfx1012" => { + let force_spoof = env::var("XCHPLOT2_FORCE_GFX_SPOOF") + .map(|v| !v.is_empty() && v != "0") + .unwrap_or(false); + let no_spoof = env::var("XCHPLOT2_NO_GFX_SPOOF") + .map(|v| !v.is_empty() && v != "0") + .unwrap_or(false); + if force_spoof { + println!( + "cargo:warning=xchplot2: RDNA1 {name} detected, \ + XCHPLOT2_FORCE_GFX_SPOOF set — building for \ + gfx1013 (legacy community workaround). The \ + default switched to ACPP_TARGETS=generic (SSCP \ + JIT) after the spoof was observed to silently \ + produce no-op kernels on some W5700 setups; \ + unset XCHPLOT2_FORCE_GFX_SPOOF if your plots \ + fail with 'T1 match produced 0 entries'."); + "gfx1013".to_string() + } else if no_spoof { + println!( + "cargo:warning=xchplot2: RDNA1 {name} detected, \ + XCHPLOT2_NO_GFX_SPOOF set — AOT-targeting {name} \ + natively. If AdaptiveCpp doesn't advertise {name} \ + as a HIP target on your toolchain, the build will \ + fail; unset XCHPLOT2_NO_GFX_SPOOF to fall back to \ + the (working-on-most-cards) generic SSCP JIT."); + name.to_string() + } else { + println!( + "cargo:warning=xchplot2: RDNA1 {name} detected — \ + defaulting to ACPP_TARGETS=generic (SSCP JIT). \ + The previous gfx1013 community workaround was \ + observed to silently produce no-op kernels on \ + at least one W5700 / ROCm 6 setup. Override: \ + XCHPLOT2_FORCE_GFX_SPOOF=1 (back to gfx1013 AOT) \ + or XCHPLOT2_NO_GFX_SPOOF=1 (try native {name})." + ); + return None; + } + } + other => other.to_string(), + }; + return Some(spoofed); + } + } + } + None +} + +/// Probe whether `cmd` is on PATH and runnable. Used by preflight() +/// to detect missing toolchain pieces before cmake gets to fail with +/// a cryptic message. +fn command_runs(cmd: &str) -> bool { + Command::new(cmd) + .arg("--version") + .output() + .map(|o| o.status.success()) + .unwrap_or(false) +} + +/// Locate `ld.lld` either on PATH or in the conventional LLVM-{16..20} +/// install prefixes. Mirrors the find_program HINTS list in +/// CMakeLists.txt's FetchContent block. AdaptiveCpp's CMake aborts +/// with "Cannot find ld.lld" without it. +fn ld_lld_findable() -> bool { + if command_runs("ld.lld") { return true; } + for p in &[ + "/usr/lib/llvm-20/bin/ld.lld", "/usr/lib/llvm-19/bin/ld.lld", + "/usr/lib/llvm-18/bin/ld.lld", "/usr/lib/llvm-17/bin/ld.lld", + "/usr/lib/llvm-16/bin/ld.lld", + "/usr/lib/llvm20/bin/ld.lld", "/usr/lib/llvm19/bin/ld.lld", + "/usr/lib/llvm18/bin/ld.lld", + "/usr/lib64/llvm20/bin/ld.lld", "/usr/lib64/llvm19/bin/ld.lld", + "/usr/lib64/llvm18/bin/ld.lld", + "/opt/llvm-20/bin/ld.lld", "/opt/llvm-19/bin/ld.lld", + "/opt/llvm-18/bin/ld.lld", + ] { + if std::path::Path::new(p).exists() { return true; } + } + false +} + +/// True when AdaptiveCpp is already installed — at $ACPP_PREFIX if +/// set, otherwise the install-deps.sh default of /opt/adaptivecpp. +/// When this is true the FetchContent fallback won't fire and +/// AdaptiveCpp's own build-time deps (notably ld.lld) aren't needed +/// for our build. 
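+/// A pre-built AdaptiveCpp in a non-default prefix can be pointed at with the
+/// same variable this probe reads, e.g. (prefix path is illustrative):
+/// `ACPP_PREFIX=$HOME/opt/adaptivecpp cargo install --git https://github.com/Jsewill/xchplot2 --force`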
+fn adaptivecpp_installed() -> bool { + let prefix = env::var("ACPP_PREFIX") + .unwrap_or_else(|_| "/opt/adaptivecpp".to_string()); + std::path::Path::new(&format!( + "{prefix}/lib/cmake/AdaptiveCpp/AdaptiveCppConfig.cmake" + )).exists() +} + +/// Detect a container engine on PATH, preferring podman (matches +/// scripts/build-container.sh's default). Used to phrase the preflight +/// panic differently when the user already has tooling that lets them +/// skip the host-side install entirely. +fn detect_container_engine() -> Option<&'static str> { + if command_runs("podman") { return Some("podman"); } + if command_runs("docker") { return Some("docker"); } + None +} + +/// Walk critical build-time prerequisites and return human-readable +/// names of anything missing. Cargo install users in particular don't +/// read the Build section of README.md (and don't expect to need to), +/// so a friendly preflight is much better than letting CMake or +/// AdaptiveCpp fail with cryptic errors deep into a build. +fn preflight(build_cuda_on: bool) -> Vec<String> { + let mut missing: Vec<String> = vec![]; + if !command_runs("cmake") { + missing.push("cmake (3.24+) — apt install cmake / dnf install cmake / pacman -S cmake".into()); + } + if !command_runs("c++") && !command_runs("g++") && !command_runs("clang++") { + missing.push("C++20 compiler (g++ ≥ 13 or clang++ ≥ 18) — apt install build-essential, dnf install gcc-c++, or pacman -S base-devel".into()); + } + // ld.lld is only required when FetchContent will rebuild + // AdaptiveCpp; a pre-installed AdaptiveCpp was linked against ld.lld + // at its own install time, so consumers don't need it again. + if !adaptivecpp_installed() && !ld_lld_findable() { + missing.push("ld.lld (apt: lld-18, dnf/pacman: lld) — required by AdaptiveCpp's FetchContent build".into()); + } + if build_cuda_on && !detect_nvcc() { + missing.push("nvcc (CUDA Toolkit 12+) — XCHPLOT2_BUILD_CUDA=ON requested but no nvcc on PATH".into()); + } + missing +} + fn main() { let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); @@ -56,6 +398,180 @@ fn main() { }; println!("cargo:warning=xchplot2: building for CUDA arch {cuda_arch} ({source})"); + // AdaptiveCpp target precedence: + // 1. $ACPP_TARGETS if set. + // 2. NVIDIA: "generic" (LLVM SSCP). Empirically a few percent + // faster than cuda:sm_<arch> on our kernels. + // 3. AMD: hip:gfx<...> via rocminfo. SSCP's HIP path is less + // mature, so AOT-compile for the gfx target. + // 4. generic (LLVM SSCP, JITs on first use). + let (acpp_targets, acpp_source) = match env::var("ACPP_TARGETS") { + // Treat an empty env var the same as unset — Containerfile build + // args propagate as `ACPP_TARGETS=` when the user doesn't override + // them, and acpp rejects an empty target string. + Ok(v) if !v.is_empty() => (v, "$ACPP_TARGETS"), + Ok(_) | Err(_) => { + // Prefer a USABLE NVIDIA GPU (sm_61+) over AMD, otherwise fall + // through to AMD / fallback. `detect_cuda_arch` alone would + // trigger on an ancient secondary NVIDIA card even when AMD is + // the real plotting target (see usable_nvidia_arch).
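+ // Illustrative outcomes of that precedence (hypothetical hosts; the
+ // card-to-arch mappings are the ones documented in compose.yaml):
+ //   RTX 4090 + W5700 in one box → "generic" (usable NVIDIA wins)
+ //   RX 7900 XTX only (gfx1100) → "hip:gfx1100" (rocminfo probe)
+ //   W5700 only (RDNA1) → "generic" (detect_amd_gfx returns None)
+ //   headless CI, no GPU → "generic" (fallback)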
+ if usable_nvidia_arch().is_some() { + ("generic".to_string(), "NVIDIA detected — using SSCP") + } else if let Some(gfx) = detect_amd_gfx() { + (format!("hip:{gfx}"), "rocminfo probe") + } else { + ("generic".to_string(), "fallback (LLVM SSCP)") + } + } + }; + println!("cargo:warning=xchplot2: ACPP_TARGETS={acpp_targets} ({acpp_source})"); + + // XCHPLOT2_BUILD_CUDA toggles whether the CUB sort + nvcc-compiled + // CUDA TUs (AesGpu.cu, SortCuda.cu, AesGpuBitsliced.cu) are built. + // Autodetect prefers actual GPU vendor over toolchain availability: + // dual-toolchain hosts (AMD / Intel GPU, CUDA Toolkit also installed) + // would otherwise try to compile SortCuda.cu through nvcc + AdaptiveCpp + // — which has triggered upstream `half.hpp` compile errors for at + // least one Radeon Pro W5700 user. Priority order: + // NVIDIA GPU → ON (CUB is the fast path) + // AMD GPU → OFF (SYCL/HIP path; CUB unused anyway) + // Intel GPU → OFF (SYCL/L0 path) + // no GPU, nvcc present → ON (CI / container build) + // no GPU, no nvcc → OFF + let (build_cuda, bc_source) = match env::var("XCHPLOT2_BUILD_CUDA") { + Ok(v) if !v.is_empty() => (v, "$XCHPLOT2_BUILD_CUDA"), + _ => { + // Same usable-arch gate as the ACPP_TARGETS block: an + // ancient secondary NVIDIA card (e.g. sm_52 alongside an + // AMD W5700) must NOT claim the CUB path, because + // AdaptiveCpp half.hpp references sm_53+ FP16 intrinsics + // that the old card's cuda_fp16.h guards out. + let nvidia_gpu = usable_nvidia_arch().is_some(); + // amd_gpu_present, NOT detect_amd_gfx().is_some() — the + // latter returns None for RDNA1 (we route those through + // SSCP instead of an AOT hip:* target), but the GPU is + // there and we MUST skip CUDA TUs to avoid running + // SortCuda.cu's CUB calls against AMD silicon. + let amd_gpu = amd_gpu_present(); + let intel_gpu = detect_intel_gpu(); + if nvidia_gpu { + ("ON".to_string(), "NVIDIA GPU detected") + } else if amd_gpu { + ("OFF".to_string(), "AMD GPU detected — skipping CUDA TUs") + } else if intel_gpu { + ("OFF".to_string(), "Intel GPU detected — skipping CUDA TUs") + } else if detect_nvcc() { + ("ON".to_string(), "no GPU probe, nvcc present — assuming CI/container") + } else { + ("OFF".to_string(), "no GPU, no nvcc — skipping CUDA TUs") + } + }, + }; + println!("cargo:warning=xchplot2: XCHPLOT2_BUILD_CUDA={build_cuda} ({bc_source})"); + + // Preflight critical system deps BEFORE invoking cmake. Cargo + // install users land here without reading README.md's Build + // section; without preflight, missing deps surface as cryptic + // CMake / AdaptiveCpp errors deep in the configure / build. + let missing = preflight(build_cuda == "ON"); + if !missing.is_empty() { + let bullets = missing.iter() + .map(|m| format!(" - {m}")) + .collect::<Vec<_>>() + .join("\n"); + // Surface the container path proactively when we can already + // see podman/docker — for many users that's the smoothest fix + // because the toolchain stays bundled in the image. + let next_steps = match detect_container_engine() { + Some(engine) => format!( + "Two ways forward, pick whichever fits:\n\n \ + - Install those packages on the host:\n \ + ./scripts/install-deps.sh --gpu nvidia # auto-detects vendor + AdaptiveCpp\n\n \ + - Or, since you have {engine} installed, build inside a container —\n \ + toolchain stays in the image, no host changes needed:\n \ + ./scripts/build-container.sh\n \ + {engine} compose run --rm cuda plot ...
# or rocm / intel / cpu\n\n\ + If install-deps.sh just ran and you're still seeing this, check\n\ + its tail output — it names the failed package before exiting." + ), + None => format!( + "Two ways forward, pick whichever fits:\n\n \ + - Install those packages on the host:\n \ + ./scripts/install-deps.sh --gpu nvidia # auto-detects vendor + AdaptiveCpp\n\n \ + - Or build inside a container (no host toolchain needed beyond\n \ + podman or docker — install whichever you prefer first):\n \ + ./scripts/build-container.sh\n\n\ + If install-deps.sh just ran and you're still seeing this, check\n\ + its tail output — it names the failed package before exiting." + ), + }; + panic!("\nxchplot2: build prerequisites missing:\n{bullets}\n\n{next_steps}\n"); + } + + // CUDA 13.0 dropped codegen for sm_50/52/53/60/61/62/70/72 entirely + // — its nvcc fails the CMake TryCompile probe with "Unsupported gpu + // architecture 'compute_61'" on Pascal, "compute_70" on Volta, etc. + // Catch that mismatch HERE so the failure surfaces with a clear fix + // path, not buried in a CMakeError.log 40 lines into a TryCompile. + // Skipped when nvcc version or arch list can't be parsed (treat as + // "preflight not actionable, let cmake try" — preserves prior + // behaviour for unusual setups). + if build_cuda == "ON" { + if let (Some(nvcc_major), Some(min)) = (detect_nvcc_major(), min_arch(&cuda_arch)) { + if nvcc_major >= 13 && min < 75 { + // Container detection: Docker writes /.dockerenv, Podman writes + // /run/.containerenv. Either presence means the host-side fixes + // (apt install cuda-toolkit, set CUDA_PATH) are not actionable + // from inside this build — the user needs to rebuild the image + // with a different BASE_DEVEL. + let in_container = std::path::Path::new("/.dockerenv").exists() + || std::path::Path::new("/run/.containerenv").exists(); + let fix_block = if in_container { + format!( + "You're building inside a container — the toolkit comes from the\n\ + base image, not the host. 
Rebuild the image with a CUDA 12.x base:\n \ + - Recommended: rerun scripts/build-container.sh on the host;\n \ + it auto-pins nvidia/cuda:12.9.1 when CUDA_ARCH < 75.\n \ + - Or pass --build-arg explicitly:\n \ + podman build -t xchplot2:cuda \\\n \ + --build-arg BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n \ + --build-arg BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n \ + --build-arg CUDA_ARCH={min} \\\n \ + .\n \ + - Or via compose with env vars:\n \ + CUDA_ARCH={min} \\\n \ + BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n \ + BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \\\n \ + podman compose build cuda\n" + ) + } else { + "Fix one of:\n \ + - Install CUDA 12.9 (last toolkit with Pascal/Volta support):\n \ + Ubuntu/Debian: sudo apt install cuda-toolkit-12-9\n \ + Arch: pacman -S cuda (or pin to a 12.x channel)\n \ + then point the build at it:\n \ + CUDA_PATH=/usr/local/cuda-12.9 cargo install \\\n \ + --git https://github.com/Jsewill/xchplot2 --force\n \ + - Or override the arch (only valid if you actually have a Turing+ card):\n \ + CUDA_ARCHITECTURES=75 cargo install \\\n \ + --git https://github.com/Jsewill/xchplot2 --force\n \ + - Or use the container path — scripts/build-container.sh auto-pins\n \ + the 12.9 base image when it detects a pre-Turing GPU.\n".to_string() + }; + panic!( + "\nxchplot2: CUDA Toolkit {nvcc_major}.x dropped codegen for sm_{min} \ + (Pascal / Volta / pre-Turing).\n\ + \n\ + Detected:\n \ + nvcc {nvcc_major}.x\n \ + target arch: sm_{min} (from CUDA_ARCHITECTURES={cuda_arch})\n\ + \n\ + {fix_block}" + ); + } + } + } + // ---- configure ---- let status = Command::new("cmake") .args([ @@ -64,6 +580,8 @@ fn main() { "-DCMAKE_BUILD_TYPE=Release", ]) .arg(format!("-DCMAKE_CUDA_ARCHITECTURES={cuda_arch}")) + .arg(format!("-DACPP_TARGETS={acpp_targets}")) + .arg(format!("-DXCHPLOT2_BUILD_CUDA={build_cuda}")) .status() .expect("failed to invoke cmake — is it installed?"); if !status.success() { @@ -111,21 +629,136 @@ fn main() { println!("cargo:rustc-link-lib=static=fse"); println!("cargo:rustc-link-arg=-Wl,--end-group"); - // ---- CUDA runtime ---- - // Honour $CUDA_PATH / $CUDA_HOME if set, else fall back to /opt/cuda - // (Arch / CachyOS) then /usr/local/cuda (Debian-ish). - let cuda_root = env::var("CUDA_PATH") - .or_else(|_| env::var("CUDA_HOME")) - .unwrap_or_else(|_| { - for guess in ["/opt/cuda", "/usr/local/cuda"] { - if std::path::Path::new(guess).exists() { return guess.to_string(); } + // ---- AdaptiveCpp runtime ---- + // The static archives produced by CMake reference hipsycl::rt::* symbols + // that live in libacpp-rt + libacpp-common (shared). CMake writes the + // exact lib directory to $cmake_build/acpp-prefix.txt during configure; + // honour that, then $ACPP_PREFIX / standard locations as fallbacks. 
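+ // Concrete (illustrative) example: a default install-deps.sh or container
+ // build leaves "/opt/adaptivecpp/lib" in acpp-prefix.txt, so the directives
+ // emitted below use that directory for both the link search path and rpath.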
+ let acpp_lib_dir = std::fs::read_to_string(cmake_build.join("acpp-prefix.txt")) + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .or_else(|| env::var("ACPP_PREFIX").ok().map(|p| format!("{p}/lib"))) + .or_else(|| env::var("AdaptiveCpp_ROOT").ok().map(|p| format!("{p}/lib"))) + .unwrap_or_else(|| { + for guess in ["/opt/adaptivecpp/lib", "/usr/local/lib", + "/usr/lib/x86_64-linux-gnu", "/usr/lib"] { + if std::path::Path::new(&format!("{guess}/libacpp-rt.so")).exists() { + return guess.to_string(); + } } - "/opt/cuda".to_string() + "/opt/adaptivecpp/lib".to_string() }); - println!("cargo:rustc-link-search=native={cuda_root}/lib64"); - println!("cargo:rustc-link-search=native={cuda_root}/lib"); - println!("cargo:rustc-link-lib=cudart"); - println!("cargo:rustc-link-lib=cudadevrt"); + println!("cargo:rustc-link-search=native={acpp_lib_dir}"); + println!("cargo:rustc-link-arg=-Wl,-rpath,{acpp_lib_dir}"); + println!("cargo:rustc-link-lib=acpp-rt"); + println!("cargo:rustc-link-lib=acpp-common"); + + // ---- LLVM OpenMP runtime (SYCL→OMP backend) ---- + // AdaptiveCpp's OMP backend lowers SYCL nd_range kernels to OpenMP + // parallel loops. The compiled .o files reference libomp's runtime + // symbols (__kmpc_fork_call, __kmpc_global_thread_num, __kmpc_barrier, + // __kmpc_for_static_init_8u / _fini). cc / rust-lld don't auto-link + // libomp — pos2_gpu's SYCL TUs would then fail to link with + // + // rust-lld: error: undefined symbol: __kmpc_fork_call + // + // Only fire on builds where ACPP_TARGETS includes "omp"; HIP and + // SSCP-with-CUDA backends translate to their own runtimes and don't + // need libomp at link time. + // + // Locations: + // Ubuntu/Debian (apt libomp-18-dev): /usr/lib/llvm-18/lib/libomp.so + // Arch (pacman openmp): /usr/lib/libomp.so + // AdaptiveCpp install (bundled): $ACPP_PREFIX/lib/libomp.so + if acpp_targets.split(';').any(|t| t.trim() == "omp") { + for guess in ["/usr/lib/llvm-18/lib", "/usr/lib/llvm-19/lib", + "/usr/lib/llvm-20/lib", "/usr/lib"] { + if std::path::Path::new(&format!("{guess}/libomp.so")).exists() + || std::path::Path::new(&format!("{guess}/libomp.so.5")).exists() { + println!("cargo:rustc-link-search=native={guess}"); + println!("cargo:rustc-link-arg=-Wl,-rpath,{guess}"); + break; + } + } + println!("cargo:rustc-link-lib=omp"); + } + + // ---- CUDA runtime ---- + // Only needed when XCHPLOT2_BUILD_CUDA=ON — then the nvcc-compiled + // TUs (SortCuda, AesGpu, AesGpuBitsliced) pull in cudart / cudadevrt. + // On the AMD/Intel OFF path there's no CUDA Toolkit on the image and + // nothing in the static archives references cudart, so emitting + // `-lcudart` would make rust-lld fail with "unable to find library". + if build_cuda == "ON" { + // Honour $CUDA_PATH / $CUDA_HOME if set, else fall back to + // /opt/cuda (Arch / CachyOS) then /usr/local/cuda (Debian-ish). 
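+ // Usage note (path illustrative): a host with a side-installed 12.x toolkit
+ // can steer this lookup by exporting CUDA_PATH=/usr/local/cuda-12.9 (or
+ // CUDA_HOME) in the environment that runs `cargo build` / `cargo install`.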
+ let cuda_root = env::var("CUDA_PATH") + .or_else(|_| env::var("CUDA_HOME")) + .unwrap_or_else(|_| { + for guess in ["/opt/cuda", "/usr/local/cuda"] { + if std::path::Path::new(guess).exists() { return guess.to_string(); } + } + "/opt/cuda".to_string() + }); + println!("cargo:rustc-link-search=native={cuda_root}/lib64"); + println!("cargo:rustc-link-search=native={cuda_root}/lib"); + println!("cargo:rustc-link-lib=cudart"); + println!("cargo:rustc-link-lib=cudadevrt"); + } + + // ---- HIP runtime ---- + // When ACPP_TARGETS is "hip:gfxXXXX", AdaptiveCpp's HIP backend + // compiles SYCL kernels into HIP fat binaries whose host-side + // launcher stubs reference __hipPushCallConfiguration / + // __hipRegisterFatBinary / hipLaunchKernel from libamdhip64. Without + // -lamdhip64 rust-lld fails with "undefined symbol: __hip*". + // Honour $ROCM_PATH if set, else fall back to /opt/rocm (standard + // bare-metal + all official ROCm container images). + // Link libamdhip64 whenever ROCm is reachable, not just when + // ACPP_TARGETS is hip-prefixed. ACPP_TARGETS=generic (SSCP JIT) on + // an AMD host still needs the HIP runtime at load time — + // librt-backend-hip.so dlopens libamdhip64, but glibc doesn't walk + // the binary's RUNPATH for transitive backend deps. By making + // libamdhip64 a direct dependency of the binary, the loader pulls + // it in at startup via RUNPATH, and AdaptiveCpp's runtime dlopen + // finds the already-loaded handle. Without this, an AMD-host + // build with the new RDNA1 default (generic instead of the + // gfx1013 spoof) fails at first queue construction with + // "No matching device" because HIP can't initialise. + // + // We pass the full .so path (rather than `cargo:rustc-link-lib=amdhip64` + // which becomes `-lamdhip64`) because the SSCP path emits no host- + // side HIP symbol references, and the linker's default --as-needed + // would drop a name-only -l flag from NEEDED. A positional path + // argument bypasses --as-needed and keeps the library in the link. + // Same approach as CMakeLists.txt's `link_libraries(.../libamdhip64.so)`. + let rocm_root = env::var("ROCM_PATH") + .unwrap_or_else(|_| "/opt/rocm".to_string()); + let amdhip_lib = format!("{rocm_root}/lib/libamdhip64.so"); + if acpp_targets.starts_with("hip:") || std::path::Path::new(&amdhip_lib).exists() { + println!("cargo:rustc-link-search=native={rocm_root}/lib"); + println!("cargo:rustc-link-search=native={rocm_root}/hip/lib"); + println!("cargo:rustc-link-arg=-Wl,-rpath,{rocm_root}/lib"); + if std::path::Path::new(&amdhip_lib).exists() { + // Wrap with --no-as-needed/--as-needed: even a positional + // .so path gets dropped from NEEDED by ld's --as-needed + // when no symbol references it (true for the SSCP path + // that has zero host-side HIP symbol refs). The library + // itself must end up in DT_NEEDED so AdaptiveCpp's runtime + // dlopen finds it already loaded; otherwise HIP backend + // never initialises and we throw "No matching device". + println!("cargo:rustc-link-arg=-Wl,--no-as-needed"); + println!("cargo:rustc-link-arg={amdhip_lib}"); + println!("cargo:rustc-link-arg=-Wl,--as-needed"); + } else { + // Fallback: ROCm not at /opt/rocm/lib but the user set + // ACPP_TARGETS=hip:* explicitly. AOT HIP fat binaries + // reference HIP symbols directly, so --as-needed keeps + // -lamdhip64 in NEEDED on that path. 
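+ // Either branch can be sanity-checked after a build with something like
+ // `readelf -d target/release/xchplot2 | grep amdhip` (binary path
+ // illustrative); libamdhip64 should show up as a NEEDED entry.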
+ println!("cargo:rustc-link-lib=amdhip64"); + } + } // C++ stdlib + POSIX bits the static libs (Rust std + pthread inside // pos2_keygen, std::async + std::thread in pos2_gpu_host) reach for. diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..b297cd1 --- /dev/null +++ b/compose.yaml @@ -0,0 +1,184 @@ +# compose.yaml — podman-first (also works with docker compose). +# +# Three vendor-specific services share one Containerfile, parameterized +# via build args. Pick one based on your GPU; the build context is the +# same so the AdaptiveCpp + xchplot2 build layers cache across services. +# +# Build & run examples: +# +# # NVIDIA (default sm_89 / RTX 4090; override via $CUDA_ARCH=120 etc.) +# podman compose build cuda +# podman compose run --rm cuda test 22 2 0 0 -G -o /out +# +# # NVIDIA Pascal/Volta (sm_61 / GTX 10-series, sm_70 / V100): CUDA 13.x +# # dropped codegen for pre-Turing archs, so pin to a 12.x base image. +# # scripts/build-container.sh does this automatically when it detects +# # CUDA_ARCH < 75; if invoking compose directly, set the base manually: +# CUDA_ARCH=61 \ +# BASE_DEVEL=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \ +# BASE_RUNTIME=docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 \ +# podman compose build cuda +# +# # AMD ROCm — set $ACPP_GFX to your card's gfx target (rocminfo | grep gfx). +# # gfx1031 = Navi 22 (RX 6700/6700 XT/6800M) +# # gfx1100 = Navi 31 (RX 7900 XTX/XT) ← default +# # gfx900 = Vega 10 (RX Vega 56/64, MI25) +# ACPP_GFX=gfx1031 podman compose build rocm +# podman compose run --rm rocm test 22 2 0 0 -G -o /out +# +# # Intel oneAPI (experimental, untested). +# podman compose build intel +# +# Plot files land in ./plots/ on the host (mounted at /out in the +# container). + +services: + cuda: + build: + context: . + dockerfile: Containerfile + args: + # BASE_DEVEL / BASE_RUNTIME default to CUDA 13.x (latest, sm_75+). + # scripts/build-container.sh overrides both to nvidia/cuda:12.9.1 + # when it detects a pre-Turing GPU (Pascal/Volta, CUDA_ARCH < 75) + # — CUDA 13.0 dropped codegen for those archs. Set BASE_DEVEL + # explicitly to bypass the auto-pick (e.g. for cross-targeting an + # arch the host doesn't have). + BASE_DEVEL: "${BASE_DEVEL:-docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04}" + BASE_RUNTIME: "${BASE_RUNTIME:-docker.io/nvidia/cuda:13.0.0-devel-ubuntu24.04}" + ACPP_TARGETS: "generic" + XCHPLOT2_BUILD_CUDA: "ON" + INSTALL_CUDA_HEADERS: "0" + CUDA_ARCH: "${CUDA_ARCH:-89}" + image: xchplot2:cuda + # GPU pass-through. Works on both engines: + # - Docker (with nvidia-container-toolkit + `nvidia-ctk runtime + # configure --runtime=docker && systemctl restart docker`) + # - Podman 5.x (with podman-compose 1.x+; equivalent to + # `--device nvidia.com/gpu=all` via CDI) + # The previous `devices: nvidia.com/gpu=all` shorthand worked on + # podman but Docker silently ignored it as an unknown device path, + # leaving the container without libcuda.so.1 and producing a + # confusing "No matching device" failure mid-plot. + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + volumes: + - ./plots:/out + + rocm: + build: + context: . + dockerfile: Containerfile + args: + # Pinned to ROCm 6.2.x for two reasons: + # 1. ROCm 7.x's rocm-llvm package no longer ships LLVMConfig.cmake, + # so AdaptiveCpp's find_package(LLVM) can't run. + # 2. 
ROCm 6.2 ships LLVM 18.0git, matching Ubuntu's llvm-18 so the + # device bitcode (ocml.bc, ockl.bc) is readable by AdaptiveCpp + # built against Ubuntu's LLVM. No "Unknown attribute kind" + # mismatch. + # AdaptiveCpp is therefore built against Ubuntu's /usr/lib/llvm-18 + # (the Containerfile default), and ROCm provides its own clang + + # device libs at /opt/rocm/llvm for the HIP backend at runtime. + BASE_DEVEL: docker.io/rocm/dev-ubuntu-24.04:6.2-complete + BASE_RUNTIME: docker.io/rocm/dev-ubuntu-24.04:6.2-complete + # IMPORTANT: ACPP_GFX is intentionally *required* — no silent default. + # If it's unset the SYCL kernels are AOT-compiled for the wrong amdgcn + # ISA, which HIP loads without error but the kernels execute as silent + # no-ops at runtime (sort returns input, AES match finds zero results, + # plot content diverges from the canonical reference). That failure + # mode is extremely confusing to diagnose — it looks like a correctness + # bug in the kernels rather than a build-time config error. + # + # Set ACPP_GFX explicitly. If you sudo compose, pass the var through + # (sudo strips env by default): + # ACPP_GFX=gfx1031 sudo -E podman compose build rocm + # sudo ACPP_GFX=gfx1031 podman compose build rocm + # + # Common gfx targets (see `rocminfo | grep gfx`): + # gfx1030 = RDNA2 Navi 21 (RX 6800/6800 XT/6900 XT) + # gfx1031 = RDNA2 Navi 22 (RX 6700/6700 XT/6800M) + # gfx1100 = RDNA3 Navi 31 (RX 7900 XTX/XT) + # gfx1101 = RDNA3 Navi 32 (RX 7800 XT/7700 XT) + # gfx906 = Vega 20 (Radeon VII, MI50) + # gfx900 = Vega 10 (RX Vega 56/64, MI25) + # Use ${VAR:-default} (NOT ${VAR:?error}) so that building cuda + # / intel / cpu services without ACPP_GFX set doesn't trip a + # parse-time error — podman-compose evaluates :? across ALL + # services during YAML parse, not just the one being built. + # The placeholder value is intentionally invalid as a gfx + # target so AdaptiveCpp's HIP backend fails loudly with the + # placeholder string in its error message — much better than + # silently building wrong-arch amdgcn ISA from a default like + # gfx1100 (kernels would then execute as runtime no-ops, see + # the IMPORTANT block above). + ACPP_TARGETS: "hip:${ACPP_GFX:-MISSING-set-ACPP_GFX-or-use-scripts-build-container-sh}" + XCHPLOT2_BUILD_CUDA: "OFF" + # No CUDA headers on the AMD path — they conflict with HIP's + # uchar1/etc. typedefs. CudaHalfShim.hpp's __has_include guard + # handles the absence cleanly. + INSTALL_CUDA_HEADERS: "0" + image: xchplot2:rocm + devices: + - /dev/kfd + - /dev/dri + group_add: + - video + # Rootless podman's default seccomp filter + capability set blocks + # some of the KFD IOCTLs libhsa-runtime64 issues during DMA setup, + # which surfaces as a segfault inside the HSA runtime on the first + # host→device copy (rocminfo-level queries still work, so the + # failure is subtle and confusing). Loosen the sandbox just enough + # for HSA's DMA path. If rootless still fails on your host, run + # rootful + privileged instead: + # sudo podman run --rm --privileged --device /dev/kfd \ + # --device /dev/dri -v $PWD/plots:/out xchplot2:rocm \ + # plot -k 28 -n 10 -f -c -o /out + security_opt: + - seccomp=unconfined + cap_add: + - SYS_ADMIN + volumes: + - ./plots:/out + + intel: + build: + context: . 
+ dockerfile: Containerfile + args: + BASE_DEVEL: docker.io/intel/oneapi-basekit:latest + BASE_RUNTIME: docker.io/intel/oneapi-runtime:latest + ACPP_TARGETS: "generic" + XCHPLOT2_BUILD_CUDA: "OFF" + INSTALL_CUDA_HEADERS: "1" + image: xchplot2:intel + devices: + - /dev/dri + volumes: + - ./plots:/out + + cpu: + # CPU-only image: AdaptiveCpp's OpenMP backend compiles the SYCL + # kernels for the host CPU. No GPU runtime needed. Plotting is + # 1-2 orders of magnitude slower than GPU; useful for headless CI, + # dev machines without a GPU, or as an extra worker on a + # heterogeneous `--devices` list. See README's CPU section. + build: + context: . + dockerfile: Containerfile + args: + BASE_DEVEL: docker.io/ubuntu:24.04 + BASE_RUNTIME: docker.io/ubuntu:24.04 + ACPP_TARGETS: "omp" + XCHPLOT2_BUILD_CUDA: "OFF" + # AdaptiveCpp's libkernel/half.hpp includes cuda_fp16.h on every + # build path; pull the headers (no libcudart link, just headers). + INSTALL_CUDA_HEADERS: "1" + image: xchplot2:cpu + volumes: + - ./plots:/out diff --git a/keygen-rs/Cargo.lock b/keygen-rs/Cargo.lock index 6ed82bb..795af9a 100644 --- a/keygen-rs/Cargo.lock +++ b/keygen-rs/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + [[package]] name = "asn1-rs" version = "0.6.2" @@ -53,6 +59,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" +[[package]] +name = "base16ct" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd307490d624467aa6f74b0eabb77633d1f758a7b25f12bceb0b22e08d9726f6" + [[package]] name = "base64" version = "0.22.1" @@ -98,6 +110,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +dependencies = [ + "hybrid-array", +] + [[package]] name = "blst" version = "0.3.16" @@ -148,9 +169,9 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "chia" -version = "0.42.0" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff1f2c3905a718d77dd48a4f4653e1b29c9e39cd599c2de8fccb10970c563049" +checksum = "5fb7c121855983543518ab67cb1ebea7e52badc965e547f98d90ee6f728d6c06" dependencies = [ "chia-bls 0.42.0", "chia-client", @@ -170,17 +191,17 @@ dependencies = [ [[package]] name = "chia-bls" -version = "0.36.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f02cbfd038d9050d45edbe8f38e09391c73479c0cca5b37925daf48c4d4fcd4" +checksum = "a70dfe8540688eaed5bdecffd51c26df489b8bc610890b613b81461411f90cc9" dependencies = [ "blst", - "chia-sha2 0.36.1", - "chia-traits 0.36.1", + "chia-sha2 0.38.2", + "chia-traits 0.38.2", "hex", "hkdf", "linked-hash-map", - "sha2", + "sha2 0.10.9", "thiserror 1.0.69", ] @@ -198,7 +219,7 @@ dependencies = [ "hkdf", "linked-hash-map", "serde", - "sha2", + "sha2 0.10.9", "thiserror 1.0.69", ] @@ -335,8 +356,8 @@ checksum = "82c0c0303a91f6190b26ba8778f7b38438e79df02a5631b80269d3aa36372a76" dependencies = [ "chia-sha2 0.42.0", "hex", - "k256", - "p256", + "k256 0.13.4", + "p256 0.13.2", ] 
[[package]] @@ -351,11 +372,11 @@ dependencies = [ [[package]] name = "chia-sha2" -version = "0.36.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0934b0d6b878f29ba6c958e56e4b7158f9e687c200ffdca141dbc408a5cce42e" +checksum = "5a57be484b5abb4481a3ea8b2e6fc0404f41222e0cfb35b81269c2404b64107a" dependencies = [ - "sha2", + "sha2 0.10.9", ] [[package]] @@ -364,7 +385,7 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6636ca8bba852fc516eacf01b2c3964b6b290359e7d1e89b950e6754e2a1082" dependencies = [ - "sha2", + "sha2 0.10.9", ] [[package]] @@ -382,12 +403,12 @@ dependencies = [ [[package]] name = "chia-traits" -version = "0.36.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4922b447b2d8418213948af1a448c3ca7b84e149b51b2c87a2e00e80bb19b0" +checksum = "b13ea36e3ae5ede1d015d873fdfa91ea4d7a8790c6859c78b6b74065c7ddbbbd" dependencies = [ - "chia-sha2 0.36.1", - "chia_streamable_macro 0.36.1", + "chia-sha2 0.38.2", + "chia_streamable_macro 0.38.2", "thiserror 1.0.69", ] @@ -404,9 +425,9 @@ dependencies = [ [[package]] name = "chia_streamable_macro" -version = "0.36.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b60cefc5fe39f695816d42a327cbefad3d6d6a8ecadad1b58d7507067c25da8" +checksum = "4450a65b83cd89f8ccad2b4d5f8dc23e89ab0b6ae86d8c535ffde9fdc9d9c6c5" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -466,36 +487,54 @@ dependencies = [ [[package]] name = "clvmr" -version = "0.17.5" +version = "0.17.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56b333963b083468df9a15602fcc3a24fa3f8c3964569fb9d2415ac70c0820e9" +checksum = "3060bcd64cb8cf2b32fe6ee3a82698835c03361c8e1da446d2e9d058fbfffd5f" dependencies = [ "bitflags", "bitvec", "bumpalo", - "chia-bls 0.36.1", - "chia-sha2 0.36.1", + "chia-bls 0.38.2", + "chia-sha2 0.38.2", "hex", "hex-literal", - "k256", + "k256 0.14.0-rc.9", "lazy_static", "malachite-bigint", "num-bigint", "num-integer", "num-traits", - "p256", - "rand 0.8.6", + "p256 0.14.0-rc.9", + "rand 0.9.4", "sha1", "sha3", - "thiserror 1.0.69", + "thiserror 2.0.18", ] +[[package]] +name = "cmov" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f88a43d011fc4a6876cb7344703e297c71dda42494fee094d5f7c76bf13f746" + [[package]] name = "const-oid" version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const-oid" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + +[[package]] +name = "cpubits" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15b85f9c39137c3a891689859392b1bd49812121d0d61c9caf00d46ed5ce06ae" + [[package]] name = "cpufeatures" version = "0.2.17" @@ -505,6 +544,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -542,6 +590,22 @@ dependencies = [ "zeroize", ] +[[package]] +name = "crypto-bigint" +version = "0.7.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "42a0d26b245348befa0c121944541476763dcc46ede886c88f9d12e1697d27c3" +dependencies = [ + "cpubits", + "ctutils", + "getrandom 0.4.2", + "hybrid-array", + "num-traits", + "rand_core 0.10.1", + "subtle", + "zeroize", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -552,6 +616,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "crypto-common" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710" +dependencies = [ + "getrandom 0.4.2", + "hybrid-array", + "rand_core 0.10.1", +] + +[[package]] +name = "ctutils" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e" +dependencies = [ + "cmov", + "subtle", +] + [[package]] name = "data-encoding" version = "2.10.0" @@ -564,8 +649,19 @@ version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" dependencies = [ - "const-oid", - "pem-rfc7468", + "const-oid 0.9.6", + "pem-rfc7468 0.7.0", + "zeroize", +] + +[[package]] +name = "der" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b" +dependencies = [ + "const-oid 0.10.2", + "pem-rfc7468 1.0.0", "zeroize", ] @@ -598,12 +694,24 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", - "const-oid", - "crypto-common", + "block-buffer 0.10.4", + "const-oid 0.9.6", + "crypto-common 0.1.6", "subtle", ] +[[package]] +name = "digest" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" +dependencies = [ + "block-buffer 0.12.0", + "const-oid 0.10.2", + "crypto-common 0.2.1", + "ctutils", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -621,12 +729,27 @@ version = "0.16.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" dependencies = [ - "der", - "digest", - "elliptic-curve", - "rfc6979", - "signature", - "spki", + "der 0.7.10", + "digest 0.10.7", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + "signature 2.2.0", + "spki 0.7.3", +] + +[[package]] +name = "ecdsa" +version = "0.17.0-rc.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54fb064faabbee66e1fc8e5c5a9458d4269dc2d8b638fe86a425adb2510d1a96" +dependencies = [ + "der 0.8.0", + "digest 0.11.2", + "elliptic-curve 0.14.0-rc.32", + "rfc6979 0.5.0-rc.5", + "signature 3.0.0", + "spki 0.8.0", + "zeroize", ] [[package]] @@ -641,16 +764,38 @@ version = "0.13.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" dependencies = [ - "base16ct", - "crypto-bigint", - "digest", + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest 0.10.7", "ff", "generic-array", "group", - "pem-rfc7468", - "pkcs8", + "pem-rfc7468 0.7.0", + "pkcs8 0.10.2", "rand_core 0.6.4", - "sec1", + "sec1 0.7.3", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.14.0-rc.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda94f31325c4275e9706adecbb6f0650dee2f904c915a98e3d81adaaaa757aa" +dependencies = [ + "base16ct 1.0.0", + "crypto-bigint 0.7.3", + "crypto-common 0.2.1", + "digest 0.11.2", + "hybrid-array", + "once_cell", + "pem-rfc7468 1.0.0", + "pkcs8 0.11.0", + "rand_core 0.10.1", + "rustcrypto-ff", + "rustcrypto-group", + "sec1 0.8.1", "subtle", "zeroize", ] @@ -677,6 +822,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foldhash" version = "0.2.0" @@ -762,10 +913,24 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", + "wasip2", + "wasip3", +] + [[package]] name = "glob" version = "0.3.3" @@ -783,13 +948,22 @@ dependencies = [ "subtle", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + [[package]] name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" dependencies = [ - "foldhash", + "foldhash 0.2.0", ] [[package]] @@ -798,6 +972,12 @@ version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.5.2" @@ -822,7 +1002,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" dependencies = [ - "hmac", + "hmac 0.12.1", ] [[package]] @@ -831,7 +1011,16 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest", + "digest 0.10.7", +] + +[[package]] +name = "hmac" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f" +dependencies = [ + "digest 0.11.2", ] [[package]] @@ -850,6 +1039,23 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "hybrid-array" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5" +dependencies = [ + "subtle", + "typenum", + "zeroize", +] + +[[package]] +name = "id-arena" 
+version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "indexmap" version = "2.14.0" @@ -858,6 +1064,8 @@ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", "hashbrown 0.17.0", + "serde", + "serde_core", ] [[package]] @@ -892,11 +1100,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b" dependencies = [ "cfg-if", - "ecdsa", - "elliptic-curve", + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", "once_cell", - "sha2", - "signature", + "sha2 0.10.9", + "signature 2.2.0", +] + +[[package]] +name = "k256" +version = "0.14.0-rc.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b382cbfd43caf55991a93850ce538aa1aa67bb264af367d22dfe7937c4e997d" +dependencies = [ + "cpubits", + "ecdsa 0.17.0-rc.18", + "elliptic-curve 0.14.0-rc.32", + "sha2 0.11.0", + "signature 3.0.0", ] [[package]] @@ -905,7 +1126,7 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" dependencies = [ - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -917,6 +1138,12 @@ dependencies = [ "spin", ] +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.185" @@ -1104,10 +1331,23 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" dependencies = [ - "ecdsa", - "elliptic-curve", - "primeorder", - "sha2", + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder 0.13.6", + "sha2 0.10.9", +] + +[[package]] +name = "p256" +version = "0.14.0-rc.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b97e3bf0465157ae90975ff52dbeb1362ba618924878c9f74c25baa27a65f9a" +dependencies = [ + "ecdsa 0.17.0-rc.18", + "elliptic-curve 0.14.0-rc.32", + "primefield", + "primeorder 0.14.0-rc.9", + "sha2 0.11.0", ] [[package]] @@ -1135,6 +1375,15 @@ dependencies = [ "base64ct", ] +[[package]] +name = "pem-rfc7468" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9" +dependencies = [ + "base64ct", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -1147,9 +1396,9 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" dependencies = [ - "der", - "pkcs8", - "spki", + "der 0.7.10", + "pkcs8 0.10.2", + "spki 0.7.3", ] [[package]] @@ -1158,8 +1407,18 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" dependencies = [ - "der", - "spki", + "der 0.7.10", + "spki 0.7.3", +] + +[[package]] +name = "pkcs8" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "451913da69c775a56034ea8d9003d27ee8948e12443eae7c038ba100a4f21cb7" +dependencies = [ + "der 0.8.0", + "spki 0.8.0", ] [[package]] @@ -1175,7 +1434,7 @@ dependencies = [ "bech32", "chia", 
"hex", - "sha2", + "sha2 0.11.0", ] [[package]] @@ -1193,13 +1452,46 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "primefield" +version = "0.14.0-rc.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b52e6ee42db392378a95622b463c9740631171d1efce43fa445a569c1600cb6" +dependencies = [ + "crypto-bigint 0.7.3", + "crypto-common 0.2.1", + "rand_core 0.10.1", + "rustcrypto-ff", + "subtle", + "zeroize", +] + [[package]] name = "primeorder" version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" dependencies = [ - "elliptic-curve", + "elliptic-curve 0.13.8", +] + +[[package]] +name = "primeorder" +version = "0.14.0-rc.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0556580e42c19833f5d232aca11a7687a503ee41f937b54f5ae1d50fc2a6a36a" +dependencies = [ + "elliptic-curve 0.14.0-rc.32", ] [[package]] @@ -1236,6 +1528,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "radium" version = "0.7.0" @@ -1301,6 +1599,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "rayon" version = "1.12.0" @@ -1341,7 +1645,17 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" dependencies = [ - "hmac", + "hmac 0.12.1", + "subtle", +] + +[[package]] +name = "rfc6979" +version = "0.5.0-rc.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23a3127ee32baec36af75b4107082d9bd823501ec14a4e016be4b6b37faa74ae" +dependencies = [ + "hmac 0.13.0", "subtle", ] @@ -1365,20 +1679,41 @@ version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8573f03f5883dcaebdfcf4725caa1ecb9c15b2ef50c43a07b816e06799bb12d" dependencies = [ - "const-oid", - "digest", + "const-oid 0.9.6", + "digest 0.10.7", "num-bigint-dig", "num-integer", "num-traits", "pkcs1", - "pkcs8", + "pkcs8 0.10.2", "rand_core 0.6.4", - "signature", - "spki", + "signature 2.2.0", + "spki 0.7.3", "subtle", "zeroize", ] +[[package]] +name = "rustcrypto-ff" +version = "0.14.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd2a8adb347447693cd2ba0d218c4b66c62da9b0a5672b17b981e4291ec65ff6" +dependencies = [ + "rand_core 0.10.1", + "subtle", +] + +[[package]] +name = "rustcrypto-group" +version = "0.14.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "369f9b61aa45933c062c9f6b5c3c50ab710687eca83dd3802653b140b43f85ed" +dependencies = [ + "rand_core 0.10.1", + "rustcrypto-ff", + "subtle", +] + [[package]] name = "rusticata-macros" version = "4.1.0" @@ -1418,14 +1753,34 @@ version = 
"0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" dependencies = [ - "base16ct", - "der", + "base16ct 0.2.0", + "der 0.7.10", "generic-array", - "pkcs8", + "pkcs8 0.10.2", + "subtle", + "zeroize", +] + +[[package]] +name = "sec1" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56d437c2f19203ce5f7122e507831de96f3d2d4d3be5af44a0b0a09d8a80e4d" +dependencies = [ + "base16ct 1.0.0", + "ctutils", + "der 0.8.0", + "hybrid-array", "subtle", "zeroize", ] +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + [[package]] name = "serde" version = "1.0.228" @@ -1474,6 +1829,19 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + [[package]] name = "sha1" version = "0.10.6" @@ -1481,8 +1849,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", ] [[package]] @@ -1492,8 +1860,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.2", ] [[package]] @@ -1502,7 +1881,7 @@ version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60" dependencies = [ - "digest", + "digest 0.10.7", "keccak", ] @@ -1518,10 +1897,20 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ - "digest", + "digest 0.10.7", "rand_core 0.6.4", ] +[[package]] +name = "signature" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d567dcbaf0049cb8ac2608a76cd95ff9e4412e1899d389ee400918ca7537f5" +dependencies = [ + "digest 0.11.2", + "rand_core 0.10.1", +] + [[package]] name = "slab" version = "0.4.12" @@ -1557,7 +1946,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" dependencies = [ "base64ct", - "der", + "der 0.7.10", +] + +[[package]] +name = "spki" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d9efca8738c78ee9484207732f728b1ef517bbb1833d6fc0879ca898a522f6f" +dependencies = [ + "base64ct", + "der 0.8.0", ] [[package]] @@ -1736,9 +2135,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" [[package]] name = "unicode-ident" @@ -1746,6 +2145,12 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "untrusted" version = "0.9.0" @@ -1779,6 +2184,49 @@ dependencies = [ "wit-bindgen", ] +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "wide" version = "1.3.0" @@ -1891,6 +2339,88 @@ name = "wit-bindgen" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] [[package]] name = "wyz" @@ -1968,6 +2498,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + [[package]] name = "zstd" version = "0.13.3" diff --git a/keygen-rs/Cargo.toml b/keygen-rs/Cargo.toml index 0365b3d..02c4349 100644 --- a/keygen-rs/Cargo.toml +++ b/keygen-rs/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["staticlib"] [dependencies] chia = "0.42" bech32 = "0.11" -sha2 = "0.10" +sha2 = "0.11" [dev-dependencies] hex = "0.4" diff --git a/keygen-rs/src/lib.rs b/keygen-rs/src/lib.rs index 2f9e1b3..9126907 100644 --- a/keygen-rs/src/lib.rs +++ b/keygen-rs/src/lib.rs @@ -10,20 +10,20 @@ // byte-identical to `chia plots create --v2`. use chia::bls::{PublicKey, SecretKey}; -use chia::protocol::{Bytes32, compute_plot_id_v2}; +use chia::protocol::{compute_plot_id_v2, Bytes32}; use chia::sha2::Sha256; // --------------------------------------------------------------------------- // Result codes returned across the FFI boundary. // --------------------------------------------------------------------------- -pub const POS2_OK: i32 = 0; -pub const POS2_BAD_FARMER_PK: i32 = -1; -pub const POS2_BAD_POOL_KEY: i32 = -2; -pub const POS2_BAD_POOL_KIND: i32 = -3; +pub const POS2_OK: i32 = 0; +pub const POS2_BAD_FARMER_PK: i32 = -1; +pub const POS2_BAD_POOL_KEY: i32 = -2; +pub const POS2_BAD_POOL_KIND: i32 = -3; pub const POS2_MEMO_BUF_TOO_SMALL: i32 = -4; -pub const POS2_BAD_SEED: i32 = -5; -pub const POS2_BAD_ADDRESS: i32 = -6; -pub const POS2_BAD_HRP: i32 = -7; +pub const POS2_BAD_SEED: i32 = -5; +pub const POS2_BAD_ADDRESS: i32 = -6; +pub const POS2_BAD_HRP: i32 = -7; // pool_kind values. 
pub const POS2_POOL_PK: i32 = 0; // pool_key_or_ph points to 48 bytes (G1) @@ -108,8 +108,8 @@ pub unsafe extern "C" fn pos2_keygen_derive_plot( strength: u8, plot_index: u16, meta_group: u8, - out_plot_id: *mut u8, // 32 bytes written - out_memo_buf: *mut u8, // caller-owned buffer + out_plot_id: *mut u8, // 32 bytes written + out_memo_buf: *mut u8, // caller-owned buffer inout_memo_len: *mut usize, // in: capacity; out: bytes written ) -> i32 { if seed_len < 32 { @@ -117,48 +117,42 @@ pub unsafe extern "C" fn pos2_keygen_derive_plot( } let seed: &[u8] = unsafe { std::slice::from_raw_parts(seed_ptr, seed_len) }; - let farmer_pk_bytes: &[u8; 48] = - match unsafe { (farmer_pk_ptr as *const [u8; 48]).as_ref() } { - Some(b) => b, - None => return POS2_BAD_FARMER_PK, - }; + let farmer_pk_bytes: &[u8; 48] = match unsafe { (farmer_pk_ptr as *const [u8; 48]).as_ref() } { + Some(b) => b, + None => return POS2_BAD_FARMER_PK, + }; let farmer_pk = match PublicKey::from_bytes(farmer_pk_bytes) { Ok(pk) => pk, Err(_) => return POS2_BAD_FARMER_PK, }; - let (pool_pk_opt, pool_ph_opt, pool_key_slice): ( - Option, - Option, - &[u8], - ) = match pool_kind { - x if x == POS2_POOL_PK => { - let bytes: &[u8; 48] = - match unsafe { (pool_key_ptr as *const [u8; 48]).as_ref() } { + let (pool_pk_opt, pool_ph_opt, pool_key_slice): (Option, Option, &[u8]) = + match pool_kind { + x if x == POS2_POOL_PK => { + let bytes: &[u8; 48] = match unsafe { (pool_key_ptr as *const [u8; 48]).as_ref() } { Some(b) => b, None => return POS2_BAD_POOL_KEY, }; - let pk = match PublicKey::from_bytes(bytes) { - Ok(pk) => pk, - Err(_) => return POS2_BAD_POOL_KEY, - }; - (Some(pk), None, &bytes[..]) - } - x if x == POS2_POOL_PH => { - let bytes: &[u8; 32] = - match unsafe { (pool_key_ptr as *const [u8; 32]).as_ref() } { + let pk = match PublicKey::from_bytes(bytes) { + Ok(pk) => pk, + Err(_) => return POS2_BAD_POOL_KEY, + }; + (Some(pk), None, &bytes[..]) + } + x if x == POS2_POOL_PH => { + let bytes: &[u8; 32] = match unsafe { (pool_key_ptr as *const [u8; 32]).as_ref() } { Some(b) => b, None => return POS2_BAD_POOL_KEY, }; - let ph: Bytes32 = (*bytes).into(); - (None, Some(ph), &bytes[..]) - } - _ => return POS2_BAD_POOL_KIND, - }; + let ph: Bytes32 = (*bytes).into(); + (None, Some(ph), &bytes[..]) + } + _ => return POS2_BAD_POOL_KIND, + }; let master_sk = SecretKey::from_seed(seed); - let local_sk = master_sk_to_local_sk(&master_sk); - let local_pk = local_sk.public_key(); + let local_sk = master_sk_to_local_sk(&master_sk); + let local_pk = local_sk.public_key(); let include_taproot = pool_ph_opt.is_some(); let plot_pk = generate_plot_public_key(&local_pk, &farmer_pk, include_taproot); @@ -185,11 +179,7 @@ pub unsafe extern "C" fn pos2_keygen_derive_plot( std::ptr::copy_nonoverlapping(plot_id.as_ref().as_ptr(), out_plot_id, 32); let dst = out_memo_buf; std::ptr::copy_nonoverlapping(pool_key_slice.as_ptr(), dst, pool_key_slice.len()); - std::ptr::copy_nonoverlapping( - farmer_pk_bytes.as_ptr(), - dst.add(pool_key_slice.len()), - 48, - ); + std::ptr::copy_nonoverlapping(farmer_pk_bytes.as_ptr(), dst.add(pool_key_slice.len()), 48); std::ptr::copy_nonoverlapping( master_sk_bytes.as_ptr(), dst.add(pool_key_slice.len() + 48), @@ -223,7 +213,7 @@ pub unsafe extern "C" fn pos2_keygen_decode_address( // bech32 0.11: decode returns (Hrp, Vec) with the 8-bit payload. 
let (hrp, data) = match bech32::decode(s) { - Ok(x) => x, + Ok(x) => x, Err(_) => return POS2_BAD_ADDRESS, }; let h = hrp.as_str(); @@ -251,7 +241,7 @@ pub unsafe extern "C" fn pos2_keygen_decode_address( pub unsafe extern "C" fn pos2_keygen_derive_subseed( base_seed: *const u8, // 32 bytes idx: u64, - out_seed: *mut u8, // 32 bytes + out_seed: *mut u8, // 32 bytes ) -> i32 { use sha2::{Digest, Sha256}; if base_seed.is_null() || out_seed.is_null() { @@ -275,19 +265,23 @@ mod tests { // Same inputs must produce identical plot_id + memo. #[test] fn deterministic_same_seed() { - let seed = [0xAA_u8; 32]; + let seed = [0xAA_u8; 32]; let farmer_pk = SecretKey::from_seed(&[0xBB_u8; 32]).public_key().to_bytes(); - let pool_ph = [0xCC_u8; 32]; + let pool_ph = [0xCC_u8; 32]; let mut pid1 = [0u8; 32]; let mut memo1 = vec![0u8; 128]; let mut mlen1: usize = memo1.len(); let rc1 = unsafe { pos2_keygen_derive_plot( - seed.as_ptr(), seed.len(), + seed.as_ptr(), + seed.len(), farmer_pk.as_ptr(), - pool_ph.as_ptr(), POS2_POOL_PH, - 2, 0, 0, + pool_ph.as_ptr(), + POS2_POOL_PH, + 2, + 0, + 0, pid1.as_mut_ptr(), memo1.as_mut_ptr(), &mut mlen1, @@ -301,10 +295,14 @@ mod tests { let mut mlen2: usize = memo2.len(); let rc2 = unsafe { pos2_keygen_derive_plot( - seed.as_ptr(), seed.len(), + seed.as_ptr(), + seed.len(), farmer_pk.as_ptr(), - pool_ph.as_ptr(), POS2_POOL_PH, - 2, 0, 0, + pool_ph.as_ptr(), + POS2_POOL_PH, + 2, + 0, + 0, pid2.as_mut_ptr(), memo2.as_mut_ptr(), &mut mlen2, diff --git a/scripts/build-container.sh b/scripts/build-container.sh new file mode 100755 index 0000000..439699d --- /dev/null +++ b/scripts/build-container.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +# +# build-container.sh — auto-detect GPU vendor on the host and run the +# matching `podman compose build ` with the right env vars. +# +# Container builds can't probe the GPU themselves (no device access), +# so this script does it from the host before invoking compose. +# +# Usage: +# ./scripts/build-container.sh # auto-detect +# ./scripts/build-container.sh --gpu nvidia # force NVIDIA +# ./scripts/build-container.sh --gpu amd # force AMD +# ./scripts/build-container.sh --gpu intel # force Intel +# ./scripts/build-container.sh --gpu cpu # CPU-only (AdaptiveCpp OpenMP) +# ./scripts/build-container.sh --no-cache # force clean rebuild +# ./scripts/build-container.sh --engine docker # use docker compose instead + +set -euo pipefail + +ENGINE=podman +GPU="" +declare -a EXTRA_BUILD_ARGS=() + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu) GPU="$2"; shift 2 ;; + --engine) ENGINE="$2"; shift 2 ;; + # Force a clean rebuild (ignore podman/docker layer cache). Useful + # after a host upgrade (new nvcc / new AdaptiveCpp release / etc.) + # where the cached layers reference stale toolchain versions. + --no-cache) EXTRA_BUILD_ARGS+=("--no-cache"); shift 1 ;; + -h|--help) sed -n '2,/^$/p' "$0" | sed 's/^# \?//'; exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +# ── Detect vendor ─────────────────────────────────────────────────────────── +# Capture output first so `set -o pipefail` doesn't bite us — rocminfo and +# some nvidia-smi configurations exit non-zero even when they print useful +# information, and the pipefail bash setting then makes the entire pipeline +# return non-zero regardless of grep's match status. 
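For illustration (not part of this patch), the failure mode the comment above describes, next to the capture-first pattern the script actually uses:

    # Fragile: some rocminfo / nvidia-smi setups exit non-zero even after
    # printing a usable line, so under `set -o pipefail` the pipeline reports
    # failure regardless of what grep matched, i.e. a false negative.
    if rocminfo | grep -q gfx; then GPU=amd; fi

    # Capture-first: tolerate the probe's exit status, then match the text.
    rocm_out=$(rocminfo 2>/dev/null || true)
    if [[ "$rocm_out" == *gfx* ]]; then GPU=amd; fi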
+if [[ -z "$GPU" ]]; then + nvidia_out="" + rocm_out="" + if command -v nvidia-smi >/dev/null; then + nvidia_out=$(nvidia-smi -L 2>/dev/null || true) + fi + if command -v rocminfo >/dev/null; then + rocm_out=$(rocminfo 2>/dev/null || true) + fi + + if [[ "$nvidia_out" == *GPU* ]]; then + GPU=nvidia + elif [[ "$rocm_out" == *gfx* ]]; then + GPU=amd + else + echo "[build-container] No GPU detected via nvidia-smi or rocminfo." >&2 + echo "[build-container]" >&2 + echo "[build-container] Either:" >&2 + echo "[build-container] 1. Run scripts/install-container-deps.sh, which installs the" >&2 + echo "[build-container] discovery tool (nvidia-smi / rocminfo) along with the" >&2 + echo "[build-container] container engine + GPU runtime." >&2 + echo "[build-container] 2. Install the discovery tool manually:" >&2 + echo "[build-container] Arch: sudo pacman -S nvidia-utils (NVIDIA)" >&2 + echo "[build-container] sudo pacman -S rocminfo (AMD)" >&2 + echo "[build-container] Ubuntu: sudo apt install nvidia-utils-XXX (NVIDIA)" >&2 + echo "[build-container] sudo apt install rocminfo (AMD)" >&2 + echo "[build-container] 3. Force a service explicitly:" >&2 + echo "[build-container] $0 --gpu nvidia | amd | intel" >&2 + echo "[build-container] 4. Or build a CPU-only image (slow plotting, no GPU needed):" >&2 + echo "[build-container] $0 --gpu cpu" >&2 + exit 1 + fi +fi + +# ── Map vendor → compose service + env ────────────────────────────────────── +case "$GPU" in + nvidia) + SERVICE=cuda + # Enumerate ALL GPUs and build a fat binary (CMake's "61;86" + # list syntax) so heterogeneous rigs (e.g. 1070 + 3060) get + # native sm_NN codegen for each card, not just whichever one + # nvidia-smi happened to list first. Single-card hosts produce + # a single-arch list ("89") — same end result as the prior + # head -1 path. Skip the probe entirely if the user pre-set + # CUDA_ARCH (single arch or "61;86" list) so cross-targeting + # an absent GPU still works. + if [[ -z "${CUDA_ARCH:-}" ]] && command -v nvidia-smi >/dev/null; then + # sed first (strip the dot), then sort -un (numeric dedup). + # Without the numeric sort, 1070+5090 would emit "120;61" + # because sort -u defaults to lexicographic. + caps=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null \ + | sed 's/\.//' | sort -un) + if [[ -n "$caps" ]]; then + # Split assignment from export so a non-zero exit from the + # subshell pipeline propagates instead of being masked by + # `export`'s own success (shellcheck SC2155). + CUDA_ARCH=$(echo "$caps" | paste -sd';') + export CUDA_ARCH + fi + fi + : "${CUDA_ARCH:=89}" + export CUDA_ARCH + # Min arch drives the toolkit choice: a 1070+3060 mix needs a + # toolchain that targets sm_61, not just sm_86. Works for + # single-arch CUDA_ARCH=89 (min=89) and for user-set lists + # like "61;86" (min=61). + min_arch=$(echo "$CUDA_ARCH" | tr ';' '\n' | sort -n | head -1) + # CUDA 13.0 dropped codegen for sm_50/52/53/60/61/62/70/72 entirely + # — its nvcc fails the CMake TryCompile probe with "Unsupported gpu + # architecture 'compute_61'" on Pascal, "compute_70" on Volta, etc. + # Pin builds with ANY pre-Turing card to the last 12.x dev image, + # which still covers sm_50 (Maxwell) through sm_120 (Blackwell), so + # a mixed 1070+3060 (or 1070+5090) rig gets one toolchain that + # handles every arch in the list. Honour an explicit BASE_DEVEL / + # BASE_RUNTIME override from the env so users can pin to a + # different toolkit if they need to. 
+ if (( min_arch < 75 )) && [[ -z "${BASE_DEVEL:-}" ]]; then + export BASE_DEVEL="docker.io/nvidia/cuda:12.9.1-devel-ubuntu24.04" + export BASE_RUNTIME="${BASE_RUNTIME:-$BASE_DEVEL}" + echo "[build-container] sm_${min_arch} (pre-Turing) detected → pinning CUDA 12.9 base (CUDA 13.x dropped sub-Turing codegen)" + fi + echo "[build-container] vendor=nvidia service=$SERVICE CUDA_ARCH=$CUDA_ARCH" + ;; + amd) + SERVICE=rocm + # Reuse the rocminfo output captured during vendor detection (or + # capture it now if --gpu amd was forced and rocm_out is empty). + # Avoid `rocminfo | awk '...; exit'` because awk's early exit + # SIGPIPEs rocminfo, and pipefail + set -e then kills the script. + if [[ -z "${rocm_out:-}" ]] && command -v rocminfo >/dev/null; then + rocm_out=$(rocminfo 2>/dev/null || true) + fi + # Honour an explicit ACPP_GFX from the env first (lets the user + # cross-target a different GPU than the host one), else autodetect. + if [[ -z "${ACPP_GFX:-}" ]]; then + if [[ -n "${rocm_out:-}" && "$rocm_out" =~ (gfx[0-9a-f]+) ]]; then + detected_gfx="${BASH_REMATCH[1]}" + # RDNA1 workaround: gfx1010/1011/1012 aren't direct + # AdaptiveCpp HIP targets. Community-tested (Radeon Pro + # W5700) that gfx1013 is ISA-close enough to run on + # gfx1010 silicon. Not parity-validated. + case "$detected_gfx" in + gfx1010|gfx1011|gfx1012) + echo "[build-container] RDNA1 $detected_gfx detected — " \ + "using gfx1013 spoof (community workaround, not " \ + "parity-validated; verify plots with \`xchplot2 " \ + "verify\` before farming)" >&2 + export ACPP_GFX=gfx1013 + ;; + *) + export ACPP_GFX="$detected_gfx" + ;; + esac + fi + fi + if [[ -z "${ACPP_GFX:-}" ]]; then + # No silent fallback: a wrong gfx target produces an image that + # builds clean and runs without errors, but the AOT amdgcn ISA + # is for the wrong arch and the SYCL kernels execute as silent + # no-ops at runtime (sort returns input unchanged, AES match + # finds zero results, plot output diverges from reference). + # Fail loud here instead. + echo "[build-container] ERROR: couldn't detect AMD gfx target." >&2 + echo "[build-container] Either install rocminfo so the host probe finds it," >&2 + echo "[build-container] or set ACPP_GFX explicitly to your card's arch:" >&2 + echo "[build-container] ACPP_GFX=gfx1012 $0 --gpu amd # RX 5500 XT 4GB (RDNA1 — auto-spoofed to gfx1013)" >&2 + echo "[build-container] ACPP_GFX=gfx1030 $0 --gpu amd # RX 6800 / 6800 XT / 6900 XT" >&2 + echo "[build-container] ACPP_GFX=gfx1031 $0 --gpu amd # RX 6700 XT / 6700 / 6800M" >&2 + echo "[build-container] ACPP_GFX=gfx1034 $0 --gpu amd # RX 6500 XT / 6400 (4 GiB → minimal tier)" >&2 + echo "[build-container] ACPP_GFX=gfx1100 $0 --gpu amd # RX 7900 XTX / XT" >&2 + echo "[build-container] (run \"rocminfo | grep gfx\" if available)" >&2 + exit 1 + fi + echo "[build-container] vendor=amd service=$SERVICE ACPP_GFX=$ACPP_GFX" + ;; + intel) + SERVICE=intel + echo "[build-container] vendor=intel service=$SERVICE (experimental, untested)" + ;; + cpu) + # CPU-only build: AdaptiveCpp's OpenMP backend, no GPU at runtime. + # Useful for headless CI, dev machines without a GPU, or as a + # secondary worker on a `--devices` list alongside real GPUs. + # Plotting throughput will be 1-2 orders of magnitude lower than + # GPU — see README's CPU section for the perf expectations. 
+ SERVICE=cpu + echo "[build-container] vendor=cpu service=$SERVICE (AdaptiveCpp OpenMP backend; slow plotting, see README)" + ;; + *) + echo "unknown --gpu value: $GPU (expected nvidia|amd|intel|cpu)" >&2 + exit 1 + ;; +esac + +# ── Invoke compose ────────────────────────────────────────────────────────── +case "$ENGINE" in + podman) COMPOSE=(podman compose) ;; + docker) COMPOSE=(docker compose) ;; + *) echo "unknown --engine: $ENGINE (expected podman|docker)" >&2; exit 1 ;; +esac + +set -x +"${COMPOSE[@]}" build "${EXTRA_BUILD_ARGS[@]}" "$SERVICE" diff --git a/scripts/install-container-deps.sh b/scripts/install-container-deps.sh new file mode 100755 index 0000000..edb60a5 --- /dev/null +++ b/scripts/install-container-deps.sh @@ -0,0 +1,489 @@ +#!/usr/bin/env bash +# +# install-container-deps.sh — bootstrap the host packages required to +# build & run xchplot2's container images via scripts/build-container.sh. +# +# Native build deps (CUDA Toolkit, ROCm SDK, LLVM 18+, AdaptiveCpp, +# Boost.Context, libnuma, libomp, Rust) all live INSIDE the container +# image — the host does not need any of them. This script only +# installs: +# 1. A container engine + compose plugin: `podman` + `podman-compose` +# (default), or `docker` + the `docker compose` v2 plugin via +# `--engine docker`. +# 2. The GPU discovery tool used by build-container.sh's autodetect +# (`nvidia-smi` for NVIDIA, `rocminfo` for AMD). build-container.sh +# *errors* on AMD if ACPP_GFX can't be resolved, so rocminfo isn't +# strictly optional unless you pass ACPP_GFX through the env. +# 3. The GPU container runtime: `nvidia-container-toolkit` + a CDI +# spec at /etc/cdi/nvidia.yaml (podman) or the docker runtime hook +# (docker) for NVIDIA. AMD and Intel only need /dev/kfd | /dev/dri +# access via the `video` and `render` groups; this script adds +# the invoking user to both. +# +# For NATIVE host builds (no container) use scripts/install-deps.sh +# instead — that path needs the full CUDA / ROCm / LLVM / AdaptiveCpp +# stack on the host and takes 30-45 min on a first run. +# +# Usage: +# scripts/install-container-deps.sh # auto-detect distro + GPU +# scripts/install-container-deps.sh --gpu nvidia +# scripts/install-container-deps.sh --gpu amd +# scripts/install-container-deps.sh --gpu intel +# scripts/install-container-deps.sh --gpu cpu # engine only, no GPU runtime +# scripts/install-container-deps.sh --engine docker # docker instead of podman +# scripts/install-container-deps.sh --no-nvidia-repo # skip adding NVIDIA's apt/dnf repo +# scripts/install-container-deps.sh --dry-run # print the plan, change nothing +# +# Supported distros: Arch family, Ubuntu/Debian, Fedora/RHEL. + +set -euo pipefail + +ENGINE=podman +GPU="" +ADD_NVIDIA_REPO=1 +DRY_RUN=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu) GPU="$2"; shift 2 ;; + --engine) ENGINE="$2"; shift 2 ;; + --no-nvidia-repo) ADD_NVIDIA_REPO=0; shift ;; + --dry-run) DRY_RUN=1; shift ;; + -h|--help) sed -n '2,/^$/p' "$0" | sed 's/^# \?//'; exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +case "$ENGINE" in + podman|docker) ;; + *) echo "[install-container-deps] unknown --engine: $ENGINE (expected podman|docker)" >&2; exit 1 ;; +esac + +# ── Helpers ───────────────────────────────────────────────────────────────── +# In dry-run mode every mutating call is replaced with a `+ sudo …` stub; +# probes (`command -v`, `[[ -f ]]`, etc.) still run as normal because they +# don't change host state and the planning logic depends on them. 
The `+ ` +# prefix mirrors `set -x`'s syntax so dry-run output reads as an executable +# trace. +sudo_or_dry() { + if (( DRY_RUN )); then + printf '+ sudo %s\n' "$*" + else + sudo "$@" + fi +} + +apt_update_or_dry() { + if (( DRY_RUN )); then + printf '+ sudo apt-get update\n' + else + sudo apt-get update + fi +} + +# Curl-piped-to-(sudo tee | sudo gpg --dearmor) write. Records "+ write +# DEST (from URL)" in dry-run mode. `mode=dearmor` covers the apt +# gpgkey path; default mode is plain tee. +write_url_or_dry() { + local url="$1" dest="$2" mode="${3:-cat}" + if (( DRY_RUN )); then + case "$mode" in + dearmor) printf '+ write %s (gpg --dearmor from %s)\n' "$dest" "$url" ;; + *) printf '+ write %s (from %s)\n' "$dest" "$url" ;; + esac + return + fi + case "$mode" in + dearmor) + curl -fsSL "$url" \ + | sudo gpg --batch --yes --dearmor -o "$dest" + ;; + *) + curl -fsSL "$url" | sudo tee "$dest" >/dev/null + ;; + esac +} + +# ── Detect distro ─────────────────────────────────────────────────────────── +if [[ ! -f /etc/os-release ]]; then + echo "[install-container-deps] Cannot detect distro: /etc/os-release missing" >&2 + exit 1 +fi +# shellcheck source=/dev/null +. /etc/os-release +DISTRO=$ID +DISTRO_LIKE=${ID_LIKE:-} + +# ── Detect GPU vendor ─────────────────────────────────────────────────────── +# Two-tier strategy mirroring install-deps.sh: tool-based first (authoritative +# when the driver is loaded), PCI vendor-ID fallback (works pre-driver). The +# driver tools cannot be a hard prerequisite because installing them is one +# of the things this script is supposed to do. +detect_gpu_via_pci() { + local found="" entry name vendor + for entry in /sys/class/drm/card*; do + name=$(basename "$entry") + # Skip connector entries like card0-DP-1; only the bare cardN + # nodes carry a `device/vendor` attribute we can read. + [[ "$name" =~ ^card[0-9]+$ ]] || continue + [[ -r "$entry/device/vendor" ]] || continue + vendor=$(cat "$entry/device/vendor" 2>/dev/null) + case "$vendor" in + 0x10de) found="nvidia"; break ;; # highest precedence + 0x1002) found="amd" ;; # overrides intel + 0x8086) [[ -z "$found" ]] && found="intel" ;; # only if nothing else + esac + done + echo "$found" +} + +# Skip autodetect under --dry-run — CI containers have no GPU, and tests +# always pass --gpu explicitly. Avoids "could not auto-detect" exit on +# headless runners. +if [[ -z "$GPU" ]] && (( ! DRY_RUN )); then + if command -v nvidia-smi >/dev/null && nvidia-smi -L 2>/dev/null | grep -q GPU; then + GPU=nvidia + echo "[install-container-deps] Detected NVIDIA GPU (nvidia-smi)." + elif command -v rocminfo >/dev/null && rocminfo 2>/dev/null | grep -q gfx; then + GPU=amd + echo "[install-container-deps] Detected AMD GPU (rocminfo)." + else + GPU=$(detect_gpu_via_pci) + if [[ -n "$GPU" ]]; then + echo "[install-container-deps] Detected $GPU GPU via /sys/class/drm (PCI vendor ID); driver tools not yet installed." + fi + fi +fi + +if [[ -z "$GPU" ]]; then + if (( DRY_RUN )); then + echo "[install-container-deps] --dry-run requires --gpu to be set explicitly" >&2 + else + echo "[install-container-deps] Could not auto-detect a GPU. Pass" >&2 + echo "[install-container-deps] --gpu nvidia | amd | intel | cpu" >&2 + echo "[install-container-deps] explicitly. Use --gpu cpu for a GPU-less host" >&2 + echo "[install-container-deps] (CPU-only image; slow plotting, see README)." 
>&2 + fi + exit 1 +fi + +case "$GPU" in + nvidia|amd|intel|cpu) ;; + *) echo "[install-container-deps] unknown --gpu: $GPU (expected nvidia|amd|intel|cpu)" >&2; exit 1 ;; +esac + +echo "[install-container-deps] distro=$DISTRO, gpu=$GPU, engine=$ENGINE" + +# ── Per-distro packages ───────────────────────────────────────────────────── +install_arch() { + local pkgs=() + case "$ENGINE" in + podman) pkgs+=(podman podman-compose) ;; + docker) pkgs+=(docker docker-compose docker-buildx) ;; + esac + case "$GPU" in + # nvidia-utils provides nvidia-smi (used by build-container.sh's + # CUDA_ARCH probe). nvidia-container-toolkit provides nvidia-ctk + + # the CDI / runtime hook libraries for GPU pass-through. + nvidia) pkgs+=(nvidia-utils nvidia-container-toolkit) ;; + # rocminfo: build-container.sh fails fast on AMD if ACPP_GFX can't + # be resolved from rocminfo (compose.yaml's ACPP_TARGETS default + # is a deliberately invalid placeholder so wrong-arch builds fail + # loudly instead of silently producing no-op kernels). + # No ROCm SDK on the host — that lives inside the container. + amd) pkgs+=(rocminfo) ;; + esac + sudo_or_dry pacman -S --needed --noconfirm "${pkgs[@]}" +} + +install_apt() { + apt_update_or_dry + + local pkgs=() + case "$ENGINE" in + # podman-compose lags upstream on LTS but covers what + # build-container.sh exercises (build/run, no fancy flags). + podman) pkgs+=(podman podman-compose) ;; + # docker.io = Ubuntu's stock dockerd. The compose v2 plugin name + # varies (24.04: docker-compose-v2 in universe; via Docker's + # official repo: docker-compose-plugin). Resolved below. + docker) pkgs+=(docker.io docker-buildx) ;; + esac + case "$GPU" in + nvidia) + # nvidia-utils-XXX is suffixed with the loaded driver branch. + # If a driver is already loaded, pin the matching utils branch + # via /proc/driver/nvidia/version. If no driver is loaded, skip + # — nvidia-container-toolkit still works without nvidia-smi, + # it just means build-container.sh can't autodetect CUDA_ARCH. + local drv_major="" + if (( DRY_RUN )); then + # Use a placeholder so dry-run output stays deterministic + # regardless of whether the runner has a driver loaded. + drv_major="" + elif [[ -r /proc/driver/nvidia/version ]]; then + drv_major=$(grep -oE '[0-9]+\.[0-9]+' /proc/driver/nvidia/version 2>/dev/null \ + | head -1 | cut -d. -f1) + fi + if [[ -n "$drv_major" ]]; then + pkgs+=("nvidia-utils-$drv_major") + else + echo "[install-container-deps] No loaded NVIDIA driver detected via" >&2 + echo "[install-container-deps] /proc/driver/nvidia/version. Skipping" >&2 + echo "[install-container-deps] nvidia-utils-* — install your driver" >&2 + echo "[install-container-deps] first, or pass --gpu nvidia + CUDA_ARCH" >&2 + echo "[install-container-deps] manually to build-container.sh." >&2 + fi + ;; + amd) pkgs+=(rocminfo) ;; + esac + sudo_or_dry apt-get install -y --no-install-recommends "${pkgs[@]}" + + # Docker compose v2 plugin: the package name varies by source. + # `docker-compose-v2` ships in 24.04+ universe; `docker-compose-plugin` + # ships in Docker's official deb repo. Both install the same binary at + # /usr/libexec/docker/cli-plugins/docker-compose. build-container.sh + # uses the v2 `docker compose ` syntax, so we MUST install one + # of these two — the legacy v1 `docker-compose` (Python) won't work. + if [[ "$ENGINE" == docker ]]; then + local compose_pkg="docker-compose-v2" + if (( ! 
DRY_RUN )); then + compose_pkg="" + for cand in docker-compose-v2 docker-compose-plugin; do + if apt-cache show "$cand" >/dev/null 2>&1; then + compose_pkg="$cand"; break + fi + done + if [[ -z "$compose_pkg" ]]; then + echo "[install-container-deps] No compose v2 package available in apt." >&2 + echo "[install-container-deps] Add Docker's official repo for docker-compose-plugin:" >&2 + echo "[install-container-deps] https://docs.docker.com/engine/install/ubuntu/" >&2 + echo "[install-container-deps] Or use --engine podman (default; tested with compose.yaml)." >&2 + exit 1 + fi + fi + sudo_or_dry apt-get install -y --no-install-recommends "$compose_pkg" + fi + + # nvidia-container-toolkit isn't in stock Ubuntu/Debian repos. Pull it + # from NVIDIA's official apt repo (the path NVIDIA's own docs use). + if [[ "$GPU" == nvidia ]]; then + if [[ $ADD_NVIDIA_REPO -eq 1 ]] \ + && { (( DRY_RUN )) || [[ ! -f /etc/apt/sources.list.d/nvidia-container-toolkit.list ]]; }; then + echo "[install-container-deps] Adding NVIDIA's container-toolkit apt repo to /etc/apt/sources.list.d/." + sudo_or_dry install -m 0755 -d /usr/share/keyrings + write_url_or_dry \ + https://nvidia.github.io/libnvidia-container/gpgkey \ + /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + dearmor + # The repo file gets a sed transform to inject signed-by= ; + # in dry-run we record the URL → dest, which is the bit + # users actually care about. + if (( DRY_RUN )); then + write_url_or_dry \ + https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + /etc/apt/sources.list.d/nvidia-container-toolkit.list + else + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list >/dev/null + fi + apt_update_or_dry + fi + sudo_or_dry apt-get install -y --no-install-recommends nvidia-container-toolkit + fi +} + +install_dnf() { + local pkgs=() + case "$ENGINE" in + podman) + # Fedora's first-class engine — both packages are in the stock + # repos (podman is the default container tool on Fedora 36+). + pkgs+=(podman podman-compose) + ;; + docker) + # docker isn't in Fedora/RHEL stock repos; the user has to add + # docker-ce.repo per Docker's docs first. Bail rather than + # silently fail mid-install. Skip the precondition check in + # dry-run so the planning output stays useful even in CI + # containers that haven't added the repo. + if (( ! DRY_RUN )); then + if ! sudo dnf list --installed docker-ce >/dev/null 2>&1 \ + && ! sudo dnf list --installed docker >/dev/null 2>&1; then + echo "[install-container-deps] Docker is not in Fedora/RHEL stock repos." >&2 + echo "[install-container-deps] Add docker-ce.repo per Docker's docs first," >&2 + echo "[install-container-deps] then re-run this script. Or use --engine podman" >&2 + echo "[install-container-deps] (default; Fedora's first-class engine)." >&2 + exit 1 + fi + fi + pkgs+=(docker-compose-plugin docker-buildx-plugin) + ;; + esac + case "$GPU" in + nvidia) + # Hint only — Fedora's nvidia driver lives in RPMFusion and + # auto-enabling third-party repos behind the user's back is + # rude. nvidia-container-toolkit (added below) comes from + # NVIDIA's own repo, which is already a precedent set by + # NVIDIA's docs. + if (( ! DRY_RUN )) && ! 
command -v nvidia-smi >/dev/null; then + echo "[install-container-deps] WARNING: nvidia-smi not on PATH." >&2 + echo "[install-container-deps] Enable RPMFusion + install akmod-nvidia (or" >&2 + echo "[install-container-deps] akmod-nvidia-open) for the host driver, or" >&2 + echo "[install-container-deps] pass --gpu nvidia + CUDA_ARCH manually." >&2 + fi + ;; + amd) pkgs+=(rocminfo) ;; + esac + if [[ ${#pkgs[@]} -gt 0 ]]; then + sudo_or_dry dnf install -y "${pkgs[@]}" + fi + + if [[ "$GPU" == nvidia ]]; then + if [[ $ADD_NVIDIA_REPO -eq 1 ]] \ + && { (( DRY_RUN )) || [[ ! -f /etc/yum.repos.d/nvidia-container-toolkit.repo ]]; }; then + echo "[install-container-deps] Adding NVIDIA's container-toolkit dnf repo to /etc/yum.repos.d/." + write_url_or_dry \ + https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ + /etc/yum.repos.d/nvidia-container-toolkit.repo + fi + sudo_or_dry dnf install -y nvidia-container-toolkit + fi +} + +# ── Distro-agnostic post-install (NVIDIA only) ────────────────────────────── +configure_nvidia_runtime() { + if (( ! DRY_RUN )) && ! command -v nvidia-ctk >/dev/null; then + echo "[install-container-deps] WARNING: nvidia-ctk not on PATH — skipping CDI / runtime setup." >&2 + return + fi + case "$ENGINE" in + podman) + # CDI spec at /etc/cdi/nvidia.yaml lets `--device nvidia.com/gpu=all` + # (and the `deploy.resources.reservations.devices` shorthand in + # compose.yaml's cuda service) resolve to real GPUs. Re-run after + # driver upgrades — the spec hard-codes device file paths. + sudo_or_dry install -m 0755 -d /etc/cdi + sudo_or_dry nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml + echo "[install-container-deps] Generated CDI spec at /etc/cdi/nvidia.yaml." + # nvidia-ctk's "discoverer" enumerates every NVIDIA-related path + # the driver could expose — Vulkan ICDs, X11 configs, the + # fabric-manager / MPS / IMEX sockets, etc. — and prints WARN + # lines for ones it can't find. On any non-server, headless + # GPU host most of these won't be present; the spec gracefully + # omits them. Tell the user up front so the WARN volume on the + # next line doesn't look like a failure. + echo "[install-container-deps] (WARNings about libnvidia-vulkan-producer / X11 configs /" + echo "[install-container-deps] fabric-manager / MPS / IMEX from nvidia-ctk are expected on" + echo "[install-container-deps] non-server hosts — those are optional features the spec" + echo "[install-container-deps] gracefully omits when not present.)" + ;; + docker) + # Writes /etc/docker/daemon.json's `runtimes.nvidia` entry + + # restarts dockerd so the change takes effect. + sudo_or_dry nvidia-ctk runtime configure --runtime=docker + if (( DRY_RUN )); then + printf '+ sudo systemctl restart docker\n' + else + sudo systemctl restart docker || true + fi + echo "[install-container-deps] Configured docker NVIDIA runtime + restarted dockerd." + ;; + esac +} + +# ── Distro-agnostic post-install (AMD / Intel) ────────────────────────────── +# /dev/kfd (AMD) and /dev/dri (AMD + Intel) are group-owned by `video` (and +# `render` on newer udev/systemd setups). Add the invoking user to both so +# rootless containers can pass the device through. Effective on next login. +add_user_to_video_render_groups() { + local target_user + if (( DRY_RUN )); then + # Stable placeholder so the fixture doesn't depend on $USER. 
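As an aside on the membership test further down in this function: it splits the `id -nG` output into lines and uses `grep -qx` because a plain substring grep can false-positive on group names that merely contain the target (the group names below are made up for illustration):

    groups_out="wheel video-extra docker"                 # hypothetical `id -nG` output
    echo "$groups_out" | grep -q  video                   # matches: wrong, user is not in `video`
    echo "$groups_out" | tr ' ' '\n' | grep -qx video     # no match: correct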
+ target_user="" + else + target_user="${SUDO_USER:-${USER:-}}" + if [[ -z "$target_user" || "$target_user" == root ]]; then + echo "[install-container-deps] Skipping group membership (no non-root user detected)." + return + fi + fi + for grp in video render; do + if (( ! DRY_RUN )); then + getent group "$grp" >/dev/null 2>&1 || continue + if id -nG "$target_user" | tr ' ' '\n' | grep -qx "$grp"; then + continue + fi + fi + sudo_or_dry usermod -aG "$grp" "$target_user" + echo "[install-container-deps] Added $target_user to group $grp (re-login to apply)." + done +} + +# ── Enable docker daemon when applicable ──────────────────────────────────── +enable_docker_service() { + [[ "$ENGINE" == docker ]] || return 0 + if (( ! DRY_RUN )); then + command -v systemctl >/dev/null || return 0 + fi + if (( DRY_RUN )); then + printf '+ sudo systemctl enable --now docker.service\n' + else + sudo systemctl enable --now docker.service || true + fi +} + +# ── Distro dispatch ───────────────────────────────────────────────────────── +case "$DISTRO" in + arch|cachyos|manjaro|endeavouros) install_arch ;; + ubuntu|debian|pop|linuxmint) install_apt ;; + fedora|rhel|centos|rocky|almalinux) install_dnf ;; + *) + case "$DISTRO_LIKE" in + *arch*) install_arch ;; + *debian*) install_apt ;; + *rhel*|*fedora*) install_dnf ;; + *) + echo "[install-container-deps] Unknown distro '$DISTRO'. Install equivalents of:" + if [[ "$ENGINE" == podman ]]; then + echo " podman + podman-compose" + else + echo " docker + docker-compose-v2 (or docker-compose-plugin) + docker-buildx" + fi + case "$GPU" in + nvidia) echo " nvidia-container-toolkit (from NVIDIA's repo: https://nvidia.github.io/libnvidia-container/)" ;; + amd) echo " rocminfo (only used by build-container.sh's ACPP_GFX autodetect)" ;; + esac + exit 1 + ;; + esac + ;; +esac + +enable_docker_service + +case "$GPU" in + nvidia) configure_nvidia_runtime ;; + amd|intel) add_user_to_video_render_groups ;; + cpu) : ;; +esac + +# ── Final notes ───────────────────────────────────────────────────────────── +echo +echo "[install-container-deps] Done." +echo " Build the image:" +echo " ./scripts/build-container.sh --engine $ENGINE${GPU:+ --gpu $GPU}" +case "$GPU" in + amd|intel) + echo " If this run added you to the video / render groups, log out" + echo " and back in before running plots — group changes only take" + echo " effect for fresh login sessions." + ;; + nvidia) + echo " After future NVIDIA driver upgrades, re-run this script (or" + echo " re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure" + echo " manually) so the CDI spec / docker runtime hook stays current." + ;; +esac diff --git a/scripts/install-deps.sh b/scripts/install-deps.sh new file mode 100755 index 0000000..8d98085 --- /dev/null +++ b/scripts/install-deps.sh @@ -0,0 +1,309 @@ +#!/usr/bin/env bash +# +# install-deps.sh — bootstrap xchplot2's native build dependencies. +# +# Installs CUDA Toolkit on NVIDIA, ROCm HIP SDK on AMD, LLVM 18+, +# AdaptiveCpp 25.10, and a Rust toolchain via rustup. After this completes, +# you can build with either: +# cargo install --git https://github.com/Jsewill/xchplot2 +# # or: +# cmake -B build -S . 
&& cmake --build build -j +# +# Usage: +# scripts/install-deps.sh # auto-detect distro + GPU +# scripts/install-deps.sh --no-acpp # skip AdaptiveCpp build (use FetchContent) +# scripts/install-deps.sh --gpu amd # force AMD path (CUDA headers only) +# scripts/install-deps.sh --gpu nvidia # force NVIDIA path (full CUDA Toolkit) +# +# Supported distros: Arch family, Ubuntu/Debian, Fedora/RHEL. +# For anything else, install the equivalents listed at the bottom and +# build AdaptiveCpp from source manually. + +set -euo pipefail + +ACPP_REF=${ACPP_REF:-v25.10.0} +ACPP_PREFIX=${ACPP_PREFIX:-/opt/adaptivecpp} +SKIP_ACPP=0 +GPU="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --no-acpp) SKIP_ACPP=1; shift ;; + --gpu) GPU="$2"; shift 2 ;; + -h|--help) sed -n '2,/^$/p' "$0" | sed 's/^# \?//'; exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +# ── Detect distro ─────────────────────────────────────────────────────────── +if [[ ! -f /etc/os-release ]]; then + echo "Cannot detect distro: /etc/os-release missing" >&2 + exit 1 +fi +# shellcheck source=/dev/null +. /etc/os-release +DISTRO=$ID +DISTRO_LIKE=${ID_LIKE:-} + +# ── Detect GPU vendor (NVIDIA / AMD / Intel) ──────────────────────────────── +# Two-tier detection so a fresh OS install (no driver tools yet) still works: +# 1. Tool-based (nvidia-smi / rocminfo) — authoritative when available, +# because it confirms the driver+runtime is functional, not just that +# a card is plugged in. +# 2. PCI vendor ID via /sys/class/drm — works pre-driver. The whole point +# of running install-deps.sh is to install the driver/toolkit, so we +# can't require the driver tools as a prerequisite for detection. +# +# Precedence (when multiple GPUs are present): NVIDIA > AMD > Intel. +# Matches the build.rs vendor-precedence logic. +detect_gpu_via_pci() { + local found="" entry name vendor + for entry in /sys/class/drm/card*; do + name=$(basename "$entry") + # Skip connector entries like card0-DP-1 — only the bare cardN + # nodes have a `device/vendor` attribute we care about. + [[ "$name" =~ ^card[0-9]+$ ]] || continue + [[ -r "$entry/device/vendor" ]] || continue + vendor=$(cat "$entry/device/vendor" 2>/dev/null) + case "$vendor" in + 0x10de) found="nvidia"; break ;; # highest precedence + 0x1002) found="amd" ;; # overrides intel + 0x8086) [[ -z "$found" ]] && found="intel" ;; # only if nothing else + esac + done + echo "$found" +} + +if [[ -z "$GPU" ]]; then + if command -v nvidia-smi >/dev/null && nvidia-smi -L 2>/dev/null | grep -q GPU; then + GPU=nvidia + echo "[install-deps] Detected NVIDIA GPU (nvidia-smi)." + elif command -v rocminfo >/dev/null && rocminfo 2>/dev/null | grep -q gfx; then + GPU=amd + echo "[install-deps] Detected AMD GPU (rocminfo)." + else + GPU=$(detect_gpu_via_pci) + if [[ -n "$GPU" ]]; then + echo "[install-deps] Detected $GPU GPU via /sys/class/drm (PCI vendor ID); driver tools not yet installed." + fi + fi +fi + +if [[ -z "$GPU" ]]; then + echo "[install-deps] Could not auto-detect a GPU (no nvidia-smi / rocminfo," >&2 + echo "[install-deps] no usable PCI device under /sys/class/drm)." >&2 + echo "[install-deps] Pass --gpu nvidia or --gpu amd explicitly to override." >&2 + echo "[install-deps] Headless / CI builds: --gpu nvidia installs the LLVM" >&2 + echo "[install-deps] toolchain + CUDA Toolkit headers used by the SYCL path." >&2 + exit 1 +fi + +if [[ "$GPU" == "intel" ]]; then + echo "[install-deps] Intel GPU detected, but install-deps.sh has no Intel-" >&2 + echo "[install-deps] specific package path yet. 
Options:" >&2 + echo "[install-deps] --gpu nvidia install LLVM + CUDA headers (the SYCL" >&2 + echo "[install-deps] path JITs onto Intel via AdaptiveCpp's" >&2 + echo "[install-deps] generic SSCP target at runtime)" >&2 + echo "[install-deps] ./scripts/build-container.sh container with Intel oneAPI" >&2 + exit 1 +fi +echo "[install-deps] distro=$DISTRO, gpu=$GPU, acpp=${ACPP_REF}, prefix=${ACPP_PREFIX}" + +# ── Per-distro packages ───────────────────────────────────────────────────── +install_arch() { + local pkgs=(cmake git base-devel python ninja + llvm clang lld + boost numactl curl) + case "$GPU" in + nvidia) pkgs+=(cuda) ;; + # rocminfo: needed by build-container.sh + scripts/install-deps.sh + # autodetection (rocm-hip-sdk doesn't pull it transitively). + # No CUDA pkg on the AMD path — CudaHalfShim.hpp guards the CUDA + # headers via __has_include, and pulling CUDA alongside HIP causes + # uchar1/char1 typedef redefinitions. + amd) pkgs+=(rocm-hip-sdk rocm-device-libs rocminfo) ;; + esac + sudo pacman -S --needed --noconfirm "${pkgs[@]}" +} + +install_apt() { + local pkgs=(cmake git ninja-build build-essential python3 pkg-config + llvm-18 llvm-18-dev clang-18 lld-18 libclang-18-dev libclang-cpp18-dev + libboost-context-dev libnuma-dev libomp-18-dev curl ca-certificates) + case "$GPU" in + nvidia) pkgs+=(nvidia-cuda-toolkit) ;; + amd) pkgs+=(rocm-hip-sdk rocm-libs rocminfo) + # rocminfo is the discovery tool build-container.sh probes; + # not pulled in transitively by rocm-hip-sdk. + # No nvidia-cuda-toolkit-headers on the AMD path — + # CudaHalfShim.hpp guards the CUDA headers via + # __has_include, and pulling CUDA alongside HIP causes + # uchar1/char1 typedef redefinitions. + ;; + esac + sudo apt-get update + sudo apt-get install -y --no-install-recommends "${pkgs[@]}" +} + +install_dnf() { + local pkgs=(cmake git ninja-build gcc-c++ python3 pkg-config + llvm llvm-devel clang clang-devel lld + boost-devel numactl-devel libomp-devel curl) + case "$GPU" in + nvidia) pkgs+=(cuda-toolkit) ;; + # No cuda-toolkit on the AMD path — CudaHalfShim.hpp guards the + # CUDA headers via __has_include, and pulling CUDA alongside HIP + # causes uchar1/char1 typedef redefinitions. + amd) pkgs+=(rocm-hip-devel rocminfo) ;; + esac + sudo dnf install -y "${pkgs[@]}" +} + +case "$DISTRO" in + arch|cachyos|manjaro|endeavouros) install_arch ;; + ubuntu|debian|pop|linuxmint) install_apt ;; + fedora|rhel|centos|rocky|almalinux) install_dnf ;; + *) + case "$DISTRO_LIKE" in + *arch*) install_arch ;; + *debian*) install_apt ;; + *rhel*|*fedora*) install_dnf ;; + *) + echo "[install-deps] Unknown distro '$DISTRO'. Install equivalents of:" + echo " CMake ≥ 3.24, Ninja, LLVM 18+, clang 18+, libclang dev," + echo " Boost.Context, libnuma, libomp, Python 3, git," + if [[ "$GPU" == "nvidia" ]]; then + echo " CUDA Toolkit 12+ (with nvcc)" + else + echo " ROCm 6+ HIP SDK (rocm-hip-sdk / rocm-hip-devel)" + fi + echo "Then re-run with --no-acpp to skip pkg install and only build AdaptiveCpp." + exit 1 + ;; + esac + ;; +esac + +# ── Rust toolchain via rustup ─────────────────────────────────────────────── +if ! 
command -v cargo >/dev/null; then + echo "[install-deps] Installing Rust toolchain via rustup" + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --default-toolchain stable --profile minimal + export PATH=$HOME/.cargo/bin:$PATH +fi + +# ── AdaptiveCpp ───────────────────────────────────────────────────────────── +if [[ $SKIP_ACPP -eq 1 ]]; then + echo "[install-deps] Skipping AdaptiveCpp build per --no-acpp." + echo "[install-deps] CMakeLists will FetchContent it automatically (slow first build)." + exit 0 +fi + +if [[ -d "$ACPP_PREFIX" ]] && [[ -f "$ACPP_PREFIX/lib/cmake/AdaptiveCpp/AdaptiveCppConfig.cmake" ]]; then + echo "[install-deps] AdaptiveCpp already installed at $ACPP_PREFIX. Skipping." + exit 0 +fi + +ACPP_BUILD_DIR=$(mktemp -d -t xchplot2-acpp-XXXXXX) +trap 'rm -rf "$ACPP_BUILD_DIR"' EXIT + +# ── Find a compatible LLVM ────────────────────────────────────────────────── +# AdaptiveCpp 25.10 only supports LLVM 16-20. On rolling distros (Arch, +# Fedora rawhide) the system LLVM is often 21+, which AdaptiveCpp rejects +# with "LLVM versions greater than 20 are not yet tested/supported". Probe +# the conventional install prefixes for the newest usable LLVM and pin +# AdaptiveCpp to it explicitly. Fail fast with a distro-specific install +# hint rather than letting AdaptiveCpp's CMake fail mid-configure. +LLVM_ROOT="" +for cand in \ + /usr/lib/llvm-20 /usr/lib/llvm-19 /usr/lib/llvm-18 \ + /usr/lib/llvm-17 /usr/lib/llvm-16 \ + /usr/lib/llvm20 /usr/lib/llvm19 /usr/lib/llvm18 \ + /usr/lib64/llvm20 /usr/lib64/llvm19 /usr/lib64/llvm18 \ + /opt/llvm20 /opt/llvm-20 /opt/llvm19 /opt/llvm-19 \ + /opt/llvm18 /opt/llvm-18; do + if [[ -x "$cand/bin/clang" ]] && [[ -x "$cand/bin/ld.lld" ]]; then + ver=$("$cand/bin/clang" --version 2>/dev/null \ + | head -1 | grep -oE 'version [0-9]+' | grep -oE '[0-9]+') + if [[ -n "$ver" ]] && (( ver >= 16 && ver <= 20 )); then + LLVM_ROOT="$cand" + break + fi + fi +done + +if [[ -z "$LLVM_ROOT" ]]; then + echo "[install-deps] No compatible LLVM (16-20) with ld.lld found." >&2 + echo "[install-deps] AdaptiveCpp $ACPP_REF only supports LLVM 16-20." >&2 + echo "[install-deps] Install one and re-run, or use the container path:" >&2 + case "$DISTRO" in + arch|cachyos|manjaro|endeavouros) + echo " yay -S llvm18-bin lld18-bin # or paru -S, or any AUR helper" >&2 ;; + ubuntu|debian|pop|linuxmint) + echo " sudo apt install llvm-18 llvm-18-dev clang-18 lld-18 libomp-18-dev" >&2 ;; + fedora|rhel|centos|rocky|almalinux) + echo " sudo dnf install llvm18 llvm18-devel clang18 lld18-devel" >&2 ;; + *) + echo " install LLVM 16-20 + clang + ld.lld for your distro" >&2 ;; + esac + echo " ./scripts/build-container.sh # container has LLVM 18 pinned" >&2 + exit 1 +fi +echo "[install-deps] Using LLVM at $LLVM_ROOT for AdaptiveCpp build." + +# ── ROCm device libs path (AMD only) ──────────────────────────────────────── +# AdaptiveCpp's HIP backend needs ockl.bc / ocml.bc to compile kernels for +# amdgcn. The bitcode location moved between ROCm versions; probe the +# common spots. CMake will warn if the path's missing on AMD; without a +# match here, the build fails with "ROCm device library path not found". 
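If the probe described above comes up empty on a nonstandard ROCm layout, the path can be located by hand and fed through the same CMake variable the loop below populates (the `find` invocation here is an illustrative assumption, not something the script runs):

    bc=$(find /opt/rocm* -name ockl.bc 2>/dev/null | head -1)
    if [[ -n "$bc" ]]; then
        echo "configure AdaptiveCpp with -DROCM_DEVICE_LIBS_PATH=$(dirname "$bc")"
    fi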
+ACPP_ROCM_FLAGS=() +if [[ "$GPU" == "amd" ]]; then + for d in \ + /opt/rocm/amdgcn/bitcode \ + /opt/rocm/lib/llvm-amdgpu/amdgcn/bitcode \ + /opt/rocm/share/amdgcn/bitcode; do + if [[ -f "$d/ockl.bc" ]]; then + ACPP_ROCM_FLAGS+=(-DROCM_DEVICE_LIBS_PATH="$d") + echo "[install-deps] ROCm device libs: $d" + break + fi + done +fi + +echo "[install-deps] Building AdaptiveCpp $ACPP_REF in $ACPP_BUILD_DIR" +git clone --depth 1 --branch "$ACPP_REF" \ + https://github.com/AdaptiveCpp/AdaptiveCpp.git "$ACPP_BUILD_DIR/src" + +# AMD-only builds don't need AdaptiveCpp's CUDA backend. Skip the +# `find_package(CUDA)` probe that AdaptiveCpp's CMakeLists runs at +# line ~122: on hosts where a CUDA headers subset is installed (distro +# `cuda` package, JetPack fragments, /usr/lib from some wrappers), the +# probe finds a partial install and AdaptiveCpp's own `FindCUDA.cmake` +# emits `CUDAToolkit_LIBRARY_ROOT /usr/lib does not point to the +# correct directory, try setting it manually`. The warning is cosmetic +# (AdaptiveCpp continues without CUDA), but it looks like an error to +# users skimming the install log. +ACPP_CUDA_DISABLE=() +if [[ "$GPU" == "amd" ]]; then + ACPP_CUDA_DISABLE+=(-DCMAKE_DISABLE_FIND_PACKAGE_CUDA=TRUE) +fi + +cmake -S "$ACPP_BUILD_DIR/src" -B "$ACPP_BUILD_DIR/build" -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="$ACPP_PREFIX" \ + -DCMAKE_C_COMPILER="$LLVM_ROOT/bin/clang" \ + -DCMAKE_CXX_COMPILER="$LLVM_ROOT/bin/clang++" \ + -DLLVM_DIR="$LLVM_ROOT/lib/cmake/llvm" \ + -DACPP_LLD_PATH="$LLVM_ROOT/bin/ld.lld" \ + "${ACPP_CUDA_DISABLE[@]}" \ + "${ACPP_ROCM_FLAGS[@]}" +cmake --build "$ACPP_BUILD_DIR/build" --parallel +sudo cmake --install "$ACPP_BUILD_DIR/build" + +echo +echo "[install-deps] Done." +echo " AdaptiveCpp: $ACPP_PREFIX" +echo " Build xchplot2:" +echo " export CMAKE_PREFIX_PATH=$ACPP_PREFIX:\$CMAKE_PREFIX_PATH" +echo " cargo install --path . # or:" +echo " cmake -B build -S . && cmake --build build -j" diff --git a/scripts/test-multi-gpu.sh b/scripts/test-multi-gpu.sh new file mode 100755 index 0000000..6bb7fb2 --- /dev/null +++ b/scripts/test-multi-gpu.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +# +# test-multi-gpu.sh — smoke test for the --devices flag. +# +# Two passes: +# +# 1. Argument-parsing checks. Runs xchplot2 against an empty manifest +# (run_batch returns before touching the GPU, so these work on any +# host including CI with no GPU visible). +# +# 2. Live multi-device plot, runtime-gated. Skipped automatically when +# < 2 GPUs are enumerable — so single-GPU dev boxes just see the +# parse checks run green, and a 2+ GPU rig exercises the fan-out. +# +# Usage: +# scripts/test-multi-gpu.sh [path/to/xchplot2] +# +# If the path is omitted, falls back to `xchplot2` on PATH (so +# `cargo install --path .` followed by this script works out of the +# box). + +set -u +XCHPLOT2="${1:-$(command -v xchplot2 || true)}" +if [[ -z "$XCHPLOT2" || ! -x "$XCHPLOT2" ]]; then + echo "ERROR: xchplot2 not found. Pass path as \$1 or put it on \$PATH." 
>&2 + exit 1 +fi + +PASS=0; FAIL=0; SKIP=0 +pass() { printf ' \e[32mPASS\e[0m: %s\n' "$1"; PASS=$((PASS+1)); } +fail() { printf ' \e[31mFAIL\e[0m: %s\n' "$1"; FAIL=$((FAIL+1)); } +skip() { printf ' \e[33mSKIP\e[0m: %s\n' "$1"; SKIP=$((SKIP+1)); } + +EMPTY_TSV=$(mktemp -t xchplot2-empty-XXXXXX.tsv) +TMP_OUT=$(mktemp -d -t xchplot2-multigpu-out-XXXXXX) +trap 'rm -rf "$EMPTY_TSV" "$TMP_OUT"' EXIT + +check_accept() { + local desc="$1"; shift + if "$XCHPLOT2" batch "$EMPTY_TSV" "$@" >/dev/null 2>&1; then + pass "accepts $desc" + else + fail "accepts $desc (exit $?)" + fi +} +check_reject() { + local desc="$1"; shift + if ! "$XCHPLOT2" batch "$EMPTY_TSV" "$@" >/dev/null 2>&1; then + pass "rejects $desc" + else + fail "rejects $desc (should have exited nonzero)" + fi +} + +echo "==> --devices argument parsing ($XCHPLOT2)" +check_accept "'all'" --devices all +check_accept "single id '0'" --devices 0 +check_accept "explicit list" --devices 0,1,2 +check_reject "garbage spec" --devices badspec +check_reject "negative id" --devices -1 +check_reject "empty value" --devices "" + +# --- Live multi-GPU plot (runtime-gated) --- +echo "==> multi-device plot" + +# GPU_COUNT source of truth: +# - Explicit override lets a CI / test runner force-skip or force-run. +# - nvidia-smi works on both the main (SYCL+CUDA) and cuda-only branches +# whenever the target GPUs are NVIDIA, which covers every multi-GPU +# rig we realistically expect to hit. AMD-only multi-GPU can use +# `XCHPLOT2_TEST_GPU_COUNT=N scripts/test-multi-gpu.sh`. +GPU_COUNT="${XCHPLOT2_TEST_GPU_COUNT:-}" +if [[ -z "$GPU_COUNT" ]]; then + if command -v nvidia-smi >/dev/null 2>&1; then + GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits 2>/dev/null \ + | head -1 | tr -d ' ' || echo 0) + fi + GPU_COUNT="${GPU_COUNT:-0}" +fi + +if [[ "$GPU_COUNT" -lt 2 ]]; then + skip "need >=2 GPUs (got $GPU_COUNT); set XCHPLOT2_TEST_GPU_COUNT=N to override" +else + # k=22 is the smallest k the pipeline supports; two plots give each + # worker one entry to process under round-robin partition. + # + # We build a MANIFEST with pre-computed plot_id_hex + memo_hex (the + # `batch` subcommand feeds these straight to run_gpu_pipeline) rather + # than invoking `plot` with synthetic BLS keys — pos2_keygen rejects + # anything that isn't a real G1 public key with rc=-1 before the + # pipeline ever sees it. + LIVE_TSV="$TMP_OUT/live.tsv" + printf '22\t2\t0\t0\t0\tabababababababababababababababababababababababababababababababab\t00\t%s\tm1.plot2\n22\t2\t1\t0\t0\tcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd\t00\t%s\tm2.plot2\n' \ + "$TMP_OUT" "$TMP_OUT" > "$LIVE_TSV" + + if "$XCHPLOT2" batch "$LIVE_TSV" --devices 0,1 >"$TMP_OUT/log" 2>&1 + then + # Two output files expected, each starting with the 'pos2' magic. + local_ok=1 + shopt -s nullglob + plots=("$TMP_OUT"/m?.plot2) + if [[ "${#plots[@]}" -ne 2 ]]; then + fail "expected 2 plots, got ${#plots[@]}" + local_ok=0 + else + for p in "${plots[@]}"; do + magic=$(head -c 4 "$p" | tr -d '\0') + if [[ "$magic" != "pos2" ]]; then + fail "bad magic in $(basename "$p"): '$magic'" + local_ok=0 + fi + done + fi + if (( local_ok )); then + pass "wrote 2 k=22 plots across devices 0,1" + fi + else + fail "batch --devices 0,1 failed (see $TMP_OUT/log)" + sed 's/^/ /' "$TMP_OUT/log" + fi + + echo "==> cross-device byte-stability" + # 4-entry manifest exercises round-robin (2 plots per worker on a + # 2-GPU rig). 
Plot output must be byte-identical regardless of + # which worker ran it; if --devices 0 and --devices 0,1 produce + # different SHAs for the same plot_id, the multi-device path has + # introduced non-determinism we shouldn't ship. + SD_DIR="$TMP_OUT/sd" + MD_DIR="$TMP_OUT/md" + mkdir -p "$SD_DIR" "$MD_DIR" + SD_TSV="$TMP_OUT/parity_sd.tsv" + MD_TSV="$TMP_OUT/parity_md.tsv" + { + a64=$(printf '%64s' '' | tr ' ' a) + b64=$(printf '%64s' '' | tr ' ' b) + c64=$(printf '%64s' '' | tr ' ' c) + d64=$(printf '%64s' '' | tr ' ' d) + printf '22\t2\t0\t0\t0\t%s\t00\t%s\tp0.plot2\n' "$a64" "$SD_DIR" + printf '22\t2\t1\t0\t0\t%s\t00\t%s\tp1.plot2\n' "$b64" "$SD_DIR" + printf '22\t2\t2\t0\t0\t%s\t00\t%s\tp2.plot2\n' "$c64" "$SD_DIR" + printf '22\t2\t3\t0\t0\t%s\t00\t%s\tp3.plot2\n' "$d64" "$SD_DIR" + } > "$SD_TSV" + sed "s|$SD_DIR|$MD_DIR|g" "$SD_TSV" > "$MD_TSV" + + if "$XCHPLOT2" batch "$SD_TSV" --devices 0 >"$TMP_OUT/sd.log" 2>&1 \ + && "$XCHPLOT2" batch "$MD_TSV" --devices 0,1 >"$TMP_OUT/md.log" 2>&1 + then + parity_ok=1 + for f in "$SD_DIR"/p?.plot2; do + name=$(basename "$f") + sd_sha=$(sha256sum "$f" | awk '{print $1}') + md_sha=$(sha256sum "$MD_DIR/$name" | awk '{print $1}') + if [[ "$sd_sha" != "$md_sha" ]]; then + fail "byte mismatch on $name (sd=${sd_sha:0:12} md=${md_sha:0:12})" + parity_ok=0 + fi + done + if (( parity_ok )); then + pass "single-device and multi-device produced byte-identical plots" + fi + else + fail "cross-device parity batches failed (logs in $TMP_OUT/sd.log, md.log)" + fi +fi + +echo +printf '==> %d passed, %d failed, %d skipped\n' "$PASS" "$FAIL" "$SKIP" +exit $(( FAIL > 0 ? 1 : 0 )) diff --git a/scripts/test/install-container-deps/arch.txt b/scripts/test/install-container-deps/arch.txt new file mode 100644 index 0000000..058ac4d --- /dev/null +++ b/scripts/test/install-container-deps/arch.txt @@ -0,0 +1,112 @@ +=== engine=podman gpu=nvidia === +[install-container-deps] distro=arch, gpu=nvidia, engine=podman ++ sudo pacman -S --needed --noconfirm podman podman-compose nvidia-utils nvidia-container-toolkit ++ sudo install -m 0755 -d /etc/cdi ++ sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml +[install-container-deps] Generated CDI spec at /etc/cdi/nvidia.yaml. +[install-container-deps] (WARNings about libnvidia-vulkan-producer / X11 configs / +[install-container-deps] fabric-manager / MPS / IMEX from nvidia-ctk are expected on +[install-container-deps] non-server hosts — those are optional features the spec +[install-container-deps] gracefully omits when not present.) + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=podman gpu=amd === +[install-container-deps] distro=arch, gpu=amd, engine=podman ++ sudo pacman -S --needed --noconfirm podman podman-compose rocminfo ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. 
+ +=== engine=podman gpu=intel === +[install-container-deps] distro=arch, gpu=intel, engine=podman ++ sudo pacman -S --needed --noconfirm podman podman-compose ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=cpu === +[install-container-deps] distro=arch, gpu=cpu, engine=podman ++ sudo pacman -S --needed --noconfirm podman podman-compose + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu cpu + +=== engine=docker gpu=nvidia === +[install-container-deps] distro=arch, gpu=nvidia, engine=docker ++ sudo pacman -S --needed --noconfirm docker docker-compose docker-buildx nvidia-utils nvidia-container-toolkit ++ sudo systemctl enable --now docker.service ++ sudo nvidia-ctk runtime configure --runtime=docker ++ sudo systemctl restart docker +[install-container-deps] Configured docker NVIDIA runtime + restarted dockerd. + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=docker gpu=amd === +[install-container-deps] distro=arch, gpu=amd, engine=docker ++ sudo pacman -S --needed --noconfirm docker docker-compose docker-buildx rocminfo ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=intel === +[install-container-deps] distro=arch, gpu=intel, engine=docker ++ sudo pacman -S --needed --noconfirm docker docker-compose docker-buildx ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=cpu === +[install-container-deps] distro=arch, gpu=cpu, engine=docker ++ sudo pacman -S --needed --noconfirm docker docker-compose docker-buildx ++ sudo systemctl enable --now docker.service + +[install-container-deps] Done. 
+ Build the image: + ./scripts/build-container.sh --engine docker --gpu cpu + diff --git a/scripts/test/install-container-deps/fedora.txt b/scripts/test/install-container-deps/fedora.txt new file mode 100644 index 0000000..9fb1a7c --- /dev/null +++ b/scripts/test/install-container-deps/fedora.txt @@ -0,0 +1,118 @@ +=== engine=podman gpu=nvidia === +[install-container-deps] distro=fedora, gpu=nvidia, engine=podman ++ sudo dnf install -y podman podman-compose +[install-container-deps] Adding NVIDIA's container-toolkit dnf repo to /etc/yum.repos.d/. ++ write /etc/yum.repos.d/nvidia-container-toolkit.repo (from https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo) ++ sudo dnf install -y nvidia-container-toolkit ++ sudo install -m 0755 -d /etc/cdi ++ sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml +[install-container-deps] Generated CDI spec at /etc/cdi/nvidia.yaml. +[install-container-deps] (WARNings about libnvidia-vulkan-producer / X11 configs / +[install-container-deps] fabric-manager / MPS / IMEX from nvidia-ctk are expected on +[install-container-deps] non-server hosts — those are optional features the spec +[install-container-deps] gracefully omits when not present.) + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=podman gpu=amd === +[install-container-deps] distro=fedora, gpu=amd, engine=podman ++ sudo dnf install -y podman podman-compose rocminfo ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=intel === +[install-container-deps] distro=fedora, gpu=intel, engine=podman ++ sudo dnf install -y podman podman-compose ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=cpu === +[install-container-deps] distro=fedora, gpu=cpu, engine=podman ++ sudo dnf install -y podman podman-compose + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu cpu + +=== engine=docker gpu=nvidia === +[install-container-deps] distro=fedora, gpu=nvidia, engine=docker ++ sudo dnf install -y docker-compose-plugin docker-buildx-plugin +[install-container-deps] Adding NVIDIA's container-toolkit dnf repo to /etc/yum.repos.d/. 
++ write /etc/yum.repos.d/nvidia-container-toolkit.repo (from https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo) ++ sudo dnf install -y nvidia-container-toolkit ++ sudo systemctl enable --now docker.service ++ sudo nvidia-ctk runtime configure --runtime=docker ++ sudo systemctl restart docker +[install-container-deps] Configured docker NVIDIA runtime + restarted dockerd. + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=docker gpu=amd === +[install-container-deps] distro=fedora, gpu=amd, engine=docker ++ sudo dnf install -y docker-compose-plugin docker-buildx-plugin rocminfo ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=intel === +[install-container-deps] distro=fedora, gpu=intel, engine=docker ++ sudo dnf install -y docker-compose-plugin docker-buildx-plugin ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=cpu === +[install-container-deps] distro=fedora, gpu=cpu, engine=docker ++ sudo dnf install -y docker-compose-plugin docker-buildx-plugin ++ sudo systemctl enable --now docker.service + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu cpu + diff --git a/scripts/test/install-container-deps/run.sh b/scripts/test/install-container-deps/run.sh new file mode 100755 index 0000000..eee753a --- /dev/null +++ b/scripts/test/install-container-deps/run.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# +# run.sh — verify install-container-deps.sh's --dry-run output matches +# checked-in fixtures across (distro × engine × gpu) combinations. +# +# Each distro's full (engine × gpu) matrix runs inside a single +# arch/ubuntu/fedora container, so the cost is three image pulls + three +# container startups regardless of how many tuples the matrix expands to. +# +# Usage: +# scripts/test/install-container-deps/run.sh # diff mode (CI default) +# scripts/test/install-container-deps/run.sh --update # regenerate fixtures +# +# Honours $XCHPLOT2_CONTAINER_RUNTIME (podman|docker); auto-detects +# otherwise, preferring podman. + +set -euo pipefail + +# Derive ROOT from this script's own path so the harness works no +# matter what CWD it runs from. The previous `git rev-parse` form +# resolved against the *outer* CWD, so running this script from +# another repo's directory wrote fixtures into the wrong tree. 
+ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd) +FIXTURE_DIR="$ROOT/scripts/test/install-container-deps" + +UPDATE=0 +[[ "${1:-}" == --update ]] && UPDATE=1 + +if [[ -n "${XCHPLOT2_CONTAINER_RUNTIME:-}" ]]; then + RUNTIME="$XCHPLOT2_CONTAINER_RUNTIME" +elif command -v podman >/dev/null; then + RUNTIME=podman +elif command -v docker >/dev/null; then + RUNTIME=docker +else + echo "run.sh: neither podman nor docker on PATH" >&2 + exit 1 +fi + +declare -A IMAGES=( + [arch]=docker.io/archlinux:latest + [ubuntu]=docker.io/ubuntu:24.04 + [fedora]=docker.io/fedora:40 +) + +# `XCHPLOT2_DRY_DISTRO_FILTER=arch` runs only one distro — handy when +# regenerating a single fixture without re-pulling all three images. +FILTER="${XCHPLOT2_DRY_DISTRO_FILTER:-}" + +failed=0 +for distro in arch ubuntu fedora; do + [[ -z "$FILTER" || "$FILTER" == "$distro" ]] || continue + + img="${IMAGES[$distro]}" + fixture="$FIXTURE_DIR/$distro.txt" + tmp=$(mktemp) + # shellcheck disable=SC2064 # intentional early expansion + trap "rm -f '$tmp'" EXIT + + # All (engine × gpu) combos for this distro run in one container. + # Each combo gets a `=== engine=X gpu=Y ===` header so the fixture + # diffs cleanly when one tuple drifts. + # shellcheck disable=SC2016 # $engine/$gpu intentionally evaluated inside the container shell + "$RUNTIME" run --rm -v "$ROOT/scripts:/s:ro" "$img" bash -c ' + for engine in podman docker; do + for gpu in nvidia amd intel cpu; do + printf "=== engine=%s gpu=%s ===\n" "$engine" "$gpu" + /s/install-container-deps.sh --dry-run \ + --engine "$engine" --gpu "$gpu" 2>&1 \ + || printf "[exit=%d]\n" $? + printf "\n" + done + done + ' > "$tmp" + + if (( UPDATE )); then + cp "$tmp" "$fixture" + echo "updated: $fixture" + elif ! diff -u "$fixture" "$tmp"; then + echo "::error::fixture mismatch for distro=$distro" + failed=1 + else + echo "ok: $distro" + fi +done + +exit $failed diff --git a/scripts/test/install-container-deps/ubuntu.txt b/scripts/test/install-container-deps/ubuntu.txt new file mode 100644 index 0000000..c4666a4 --- /dev/null +++ b/scripts/test/install-container-deps/ubuntu.txt @@ -0,0 +1,136 @@ +=== engine=podman gpu=nvidia === +[install-container-deps] distro=ubuntu, gpu=nvidia, engine=podman ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends podman podman-compose nvidia-utils- +[install-container-deps] Adding NVIDIA's container-toolkit apt repo to /etc/apt/sources.list.d/. ++ sudo install -m 0755 -d /usr/share/keyrings ++ write /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg (gpg --dearmor from https://nvidia.github.io/libnvidia-container/gpgkey) ++ write /etc/apt/sources.list.d/nvidia-container-toolkit.list (from https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list) ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends nvidia-container-toolkit ++ sudo install -m 0755 -d /etc/cdi ++ sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml +[install-container-deps] Generated CDI spec at /etc/cdi/nvidia.yaml. +[install-container-deps] (WARNings about libnvidia-vulkan-producer / X11 configs / +[install-container-deps] fabric-manager / MPS / IMEX from nvidia-ctk are expected on +[install-container-deps] non-server hosts — those are optional features the spec +[install-container-deps] gracefully omits when not present.) + +[install-container-deps] Done. 
+ Build the image: + ./scripts/build-container.sh --engine podman --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. + +=== engine=podman gpu=amd === +[install-container-deps] distro=ubuntu, gpu=amd, engine=podman ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends podman podman-compose rocminfo ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=intel === +[install-container-deps] distro=ubuntu, gpu=intel, engine=podman ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends podman podman-compose ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=podman gpu=cpu === +[install-container-deps] distro=ubuntu, gpu=cpu, engine=podman ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends podman podman-compose + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine podman --gpu cpu + +=== engine=docker gpu=nvidia === +[install-container-deps] distro=ubuntu, gpu=nvidia, engine=docker ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends docker.io docker-buildx nvidia-utils- ++ sudo apt-get install -y --no-install-recommends docker-compose-v2 +[install-container-deps] Adding NVIDIA's container-toolkit apt repo to /etc/apt/sources.list.d/. ++ sudo install -m 0755 -d /usr/share/keyrings ++ write /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg (gpg --dearmor from https://nvidia.github.io/libnvidia-container/gpgkey) ++ write /etc/apt/sources.list.d/nvidia-container-toolkit.list (from https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list) ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends nvidia-container-toolkit ++ sudo systemctl enable --now docker.service ++ sudo nvidia-ctk runtime configure --runtime=docker ++ sudo systemctl restart docker +[install-container-deps] Configured docker NVIDIA runtime + restarted dockerd. + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu nvidia + After future NVIDIA driver upgrades, re-run this script (or + re-run nvidia-ctk cdi generate / nvidia-ctk runtime configure + manually) so the CDI spec / docker runtime hook stays current. 
+ +=== engine=docker gpu=amd === +[install-container-deps] distro=ubuntu, gpu=amd, engine=docker ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends docker.io docker-buildx rocminfo ++ sudo apt-get install -y --no-install-recommends docker-compose-v2 ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu amd + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=intel === +[install-container-deps] distro=ubuntu, gpu=intel, engine=docker ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends docker.io docker-buildx ++ sudo apt-get install -y --no-install-recommends docker-compose-v2 ++ sudo systemctl enable --now docker.service ++ sudo usermod -aG video +[install-container-deps] Added to group video (re-login to apply). ++ sudo usermod -aG render +[install-container-deps] Added to group render (re-login to apply). + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu intel + If this run added you to the video / render groups, log out + and back in before running plots — group changes only take + effect for fresh login sessions. + +=== engine=docker gpu=cpu === +[install-container-deps] distro=ubuntu, gpu=cpu, engine=docker ++ sudo apt-get update ++ sudo apt-get install -y --no-install-recommends docker.io docker-buildx ++ sudo apt-get install -y --no-install-recommends docker-compose-v2 ++ sudo systemctl enable --now docker.service + +[install-container-deps] Done. + Build the image: + ./scripts/build-container.sh --engine docker --gpu cpu + diff --git a/src/gpu/AesGpu.cu b/src/gpu/AesGpu.cu index 88625a9..37297c8 100644 --- a/src/gpu/AesGpu.cu +++ b/src/gpu/AesGpu.cu @@ -1,8 +1,9 @@ -// AesGpu.cu — T-table initialisation. Tables are computed on the host -// (small, deterministic) and copied to constant memory. +// AesGpu.cu — T-table initialisation. Tables are computed at compile +// time in AesTables.inl (shared with the SYCL backend) and copied here +// into __constant__ memory for the CUDA path. #include "gpu/AesGpu.cuh" -#include +#include "gpu/AesTables.inl" namespace pos2gpu { @@ -11,70 +12,12 @@ __device__ __constant__ uint32_t kAesT1[256]; __device__ __constant__ uint32_t kAesT2[256]; __device__ __constant__ uint32_t kAesT3[256]; -namespace { - -// Rijndael S-box. 
-constexpr uint8_t kSBox[256] = { - 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76, - 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0, - 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15, - 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75, - 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84, - 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf, - 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8, - 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2, - 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73, - 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb, - 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79, - 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08, - 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a, - 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e, - 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf, - 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 -}; - -// xtime() — multiplication by x (i.e. 0x02) in GF(2^8) with the AES polynomial. -constexpr uint8_t xtime(uint8_t x) { - return static_cast((x << 1) ^ ((x & 0x80) ? 0x1B : 0)); -} - -// MixColumns row [02 03 01 01]. T0[a] = (2·S[a], 1·S[a], 1·S[a], 3·S[a]) -// little-endian bytes are: byte0=2S, byte1=S, byte2=S, byte3=3S. -constexpr uint32_t te_word(uint8_t a, int rotate) -{ - uint8_t s = kSBox[a]; - uint8_t s2 = xtime(s); - uint8_t s3 = static_cast(s2 ^ s); - uint8_t b[4] = { s2, s, s, s3 }; - uint32_t v = 0; - for (int i = 0; i < 4; ++i) { - v |= uint32_t(b[(i + rotate) & 3]) << (8 * i); - } - return v; -} - -constexpr std::array build_table(int rotate) -{ - std::array t{}; - for (int i = 0; i < 256; ++i) { - t[i] = te_word(static_cast(i), rotate); - } - return t; -} - -constexpr auto T0 = build_table(0); -constexpr auto T1 = build_table(3); -constexpr auto T2 = build_table(2); -constexpr auto T3 = build_table(1); - -} // namespace - void initialize_aes_tables() { - cudaMemcpyToSymbol(kAesT0, T0.data(), sizeof(uint32_t) * 256); - cudaMemcpyToSymbol(kAesT1, T1.data(), sizeof(uint32_t) * 256); - cudaMemcpyToSymbol(kAesT2, T2.data(), sizeof(uint32_t) * 256); - cudaMemcpyToSymbol(kAesT3, T3.data(), sizeof(uint32_t) * 256); + cudaMemcpyToSymbol(kAesT0, aes_tables::T0.data(), sizeof(uint32_t) * 256); + cudaMemcpyToSymbol(kAesT1, aes_tables::T1.data(), sizeof(uint32_t) * 256); + cudaMemcpyToSymbol(kAesT2, aes_tables::T2.data(), sizeof(uint32_t) * 256); + cudaMemcpyToSymbol(kAesT3, aes_tables::T3.data(), sizeof(uint32_t) * 256); } } // namespace pos2gpu diff --git a/src/gpu/AesGpu.cuh b/src/gpu/AesGpu.cuh index 46a566f..42cf2d7 100644 --- a/src/gpu/AesGpu.cuh +++ b/src/gpu/AesGpu.cuh @@ -20,26 +20,44 @@ // // Cross-check against pos2-chip/src/pos/aes/intrin_portable.h which // defines `rx_aesenc_vec_i128 _mm_aesenc_si128`. +// +// Backend portability: +// +// The SYCL path (compiled by acpp/clang in non-CUDA mode) cannot see +// __constant__ memory, threadIdx, or __device__ markup. 
The pieces it +// needs — aesenc_round_smem, set_int_vec_i128, load_state_le, and the +// AesState struct itself — are decorated with the portable macros from +// PortableAttrs.hpp and stay outside the __CUDACC__ gate. The constant- +// memory T-tables, the aesenc_round variant that reads them, and +// load_aes_tables_smem (uses threadIdx) are CUDA-only. #pragma once -#include +#include "gpu/PortableAttrs.hpp" + #include +#if defined(__CUDACC__) + #include +#endif + namespace pos2gpu { -// AES S-box (Rijndael forward S-box). +#if defined(__CUDACC__) +// AES T-tables in constant memory. Defined in AesGpu.cu, populated by +// initialize_aes_tables() at startup. __device__ __constant__ extern uint32_t kAesT0[256]; __device__ __constant__ extern uint32_t kAesT1[256]; __device__ __constant__ extern uint32_t kAesT2[256]; __device__ __constant__ extern uint32_t kAesT3[256]; +#endif struct AesState { uint32_t w[4]; }; // Load 16 bytes (little-endian) into an AesState. -__host__ __device__ inline AesState load_state_le(uint8_t const* bytes) +POS2_HOST_DEVICE_INLINE AesState load_state_le(uint8_t const* bytes) { AesState s; #pragma unroll @@ -52,12 +70,11 @@ __host__ __device__ inline AesState load_state_le(uint8_t const* bytes) return s; } -// One AES round equivalent to _mm_aesenc_si128(state, key). -// Implemented with T-tables. ShiftRows is folded into the byte-extraction -// indices, then SubBytes+MixColumns is the table lookup. -// -// AESENC operates per-column. For column c (0..3), the output column is: -// T0[s[c, 0]] ^ T1[s[(c+1) mod 4, 1]] ^ T2[s[(c+2) mod 4, 2]] ^ T3[s[(c+3) mod 4, 3]] ^ key[c] +#if defined(__CUDACC__) +// One AES round equivalent to _mm_aesenc_si128(state, key), reading the +// T-tables from constant memory. CUDA-only because __constant__ has no +// SYCL equivalent — the SYCL path uses aesenc_round_smem with tables +// preloaded into local memory. __device__ __forceinline__ AesState aesenc_round(AesState s, AesState const& key) { auto byte = [](uint32_t w, int n) -> uint32_t { @@ -75,10 +92,11 @@ __device__ __forceinline__ AesState aesenc_round(AesState s, AesState const& key } return out; } +#endif // Convenience: load an i128 from four little-endian 32-bit ints, matching // rx_set_int_vec_i128(i3, i2, i1, i0). -__host__ __device__ inline AesState set_int_vec_i128(int32_t i3, int32_t i2, int32_t i1, int32_t i0) +POS2_HOST_DEVICE_INLINE AesState set_int_vec_i128(int32_t i3, int32_t i2, int32_t i1, int32_t i0) { AesState s; s.w[0] = static_cast(i0); @@ -90,6 +108,7 @@ __host__ __device__ inline AesState set_int_vec_i128(int32_t i3, int32_t i2, int // Initialize the constant-memory T-tables on first use. Must be called once // per program from host code before any kernel that touches AesGpu runs. +// Implemented in AesGpu.cu (CUDA TU only). void initialize_aes_tables(); // ========================================================================= @@ -106,8 +125,14 @@ void initialize_aes_tables(); // __syncthreads(); // AesState state = ...; // state = aesenc_round_smem(state, round_key, sT); +// +// The SYCL path uses the same aesenc_round_smem (pointer-based, fully +// portable) but provides its own loader — local_accessor + nd_item barrier +// in place of __shared__ + __syncthreads — and supplies the table data +// from a USM buffer initialised from AesTables.inl on the host side. 
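+//
+// Illustrative only: a rough sketch of what that SYCL-side setup could
+// look like (aes_tables_device(q) is the USM table buffer mentioned in
+// AesStub.cpp / SyclBackend.hpp; `ndr`, `key` and the surrounding kernel
+// are placeholders, not code from this repo):
+//
+//   uint32_t const* gT = aes_tables_device(q);     // T0..T3, 4*256 words
+//   q.submit([&](sycl::handler& h) {
+//       sycl::local_accessor<uint32_t, 1> sT(4 * 256, h);
+//       h.parallel_for(ndr, [=](sycl::nd_item<1> it) {
+//           for (size_t i = it.get_local_id(0); i < 4 * 256;
+//                i += it.get_local_range(0))
+//               sT[i] = gT[i];
+//           sycl::group_barrier(it.get_group());
+//           AesState s = load_state_le(/* ... */);
+//           s = aesenc_round_smem(s, key, &sT[0]);
+//       });
+//   });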
// ========================================================================= +#if defined(__CUDACC__) __device__ __forceinline__ void load_aes_tables_smem(uint32_t* sT) { // sT layout: [T0|T1|T2|T3], 256 entries each (4096 entries total). @@ -121,8 +146,9 @@ __device__ __forceinline__ void load_aes_tables_smem(uint32_t* sT) sT[3 * 256 + i] = kAesT3[i]; } } +#endif -__device__ __forceinline__ AesState aesenc_round_smem( +POS2_DEVICE_INLINE AesState aesenc_round_smem( AesState s, AesState const& key, uint32_t const* __restrict__ sT) { auto byte = [](uint32_t w, int n) -> uint32_t { diff --git a/src/gpu/AesHashBsSycl.hpp b/src/gpu/AesHashBsSycl.hpp new file mode 100644 index 0000000..e1176ea --- /dev/null +++ b/src/gpu/AesHashBsSycl.hpp @@ -0,0 +1,376 @@ +// AesHashBsSycl.hpp — sub_group-cooperative bit-sliced AES hash for SYCL. +// +// Cross-reference: +// src/gpu/AesGpuBitsliced.cuh (CUDA original, 32-lane warp-coop) +// src/gpu/AesHashGpu.cuh (CUDA T-table API; _smem family) +// src/gpu/AesSBoxBP.cuh (Boyar-Peralta S-box circuit, shared) +// +// Exports sub_group-cooperative equivalents of g_x_smem / pairing_smem / +// matching_target_smem. Each kernel thread holds one state; 32 threads in +// a sub_group cooperate on 32 parallel AES computations, using only bit +// ops + sub_group shuffles — no T-table LDS lookups, which is what makes +// the bitsliced path win on amdgcn under AdaptiveCpp's HIP backend. +// +// Preconditions for callers: +// - Kernel MUST be launched with reqd_sub_group_size(32) (wave32 on +// RDNA2, warp32 on NVIDIA; both native). The shuffle/ballot math is +// hard-coded for 32 lanes. +// - ALL 32 lanes of the sub_group must participate in every call. +// Lanes with no real work should pass dummy inputs, do the call, +// then return afterwards. + +#pragma once + +#include "gpu/AesGpu.cuh" +#include "gpu/AesHashGpu.cuh" +#include "gpu/AesSBoxBP.cuh" + +#include + +#include + +namespace pos2gpu { + +// ---------- low-level sub_group primitives ---------- + +inline uint32_t bs_shfl(sycl::sub_group const& sg, uint32_t x, int lane) +{ + return sycl::select_from_group(sg, x, lane); +} + +// Ballot: 32 lanes each contribute one bit, collected into a single +// uint32 mask (bit l of the result == lane l's predicate). +// +// Fast path on AdaptiveCpp's HIP target: __builtin_amdgcn_ballot_w32 +// lowers to a single v_cmp + s_mov on RDNA2/3 — one native amdgcn +// instruction instead of the log-n reduction the portable fallback +// compiles to. This is the critical piece for bitsliced AES to win +// on amdgcn: bs32_pack calls ballot 128× per hash, so a 5× speedup +// per call is the difference between a +23 % regression (the first +// attempt with reduce_over_group) and a net win. +// +// Dispatch MUST go through AdaptiveCpp's __acpp_if_target_hip(stmts) +// macro, not a raw `#if defined(__HIP_DEVICE_COMPILE__)`. AdaptiveCpp +// compiles each kernel body for every backend target it's configured +// for (including the OMP host-CPU fallback), so on the OMP pass the +// preprocessor branch is chosen per-TU but the kernel body is also +// evaluated as a __host__ function — clang then rejects the +// __device__-only `__builtin_amdgcn_ballot_w32` with "reference to +// __device__ function in __host__ function" even though the #if +// would have eliminated it on the non-HIP backend. __acpp_if_target_hip +// expands to `stmts` during the HIP device code-gen pass only, and +// to nothing on all other passes — so the intrinsic truly never +// appears in a __host__ context. 
+// +// Wave-size caveat: we hard-code _w32 because gfx1031 (RDNA2) is +// wave32 and the entire bitsliced scheme is wave32-only (reqd_sub_ +// group_size(32) on the kernels, 32-way pack/unpack layout). Using +// _w64 on a wave32 target miscompiles — LLVM issue #62477. +// +// Recipe source: AdaptiveCpp doc/hip-source-interop.md. +inline uint32_t bs_ballot(sycl::sub_group const& sg, bool pred) +{ + __acpp_if_target_hip( + return static_cast(__builtin_amdgcn_ballot_w32(pred)); + ); + // Portable fallback — reachable on every non-HIP target (OMP host, + // CUDA, Intel Level Zero, SSCP). The HIP device pass early-returns + // above so this branch is dead on amdgcn. + uint32_t lane = sg.get_local_linear_id(); + uint32_t bit = pred ? (1u << lane) : 0u; + return sycl::reduce_over_group(sg, bit, sycl::bit_or{}); +} + +// ---------- 32-way pack / unpack ---------- +// +// Bit-plane layout matches AesGpuBitsliced.cuh: +// plane p (0..127) has bit l = bit p of lane l's scalar state. +// thread t owns planes { 4t, 4t+1, 4t+2, 4t+3 }. + +inline void bs32_pack(sycl::sub_group const& sg, + AesState const& my, uint32_t out[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + for (int p = 0; p < 128; ++p) { + int byte_idx = p >> 3; + int bit_in_byte = p & 7; + int word_idx = byte_idx >> 2; + int byte_in_w = byte_idx & 3; + uint32_t bit = (my.w[word_idx] >> (8 * byte_in_w + bit_in_byte)) & 1u; + uint32_t plane = bs_ballot(sg, bit != 0u); + if (lane == uint32_t(p >> 2)) { + out[p & 3] = plane; + } + } +} + +inline void bs32_unpack(sycl::sub_group const& sg, + uint32_t const in[4], AesState& my) +{ + uint32_t lane = sg.get_local_linear_id(); + my.w[0] = my.w[1] = my.w[2] = my.w[3] = 0u; + for (int p = 0; p < 128; ++p) { + int owner = p >> 2; + int slot = p & 3; + uint32_t plane = bs_shfl(sg, in[slot], owner); + uint32_t bit = (plane >> lane) & 1u; + int byte_idx = p >> 3; + int bit_in_byte = p & 7; + int word_idx = byte_idx >> 2; + int byte_in_w = byte_idx & 3; + my.w[word_idx] |= bit << (8 * byte_in_w + bit_in_byte); + } +} + +// ---------- round key materialisation ---------- +// +// All 32 states share the same key, so each bit-plane of a bit-sliced +// key is either all-ones or all-zeros. No cross-lane communication. + +inline void make_bs32_round_key(sycl::sub_group const& sg, + AesState const& key, uint32_t key_bs[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + #pragma unroll + for (int i = 0; i < 4; ++i) { + int p = 4 * int(lane) + i; + int byte_idx = p >> 3; + int bit_in_byte = p & 7; + int word_idx = byte_idx >> 2; + int byte_in_w = byte_idx & 3; + uint32_t bit = (key.w[word_idx] >> (8 * byte_in_w + bit_in_byte)) & 1u; + key_bs[i] = bit ? 0xFFFFFFFFu : 0u; + } +} + +inline void add_round_key_bs32(uint32_t bs[4], uint32_t const key_bs[4]) +{ + bs[0] ^= key_bs[0]; bs[1] ^= key_bs[1]; + bs[2] ^= key_bs[2]; bs[3] ^= key_bs[3]; +} + +// ---------- ShiftRows ---------- +// +// Each lane fetches its own output byte from a single source lane. The +// permutation preserves bit-within-byte index, so one shuffle per plane. 
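+//
+// Worked example (just tracing the index math below, no extra logic):
+// lane 6 has is_hi=0 and b=3, so it holds the low-nibble planes of
+// state byte 3 (column 0, row 3). ShiftRows moves row 3 left by 3, so
+// that byte comes from column (0+3)&3 = 3, i.e. b_old = 3*4+3 = 15,
+// whose low-nibble planes live on lane owner = 2*15+0 = 30. Lane 6
+// therefore shuffles each of its four planes from lane 30.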
+ +inline void shift_rows_bs32(sycl::sub_group const& sg, uint32_t bs[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + int is_hi = int(lane) & 1; + int b = int(lane) >> 1; + int c = b >> 2; + int r = b & 3; + int b_old = ((c + r) & 3) * 4 + r; + int owner = 2 * b_old + is_hi; + uint32_t n0 = bs_shfl(sg, bs[0], owner); + uint32_t n1 = bs_shfl(sg, bs[1], owner); + uint32_t n2 = bs_shfl(sg, bs[2], owner); + uint32_t n3 = bs_shfl(sg, bs[3], owner); + bs[0] = n0; bs[1] = n1; bs[2] = n2; bs[3] = n3; +} + +// ---------- MixColumns ---------- +// +// See AesGpuBitsliced.cuh for the algebraic derivation. 14 shuffles per +// lane (12 same-half column mates + 2 cross-half boundary bits). + +inline void mix_columns_bs32(sycl::sub_group const& sg, uint32_t bs[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + int is_hi = int(lane) & 1; + int b = int(lane) >> 1; + int c = b >> 2; + int r = b & 3; + int partner = int(lane) ^ 1; + int col_base = 8 * c; + int r1 = (r + 1) & 3; + int r2 = (r + 2) & 3; + int r3 = (r + 3) & 3; + int L1 = col_base + 2 * r1 + is_hi; + int L2 = col_base + 2 * r2 + is_hi; + int L3 = col_base + 2 * r3 + is_hi; + int L1_other = col_base + 2 * r1 + (is_hi ^ 1); + + uint32_t r1_0 = bs_shfl(sg, bs[0], L1); + uint32_t r1_1 = bs_shfl(sg, bs[1], L1); + uint32_t r1_2 = bs_shfl(sg, bs[2], L1); + uint32_t r1_3 = bs_shfl(sg, bs[3], L1); + uint32_t r2_0 = bs_shfl(sg, bs[0], L2); + uint32_t r2_1 = bs_shfl(sg, bs[1], L2); + uint32_t r2_2 = bs_shfl(sg, bs[2], L2); + uint32_t r2_3 = bs_shfl(sg, bs[3], L2); + uint32_t r3_0 = bs_shfl(sg, bs[0], L3); + uint32_t r3_1 = bs_shfl(sg, bs[1], L3); + uint32_t r3_2 = bs_shfl(sg, bs[2], L3); + uint32_t r3_3 = bs_shfl(sg, bs[3], L3); + + uint32_t t_0 = bs[0] ^ r1_0; + uint32_t t_1 = bs[1] ^ r1_1; + uint32_t t_2 = bs[2] ^ r1_2; + uint32_t t_3 = bs[3] ^ r1_3; + + uint32_t t_boundary = bs_shfl(sg, bs[3], partner) + ^ bs_shfl(sg, bs[3], L1_other); + + uint32_t xt_0, xt_1, xt_2, xt_3; + if (is_hi) { + xt_0 = t_boundary ^ t_3; + xt_1 = t_0; + xt_2 = t_1; + xt_3 = t_2; + } else { + xt_0 = t_boundary; + xt_1 = t_0 ^ t_boundary; + xt_2 = t_1; + xt_3 = t_2 ^ t_boundary; + } + + bs[0] = xt_0 ^ r1_0 ^ r2_0 ^ r3_0; + bs[1] = xt_1 ^ r1_1 ^ r2_1 ^ r3_1; + bs[2] = xt_2 ^ r1_2 ^ r2_2 ^ r3_2; + bs[3] = xt_3 ^ r1_3 ^ r2_3 ^ r3_3; +} + +// ---------- SubBytes via Boyar-Peralta bitsliced S-box ---------- +// +// Threads 2b and 2b+1 cooperate on byte b: they swap their four planes +// once, run the 113-gate BP circuit redundantly, then keep the four +// outputs for their own half of the byte. 
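+//
+// Concretely (reading the swap below): for byte b, the odd lane's
+// bs[3..0] are bit-planes 7..4 of the byte and map to U0..U3, while the
+// even lane's bs[3..0] are bit-planes 3..0 and map to U4..U7. Both lanes
+// evaluate the identical circuit; the odd lane keeps S0..S3, the even
+// lane keeps S4..S7. One redundant circuit evaluation per byte is the
+// price for avoiding a second shuffle pass to redistribute the outputs.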
+ +inline void sub_bytes_bs32(sycl::sub_group const& sg, uint32_t bs[4]) +{ + uint32_t lane = sg.get_local_linear_id(); + int is_hi = int(lane) & 1; + int partner = int(lane) ^ 1; + + uint32_t peer0 = bs_shfl(sg, bs[0], partner); + uint32_t peer1 = bs_shfl(sg, bs[1], partner); + uint32_t peer2 = bs_shfl(sg, bs[2], partner); + uint32_t peer3 = bs_shfl(sg, bs[3], partner); + + uint32_t U0, U1, U2, U3, U4, U5, U6, U7; + if (is_hi) { + U0 = bs[3]; U1 = bs[2]; U2 = bs[1]; U3 = bs[0]; + U4 = peer3; U5 = peer2; U6 = peer1; U7 = peer0; + } else { + U0 = peer3; U1 = peer2; U2 = peer1; U3 = peer0; + U4 = bs[3]; U5 = bs[2]; U6 = bs[1]; U7 = bs[0]; + } + + uint32_t S0, S1, S2, S3, S4, S5, S6, S7; + bp_sbox_circuit(U0, U1, U2, U3, U4, U5, U6, U7, + S0, S1, S2, S3, S4, S5, S6, S7, + 0xFFFFFFFFu); + + if (is_hi) { + bs[3] = S0; bs[2] = S1; bs[1] = S2; bs[0] = S3; + } else { + bs[3] = S4; bs[2] = S5; bs[1] = S6; bs[0] = S7; + } +} + +// ---------- full round + round loop ---------- + +inline void aesenc_round_bs32(sycl::sub_group const& sg, + uint32_t bs[4], uint32_t const key_bs[4]) +{ + shift_rows_bs32(sg, bs); + sub_bytes_bs32(sg, bs); + mix_columns_bs32(sg, bs); + add_round_key_bs32(bs, key_bs); +} + +inline void run_rounds_bs32(sycl::sub_group const& sg, + uint32_t bs[4], + uint32_t const k1_bs[4], + uint32_t const k2_bs[4], + int rounds) +{ + #pragma unroll 2 + for (int r = 0; r < rounds; ++r) { + aesenc_round_bs32(sg, bs, k1_bs); + aesenc_round_bs32(sg, bs, k2_bs); + } +} + +// ---------- high-level wrappers matching AesHashGpu.cuh ---------- +// +// Each wrapper must be called uniformly across the sub_group. The return +// value is per-lane (this lane's result); callers collect per-lane values +// into their own output buffers as usual. + +// g_x_bs32 — bitsliced equivalent of g_x_smem(keys, x, k). Each lane +// contributes its own `x`, returns bottom k bits of state.w[0] for this +// lane's x. +inline uint32_t g_x_bs32(sycl::sub_group const& sg, + AesHashKeys const& keys, uint32_t x, int k, + int rounds = kAesGRounds) +{ + AesState in = set_int_vec_i128(0, 0, 0, static_cast(x)); + uint32_t bs[4], k1_bs[4], k2_bs[4]; + bs32_pack(sg, in, bs); + make_bs32_round_key(sg, keys.round_key_1, k1_bs); + make_bs32_round_key(sg, keys.round_key_2, k2_bs); + run_rounds_bs32(sg, bs, k1_bs, k2_bs, rounds); + AesState out; + bs32_unpack(sg, bs, out); + return out.w[0] & ((1u << k) - 1u); +} + +// matching_target_bs32 — bitsliced equivalent of matching_target_smem. +// (table_id, match_key) are typically sub_group-uniform in the match +// kernels; only `meta` varies per lane. That's fine — bitslicing doesn't +// require per-lane inputs to differ. +inline uint32_t matching_target_bs32(sycl::sub_group const& sg, + AesHashKeys const& keys, + uint32_t table_id, uint32_t match_key, + uint64_t meta, + int extra_rounds_bits = 0) +{ + int32_t i0 = static_cast(table_id); + int32_t i1 = static_cast(match_key); + int32_t i2 = static_cast(meta & 0xFFFFFFFFu); + int32_t i3 = static_cast((meta >> 32) & 0xFFFFFFFFu); + AesState in = set_int_vec_i128(i3, i2, i1, i0); + uint32_t bs[4], k1_bs[4], k2_bs[4]; + bs32_pack(sg, in, bs); + make_bs32_round_key(sg, keys.round_key_1, k1_bs); + make_bs32_round_key(sg, keys.round_key_2, k2_bs); + int rounds = kAesMatchingTargetRounds << extra_rounds_bits; + run_rounds_bs32(sg, bs, k1_bs, k2_bs, rounds); + AesState out; + bs32_unpack(sg, bs, out); + return out.w[0]; +} + +// pairing_bs32 — bitsliced equivalent of pairing_smem. 
Kept for +// completeness / future use; the current match kernels keep the inner +// loop on T-table pairing because the inner trip count is data-dependent +// (per-lane window size varies), which is awkward to bit-slice without +// a batch-collect prepass. +inline Result128 pairing_bs32(sycl::sub_group const& sg, + AesHashKeys const& keys, + uint64_t meta_l, uint64_t meta_r, + int extra_rounds_bits = 0) +{ + int32_t i0 = static_cast(meta_l & 0xFFFFFFFFu); + int32_t i1 = static_cast((meta_l >> 32) & 0xFFFFFFFFu); + int32_t i2 = static_cast(meta_r & 0xFFFFFFFFu); + int32_t i3 = static_cast((meta_r >> 32) & 0xFFFFFFFFu); + AesState in = set_int_vec_i128(i3, i2, i1, i0); + uint32_t bs[4], k1_bs[4], k2_bs[4]; + bs32_pack(sg, in, bs); + make_bs32_round_key(sg, keys.round_key_1, k1_bs); + make_bs32_round_key(sg, keys.round_key_2, k2_bs); + int rounds = kAesPairingRounds << extra_rounds_bits; + run_rounds_bs32(sg, bs, k1_bs, k2_bs, rounds); + AesState out; + bs32_unpack(sg, bs, out); + Result128 r{}; + r.r[0] = out.w[0]; r.r[1] = out.w[1]; + r.r[2] = out.w[2]; r.r[3] = out.w[3]; + return r; +} + +} // namespace pos2gpu diff --git a/src/gpu/AesHashGpu.cuh b/src/gpu/AesHashGpu.cuh index 29aa895..36453ff 100644 --- a/src/gpu/AesHashGpu.cuh +++ b/src/gpu/AesHashGpu.cuh @@ -8,10 +8,21 @@ // The CPU code uses 16 alternating rounds (round_key_1, round_key_2). We // keep the same round count constants here so a single binary can be a // drop-in for the CPU code. +// +// Backend portability: +// +// The `_smem` family (run_rounds_smem, g_x_smem, pairing_smem, +// matching_target_smem, chain_smem) is fully pointer-driven (table +// pointer passed as an argument) and decorated with portable macros, so +// it compiles under both nvcc and acpp/clang. The non-smem family reads +// the constant-memory T-tables directly via aesenc_round and is +// therefore CUDA-only. #pragma once #include "gpu/AesGpu.cuh" +#include "gpu/PortableAttrs.hpp" + #include namespace pos2gpu { @@ -28,7 +39,7 @@ struct AesHashKeys { // Build the two round keys from a 32-byte plot_id, matching // load_plot_id_as_aes_key in AesHash.hpp. -__host__ __device__ inline AesHashKeys make_keys(uint8_t const* plot_id_bytes) +POS2_HOST_DEVICE inline AesHashKeys make_keys(uint8_t const* plot_id_bytes) { AesHashKeys k; k.round_key_1 = load_state_le(plot_id_bytes + 0); @@ -36,8 +47,10 @@ __host__ __device__ inline AesHashKeys make_keys(uint8_t const* plot_id_bytes) return k; } +#if defined(__CUDACC__) // One full alternating round-pair. The CPU loop is: // for r in 0..Rounds: state = aesenc(state, k1); state = aesenc(state, k2); +// CUDA-only: calls aesenc_round which reads constant-memory T-tables. __device__ __forceinline__ AesState run_rounds(AesState state, AesHashKeys const& keys, int rounds) { #pragma unroll 2 @@ -56,12 +69,14 @@ __device__ __forceinline__ uint32_t g_x(AesHashKeys const& keys, uint32_t x, int s = run_rounds(s, keys, rounds); return s.w[0] & ((1u << k) - 1u); } +#endif // pairing: load (meta_l_lo, meta_l_hi, meta_r_lo, meta_r_hi) into i0..i3, // run AES_PAIRING_ROUNDS << extra_rounds_bits, return all 4 u32s. // Mirrors AesHash::pairing. 
struct Result128 { uint32_t r[4]; }; +#if defined(__CUDACC__) __device__ __forceinline__ Result128 pairing( AesHashKeys const& keys, uint64_t meta_l, uint64_t meta_r, @@ -110,14 +125,17 @@ __device__ __forceinline__ uint64_t chain(AesHashKeys const& keys, uint64_t inpu s = run_rounds(s, keys, kAesChainingRounds); return uint64_t(s.w[0]) | (uint64_t(s.w[1]) << 32); } +#endif // __CUDACC__ // ========================================================================= // Shared-memory T-table variants. Use after load_aes_tables_smem(sT) + -// __syncthreads(). All four functions mirror their constant-memory peers -// above; only the inner aesenc_round call changes. +// __syncthreads() in CUDA, or after a SYCL local_accessor + barrier in +// SYCL. All five functions mirror their constant-memory peers above; +// only the inner aesenc_round_smem call (and the table pointer arg) +// differ. Fully portable — compile under both backends. // ========================================================================= -__device__ __forceinline__ AesState run_rounds_smem( +POS2_DEVICE_INLINE AesState run_rounds_smem( AesState state, AesHashKeys const& keys, int rounds, uint32_t const* __restrict__ sT) { #pragma unroll 2 @@ -128,7 +146,7 @@ __device__ __forceinline__ AesState run_rounds_smem( return state; } -__device__ __forceinline__ uint32_t g_x_smem( +POS2_DEVICE_INLINE uint32_t g_x_smem( AesHashKeys const& keys, uint32_t x, int k, uint32_t const* __restrict__ sT, int rounds = kAesGRounds) { @@ -137,7 +155,7 @@ __device__ __forceinline__ uint32_t g_x_smem( return s.w[0] & ((1u << k) - 1u); } -__device__ __forceinline__ Result128 pairing_smem( +POS2_DEVICE_INLINE Result128 pairing_smem( AesHashKeys const& keys, uint64_t meta_l, uint64_t meta_r, uint32_t const* __restrict__ sT, @@ -156,7 +174,7 @@ __device__ __forceinline__ Result128 pairing_smem( return out; } -__device__ __forceinline__ uint32_t matching_target_smem( +POS2_DEVICE_INLINE uint32_t matching_target_smem( AesHashKeys const& keys, uint32_t table_id, uint32_t match_key, uint64_t meta, uint32_t const* __restrict__ sT, @@ -172,7 +190,7 @@ __device__ __forceinline__ uint32_t matching_target_smem( return s.w[0]; } -__device__ __forceinline__ uint64_t chain_smem( +POS2_DEVICE_INLINE uint64_t chain_smem( AesHashKeys const& keys, uint64_t input, uint32_t const* __restrict__ sT) { diff --git a/src/gpu/AesSBoxBP.cuh b/src/gpu/AesSBoxBP.cuh index 6b8b57e..3a56a0c 100644 --- a/src/gpu/AesSBoxBP.cuh +++ b/src/gpu/AesSBoxBP.cuh @@ -20,12 +20,21 @@ #pragma once +#include "gpu/PortableAttrs.hpp" + #include namespace pos2gpu { +// Portable markup: POS2_HOST_DEVICE_INLINE expands to +// __host__ __device__ __forceinline__ under nvcc (CUDA TU) and to +// inline __attribute__((always_inline)) under acpp/clang (SYCL TU). +// Raw __host__ / __device__ tokens would fail to parse under +// AdaptiveCpp's SYCL-to-HIP compilation path (they're not defined +// outside nvcc/hipcc source-to-source front-ends), which would +// cascade to "no matching function" errors at every call site. 
template -__host__ __device__ __forceinline__ +POS2_HOST_DEVICE_INLINE void bp_sbox_circuit(T U0, T U1, T U2, T U3, T U4, T U5, T U6, T U7, T& S0, T& S1, T& S2, T& S3, T& S4, T& S5, T& S6, T& S7, @@ -154,7 +163,7 @@ void bp_sbox_circuit(T U0, T U1, T U2, T U3, T U4, T U5, T U6, T U7, S5 = tc21 ^ tc17; } -__host__ __device__ __forceinline__ +POS2_HOST_DEVICE_INLINE uint8_t bp_sbox(uint8_t x) { uint8_t U0 = uint8_t((x >> 7) & 1u); diff --git a/src/gpu/AesStub.cpp b/src/gpu/AesStub.cpp new file mode 100644 index 0000000..afe271a --- /dev/null +++ b/src/gpu/AesStub.cpp @@ -0,0 +1,15 @@ +// AesStub.cpp — provides the symbols defined by AesGpu.cu when the build +// excludes the CUDA AOT path (XCHPLOT2_BUILD_CUDA=OFF). The CUDA path +// uploads AES T-tables into __constant__ memory; the SYCL path keeps them +// in a USM device buffer (SyclBackend.hpp's aes_tables_device(q)) which +// is initialised lazily on first kernel call. So this stub simply makes +// initialize_aes_tables a no-op — the SYCL kernels don't depend on it. + +namespace pos2gpu { + +void initialize_aes_tables() { + // No-op on non-CUDA builds. AES T-tables are uploaded by + // SyclBackend.hpp's aes_tables_device(q) on first use. +} + +} // namespace pos2gpu diff --git a/src/gpu/AesTables.inl b/src/gpu/AesTables.inl new file mode 100644 index 0000000..c186470 --- /dev/null +++ b/src/gpu/AesTables.inl @@ -0,0 +1,70 @@ +// AesTables.inl — AES T-table values shared between the CUDA path +// (uploaded into __constant__ memory by initialize_aes_tables in +// AesGpu.cu) and the SYCL path (uploaded once into a USM device +// buffer at first use). +// +// The four tables are constexpr — built at compile time from kSBox + +// xtime via the standard 4-table T-box construction. Sourced from +// AesGpu.cu lines 17-68; behaviour unchanged. + +#pragma once + +#include +#include + +namespace pos2gpu::aes_tables { + +// Rijndael S-box. +constexpr uint8_t kSBox[256] = { + 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76, + 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0, + 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15, + 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75, + 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84, + 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf, + 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8, + 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2, + 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73, + 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb, + 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79, + 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08, + 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a, + 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e, + 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf, + 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +}; + +constexpr uint8_t xtime(uint8_t x) { + return static_cast((x << 1) ^ ((x & 0x80) ? 0x1B : 0)); +} + +// MixColumns row [02 03 01 01]. T0[a] = (2·S[a], 1·S[a], 1·S[a], 3·S[a]) +// little-endian bytes are: byte0=2S, byte1=S, byte2=S, byte3=3S. 
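+// As a quick spot check of the construction (derived from kSBox and
+// xtime above, not from an external reference): for a = 0x00, S[a] =
+// 0x63, 2*S = 0xC6 and 3*S = 0xA5, so with rotate = 0 the packed word
+// is byte0 = 0xC6, byte1 = 0x63, byte2 = 0x63, byte3 = 0xA5, i.e.
+// T0[0x00] should come out as 0xA56363C6.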
+constexpr uint32_t te_word(uint8_t a, int rotate) +{ + uint8_t s = kSBox[a]; + uint8_t s2 = xtime(s); + uint8_t s3 = static_cast(s2 ^ s); + uint8_t b[4] = { s2, s, s, s3 }; + uint32_t v = 0; + for (int i = 0; i < 4; ++i) { + v |= uint32_t(b[(i + rotate) & 3]) << (8 * i); + } + return v; +} + +constexpr std::array build_table(int rotate) +{ + std::array t{}; + for (int i = 0; i < 256; ++i) { + t[i] = te_word(static_cast(i), rotate); + } + return t; +} + +constexpr auto T0 = build_table(0); +constexpr auto T1 = build_table(3); +constexpr auto T2 = build_table(2); +constexpr auto T3 = build_table(1); + +} // namespace pos2gpu::aes_tables diff --git a/src/gpu/CudaHalfShim.hpp b/src/gpu/CudaHalfShim.hpp new file mode 100644 index 0000000..424e2ae --- /dev/null +++ b/src/gpu/CudaHalfShim.hpp @@ -0,0 +1,59 @@ +// CudaHalfShim.hpp — conditionally pulls in the CUDA Toolkit headers +// consumed by AdaptiveCpp-compatible SYCL TUs: +// - cuda_fp16.h (AdaptiveCpp's libkernel/half_representation.hpp +// references __half whenever the CUDA backend is +// in scope) +// - cuda_runtime.h (our .cuh signatures reference cudaEvent_t / +// cudaError_t for signature-only interop) +// +// On NVIDIA builds these headers are on the include path and everything +// "just works". On AMD/ROCm builds they're absent — ROCm's HIP headers +// redefine vector types like uchar1 that CUDA's headers also define, so +// pulling both in blows up with typedef redefinition errors. +// +// Uses __has_include so the CUDA Toolkit is only pulled in when actually +// available. For HIP/Intel backends we provide minimal type stubs — just +// enough for function signatures carrying cudaEvent_t / cudaError_t to +// parse. Those parameters are always nullptr / ignored on non-CUDA paths, +// so the stubs are purely compile-time bookkeeping. +// +// Define XCHPLOT2_SKIP_CUDA_FP16 or XCHPLOT2_SKIP_CUDA_RUNTIME to opt out +// of either include unconditionally (useful when CUDA headers are present +// for an unrelated reason but you want to test the stub path). + +#pragma once + +#include + +#if !defined(XCHPLOT2_SKIP_CUDA_RUNTIME) && __has_include() + #include +#else + // Opaque stubs for signature-only CUDA types. These only appear in + // launch_*_profiled parameter lists where non-CUDA callers pass nullptr. + using cudaEvent_t = void*; + using cudaError_t = int; + #ifndef cudaSuccess + #define cudaSuccess 0 + #endif + #ifndef cudaErrorInvalidValue + #define cudaErrorInvalidValue 1 + #endif +#endif + +// __half / __half2: AdaptiveCpp's libkernel/half_representation can +// reference these by name even when the codegen target is HIP, not CUDA. +// Earlier the SKIP path simply didn't include cuda_fp16.h and provided +// nothing in its place — silent on most hosts, but on at least one +// W5700 / gfx1010 / gfx1013-spoof + ROCm + AdaptiveCpp combination, the +// missing types caused JIT to emit no-op kernel stubs (every kernel +// dispatch completed cleanly with zero device-side writes). Fall back +// to ROCm's when available, then to opaque struct +// stubs as a last resort. 
+#if !defined(XCHPLOT2_SKIP_CUDA_FP16) && __has_include() + #include +#elif __has_include() + #include +#else + struct __half { uint16_t x; }; + struct __half2 { uint16_t x; uint16_t y; }; +#endif diff --git a/src/gpu/DeviceIds.hpp b/src/gpu/DeviceIds.hpp new file mode 100644 index 0000000..27ec6b0 --- /dev/null +++ b/src/gpu/DeviceIds.hpp @@ -0,0 +1,26 @@ +// DeviceIds.hpp — synthetic device-id sentinels shared between the +// CLI / BatchPlotter (host code) and SyclBackend (per-thread queue +// routing). Real GPU ids are 0..N-1; negative values are reserved +// for selectors that don't correspond to a numbered device. +// +// Lives in src/gpu/ rather than src/host/ because SyclBackend.hpp +// (which can't include host-side headers) is the authoritative +// consumer; BatchPlotter / cli.cpp pull the same constants from +// here so the two sides agree on the encoding. + +#pragma once + +namespace pos2gpu { + +// Default thread-local value of sycl_backend::current_device_id_ref(). +// queue() picks sycl::gpu_selector_v in this case — the single-device +// zero-config path users see when --devices is not passed. +inline constexpr int kDefaultGpuId = -1; + +// Routes queue() to sycl::cpu_selector_v — AdaptiveCpp's OMP backend +// on the CPU build path (ACPP_TARGETS=omp). BatchPlotter pushes this +// into device_ids when --cpu (or `cpu` in --devices) is requested, +// so the multi-device fan-out treats CPU like just-another-device. +inline constexpr int kCpuDeviceId = -2; + +} // namespace pos2gpu diff --git a/src/gpu/FeistelCipherGpu.cuh b/src/gpu/FeistelCipherGpu.cuh index 28ee6d5..1afb256 100644 --- a/src/gpu/FeistelCipherGpu.cuh +++ b/src/gpu/FeistelCipherGpu.cuh @@ -5,7 +5,8 @@ #pragma once -#include +#include "gpu/PortableAttrs.hpp" + #include namespace pos2gpu { @@ -16,7 +17,7 @@ struct FeistelKey { int rounds; }; -__host__ __device__ inline FeistelKey make_feistel_key(uint8_t const* plot_id, int k, int rounds = 4) +POS2_HOST_DEVICE_INLINE FeistelKey make_feistel_key(uint8_t const* plot_id, int k, int rounds = 4) { FeistelKey fk; fk.k = k; @@ -26,14 +27,14 @@ __host__ __device__ inline FeistelKey make_feistel_key(uint8_t const* plot_id, i return fk; } -__host__ __device__ inline uint64_t feistel_rotate_left(uint64_t value, uint64_t shift, uint64_t bit_length) +POS2_HOST_DEVICE_INLINE uint64_t feistel_rotate_left(uint64_t value, uint64_t shift, uint64_t bit_length) { if (shift > bit_length) shift = bit_length; uint64_t mask = (bit_length == 64 ? 
~0ULL : ((1ULL << bit_length) - 1)); return ((value << shift) & mask) | (value >> (bit_length - shift)); } -__host__ __device__ inline uint64_t feistel_slice_key(FeistelKey const& fk, int start_bit, int num_bits) +POS2_HOST_DEVICE_INLINE uint64_t feistel_slice_key(FeistelKey const& fk, int start_bit, int num_bits) { int start_byte = start_bit / 8; int bit_offset = start_bit % 8; @@ -49,7 +50,7 @@ __host__ __device__ inline uint64_t feistel_slice_key(FeistelKey const& fk, int return (key_segment >> shift_amount) & mask; } -__host__ __device__ inline uint64_t feistel_round_key(FeistelKey const& fk, int round_num) +POS2_HOST_DEVICE_INLINE uint64_t feistel_round_key(FeistelKey const& fk, int round_num) { int half_length = fk.k; int bits_for_round = 3 * half_length; @@ -61,7 +62,7 @@ __host__ __device__ inline uint64_t feistel_round_key(FeistelKey const& fk, int struct FeistelResultGpu { uint64_t left, right; }; -__host__ __device__ inline FeistelResultGpu feistel_round( +POS2_HOST_DEVICE_INLINE FeistelResultGpu feistel_round( FeistelKey const& fk, uint64_t left, uint64_t right, uint64_t round_key) { int k = fk.k; @@ -87,7 +88,7 @@ __host__ __device__ inline FeistelResultGpu feistel_round( return res; } -__host__ __device__ inline uint64_t feistel_encrypt(FeistelKey const& fk, uint64_t input_value) +POS2_HOST_DEVICE_INLINE uint64_t feistel_encrypt(FeistelKey const& fk, uint64_t input_value) { int k = fk.k; uint64_t bitmask = (k == 64 ? ~0ULL : ((1ULL << k) - 1)); diff --git a/src/gpu/PipelineKernels.cuh b/src/gpu/PipelineKernels.cuh new file mode 100644 index 0000000..37f4a7f --- /dev/null +++ b/src/gpu/PipelineKernels.cuh @@ -0,0 +1,64 @@ +// PipelineKernels.cuh — backend-dispatched wrappers for the simple +// orchestration kernels in src/host/GpuPipeline.cu (init, gather, +// permute, merge). All five are pure grid-stride compute — no AES, no +// shared memory, no atomics — so the SYCL ports are mechanical. +// +// Selection at configure time via XCHPLOT2_BACKEND, same shape as +// T1Offsets / T2Offsets / T3Offsets. + +#pragma once + +#include + +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +// vals[i] = i for i in [0, count). Used to seed the index stream that +// the subsequent radix sort permutes. +void launch_init_u32_identity( + uint32_t* d_vals, + uint64_t count, + sycl::queue& q); + +// dst[p] = src[indices[p]] for p in [0, count). Two width specialisations. +void launch_gather_u64( + uint64_t const* d_src, + uint32_t const* d_indices, + uint64_t* d_dst, + uint64_t count, + sycl::queue& q); + +void launch_gather_u32( + uint32_t const* d_src, + uint32_t const* d_indices, + uint32_t* d_dst, + uint64_t count, + sycl::queue& q); + +// dst_meta[idx] = src_meta [indices[idx]] +// dst_xbits[idx] = src_xbits[indices[idx]] +// for idx in [0, count). T2's two-stream gather, fused. +void launch_permute_t2( + uint64_t const* d_src_meta, + uint32_t const* d_src_xbits, + uint32_t const* d_indices, + uint64_t* d_dst_meta, + uint32_t* d_dst_xbits, + uint64_t count, + sycl::queue& q); + +// Stable 2-way merge of two sorted (key, value) runs via per-thread +// merge-path binary search. A wins on ties (load-bearing for parity +// with the pool path's CUB radix sort). Only the (uint32, uint32) +// instantiation is currently used — both T1 and T2 streaming-merge +// paths sort uint32 keys (match_info) by uint32 indices. 
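+//
+// Tiny example of the tie rule (illustrative values only): merging
+// A = [(5,a0),(7,a1)] with B = [(5,b0),(9,b1)] yields
+// [(5,a0),(5,b0),(7,a1),(9,b1)] — the A element precedes the equal-key
+// B element, which is what a stable sort over A-then-B concatenated
+// would also produce.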
+void launch_merge_pairs_stable_2way_u32_u32( + uint32_t const* d_A_keys, uint32_t const* d_A_vals, uint64_t nA, + uint32_t const* d_B_keys, uint32_t const* d_B_vals, uint64_t nB, + uint32_t* d_out_keys, uint32_t* d_out_vals, + uint64_t total, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/PipelineKernelsSycl.cpp b/src/gpu/PipelineKernelsSycl.cpp new file mode 100644 index 0000000..bf665ae --- /dev/null +++ b/src/gpu/PipelineKernelsSycl.cpp @@ -0,0 +1,123 @@ +// PipelineKernelsSycl.cpp — SYCL implementation of the simple pipeline +// kernels. Mirrors PipelineKernelsCuda.cu; reuses the shared queue from +// SyclBackend.hpp. None of these touch AES so no T-table buffer is +// needed. + +#include "gpu/PipelineKernels.cuh" +#include "gpu/SyclBackend.hpp" + +#include + +namespace pos2gpu { + +namespace { + +constexpr size_t kThreads = 256; + +inline size_t global_for(uint64_t count) +{ + size_t groups = static_cast((count + kThreads - 1) / kThreads); + return groups * kThreads; +} + +} // namespace + +void launch_init_u32_identity( + uint32_t* d_vals, uint64_t count, sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(count), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t idx = it.get_global_id(0); + if (idx >= count) return; + d_vals[idx] = uint32_t(idx); + }).wait(); +} + +void launch_gather_u64( + uint64_t const* d_src, uint32_t const* d_indices, + uint64_t* d_dst, uint64_t count, sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(count), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t p = it.get_global_id(0); + if (p >= count) return; + d_dst[p] = d_src[d_indices[p]]; + }).wait(); +} + +void launch_gather_u32( + uint32_t const* d_src, uint32_t const* d_indices, + uint32_t* d_dst, uint64_t count, sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(count), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t p = it.get_global_id(0); + if (p >= count) return; + d_dst[p] = d_src[d_indices[p]]; + }).wait(); +} + +void launch_permute_t2( + uint64_t const* d_src_meta, uint32_t const* d_src_xbits, + uint32_t const* d_indices, + uint64_t* d_dst_meta, uint32_t* d_dst_xbits, + uint64_t count, sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(count), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t idx = it.get_global_id(0); + if (idx >= count) return; + uint32_t i = d_indices[idx]; + d_dst_meta[idx] = d_src_meta[i]; + d_dst_xbits[idx] = d_src_xbits[i]; + }).wait(); +} + +void launch_merge_pairs_stable_2way_u32_u32( + uint32_t const* d_A_keys, uint32_t const* d_A_vals, uint64_t nA, + uint32_t const* d_B_keys, uint32_t const* d_B_vals, uint64_t nB, + uint32_t* d_out_keys, uint32_t* d_out_vals, uint64_t total, + sycl::queue& q) +{ + q.parallel_for( + sycl::nd_range<1>{ global_for(total), kThreads }, + [=](sycl::nd_item<1> it) { + uint64_t p = it.get_global_id(0); + if (p >= total) return; + + uint64_t lo = (p > nB) ? (p - nB) : 0; + uint64_t hi = (p < nA) ? p : nA; + while (lo < hi) { + uint64_t i = lo + (hi - lo + 1) / 2; + uint64_t j = p - i; + uint32_t a_prev = d_A_keys[i - 1]; + uint32_t b_here = (j < nB) ? 
d_B_keys[j] : 0xFFFFFFFFu; + if (a_prev > b_here) { + hi = i - 1; + } else { + lo = i; + } + } + uint64_t i = lo; + uint64_t j = p - i; + + bool take_a; + if (i >= nA) take_a = false; + else if (j >= nB) take_a = true; + else take_a = d_A_keys[i] <= d_B_keys[j]; + + if (take_a) { + d_out_keys[p] = d_A_keys[i]; + d_out_vals[p] = d_A_vals[i]; + } else { + d_out_keys[p] = d_B_keys[j]; + d_out_vals[p] = d_B_vals[j]; + } + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/gpu/PortableAttrs.hpp b/src/gpu/PortableAttrs.hpp new file mode 100644 index 0000000..c959657 --- /dev/null +++ b/src/gpu/PortableAttrs.hpp @@ -0,0 +1,21 @@ +// PortableAttrs.hpp — backend-portable function attribute macros so the +// AES helpers in AesGpu.cuh / AesHashGpu.cuh compile under both nvcc +// (CUDA TU) and acpp/clang (SYCL TU). +// +// Under CUDA the macros expand to the usual __device__ / __host__ / etc. +// markup. Under non-CUDA the markup is dropped and we fall back to plain +// inline (with a force-inline hint where appropriate). The functions +// then compile as ordinary C++ that can be called from a SYCL kernel +// lambda by ADL with no special decoration. + +#pragma once + +#if defined(__CUDACC__) + #define POS2_DEVICE_INLINE __device__ __forceinline__ + #define POS2_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ + #define POS2_HOST_DEVICE __host__ __device__ +#else + #define POS2_DEVICE_INLINE inline __attribute__((always_inline)) + #define POS2_HOST_DEVICE_INLINE inline __attribute__((always_inline)) + #define POS2_HOST_DEVICE +#endif diff --git a/src/gpu/Sort.cuh b/src/gpu/Sort.cuh new file mode 100644 index 0000000..85b5d37 --- /dev/null +++ b/src/gpu/Sort.cuh @@ -0,0 +1,59 @@ +// Sort.cuh — backend-dispatched radix sort wrappers. +// +// Two implementations: +// SortCuda.cu — CUB-backed, compiled by nvcc. NVIDIA-only target. The +// wrapper takes sycl::queue& q and bridges by draining q +// with q.wait(), calling CUB on the default stream, then +// cudaStreamSynchronize(nullptr). CUB and the SYCL backend +// share the same primary CUDA context (libcuda underneath +// both), so device pointers interop natively. ~2 host +// fences per sort call (~50µs each, well under 1ms/plot). +// SortSycl.cpp — TODO: oneDPL-backed for AMD/Intel targets. Slower than +// CUB on NVIDIA but the only path on non-NVIDIA hardware. +// +// CMake selects between them based on the target. For now (NVIDIA-only) +// SortCuda.cu is always built. +// +// API mirrors CUB's two-mode contract: pass d_temp_storage=nullptr to +// query the required temp_bytes; pass real storage to perform the sort. + +#pragma once + +#include +#include + +#include + +namespace pos2gpu { + +// Sort (key, value) pairs by uint32 key over [begin_bit, end_bit) bits. +// Stable. Used for T1 / T2 / Xs sorts (key=match_info, value=index or x). +// +// Both keys_in/vals_in AND keys_out/vals_out are writable: the SYCL +// implementation uses them as a ping-pong pair across radix passes to +// avoid allocating its own (8 × N bytes) alt buffers. Caller treats +// keys_in/vals_in as scratch on input — they get clobbered. The result +// always lands in keys_out/vals_out (the wrapper does a final memcpy +// internally if the pass count is odd). The CUB backend ignores the +// non-constness — it still treats keys_in/vals_in as read-only. 
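A typical call against this contract follows the usual CUB-style two-phase pattern: query the scratch size with a null temp pointer, allocate, then sort. A minimal sketch (the helper name and the choice of end_bit = k are assumptions, not part of this patch):

inline void sort_pairs_by_match_info_example(
    sycl::queue& q,
    uint32_t* d_keys_in, uint32_t* d_keys_out,
    uint32_t* d_vals_in, uint32_t* d_vals_out,
    uint64_t count, int k)
{
    size_t temp_bytes = 0;
    pos2gpu::launch_sort_pairs_u32_u32(
        nullptr, temp_bytes,                      // sizing query only
        d_keys_in, d_keys_out, d_vals_in, d_vals_out,
        count, /*begin_bit=*/0, /*end_bit=*/k, q);

    void* d_temp = sycl::malloc_device<uint8_t>(temp_bytes, q);
    pos2gpu::launch_sort_pairs_u32_u32(
        d_temp, temp_bytes,
        d_keys_in, d_keys_out, d_vals_in, d_vals_out,
        count, 0, k, q);
    // Sorted result is now in d_keys_out / d_vals_out; d_keys_in and
    // d_vals_in have been used as ping-pong scratch and are clobbered.
    sycl::free(d_temp, q);
}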
+void launch_sort_pairs_u32_u32( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +// Sort uint64 keys over [begin_bit, end_bit) bits. Used for the final +// T3 fragment sort (sort by proof_fragment's low 2k bits). +// Same in/out ping-pong contract as launch_sort_pairs_u32_u32. +void launch_sort_keys_u64( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/SortCubInternal.cuh b/src/gpu/SortCubInternal.cuh new file mode 100644 index 0000000..322fd02 --- /dev/null +++ b/src/gpu/SortCubInternal.cuh @@ -0,0 +1,57 @@ +// SortCubInternal.cuh — pure-CUDA, SYCL-free declarations of the +// CUB-backed radix sort. This header is the only entry point that +// SortCuda.cu (compiled by nvcc) needs to see — it deliberately +// does NOT include so the nvcc translation unit +// never reaches into AdaptiveCpp's libkernel headers. +// +// AdaptiveCpp's expected consumer pattern is "compile through acpp, +// or stay out of the SYCL header tree." Pulling +// into a .cu file hits the legacy CUDA branch of half.hpp's +// __acpp_backend_switch and tries to reference __hadd / __hsub / +// etc. that aren't in scope without cuda_fp16.h. Keeping nvcc TUs +// SYCL-free removes that whole class of bug. +// +// The SYCL-typed public API stays in Sort.cuh; SortSyclCub.cpp +// (compiled by acpp) bridges by draining the SYCL queue, calling +// these CUB symbols, and the cudaStreamSynchronize at the end is +// already done inside the CUB body — see comments below. + +#pragma once + +#include +#include + +namespace pos2gpu { + +// Pure-CUDA CUB radix sort. Caller responsibilities: +// - Inputs (keys_in / vals_in) must be ready on the device — the +// SYCL adapter handles this by draining the producing queue +// with q.wait() before calling. +// - Output is on the default CUDA stream and is fully drained +// before the function returns (we cudaStreamSynchronize(nullptr) +// internally so the caller can immediately consume keys_out / +// vals_out without further fences). +// +// Sizing-query mode: pass d_temp_storage = nullptr; *temp_bytes is +// filled with the required scratch size and the function returns +// immediately without doing any work or any sync. +// +// Same in/out ping-pong contract as the SYCL-typed public API in +// Sort.cuh: keys_in/vals_in are clobbered, the result lands in +// keys_out/vals_out (memcpy from the CUB-chosen buffer if needed). +void cub_sort_pairs_u32_u32( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit); + +void cub_sort_keys_u64( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit); + +} // namespace pos2gpu diff --git a/src/gpu/SortCuda.cu b/src/gpu/SortCuda.cu new file mode 100644 index 0000000..3ea4c36 --- /dev/null +++ b/src/gpu/SortCuda.cu @@ -0,0 +1,130 @@ +// SortCuda.cu — CUB-backed implementation of the Sort.cuh wrappers. +// Compiled by nvcc; required when targeting NVIDIA. 
CUB's radix sort is +// state-of-the-art, so on NVIDIA we lean on it directly even from the +// SYCL host code by bridging the queue↔CUDA-stream boundary: drain the +// SYCL queue with q.wait(), run CUB on the default CUDA stream, then +// cudaStreamSynchronize(nullptr). Both backends share the same primary +// CUDA context (libcuda underneath both), so device pointers interop +// natively. Two host fences per sort call (~50µs each, well under +// 1ms/plot at the typical 3 sorts/plot rate). + +// Pure-CUDA TU — never include here, directly or +// transitively. AdaptiveCpp's libkernel reaches into nvcc's CUDA +// device pass via __acpp_backend_switch when the SYCL header is in +// scope, and that path was never intended to be used from +// nvcc-driver-compiled consumer TUs (per the AdaptiveCpp dev's +// guidance: stick to --acpp-targets=generic, or stay out of the +// SYCL header tree from non-acpp compilers). The SYCL-typed entry +// points live in SortSyclCub.cpp (compiled by acpp) and call into +// the cub_sort_* declarations below. +#include "gpu/SortCubInternal.cuh" + +#include +#include + +#include +#include + +namespace pos2gpu { + +namespace { + +inline void cuda_check_or_throw(cudaError_t err, char const* what) +{ + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUB ") + what + ": " + + cudaGetErrorString(err)); + } +} + +} // namespace + +// CUB DoubleBuffer mode: caller passes both buffers as a ping-pong pair, +// CUB picks which one the result lands in (db.Current()), and CUB's own +// scratch shrinks to ~MB of histograms instead of ~2 GB of internal +// temp keys/vals buffers it would otherwise allocate. We then memcpy +// db.Current() to keys_out if needed so the public API contract holds. +// +// Caller (SortSyclCub.cpp) drains the producing SYCL queue with q.wait() +// before this is called. This function syncs the default CUDA stream +// internally before returning so the caller can hand keys_out / vals_out +// straight back to SYCL without another fence. +void cub_sort_pairs_u32_u32( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit) +{ + if (d_temp_storage == nullptr) { + cub::DoubleBuffer d_keys(keys_in, keys_out); + cub::DoubleBuffer d_vals(vals_in, vals_out); + cuda_check_or_throw(cub::DeviceRadixSort::SortPairs( + nullptr, temp_bytes, + d_keys, d_vals, + static_cast(count), begin_bit, end_bit, /*stream=*/nullptr), + "SortPairs (sizing)"); + return; + } + + cub::DoubleBuffer d_keys(keys_in, keys_out); + cub::DoubleBuffer d_vals(vals_in, vals_out); + cuda_check_or_throw(cub::DeviceRadixSort::SortPairs( + d_temp_storage, temp_bytes, + d_keys, d_vals, + static_cast(count), begin_bit, end_bit, /*stream=*/nullptr), + "SortPairs"); + + // CUB picks the output buffer; copy to keys_out/vals_out if it landed + // in keys_in/vals_in instead. 
+ if (d_keys.Current() != keys_out) { + cuda_check_or_throw(cudaMemcpyAsync(keys_out, d_keys.Current(), + count * sizeof(uint32_t), cudaMemcpyDeviceToDevice, nullptr), + "memcpy keys_out"); + } + if (d_vals.Current() != vals_out) { + cuda_check_or_throw(cudaMemcpyAsync(vals_out, d_vals.Current(), + count * sizeof(uint32_t), cudaMemcpyDeviceToDevice, nullptr), + "memcpy vals_out"); + } + + cuda_check_or_throw(cudaStreamSynchronize(nullptr), + "cudaStreamSynchronize after SortPairs"); +} + +void cub_sort_keys_u64( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit) +{ + if (d_temp_storage == nullptr) { + cub::DoubleBuffer d_keys(keys_in, keys_out); + cuda_check_or_throw(cub::DeviceRadixSort::SortKeys( + nullptr, temp_bytes, + d_keys, + static_cast(count), begin_bit, end_bit, /*stream=*/nullptr), + "SortKeys (sizing)"); + return; + } + + cub::DoubleBuffer d_keys(keys_in, keys_out); + cuda_check_or_throw(cub::DeviceRadixSort::SortKeys( + d_temp_storage, temp_bytes, + d_keys, + static_cast(count), begin_bit, end_bit, /*stream=*/nullptr), + "SortKeys"); + + if (d_keys.Current() != keys_out) { + cuda_check_or_throw(cudaMemcpyAsync(keys_out, d_keys.Current(), + count * sizeof(uint64_t), cudaMemcpyDeviceToDevice, nullptr), + "memcpy keys_out"); + } + + cuda_check_or_throw(cudaStreamSynchronize(nullptr), + "cudaStreamSynchronize after SortKeys"); +} + +} // namespace pos2gpu diff --git a/src/gpu/SortDispatch.cpp b/src/gpu/SortDispatch.cpp new file mode 100644 index 0000000..f0d8d3f --- /dev/null +++ b/src/gpu/SortDispatch.cpp @@ -0,0 +1,104 @@ +// SortDispatch.cpp — runtime backend dispatch for the radix sort wrappers. +// +// Two implementations can coexist in the same binary on dual-toolchain +// builds: +// +// launch_sort_*_cub — CUB-backed (SortSyclCub.cpp + SortCuda.cu); +// present only when XCHPLOT2_HAVE_CUB defined. +// launch_sort_*_sycl — pure-SYCL hand-rolled radix (SortSycl.cpp); +// always present. +// +// The dispatcher picks based on the queue's device backend, so a hybrid +// host (NVIDIA + AMD on the same box) runs CUB on the NVIDIA worker and +// SYCL radix on the AMD worker without rebuilding. Single-vendor builds +// (BUILD_CUDA=OFF) compile out the CUB branch entirely; the dispatcher +// reduces to a single tail call. 
+ +#include "gpu/Sort.cuh" + +namespace pos2gpu { + +#if defined(XCHPLOT2_HAVE_CUB) +void launch_sort_pairs_u32_u32_cub( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +void launch_sort_keys_u64_cub( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); +#endif + +void launch_sort_pairs_u32_u32_sycl( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +void launch_sort_keys_u64_sycl( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q); + +void launch_sort_pairs_u32_u32( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ +#if defined(XCHPLOT2_HAVE_CUB) + if (q.get_device().get_backend() == sycl::backend::cuda) { + launch_sort_pairs_u32_u32_cub( + d_temp_storage, temp_bytes, + keys_in, keys_out, vals_in, vals_out, + count, begin_bit, end_bit, q); + return; + } +#endif + launch_sort_pairs_u32_u32_sycl( + d_temp_storage, temp_bytes, + keys_in, keys_out, vals_in, vals_out, + count, begin_bit, end_bit, q); +} + +void launch_sort_keys_u64( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ +#if defined(XCHPLOT2_HAVE_CUB) + if (q.get_device().get_backend() == sycl::backend::cuda) { + launch_sort_keys_u64_cub( + d_temp_storage, temp_bytes, + keys_in, keys_out, + count, begin_bit, end_bit, q); + return; + } +#endif + launch_sort_keys_u64_sycl( + d_temp_storage, temp_bytes, + keys_in, keys_out, + count, begin_bit, end_bit, q); +} + +} // namespace pos2gpu diff --git a/src/gpu/SortSycl.cpp b/src/gpu/SortSycl.cpp new file mode 100644 index 0000000..1984b35 --- /dev/null +++ b/src/gpu/SortSycl.cpp @@ -0,0 +1,391 @@ +// SortSycl.cpp — stable LSD radix sort in SYCL with parallel scan + +// per-tile parallel-across-tiles scatter. Used when XCHPLOT2_BUILD_CUDA=OFF; +// the CUDA build uses SortCuda.cu (CUB). +// +// Why hand-rolled? oneDPL's sort_by_key segfaults on AdaptiveCpp's CUDA +// backend, and AdaptiveCpp's bitonic_sort is O(N log² N) and unstable +// (we need stability for LSD radix). This implementation runs on every +// AdaptiveCpp backend (CUDA, HIP, Level Zero, OpenCL). +// +// Design (per 4-bit pass; RADIX=16; TILE_SIZE=1024 items per workgroup): +// Phase 1 — parallel per-tile count: each WG reduces its tile into a +// local 16-bucket histogram, then writes those 16 counts (no atomics) +// into a bucket-major device array tile_hist[d * num_tiles + t]. The +// bucket-major layout is what makes phase 2 a single 1-D scan. +// Phase 2 — global exclusive scan over the entire tile_hist via +// AdaptiveCpp's scanning::scan (decoupled-lookback, multi-WG, parallel). +// The scan output, tile_offsets[d * num_tiles + t], is exactly the +// starting position in the output where tile t's bucket-d items go, +// because the bucket-major layout means the scan accumulates each +// bucket's tiles in order, then rolls over to the next bucket. 
Stable +// by construction: tile t < t' always lands earlier within bucket d. +// Phase 3 — parallel-across-tiles scatter: each WG loads its tile into +// local memory, then thread 0 sequentially walks the tile and emits +// each item to out[tile_offsets[d * num_tiles + t] + pos[d]++]. Stable +// within each tile (sequential walk preserves input order). +// +// Performance vs CUB: significantly slower (single-thread scatter per WG +// is ~32× under-utilized vs CUB's warp-cooperative scatter), but parallel +// across tiles. Future work: cooperative intra-tile scatter using per-WG +// per-bucket prefix scans. For now, correct and parallel beats fast and +// wrong. + +#include "gpu/Sort.cuh" + +#include + +#include "hipSYCL/algorithms/scan/scan.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" + +#include +#include + +namespace pos2gpu { + +namespace { + +constexpr int RADIX_BITS = 4; +constexpr int RADIX = 1 << RADIX_BITS; +constexpr int RADIX_MASK = RADIX - 1; +constexpr int WG_SIZE = 256; +constexpr int ITEMS_PER_THREAD = 4; +constexpr int TILE_SIZE = WG_SIZE * ITEMS_PER_THREAD; // 1024 + +using local_atomic_u32 = sycl::atomic_ref< + uint32_t, + sycl::memory_order::relaxed, + sycl::memory_scope::work_group, + sycl::access::address_space::local_space>; + +// Per-process scratch cache for AdaptiveCpp's scan algorithm. Lives for +// the program's lifetime; allocations are pooled and reused across calls. +hipsycl::algorithms::util::allocation_cache& scan_alloc_cache() +{ + static hipsycl::algorithms::util::allocation_cache cache( + hipsycl::algorithms::util::allocation_type::device); + return cache; +} + +uint64_t tile_count_for(uint64_t count) +{ + return (count + TILE_SIZE - 1) / TILE_SIZE; +} + +void radix_pass_pairs_u32( + sycl::queue& q, + uint32_t const* in_keys, uint32_t const* in_vals, + uint32_t* out_keys, uint32_t* out_vals, + uint32_t* tile_hist, uint32_t* tile_offsets, + uint64_t count, int bit) +{ + uint64_t const num_tiles = tile_count_for(count); + uint64_t const grid = num_tiles * WG_SIZE; + + // Phase 1: per-tile histogram → tile_hist[d * num_tiles + t]. + q.submit([&](sycl::handler& h) { + sycl::local_accessor local_hist(sycl::range<1>(RADIX), h); + h.parallel_for(sycl::nd_range<1>(grid, WG_SIZE), + [=](sycl::nd_item<1> it) { + int const tid = static_cast(it.get_local_id(0)); + uint64_t const tile = it.get_group(0); + + if (tid < RADIX) local_hist[tid] = 0; + it.barrier(sycl::access::fence_space::local_space); + + uint64_t const base = tile * TILE_SIZE; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + uint64_t const idx = base + static_cast(i) * WG_SIZE + tid; + if (idx < count) { + uint32_t const d = (in_keys[idx] >> bit) & RADIX_MASK; + local_atomic_u32(local_hist[d]).fetch_add(1u); + } + } + it.barrier(sycl::access::fence_space::local_space); + + if (tid < RADIX) { + tile_hist[static_cast(tid) * num_tiles + tile] = local_hist[tid]; + } + }); + }); + q.wait(); + + // Phase 2: parallel exclusive scan over the entire tile_hist. + { + hipsycl::algorithms::util::allocation_group scratch_alloc( + &scan_alloc_cache(), q.get_device()); + size_t const scan_size = static_cast(RADIX) * static_cast(num_tiles); + hipsycl::algorithms::scanning::scan( + q, scratch_alloc, + tile_hist, tile_hist + scan_size, + tile_offsets, + sycl::plus{}, + uint32_t{0}).wait(); + } + + // Phase 3: per-tile stable scatter, cooperative across the WG. 
+ // Items are laid out in local memory CONTIGUOUSLY-PER-THREAD so that + // the per-digit prefix scan (one per bucket; 16 iterations) yields + // ranks in input order, preserving stability. Each iteration: + // 1. Each thread counts its items that match the current digit. + // 2. exclusive_scan_over_group turns those counts into per-thread + // offsets within the bucket. + // 3. Each thread scatters its matching items to local_bases[d] + + // offset, advancing one position per matching item. + q.submit([&](sycl::handler& h) { + sycl::local_accessor local_keys (sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_vals (sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_digits(sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_bases (sycl::range<1>(RADIX), h); + h.parallel_for(sycl::nd_range<1>(grid, WG_SIZE), + [=](sycl::nd_item<1> it) { + int const tid = static_cast(it.get_local_id(0)); + uint64_t const tile = it.get_group(0); + auto const grp = it.get_group(); + + uint64_t const base = tile * TILE_SIZE; + int const items_in_tile = static_cast( + sycl::min(TILE_SIZE, count - base)); + + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile) { + uint32_t const k = in_keys[base + local_pos]; + local_keys [local_pos] = k; + local_vals [local_pos] = in_vals[base + local_pos]; + local_digits[local_pos] = static_cast((k >> bit) & RADIX_MASK); + } + } + + if (tid < RADIX) { + local_bases[tid] = tile_offsets[ + static_cast(tid) * num_tiles + tile]; + } + it.barrier(sycl::access::fence_space::local_space); + + for (int d = 0; d < RADIX; ++d) { + uint32_t my_count = 0; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile && local_digits[local_pos] == d) { + ++my_count; + } + } + + uint32_t const my_offset = sycl::exclusive_scan_over_group( + grp, my_count, sycl::plus()); + + uint32_t pos_in_bucket = my_offset; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile && local_digits[local_pos] == d) { + uint32_t const target = local_bases[d] + pos_in_bucket; + out_keys[target] = local_keys[local_pos]; + out_vals[target] = local_vals[local_pos]; + ++pos_in_bucket; + } + } + it.barrier(sycl::access::fence_space::local_space); + } + }); + }); + q.wait(); +} + +void radix_pass_keys_u64( + sycl::queue& q, + uint64_t const* in_keys, + uint64_t* out_keys, + uint32_t* tile_hist, uint32_t* tile_offsets, + uint64_t count, int bit) +{ + uint64_t const num_tiles = tile_count_for(count); + uint64_t const grid = num_tiles * WG_SIZE; + + q.submit([&](sycl::handler& h) { + sycl::local_accessor local_hist(sycl::range<1>(RADIX), h); + h.parallel_for(sycl::nd_range<1>(grid, WG_SIZE), + [=](sycl::nd_item<1> it) { + int const tid = static_cast(it.get_local_id(0)); + uint64_t const tile = it.get_group(0); + + if (tid < RADIX) local_hist[tid] = 0; + it.barrier(sycl::access::fence_space::local_space); + + uint64_t const base = tile * TILE_SIZE; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + uint64_t const idx = base + static_cast(i) * WG_SIZE + tid; + if (idx < count) { + uint32_t const d = + static_cast((in_keys[idx] >> bit) & uint64_t{RADIX_MASK}); + local_atomic_u32(local_hist[d]).fetch_add(1u); + } + } + it.barrier(sycl::access::fence_space::local_space); + + if (tid < RADIX) { + tile_hist[static_cast(tid) * num_tiles + tile] = local_hist[tid]; + } + }); + }); + 
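    // Toy illustration of the bucket-major layout the scan below relies
    // on (made-up sizes, not real values): with RADIX = 2 and
    // num_tiles = 2, where tile 0 holds 3 digit-0 items and 1 digit-1
    // item and tile 1 holds 2 digit-0 items and 2 digit-1 items:
    //
    //   tile_hist      = [ t0/d0=3, t1/d0=2, t0/d1=1, t1/d1=2 ]  (bucket-major)
    //   exclusive scan = [ 0,       3,       5,       6        ]  = tile_offsets
    //
    // Tile 0's digit-0 items start at output position 0, tile 1's
    // digit-0 items at 3, tile 0's digit-1 items at 5, tile 1's at 6:
    // every digit-0 item precedes every digit-1 item, and within a
    // digit lower-numbered tiles land first, which is exactly the
    // ordering the stable scatter phase depends on.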
q.wait(); + + { + hipsycl::algorithms::util::allocation_group scratch_alloc( + &scan_alloc_cache(), q.get_device()); + size_t const scan_size = static_cast(RADIX) * static_cast(num_tiles); + hipsycl::algorithms::scanning::scan( + q, scratch_alloc, + tile_hist, tile_hist + scan_size, + tile_offsets, + sycl::plus{}, + uint32_t{0}).wait(); + } + + q.submit([&](sycl::handler& h) { + sycl::local_accessor local_keys (sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_digits(sycl::range<1>(TILE_SIZE), h); + sycl::local_accessor local_bases (sycl::range<1>(RADIX), h); + h.parallel_for(sycl::nd_range<1>(grid, WG_SIZE), + [=](sycl::nd_item<1> it) { + int const tid = static_cast(it.get_local_id(0)); + uint64_t const tile = it.get_group(0); + auto const grp = it.get_group(); + + uint64_t const base = tile * TILE_SIZE; + int const items_in_tile = static_cast( + sycl::min(TILE_SIZE, count - base)); + + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile) { + uint64_t const k = in_keys[base + local_pos]; + local_keys [local_pos] = k; + local_digits[local_pos] = + static_cast((k >> bit) & uint64_t{RADIX_MASK}); + } + } + + if (tid < RADIX) { + local_bases[tid] = tile_offsets[ + static_cast(tid) * num_tiles + tile]; + } + it.barrier(sycl::access::fence_space::local_space); + + for (int d = 0; d < RADIX; ++d) { + uint32_t my_count = 0; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile && local_digits[local_pos] == d) { + ++my_count; + } + } + + uint32_t const my_offset = sycl::exclusive_scan_over_group( + grp, my_count, sycl::plus()); + + uint32_t pos_in_bucket = my_offset; + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int const local_pos = tid * ITEMS_PER_THREAD + i; + if (local_pos < items_in_tile && local_digits[local_pos] == d) { + uint32_t const target = local_bases[d] + pos_in_bucket; + out_keys[target] = local_keys[local_pos]; + ++pos_in_bucket; + } + } + it.barrier(sycl::access::fence_space::local_space); + } + }); + }); + q.wait(); +} + +} // namespace + +// DoubleBuffer-style ping-pong over caller's buffers — no internal alt +// allocation. Scratch is just tile_hist + tile_offsets (a few MB at k=28 +// vs the ~6 GB the old keys_alt/vals_alt cost there). The result lands +// in keys_out; if the pass count is odd we do one final memcpy from +// keys_in (which holds the result after the last swap). +// Renamed _sycl in 2026-05; the canonical launch_sort_pairs_u32_u32 lives +// in SortDispatch.cpp and routes to this implementation for non-CUDA +// devices (and for everything when XCHPLOT2_HAVE_CUB isn't defined). +void launch_sort_pairs_u32_u32_sycl( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ + uint64_t const num_tiles = tile_count_for(count); + size_t const bytes = sizeof(uint32_t) * RADIX * num_tiles * 2; + if (d_temp_storage == nullptr) { + temp_bytes = bytes; + return; + } + + uint8_t* p = static_cast(d_temp_storage); + uint32_t* tile_hist = reinterpret_cast(p); p += sizeof(uint32_t) * RADIX * num_tiles; + uint32_t* tile_offsets = reinterpret_cast(p); + + // First pass reads from keys_in (caller's input). Subsequent passes + // ping-pong between keys_in and keys_out — we treat keys_in as + // scratch from here on, which the public API documents. 
+ uint32_t* cur_keys = keys_in; + uint32_t* cur_vals = vals_in; + uint32_t* dst_keys = keys_out; + uint32_t* dst_vals = vals_out; + + for (int bit = begin_bit; bit < end_bit; bit += RADIX_BITS) { + radix_pass_pairs_u32(q, cur_keys, cur_vals, dst_keys, dst_vals, + tile_hist, tile_offsets, count, bit); + std::swap(cur_keys, dst_keys); + std::swap(cur_vals, dst_vals); + } + q.wait(); + + // After the loop, cur_keys/cur_vals point to the buffer holding the + // sorted result (because radix_pass writes to dst, then we swap so + // dst becomes the input for the next pass). If that's not keys_out, + // copy the result over. + if (cur_keys != keys_out) { + q.memcpy(keys_out, cur_keys, sizeof(uint32_t) * count); + q.memcpy(vals_out, cur_vals, sizeof(uint32_t) * count).wait(); + } +} + +void launch_sort_keys_u64_sycl( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ + uint64_t const num_tiles = tile_count_for(count); + size_t const bytes = sizeof(uint32_t) * RADIX * num_tiles * 2; + if (d_temp_storage == nullptr) { + temp_bytes = bytes; + return; + } + + uint8_t* p = static_cast(d_temp_storage); + uint32_t* tile_hist = reinterpret_cast(p); p += sizeof(uint32_t) * RADIX * num_tiles; + uint32_t* tile_offsets = reinterpret_cast(p); + + uint64_t* cur = keys_in; + uint64_t* dst = keys_out; + + for (int bit = begin_bit; bit < end_bit; bit += RADIX_BITS) { + radix_pass_keys_u64(q, cur, dst, tile_hist, tile_offsets, count, bit); + std::swap(cur, dst); + } + q.wait(); + + if (cur != keys_out) { + q.memcpy(keys_out, cur, sizeof(uint64_t) * count).wait(); + } +} + +} // namespace pos2gpu diff --git a/src/gpu/SortSyclCub.cpp b/src/gpu/SortSyclCub.cpp new file mode 100644 index 0000000..f1c47bf --- /dev/null +++ b/src/gpu/SortSyclCub.cpp @@ -0,0 +1,61 @@ +// SortSyclCub.cpp — SYCL-typed entry points for the CUB-backed sort. +// +// Compiled by acpp (the AdaptiveCpp compiler), so +// is in scope here. SortCuda.cu (compiled by nvcc) used to provide +// these directly with a `sycl::queue&` parameter, but that meant +// nvcc was reaching into AdaptiveCpp's libkernel headers — a path +// AdaptiveCpp doesn't intend to support. We now keep nvcc's view +// SYCL-free (see SortCubInternal.cuh) and bridge here: +// +// q.wait() — drain the producing SYCL +// queue so CUB sees the +// right inputs. +// cub_sort_*(...) — pure-CUDA CUB kernel + +// internal cudaStreamSync. +// +// This file is only built when XCHPLOT2_BUILD_CUDA=ON. The dispatcher +// in SortDispatch.cpp routes here for CUDA-backend queues; non-CUDA +// queues (HIP / Level Zero / OpenMP host) flow to SortSycl.cpp's +// launch_sort_*_sycl variants instead. AMD-only / Intel-only / CPU +// builds skip this file entirely (BUILD_CUDA=OFF). + +#include "gpu/Sort.cuh" +#include "gpu/SortCubInternal.cuh" + +namespace pos2gpu { + +void launch_sort_pairs_u32_u32_cub( + void* d_temp_storage, + size_t& temp_bytes, + uint32_t* keys_in, uint32_t* keys_out, + uint32_t* vals_in, uint32_t* vals_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ + // The sizing-query path (d_temp_storage == nullptr) never touches + // device memory — no need to fence the SYCL queue. 
+ if (d_temp_storage != nullptr) { + q.wait(); + } + cub_sort_pairs_u32_u32(d_temp_storage, temp_bytes, + keys_in, keys_out, vals_in, vals_out, + count, begin_bit, end_bit); +} + +void launch_sort_keys_u64_cub( + void* d_temp_storage, + size_t& temp_bytes, + uint64_t* keys_in, uint64_t* keys_out, + uint64_t count, + int begin_bit, int end_bit, + sycl::queue& q) +{ + if (d_temp_storage != nullptr) { + q.wait(); + } + cub_sort_keys_u64(d_temp_storage, temp_bytes, + keys_in, keys_out, count, begin_bit, end_bit); +} + +} // namespace pos2gpu diff --git a/src/gpu/SyclBackend.hpp b/src/gpu/SyclBackend.hpp new file mode 100644 index 0000000..6ad762a --- /dev/null +++ b/src/gpu/SyclBackend.hpp @@ -0,0 +1,268 @@ +// SyclBackend.hpp — shared SYCL infrastructure for the cross-backend +// kernel implementations in T*OffsetsSycl.cpp. +// +// Both helpers are header-only inline so multiple SYCL TUs (T1OffsetsSycl, +// T2OffsetsSycl, T3OffsetsSycl) share a single queue and a single AES +// T-table USM buffer per process — function-local statics inside inline +// functions have unique-instance semantics under ISO C++17+. +// +// This file is consumed only by the SYCL backend; CUDA TUs never include +// it. It depends on PortableAttrs.hpp solely for the AesTables namespace +// dependency through AesTables.inl, which has no CUDA-specific content. + +#pragma once + +#include "gpu/AesTables.inl" +#include "gpu/DeviceIds.hpp" + +// cuda_fp16.h must precede sycl/sycl.hpp when this header is consumed +// from an nvcc TU — AdaptiveCpp's libkernel/detail/half_representation.hpp +// references __half, which only exists once cuda_fp16 has been seen. +#include "gpu/CudaHalfShim.hpp" +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace pos2gpu::sycl_backend { + +// Async-exception handler for the persistent queue. AdaptiveCpp's +// default policy for unhandled async errors is to call std::terminate() +// via its `throw_result` path, which is what caused the observed +// "Aborted (core dumped)" after a synchronous malloc_device failure +// threw a clean std::runtime_error — secondary async errors (e.g. a +// CUDA:2 from in-flight work on the now-starved context) hit the +// default handler and killed the process before the CLI could exit +// normally. Logging and swallowing here keeps the synchronous +// std::runtime_error as the primary signal. +inline void async_error_handler(sycl::exception_list exns) noexcept +{ + for (std::exception_ptr const& ep : exns) { + try { std::rethrow_exception(ep); } + catch (sycl::exception const& e) { + std::fprintf(stderr, "[sycl async] %s\n", e.what()); + } + catch (std::exception const& e) { + std::fprintf(stderr, "[sycl async] %s\n", e.what()); + } + catch (...) { + std::fprintf(stderr, "[sycl async] (unknown exception type)\n"); + } + } +} + +// Per-thread target device id. A worker thread sets this once at startup +// via set_current_device_id() so that its subsequent queue() call returns +// a queue bound to the requested device. 
Sentinel values: +// kDefaultGpuId (-1) : sycl::gpu_selector_v (single-device default, +// pre-multi-GPU zero-config path) +// kCpuDeviceId (-2) : sycl::cpu_selector_v (latent — kept so a future +// SYCL-on-CPU benchmark path can compare against +// pos2-chip's hand-tuned CPU plotter; production +// --cpu / --devices cpu plotting bypasses this +// and dispatches directly to run_one_plot_cpu() +// in BatchPlotter, see CpuPlotter.cpp) +// 0..N-1 : explicit GPU index from +// sycl::device::get_devices(gpu) +// +// Thread-local, not global: the multi-device fan-out in BatchPlotter runs +// N worker threads, each binding to a distinct device. The main thread +// stays at kDefaultGpuId and sees the default selector. +inline int& current_device_id_ref() +{ + thread_local int id = kDefaultGpuId; + return id; +} + +inline void set_current_device_id(int id) +{ + current_device_id_ref() = id; +} + +inline int current_device_id() +{ + return current_device_id_ref(); +} + +// Every SYCL GPU device this process can see. Used by --devices N to +// translate the user's index into a sycl::device, and by --devices all +// to spawn a worker per device. +// +// Used to filter non-CUDA backends out when the CUB sort path was +// linked, on the theory that a worker landing on an AMD device with +// CUB-only sort would just die mid-pipeline. The runtime backend +// dispatch in SortDispatch.cpp made that filter unnecessary — a hybrid +// host (NVIDIA + AMD) can now run a worker per device, with each +// worker picking the right sort backend at queue construction time. +inline std::vector usable_gpu_devices() +{ + auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); + return devs; +} + +// Per-thread SYCL queue. Bound to the thread's current device id (see +// the kDefaultGpuId / kCpuDeviceId sentinels above). A unique_ptr wrapper +// lets us defer construction until the thread has had a chance to set +// its device id. +// +// gpu_selector_v ensures the CUDA-backed GPU (or whichever AdaptiveCpp +// was configured for) is picked over the OpenMP host device. cpu_selector_v +// bypasses GPU enumeration entirely and lands on AdaptiveCpp's OMP backend +// (CPU build path, ACPP_TARGETS=omp). +// +// Runs a one-shot dispatch sanity check on first construction (see +// validate_kernel_dispatch below). If AdaptiveCpp's HIP / CUDA backend +// on this host produces a no-op kernel stub at JIT/AOT time, the throw +// surfaces here — at the first GPU work request — instead of much later +// as a confusing "T1 match produced 0 entries" / streaming-tier error. +// Set POS2GPU_SKIP_SELFTEST=1 to bypass; useful when you've already +// validated the device this session and want lower startup overhead +// across many short-lived processes. +inline void validate_kernel_dispatch(sycl::queue& q) +{ + if (char const* v = std::getenv("POS2GPU_SKIP_SELFTEST"); v && v[0] == '1') { + return; + } + + constexpr std::size_t N = 16; + constexpr std::uint32_t kPattern = 0xDEADBEEFu; + + std::uint32_t* d = sycl::malloc_device(N, q); + if (!d) { + throw std::runtime_error( + "[selftest] sycl::malloc_device(16 * u32) returned null. " + "The SYCL runtime can't allocate even tiny device buffers — " + "device discovery probably failed (check rocminfo / nvidia-smi, " + "ACPP_VISIBILITY_MASK)."); + } + + // Sentinel-fill: a "no kernel writes landed" outcome shows the + // sentinel, not random uninitialised bytes that might happen to + // match the expected pattern by coincidence. 
+ q.memset(d, 0xCD, N * sizeof(std::uint32_t)).wait(); + q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> it) { + std::size_t idx = it.get_global_id(0); + d[idx] = kPattern + static_cast(idx); + }).wait(); + + std::uint32_t host[N] = {}; + q.memcpy(host, d, N * sizeof(std::uint32_t)).wait(); + sycl::free(d, q); + + int fails = 0; + for (std::size_t i = 0; i < N; ++i) { + if (host[i] != kPattern + static_cast(i)) ++fails; + } + if (fails == 0) return; + + char head[64]; + std::snprintf(head, sizeof(head), "0x%08x (expected 0x%08x)", + host[0], kPattern); + std::string msg = + "[selftest] SYCL kernel writes are not landing on the device. " + "A trivial parallel_for(16) writing a known pattern produced " + "host[0]="; + msg += head; + msg += ".\n "; + if (host[0] == 0xCDCDCDCDu) { + msg += "The pre-launch sentinel (0xCDCDCDCD) is intact, so the " + "kernel completed without writing anything. "; + } else { + msg += "The sentinel was overwritten but with a wrong value — " + "the kernel is dispatching but its output is corrupted. "; + } + msg += "Most likely AdaptiveCpp's HIP / CUDA backend on this host is " + "producing a no-op or miscompiled kernel stub at JIT/AOT time. " + "Diagnose with:\n" + " - ACPP_DEBUG_LEVEL=2 ./xchplot2 ... (shows the JIT log)\n" + " - rocminfo / nvidia-smi (confirm the actual ISA " + "matches the AOT target — see cargo:warning lines from your " + "last `cargo install`)\n" + " - try ACPP_TARGETS=generic (forces SSCP JIT instead " + "of an AOT spoof)\n" + "Bypass the self-test with POS2GPU_SKIP_SELFTEST=1 if you've " + "already validated this device this session."; + throw std::runtime_error(msg); +} + +inline sycl::queue& queue() +{ + thread_local std::unique_ptr q; + if (!q) { + int const id = current_device_id(); + if (id == kCpuDeviceId) { + // AdaptiveCpp's OpenMP backend exposes its host device as + // `info::device_type::host`, which SYCL 2020's + // `cpu_selector_v` *can* reject (host-device is deprecated + // in 2020). And a custom selector lambda does too on the + // 25.10 headers. Bypass selectors and take the first device + // visible under whatever ACPP_VISIBILITY_MASK is in effect — + // when limited to omp, that's the OMP host device by + // construction. When CPU + GPU are both visible, set the + // mask to "omp" before invoking to disambiguate. + auto devs = sycl::device::get_devices(); + if (devs.empty()) { + throw std::runtime_error( + "sycl_backend::queue (CPU): no SYCL devices visible. " + "Set ACPP_VISIBILITY_MASK=omp to expose AdaptiveCpp's " + "OpenMP backend."); + } + q = std::make_unique(devs.front(), + async_error_handler); + } else if (id < 0) { + q = std::make_unique(sycl::gpu_selector_v, + async_error_handler); + } else { + auto devices = usable_gpu_devices(); + if (id >= static_cast(devices.size())) { + throw std::runtime_error( + "sycl_backend::queue: device id " + std::to_string(id) + + " out of range (found " + std::to_string(devices.size()) + + " usable GPU device(s))"); + } + q = std::make_unique(devices[id], async_error_handler); + } + validate_kernel_dispatch(*q); + } + return *q; +} + +// Return the number of SYCL GPU devices visible to the process AND +// usable by this build. Used by BatchOptions::use_all_devices to expand +// "all" into an explicit list. See usable_gpu_devices() for the filter. 
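Taken together, the per-thread device-id sentinel and the lazily constructed thread_local queue give the multi-GPU fan-out a simple shape. A minimal caller sketch (the worker body and function name are assumptions, not the real BatchPlotter code; presumes <thread> and <vector>):

inline void fan_out_one_worker_per_gpu_example()
{
    int const n = pos2gpu::sycl_backend::get_gpu_device_count();
    std::vector<std::thread> workers;
    workers.reserve(static_cast<std::size_t>(n));
    for (int id = 0; id < n; ++id) {
        workers.emplace_back([id] {
            // Must run before the first queue() call on this thread:
            // queue() builds its thread_local queue lazily and binds it
            // to whatever device id is current at that moment.
            pos2gpu::sycl_backend::set_current_device_id(id);
            sycl::queue& q = pos2gpu::sycl_backend::queue();
            (void)q; // ... per-device plotting work would run on q ...
        });
    }
    for (auto& t : workers) t.join();
}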
+inline int get_gpu_device_count() +{ + return static_cast(usable_gpu_devices().size()); +} + +// AES T-tables uploaded into a USM device buffer on first use, kept +// alive for the thread's queue lifetime — mirrors the CUDA path's +// __constant__ T-tables. Thread-local because each worker thread's queue +// is on a different device; the table upload must happen once per device, +// not once per process. +// +// Pointer layout matches what the _smem family expects: [T0|T1|T2|T3], +// 256 entries each. +inline uint32_t* aes_tables_device(sycl::queue& q) +{ + thread_local uint32_t* d_tables = nullptr; + if (d_tables) return d_tables; + + std::vector sT_host(4 * 256); + for (int i = 0; i < 256; ++i) { + sT_host[0 * 256 + i] = pos2gpu::aes_tables::T0[i]; + sT_host[1 * 256 + i] = pos2gpu::aes_tables::T1[i]; + sT_host[2 * 256 + i] = pos2gpu::aes_tables::T2[i]; + sT_host[3 * 256 + i] = pos2gpu::aes_tables::T3[i]; + } + d_tables = sycl::malloc_device(4 * 256, q); + q.memcpy(d_tables, sT_host.data(), sizeof(uint32_t) * 4 * 256).wait(); + return d_tables; +} + +} // namespace pos2gpu::sycl_backend diff --git a/src/gpu/SyclDeviceList.cpp b/src/gpu/SyclDeviceList.cpp new file mode 100644 index 0000000..6993db4 --- /dev/null +++ b/src/gpu/SyclDeviceList.cpp @@ -0,0 +1,45 @@ +// SyclDeviceList.cpp — implementation of list_gpu_devices(). +// Compiled by acpp via add_sycl_to_target so the SYCL headers are in +// scope here; the public-facing header (SyclDeviceList.hpp) carries +// only plain types for non-acpp consumers like cli.cpp. + +#include "gpu/SyclDeviceList.hpp" +#include "gpu/SyclBackend.hpp" + +namespace pos2gpu { + +std::vector list_gpu_devices() +{ + std::vector out; + auto devs = sycl_backend::usable_gpu_devices(); + out.reserve(devs.size()); + for (std::size_t i = 0; i < devs.size(); ++i) { + auto const& d = devs[i]; + GpuDeviceInfo info{}; + info.id = i; + info.name = d.get_info(); + info.vram_bytes = d.get_info(); + info.cu_count = static_cast( + d.get_info()); + info.is_cuda_backend = false; + switch (d.get_backend()) { + case sycl::backend::cuda: + info.backend = "cuda"; + info.is_cuda_backend = true; + break; + case sycl::backend::hip: + info.backend = "hip"; + break; + case sycl::backend::level_zero: + info.backend = "level_zero"; + break; + default: + info.backend = "?"; + break; + } + out.push_back(std::move(info)); + } + return out; +} + +} // namespace pos2gpu diff --git a/src/gpu/SyclDeviceList.hpp b/src/gpu/SyclDeviceList.hpp new file mode 100644 index 0000000..0b35b99 --- /dev/null +++ b/src/gpu/SyclDeviceList.hpp @@ -0,0 +1,34 @@ +// SyclDeviceList.hpp — plain-types declaration for `xchplot2 devices` +// (and any other consumer that needs to enumerate GPU devices without +// pulling into its TU). +// +// cli.cpp is compiled by g++ with -Werror, and including SyclBackend.hpp +// drags in AdaptiveCpp's libkernel/host/builtins.hpp which has a +// narrowing-conversion warning that gets escalated to an error. Keeping +// this header SYCL-free lets non-acpp TUs query the device list via the +// implementation in SyclDeviceList.cpp (compiled by acpp). + +#pragma once + +#include +#include +#include +#include + +namespace pos2gpu { + +struct GpuDeviceInfo { + std::size_t id; + std::string name; + std::string backend; // "cuda" / "hip" / "level_zero" / "opencl" / "?" + bool is_cuda_backend; // true iff backend == sycl::backend::cuda + std::uint64_t vram_bytes; + unsigned cu_count; // max_compute_units +}; + +// Enumerate every visible SYCL GPU device. 
Order matches what +// `--devices N` uses for index lookup, so the printed `[N]` is a +// drop-in for that flag. +std::vector list_gpu_devices(); + +} // namespace pos2gpu diff --git a/src/gpu/T1Kernel.cpp b/src/gpu/T1Kernel.cpp new file mode 100644 index 0000000..75a43bf --- /dev/null +++ b/src/gpu/T1Kernel.cpp @@ -0,0 +1,202 @@ +// T1Kernel.cu — port of pos2-chip Table1Constructor. +// +// Algorithm (mirrors pos2-chip/src/plot/TableConstructorGeneric.hpp): +// +// For each section_l in {0,1,2,3} (order doesn't affect the *set* of +// T1Pairings produced; CPU iterates 3,0,2,1 but the post-construct +// sort by match_info collapses ordering): +// section_r = matching_section(section_l) +// For each match_key_r in [0, num_match_keys): +// L = sorted_xs[section_l..section_l+1) (entire section) +// R = sorted_xs in (section_r, match_key_r) bucket +// For each L candidate (one thread): +// target_l = matching_target(1, match_key_r, x_l) & target_mask +// binary-search R for first entry with match_target == target_l +// walk forward while still equal; for each: +// pairing_t1(x_l, x_r); if test_result == 0, emit T1Pairing +// { meta = (x_l << k) | x_r, match_info = pair.r[0] mask k } + +#include "host/PoolSizing.hpp" + +#include "gpu/AesGpu.cuh" +#include "gpu/AesHashGpu.cuh" +#include "gpu/T1Kernel.cuh" +#include "gpu/T1Offsets.cuh" + +#include +#include + +namespace pos2gpu { + +T1MatchParams make_t1_params(int k, int strength) +{ + T1MatchParams p{}; + p.k = k; + p.strength = strength; + p.num_section_bits = (k < 28) ? 2 : (k - 26); + p.num_match_key_bits = 2; // table_id == 1 + p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; + return p; +} + +// All T1 kernels (compute_bucket_offsets, compute_fine_bucket_offsets, +// match_all_buckets) and the previously-unused matching_section helper +// have moved to T1Offsets.cuh / T1OffsetsSycl.cpp on the cross-backend path. + +namespace { + +constexpr int kT1FineBits = 8; + +struct T1Derived { + uint32_t num_sections; + uint32_t num_match_keys; + uint32_t num_buckets; + uint64_t fine_entries; + size_t bucket_bytes; + size_t fine_bytes; + size_t temp_needed; + uint32_t target_mask; + uint64_t l_count_max; +}; + +T1Derived derive_t1(T1MatchParams const& params) +{ + T1Derived d{}; + d.num_sections = 1u << params.num_section_bits; + d.num_match_keys = 1u << params.num_match_key_bits; + d.num_buckets = d.num_sections * d.num_match_keys; + uint64_t const fine_count = 1ull << kT1FineBits; + d.fine_entries = uint64_t(d.num_buckets) * fine_count + 1; + d.bucket_bytes = sizeof(uint64_t) * (d.num_buckets + 1); + d.fine_bytes = sizeof(uint64_t) * d.fine_entries; + d.temp_needed = d.bucket_bytes + d.fine_bytes; + d.target_mask = (params.num_match_target_bits >= 32) + ? 
0xFFFFFFFFu + : ((1u << params.num_match_target_bits) - 1u); + d.l_count_max = + static_cast(max_pairs_per_section(params.k, params.num_section_bits)); + return d; +} + +} // namespace + +void launch_t1_match_prepare( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + if (!plot_id_bytes || !temp_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + + T1Derived const d = derive_t1(params); + + if (d_temp_storage == nullptr) { + *temp_bytes = d.temp_needed; + return; + } + if (*temp_bytes < d.temp_needed) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_xs || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.num_match_target_bits <= kT1FineBits) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto* d_offsets = reinterpret_cast(d_temp_storage); + auto* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + launch_compute_bucket_offsets( + d_sorted_xs, total, + params.num_match_target_bits, + d.num_buckets, d_offsets, q); + launch_compute_fine_bucket_offsets( + d_sorted_xs, d_offsets, + params.num_match_target_bits, kT1FineBits, + d.num_buckets, d_fine_offsets, q); + q.memset(d_out_count, 0, sizeof(uint64_t)).wait(); +} + +void launch_t1_match_range( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)total; + if (!plot_id_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_temp_storage) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_xs || !d_out_meta || !d_out_mi || !d_out_count) + throw std::invalid_argument("invalid argument to launch wrapper"); + + T1Derived const d = derive_t1(params); + if (bucket_end > d.num_buckets) throw std::invalid_argument("invalid argument to launch wrapper"); + if (bucket_end <= bucket_begin) return; + + constexpr int kThreads = 256; + uint64_t const blocks_x_u64 = (d.l_count_max + kThreads - 1) / kThreads; + if (blocks_x_u64 > UINT_MAX) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto const* d_offsets = reinterpret_cast(d_temp_storage); + auto const* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + AesHashKeys keys = make_keys(plot_id_bytes); + + int const extra_rounds_bits = params.strength - 2; + int const num_test_bits = params.num_match_key_bits; + int const num_info_bits = params.k; + + launch_t1_match_all_buckets( + keys, d_sorted_xs, + const_cast(d_offsets), + const_cast(d_fine_offsets), + d.num_match_keys, d.num_buckets, + params.k, params.num_section_bits, + params.num_match_target_bits, kT1FineBits, + extra_rounds_bits, d.target_mask, + num_test_bits, num_info_bits, + d_out_meta, d_out_mi, d_out_count, + capacity, 
d.l_count_max, + bucket_begin, bucket_end, q); +} + +void launch_t1_match( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t capacity, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + // Single-shot wrapper: prepare + one full-range match. Preserves + // the original API for pool path, test mode, and parity tests. + launch_t1_match_prepare( + plot_id_bytes, params, d_sorted_xs, total, + d_out_count, d_temp_storage, temp_bytes, q); + if (d_temp_storage == nullptr) return; // size-query path + + T1Derived const d = derive_t1(params); + launch_t1_match_range( + plot_id_bytes, params, d_sorted_xs, total, + d_out_meta, d_out_mi, d_out_count, + capacity, d_temp_storage, + /*bucket_begin=*/0, /*bucket_end=*/d.num_buckets, q); +} + +} // namespace pos2gpu diff --git a/src/gpu/T1Kernel.cu b/src/gpu/T1Kernel.cu deleted file mode 100644 index 43ef516..0000000 --- a/src/gpu/T1Kernel.cu +++ /dev/null @@ -1,328 +0,0 @@ -// T1Kernel.cu — port of pos2-chip Table1Constructor. -// -// Algorithm (mirrors pos2-chip/src/plot/TableConstructorGeneric.hpp): -// -// For each section_l in {0,1,2,3} (order doesn't affect the *set* of -// T1Pairings produced; CPU iterates 3,0,2,1 but the post-construct -// sort by match_info collapses ordering): -// section_r = matching_section(section_l) -// For each match_key_r in [0, num_match_keys): -// L = sorted_xs[section_l..section_l+1) (entire section) -// R = sorted_xs in (section_r, match_key_r) bucket -// For each L candidate (one thread): -// target_l = matching_target(1, match_key_r, x_l) & target_mask -// binary-search R for first entry with match_target == target_l -// walk forward while still equal; for each: -// pairing_t1(x_l, x_r); if test_result == 0, emit T1Pairing -// { meta = (x_l << k) | x_r, match_info = pair.r[0] mask k } - -#include "gpu/AesGpu.cuh" -#include "gpu/AesHashGpu.cuh" -#include "gpu/T1Kernel.cuh" - -#include -#include -#include -#include - -namespace pos2gpu { - -T1MatchParams make_t1_params(int k, int strength) -{ - T1MatchParams p{}; - p.k = k; - p.strength = strength; - p.num_section_bits = (k < 28) ? 2 : (k - 26); - p.num_match_key_bits = 2; // table_id == 1 - p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; - return p; -} - -namespace { - -// Mirrors pos2-chip/src/pos/ProofCore.hpp:198 matching_section. 
-__host__ __device__ inline uint32_t matching_section(uint32_t section, int num_section_bits) -{ - uint32_t num_sections = 1u << num_section_bits; - uint32_t mask = num_sections - 1u; - uint32_t rotated_left = ((section << 1) | (section >> (num_section_bits - 1))) & mask; - uint32_t rotated_left_plus_1 = (rotated_left + 1) & mask; - uint32_t section_new = ((rotated_left_plus_1 >> 1) - | (rotated_left_plus_1 << (num_section_bits - 1))) & mask; - return section_new; -} - -__global__ void compute_bucket_offsets( - XsCandidateGpu const* __restrict__ sorted, - uint64_t total, - int num_match_target_bits, // bucket id = match_info >> num_match_target_bits - uint32_t num_buckets, // num_sections * num_match_keys - uint64_t* __restrict__ offsets) // offsets[num_buckets + 1] -{ - if (threadIdx.x != 0 || blockIdx.x != 0) return; - uint32_t bucket_shift = static_cast(num_match_target_bits); - - uint64_t pos = 0; - for (uint32_t b = 0; b < num_buckets; ++b) { - uint64_t lo = pos, hi = total; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t bucket_mid = sorted[mid].match_info >> bucket_shift; - if (bucket_mid < b) lo = mid + 1; - else hi = mid; - } - offsets[b] = lo; - pos = lo; - } - offsets[num_buckets] = total; -} - -// See T3Kernel.cu for the rationale. T1's sorted stream is -// XsCandidateGpu AoS; we read match_info directly from the struct. -__global__ void compute_fine_bucket_offsets( - XsCandidateGpu const* __restrict__ sorted, - uint64_t const* __restrict__ bucket_offsets, - int num_match_target_bits, - int fine_bits, - uint32_t num_buckets, - uint64_t* __restrict__ fine_offsets) -{ - uint32_t const fine_count = 1u << fine_bits; - uint32_t const total = num_buckets * fine_count; - uint32_t const tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total) return; - - uint32_t const r_bucket = tid / fine_count; - uint32_t const fine_key = tid % fine_count; - - uint64_t const r_start = bucket_offsets[r_bucket]; - uint64_t const r_end = bucket_offsets[r_bucket + 1]; - - uint32_t const target_mask = (num_match_target_bits >= 32) - ? 0xFFFFFFFFu - : ((1u << num_match_target_bits) - 1u); - uint32_t const shift = static_cast(num_match_target_bits - fine_bits); - - uint64_t lo = r_start, hi = r_end; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t t = (sorted[mid].match_info & target_mask) >> shift; - if (t < fine_key) lo = mid + 1; - else hi = mid; - } - fine_offsets[tid] = lo; - - if (tid == total - 1) { - fine_offsets[total] = bucket_offsets[num_buckets]; - } -} - -// Fused match kernel: handles all (section_l, match_key_r) buckets in a -// single launch. blockIdx.y identifies the bucket, blockIdx.x slices L. -// Loads AES T-tables into shared memory once per block. 
-__global__ __launch_bounds__(256, 4) void match_all_buckets( - AesHashKeys keys, - XsCandidateGpu const* __restrict__ sorted_xs, - uint64_t const* __restrict__ d_offsets, // [num_buckets+1] - uint64_t const* __restrict__ d_fine_offsets, - uint32_t num_match_keys, - int k, - int num_section_bits, - int num_match_target_bits, - int fine_bits, - int extra_rounds_bits, - uint32_t target_mask, - int num_test_bits, - int num_match_info_bits, - T1PairingGpu* __restrict__ out, - unsigned long long* __restrict__ out_count, - uint64_t out_capacity) -{ - __shared__ uint32_t sT[4 * 256]; - load_aes_tables_smem(sT); - __syncthreads(); - - uint32_t bucket_id = blockIdx.y; // 0..num_buckets - uint32_t section_l = bucket_id / num_match_keys; - uint32_t match_key_r = bucket_id % num_match_keys; - - uint32_t section_r; - { - uint32_t mask = (1u << num_section_bits) - 1u; - uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; - uint32_t rl1 = (rl + 1) & mask; - section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; - } - - uint64_t l_start = d_offsets[section_l * num_match_keys]; - uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; - uint32_t r_bucket = section_r * num_match_keys + match_key_r; - - uint64_t l = l_start + blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (l >= l_end) return; - - uint32_t x_l = sorted_xs[l].x; - - // Per pos2-chip/src/pos/ProofHashing.hpp:160, T1's matching_target uses - // extra_rounds_bits = strength - 2 (only T1, not T2/T3). The kernel arg - // already carries that value; we were passing 0 here, producing wrong - // target_l values at strength > 2. - uint32_t target_l = matching_target_smem(keys, 1u, match_key_r, uint64_t(x_l), - sT, extra_rounds_bits) - & target_mask; - - // Fine-bucket pre-index; see T3Kernel.cu for rationale. - uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); - uint32_t fine_key = target_l >> fine_shift; - uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; - uint64_t lo = d_fine_offsets[fine_idx]; - uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; - uint64_t hi = fine_hi; - - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t target_mid = sorted_xs[mid].match_info & target_mask; - if (target_mid < target_l) lo = mid + 1; - else hi = mid; - } - - uint32_t test_mask = (num_test_bits >= 32) ? 0xFFFFFFFFu - : ((1u << num_test_bits) - 1u); - uint32_t info_mask = (num_match_info_bits >= 32) ? 
0xFFFFFFFFu - : ((1u << num_match_info_bits) - 1u); - - for (uint64_t r = lo; r < fine_hi; ++r) { - uint32_t target_r = sorted_xs[r].match_info & target_mask; - if (target_r != target_l) break; - - uint32_t x_r = sorted_xs[r].x; - Result128 res = pairing_smem(keys, uint64_t(x_l), uint64_t(x_r), sT, extra_rounds_bits); - - uint32_t test_result = res.r[3] & test_mask; - if (test_result != 0) continue; - - uint32_t match_info_result = res.r[0] & info_mask; - - unsigned long long out_idx = atomicAdd(out_count, 1ULL); - if (out_idx >= out_capacity) return; - - uint64_t meta = (uint64_t(x_l) << k) | uint64_t(x_r); - T1PairingGpu p; - p.meta_lo = uint32_t(meta); - p.meta_hi = uint32_t(meta >> 32); - p.match_info = match_info_result; - out[out_idx] = p; - } -} - -} // namespace - -cudaError_t launch_t1_match( - uint8_t const* plot_id_bytes, - T1MatchParams const& params, - XsCandidateGpu const* d_sorted_xs, - uint64_t total, - T1PairingGpu* d_out_pairings, - uint64_t* d_out_count, - uint64_t capacity, - void* d_temp_storage, - size_t* temp_bytes, - cudaStream_t stream) -{ - if (!plot_id_bytes || !temp_bytes) return cudaErrorInvalidValue; - if (params.k < 18 || params.k > 32) return cudaErrorInvalidValue; - if (params.strength < 2) return cudaErrorInvalidValue; - - uint32_t num_sections = 1u << params.num_section_bits; - uint32_t num_match_keys = 1u << params.num_match_key_bits; - uint32_t num_buckets = num_sections * num_match_keys; - - // temp layout: offsets[num_buckets + 1] uint64 || fine_offsets[num_buckets * 2^FINE_BITS + 1] - constexpr int FINE_BITS = 8; - uint64_t const fine_count = 1ull << FINE_BITS; - uint64_t const fine_entries = uint64_t(num_buckets) * fine_count + 1; - - size_t const bucket_bytes = sizeof(uint64_t) * (num_buckets + 1); - size_t const fine_bytes = sizeof(uint64_t) * fine_entries; - size_t const needed = bucket_bytes + fine_bytes; - - if (d_temp_storage == nullptr) { - *temp_bytes = needed; - return cudaSuccess; - } - if (*temp_bytes < needed) return cudaErrorInvalidValue; - if (!d_sorted_xs || !d_out_pairings || !d_out_count) return cudaErrorInvalidValue; - if (params.num_match_target_bits <= FINE_BITS) return cudaErrorInvalidValue; - - auto* d_offsets = reinterpret_cast(d_temp_storage); - auto* d_fine_offsets = d_offsets + (num_buckets + 1); - - AesHashKeys keys = make_keys(plot_id_bytes); - - // 1) Bucket offsets. - compute_bucket_offsets<<<1, 1, 0, stream>>>( - d_sorted_xs, total, - params.num_match_target_bits, - num_buckets, - d_offsets); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - // 1b) Fine-bucket offsets: one thread per (r_bucket, fine_key). - uint32_t fine_threads_total = num_buckets * uint32_t(fine_count); - unsigned fine_blocks = (fine_threads_total + 255) / 256; - compute_fine_bucket_offsets<<>>( - d_sorted_xs, d_offsets, - params.num_match_target_bits, FINE_BITS, - num_buckets, d_fine_offsets); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - // Reset out_count to 0. - err = cudaMemsetAsync(d_out_count, 0, sizeof(uint64_t), stream); - if (err != cudaSuccess) return err; - - // 2) Compute max L-count across sections (small H2D copy only for sizing). 
- std::vector h_offsets(num_buckets + 1); - err = cudaMemcpyAsync(h_offsets.data(), d_offsets, - sizeof(uint64_t) * (num_buckets + 1), - cudaMemcpyDeviceToHost, stream); - if (err != cudaSuccess) return err; - err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) return err; - - uint64_t l_count_max = 0; - for (uint32_t s = 0; s < num_sections; ++s) { - uint64_t l_count = h_offsets[(s + 1) * num_match_keys] - - h_offsets[s * num_match_keys]; - if (l_count > l_count_max) l_count_max = l_count; - } - - uint32_t target_mask = (params.num_match_target_bits >= 32) - ? 0xFFFFFFFFu - : ((1u << params.num_match_target_bits) - 1u); - int extra_rounds_bits = params.strength - 2; - int num_test_bits = params.num_match_key_bits; - int num_info_bits = params.k; - - constexpr int kThreads = 256; - uint64_t blocks_x_u64 = (l_count_max + kThreads - 1) / kThreads; - if (blocks_x_u64 > UINT_MAX) return cudaErrorInvalidValue; - dim3 grid(static_cast(blocks_x_u64), num_buckets, 1); - - match_all_buckets<<>>( - keys, d_sorted_xs, d_offsets, d_fine_offsets, - num_match_keys, - params.k, params.num_section_bits, - params.num_match_target_bits, FINE_BITS, - extra_rounds_bits, target_mask, - num_test_bits, num_info_bits, - d_out_pairings, - reinterpret_cast(d_out_count), - capacity); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - return cudaSuccess; -} - -} // namespace pos2gpu diff --git a/src/gpu/T1Kernel.cuh b/src/gpu/T1Kernel.cuh index 05a4aa3..71abf0a 100644 --- a/src/gpu/T1Kernel.cuh +++ b/src/gpu/T1Kernel.cuh @@ -9,7 +9,8 @@ #include "gpu/AesHashGpu.cuh" #include "gpu/XsKernel.cuh" -#include +#include "gpu/CudaHalfShim.hpp" +#include #include #include @@ -37,21 +38,66 @@ T1MatchParams make_t1_params(int k, int strength); // Run the full T1 phase. // d_sorted_xs : output of launch_construct_xs (sorted by match_info) // total : 1 << k -// d_out_pairings : caller-allocated, capacity entries +// d_out_meta : caller-allocated, capacity entries (uint64 meta). +// d_out_mi : caller-allocated, capacity entries (uint32 match_info). // d_out_count : single uint64_t, will hold actual emitted count -// capacity : max number of T1Pairings d_out_pairings can hold +// capacity : max number of T1Pairings the output arrays can hold // d_temp_storage : nullptr to query *temp_bytes; otherwise must be // at least *temp_bytes large -cudaError_t launch_t1_match( +// +// Output is SoA (two parallel streams) rather than an AoS T1PairingGpu +// array so the streaming pipeline can feed d_out_mi straight into CUB +// as the sort-key input and free it as soon as CUB consumes it, without +// touching the meta stream. Saves ~1 GB at k=28 during the T1 sort +// phase. t1_parity and other consumers rebuild the AoS form locally if +// they need it. +void launch_t1_match( uint8_t const* plot_id_bytes, T1MatchParams const& params, XsCandidateGpu const* d_sorted_xs, uint64_t total, - T1PairingGpu* d_out_pairings, + uint64_t* d_out_meta, + uint32_t* d_out_mi, uint64_t* d_out_count, uint64_t capacity, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream = nullptr); + sycl::queue& q); + +// Two-step entry point for callers that want to run T1 match in +// multiple bucket-range passes (parallel to T3's prepare/range plumbing). +// +// launch_t1_match_prepare: computes bucket + fine-bucket offsets into +// d_temp_storage and zeroes d_out_count. Same sizing protocol as +// launch_t1_match (d_temp_storage==nullptr fills *temp_bytes). 
+// +// launch_t1_match_range: runs the match kernel for bucket range +// [bucket_begin, bucket_end). Multiple calls sharing the same +// d_out_meta / d_out_mi / d_out_count produce a concatenated output +// via atomic append, byte-equivalent to a single full-range call +// after the subsequent T1 sort. +void launch_t1_match_prepare( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q); + +void launch_t1_match_range( + uint8_t const* plot_id_bytes, + T1MatchParams const& params, + XsCandidateGpu const* d_sorted_xs, + uint64_t total, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); } // namespace pos2gpu diff --git a/src/gpu/T1Offsets.cuh b/src/gpu/T1Offsets.cuh new file mode 100644 index 0000000..79ba482 --- /dev/null +++ b/src/gpu/T1Offsets.cuh @@ -0,0 +1,95 @@ +// T1Offsets.cuh — backend-dispatched wrapper for compute_bucket_offsets. +// +// One-thread-per-bucket binary search that emits offsets[num_buckets+1] +// for T1's sorted XsCandidateGpu stream. Two implementations live in +// sibling TUs and are selected at configure time: +// +// XCHPLOT2_BACKEND=cuda → T1OffsetsCuda.cu (default; existing __global__) +// XCHPLOT2_BACKEND=sycl → T1OffsetsSycl.cpp (AdaptiveCpp parallel_for) +// +// The CUDA stream parameter is honoured by both: the CUDA path launches +// directly on it; the SYCL path syncs the stream before its own launch +// and waits for the SYCL queue to complete before returning, so the +// caller can chain subsequent CUDA work on `stream` unchanged. + +#pragma once + +#include "gpu/AesHashGpu.cuh" +#include "gpu/XsCandidateGpu.hpp" + +#include + +// Forward-declare cudaStream_t instead of including , so the +// SYCL backend implementation (compiled by acpp/clang in non-CUDA mode) can +// include this header without dragging in nvcc-only intrinsics from the +// transitive AesGpu.cuh chain. CUDA-side TUs include +// themselves; the typedef redeclaration to the same type is permitted. +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +void launch_compute_bucket_offsets( + XsCandidateGpu const* d_sorted, + uint64_t total, + int num_match_target_bits, + uint32_t num_buckets, + uint64_t* d_offsets, + sycl::queue& q); + +// Per-fine-key offsets: for each (r_bucket, fine_key) in +// [0, num_buckets) × [0, 2^fine_bits), find the lowest index i in +// `sorted[bucket_offsets[r_bucket] .. bucket_offsets[r_bucket+1])` such +// that ((sorted[i].match_info & target_mask) >> shift) >= fine_key, where +// target_mask = (1<= l_end`. +// +// Across multiple calls sharing the same d_out_meta / d_out_mi / +// d_out_count, results append via the atomic counter — same pattern +// as T3 match's bucket-range plumbing. Used by minimal tier to split +// T1 match into N passes with smaller per-pass staging output, keeping +// d_t1_meta + d_t1_mi off-device until after T1 match completes. 
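[Editor's note] The multi-pass splitting described in the comment above is easiest to see in a host-side driver. The sketch below is hypothetical (the function name t1_match_two_pass, the halfway split point, and the buffer names d_meta / d_mi / d_count are mine); the launch_t1_match_prepare / launch_t1_match_range signatures and the num_buckets derivation are taken from this header, and sycl::malloc_device / sycl::free are standard SYCL 2020 USM calls. It assumes <sycl/sycl.hpp> and T1Kernel.cuh are in scope and all pointers are caller-owned device (USM) buffers.

    // Hypothetical driver (not in the tree): run T1 match in two bucket-range
    // passes that append into the same output streams via the atomic cursor.
    void t1_match_two_pass(uint8_t const* plot_id, T1MatchParams const& params,
                           XsCandidateGpu const* d_sorted_xs, uint64_t total,
                           uint64_t* d_meta, uint32_t* d_mi, uint64_t* d_count,
                           uint64_t capacity, sycl::queue& q)
    {
        size_t temp_bytes = 0;                                        // 1) size query
        launch_t1_match_prepare(plot_id, params, d_sorted_xs, total,
                                d_count, nullptr, &temp_bytes, q);
        void* d_temp = sycl::malloc_device<std::byte>(temp_bytes, q);

        launch_t1_match_prepare(plot_id, params, d_sorted_xs, total,  // 2) offsets + count := 0
                                d_count, d_temp, &temp_bytes, q);

        uint32_t const num_buckets =
            (1u << params.num_section_bits) * (1u << params.num_match_key_bits);
        uint32_t const mid = num_buckets / 2;                         // illustrative split point

        launch_t1_match_range(plot_id, params, d_sorted_xs, total,    // 3) pass A: [0, mid)
                              d_meta, d_mi, d_count, capacity, d_temp, 0, mid, q);
        launch_t1_match_range(plot_id, params, d_sorted_xs, total,    //    pass B: [mid, end)
                              d_meta, d_mi, d_count, capacity, d_temp, mid, num_buckets, q);

        sycl::free(d_temp, q);
        // *d_count now holds the combined pair count; the output order differs
        // from a single full-range call, but the subsequent T1 sort equalizes it.
    }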
+void launch_t1_match_all_buckets( + AesHashKeys keys, + XsCandidateGpu const* d_sorted_xs, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + int extra_rounds_bits, + uint32_t target_mask, + int num_test_bits, + int num_match_info_bits, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/T1OffsetsSycl.cpp b/src/gpu/T1OffsetsSycl.cpp new file mode 100644 index 0000000..c7708e4 --- /dev/null +++ b/src/gpu/T1OffsetsSycl.cpp @@ -0,0 +1,234 @@ +// T1OffsetsSycl.cpp — SYCL/AdaptiveCpp implementation of +// launch_compute_bucket_offsets, selected when XCHPLOT2_BACKEND=sycl. +// +// Same algorithm and output layout as T1OffsetsCuda.cu. The SYCL queue +// uses AdaptiveCpp's CUDA backend (gpu_selector picks the RTX 4090 in +// our test bench), which uses libcuda directly and shares the primary +// CUDA context with the rest of the pipeline — so raw CUDA device +// pointers from cudaMalloc are valid USM device pointers in the SYCL +// kernel without any copy or remap. +// +// Synchronisation: the function syncs `stream` before launching SYCL +// (so prior CUDA writes to d_sorted are visible) and waits for the +// SYCL queue after (so subsequent CUDA reads of d_offsets see the +// SYCL writes). Two extra host syncs vs. the pure-CUDA path; not +// perf-relevant for slice 2. + +#include "gpu/SyclBackend.hpp" +#include "gpu/T1Offsets.cuh" + +#include + +namespace pos2gpu { + + +void launch_compute_bucket_offsets( + XsCandidateGpu const* d_sorted, + uint64_t total, + int num_match_target_bits, + uint32_t num_buckets, + uint64_t* d_offsets, + sycl::queue& q) +{ + constexpr size_t threads = 256; + size_t const out_count = static_cast(num_buckets) + 1; + size_t const groups = (out_count + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t b = static_cast(it.get_global_id(0)); + if (b > num_buckets) return; + if (b == num_buckets) { d_offsets[num_buckets] = total; return; } + + uint32_t bucket_shift = static_cast(num_match_target_bits); + uint64_t lo = 0, hi = total; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t v = d_sorted[mid].match_info >> bucket_shift; + if (v < b) lo = mid + 1; + else hi = mid; + } + d_offsets[b] = lo; + }).wait(); +} + +void launch_compute_fine_bucket_offsets( + XsCandidateGpu const* d_sorted, + uint64_t const* d_bucket_offsets, + int num_match_target_bits, + int fine_bits, + uint32_t num_buckets, + uint64_t* d_fine_offsets, + sycl::queue& q) +{ + constexpr size_t threads = 256; + uint32_t const fine_count = 1u << fine_bits; + uint32_t const total = num_buckets * fine_count; + size_t const groups = (total + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t tid = static_cast(it.get_global_id(0)); + if (tid >= total) return; + + uint32_t r_bucket = tid / fine_count; + uint32_t fine_key = tid % fine_count; + + uint64_t r_start = d_bucket_offsets[r_bucket]; + uint64_t r_end = d_bucket_offsets[r_bucket + 1]; + + uint32_t target_mask = (num_match_target_bits >= 32) + ? 
0xFFFFFFFFu + : ((1u << num_match_target_bits) - 1u); + uint32_t shift = static_cast(num_match_target_bits - fine_bits); + + uint64_t lo = r_start, hi = r_end; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t t = (d_sorted[mid].match_info & target_mask) >> shift; + if (t < fine_key) lo = mid + 1; + else hi = mid; + } + d_fine_offsets[tid] = lo; + + if (tid == total - 1) { + d_fine_offsets[total] = d_bucket_offsets[num_buckets]; + } + }).wait(); +} + +void launch_t1_match_all_buckets( + AesHashKeys keys, + XsCandidateGpu const* d_sorted_xs, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + int extra_rounds_bits, + uint32_t target_mask, + int num_test_bits, + int num_match_info_bits, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)num_buckets; + if (bucket_end <= bucket_begin) return; + uint32_t const num_buckets_in_range = bucket_end - bucket_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + uint64_t blocks_x_u64 = (l_count_max + threads - 1) / threads; + size_t const blocks_x = static_cast(blocks_x_u64); + + auto* d_out_count_ull = + reinterpret_cast(d_out_count); + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<2>{ + sycl::range<2>{ static_cast(num_buckets_in_range), + blocks_x * threads }, + sycl::range<2>{ 1, threads } + }, + [=, keys_copy = keys](sycl::nd_item<2> it) { + // Cooperative load of AES T-tables into local memory. + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(1); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint32_t bucket_id = bucket_begin + static_cast(it.get_group(0)); + uint32_t section_l = bucket_id / num_match_keys; + uint32_t match_key_r = bucket_id % num_match_keys; + + uint32_t section_r; + { + uint32_t mask = (1u << num_section_bits) - 1u; + uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; + uint32_t rl1 = (rl + 1) & mask; + section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; + } + + uint64_t l_start = d_offsets[section_l * num_match_keys]; + uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; + uint32_t r_bucket = section_r * num_match_keys + match_key_r; + + uint64_t l = l_start + + it.get_group(1) * uint64_t(threads) + + local_id; + if (l >= l_end) return; + + uint32_t x_l = d_sorted_xs[l].x; + + uint32_t target_l = pos2gpu::matching_target_smem( + keys_copy, 1u, match_key_r, uint64_t(x_l), + sT, extra_rounds_bits) + & target_mask; + + uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); + uint32_t fine_key = target_l >> fine_shift; + uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; + uint64_t lo = d_fine_offsets[fine_idx]; + uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; + uint64_t hi = fine_hi; + + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t target_mid = d_sorted_xs[mid].match_info & target_mask; + if (target_mid < target_l) lo = mid + 1; + else hi = mid; + } + + uint32_t test_mask = (num_test_bits >= 32) ? 
0xFFFFFFFFu + : ((1u << num_test_bits) - 1u); + uint32_t info_mask = (num_match_info_bits >= 32) ? 0xFFFFFFFFu + : ((1u << num_match_info_bits) - 1u); + + for (uint64_t r = lo; r < fine_hi; ++r) { + uint32_t target_r = d_sorted_xs[r].match_info & target_mask; + if (target_r != target_l) break; + + uint32_t x_r = d_sorted_xs[r].x; + pos2gpu::Result128 res = pos2gpu::pairing_smem( + keys_copy, uint64_t(x_l), uint64_t(x_r), sT, extra_rounds_bits); + + uint32_t test_result = res.r[3] & test_mask; + if (test_result != 0) continue; + + uint32_t match_info_result = res.r[0] & info_mask; + + sycl::atomic_ref + out_count_atomic{ *d_out_count_ull }; + unsigned long long out_idx = out_count_atomic.fetch_add(1ULL); + if (out_idx >= out_capacity) return; + + uint64_t meta = (uint64_t(x_l) << k) | uint64_t(x_r); + d_out_meta[out_idx] = meta; + d_out_mi [out_idx] = match_info_result; + } + }); + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/gpu/T2Kernel.cpp b/src/gpu/T2Kernel.cpp new file mode 100644 index 0000000..e86bb1a --- /dev/null +++ b/src/gpu/T2Kernel.cpp @@ -0,0 +1,213 @@ +// T2Kernel.cu — port of pos2-chip Table2Constructor. +// +// Differences from T1 (see T1Kernel.cu): +// - Input is T1Pairing (12 bytes, has 64-bit meta accessor), not Xs_Candidate. +// - matching_target uses table_id=2 and meta=T1Pairing.meta() (64-bit). +// ProofHashing::matching_target sets extra_rounds_bits=0 for table_id != 1. +// - pairing_t2 calls AesHash::pairing without extra_rounds_bits (always 0). +// - num_match_key_bits = strength (not hard-coded 2 like T1). +// - Output T2Pairing has the AES pair.meta_result (64-bit) + x_bits derived +// from upper-k bits of meta_l/meta_r. + +#include "gpu/AesGpu.cuh" +#include "gpu/AesHashGpu.cuh" +#include "gpu/T2Kernel.cuh" +#include "gpu/T2Offsets.cuh" +#include "host/PoolSizing.hpp" + +#include +#include + +namespace pos2gpu { + +T2MatchParams make_t2_params(int k, int strength) +{ + T2MatchParams p{}; + p.k = k; + p.strength = strength; + p.num_section_bits = (k < 28) ? 2 : (k - 26); + p.num_match_key_bits = strength; // T2 uses strength match_key bits + p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; + return p; +} + +// T2's three kernels — compute_bucket_offsets, compute_fine_bucket_offsets, +// match_all_buckets — have moved to T2Offsets.cuh / T2OffsetsCuda.cu / +// T2OffsetsSycl.cpp on the cross-backend path. The previously-unused +// matching_section helper went with them. + +namespace { + +// Fine-bucket pre-index; see T3Kernel.cu for the scheme. +constexpr int kT2FineBits = 8; + +// Shared parameter derivation so launch_t2_match, launch_t2_match_prepare, +// and launch_t2_match_range all agree on bucket counts, offset layout, +// and temp_storage sizing. 
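[Editor's note] As a sanity check on the shared derivation this comment introduces (make_t2_params above plus derive_t2 below), the numbers for k = 28, strength = 2 work out as follows. They are hand-computed from the formulas in this file, not taken from any test in the tree:

    num_section_bits      = k - 26          = 2      (k is not < 28)
    num_match_key_bits    = strength        = 2
    num_match_target_bits = 28 - 2 - 2      = 24  →  target_mask = 0x00FFFFFF
    num_buckets           = (1<<2) * (1<<2) = 16
    fine_entries          = 16 * 256 + 1    = 4097   (kT2FineBits = 8)
    temp_needed           = 17*8 + 4097*8   = 32,912 bytes  (~32 KiB of d_temp_storage)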
+struct T2Derived { + uint32_t num_sections; + uint32_t num_match_keys; + uint32_t num_buckets; + uint64_t fine_entries; + size_t bucket_bytes; + size_t fine_bytes; + size_t temp_needed; + uint32_t target_mask; + int num_test_bits; + int num_info_bits; + int half_k; + uint64_t l_count_max; +}; + +T2Derived derive_t2(T2MatchParams const& params) +{ + T2Derived d{}; + d.num_sections = 1u << params.num_section_bits; + d.num_match_keys = 1u << params.num_match_key_bits; + d.num_buckets = d.num_sections * d.num_match_keys; + uint64_t const fine_count = 1ull << kT2FineBits; + d.fine_entries = uint64_t(d.num_buckets) * fine_count + 1; + d.bucket_bytes = sizeof(uint64_t) * (d.num_buckets + 1); + d.fine_bytes = sizeof(uint64_t) * d.fine_entries; + d.temp_needed = d.bucket_bytes + d.fine_bytes; + d.target_mask = (params.num_match_target_bits >= 32) + ? 0xFFFFFFFFu + : ((1u << params.num_match_target_bits) - 1u); + d.num_test_bits = params.num_match_key_bits; + d.num_info_bits = params.k; + d.half_k = params.k / 2; + d.l_count_max = + static_cast(max_pairs_per_section(params.k, params.num_section_bits)); + return d; +} + +} // namespace + +void launch_t2_match_prepare( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + if (!plot_id_bytes || !temp_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + + T2Derived const d = derive_t2(params); + + if (d_temp_storage == nullptr) { + *temp_bytes = d.temp_needed; + return; + } + if (*temp_bytes < d.temp_needed) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_mi || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.num_match_target_bits <= kT2FineBits) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto* d_offsets = reinterpret_cast(d_temp_storage); + auto* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + launch_t2_compute_bucket_offsets( + d_sorted_mi, t1_count, + params.num_match_target_bits, + d.num_buckets, d_offsets, q); + launch_t2_compute_fine_bucket_offsets( + d_sorted_mi, d_offsets, + params.num_match_target_bits, kT2FineBits, + d.num_buckets, d_fine_offsets, q); + q.memset(d_out_count, 0, sizeof(uint64_t)).wait(); +} + +void launch_t2_match_range( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)t1_count; + if (!plot_id_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_temp_storage) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_meta || !d_sorted_mi || + !d_out_meta || !d_out_mi || !d_out_xbits || !d_out_count) + { + throw std::invalid_argument("invalid argument to launch wrapper"); + } + + 
T2Derived const d = derive_t2(params); + + if (bucket_end > d.num_buckets) throw std::invalid_argument("invalid argument to launch wrapper"); + if (bucket_end <= bucket_begin) return; // empty range is a no-op + + constexpr int kThreads = 256; + uint64_t const blocks_x_u64 = (d.l_count_max + kThreads - 1) / kThreads; + if (blocks_x_u64 > UINT_MAX) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto const* d_offsets = reinterpret_cast(d_temp_storage); + auto const* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + AesHashKeys keys = make_keys(plot_id_bytes); + + launch_t2_match_all_buckets( + keys, d_sorted_meta, d_sorted_mi, + // launch_t2_match_all_buckets takes mutable pointers to the + // offset arrays (historical — they're treated as const inside + // the kernel). Cast away const at the ABI boundary only. + const_cast(d_offsets), + const_cast(d_fine_offsets), + d.num_match_keys, d.num_buckets, + params.k, params.num_section_bits, + params.num_match_target_bits, kT2FineBits, + d.target_mask, d.num_test_bits, d.num_info_bits, d.half_k, + d_out_meta, d_out_mi, d_out_xbits, d_out_count, + capacity, d.l_count_max, + bucket_begin, bucket_end, + q); +} + +void launch_t2_match( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t capacity, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + // Single-shot wrapper: prepare + one full-range match. Preserves the + // original API for test-mode, the pool path, and parity-test callers. + launch_t2_match_prepare( + plot_id_bytes, params, d_sorted_mi, t1_count, + d_out_count, d_temp_storage, temp_bytes, q); + if (d_temp_storage == nullptr) return; // size-query path + + T2Derived const d = derive_t2(params); + launch_t2_match_range( + plot_id_bytes, params, + d_sorted_meta, d_sorted_mi, t1_count, + d_out_meta, d_out_mi, d_out_xbits, d_out_count, + capacity, d_temp_storage, + /*bucket_begin=*/0, /*bucket_end=*/d.num_buckets, q); +} + +} // namespace pos2gpu diff --git a/src/gpu/T2Kernel.cu b/src/gpu/T2Kernel.cu deleted file mode 100644 index 691d18b..0000000 --- a/src/gpu/T2Kernel.cu +++ /dev/null @@ -1,320 +0,0 @@ -// T2Kernel.cu — port of pos2-chip Table2Constructor. -// -// Differences from T1 (see T1Kernel.cu): -// - Input is T1Pairing (12 bytes, has 64-bit meta accessor), not Xs_Candidate. -// - matching_target uses table_id=2 and meta=T1Pairing.meta() (64-bit). -// ProofHashing::matching_target sets extra_rounds_bits=0 for table_id != 1. -// - pairing_t2 calls AesHash::pairing without extra_rounds_bits (always 0). -// - num_match_key_bits = strength (not hard-coded 2 like T1). -// - Output T2Pairing has the AES pair.meta_result (64-bit) + x_bits derived -// from upper-k bits of meta_l/meta_r. - -#include "gpu/AesGpu.cuh" -#include "gpu/AesHashGpu.cuh" -#include "gpu/T2Kernel.cuh" - -#include -#include -#include -#include - -namespace pos2gpu { - -T2MatchParams make_t2_params(int k, int strength) -{ - T2MatchParams p{}; - p.k = k; - p.strength = strength; - p.num_section_bits = (k < 28) ? 
2 : (k - 26); - p.num_match_key_bits = strength; // T2 uses strength match_key bits - p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; - return p; -} - -namespace { - -__host__ __device__ inline uint32_t matching_section(uint32_t section, int num_section_bits) -{ - uint32_t num_sections = 1u << num_section_bits; - uint32_t mask = num_sections - 1u; - uint32_t rotated_left = ((section << 1) | (section >> (num_section_bits - 1))) & mask; - uint32_t rotated_left_plus_1 = (rotated_left + 1) & mask; - uint32_t section_new = ((rotated_left_plus_1 >> 1) - | (rotated_left_plus_1 << (num_section_bits - 1))) & mask; - return section_new; -} - -__global__ void compute_bucket_offsets( - uint32_t const* __restrict__ sorted_mi, - uint64_t total, - int num_match_target_bits, - uint32_t num_buckets, - uint64_t* __restrict__ offsets) -{ - if (threadIdx.x != 0 || blockIdx.x != 0) return; - uint32_t bucket_shift = static_cast(num_match_target_bits); - - uint64_t pos = 0; - for (uint32_t b = 0; b < num_buckets; ++b) { - uint64_t lo = pos, hi = total; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t bucket_mid = sorted_mi[mid] >> bucket_shift; - if (bucket_mid < b) lo = mid + 1; - else hi = mid; - } - offsets[b] = lo; - pos = lo; - } - offsets[num_buckets] = total; -} - -// See T3Kernel.cu for the rationale — one offset per (r_bucket, top -// fine_bits of target) cuts the match-kernel bsearch window 256× at -// fine_bits=8. -__global__ void compute_fine_bucket_offsets( - uint32_t const* __restrict__ sorted_mi, - uint64_t const* __restrict__ bucket_offsets, - int num_match_target_bits, - int fine_bits, - uint32_t num_buckets, - uint64_t* __restrict__ fine_offsets) -{ - uint32_t const fine_count = 1u << fine_bits; - uint32_t const total = num_buckets * fine_count; - uint32_t const tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total) return; - - uint32_t const r_bucket = tid / fine_count; - uint32_t const fine_key = tid % fine_count; - - uint64_t const r_start = bucket_offsets[r_bucket]; - uint64_t const r_end = bucket_offsets[r_bucket + 1]; - - uint32_t const target_mask = (num_match_target_bits >= 32) - ? 
0xFFFFFFFFu - : ((1u << num_match_target_bits) - 1u); - uint32_t const shift = static_cast(num_match_target_bits - fine_bits); - - uint64_t lo = r_start, hi = r_end; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t t = (sorted_mi[mid] & target_mask) >> shift; - if (t < fine_key) lo = mid + 1; - else hi = mid; - } - fine_offsets[tid] = lo; - - if (tid == total - 1) { - fine_offsets[total] = bucket_offsets[num_buckets]; - } -} - -__global__ __launch_bounds__(256, 4) void match_all_buckets( - AesHashKeys keys, - uint64_t const* __restrict__ sorted_meta, - uint32_t const* __restrict__ sorted_mi, - uint64_t const* __restrict__ d_offsets, - uint64_t const* __restrict__ d_fine_offsets, - uint32_t num_match_keys, - int k, - int num_section_bits, - int num_match_target_bits, - int fine_bits, - uint32_t target_mask, - int num_test_bits, - int num_match_info_bits, - int half_k, - T2PairingGpu* __restrict__ out, - unsigned long long* __restrict__ out_count, - uint64_t out_capacity) -{ - __shared__ uint32_t sT[4 * 256]; - load_aes_tables_smem(sT); - __syncthreads(); - - uint32_t bucket_id = blockIdx.y; - uint32_t section_l = bucket_id / num_match_keys; - uint32_t match_key_r = bucket_id % num_match_keys; - - uint32_t section_r; - { - uint32_t mask = (1u << num_section_bits) - 1u; - uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; - uint32_t rl1 = (rl + 1) & mask; - section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; - } - - uint64_t l_start = d_offsets[section_l * num_match_keys]; - uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; - uint32_t r_bucket = section_r * num_match_keys + match_key_r; - - uint64_t l = l_start + blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (l >= l_end) return; - - uint64_t meta_l = sorted_meta[l]; - - uint32_t target_l = matching_target_smem(keys, 2u, match_key_r, meta_l, sT, 0) - & target_mask; - - // Fine-bucket pre-index; see T3Kernel.cu for rationale. - uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); - uint32_t fine_key = target_l >> fine_shift; - uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; - uint64_t lo = d_fine_offsets[fine_idx]; - uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; - uint64_t hi = fine_hi; - - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t target_mid = sorted_mi[mid] & target_mask; - if (target_mid < target_l) lo = mid + 1; - else hi = mid; - } - - uint32_t test_mask = (num_test_bits >= 32) ? 0xFFFFFFFFu - : ((1u << num_test_bits) - 1u); - uint32_t info_mask = (num_match_info_bits >= 32) ? 0xFFFFFFFFu - : ((1u << num_match_info_bits) - 1u); - int meta_bits = 2 * k; - - for (uint64_t r = lo; r < fine_hi; ++r) { - uint32_t target_r = sorted_mi[r] & target_mask; - if (target_r != target_l) break; - - uint64_t meta_r = sorted_meta[r]; - - Result128 res = pairing_smem(keys, meta_l, meta_r, sT, 0); - - uint32_t test_result = res.r[3] & test_mask; - if (test_result != 0) continue; - - uint32_t match_info_result = res.r[0] & info_mask; - uint64_t meta_result_full = uint64_t(res.r[1]) | (uint64_t(res.r[2]) << 32); - uint64_t meta_result = (meta_bits == 64) - ? 
meta_result_full - : (meta_result_full & ((1ULL << meta_bits) - 1ULL)); - - uint32_t x_bits_l = static_cast((meta_l >> k) >> half_k); - uint32_t x_bits_r = static_cast((meta_r >> k) >> half_k); - uint32_t x_bits = (x_bits_l << half_k) | x_bits_r; - - unsigned long long out_idx = atomicAdd(out_count, 1ULL); - if (out_idx >= out_capacity) return; - - T2PairingGpu p; - p.meta = meta_result; - p.match_info = match_info_result; - p.x_bits = x_bits; - out[out_idx] = p; - } -} - -} // namespace - -cudaError_t launch_t2_match( - uint8_t const* plot_id_bytes, - T2MatchParams const& params, - uint64_t const* d_sorted_meta, - uint32_t const* d_sorted_mi, - uint64_t t1_count, - T2PairingGpu* d_out_pairings, - uint64_t* d_out_count, - uint64_t capacity, - void* d_temp_storage, - size_t* temp_bytes, - cudaStream_t stream) -{ - if (!plot_id_bytes || !temp_bytes) return cudaErrorInvalidValue; - if (params.k < 18 || params.k > 32) return cudaErrorInvalidValue; - if (params.strength < 2) return cudaErrorInvalidValue; - - uint32_t num_sections = 1u << params.num_section_bits; - uint32_t num_match_keys = 1u << params.num_match_key_bits; - uint32_t num_buckets = num_sections * num_match_keys; - - // Fine-bucket pre-index; see T3Kernel.cu for the scheme. - constexpr int FINE_BITS = 8; - uint64_t const fine_count = 1ull << FINE_BITS; - uint64_t const fine_entries = uint64_t(num_buckets) * fine_count + 1; - - size_t const bucket_bytes = sizeof(uint64_t) * (num_buckets + 1); - size_t const fine_bytes = sizeof(uint64_t) * fine_entries; - size_t const needed = bucket_bytes + fine_bytes; - - if (d_temp_storage == nullptr) { - *temp_bytes = needed; - return cudaSuccess; - } - if (*temp_bytes < needed) return cudaErrorInvalidValue; - if (!d_sorted_meta || !d_sorted_mi || !d_out_pairings || !d_out_count) return cudaErrorInvalidValue; - if (params.num_match_target_bits <= FINE_BITS) return cudaErrorInvalidValue; - - auto* d_offsets = reinterpret_cast(d_temp_storage); - auto* d_fine_offsets = d_offsets + (num_buckets + 1); - - AesHashKeys keys = make_keys(plot_id_bytes); - - compute_bucket_offsets<<<1, 1, 0, stream>>>( - d_sorted_mi, t1_count, - params.num_match_target_bits, - num_buckets, - d_offsets); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - uint32_t fine_threads_total = num_buckets * uint32_t(fine_count); - unsigned fine_blocks = (fine_threads_total + 255) / 256; - compute_fine_bucket_offsets<<>>( - d_sorted_mi, d_offsets, - params.num_match_target_bits, FINE_BITS, - num_buckets, d_fine_offsets); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - err = cudaMemsetAsync(d_out_count, 0, sizeof(uint64_t), stream); - if (err != cudaSuccess) return err; - - std::vector h_offsets(num_buckets + 1); - err = cudaMemcpyAsync(h_offsets.data(), d_offsets, - sizeof(uint64_t) * (num_buckets + 1), - cudaMemcpyDeviceToHost, stream); - if (err != cudaSuccess) return err; - err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) return err; - - uint64_t l_count_max = 0; - for (uint32_t s = 0; s < num_sections; ++s) { - uint64_t l_count = h_offsets[(s + 1) * num_match_keys] - - h_offsets[s * num_match_keys]; - if (l_count > l_count_max) l_count_max = l_count; - } - - uint32_t target_mask = (params.num_match_target_bits >= 32) - ? 
0xFFFFFFFFu - : ((1u << params.num_match_target_bits) - 1u); - int num_test_bits = params.num_match_key_bits; - int num_info_bits = params.k; - int half_k = params.k / 2; - - constexpr int kThreads = 256; - uint64_t blocks_x_u64 = (l_count_max + kThreads - 1) / kThreads; - if (blocks_x_u64 > UINT_MAX) return cudaErrorInvalidValue; - dim3 grid(static_cast(blocks_x_u64), num_buckets, 1); - - match_all_buckets<<>>( - keys, d_sorted_meta, d_sorted_mi, - d_offsets, d_fine_offsets, - num_match_keys, - params.k, params.num_section_bits, - params.num_match_target_bits, FINE_BITS, - target_mask, num_test_bits, num_info_bits, half_k, - d_out_pairings, - reinterpret_cast(d_out_count), - capacity); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - return cudaSuccess; -} - -} // namespace pos2gpu diff --git a/src/gpu/T2Kernel.cuh b/src/gpu/T2Kernel.cuh index b311e66..d41b351 100644 --- a/src/gpu/T2Kernel.cuh +++ b/src/gpu/T2Kernel.cuh @@ -9,7 +9,8 @@ #include "gpu/AesHashGpu.cuh" #include "gpu/T1Kernel.cuh" -#include +#include "gpu/CudaHalfShim.hpp" +#include #include #include @@ -45,17 +46,67 @@ T2MatchParams make_t2_params(int k, int strength); // Dropping the 4-byte match_info from the permuted stream trims the sorted-T1 // footprint 12 B → 8 B per entry and removes wasted bandwidth on the match // kernel's hot meta loads. -cudaError_t launch_t2_match( +// +// Output is also SoA: three parallel streams instead of a packed +// T2PairingGpu array. This lets the streaming pipeline free the mi +// stream early (after it's consumed by the subsequent CUB sort as the +// key input) without touching the meta/xbits streams, shaving ~1 GB +// off the k=28 T2-sort peak. The matching-parity tool rebuilds +// T2PairingGpu locally when it needs the AoS form. +void launch_t2_match( uint8_t const* plot_id_bytes, T2MatchParams const& params, uint64_t const* d_sorted_meta, // meta, sorted by match_info ascending uint32_t const* d_sorted_mi, // parallel match_info stream uint64_t t1_count, - T2PairingGpu* d_out_pairings, + uint64_t* d_out_meta, // uint64 meta per emitted pair + uint32_t* d_out_mi, // uint32 match_info per emitted pair + uint32_t* d_out_xbits, // uint32 x_bits per emitted pair uint64_t* d_out_count, uint64_t capacity, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream = nullptr); + sycl::queue& q); + +// Two-step entry point for callers that want to run the match kernel +// in multiple bucket-range passes (e.g. the streaming pipeline's N=2 +// tiling — see docs/t2-match-tiling-plan.md). Equivalent to calling +// launch_t2_match with (0, num_buckets) when the range covers the +// whole bucket space. +// +// launch_t2_match_prepare: computes bucket + fine-bucket offsets into +// d_temp_storage and zeroes d_out_count. Same sizing protocol as +// launch_t2_match (d_temp_storage==nullptr fills *temp_bytes). +// +// launch_t2_match_range: runs the match kernel for bucket-id range +// [bucket_begin, bucket_end). Multiple calls sharing the same +// d_temp_storage / d_out_* buffers / d_out_count produce a single +// concatenated output (atomic counter), byte-equivalent to a single +// full-range call after the subsequent T2 sort. 
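[Editor's note] To put a number on the "~1 GB off the k=28 T2-sort peak" figure in the SoA paragraph above: the pair count below is my own rough estimate (roughly 2^28 surviving pairs at k=28), and sort_t2_by_match_info is a placeholder name for whatever sort consumes the match_info stream as its key input; sycl::free is the standard USM release call.

    // Rough arithmetic, not a measurement:
    //   2^28 pairs × (8 B meta + 4 B match_info + 4 B x_bits) = 4 GiB packed AoS.
    //   As three SoA streams: meta 2 GiB, mi 1 GiB, xbits 1 GiB — same total,
    //   but the 1 GiB mi stream is independently freeable once sorted:
    sort_t2_by_match_info(d_out_mi, d_out_meta, d_out_xbits, count, q);  // hypothetical sort step
    sycl::free(d_out_mi, q);  // ~1 GiB returned before the next phase's allocations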
+void launch_t2_match_prepare( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q); + +void launch_t2_match_range( + uint8_t const* plot_id_bytes, + T2MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t t1_count, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); } // namespace pos2gpu diff --git a/src/gpu/T2Offsets.cuh b/src/gpu/T2Offsets.cuh new file mode 100644 index 0000000..f5f2a30 --- /dev/null +++ b/src/gpu/T2Offsets.cuh @@ -0,0 +1,81 @@ +// T2Offsets.cuh — backend-dispatched wrappers for T2's three kernels. +// Parallel to T1Offsets.cuh; selected at configure time via XCHPLOT2_BACKEND +// (T2OffsetsCuda.cu vs T2OffsetsSycl.cpp). +// +// T2's input stream is SoA (uint64 meta + uint32 match_info) rather than +// T1's AoS XsCandidateGpu, so the bucket/fine-offset wrappers take the +// match_info array directly. The match kernel emits three output streams +// (meta, match_info, x_bits) instead of T1's two. + +#pragma once + +#include "gpu/AesHashGpu.cuh" + +#include + +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +void launch_t2_compute_bucket_offsets( + uint32_t const* d_sorted_mi, + uint64_t total, + int num_match_target_bits, + uint32_t num_buckets, + uint64_t* d_offsets, + sycl::queue& q); + +void launch_t2_compute_fine_bucket_offsets( + uint32_t const* d_sorted_mi, + uint64_t const* d_bucket_offsets, + int num_match_target_bits, + int fine_bits, + uint32_t num_buckets, + uint64_t* d_fine_offsets, + sycl::queue& q); + +// Fused T2 match. table_id=2, no strength scaling on AES rounds. Emits +// (meta, match_info, x_bits) triples via an atomic cursor; x_bits packs +// the upper-half-k bits of meta_l and meta_r per Table2Constructor. +// +// bucket_begin / bucket_end select which bucket-id range to process +// (inclusive / exclusive). Passing (0, num_buckets) preserves the +// original full-pass behavior. Smaller ranges let callers split T2 +// match into temporally-separated passes so downstream memory does +// not need to hold the full T2 output at once (see +// docs/t2-match-tiling-plan.md). +// +// Across all passes that share the same d_out_{meta,mi,xbits} + +// d_out_count, results append starting at the current value of +// d_out_count (atomic). Callers that want pass-disjoint output should +// sum counts themselves; callers that want the concatenation as a +// single array should simply leave d_out_count and the buffers untouched +// between passes. 
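[Editor's note] A caller that wants the pass-disjoint counts mentioned above can snapshot the atomic cursor between passes. Host variable names below are illustrative, the elided range calls follow the two-pass pattern sketched earlier, and q.memcpy / event::wait are standard SYCL queue operations:

    uint64_t after_pass1 = 0, after_pass2 = 0;
    // ... pass 1: launch_t2_match_range(..., /*bucket_begin=*/0,   /*bucket_end=*/mid, q);
    q.memcpy(&after_pass1, d_out_count, sizeof(uint64_t)).wait();
    // ... pass 2: launch_t2_match_range(..., /*bucket_begin=*/mid, /*bucket_end=*/num_buckets, q);
    q.memcpy(&after_pass2, d_out_count, sizeof(uint64_t)).wait();
    uint64_t emitted_pass1 = after_pass1;
    uint64_t emitted_pass2 = after_pass2 - after_pass1;   // pass 2 appends on top of pass 1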
+void launch_t2_match_all_buckets( + AesHashKeys keys, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + int num_match_info_bits, + int half_k, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/T2OffsetsSycl.cpp b/src/gpu/T2OffsetsSycl.cpp new file mode 100644 index 0000000..2887b5c --- /dev/null +++ b/src/gpu/T2OffsetsSycl.cpp @@ -0,0 +1,231 @@ +// T2OffsetsSycl.cpp — SYCL implementation of T2's three backend-dispatched +// kernels. Pattern mirrors T1OffsetsSycl.cpp; reuses the shared SYCL +// queue + AES-table USM buffer from SyclBackend.hpp. + +#include "gpu/SyclBackend.hpp" +#include "gpu/T2Offsets.cuh" + +#include + +namespace pos2gpu { + +void launch_t2_compute_bucket_offsets( + uint32_t const* d_sorted_mi, + uint64_t total, + int num_match_target_bits, + uint32_t num_buckets, + uint64_t* d_offsets, + sycl::queue& q) +{ + constexpr size_t threads = 256; + size_t const out_count = static_cast(num_buckets) + 1; + size_t const groups = (out_count + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t b = static_cast(it.get_global_id(0)); + if (b > num_buckets) return; + if (b == num_buckets) { d_offsets[num_buckets] = total; return; } + + uint32_t bucket_shift = static_cast(num_match_target_bits); + uint64_t lo = 0, hi = total; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t v = d_sorted_mi[mid] >> bucket_shift; + if (v < b) lo = mid + 1; + else hi = mid; + } + d_offsets[b] = lo; + }).wait(); +} + +void launch_t2_compute_fine_bucket_offsets( + uint32_t const* d_sorted_mi, + uint64_t const* d_bucket_offsets, + int num_match_target_bits, + int fine_bits, + uint32_t num_buckets, + uint64_t* d_fine_offsets, + sycl::queue& q) +{ + constexpr size_t threads = 256; + uint32_t const fine_count = 1u << fine_bits; + uint32_t const total = num_buckets * fine_count; + size_t const groups = (total + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t tid = static_cast(it.get_global_id(0)); + if (tid >= total) return; + + uint32_t r_bucket = tid / fine_count; + uint32_t fine_key = tid % fine_count; + + uint64_t r_start = d_bucket_offsets[r_bucket]; + uint64_t r_end = d_bucket_offsets[r_bucket + 1]; + + uint32_t target_mask = (num_match_target_bits >= 32) + ? 
0xFFFFFFFFu + : ((1u << num_match_target_bits) - 1u); + uint32_t shift = static_cast(num_match_target_bits - fine_bits); + + uint64_t lo = r_start, hi = r_end; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t t = (d_sorted_mi[mid] & target_mask) >> shift; + if (t < fine_key) lo = mid + 1; + else hi = mid; + } + d_fine_offsets[tid] = lo; + + if (tid == total - 1) { + d_fine_offsets[total] = d_bucket_offsets[num_buckets]; + } + }).wait(); +} + +void launch_t2_match_all_buckets( + AesHashKeys keys, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + int num_match_info_bits, + int half_k, + uint64_t* d_out_meta, + uint32_t* d_out_mi, + uint32_t* d_out_xbits, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)num_buckets; // only the [begin, end) sub-range is iterated + if (bucket_end <= bucket_begin) return; + uint32_t const num_buckets_in_range = bucket_end - bucket_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + uint64_t blocks_x_u64 = (l_count_max + threads - 1) / threads; + size_t const blocks_x = static_cast(blocks_x_u64); + + auto* d_out_count_ull = + reinterpret_cast(d_out_count); + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<2>{ + sycl::range<2>{ static_cast(num_buckets_in_range), + blocks_x * threads }, + sycl::range<2>{ 1, threads } + }, + [=, keys_copy = keys](sycl::nd_item<2> it) { + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(1); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint32_t bucket_id = bucket_begin + static_cast(it.get_group(0)); + uint32_t section_l = bucket_id / num_match_keys; + uint32_t match_key_r = bucket_id % num_match_keys; + + uint32_t section_r; + { + uint32_t mask = (1u << num_section_bits) - 1u; + uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; + uint32_t rl1 = (rl + 1) & mask; + section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; + } + + uint64_t l_start = d_offsets[section_l * num_match_keys]; + uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; + uint32_t r_bucket = section_r * num_match_keys + match_key_r; + + uint64_t l = l_start + + it.get_group(1) * uint64_t(threads) + + local_id; + if (l >= l_end) return; + + uint64_t meta_l = d_sorted_meta[l]; + + uint32_t target_l = pos2gpu::matching_target_smem( + keys_copy, 2u, match_key_r, meta_l, sT, 0) + & target_mask; + + uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); + uint32_t fine_key = target_l >> fine_shift; + uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; + uint64_t lo = d_fine_offsets[fine_idx]; + uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; + uint64_t hi = fine_hi; + + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t target_mid = d_sorted_mi[mid] & target_mask; + if (target_mid < target_l) lo = mid + 1; + else hi = mid; + } + + uint32_t test_mask = (num_test_bits >= 32) ? 
0xFFFFFFFFu + : ((1u << num_test_bits) - 1u); + uint32_t info_mask = (num_match_info_bits >= 32) ? 0xFFFFFFFFu + : ((1u << num_match_info_bits) - 1u); + int meta_bits = 2 * k; + + for (uint64_t r = lo; r < fine_hi; ++r) { + uint32_t target_r = d_sorted_mi[r] & target_mask; + if (target_r != target_l) break; + + uint64_t meta_r = d_sorted_meta[r]; + + pos2gpu::Result128 res = pos2gpu::pairing_smem( + keys_copy, meta_l, meta_r, sT, 0); + + uint32_t test_result = res.r[3] & test_mask; + if (test_result != 0) continue; + + uint32_t match_info_result = res.r[0] & info_mask; + uint64_t meta_result_full = uint64_t(res.r[1]) | (uint64_t(res.r[2]) << 32); + uint64_t meta_result = (meta_bits == 64) + ? meta_result_full + : (meta_result_full & ((1ULL << meta_bits) - 1ULL)); + + uint32_t x_bits_l = static_cast((meta_l >> k) >> half_k); + uint32_t x_bits_r = static_cast((meta_r >> k) >> half_k); + uint32_t x_bits = (x_bits_l << half_k) | x_bits_r; + + sycl::atomic_ref + out_count_atomic{ *d_out_count_ull }; + unsigned long long out_idx = out_count_atomic.fetch_add(1ULL); + if (out_idx >= out_capacity) return; + + d_out_meta [out_idx] = meta_result; + d_out_mi [out_idx] = match_info_result; + d_out_xbits[out_idx] = x_bits; + } + }); + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/gpu/T3Kernel.cpp b/src/gpu/T3Kernel.cpp new file mode 100644 index 0000000..a89db1a --- /dev/null +++ b/src/gpu/T3Kernel.cpp @@ -0,0 +1,268 @@ +// T3Kernel.cu — port of pos2-chip Table3Constructor. +// +// Differences from T2: +// - Input is T2Pairing { meta(64), match_info(32), x_bits(32) }. +// - matching_target uses table_id=3 and meta=T2Pairing.meta (no extra rounds). +// - pairing_t3 only consumes test_result; no match_info / meta extraction +// from the AES output. AES rounds = AES_PAIRING_ROUNDS (16), no strength +// bonus. +// - Emit T3Pairing { proof_fragment = FeistelCipher.encrypt(all_x_bits) } +// where all_x_bits = (l.x_bits << k) | r.x_bits. + +#include "gpu/AesGpu.cuh" +#include "gpu/AesHashGpu.cuh" +#include "gpu/FeistelCipherGpu.cuh" +#include "gpu/T2Offsets.cuh" +#include "gpu/T3Kernel.cuh" +#include "gpu/T3Offsets.cuh" +#include "host/PoolSizing.hpp" + +#include +#include + +namespace pos2gpu { + +// The CUDA __constant__ FeistelKey + its setup have moved to +// T3OffsetsCuda.cu, scoped to the wrapper that uses them. The SYCL +// path captures FeistelKey by value in the lambda instead. + +T3MatchParams make_t3_params(int k, int strength) +{ + T3MatchParams p{}; + p.k = k; + p.strength = strength; + p.num_section_bits = (k < 28) ? 2 : (k - 26); + p.num_match_key_bits = strength; + p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; + return p; +} + +// T3's three kernels (compute_bucket_offsets, compute_fine_bucket_offsets, +// match_all_buckets) have moved to the cross-backend path. The two offset +// kernels are bit-identical to T2's and reuse T2Offsets.cuh's wrappers; the +// match kernel — Feistel-encrypted output — has its own wrapper in +// T3Offsets.cuh. The previously-unused matching_section helper went with +// them. 
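[Editor's note] The section-rotation pairing that every match kernel inlines (and that the removed matching_section helper expressed) is easiest to sanity-check on the CPU. The reference below is a direct transcription of the in-kernel arithmetic; only the _ref suffix is mine, and the num_section_bits = 2 table was worked by hand.

    #include <cstdint>

    // Which R-section does L-section `section` pair with?  Rotate left by one,
    // add one, rotate right by one, all modulo 2^num_section_bits.
    inline uint32_t matching_section_ref(uint32_t section, int num_section_bits)
    {
        uint32_t mask = (1u << num_section_bits) - 1u;
        uint32_t rl   = ((section << 1) | (section >> (num_section_bits - 1))) & mask;
        uint32_t rl1  = (rl + 1) & mask;
        return ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask;
    }

    // For num_section_bits = 2 (k = 28 per make_t3_params) this maps
    //   0 → 2,  1 → 3,  2 → 1,  3 → 0
    // i.e. every section pairs with exactly one other section and never with itself.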
+ + +namespace { + +constexpr int kT3FineBits = 8; + +struct T3Derived { + uint32_t num_sections; + uint32_t num_match_keys; + uint32_t num_buckets; + uint64_t fine_entries; + size_t bucket_bytes; + size_t fine_bytes; + size_t temp_needed; + uint32_t target_mask; + int num_test_bits; + uint64_t l_count_max; +}; + +T3Derived derive_t3(T3MatchParams const& params) +{ + T3Derived d{}; + d.num_sections = 1u << params.num_section_bits; + d.num_match_keys = 1u << params.num_match_key_bits; + d.num_buckets = d.num_sections * d.num_match_keys; + uint64_t const fine_count = 1ull << kT3FineBits; + d.fine_entries = uint64_t(d.num_buckets) * fine_count + 1; + d.bucket_bytes = sizeof(uint64_t) * (d.num_buckets + 1); + d.fine_bytes = sizeof(uint64_t) * d.fine_entries; + d.temp_needed = d.bucket_bytes + d.fine_bytes; + d.target_mask = (params.num_match_target_bits >= 32) + ? 0xFFFFFFFFu + : ((1u << params.num_match_target_bits) - 1u); + d.num_test_bits = params.num_match_key_bits; + d.l_count_max = + static_cast(max_pairs_per_section(params.k, params.num_section_bits)); + return d; +} + +} // namespace + +void launch_t3_match_prepare( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + if (!plot_id_bytes || !temp_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + + T3Derived const d = derive_t3(params); + + if (d_temp_storage == nullptr) { + *temp_bytes = d.temp_needed; + return; + } + if (*temp_bytes < d.temp_needed) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_mi || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.num_match_target_bits <= kT3FineBits) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto* d_offsets = reinterpret_cast(d_temp_storage); + auto* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + // T3 reuses T2's offset wrappers (identical layout + algorithm). 
+ launch_t2_compute_bucket_offsets( + d_sorted_mi, t2_count, + params.num_match_target_bits, + d.num_buckets, d_offsets, q); + launch_t2_compute_fine_bucket_offsets( + d_sorted_mi, d_offsets, + params.num_match_target_bits, kT3FineBits, + d.num_buckets, d_fine_offsets, q); + q.memset(d_out_count, 0, sizeof(uint64_t)).wait(); +} + +void launch_t3_match_range( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)t2_count; + if (!plot_id_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_temp_storage) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_sorted_meta || !d_sorted_xbits || !d_sorted_mi + || !d_out_pairings || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + + T3Derived const d = derive_t3(params); + + if (bucket_end > d.num_buckets) throw std::invalid_argument("invalid argument to launch wrapper"); + if (bucket_end <= bucket_begin) return; + + constexpr int kThreads = 256; + uint64_t const blocks_x_u64 = (d.l_count_max + kThreads - 1) / kThreads; + if (blocks_x_u64 > UINT_MAX) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto const* d_offsets = reinterpret_cast(d_temp_storage); + auto const* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + AesHashKeys keys = make_keys(plot_id_bytes); + FeistelKey fk = make_feistel_key(plot_id_bytes, params.k, /*rounds=*/4); + + launch_t3_match_all_buckets( + keys, fk, + d_sorted_meta, d_sorted_xbits, d_sorted_mi, + const_cast(d_offsets), + const_cast(d_fine_offsets), + d.num_match_keys, d.num_buckets, + params.k, params.num_section_bits, + params.num_match_target_bits, kT3FineBits, + d.target_mask, d.num_test_bits, + d_out_pairings, d_out_count, + capacity, d.l_count_max, + bucket_begin, bucket_end, + q); +} + +void launch_t3_match_section_pair_range( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_meta_l_slice, + uint64_t section_l_row_start, + uint64_t const* d_meta_r_slice, + uint64_t section_r_row_start, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)t2_count; + if (!plot_id_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.k < 18 || params.k > 32) throw std::invalid_argument("invalid argument to launch wrapper"); + if (params.strength < 2) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_temp_storage) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_meta_l_slice || !d_meta_r_slice + || !d_sorted_xbits || !d_sorted_mi + || !d_out_pairings || !d_out_count) throw std::invalid_argument("invalid argument to launch wrapper"); + + T3Derived const d = derive_t3(params); + + if (bucket_end > d.num_buckets) throw std::invalid_argument("invalid argument to launch wrapper"); + if 
(bucket_end <= bucket_begin) return; + + constexpr int kThreads = 256; + uint64_t const blocks_x_u64 = (d.l_count_max + kThreads - 1) / kThreads; + if (blocks_x_u64 > UINT_MAX) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto const* d_offsets = reinterpret_cast(d_temp_storage); + auto const* d_fine_offsets = d_offsets + (d.num_buckets + 1); + + AesHashKeys keys = make_keys(plot_id_bytes); + FeistelKey fk = make_feistel_key(plot_id_bytes, params.k, /*rounds=*/4); + + launch_t3_match_section_pair( + keys, fk, + d_meta_l_slice, section_l_row_start, + d_meta_r_slice, section_r_row_start, + d_sorted_xbits, d_sorted_mi, + const_cast(d_offsets), + const_cast(d_fine_offsets), + d.num_match_keys, d.num_buckets, + params.k, params.num_section_bits, + params.num_match_target_bits, kT3FineBits, + d.target_mask, d.num_test_bits, + d_out_pairings, d_out_count, + capacity, d.l_count_max, + bucket_begin, bucket_end, + q); +} + +void launch_t3_match( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q) +{ + // Single-shot wrapper: prepare + one full-range match. Preserves the + // original API for pool path, test mode, and parity-test callers. + launch_t3_match_prepare( + plot_id_bytes, params, d_sorted_mi, t2_count, + d_out_count, d_temp_storage, temp_bytes, q); + if (d_temp_storage == nullptr) return; // size-query path + + T3Derived const d = derive_t3(params); + launch_t3_match_range( + plot_id_bytes, params, + d_sorted_meta, d_sorted_xbits, d_sorted_mi, t2_count, + d_out_pairings, d_out_count, + capacity, d_temp_storage, + /*bucket_begin=*/0, /*bucket_end=*/d.num_buckets, q); +} + +} // namespace pos2gpu diff --git a/src/gpu/T3Kernel.cu b/src/gpu/T3Kernel.cu deleted file mode 100644 index 6e91ba5..0000000 --- a/src/gpu/T3Kernel.cu +++ /dev/null @@ -1,337 +0,0 @@ -// T3Kernel.cu — port of pos2-chip Table3Constructor. -// -// Differences from T2: -// - Input is T2Pairing { meta(64), match_info(32), x_bits(32) }. -// - matching_target uses table_id=3 and meta=T2Pairing.meta (no extra rounds). -// - pairing_t3 only consumes test_result; no match_info / meta extraction -// from the AES output. AES rounds = AES_PAIRING_ROUNDS (16), no strength -// bonus. -// - Emit T3Pairing { proof_fragment = FeistelCipher.encrypt(all_x_bits) } -// where all_x_bits = (l.x_bits << k) | r.x_bits. - -#include "gpu/AesGpu.cuh" -#include "gpu/AesHashGpu.cuh" -#include "gpu/FeistelCipherGpu.cuh" -#include "gpu/T3Kernel.cuh" - -#include -#include -#include -#include - -namespace pos2gpu { - -// FeistelKey is 40 bytes (32-byte plot_id + 2 ints). Passed by value as -// a kernel arg, the compiler spilled it to local memory (STACK:40), so -// `fk.plot_id[i]` accesses inside feistel_encrypt became scattered LMEM -// LDGs — brutal for an L1-bound kernel. Stashing it in __constant__ -// memory makes those loads broadcast-cached across the warp instead. -__constant__ FeistelKey g_t3_fk; - -T3MatchParams make_t3_params(int k, int strength) -{ - T3MatchParams p{}; - p.k = k; - p.strength = strength; - p.num_section_bits = (k < 28) ? 
2 : (k - 26); - p.num_match_key_bits = strength; - p.num_match_target_bits = k - p.num_section_bits - p.num_match_key_bits; - return p; -} - -namespace { - -__host__ __device__ inline uint32_t matching_section(uint32_t section, int num_section_bits) -{ - uint32_t num_sections = 1u << num_section_bits; - uint32_t mask = num_sections - 1u; - uint32_t rotated_left = ((section << 1) | (section >> (num_section_bits - 1))) & mask; - uint32_t rotated_left_plus_1 = (rotated_left + 1) & mask; - uint32_t section_new = ((rotated_left_plus_1 >> 1) - | (rotated_left_plus_1 << (num_section_bits - 1))) & mask; - return section_new; -} - -__global__ void compute_bucket_offsets( - uint32_t const* __restrict__ sorted_mi, - uint64_t total, - int num_match_target_bits, - uint32_t num_buckets, - uint64_t* __restrict__ offsets) -{ - if (threadIdx.x != 0 || blockIdx.x != 0) return; - uint32_t bucket_shift = static_cast(num_match_target_bits); - - uint64_t pos = 0; - for (uint32_t b = 0; b < num_buckets; ++b) { - uint64_t lo = pos, hi = total; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t bucket_mid = sorted_mi[mid] >> bucket_shift; - if (bucket_mid < b) lo = mid + 1; - else hi = mid; - } - offsets[b] = lo; - pos = lo; - } - offsets[num_buckets] = total; -} - -// Compute fine-grained bucket offsets: one offset per (r_bucket, -// top-FINE_BITS-of-target) pair. Lets the match kernel replace a -// ~24-iteration bsearch on sorted_mi with a 2-LDG lookup + an ~16- -// iteration bsearch in a 256× narrower window. Each thread writes -// one fine_offsets entry via an in-range bsearch over sorted_mi -// restricted to its parent bucket. -__global__ void compute_fine_bucket_offsets( - uint32_t const* __restrict__ sorted_mi, - uint64_t const* __restrict__ bucket_offsets, - int num_match_target_bits, - int fine_bits, - uint32_t num_buckets, - uint64_t* __restrict__ fine_offsets) -{ - uint32_t const fine_count = 1u << fine_bits; - uint32_t const total = num_buckets * fine_count; - uint32_t const tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total) return; - - uint32_t const r_bucket = tid / fine_count; - uint32_t const fine_key = tid % fine_count; - - uint64_t const r_start = bucket_offsets[r_bucket]; - uint64_t const r_end = bucket_offsets[r_bucket + 1]; - - uint32_t const target_mask = (num_match_target_bits >= 32) - ? 0xFFFFFFFFu - : ((1u << num_match_target_bits) - 1u); - uint32_t const shift = static_cast(num_match_target_bits - fine_bits); - - uint64_t lo = r_start, hi = r_end; - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t t = (sorted_mi[mid] & target_mask) >> shift; - if (t < fine_key) lo = mid + 1; - else hi = mid; - } - fine_offsets[tid] = lo; - - // Last thread writes the sentinel (overall end = sorted_mi length). 
- if (tid == total - 1) { - fine_offsets[total] = bucket_offsets[num_buckets]; - } -} - -__global__ __launch_bounds__(256, 4) void match_all_buckets( - AesHashKeys keys, - uint64_t const* __restrict__ sorted_meta, - uint32_t const* __restrict__ sorted_xbits, - uint32_t const* __restrict__ sorted_mi, - uint64_t const* __restrict__ d_offsets, - uint64_t const* __restrict__ d_fine_offsets, - uint32_t num_match_keys, - int k, - int num_section_bits, - int num_match_target_bits, - int fine_bits, - uint32_t target_mask, - int num_test_bits, - T3PairingGpu* __restrict__ out, - unsigned long long* __restrict__ out_count, - uint64_t out_capacity) -{ - __shared__ uint32_t sT[4 * 256]; - load_aes_tables_smem(sT); - __syncthreads(); - - uint32_t bucket_id = blockIdx.y; - uint32_t section_l = bucket_id / num_match_keys; - uint32_t match_key_r = bucket_id % num_match_keys; - - uint32_t section_r; - { - uint32_t mask = (1u << num_section_bits) - 1u; - uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; - uint32_t rl1 = (rl + 1) & mask; - section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; - } - - uint64_t l_start = d_offsets[section_l * num_match_keys]; - uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; - uint32_t r_bucket = section_r * num_match_keys + match_key_r; - - uint64_t l = l_start + blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (l >= l_end) return; - - uint64_t meta_l = sorted_meta[l]; - uint32_t xb_l = sorted_xbits[l]; - - uint32_t target_l = matching_target_smem(keys, 3u, match_key_r, meta_l, sT, 0) - & target_mask; - - // Fine-bucket pre-index: narrows the bsearch range by 2^fine_bits - // using a precomputed offset table indexed by (r_bucket, top - // fine_bits of target_l). Two cached LDGs replace the outer d_offsets - // r_start/r_end and shrink the bsearch window 256× at fine_bits=8. - uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); - uint32_t fine_key = target_l >> fine_shift; - uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; - uint64_t lo = d_fine_offsets[fine_idx]; - uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; - uint64_t hi = fine_hi; - - while (lo < hi) { - uint64_t mid = lo + ((hi - lo) >> 1); - uint32_t target_mid = sorted_mi[mid] & target_mask; - if (target_mid < target_l) lo = mid + 1; - else hi = mid; - } - - uint32_t test_mask = (num_test_bits >= 32) ? 
0xFFFFFFFFu - : ((1u << num_test_bits) - 1u); - - for (uint64_t r = lo; r < fine_hi; ++r) { - uint32_t target_r = sorted_mi[r] & target_mask; - if (target_r != target_l) break; - - uint64_t meta_r = sorted_meta[r]; - uint32_t xb_r = sorted_xbits[r]; - - Result128 res = pairing_smem(keys, meta_l, meta_r, sT, 0); - uint32_t test_result = res.r[3] & test_mask; - if (test_result != 0) continue; - - uint64_t all_x_bits = (uint64_t(xb_l) << k) | uint64_t(xb_r); - uint64_t fragment = feistel_encrypt(g_t3_fk, all_x_bits); - - unsigned long long out_idx = atomicAdd(out_count, 1ULL); - if (out_idx >= out_capacity) return; - - T3PairingGpu p; - p.proof_fragment = fragment; - out[out_idx] = p; - } -} - -} // namespace - -cudaError_t launch_t3_match( - uint8_t const* plot_id_bytes, - T3MatchParams const& params, - uint64_t const* d_sorted_meta, - uint32_t const* d_sorted_xbits, - uint32_t const* d_sorted_mi, - uint64_t t2_count, - T3PairingGpu* d_out_pairings, - uint64_t* d_out_count, - uint64_t capacity, - void* d_temp_storage, - size_t* temp_bytes, - cudaStream_t stream) -{ - if (!plot_id_bytes || !temp_bytes) return cudaErrorInvalidValue; - if (params.k < 18 || params.k > 32) return cudaErrorInvalidValue; - if (params.strength < 2) return cudaErrorInvalidValue; - - uint32_t num_sections = 1u << params.num_section_bits; - uint32_t num_match_keys = 1u << params.num_match_key_bits; - uint32_t num_buckets = num_sections * num_match_keys; - - // Fine-bucket pre-index: 2^FINE_BITS slots per bucket shrinks the - // match-kernel bsearch window by the same factor. Requires at least - // FINE_BITS+1 bits of target range; num_match_target_bits is - // k - section_bits - match_key_bits = 14..30 across the supported - // (k, strength) matrix, so 8 fine bits always leaves ≥6 for bsearch. - constexpr int FINE_BITS = 8; - uint64_t const fine_count = 1ull << FINE_BITS; - uint64_t const fine_entries = uint64_t(num_buckets) * fine_count + 1; - - size_t const bucket_bytes = sizeof(uint64_t) * (num_buckets + 1); - size_t const fine_bytes = sizeof(uint64_t) * fine_entries; - size_t const needed = bucket_bytes + fine_bytes; - - if (d_temp_storage == nullptr) { - *temp_bytes = needed; - return cudaSuccess; - } - if (*temp_bytes < needed) return cudaErrorInvalidValue; - if (!d_sorted_meta || !d_sorted_xbits || !d_sorted_mi - || !d_out_pairings || !d_out_count) return cudaErrorInvalidValue; - if (params.num_match_target_bits <= FINE_BITS) { - // Fall-back would be needed here; not expected for supported - // (k, strength) combinations, so fail loudly if we ever trip it. - return cudaErrorInvalidValue; - } - - auto* d_offsets = reinterpret_cast(d_temp_storage); - auto* d_fine_offsets = d_offsets + (num_buckets + 1); - - AesHashKeys keys = make_keys(plot_id_bytes); - FeistelKey fk = make_feistel_key(plot_id_bytes, params.k, /*rounds=*/4); - cudaError_t fk_err = cudaMemcpyToSymbolAsync(g_t3_fk, &fk, sizeof(fk), - 0, cudaMemcpyHostToDevice, stream); - if (fk_err != cudaSuccess) return fk_err; - - compute_bucket_offsets<<<1, 1, 0, stream>>>( - d_sorted_mi, t2_count, - params.num_match_target_bits, - num_buckets, - d_offsets); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - // One thread per (r_bucket, fine_key). At T3 k=28 strength=2: - // 16 × 256 = 4096 threads = 16 blocks × 256. 
- uint32_t fine_threads_total = num_buckets * uint32_t(fine_count); - unsigned fine_blocks = (fine_threads_total + 255) / 256; - compute_fine_bucket_offsets<<>>( - d_sorted_mi, d_offsets, - params.num_match_target_bits, FINE_BITS, - num_buckets, d_fine_offsets); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - - err = cudaMemsetAsync(d_out_count, 0, sizeof(uint64_t), stream); - if (err != cudaSuccess) return err; - - std::vector h_offsets(num_buckets + 1); - err = cudaMemcpyAsync(h_offsets.data(), d_offsets, - sizeof(uint64_t) * (num_buckets + 1), - cudaMemcpyDeviceToHost, stream); - if (err != cudaSuccess) return err; - err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) return err; - - uint64_t l_count_max = 0; - for (uint32_t s = 0; s < num_sections; ++s) { - uint64_t l_count = h_offsets[(s + 1) * num_match_keys] - - h_offsets[s * num_match_keys]; - if (l_count > l_count_max) l_count_max = l_count; - } - - uint32_t target_mask = (params.num_match_target_bits >= 32) - ? 0xFFFFFFFFu - : ((1u << params.num_match_target_bits) - 1u); - int num_test_bits = params.num_match_key_bits; - - constexpr int kThreads = 256; - uint64_t blocks_x_u64 = (l_count_max + kThreads - 1) / kThreads; - if (blocks_x_u64 > UINT_MAX) return cudaErrorInvalidValue; - dim3 grid(static_cast(blocks_x_u64), num_buckets, 1); - - match_all_buckets<<>>( - keys, d_sorted_meta, d_sorted_xbits, d_sorted_mi, - d_offsets, d_fine_offsets, - num_match_keys, - params.k, params.num_section_bits, - params.num_match_target_bits, FINE_BITS, - target_mask, num_test_bits, - d_out_pairings, - reinterpret_cast(d_out_count), - capacity); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - return cudaSuccess; -} - -} // namespace pos2gpu diff --git a/src/gpu/T3Kernel.cuh b/src/gpu/T3Kernel.cuh index 46295b9..2711d06 100644 --- a/src/gpu/T3Kernel.cuh +++ b/src/gpu/T3Kernel.cuh @@ -10,7 +10,8 @@ #include "gpu/AesHashGpu.cuh" #include "gpu/T2Kernel.cuh" -#include +#include "gpu/CudaHalfShim.hpp" +#include #include #include @@ -35,7 +36,7 @@ T3MatchParams make_t3_params(int k, int strength); // sorted_t2 input is SoA-split: d_sorted_meta[i] is T2Pairing.meta and // d_sorted_xbits[i] is T2Pairing.x_bits after the T2 sort. match_info is // carried in the parallel d_sorted_mi stream. -cudaError_t launch_t3_match( +void launch_t3_match( uint8_t const* plot_id_bytes, T3MatchParams const& params, uint64_t const* d_sorted_meta, // cap entries, uint64 meta @@ -47,6 +48,72 @@ cudaError_t launch_t3_match( uint64_t capacity, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream = nullptr); + sycl::queue& q); + +// Two-step entry point for callers that want to run T3 match in multiple +// bucket-range passes (stage 4d — parallel to the T2 prepare/range split). +// Equivalent to calling launch_t3_match with (0, num_buckets) when the +// range covers the whole bucket space. +// +// launch_t3_match_prepare: computes bucket + fine-bucket offsets into +// d_temp_storage (reusing T2's wrappers, which T3's input is +// bit-identical to) and zeroes d_out_count. Same sizing protocol as +// launch_t3_match (d_temp_storage==nullptr fills *temp_bytes). +// +// launch_t3_match_range: runs the match kernel for bucket range +// [bucket_begin, bucket_end). Multiple calls sharing d_temp_storage / +// d_out_pairings / d_out_count produce a concatenated output via +// atomic append, byte-equivalent to a single full-range call after +// the subsequent T3 sort. 
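+//
+// Illustrative two-pass call sequence (sketch only — d_scratch / d_pairs /
+// d_count are hypothetical caller-side allocations, and num_buckets is the
+// same num_sections × num_match_keys product the single-shot wrapper
+// derives internally):
+//
+//   size_t temp_bytes = 0;
+//   launch_t3_match_prepare(plot_id, params, d_mi, t2_count,
+//                           d_count, /*d_temp_storage=*/nullptr, &temp_bytes, q);
+//   // ... allocate d_scratch of temp_bytes, then run prepare for real ...
+//   launch_t3_match_prepare(plot_id, params, d_mi, t2_count,
+//                           d_count, d_scratch, &temp_bytes, q);
+//   uint32_t const half = num_buckets / 2;
+//   launch_t3_match_range(plot_id, params, d_meta, d_xbits, d_mi, t2_count,
+//                         d_pairs, d_count, capacity, d_scratch, 0, half, q);
+//   launch_t3_match_range(plot_id, params, d_meta, d_xbits, d_mi, t2_count,
+//                         d_pairs, d_count, capacity, d_scratch, half, num_buckets, q);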
+void launch_t3_match_prepare( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + uint64_t* d_out_count, + void* d_temp_storage, + size_t* temp_bytes, + sycl::queue& q); + +void launch_t3_match_range( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +// Sliced-meta variant of launch_t3_match_range (minimal tier). Caller +// must ensure that all bucket ids in [bucket_begin, bucket_end) share +// the same section_l so that l reads always fall within section_l's +// row range and r reads always fall within section_r's row range. The +// caller pre-computes the row starts for each section (from the +// d_offsets table sitting in d_temp_storage) and H2Ds the relevant +// section slices of d_sorted_meta into d_meta_l_slice / d_meta_r_slice. +// d_sorted_xbits and d_sorted_mi are still full-cap on device. +void launch_t3_match_section_pair_range( + uint8_t const* plot_id_bytes, + T3MatchParams const& params, + uint64_t const* d_meta_l_slice, + uint64_t section_l_row_start, + uint64_t const* d_meta_r_slice, + uint64_t section_r_row_start, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t t2_count, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t capacity, + void const* d_temp_storage, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); } // namespace pos2gpu diff --git a/src/gpu/T3Offsets.cuh b/src/gpu/T3Offsets.cuh new file mode 100644 index 0000000..3c6b594 --- /dev/null +++ b/src/gpu/T3Offsets.cuh @@ -0,0 +1,98 @@ +// T3Offsets.cuh — backend-dispatched wrapper for T3's match kernel. +// +// T3 reuses T2's bucket / fine-bucket offset wrappers (the input is the +// same uint32_t* sorted_mi stream and the algorithm is identical), so +// only the match kernel — which differs in the Feistel-encrypted output +// — is declared here. + +#pragma once + +#include "gpu/AesHashGpu.cuh" +#include "gpu/FeistelCipherGpu.cuh" +#include "gpu/T3Kernel.cuh" // T3PairingGpu + +#include + +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +// Fused T3 match. table_id=3, no strength scaling. For each surviving +// (l, r) pair, emits T3PairingGpu{ proof_fragment = feistel_encrypt( +// (xb_l << k) | xb_r) } via an atomic cursor. +// +// bucket_begin / bucket_end select which bucket-id range to process +// (inclusive / exclusive). Passing (0, num_buckets) preserves the +// original full-pass behavior. Smaller ranges let callers split T3 +// match into temporally-separated passes so downstream memory does +// not need to hold the full T3 output at once — parallel to the T2 +// match bucket-range plumbing in T2Offsets.cuh. +// +// Across all passes sharing the same d_out_pairings / d_out_count, +// results append via the atomic counter in the kernel. 
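+//
+// Orientation note (sketch): bucket ids pack (section_l, match_key_r) as
+//   bucket_id = section_l * num_match_keys + match_key_r
+// so a caller that wants one pass per l-section — the shape the sliced
+// variant below requires — would loop roughly like:
+//
+//   for (uint32_t s = 0; s < num_sections; ++s)
+//       launch_t3_match_all_buckets(/*...unchanged args...*/,
+//                                   /*bucket_begin=*/s * num_match_keys,
+//                                   /*bucket_end=*/(s + 1) * num_match_keys, q);
+//
+// where num_sections / num_match_keys are the caller-side
+// 1u << num_section_bits and 1u << num_match_key_bits values.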
+void launch_t3_match_all_buckets( + AesHashKeys keys, + FeistelKey fk, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +// Sliced variant: same algorithm as launch_t3_match_all_buckets but with +// d_sorted_meta accessed via two per-section slices instead of a full +// cap-sized device buffer. The kernel reads: +// meta_l = d_meta_l_slice[l - section_l_row_start] +// meta_r = d_meta_r_slice[r - section_r_row_start] +// Caller MUST ensure that all bucket ids in [bucket_begin, bucket_end) +// share the same section_l (i.e., the range is contained in +// [section_l*num_match_keys, (section_l+1)*num_match_keys)) so that +// every l read falls in section_l's row range and every r read falls in +// the (uniquely-determined) section_r's row range. d_sorted_xbits and +// d_sorted_mi remain full-cap on device (no slicing). Used by minimal +// tier to keep d_t2_meta_sorted parked on host pinned across T3 match; +// drops T3 match peak from ~5200 MB to ~3380 MB at k=28. +void launch_t3_match_section_pair( + AesHashKeys keys, + FeistelKey fk, + uint64_t const* d_meta_l_slice, + uint64_t section_l_row_start, + uint64_t const* d_meta_r_slice, + uint64_t section_r_row_start, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/T3OffsetsSycl.cpp b/src/gpu/T3OffsetsSycl.cpp new file mode 100644 index 0000000..ab764e8 --- /dev/null +++ b/src/gpu/T3OffsetsSycl.cpp @@ -0,0 +1,282 @@ +// T3OffsetsSycl.cpp — SYCL implementation of T3's match kernel. Mirrors +// the CUDA path; FeistelKey (40 B) is captured by value in the parallel_for +// lambda instead of going through CUDA constant memory. AdaptiveCpp's +// SSCP backend handles the capture via the kernel-arg mechanism, which is +// fine at this size — if local-memory spills ever bite, switch to a USM +// upload analogous to the CUDA cudaMemcpyToSymbolAsync path. 
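+//
+// For reference, the USM route mentioned above would look roughly like the
+// sketch below (d_fk is a hypothetical device-side copy, not something this
+// file currently allocates):
+//
+//   FeistelKey* d_fk = sycl::malloc_device<FeistelKey>(1, q);
+//   q.memcpy(d_fk, &fk, sizeof(FeistelKey)).wait();
+//   ... kernels dereference d_fk instead of capturing fk by value ...
+//   sycl::free(d_fk, q);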
+ +#include "gpu/SyclBackend.hpp" +#include "gpu/T3Offsets.cuh" + +#include + +namespace pos2gpu { + +void launch_t3_match_all_buckets( + AesHashKeys keys, + FeistelKey fk, + uint64_t const* d_sorted_meta, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)num_buckets; // only the [begin, end) sub-range is iterated + if (bucket_end <= bucket_begin) return; + uint32_t const num_buckets_in_range = bucket_end - bucket_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + uint64_t blocks_x_u64 = (l_count_max + threads - 1) / threads; + size_t const blocks_x = static_cast(blocks_x_u64); + + auto* d_out_count_ull = + reinterpret_cast(d_out_count); + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<2>{ + sycl::range<2>{ static_cast(num_buckets_in_range), + blocks_x * threads }, + sycl::range<2>{ 1, threads } + }, + [=, keys_copy = keys, fk_copy = fk](sycl::nd_item<2> it) { + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(1); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint32_t bucket_id = bucket_begin + static_cast(it.get_group(0)); + uint32_t section_l = bucket_id / num_match_keys; + uint32_t match_key_r = bucket_id % num_match_keys; + + uint32_t section_r; + { + uint32_t mask = (1u << num_section_bits) - 1u; + uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; + uint32_t rl1 = (rl + 1) & mask; + section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; + } + + uint64_t l_start = d_offsets[section_l * num_match_keys]; + uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; + uint32_t r_bucket = section_r * num_match_keys + match_key_r; + + uint64_t l = l_start + + it.get_group(1) * uint64_t(threads) + + local_id; + if (l >= l_end) return; + + uint64_t meta_l = d_sorted_meta[l]; + uint32_t xb_l = d_sorted_xbits[l]; + + uint32_t target_l = pos2gpu::matching_target_smem( + keys_copy, 3u, match_key_r, meta_l, sT, 0) + & target_mask; + + uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); + uint32_t fine_key = target_l >> fine_shift; + uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; + uint64_t lo = d_fine_offsets[fine_idx]; + uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; + uint64_t hi = fine_hi; + + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t target_mid = d_sorted_mi[mid] & target_mask; + if (target_mid < target_l) lo = mid + 1; + else hi = mid; + } + + uint32_t test_mask = (num_test_bits >= 32) ? 
0xFFFFFFFFu + : ((1u << num_test_bits) - 1u); + + for (uint64_t r = lo; r < fine_hi; ++r) { + uint32_t target_r = d_sorted_mi[r] & target_mask; + if (target_r != target_l) break; + + uint64_t meta_r = d_sorted_meta[r]; + uint32_t xb_r = d_sorted_xbits[r]; + + pos2gpu::Result128 res = pos2gpu::pairing_smem( + keys_copy, meta_l, meta_r, sT, 0); + uint32_t test_result = res.r[3] & test_mask; + if (test_result != 0) continue; + + uint64_t all_x_bits = (uint64_t(xb_l) << k) | uint64_t(xb_r); + uint64_t fragment = pos2gpu::feistel_encrypt(fk_copy, all_x_bits); + + sycl::atomic_ref + out_count_atomic{ *d_out_count_ull }; + unsigned long long out_idx = out_count_atomic.fetch_add(1ULL); + if (out_idx >= out_capacity) return; + + T3PairingGpu p; + p.proof_fragment = fragment; + d_out_pairings[out_idx] = p; + } + }); + }).wait(); +} + +void launch_t3_match_section_pair( + AesHashKeys keys, + FeistelKey fk, + uint64_t const* d_meta_l_slice, + uint64_t section_l_row_start, + uint64_t const* d_meta_r_slice, + uint64_t section_r_row_start, + uint32_t const* d_sorted_xbits, + uint32_t const* d_sorted_mi, + uint64_t const* d_offsets, + uint64_t const* d_fine_offsets, + uint32_t num_match_keys, + uint32_t num_buckets, + int k, + int num_section_bits, + int num_match_target_bits, + int fine_bits, + uint32_t target_mask, + int num_test_bits, + T3PairingGpu* d_out_pairings, + uint64_t* d_out_count, + uint64_t out_capacity, + uint64_t l_count_max, + uint32_t bucket_begin, + uint32_t bucket_end, + sycl::queue& q) +{ + (void)num_buckets; + if (bucket_end <= bucket_begin) return; + uint32_t const num_buckets_in_range = bucket_end - bucket_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + uint64_t blocks_x_u64 = (l_count_max + threads - 1) / threads; + size_t const blocks_x = static_cast(blocks_x_u64); + + auto* d_out_count_ull = + reinterpret_cast(d_out_count); + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<2>{ + sycl::range<2>{ static_cast(num_buckets_in_range), + blocks_x * threads }, + sycl::range<2>{ 1, threads } + }, + [=, keys_copy = keys, fk_copy = fk](sycl::nd_item<2> it) { + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(1); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint32_t bucket_id = bucket_begin + static_cast(it.get_group(0)); + uint32_t section_l = bucket_id / num_match_keys; + uint32_t match_key_r = bucket_id % num_match_keys; + + uint32_t section_r; + { + uint32_t mask = (1u << num_section_bits) - 1u; + uint32_t rl = ((section_l << 1) | (section_l >> (num_section_bits - 1))) & mask; + uint32_t rl1 = (rl + 1) & mask; + section_r = ((rl1 >> 1) | (rl1 << (num_section_bits - 1))) & mask; + } + + uint64_t l_start = d_offsets[section_l * num_match_keys]; + uint64_t l_end = d_offsets[(section_l + 1) * num_match_keys]; + uint32_t r_bucket = section_r * num_match_keys + match_key_r; + + uint64_t l = l_start + + it.get_group(1) * uint64_t(threads) + + local_id; + if (l >= l_end) return; + + // Sliced read: caller guarantees l ∈ [section_l_row_start, ...). 
+ uint64_t meta_l = d_meta_l_slice[l - section_l_row_start]; + uint32_t xb_l = d_sorted_xbits[l]; + + uint32_t target_l = pos2gpu::matching_target_smem( + keys_copy, 3u, match_key_r, meta_l, sT, 0) + & target_mask; + + uint32_t fine_shift = static_cast(num_match_target_bits - fine_bits); + uint32_t fine_key = target_l >> fine_shift; + uint64_t fine_idx = (uint64_t(r_bucket) << fine_bits) | fine_key; + uint64_t lo = d_fine_offsets[fine_idx]; + uint64_t fine_hi = d_fine_offsets[fine_idx + 1]; + uint64_t hi = fine_hi; + + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t target_mid = d_sorted_mi[mid] & target_mask; + if (target_mid < target_l) lo = mid + 1; + else hi = mid; + } + + uint32_t test_mask = (num_test_bits >= 32) ? 0xFFFFFFFFu + : ((1u << num_test_bits) - 1u); + + for (uint64_t r = lo; r < fine_hi; ++r) { + uint32_t target_r = d_sorted_mi[r] & target_mask; + if (target_r != target_l) break; + + // Sliced read: caller guarantees r ∈ [section_r_row_start, ...). + uint64_t meta_r = d_meta_r_slice[r - section_r_row_start]; + uint32_t xb_r = d_sorted_xbits[r]; + + pos2gpu::Result128 res = pos2gpu::pairing_smem( + keys_copy, meta_l, meta_r, sT, 0); + uint32_t test_result = res.r[3] & test_mask; + if (test_result != 0) continue; + + uint64_t all_x_bits = (uint64_t(xb_l) << k) | uint64_t(xb_r); + uint64_t fragment = pos2gpu::feistel_encrypt(fk_copy, all_x_bits); + + sycl::atomic_ref + out_count_atomic{ *d_out_count_ull }; + unsigned long long out_idx = out_count_atomic.fetch_add(1ULL); + if (out_idx >= out_capacity) return; + + T3PairingGpu p; + p.proof_fragment = fragment; + d_out_pairings[out_idx] = p; + } + }); + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/gpu/XsCandidateGpu.hpp b/src/gpu/XsCandidateGpu.hpp new file mode 100644 index 0000000..a42fef3 --- /dev/null +++ b/src/gpu/XsCandidateGpu.hpp @@ -0,0 +1,22 @@ +// XsCandidateGpu.hpp — minimal header carrying just the Xs_Candidate POD. +// +// Split out from XsKernel.cuh so the type can be referenced from non-CUDA +// translation units (notably the SYCL backend implementations), which can't +// pull in the CUDA-laden XsKernel.cuh → AesHashGpu.cuh → AesGpu.cuh chain. +// +// Layout mirrors pos2-chip/src/plot/TableConstructorGeneric.hpp:496 so a +// host-side reinterpret_cast to the pos2-chip type is safe. + +#pragma once + +#include + +namespace pos2gpu { + +struct XsCandidateGpu { + uint32_t match_info; + uint32_t x; +}; +static_assert(sizeof(XsCandidateGpu) == 8, "must match pos2-chip Xs_Candidate layout"); + +} // namespace pos2gpu diff --git a/src/gpu/XsKernel.cpp b/src/gpu/XsKernel.cpp new file mode 100644 index 0000000..162e92b --- /dev/null +++ b/src/gpu/XsKernel.cpp @@ -0,0 +1,185 @@ +// XsKernel.cpp — orchestrates Xs construction on a SYCL queue. +// +// Pipeline: +// 1. launch_xs_gen: writes (g(x⊕xor_const), x) into (keys_a, vals_a). +// 2. launch_sort_pairs_u32_u32: stable radix sort by the bottom k bits. +// 3. launch_xs_pack: fold sorted (keys, vals) into XsCandidateGpu[total]. +// +// All scratch is allocated by the caller; on the first call with +// d_temp_storage == nullptr the function only writes the required +// *temp_bytes and returns without launching anything. 
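+//
+// Typical call sequence (sketch — d_out / d_scratch stand in for the
+// caller's USM allocations and only illustrate the sizing protocol):
+//
+//   size_t temp_bytes = 0;
+//   launch_construct_xs(plot_id, k, testnet, /*d_out=*/nullptr,
+//                       /*d_temp_storage=*/nullptr, &temp_bytes, q);  // size query
+//   void* d_scratch = sycl::malloc_device<uint8_t>(temp_bytes, q);
+//   launch_construct_xs(plot_id, k, testnet, d_out, d_scratch, &temp_bytes, q);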
+ +#include "gpu/AesHashGpu.cuh" +#include "gpu/Sort.cuh" +#include "gpu/XsKernel.cuh" +#include "gpu/XsKernels.cuh" + +#include + +#include +#include +#include +#include +#include + +namespace pos2gpu { + +namespace { + +// Mirrors pos2-chip/src/pos/ProofConstants.hpp:14 +constexpr uint32_t kTestnetGXorConst = 0xA3B1C4D7u; + +// Layout of caller-provided d_temp_storage: +// [0 .. cub_bytes) CUB sort scratch +// [keys_a_off .. keys_a_off + N*4) keys_a (uint32) (*) +// [keys_b_off .. keys_b_off + N*4) keys_b (uint32) +// [vals_a_off .. vals_a_off + N*4) vals_a (uint32) +// [vals_b_off .. vals_b_off + N*4) vals_b (uint32) +// (*) In split mode (split_keys_a != nullptr) the keys_a slot is OMITTED +// from d_temp_storage — keys_a_off is set to SIZE_MAX as a sentinel and +// keys_b_off follows directly after cub_scratch. Total bytes drop by +// one aligned (N*u32) block (~1 GiB at k=28). +struct ScratchLayout { + size_t cub_bytes; + size_t keys_a_off; + size_t keys_b_off; + size_t vals_a_off; + size_t vals_b_off; + size_t total_bytes; +}; + +inline size_t align_up(size_t v, size_t a) { return (v + a - 1) / a * a; } + +ScratchLayout layout_for(uint64_t total, size_t cub_bytes, bool split_keys_a) +{ + ScratchLayout s{}; + s.cub_bytes = cub_bytes; + size_t cur = align_up(s.cub_bytes, 256); + if (split_keys_a) { + s.keys_a_off = ~size_t{0}; // sentinel: keys_a lives externally + } else { + s.keys_a_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); + } + s.keys_b_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); + s.vals_a_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); + s.vals_b_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); + s.total_bytes = cur; + return s; +} + +} // namespace + +void launch_construct_xs( + uint8_t const* plot_id_bytes, int k, bool testnet, + XsCandidateGpu* d_out, void* d_temp_storage, size_t* temp_bytes, + sycl::queue& q, void* split_keys_a) +{ + return launch_construct_xs_profiled(plot_id_bytes, k, testnet, + d_out, d_temp_storage, temp_bytes, + nullptr, nullptr, q, split_keys_a); +} + +void launch_construct_xs_profiled( + uint8_t const* plot_id_bytes, + int k, + bool testnet, + XsCandidateGpu* d_out, + void* d_temp_storage, + size_t* temp_bytes, + cudaEvent_t /*after_gen*/, + cudaEvent_t /*after_sort*/, + sycl::queue& q, + void* split_keys_a) +{ + // NOTE: the cudaEvent_t after_gen / after_sort parameters are kept + // for API compatibility but no longer recorded. xs_bench's per-phase + // timing is therefore zero through this call; use chrono on the host + // around launch_construct_xs to measure end-to-end wall time. A + // sycl::event-based profiling overload is the natural follow-up. + + if (k < 18 || k > 32 || (k & 1) != 0) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!plot_id_bytes || !temp_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + + uint64_t const total = 1ULL << k; + + // Query CUB temp size via the wrapper (sizing mode: null storage). 
+ size_t cub_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, cub_bytes, + nullptr, nullptr, + nullptr, nullptr, + total, /*begin_bit=*/0, /*end_bit=*/k, q); + + bool const split = (split_keys_a != nullptr); + auto sl = layout_for(total, cub_bytes, split); + + if (d_temp_storage == nullptr) { + *temp_bytes = sl.total_bytes; + + return; + } + if (*temp_bytes < sl.total_bytes) throw std::invalid_argument("invalid argument to launch wrapper"); + if (!d_out) throw std::invalid_argument("invalid argument to launch wrapper"); + + auto* base = static_cast(d_temp_storage); + auto* cub_scratch = base; // first cub_bytes + auto* keys_a = split + ? static_cast(split_keys_a) + : reinterpret_cast(base + sl.keys_a_off); + auto* keys_b = reinterpret_cast(base + sl.keys_b_off); + auto* vals_a = reinterpret_cast(base + sl.vals_a_off); + auto* vals_b = reinterpret_cast(base + sl.vals_b_off); + + AesHashKeys keys = make_keys(plot_id_bytes); + uint32_t xor_const = testnet ? kTestnetGXorConst : 0u; + + // Sub-phase wall-time breakdown — useful when GpuPipeline's outer + // "Xs gen+sort" phase dominates total wall (notably on the SYCL/HIP + // backend, where the Xs phase has been observed at ~40% on RDNA2 vs + // ~6% on NVIDIA). Gated on POS2GPU_PHASE_TIMING=1 so the q.wait()s + // don't perturb production runs. + bool const xs_timing = [] { + char const* v = std::getenv("POS2GPU_PHASE_TIMING"); + return v && v[0] == '1'; + }(); + using xs_clock = std::chrono::steady_clock; + auto xs_now = [&] { return xs_clock::now(); }; + auto xs_elapsed_ms = [&](xs_clock::time_point t0) { + return std::chrono::duration(xs_now() - t0).count(); + }; + auto xs_t0 = xs_now(); + if (xs_timing) q.wait(); + + // Phase 1: generate (match_info, x) into keys_a / vals_a + launch_xs_gen(keys, keys_a, vals_a, total, k, xor_const, q); + double t_gen = 0.0; + if (xs_timing) { q.wait(); t_gen = xs_elapsed_ms(xs_t0); xs_t0 = xs_now(); } + + // Phase 2: stable radix sort by (key low k bits) — keys_a → keys_b, + // vals_a → vals_b. (We give up CUB's DoubleBuffer optimisation here, + // costing one extra pass at most; pack reads from the b side.) + launch_sort_pairs_u32_u32( + cub_scratch, cub_bytes, + keys_a, keys_b, + vals_a, vals_b, + total, /*begin_bit=*/0, /*end_bit=*/k, q); + double t_sort = 0.0; + if (xs_timing) { q.wait(); t_sort = xs_elapsed_ms(xs_t0); xs_t0 = xs_now(); } + + // Phase 3: pack the sorted side into AoS XsCandidateGpu in d_out. + launch_xs_pack(keys_b, vals_b, d_out, total, q); + double t_pack = 0.0; + if (xs_timing) { q.wait(); t_pack = xs_elapsed_ms(xs_t0); } + + if (xs_timing) { + double const total_ms = t_gen + t_sort + t_pack; + std::fprintf(stderr, + "[xs-timing] gen=%.1fms(%.0f%%) sort=%.1fms(%.0f%%) pack=%.1fms(%.0f%%) total=%.1fms\n", + t_gen, total_ms > 0.0 ? 100.0 * t_gen / total_ms : 0.0, + t_sort, total_ms > 0.0 ? 100.0 * t_sort / total_ms : 0.0, + t_pack, total_ms > 0.0 ? 100.0 * t_pack / total_ms : 0.0, + total_ms); + } +} + +} // namespace pos2gpu diff --git a/src/gpu/XsKernel.cu b/src/gpu/XsKernel.cu deleted file mode 100644 index 133504e..0000000 --- a/src/gpu/XsKernel.cu +++ /dev/null @@ -1,181 +0,0 @@ -// XsKernel.cu — implementation of launch_construct_xs. -// -// Pipeline: -// 1. Phase 1 kernel writes XsCandidateGpu[x] = { g(x), x } for x in [0, 2^k). -// 2. Pack into (key=match_info, value=x) and call cub::DeviceRadixSort:: -// SortPairs over the bottom k bits. 
CUB's radix sort is stable -// (preserves relative order for equal keys), matching pos2-chip's -// RadixSort which is multi-pass LSD radix. -// 3. Repack sorted (key, value) back into XsCandidateGpu in d_out. -// -// All scratch is allocated by the caller; on first call with d_temp_storage -// == nullptr the function only writes the required *temp_bytes and returns -// without launching anything. - -#include "gpu/AesGpu.cuh" -#include "gpu/AesHashGpu.cuh" -#include "gpu/XsKernel.cuh" - -#include -#include -#include - -namespace pos2gpu { - -namespace { - -// Mirrors pos2-chip/src/pos/ProofConstants.hpp:14 -constexpr uint32_t kTestnetGXorConst = 0xA3B1C4D7u; - -__global__ void gen_kernel( - AesHashKeys keys, - uint32_t* __restrict__ keys_out, // match_info - uint32_t* __restrict__ vals_out, // x - uint64_t total, - int k, - uint32_t xor_const) -{ - __shared__ uint32_t sT[4 * 256]; - load_aes_tables_smem(sT); - __syncthreads(); - - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= total) return; - uint32_t x = static_cast(idx); - uint32_t mixed = x ^ xor_const; - keys_out[idx] = g_x_smem(keys, mixed, k, sT, kAesGRounds); - vals_out[idx] = x; -} - -__global__ void pack_kernel( - uint32_t const* __restrict__ keys_in, - uint32_t const* __restrict__ vals_in, - XsCandidateGpu* __restrict__ out, - uint64_t total) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= total) return; - out[idx] = XsCandidateGpu{ keys_in[idx], vals_in[idx] }; -} - -// Layout of caller-provided d_temp_storage (single arena): -// -// [0 .. keys_in_off) reserved for CUB scratch -// [keys_in_off .. keys_in_off + N*4) keys_in (uint32) -// [keys_out_off .. keys_out_off + N*4) keys_out (uint32) -// [vals_in_off .. vals_in_off + N*4) vals_in (uint32) -// [vals_out_off .. vals_out_off + N*4) vals_out (uint32) -// -// CUB SortPairs alternates ping-pong between in/out; we use the -// `DoubleBuffer` API to let CUB pick which side ends up holding the -// sorted result. 
- -struct ScratchLayout { - size_t cub_bytes; // bytes for CUB's own scratch - size_t keys_a_off; // offset to keys buffer A - size_t keys_b_off; // offset to keys buffer B - size_t vals_a_off; // offset to vals buffer A - size_t vals_b_off; // offset to vals buffer B - size_t total_bytes; -}; - -constexpr size_t align_up(size_t v, size_t a) { return (v + a - 1) / a * a; } - -ScratchLayout layout_for(uint64_t total, size_t cub_bytes) -{ - ScratchLayout s{}; - s.cub_bytes = cub_bytes; - size_t cur = align_up(s.cub_bytes, 256); - s.keys_a_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); - s.keys_b_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); - s.vals_a_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); - s.vals_b_off = cur; cur += sizeof(uint32_t) * total; cur = align_up(cur, 256); - s.total_bytes = cur; - return s; -} - -} // namespace - -cudaError_t launch_construct_xs( - uint8_t const* plot_id_bytes, int k, bool testnet, - XsCandidateGpu* d_out, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream) -{ - return launch_construct_xs_profiled(plot_id_bytes, k, testnet, - d_out, d_temp_storage, temp_bytes, - nullptr, nullptr, stream); -} - -cudaError_t launch_construct_xs_profiled( - uint8_t const* plot_id_bytes, - int k, - bool testnet, - XsCandidateGpu* d_out, - void* d_temp_storage, - size_t* temp_bytes, - cudaEvent_t after_gen, - cudaEvent_t after_sort, - cudaStream_t stream) -{ - if (k < 18 || k > 32 || (k & 1) != 0) return cudaErrorInvalidValue; - if (!plot_id_bytes || !temp_bytes) return cudaErrorInvalidValue; - - uint64_t const total = 1ULL << k; - - // Query CUB temp size once (depends only on N). - cub::DoubleBuffer probe_keys(nullptr, nullptr); - cub::DoubleBuffer probe_vals(nullptr, nullptr); - size_t cub_bytes = 0; - cudaError_t err = cub::DeviceRadixSort::SortPairs( - nullptr, cub_bytes, - probe_keys, probe_vals, - total, /*begin_bit=*/0, /*end_bit=*/k, stream); - if (err != cudaSuccess) return err; - - auto sl = layout_for(total, cub_bytes); - - if (d_temp_storage == nullptr) { - *temp_bytes = sl.total_bytes; - return cudaSuccess; - } - if (*temp_bytes < sl.total_bytes) return cudaErrorInvalidValue; - if (!d_out) return cudaErrorInvalidValue; - - auto* base = static_cast(d_temp_storage); - auto* cub_scratch = base; // first cub_bytes - auto* keys_a = reinterpret_cast(base + sl.keys_a_off); - auto* keys_b = reinterpret_cast(base + sl.keys_b_off); - auto* vals_a = reinterpret_cast(base + sl.vals_a_off); - auto* vals_b = reinterpret_cast(base + sl.vals_b_off); - - AesHashKeys keys = make_keys(plot_id_bytes); - uint32_t xor_const = testnet ? 
kTestnetGXorConst : 0u; - - constexpr int kThreads = 256; - uint64_t blocks_u64 = (total + kThreads - 1) / kThreads; - if (blocks_u64 > UINT_MAX) return cudaErrorInvalidValue; - unsigned blocks = static_cast(blocks_u64); - - // Phase 1: generate (match_info, x) into keys_a / vals_a - gen_kernel<<>>(keys, keys_a, vals_a, total, k, xor_const); - err = cudaGetLastError(); - if (err != cudaSuccess) return err; - if (after_gen) cudaEventRecord(after_gen, stream); - - // Phase 2: stable radix sort by (key low k bits) - cub::DoubleBuffer keys_buf(keys_a, keys_b); - cub::DoubleBuffer vals_buf(vals_a, vals_b); - err = cub::DeviceRadixSort::SortPairs( - cub_scratch, cub_bytes, - keys_buf, vals_buf, - total, /*begin_bit=*/0, /*end_bit=*/k, stream); - if (err != cudaSuccess) return err; - - // Phase 3: pack the side CUB ended up writing into d_out - pack_kernel<<>>( - keys_buf.Current(), vals_buf.Current(), d_out, total); - if (after_sort) cudaEventRecord(after_sort, stream); - return cudaGetLastError(); -} - -} // namespace pos2gpu diff --git a/src/gpu/XsKernel.cuh b/src/gpu/XsKernel.cuh index b43d11c..8ea924e 100644 --- a/src/gpu/XsKernel.cuh +++ b/src/gpu/XsKernel.cuh @@ -9,19 +9,15 @@ #pragma once #include "gpu/AesHashGpu.cuh" +#include "gpu/XsCandidateGpu.hpp" -#include +#include "gpu/CudaHalfShim.hpp" +#include #include #include namespace pos2gpu { -struct XsCandidateGpu { - uint32_t match_info; - uint32_t x; -}; -static_assert(sizeof(XsCandidateGpu) == 8, "must match pos2-chip Xs_Candidate layout"); - // Generate Xs_Candidate[2^k], sorted by match_info (low k bits, stable). // Caller must have called initialize_aes_tables() once before invocation. // @@ -32,22 +28,31 @@ static_assert(sizeof(XsCandidateGpu) == 8, "must match pos2-chip Xs_Candidate la // d_out : device buffer of at least (1ULL << k) XsCandidateGpu // d_temp_storage : device scratch; pass nullptr first to query size // temp_bytes : in/out — when d_temp_storage is null, set to required size -// stream : optional CUDA stream +// split_keys_a : optional device pointer of at least total*sizeof(uint32_t) +// bytes. When non-null, the sort's keys_a slot is placed +// there instead of inside d_temp_storage, and *temp_bytes +// correspondingly shrinks by total*u32 (plus alignment). +// Intended for the pool path, which aliases keys_a into +// d_storage's tail (idle during Xs gen+sort) to drop +// ~1 GiB off the pair_b xs-scratch region at k=28. The +// non-null-ness is the flag in sizing mode (the actual +// pointer is read only when d_temp_storage != nullptr). // // Returns cudaSuccess on launch success. The sort is asynchronous on the // stream — synchronize before reading d_out on the host. -cudaError_t launch_construct_xs( +void launch_construct_xs( uint8_t const* plot_id_bytes, int k, bool testnet, XsCandidateGpu* d_out, void* d_temp_storage, size_t* temp_bytes, - cudaStream_t stream = nullptr); + sycl::queue& q, + void* split_keys_a = nullptr); // Optional callback fired between the gen kernel and the sort, useful for // per-stage cudaEvent timing. Pass nullptr to skip. 
-cudaError_t launch_construct_xs_profiled( +void launch_construct_xs_profiled( uint8_t const* plot_id_bytes, int k, bool testnet, @@ -56,6 +61,7 @@ cudaError_t launch_construct_xs_profiled( size_t* temp_bytes, cudaEvent_t after_gen, // nullable; recorded after gen kernel queued cudaEvent_t after_sort, // nullable; recorded after sort queued - cudaStream_t stream = nullptr); + sycl::queue& q, + void* split_keys_a = nullptr); } // namespace pos2gpu diff --git a/src/gpu/XsKernels.cuh b/src/gpu/XsKernels.cuh new file mode 100644 index 0000000..35ac27f --- /dev/null +++ b/src/gpu/XsKernels.cuh @@ -0,0 +1,66 @@ +// XsKernels.cuh — backend-dispatched wrappers for the two non-sort phases +// of Xs construction. The orchestration (sizing query, sort, fold-into-AoS) +// lives in XsKernel.cpp and chains these via a sycl::queue. +// +// Phase 1: launch_xs_gen — fill (keys_out[x], vals_out[x]) = (g_x(x⊕xor_const), x) +// for x in [0, total). Loads AES T-tables into local memory once +// per workgroup, mirroring the CUDA gen_kernel pattern. +// +// Phase 3: launch_xs_pack — pack sorted (keys_in, vals_in) back into AoS +// XsCandidateGpu[total]. Pure grid-stride; no AES. + +#pragma once + +#include "gpu/AesHashGpu.cuh" +#include "gpu/XsCandidateGpu.hpp" + +#include + +#include "gpu/CudaHalfShim.hpp" +#include + +namespace pos2gpu { + +void launch_xs_gen( + AesHashKeys keys, + uint32_t* keys_out, + uint32_t* vals_out, + uint64_t total, + int k, + uint32_t xor_const, + sycl::queue& q); + +// Position-range variant of launch_xs_gen. Generates Xs candidates for +// positions x ∈ [pos_begin, pos_end) and writes to keys_out[i] / +// vals_out[i] where i = x - pos_begin (relative indexing). keys_out / +// vals_out must be sized for at least (pos_end - pos_begin) elements. +// Used by minimal tier to tile the Xs gen + sort phase below the +// 4 GiB-cap peak. +void launch_xs_gen_range( + AesHashKeys keys, + uint32_t* keys_out, + uint32_t* vals_out, + uint64_t pos_begin, + uint64_t pos_end, + int k, + uint32_t xor_const, + sycl::queue& q); + +void launch_xs_pack( + uint32_t const* keys_in, + uint32_t const* vals_in, + XsCandidateGpu* d_out, + uint64_t total, + sycl::queue& q); + +// Position-range variant of launch_xs_pack. Reads keys_in[i] / vals_in[i] +// for i ∈ [0, count) and writes XsCandidateGpu{keys_in[i], vals_in[i]} +// to d_out[i + dst_begin]. Lets the caller pack incrementally. +void launch_xs_pack_range( + uint32_t const* keys_in, + uint32_t const* vals_in, + XsCandidateGpu* d_out, + uint64_t count, + sycl::queue& q); + +} // namespace pos2gpu diff --git a/src/gpu/XsKernelsSycl.cpp b/src/gpu/XsKernelsSycl.cpp new file mode 100644 index 0000000..9ae3589 --- /dev/null +++ b/src/gpu/XsKernelsSycl.cpp @@ -0,0 +1,136 @@ +// XsKernelsSycl.cpp — SYCL implementation of Xs gen/pack kernels. +// Same shape as the T1/T2/T3 SYCL impls; gen reuses the AES T-table USM +// buffer from SyclBackend.hpp, pack is a pure grid-stride lambda. 
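+//
+// Tiled-use sketch for the _range variants (hypothetical tile loop — the
+// real minimal-tier caller lives in the pipeline, not here). The pack step
+// offsets d_out itself, matching the relative indexing these kernels use:
+//
+//   for (uint64_t begin = 0; begin < total; begin += tile) {
+//       uint64_t const end = std::min(begin + tile, total);
+//       launch_xs_gen_range(keys, d_keys_tile, d_vals_tile,
+//                           begin, end, k, xor_const, q);
+//       // ... sort the (end - begin)-element tile ...
+//       launch_xs_pack_range(d_keys_tile, d_vals_tile,
+//                            d_out + begin, end - begin, q);
+//   }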
+ +#include "gpu/SyclBackend.hpp" +#include "gpu/XsKernels.cuh" + +#include + +namespace pos2gpu { + +void launch_xs_gen( + AesHashKeys keys, + uint32_t* keys_out, + uint32_t* vals_out, + uint64_t total, + int k, + uint32_t xor_const, + sycl::queue& q) +{ + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + size_t const groups = (total + threads - 1) / threads; + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=, keys_copy = keys](sycl::nd_item<1> it) { + // Cooperative load of AES T-tables into local memory. + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(0); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint64_t idx = it.get_global_id(0); + if (idx >= total) return; + uint32_t x = static_cast(idx); + uint32_t mixed = x ^ xor_const; + keys_out[idx] = pos2gpu::g_x_smem(keys_copy, mixed, k, sT); + vals_out[idx] = x; + }); + }).wait(); +} + +void launch_xs_gen_range( + AesHashKeys keys, + uint32_t* keys_out, + uint32_t* vals_out, + uint64_t pos_begin, + uint64_t pos_end, + int k, + uint32_t xor_const, + sycl::queue& q) +{ + if (pos_end <= pos_begin) return; + uint64_t const range_n = pos_end - pos_begin; + + uint32_t* d_aes_tables = sycl_backend::aes_tables_device(q); + + constexpr size_t threads = 256; + size_t const groups = (range_n + threads - 1) / threads; + + q.submit([&](sycl::handler& h) { + sycl::local_accessor sT_local{ + sycl::range<1>{4 * 256}, h}; + + h.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=, keys_copy = keys](sycl::nd_item<1> it) { + uint32_t* sT = &sT_local[0]; + size_t local_id = it.get_local_id(0); + #pragma unroll 1 + for (size_t i = local_id; i < 4 * 256; i += threads) { + sT[i] = d_aes_tables[i]; + } + it.barrier(sycl::access::fence_space::local_space); + + uint64_t local_idx = it.get_global_id(0); + if (local_idx >= range_n) return; + uint32_t x = static_cast(pos_begin + local_idx); + uint32_t mixed = x ^ xor_const; + keys_out[local_idx] = pos2gpu::g_x_smem(keys_copy, mixed, k, sT); + vals_out[local_idx] = x; + }); + }).wait(); +} + +void launch_xs_pack( + uint32_t const* keys_in, + uint32_t const* vals_in, + XsCandidateGpu* d_out, + uint64_t total, + sycl::queue& q) +{ + constexpr size_t threads = 256; + size_t const groups = (total + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint64_t idx = it.get_global_id(0); + if (idx >= total) return; + d_out[idx] = XsCandidateGpu{ keys_in[idx], vals_in[idx] }; + }).wait(); +} + +void launch_xs_pack_range( + uint32_t const* keys_in, + uint32_t const* vals_in, + XsCandidateGpu* d_out, + uint64_t count, + sycl::queue& q) +{ + // Same body as launch_xs_pack — caller passes already-offset pointers + // (keys_in, vals_in, d_out) and the slice count. 
+ if (count == 0) return; + constexpr size_t threads = 256; + size_t const groups = (count + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint64_t idx = it.get_global_id(0); + if (idx >= count) return; + d_out[idx] = XsCandidateGpu{ keys_in[idx], vals_in[idx] }; + }).wait(); +} + +} // namespace pos2gpu diff --git a/src/host/BatchPlotter.cpp b/src/host/BatchPlotter.cpp index ccb3949..4d53434 100644 --- a/src/host/BatchPlotter.cpp +++ b/src/host/BatchPlotter.cpp @@ -1,13 +1,17 @@ // BatchPlotter.cu — implementation of staggered multi-plot pipeline. #include "host/BatchPlotter.hpp" +#include "host/Cancel.hpp" +#include "host/CpuPlotter.hpp" // run_one_plot_cpu — pos2-chip CPU pipeline #include "host/GpuBufferPool.hpp" #include "host/GpuPipeline.hpp" #include "host/PlotFileWriterParallel.hpp" +#include "gpu/DeviceIds.hpp" // kCpuDeviceId for the --cpu device-list mixin // Deliberately no pos2-chip includes here — see PlotFileWriterParallel.cpp. #include +#include #include #include #include @@ -15,11 +19,14 @@ #include #include #include +#include +#include #include #include #include #include #include +#include #include namespace pos2gpu { @@ -100,24 +107,111 @@ struct WorkItem { size_t index = 0; }; -// Bounded SPSC queue of depth 1 plus end-of-stream signal. +// Rough per-plot upper-bound estimate for the disk preflight. The actual +// compressed .plot2 is smaller (FSE over proof-fragment stubs); this +// uncompressed ceiling is deliberately pessimistic so we only WARN when +// the disk is genuinely too small, not for boundary cases. +// +// Formula: 2^k fragments × (proof_fragment_bits) / 8, where +// proof_fragment_bits ≈ k + (k - MINUS_STUB_BITS) + overhead, ≈ 2k bytes*bits. +uint64_t approx_plot_bytes_upper_bound(int k) +{ + if (k <= 0 || k > 32) return 0; + uint64_t const fragments = uint64_t(1) << k; + uint64_t const bits_per = uint64_t(2 * k); // k stub + k-2 xbits, rounded up + return (fragments * bits_per) / 8; +} + +// Check `.plot2` is present at path AND looks like a valid plot file +// (magic bytes "pos2" + nonzero size). Used for --skip-existing so we +// don't silently skip a zero-byte or crash-truncated leftover. +bool looks_like_complete_plot(std::filesystem::path const& path) +{ + std::error_code ec; + auto const sz = std::filesystem::file_size(path, ec); + if (ec || sz < 64) return false; // header alone is >64 B + + std::ifstream in(path, std::ios::binary); + if (!in) return false; + char magic[4]{}; + in.read(magic, 4); + return in.good() && magic[0] == 'p' && magic[1] == 'o' + && magic[2] == 's' && magic[3] == '2'; +} + +// Print a warning if the available free space on each unique output +// directory looks insufficient for the plots targeted there. Purely +// advisory — the atomic .partial write handles actual ENOSPC cleanly. +void preflight_disk_space(std::vector const& entries, + BatchOptions const& opts) +{ + if (entries.empty()) return; + + std::map> per_dir; // dir -> (count, bytes) + for (auto const& e : entries) { + uint64_t const est = approx_plot_bytes_upper_bound(e.k); + auto& slot = per_dir[e.out_dir.empty() ? 
std::string(".") : e.out_dir]; + slot.first += 1; + slot.second += est; + } + + constexpr double GB = 1.0 / (1024.0 * 1024.0 * 1024.0); + for (auto const& [dir, tally] : per_dir) { + std::error_code ec; + std::filesystem::create_directories(dir, ec); // space() needs it to exist + auto const info = std::filesystem::space(dir, ec); + if (ec) { + if (opts.verbose) { + std::fprintf(stderr, + "[batch] preflight: cannot stat free space on %s (%s) — " + "skipping check\n", dir.c_str(), ec.message().c_str()); + } + continue; + } + double const need_gb = tally.second * GB; + double const free_gb = info.available * GB; + if (info.available < tally.second) { + std::fprintf(stderr, + "[batch] WARNING: %s has %.1f GB free but %zu plot(s) may need " + "up to ~%.1f GB (uncompressed upper bound). The batch will " + "still run; .partial writes are atomic so mid-plot ENOSPC is " + "recoverable, but consider freeing space or reducing count.\n", + dir.c_str(), free_gb, tally.first, need_gb); + } else if (opts.verbose) { + std::fprintf(stderr, + "[batch] preflight: %s has %.1f GB free, %zu plot(s) need " + "up to ~%.1f GB\n", + dir.c_str(), free_gb, tally.first, need_gb); + } + } +} + +// Bounded SPSC queue + end-of-stream signal. +// +// Depth = kNumPinnedBuffers - 1 so the producer never overtakes the +// consumer by more than (num_pinned - 1) plots. The pinned slot the +// producer writes is slot (i % kNumPinnedBuffers); with depth-(N-1) +// the consumer is guaranteed to have popped plot (i - N) before the +// producer overwrites its slot. class Channel { public: + explicit Channel(std::size_t capacity) : capacity_(capacity) {} + void push(WorkItem item) { std::unique_lock lock(mu_); - cv_.wait(lock, [&]{ return !item_.has_value() && !closed_; }); + cv_not_full_.wait(lock, [&]{ return q_.size() < capacity_ || closed_; }); if (closed_) return; - item_ = std::move(item); - cv_.notify_all(); + q_.push(std::move(item)); + cv_not_empty_.notify_one(); } - // Returns false when channel is closed AND empty. + // Returns false when the channel is closed AND empty. bool pop(WorkItem& out) { std::unique_lock lock(mu_); - cv_.wait(lock, [&]{ return item_.has_value() || closed_; }); - if (item_.has_value()) { - out = std::move(*item_); - item_.reset(); - cv_.notify_all(); + cv_not_empty_.wait(lock, [&]{ return !q_.empty() || closed_; }); + if (!q_.empty()) { + out = std::move(q_.front()); + q_.pop(); + cv_not_full_.notify_one(); return true; } return false; @@ -125,93 +219,397 @@ class Channel { void close() { std::lock_guard lock(mu_); closed_ = true; - cv_.notify_all(); + cv_not_empty_.notify_all(); + cv_not_full_.notify_all(); } private: std::mutex mu_; - std::condition_variable cv_; - std::optional item_; + std::condition_variable cv_not_empty_, cv_not_full_; + std::queue q_; + std::size_t capacity_; bool closed_ = false; }; } // namespace -BatchResult run_batch(std::vector const& entries, bool verbose) +namespace { + +// Per-worker pipeline. Extracted from run_batch so the multi-device +// fan-out can spawn N of these concurrently — one thread per device, +// each with its own pool / channel / consumer. The outer run_batch +// validates homogeneity and runs the disk-space preflight once; this +// helper assumes both have already been done on `entries`. +// +// device_id sentinels (see src/gpu/DeviceIds.hpp): +// kDefaultGpuId (-1) → keep the default SYCL gpu_selector_v +// (single-device default; zero-config users +// see unchanged behavior). 
+// kCpuDeviceId (-2) → CPU worker via sycl::cpu_selector_v +// (--cpu / --devices cpu; AdaptiveCpp OMP +// backend, much slower than GPU). +// 0..N-1 → explicit GPU index from get_devices(gpu). +// worker_id < 0 → single-device path; currently unused beyond +// documenting intent but reserved for a future per- +// worker log prefix (see fprintf calls below — one +// line per call means ordering is already atomic +// per-line, so interleaving across workers is +// acceptable for v1 without prefix disambiguation). +// shared_idx (default null) lets multiple workers race for the next plot +// out of a single shared `entries` list. When set, every worker calls +// shared_idx->fetch_add(1) and exits when the result >= entries.size() — +// dynamic load balancing, so a fast GPU worker keeps pulling plots while +// a slow CPU worker handles only what it can finish in the same wall. +// When null (single-device path), the worker iterates 0..entries.size()-1 +// in order — original behaviour. +BatchResult run_batch_slice(std::vector const& entries, + BatchOptions const& opts, + int device_id, + int worker_id, + std::atomic* shared_idx = nullptr) { + (void)worker_id; + + // CPU worker: bypass the GPU pool / streaming path entirely. pos2-chip's + // Plotter manages all internal state itself, so each plot is a + // synchronous run_one_plot_cpu() call. Single-threaded internally; + // multi-core utilization comes from passing `cpu` multiple times in + // --devices (e.g. --devices cpu,cpu,cpu,cpu on a 4-core host). + // + // XCHPLOT2_SYCL_CPU_BENCH=1 routes --cpu through the SYCL pipeline on + // AdaptiveCpp's CPU backend instead of pos2-chip — exposed as an env + // var purely for benchmarking the two CPU paths against each other, + // not as a supported plotting mode (pos2-chip is faster + leaner). + bool const sycl_cpu_bench = [] { + char const* v = std::getenv("XCHPLOT2_SYCL_CPU_BENCH"); + return v && v[0] == '1'; + }(); + if (device_id == kCpuDeviceId && !sycl_cpu_bench) { + BatchResult res; + if (entries.empty()) return res; + auto const t_start = std::chrono::steady_clock::now(); + std::size_t local_idx = 0; + while (true) { + std::size_t const i = shared_idx + ? 
shared_idx->fetch_add(1, std::memory_order_relaxed) + : local_idx++; + if (i >= entries.size()) break; + if (opts.skip_existing) { + auto out_path = std::filesystem::path(entries[i].out_dir) + / entries[i].out_name; + if (looks_like_complete_plot(out_path)) { + if (opts.verbose) { + std::fprintf(stderr, + "[batch:cpu] skipping plot %zu: %s (already exists)\n", + i, out_path.string().c_str()); + } + ++res.plots_skipped; + continue; + } + } + try { + run_one_plot_cpu(entries[i], opts); + ++res.plots_written; + if (opts.verbose) { + std::fprintf(stderr, + "[batch:cpu] plot %zu done: %s\n", + i, entries[i].out_name.c_str()); + } + } catch (std::exception const& ex) { + std::fprintf(stderr, + "[batch:cpu] plot %zu FAILED: %s\n", i, ex.what()); + ++res.plots_failed; + if (!opts.continue_on_error) { + res.total_wall_seconds = std::chrono::duration( + std::chrono::steady_clock::now() - t_start).count(); + return res; + } + } + if (cancel_requested()) break; + } + res.total_wall_seconds = std::chrono::duration( + std::chrono::steady_clock::now() - t_start).count(); + return res; + } + + if (device_id >= 0 || device_id == kCpuDeviceId) bind_current_device(device_id); initialize_aes_tables(); + bool const verbose = opts.verbose; + BatchResult res; if (entries.empty()) return res; - // All entries in a batch must share (k, strength, testnet) so one pool - // fits all plots. Mixed-shape batches could be supported by splitting - // into homogeneous sub-batches; not needed in practice. + // Pool shape from the first entry. Homogeneity (all entries share + // k/strength/testnet) was checked by the outer run_batch. int pool_k = entries[0].k; int pool_strength = entries[0].strength; bool pool_testnet = entries[0].testnet; - for (size_t i = 1; i < entries.size(); ++i) { - if (entries[i].k != pool_k - || entries[i].strength != pool_strength - || entries[i].testnet != pool_testnet) - { - throw std::runtime_error( - "run_batch: all entries must share (k, strength, testnet)"); - } - } // Allocate the pool once; destructor frees at function exit. This is // the whole point of the batch path — eliminate the per-plot ~2.4 s // allocator cost (dominated by cudaMallocHost(2 GB)). - GpuBufferPool pool(pool_k, pool_strength, pool_testnet); - if (verbose) { + // + // On insufficient device VRAM (small card), the pool ctor throws + // InsufficientVramError. Fall back to the streaming pipeline per + // plot — slower (no buffer amortisation across plots, no + // producer/consumer overlap between GPU D2H and consumer I/O on + // pinned double-buffered pool slots), but it fits inside the card's + // VRAM and is still overlapped via the Channel between the producer + // thread's streaming call and the consumer thread's FSE compression + // + plot-file write. + std::unique_ptr pool_ptr; + // Streaming-fallback pinned buffers — double-buffered the same way the + // pool does, so producer's D2H of plot N+1 can run concurrently with + // the consumer reading plot N. cudaMallocHost is ~600 ms, so doing it + // once instead of per plot is a significant win on long batches. + uint64_t* stream_pinned[GpuBufferPool::kNumPinnedBuffers] = {}; + size_t stream_pinned_cap = 0; + // Stage 4f: amortised streaming-path pinned-host scratch. Populated + // in the streaming-fallback branch below; nullptr fields when the + // pool path is active (pool_ptr != null). + StreamingPinnedScratch stream_scratch{}; + + // Force-streaming override (matches the one-shot run_gpu_pipeline + // dispatch). 
Useful for testing the streaming path on a high-VRAM + // card and for users who want the smaller peak even when the pool + // would fit. + bool const force_streaming = [] { + char const* v = std::getenv("XCHPLOT2_STREAMING"); + return v && v[0] == '1'; + }(); + + try { + if (force_streaming) { + throw InsufficientVramError("XCHPLOT2_STREAMING=1 forced"); + } + pool_ptr = std::make_unique( + pool_k, pool_strength, pool_testnet); + } catch (InsufficientVramError const& e) { + if (force_streaming) { + std::fprintf(stderr, "[batch] XCHPLOT2_STREAMING=1 — using " + "streaming pipeline per plot\n"); + } else { + std::fprintf(stderr, + "[batch] pool needs %.2f GiB, only %.2f GiB free — using " + "streaming pipeline per plot\n", + e.required_bytes / double(1ULL << 30), + e.free_bytes / double(1ULL << 30)); + } + // Streaming tier dispatch — three tiers, increasing PCIe pressure + // for decreasing peak VRAM: + // plain (~7290 MB at k=28): no parks, single-pass T2 match. + // Fastest, ~400 ms/plot over compact. + // compact (~5200 MB at k=28): all parks + N=2 T2 match staging. + // Targets 6-8 GiB cards. + // minimal (~3700 MB at k=28): compact's parks + N=8 T2 match + // staging. Targets 4 GiB cards at + // the cost of extra PCIe round-trips + // during T2 match. + // Auto-pick takes the largest tier that fits with the margin. + // 128 MB margin above measured CUDA-context + driver overhead + // on headless cards. + // + // opts.streaming_tier (--tier CLI flag) > XCHPLOT2_STREAMING_TIER + // env var > auto. Forced plain/compact below their floor warn but + // proceed (caller's risk); forced minimal below its floor throws + // because there is no smaller tier to fall back to. + { + auto const mem = query_device_memory(); + size_t const plain_peak = streaming_plain_peak_bytes(pool_k); + size_t const compact_peak = streaming_peak_bytes(pool_k); + size_t const minimal_peak = streaming_minimal_peak_bytes(pool_k); + size_t const margin = 128ULL << 20; + auto to_gib = [](size_t b) { return b / double(1ULL << 30); }; + + char const* tier_env = std::getenv("XCHPLOT2_STREAMING_TIER"); + std::string const tier_pref = + !opts.streaming_tier.empty() ? opts.streaming_tier : + (tier_env ? std::string(tier_env) : std::string()); + + enum class Tier { Plain, Compact, Minimal }; + Tier tier; + if (tier_pref == "plain") { + tier = Tier::Plain; + } else if (tier_pref == "compact") { + tier = Tier::Compact; + } else if (tier_pref == "minimal") { + tier = Tier::Minimal; + } else { + // Auto: pick the largest tier that fits with margin. + tier = (mem.free_bytes >= plain_peak + margin) ? Tier::Plain : + (mem.free_bytes >= compact_peak + margin) ? Tier::Compact : + Tier::Minimal; + } + + auto tier_name = [](Tier t) -> char const* { + return t == Tier::Plain ? "plain" + : t == Tier::Compact ? "compact" + : "minimal"; + }; + size_t const required = + tier == Tier::Plain ? plain_peak : + tier == Tier::Compact ? compact_peak : + minimal_peak; + + // Minimal is the open-ended fallback — if even minimal won't + // fit, throw. Forced higher tier below its floor warns and + // proceeds (caller asked). 
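// Worked example of the tier auto-pick and floor checks in this hunk,
// not part of the patch. The numbers are the k=28 anchors quoted above
// (plain ~7290 MB, compact ~5200 MB, minimal ~3700 MB) plus the 128 MB
// margin, and "free" means whatever query_device_memory() reports:
//   12288 MB free: 12288 >= 7290+128 = 7418              -> plain
//    6144 MB free:  6144 <  7418, 6144 >= 5200+128 = 5328 -> compact
//    4096 MB free:  4096 <  5328                          -> minimal
//                   (4096 >= 3700+128 = 3828, so it proceeds; below
//                    3828 MB the minimal floor check throws instead)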
+ if (tier == Tier::Minimal && mem.free_bytes < required + margin) { + InsufficientVramError se( + "[batch] streaming pipeline needs ~" + + std::to_string(to_gib(required + margin)).substr(0, 5) + + " GiB peak for k=" + std::to_string(pool_k) + + " (minimal tier, the smallest available), device reports " + + std::to_string(to_gib(mem.free_bytes)).substr(0, 5) + + " GiB free of " + + std::to_string(to_gib(mem.total_bytes)).substr(0, 5) + + " GiB total. Use a smaller k or a larger GPU " + "(or --cpu for pos2-chip CPU plotting)."); + se.required_bytes = required + margin; + se.free_bytes = mem.free_bytes; + se.total_bytes = mem.total_bytes; + throw se; + } + if (tier != Tier::Minimal && mem.free_bytes < required + margin) { + std::fprintf(stderr, + "[batch] streaming tier: %s forced (%.2f GiB free < %.2f GiB " + "%s floor) — proceeding, may OOM mid-plot\n", + tier_name(tier), + to_gib(mem.free_bytes), + to_gib(required + margin), + tier_name(tier)); + } + + stream_scratch.plain_mode = (tier == Tier::Plain); + if (tier == Tier::Minimal) { + stream_scratch.t2_tile_count = 8; + stream_scratch.gather_tile_count = 4; + } + + std::fprintf(stderr, + "[batch] streaming tier: %s " + "(%.2f GiB free, %.2f GiB peak, %.2f GiB plain floor)\n", + tier_name(tier), + to_gib(mem.free_bytes), + to_gib(required), + to_gib(plain_peak + margin)); + } + // Size the pinned buffers using the same cap formula as the pool. + int const num_section_bits = (pool_k < 28) ? 2 : (pool_k - 26); + int const extra_margin_bits = 8 - ((28 - pool_k) / 2); + uint64_t const per_section = + (1ULL << (pool_k - num_section_bits)) + + (1ULL << (pool_k - extra_margin_bits)); + uint64_t const cap = per_section * (1ULL << num_section_bits); + stream_pinned_cap = size_t(cap); + bool any_fail = false; + for (int s = 0; s < GpuBufferPool::kNumPinnedBuffers; ++s) { + stream_pinned[s] = streaming_alloc_pinned_uint64(stream_pinned_cap); + if (!stream_pinned[s]) { any_fail = true; break; } + } + if (any_fail) { + for (int s = 0; s < GpuBufferPool::kNumPinnedBuffers; ++s) { + if (stream_pinned[s]) streaming_free_pinned_uint64(stream_pinned[s]); + } + throw std::runtime_error( + "[batch] streaming-fallback: pinned D2H buffer allocation failed"); + } + + // Stage 4f (compact tier only): amortise streaming-path + // pinned-host scratch across all plots in the batch. Lifetime + // analysis (see StreamingPinnedScratch doc) lets four shared + // buffers cover all six internal park/staging roles. At k=28: + // h_meta 2080 MB + h_keys_merged 1040 MB + h_t2_xbits 1040 MB + // + h_t3 2080 MB = ~6.24 GB of pinned host, paid ONCE for the + // whole batch. + // + // Plain tier does not park anything, so these pinned-host + // scratch buffers are not needed. 
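// ---------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the pinned-buffer cap
// formula a few lines up, evaluated at k = 28 to show where the ~2.2 GB
// per-pinned-slot figure quoted in GpuBufferPool.hpp comes from. Pure
// host arithmetic; no GPU code or project headers involved.

#include <cstdint>
#include <cstdio>

int main() {
    int const k = 28;
    int const num_section_bits = (k < 28) ? 2 : (k - 26);   // 2 at k=28
    int const extra_margin_bits = 8 - ((28 - k) / 2);       // 8 at k=28
    std::uint64_t const per_section =
        (1ULL << (k - num_section_bits)) + (1ULL << (k - extra_margin_bits));
    std::uint64_t const cap = per_section * (1ULL << num_section_bits);
    // per_section = 2^26 + 2^20 = 68,157,440; cap = 272,629,760 entries;
    // cap * 8 B = 2,181,038,080 bytes ≈ 2.18 GB (≈ 2.03 GiB) per slot.
    std::printf("cap=%llu entries, %llu bytes per pinned uint64 slot\n",
                (unsigned long long)cap,
                (unsigned long long)(cap * sizeof(std::uint64_t)));
}
// ---------------------------------------------------------------------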
+ if (!stream_scratch.plain_mode) { + stream_scratch.h_meta = streaming_alloc_pinned_uint64(stream_pinned_cap); + stream_scratch.h_keys_merged = streaming_alloc_pinned_uint32(stream_pinned_cap); + stream_scratch.h_t2_xbits = streaming_alloc_pinned_uint32(stream_pinned_cap); + stream_scratch.h_t3 = streaming_alloc_pinned_uint64(stream_pinned_cap); + if (!stream_scratch.h_meta || !stream_scratch.h_keys_merged || + !stream_scratch.h_t2_xbits || !stream_scratch.h_t3) + { + if (stream_scratch.h_meta) streaming_free_pinned_uint64(stream_scratch.h_meta); + if (stream_scratch.h_keys_merged) streaming_free_pinned_uint32(stream_scratch.h_keys_merged); + if (stream_scratch.h_t2_xbits) streaming_free_pinned_uint32(stream_scratch.h_t2_xbits); + if (stream_scratch.h_t3) streaming_free_pinned_uint64(stream_scratch.h_t3); + for (int s = 0; s < GpuBufferPool::kNumPinnedBuffers; ++s) { + if (stream_pinned[s]) streaming_free_pinned_uint64(stream_pinned[s]); + } + throw std::runtime_error( + "[batch] streaming-fallback: pinned-host scratch allocation failed"); + } + } + } + if (verbose && pool_ptr) { double gb = 1.0 / (1024.0 * 1024.0 * 1024.0); std::fprintf(stderr, "[batch] pool: storage=%.2f GB pair_a=%.2f GB pair_b=%.2f GB " "sort_scratch=%.2f GB pinned=2x%.2f GB " "(Xs scratch aliased in pair_b)\n", - pool.storage_bytes * gb, - pool.pair_bytes * gb, - pool.pair_bytes * gb, - pool.sort_scratch_bytes * gb, - pool.pinned_bytes * gb); + pool_ptr->storage_bytes * gb, + pool_ptr->pair_a_bytes * gb, + pool_ptr->pair_b_bytes * gb, + pool_ptr->sort_scratch_bytes * gb, + pool_ptr->pinned_bytes * gb); } - Channel chan; + // Depth = kNumPinnedBuffers - 1. See Channel's comment block above. + Channel chan(static_cast(GpuBufferPool::kNumPinnedBuffers - 1)); std::atomic consumer_failed{false}; std::atomic plots_done{0}; std::exception_ptr consumer_err; auto t_start = std::chrono::steady_clock::now(); + std::atomic plots_failed_consumer{0}; + // Consumer: takes finished GpuPipelineResults and writes plot files. + // Under continue_on_error, per-plot exceptions (e.g. ENOSPC for a + // specific plot) are logged and the loop continues rather than + // tearing down the batch. The .partial + rename in + // write_plot_file_parallel guarantees failed writes leave nothing + // behind at the destination. std::thread consumer([&] { try { WorkItem item; while (chan.pop(item)) { - std::filesystem::create_directories(item.entry.out_dir); auto full_path = std::filesystem::path(item.entry.out_dir) / item.entry.out_name; - - std::vector memo_bytes = item.entry.memo; - if (memo_bytes.empty()) memo_bytes.assign(32 + 48 + 32, 0); - - // Fragments are borrowed from the pool's pinned slot; the - // producer is synchronised via the depth-1 channel so that - // slot won't be reused until we're done here. - write_plot_file_parallel( - full_path.string(), - item.result.fragments(), - item.entry.plot_id.data(), - static_cast(item.entry.k), - static_cast(item.entry.strength), - item.entry.testnet ? 
uint8_t{1} : uint8_t{0}, - static_cast(item.entry.plot_index), - static_cast(item.entry.meta_group), - std::span(memo_bytes.data(), memo_bytes.size())); - - ++plots_done; - if (verbose) { - std::fprintf(stderr, "[batch] consumer wrote plot %zu: %s\n", - item.index, full_path.string().c_str()); + try { + std::filesystem::create_directories(item.entry.out_dir); + + std::vector memo_bytes = item.entry.memo; + if (memo_bytes.empty()) memo_bytes.assign(32 + 48 + 32, 0); + + // Fragments are borrowed from the pool's pinned slot; the + // producer is synchronised via the depth-1 channel so that + // slot won't be reused until we're done here. + write_plot_file_parallel( + full_path.string(), + item.result.fragments(), + item.entry.plot_id.data(), + static_cast(item.entry.k), + static_cast(item.entry.strength), + item.entry.testnet ? uint8_t{1} : uint8_t{0}, + static_cast(item.entry.plot_index), + static_cast(item.entry.meta_group), + std::span(memo_bytes.data(), memo_bytes.size())); + + ++plots_done; + if (verbose) { + std::fprintf(stderr, "[batch] consumer wrote plot %zu: %s\n", + item.index, full_path.string().c_str()); + } + } catch (std::exception const& e) { + if (!opts.continue_on_error) throw; + ++plots_failed_consumer; + std::fprintf(stderr, + "[batch] plot %zu FAILED (write %s): %s — continuing\n", + item.index, full_path.string().c_str(), e.what()); } } } catch (...) { @@ -220,11 +618,44 @@ BatchResult run_batch(std::vector const& entries, bool verbose) } }); + size_t producer_failed = 0; + // Producer (this thread): drives the GPU pipeline, hands off to consumer. + // local_count rotates this worker's own pinned-buffer slots (channel + // depth = kNumPinnedBuffers); it must NOT use the global plot index + // when shared_idx is in play, because peer workers also hold slots in + // their own pools. try { - for (size_t i = 0; i < entries.size(); ++i) { + std::size_t local_idx = 0; + std::size_t local_count = 0; + while (true) { if (consumer_failed) break; + std::size_t const i = shared_idx + ? shared_idx->fetch_add(1, std::memory_order_relaxed) + : local_idx++; + if (i >= entries.size()) break; + + if (cancel_requested()) { + std::fprintf(stderr, + "[batch] cancel received — stopping before plot %zu\n", i); + break; + } + + if (opts.skip_existing) { + auto out_path = std::filesystem::path(entries[i].out_dir) + / entries[i].out_name; + if (looks_like_complete_plot(out_path)) { + if (verbose) { + std::fprintf(stderr, + "[batch] skipping plot %zu: %s (already exists)\n", + i, out_path.string().c_str()); + } + ++res.plots_skipped; + continue; + } + } + auto t_plot = std::chrono::steady_clock::now(); GpuPipelineConfig cfg; @@ -237,9 +668,29 @@ BatchResult run_batch(std::vector const& entries, bool verbose) WorkItem item; item.entry = entries[i]; item.index = i; - // Alternate pinned buffer per plot so the current D2H doesn't - // clobber pinned data the consumer is still reading. - item.result = run_gpu_pipeline(cfg, pool, static_cast(i % 2)); + int const slot = static_cast( + local_count % GpuBufferPool::kNumPinnedBuffers); + try { + if (pool_ptr) { + // Pool path: rotate pinned slot per plot. The channel's + // (kNumPinnedBuffers - 1) depth holds the producer back + // before it overtakes the consumer's read of that slot. + item.result = run_gpu_pipeline(cfg, *pool_ptr, slot); + } else { + // Streaming path with externally-owned pinned: same + // rotation + channel-depth invariant. 
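// Worked example of the local_count note above, not part of the patch:
// with kNumPinnedBuffers == 3 and two workers racing on the shared
// queue, one worker can happen to win plots 2, 5, 8, ... — as global
// indices those all map to slot 2 % 3 == 5 % 3 == 8 % 3 == 2, so that
// worker's consecutive plots would keep landing in the same pinned slot
// and defeat the rotation the channel depth relies on. Counting the
// worker's own plots locally (0, 1, 2, 3, ...) restores the intended
// 0 -> 1 -> 2 -> 0 slot rotation regardless of which global indices it
// wins.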
+ item.result = run_gpu_pipeline_streaming( + cfg, stream_pinned[slot], stream_pinned_cap, + stream_scratch); + } + } catch (std::exception const& e) { + if (!opts.continue_on_error) throw; + ++producer_failed; + std::fprintf(stderr, + "[batch] plot %zu FAILED (GPU): %s — continuing\n", + i, e.what()); + continue; + } if (verbose) { auto ms = std::chrono::duration( @@ -254,6 +705,7 @@ BatchResult run_batch(std::vector const& entries, bool verbose) } chan.push(std::move(item)); + ++local_count; } } catch (...) { chan.close(); @@ -266,10 +718,140 @@ BatchResult run_batch(std::vector const& entries, bool verbose) if (consumer_failed && consumer_err) std::rethrow_exception(consumer_err); + for (int s = 0; s < GpuBufferPool::kNumPinnedBuffers; ++s) { + streaming_free_pinned_uint64(stream_pinned[s]); + } + // Stage 4f: free the amortised streaming scratch (no-op if pool path + // was used — all fields stay nullptr in that case). + if (stream_scratch.h_meta) streaming_free_pinned_uint64(stream_scratch.h_meta); + if (stream_scratch.h_keys_merged) streaming_free_pinned_uint32(stream_scratch.h_keys_merged); + if (stream_scratch.h_t2_xbits) streaming_free_pinned_uint32(stream_scratch.h_t2_xbits); + if (stream_scratch.h_t3) streaming_free_pinned_uint64(stream_scratch.h_t3); + res.plots_written = plots_done.load(); + res.plots_failed = producer_failed + plots_failed_consumer.load(); res.total_wall_seconds = std::chrono::duration( std::chrono::steady_clock::now() - t_start).count(); return res; } +} // namespace + +BatchResult run_batch(std::vector const& entries, + BatchOptions const& opts) +{ + if (entries.empty()) return BatchResult{}; + + // Homogeneity check (all entries must share k/strength/testnet) — + // runs once on the full list before any per-worker dispatch so both + // the single- and multi-device paths share the same error surface. + int const pool_k = entries[0].k; + int const pool_strength = entries[0].strength; + bool const pool_testnet = entries[0].testnet; + for (size_t i = 1; i < entries.size(); ++i) { + if (entries[i].k != pool_k + || entries[i].strength != pool_strength + || entries[i].testnet != pool_testnet) + { + throw std::runtime_error( + "run_batch: all entries must share (k, strength, testnet)"); + } + } + + preflight_disk_space(entries, opts); + + // Resolve the target device list: + // use_all_devices → enumerate at runtime, one worker per GPU + // device_ids → use these explicit ids + // (neither) → empty list → single-device default selector + // include_cpu → orthogonal: also append kCpuDeviceId so the + // CPU runs as one more worker. Mixes with the + // above (--cpu alone → CPU only; --cpu --devices + // all → all GPUs + CPU; etc.). + std::vector device_ids; + if (opts.use_all_devices) { + int const n = gpu_device_count(); + if (n <= 0) { + std::fprintf(stderr, + "[batch] --devices all: runtime enumerated 0 GPUs — " + "falling back to the default SYCL selector\n"); + } else { + device_ids.reserve(static_cast(n)); + for (int i = 0; i < n; ++i) device_ids.push_back(i); + } + } else if (!opts.device_ids.empty()) { + device_ids = opts.device_ids; + } + if (opts.include_cpu && + std::find(device_ids.begin(), device_ids.end(), kCpuDeviceId) + == device_ids.end()) { + device_ids.push_back(kCpuDeviceId); + } + + auto const t_start = std::chrono::steady_clock::now(); + + // Fast path: zero-config default or one explicit id. 
Runs on the + // caller thread — identical control flow to pre-multi-GPU except + // for the optional thread-local device bind at the top of the + // slice. + if (device_ids.size() <= 1) { + int const dev = device_ids.empty() ? -1 : device_ids[0]; + BatchResult r = run_batch_slice(entries, opts, dev, -1); + r.total_wall_seconds = std::chrono::duration( + std::chrono::steady_clock::now() - t_start).count(); + return r; + } + + // Multi-device: workers race to pull plots from a single shared + // queue (atomic counter into `entries`) so a fast GPU keeps pulling + // work while a slow CPU only handles what it can finish in the same + // wall. Each worker still constructs its own GpuBufferPool / + // producer-consumer channel / writer thread on its target device — + // zero cross-worker shared state beyond `next_idx`, stderr, and + // the filesystem. + size_t const N = device_ids.size(); + std::fprintf(stderr, + "[batch] multi-device: %zu plots across %zu workers (work-queue) — devices:", + entries.size(), N); + for (size_t i = 0; i < N; ++i) { + std::fprintf(stderr, " %d", device_ids[i]); + } + std::fprintf(stderr, "\n"); + + std::atomic next_idx{0}; + std::vector per_worker(N); + std::vector per_worker_exc(N); + std::vector workers; + workers.reserve(N); + for (size_t i = 0; i < N; ++i) { + workers.emplace_back([&, i]() { + try { + per_worker[i] = run_batch_slice( + entries, opts, device_ids[i], + static_cast(i), &next_idx); + } catch (...) { + per_worker_exc[i] = std::current_exception(); + } + }); + } + for (auto& t : workers) t.join(); + + // Propagate the first worker exception after every worker has + // joined — prevents a fast failure from leaving peer workers still + // running and printing to a half-torn-down pipeline. + for (auto& ep : per_worker_exc) { + if (ep) std::rethrow_exception(ep); + } + + BatchResult agg; + for (auto const& r : per_worker) { + agg.plots_written += r.plots_written; + agg.plots_skipped += r.plots_skipped; + agg.plots_failed += r.plots_failed; + } + agg.total_wall_seconds = std::chrono::duration( + std::chrono::steady_clock::now() - t_start).count(); + return agg; +} + } // namespace pos2gpu diff --git a/src/host/BatchPlotter.hpp b/src/host/BatchPlotter.hpp index 2c1423e..e9b7c37 100644 --- a/src/host/BatchPlotter.hpp +++ b/src/host/BatchPlotter.hpp @@ -32,15 +32,75 @@ struct BatchEntry { struct BatchResult { size_t plots_written = 0; + size_t plots_skipped = 0; // present + skipped via BatchOptions::skip_existing + size_t plots_failed = 0; // raised an exception under BatchOptions::continue_on_error double total_wall_seconds = 0.0; }; +// Options controlling batch behavior. +// verbose — per-plot progress on stderr +// skip_existing — if an output .plot2 already exists (and passes a +// lightweight magic/size check), skip the plot +// instead of overwriting it +// continue_on_error — catch per-plot exceptions and log rather than +// aborting the batch; plots_failed in the result +// counts how many skipped this way +// device_ids — explicit list of GPU device ids to use. When empty +// and use_all_devices is false, run on a single +// device picked by the default SYCL gpu_selector_v +// (zero-configuration, pre-multi-GPU behavior). +// With multiple ids, the batch is partitioned +// across workers — one thread per device, each +// with its own GpuBufferPool and producer/consumer +// channel. Plots are assigned round-robin +// (entry i → worker i % N). +// use_all_devices — enumerate all SYCL GPU devices at runtime and +// use them. Overrides device_ids. 
Useful when the +// caller doesn't know the host's device count up +// front (e.g. `--devices all` on the CLI). +// include_cpu — append the CPU as a worker device alongside any +// GPUs already selected. Set by `--cpu` (orthogonal +// to --devices) or by passing `cpu` as a token in +// --devices. CPU is encoded as kCpuDeviceId (-2) in +// device_ids — see src/gpu/DeviceIds.hpp. Plotting +// on CPU is 1-2 orders of magnitude slower than on +// GPU; this is meant for headless CI / GPU-less +// hosts / heterogeneous device-list mixing. +// streaming_tier — optional manual override for the streaming +// pipeline tier (when the GPU pool doesn't fit). +// Accepted values: "plain" (~7.24 GB floor at k=28, +// ~10-15% faster), "compact" (~5.33 GB floor, fits +// on tight 8 GB cards). Empty string = auto (the +// pre-existing behavior: pick plain if it fits, +// else compact). Equivalent to XCHPLOT2_STREAMING_TIER +// env var but settable via --tier on the CLI; the +// struct field takes precedence over the env var. +struct BatchOptions { + bool verbose = false; + bool skip_existing = false; + bool continue_on_error = false; + std::vector device_ids; + bool use_all_devices = false; + bool include_cpu = false; + std::string streaming_tier; +}; + // Parse a manifest file in the format described in tools/xchplot2/main.cpp // (tab-separated, one plot per line). Throws std::runtime_error on bad input. std::vector parse_manifest(std::string const& path); // Run the staggered pipeline. Producer/consumer share a queue of depth 1. // The first plot pays the full GPU+FSE cost; subsequent plots overlap. -BatchResult run_batch(std::vector const& entries, bool verbose = false); +BatchResult run_batch(std::vector const& entries, + BatchOptions const& opts); + +// Legacy bool-verbose shim kept for source-compat with older callsites. +inline BatchResult run_batch(std::vector const& entries, + bool verbose = false) +{ + BatchOptions opts; + opts.verbose = verbose; + return run_batch(entries, opts); +} } // namespace pos2gpu diff --git a/src/host/Cancel.cpp b/src/host/Cancel.cpp new file mode 100644 index 0000000..7ba7fd6 --- /dev/null +++ b/src/host/Cancel.cpp @@ -0,0 +1,68 @@ +// Cancel.cpp — implementation of the SIGINT/SIGTERM cancel flag. + +#include "host/Cancel.hpp" + +#include + +#if defined(__unix__) || defined(__APPLE__) +# include // write(2) +#endif + +namespace pos2gpu { + +namespace { + +// sig_atomic_t is the one type C/C++ guarantee is safe to read/write from +// a signal handler without synchronization concerns. The count lets us +// turn the second same-signal receipt into a hard kill, so a user whose +// cooperative shutdown is stuck can still escape with a second Ctrl-C. +volatile std::sig_atomic_t g_cancel_count = 0; + +void write_stderr_safe(char const* msg, std::size_t len) noexcept +{ +#if defined(__unix__) || defined(__APPLE__) + // write(2) is async-signal-safe; std::fprintf is not. + ssize_t const rc = ::write(2, msg, len); + (void)rc; // nothing useful to do if stderr is gone +#else + (void)msg; + (void)len; +#endif +} + +extern "C" void cancel_handler(int sig) noexcept +{ + // On the second receipt, restore the default disposition and re-raise + // so the process dies immediately. Prevents a hung plotter from + // needing kill -9 when the user insists. + if (g_cancel_count >= 1) { + std::signal(sig, SIG_DFL); + std::raise(sig); + return; + } + g_cancel_count = 1; + static char const msg[] = + "\n[xchplot2] cancel requested — finishing current plot then " + "stopping. 
Press Ctrl-C again to abort immediately.\n"; + write_stderr_safe(msg, sizeof(msg) - 1); +} + +} // namespace + +void install_cancel_signal_handlers() +{ + std::signal(SIGINT, cancel_handler); + std::signal(SIGTERM, cancel_handler); +} + +bool cancel_requested() noexcept +{ + return g_cancel_count > 0; +} + +void reset_cancel_for_tests() noexcept +{ + g_cancel_count = 0; +} + +} // namespace pos2gpu diff --git a/src/host/Cancel.hpp b/src/host/Cancel.hpp new file mode 100644 index 0000000..cc4138e --- /dev/null +++ b/src/host/Cancel.hpp @@ -0,0 +1,26 @@ +// Cancel.hpp — SIGINT/SIGTERM handling for long-running batches. +// +// install_cancel_signal_handlers() installs handlers that set an +// async-signal-safe flag on first receipt and restore the default +// disposition on second receipt (so double-Ctrl-C kills hard). +// +// cancel_requested() is cheap enough to call from tight loops. + +#pragma once + +namespace pos2gpu { + +// Install SIGINT + SIGTERM handlers. Idempotent — safe to call more than +// once. First signal sets the cancel flag and prints a one-line notice +// via write(2) (async-signal-safe). Second signal of the same type +// re-raises with the default disposition, terminating the process. +void install_cancel_signal_handlers(); + +// True if a cancelling signal has been received since program start +// (or since reset_cancel_for_tests()). +bool cancel_requested() noexcept; + +// Testing hook — clear the flag. Not intended for production code. +void reset_cancel_for_tests() noexcept; + +} // namespace pos2gpu diff --git a/src/host/CpuPlotter.cpp b/src/host/CpuPlotter.cpp new file mode 100644 index 0000000..1e83e09 --- /dev/null +++ b/src/host/CpuPlotter.cpp @@ -0,0 +1,72 @@ +// CpuPlotter.cpp — wraps pos2-chip's Plotter + PlotFile::writeData. +// +// Isolated to one TU because pos2-chip's Plotter.hpp pulls in the full +// table-construction template stack (Table1/2/3Constructor + RadixSort +// + ChunkCompressor + ...). Including that header anywhere else in the +// build would balloon compile times for no benefit — only this TU +// actually invokes Plotter::run(). + +#include "host/CpuPlotter.hpp" +#include "host/BatchPlotter.hpp" // for BatchEntry / BatchOptions + +// pos2-chip headers — header-only, no separate compilation needed. +// pos2_chip_headers (PUBLIC dep of pos2_gpu_host) provides the +// include path + fse link. +#include "plot/Plotter.hpp" +#include "plot/PlotFile.hpp" +#include "pos/ProofParams.hpp" + +#include +#include +#include +#include +#include +#include + +namespace pos2gpu { + +void run_one_plot_cpu(BatchEntry const& entry, BatchOptions const& opts) +{ + // Build pos2-chip's ProofParams from BatchEntry's existing fields. + // ProofParams is in the global namespace (pos2-chip doesn't wrap + // its public types in a namespace). + ::ProofParams params(entry.plot_id.data(), + static_cast(entry.k), + static_cast(entry.strength), + static_cast(entry.testnet ? 1 : 0)); + + ::Plotter::Options pl_opts; + pl_opts.verbose = opts.verbose; + + ::Plotter plotter(params); + ::PlotData plot = plotter.run(pl_opts); + + // pos2-chip's PlotFile::writeData accepts the memo as a span and + // writes a 1-byte length prefix on disk, so any size in [0, 255] + // is valid. 
keygen-rs emits two layouts: + // - pool-PH mode: 32-byte pool_ph + 48-byte farmer_pk + 32-byte + // master_sk = 112 bytes + // - pool-PK mode: 48-byte pool_pk + 48-byte farmer_pk + 32-byte + // master_sk = 128 bytes + // BatchEntry.memo already holds the bytes in the on-disk layout, so + // pass them through as a span. The previous strict 112-byte check + // rejected pool-PK plots produced via `xchplot2 plot -p ...`. + if (entry.memo.size() > 255) { + throw std::runtime_error( + "CpuPlotter: memo size " + std::to_string(entry.memo.size()) + + " exceeds the 255-byte on-disk limit"); + } + + std::filesystem::path const out_path = + std::filesystem::path(entry.out_dir) / entry.out_name; + + ::PlotFile::writeData(out_path.string(), + plot, + params, + static_cast(entry.plot_index), + static_cast(entry.meta_group), + std::span(entry.memo.data(), + entry.memo.size())); +} + +} // namespace pos2gpu diff --git a/src/host/CpuPlotter.hpp b/src/host/CpuPlotter.hpp new file mode 100644 index 0000000..796034a --- /dev/null +++ b/src/host/CpuPlotter.hpp @@ -0,0 +1,28 @@ +// CpuPlotter.hpp — single-plot CPU pipeline using pos2-chip's Plotter +// directly (no SYCL / no GPU code path involved). +// +// Format-compatible with the GPU output: same plot_id derivation, same +// .plot2 file layout, byte-identical proofs. pos2-chip is the upstream +// PoS2 reference implementation, already in our build tree via +// FetchContent (third_party/pos2-chip), so we link its CPU plotter +// directly rather than routing SYCL kernels through AdaptiveCpp's +// OpenMP backend. +// +// Single-threaded internally (the Plotter constructs T1/T2/T3 in +// sequence). Multi-core utilization comes from BatchPlotter spawning +// one of these per `cpu` token in --devices, e.g. `--devices cpu,cpu` +// runs two concurrent plots on two cores. +// +// Throws std::runtime_error on plotting failure (caller decides +// whether to continue under continue_on_error). + +#pragma once + +namespace pos2gpu { + +struct BatchEntry; +struct BatchOptions; + +void run_one_plot_cpu(BatchEntry const& entry, BatchOptions const& opts); + +} // namespace pos2gpu diff --git a/src/host/GpuBufferPool.cpp b/src/host/GpuBufferPool.cpp new file mode 100644 index 0000000..d35fd53 --- /dev/null +++ b/src/host/GpuBufferPool.cpp @@ -0,0 +1,478 @@ +// GpuBufferPool.cu — queries per-phase scratch sizes once and allocates +// worst-case-sized persistent buffers. Slice 13 migrated the device and +// pinned-host allocations from the cudaMalloc / cudaMallocHost family to +// sycl::malloc_device / sycl::malloc_host on the shared SYCL queue; +// cudaMemGetInfo is left as-is because it's a context-level query that +// works regardless of which runtime is doing the allocations (SYCL + +// CUDA host code share the same primary CUDA context). + +#include "host/GpuBufferPool.hpp" +#include "gpu/Sort.cuh" +#include "gpu/SyclBackend.hpp" +#include "host/PoolSizing.hpp" + +#include "gpu/XsKernel.cuh" +#include "gpu/T1Kernel.cuh" +#include "gpu/T2Kernel.cuh" +#include "gpu/T3Kernel.cuh" + +#include + +#include +#include +#include +#include +#include +#include + +namespace pos2gpu { + +namespace { + + +// Allocate `bytes` of device memory on `q` and check for null. The cap-and- +// throw helpers in GpuPipeline.cu are streaming-pipeline specific; the pool +// just allocates worst-case sizes once at construction so a one-line wrap +// suffices. +// Format a byte count as " bytes ( MB)" for diagnostics. 
The +// raw byte count surfaces sub-MiB requests that would otherwise round +// to "0 MB"; the MB form keeps human readability for the > 1 MiB case. +inline std::string fmt_alloc_bytes(size_t bytes) +{ + char buf[64]; + std::snprintf(buf, sizeof(buf), "%zu bytes (%.2f MB)", + bytes, double(bytes) / (1024.0 * 1024.0)); + return std::string(buf); +} + +// AdaptiveCpp's CUDA allocator throws sycl::exception on cudaMalloc +// failure (e.g. "cuda_allocator: cudaMalloc() failed (error code = +// CUDA:2)" for cudaErrorMemoryAllocation). Older / non-CUDA backends +// may instead return nullptr. Cover both paths with one diagnostic +// shape so callers see "sycl::malloc_device(d_pair_a, 4690 MB) failed: +// " regardless of which branch fired. This also catches +// the throw synchronously so the async error handler doesn't log the +// same CUDA error a second time after caller cleanup. +inline void* sycl_alloc_device_or_throw(size_t bytes, sycl::queue& q, + char const* what) +{ + void* p = nullptr; + try { + p = sycl::malloc_device(bytes, q); + } catch (sycl::exception const& e) { + throw std::runtime_error( + std::string("sycl::malloc_device(") + what + ", " + + fmt_alloc_bytes(bytes) + ") failed: " + e.what() + + ". Likely transient OOM — check `nvidia-smi` for other GPU " + "consumers, or set POS2GPU_MAX_VRAM_MB lower if VRAM is " + "shared with display/compositor."); + } + if (!p) { + throw std::runtime_error( + std::string("sycl::malloc_device(") + what + ", " + + fmt_alloc_bytes(bytes) + ") returned null (out of device " + "memory). Likely transient OOM — check `nvidia-smi` for " + "other GPU consumers, or set POS2GPU_MAX_VRAM_MB lower if " + "VRAM is shared with display/compositor."); + } + return p; +} + +inline void* sycl_alloc_host_or_throw(size_t bytes, sycl::queue& q, + char const* what) +{ + void* p = nullptr; + try { + p = sycl::malloc_host(bytes, q); + } catch (sycl::exception const& e) { + throw std::runtime_error( + std::string("sycl::malloc_host(") + what + ", " + + fmt_alloc_bytes(bytes) + ") failed: " + e.what()); + } + if (!p) { + throw std::runtime_error( + std::string("sycl::malloc_host(") + what + ", " + + fmt_alloc_bytes(bytes) + ") returned null (out of host pinned memory)"); + } + return p; +} + +} // namespace + +GpuBufferPool::GpuBufferPool(int k_, int strength_, bool testnet_) + : k(k_), strength(strength_), testnet(testnet_) +{ + sycl::queue& q = sycl_backend::queue(); + + int const num_section_bits = (k < 28) ? 2 : (k - 26); + total_xs = 1ULL << k; + cap = max_pairs_per_section(k, num_section_bits) * (1ULL << num_section_bits); + + // d_storage must hold EITHER total_xs XsCandidateGpu (8 B each) OR + // THREE cap-sized uint32 key/val arrays during sort. Only three, not + // four: the sort API signature takes a (keys_in, keys_out, vals_in, + // vals_out) quad, but pool-path callers always pass the SoA match-info + // stream (d_t1_mi / d_t2_mi, living in d_pair_a) as keys_in, so the + // keys_in slot inside d_storage was never read. Dropping it saves + // cap·4 B (~1.09 GiB at k=28) — enough to close the 0.71 GiB pool + // shortfall on 12 GiB cards. + storage_bytes = std::max( + static_cast(total_xs) * sizeof(XsCandidateGpu), + static_cast(cap) * 3 * sizeof(uint32_t)); + + // d_pair_a holds the *match output* of the current phase: T1 SoA + // (meta·8 B + mi·4 B = 12 B), T2 SoA (meta·8 B + mi·4 B + xbits·4 B = + // 16 B), then T3 (T3PairingGpu, 8 B). Worst case is T2 at 16 B/entry. + // It does NOT alias the Xs construction scratch — that's d_pair_b. 
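// Worked numbers for the d_storage sizing above, not part of the patch
// (k = 28, cap = 272,629,760 from the max_pairs_per_section formula):
//   Xs phase:    total_xs * 8 B = 2^28 * 8       = 2,147,483,648 B (~2.15 GB)
//   sort phases: cap * 3 * 4 B  = 272,629,760*12 = 3,271,557,120 B (~3.27 GB)
//   storage_bytes = max of the two ≈ 3.27 GB — the "~3.3 GB" quoted in
//   GpuBufferPool.hpp. The dropped keys_in slot would have added another
//   cap * 4 B = 1,090,519,040 B (~1.09 GB, ~1.02 GiB).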
+ pair_a_bytes = std::max({ + static_cast(cap) * sizeof(T1PairingGpu), + static_cast(cap) * sizeof(T2PairingGpu), + static_cast(cap) * sizeof(T3PairingGpu), + static_cast(cap) * sizeof(uint64_t), + }); + + // d_pair_b holds the *sort output* of the current phase (sorted T1 + // meta, sorted T2 meta+xbits, T3 frags) AND the Xs construction + // scratch. Sized to the max of those. + // + // Split-keys_a optimisation: the pool places the Xs sort's keys_a + // slot (total_xs·u32 = 1 GiB at k=28) in d_storage's tail — idle + // during Xs gen+sort, and the final pack phase only writes + // d_storage[0..total_xs·8), leaving the tail region undisturbed. + // This drops xs_temp_bytes from ~4.36 GB (4·N·u32 + cub) to + // ~3.22 GB (3·N·u32 + cub). At k=28 pair_b is then bounded by + // cap·12 (sorted T2 meta+xbits = 3.27 GB) rather than xs scratch, + // saving ~1.09 GB off the pool's peak VRAM requirement vs the + // pre-split layout. + uint8_t dummy_plot_id[32] = {}; + // Non-null sentinel tells launch_construct_xs to report the + // split-layout size. The sentinel value is read only in sizing + // mode (d_temp_storage == nullptr), where only its non-null-ness + // matters. + void* const xs_split_sentinel = reinterpret_cast(uintptr_t{1}); + launch_construct_xs(dummy_plot_id, k, testnet, + nullptr, nullptr, &xs_temp_bytes, q, + xs_split_sentinel); + pair_b_bytes = std::max({ + static_cast(cap) * sizeof(uint64_t), // sorted T1 meta + static_cast(cap) * (sizeof(uint64_t) + sizeof(uint32_t)), // sorted T2 meta+xbits + static_cast(cap) * sizeof(uint64_t), // T3 frags out + xs_temp_bytes, // Xs aliased scratch (3·N·u32 + cub) + }); + + // Query CUB sort scratch sizes (largest across T1/T2/T3 sorts). + size_t s_pairs = 0; + launch_sort_pairs_u32_u32( + nullptr, s_pairs, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + cap, 0, k, q); + size_t s_keys = 0; + launch_sort_keys_u64( + nullptr, s_keys, + static_cast(nullptr), static_cast(nullptr), + cap, 0, 2 * k, q); + sort_scratch_bytes = std::max(s_pairs, s_keys); + + pinned_bytes = cap * sizeof(uint64_t); + + // Check VRAM before attempting allocation so we can give a useful + // diagnostic instead of a generic allocation failure. The margin covers + // GPU driver/context state, sort scratch, AES T-tables, and other small + // runtime allocations. + // + // SYCL has no portable free-memory query, so slice 17c approximates + // free_b == total_b. The actual sycl::malloc_device call will throw if + // VRAM is exhausted; the diagnostic message is just less precise about + // how much of the total is already consumed by other processes. + { + size_t const required_device = + storage_bytes + pair_a_bytes + pair_b_bytes + sort_scratch_bytes + sizeof(uint64_t); + // Margin covers per-context driver state + AES T-tables + the + // tiny (sizeof(uint64_t)) d_counter alloc that's not counted in + // sort_scratch. Originally 512 MB (slice 17c); trimmed to 256 MB + // after measuring actual runtime overhead on gfx1031/ROCm 6.2 + // and sm_89/CUDA 13: both land under 150 MB of non-pool device + // allocations, so a 256 MB margin leaves >100 MB headroom while + // letting cards on the threshold (e.g. 12 GiB reporting ~11.8 + // GiB free at ctor time) now succeed into the pool path. 
+ size_t const margin = 256ULL * 1024 * 1024; // 256 MB + size_t const total_b = + q.get_device().get_info(); + size_t const free_b = total_b; // approximation — see comment above + if (free_b < required_device + margin) { + auto to_gib = [](size_t b) { return b / double(1ULL << 30); }; + InsufficientVramError e( + "GpuBufferPool: insufficient device VRAM for k=" + + std::to_string(k) + " strength=" + std::to_string(strength) + + "; need ~" + std::to_string(to_gib(required_device + margin)).substr(0, 5) + + " GiB (pool " + std::to_string(to_gib(required_device)).substr(0, 5) + + " GiB + ~0.25 GiB runtime), only " + + std::to_string(to_gib(free_b)).substr(0, 5) + + " GiB free of " + std::to_string(to_gib(total_b)).substr(0, 5) + + " GiB total. Use a smaller k or a GPU with more VRAM."); + e.required_bytes = required_device + margin; + e.free_bytes = free_b; + e.total_bytes = total_b; + throw e; + } + } + + if (getenv("POS2GPU_POOL_DEBUG")) { + size_t const total_b = + q.get_device().get_info(); + std::fprintf(stderr, + "[pool] k=%d strength=%d cap=%llu total_xs=%llu " + "total=%.2fGB (free unavailable in SYCL build)\n", + k, strength, (unsigned long long)cap, (unsigned long long)total_xs, + total_b/1e9); + std::fprintf(stderr, + "[pool] sizes: storage=%.2fGB pair_a=%.2fGB pair_b=%.2fGB " + "xs_temp(alias→pair_b)=%.2fGB sort_scratch=%.2fGB pinned=%.2fGB\n", + storage_bytes/1e9, pair_a_bytes/1e9, pair_b_bytes/1e9, + xs_temp_bytes/1e9, sort_scratch_bytes/1e9, pinned_bytes/1e9); + } + + // Wrap allocations so a mid-sequence failure (e.g. d_pair_b OOM after + // d_storage + d_pair_a have already succeeded) frees the pre-allocated + // buffers instead of leaking ~10 GB of device VRAM and ~7 GB of host + // pinned memory per failed pool ctor across a batch retry loop. + auto cleanup_partial = [&]{ + if (d_storage) { sycl::free(d_storage, q); d_storage = nullptr; } + if (d_pair_a) { sycl::free(d_pair_a, q); d_pair_a = nullptr; } + if (d_pair_b) { sycl::free(d_pair_b, q); d_pair_b = nullptr; } + if (d_sort_scratch) { sycl::free(d_sort_scratch, q); d_sort_scratch = nullptr; } + if (d_counter) { sycl::free(d_counter, q); d_counter = nullptr; } + for (int i = 0; i < kNumPinnedBuffers; ++i) { + if (h_pinned_t3[i]) { sycl::free(h_pinned_t3[i], q); h_pinned_t3[i] = nullptr; } + } + }; + try { + d_storage = sycl_alloc_device_or_throw(storage_bytes, q, "d_storage"); + // d_pair_a is allocated lazily in ensure_pair_a(), called by + // run_gpu_pipeline's pool path right after submitting Xs gen + // — the malloc_device then overlaps with Xs GPU execution. + // Saves ~400-500 ms on first-plot wall vs eager alloc; batch + // plots 2+ are unaffected (fast-path pointer lookup). + d_pair_b = sycl_alloc_device_or_throw(pair_b_bytes, q, "d_pair_b"); + d_sort_scratch = sycl_alloc_device_or_throw(sort_scratch_bytes, q, "d_sort_scratch"); + d_counter = static_cast( + sycl_alloc_device_or_throw(sizeof(uint64_t), q, "d_counter")); + // h_pinned_t3[] is allocated lazily in ensure_pinned(); see + // the header comment for why. Single-plot runs only ever + // touch slot 0 so the other two 2.2 GB malloc_host calls + // aren't paid at all. + } catch (...) 
{ + cleanup_partial(); + throw; + } +} + +void* GpuBufferPool::ensure_pair_a() +{ + if (d_pair_a) return d_pair_a; + std::lock_guard lk(pair_a_mu_); + if (d_pair_a) return d_pair_a; + sycl::queue& q = sycl_backend::queue(); + d_pair_a = sycl_alloc_device_or_throw(pair_a_bytes, q, "d_pair_a"); + return d_pair_a; +} + +void GpuBufferPool::release_pair_a() +{ + std::lock_guard lk(pair_a_mu_); + if (!d_pair_a) return; + sycl::free(d_pair_a, sycl_backend::queue()); + d_pair_a = nullptr; +} + +uint64_t* GpuBufferPool::ensure_pinned(int idx) +{ + if (idx < 0 || idx >= kNumPinnedBuffers) { + throw std::runtime_error("GpuBufferPool::ensure_pinned: idx out of range"); + } + // Double-checked locking: fast path skips the mutex once the + // slot's pointer is visible. Writes inside the mutex are + // release-ordered w.r.t. the mutex release; the unlocked read + // on the fast path is an acquire (relaxed access is fine here + // because x86 and arm64 give us acquire ordering for aligned + // pointer reads; if this ever needs to be portable to weaker + // architectures, make h_pinned_t3 std::atomic[]). + if (h_pinned_t3[idx]) return h_pinned_t3[idx]; + std::lock_guard lk(pinned_mu_[idx]); + if (h_pinned_t3[idx]) return h_pinned_t3[idx]; + sycl::queue& q = sycl_backend::queue(); + h_pinned_t3[idx] = static_cast( + sycl_alloc_host_or_throw(pinned_bytes, q, "h_pinned_t3")); + return h_pinned_t3[idx]; +} + +GpuBufferPool::~GpuBufferPool() +{ + sycl::queue& q = sycl_backend::queue(); + if (d_storage) sycl::free(d_storage, q); + if (d_pair_a) sycl::free(d_pair_a, q); + if (d_pair_b) sycl::free(d_pair_b, q); + if (d_sort_scratch) sycl::free(d_sort_scratch, q); + if (d_counter) sycl::free(d_counter, q); + for (int i = 0; i < kNumPinnedBuffers; ++i) { + if (h_pinned_t3[i]) sycl::free(h_pinned_t3[i], q); + } +} + +DeviceMemInfo query_device_memory() +{ + sycl::queue& q = sycl_backend::queue(); + DeviceMemInfo info; + info.total_bytes = + q.get_device().get_info(); + // SYCL has no portable free-memory query; AdaptiveCpp's + // global_mem_size returns the device total. On the CUDA backend + // the underlying driver often subtracts active reservations + // (framebuffer, compositor) before reporting, which gets us + // closer to "free" in practice. Treat the result as an upper + // bound; sycl::malloc_device is still the source of truth. + info.free_bytes = info.total_bytes; + + if (char const* v = std::getenv("POS2GPU_MAX_VRAM_MB"); v && v[0]) { + size_t const cap = size_t(std::strtoull(v, nullptr, 10)) * (1ULL << 20); + info.free_bytes = std::min(info.free_bytes, cap); + info.total_bytes = std::min(info.total_bytes, cap); + } + return info; +} + +namespace { + +// CUB's DeviceRadixSort temp_storage_bytes at k=28 with our key/val +// shape lands around 64-128 MB on sm_89; the streaming peak anchors +// below were measured with that overhead already live, so they +// implicitly budget for it. AdaptiveCpp's HIP backend routes the +// same `launch_sort_*` calls through a hand-rolled SYCL radix in +// SortSycl.cpp that uses ping-pong buffers sized to the input — +// multi-GiB at k=28, far exceeding what CUB's in-place radix needs. +// The streaming peak prediction has to add that excess so dispatch +// in BatchPlotter doesn't pick a tier whose "predicted peak" is +// several GiB short of the actual T1-sort live, the way an 8 GiB +// W5700 (gfx1010 → gfx1013 spoof) currently does. 
+// +// Baseline set at 256 MB at k=28 (a touch over CUB's typical scratch +// on sm_89 to keep headroom on NVIDIA cards near the threshold) and +// scaled 2× per +k step (linear in cap, matching how CUB's actual +// DeviceRadixSort scratch grows). The returned adjustment is +// `max(0, runtime_sort_scratch - baseline)`, so NVIDIA hosts whose +// runtime scratch is at or below the baseline see no change in +// predicted peak. +inline size_t streaming_sort_scratch_adjustment(int k) +{ + constexpr size_t cub_baseline_at_k28_bytes = 256ULL << 20; + + sycl::queue& q = sycl_backend::queue(); + int const num_section_bits = (k < 28) ? 2 : (k - 26); + size_t const cap_for_k = + max_pairs_per_section(k, num_section_bits) * (1ULL << num_section_bits); + + size_t s_pairs = 0; + launch_sort_pairs_u32_u32( + nullptr, s_pairs, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + cap_for_k, 0, k, q); + size_t s_keys = 0; + launch_sort_keys_u64( + nullptr, s_keys, + static_cast(nullptr), static_cast(nullptr), + cap_for_k, 0, 2 * k, q); + size_t const actual = std::max(s_pairs, s_keys); + + int const dk = k - 28; + size_t baseline = cub_baseline_at_k28_bytes; + if (dk > 0) baseline <<= dk; + else if (dk < 0) baseline >>= -dk; + + return (actual > baseline) ? (actual - baseline) : 0; +} + +} // namespace + +size_t streaming_peak_bytes(int k) +{ + // Anchor: 5200 MB at k=28 (measured post-stage-4e on sm_89). + // After the full T1/T2/T3 match/sort work (stages 1-4d) + Xs + // gen+sort+pack inlining (4e), all match + sort phases cap out at + // cap·sizeof(uint64_t) × ~2.5 aliases = ~5200 MB. Xs peak is 4128, + // T3 sort 4228, all others ≤ 5200. Dominant terms scale with 2^k. + constexpr size_t anchor_mb = 5200; + size_t const adj = streaming_sort_scratch_adjustment(k); + if (k == 28) return (anchor_mb << 20) + adj; + if (k < 18) return (size_t(16) << 20) + adj; // floor for tiny test plots + if (k > 32) return (size_t(anchor_mb) << (20 + (32 - 28))) + adj; + + if (k < 28) { + int const shift = 28 - k; // cap halves per −1 in k → 2× smaller + return ((size_t(anchor_mb) << 20) >> shift) + adj; + } + int const shift = k - 28; + return ((size_t(anchor_mb) << 20) << shift) + adj; +} + +size_t streaming_plain_peak_bytes(int k) +{ + // Anchor: 7290 MB at k=28 (pre-stage-1-4 peak — d_t1_meta + + // d_t1_keys_merged + d_t2_meta + d_t2_mi + d_t2_xbits all live + // concurrently during T2 match, no parks). Plain tier skips all + // park/rehydrate round-trips for ~400 ms/plot over compact at the + // cost of this higher peak. Scales the same way as compact. + constexpr size_t anchor_mb = 7290; + size_t const adj = streaming_sort_scratch_adjustment(k); + if (k == 28) return (anchor_mb << 20) + adj; + if (k < 18) return (size_t(16) << 20) + adj; + if (k > 32) return (size_t(anchor_mb) << (20 + (32 - 28))) + adj; + + if (k < 28) { + int const shift = 28 - k; + return ((size_t(anchor_mb) << 20) >> shift) + adj; + } + int const shift = k - 28; + return ((size_t(anchor_mb) << 20) << shift) + adj; +} + +size_t streaming_minimal_peak_bytes(int k) +{ + // Anchor: 3760 MB at k=28 (measured 3754 MB on sm_89 + the + // streaming-stats trace; rounded up for safety). Bottleneck is T3 + // match where d_t2_keys_merged + d_t2_xbits_sorted + meta-l/r + // slices + d_t3_stage are co-resident. + // + // Minimal layers cumulative cuts on top of compact: + // 1. N=8 T2 match staging (cap/8 ≈ 570 MB vs compact's cap/2). + // 2. 
T1 sort gather, T2 sort meta+xbits gathers — tiled output, + // D2H per tile to host pinned, rebuild on device after free. + // 3. T3 match — d_t2_meta_sorted parked on host pinned, sliced + // device buffers H2D'd per (section_l, section_r) pass. + // 4. T1 match — sliced into N passes per section_l, output + // accumulated to host pinned. + // 5. T1, T2, T3 sort CUB sub-phases — per-tile cap/N output + // buffers, USM-host accumulation, merges with USM-host inputs. + // 6. Xs phase — gen+sort tiled in N=2 position halves with + // USM-host accumulators; pack tiled with D2H per tile. + // + // Cumulative effect at k=28: peak drops from 5200 MB (compact) → + // 3754 MB (minimal). Trade-off: ~6 extra cap-sized PCIe round- + // trips per plot (~2.5× wall on NVIDIA — 13 s/plot → 34 s/plot + // at k=28). Same k-scaling as compact / plain. + constexpr size_t anchor_mb = 3760; + size_t const adj = streaming_sort_scratch_adjustment(k); + if (k == 28) return (anchor_mb << 20) + adj; + if (k < 18) return (size_t(16) << 20) + adj; + if (k > 32) return (size_t(anchor_mb) << (20 + (32 - 28))) + adj; + + if (k < 28) { + int const shift = 28 - k; + return ((size_t(anchor_mb) << 20) >> shift) + adj; + } + int const shift = k - 28; + return ((size_t(anchor_mb) << 20) << shift) + adj; +} + +} // namespace pos2gpu diff --git a/src/host/GpuBufferPool.cu b/src/host/GpuBufferPool.cu deleted file mode 100644 index ddb3298..0000000 --- a/src/host/GpuBufferPool.cu +++ /dev/null @@ -1,151 +0,0 @@ -// GpuBufferPool.cu — queries per-phase scratch sizes once and allocates -// worst-case-sized persistent buffers. - -#include "host/GpuBufferPool.hpp" - -#include "gpu/XsKernel.cuh" -#include "gpu/T1Kernel.cuh" -#include "gpu/T2Kernel.cuh" -#include "gpu/T3Kernel.cuh" - -#include -#include - -#include -#include -#include -#include - -namespace pos2gpu { - -namespace { - -// Variadic so the preprocessor doesn't choke on template-argument commas -// in e.g. cub::DeviceRadixSort::SortPairs(...). -#define POOL_CHECK(...) do { \ - cudaError_t err = (__VA_ARGS__); \ - if (err != cudaSuccess) { \ - throw std::runtime_error(std::string("GpuBufferPool CUDA: ") + \ - cudaGetErrorString(err)); \ - } \ -} while (0) - -// Mirrors GpuPipeline.cu's max_pairs_per_section (and pos2-chip's -// TableConstructorGeneric.hpp:23). -inline size_t max_pairs_per_section(int k, int num_section_bits) { - int extra_margin_bits = 8 - ((28 - k) / 2); - return (1ULL << (k - num_section_bits)) + (1ULL << (k - extra_margin_bits)); -} - -} // namespace - -GpuBufferPool::GpuBufferPool(int k_, int strength_, bool testnet_) - : k(k_), strength(strength_), testnet(testnet_) -{ - int const num_section_bits = (k < 28) ? 2 : (k - 26); - total_xs = 1ULL << k; - cap = max_pairs_per_section(k, num_section_bits) * (1ULL << num_section_bits); - - // d_storage must hold EITHER total_xs XsCandidateGpu (8 B each) OR four - // cap-sized uint32 key/val arrays during sort. Cast everything to size_t - // so std::max's template deduction finds one common type. - storage_bytes = std::max( - static_cast(total_xs) * sizeof(XsCandidateGpu), - static_cast(cap) * 4 * sizeof(uint32_t)); - - // d_pair_*: worst case across T1 (12 B), T2 (16 B), T3 (8 B), uint64 frags (8 B). - pair_bytes = std::max({ - static_cast(cap) * sizeof(T1PairingGpu), - static_cast(cap) * sizeof(T2PairingGpu), - static_cast(cap) * sizeof(T3PairingGpu), - static_cast(cap) * sizeof(uint64_t), - }); - - // Only the Xs phase asks for kernel scratch; T1/T2/T3 match report 0. 
- // Xs wants ~4.34 GB at k=28 — we alias d_pair_b for that, so no separate - // allocation. - uint8_t dummy_plot_id[32] = {}; - POOL_CHECK(launch_construct_xs(dummy_plot_id, k, testnet, - nullptr, nullptr, &xs_temp_bytes)); - if (xs_temp_bytes > pair_bytes) { - throw std::runtime_error( - "GpuBufferPool: Xs scratch exceeds pair buffer size; aliasing " - "d_pair_b as Xs temp is no longer safe"); - } - - // Query CUB sort scratch sizes (largest across T1/T2/T3 sorts). - size_t s_pairs = 0; - POOL_CHECK(cub::DeviceRadixSort::SortPairs( - nullptr, s_pairs, - static_cast(nullptr), static_cast(nullptr), - static_cast(nullptr), static_cast(nullptr), - cap, 0, k, nullptr)); - size_t s_keys = 0; - POOL_CHECK(cub::DeviceRadixSort::SortKeys( - nullptr, s_keys, - static_cast(nullptr), static_cast(nullptr), - cap, 0, 2 * k, nullptr)); - sort_scratch_bytes = std::max(s_pairs, s_keys); - - pinned_bytes = cap * sizeof(uint64_t); - - // Check free VRAM before attempting allocation so we can give a useful - // diagnostic instead of a generic cudaErrorMemoryAllocation. The margin - // covers CUDA driver/context state, CUB internal scratch, AES T-tables, - // and other small runtime allocations. - { - size_t const required_device = - storage_bytes + 2 * pair_bytes + sort_scratch_bytes + sizeof(uint64_t); - size_t const margin = 512ULL * 1024 * 1024; // 512 MB - size_t free_b = 0, total_b = 0; - POOL_CHECK(cudaMemGetInfo(&free_b, &total_b)); - if (free_b < required_device + margin) { - auto to_gib = [](size_t b) { return b / double(1ULL << 30); }; - throw std::runtime_error( - "GpuBufferPool: insufficient device VRAM for k=" + - std::to_string(k) + " strength=" + std::to_string(strength) + - "; need ~" + std::to_string(to_gib(required_device + margin)).substr(0, 5) + - " GiB (pool " + std::to_string(to_gib(required_device)).substr(0, 5) + - " GiB + ~0.5 GiB runtime), only " + - std::to_string(to_gib(free_b)).substr(0, 5) + - " GiB free of " + std::to_string(to_gib(total_b)).substr(0, 5) + - " GiB total. 
Use a smaller k or a GPU with more VRAM."); - } - } - - if (getenv("POS2GPU_POOL_DEBUG")) { - size_t free_b = 0, total_b = 0; - cudaMemGetInfo(&free_b, &total_b); - std::fprintf(stderr, - "[pool] k=%d strength=%d cap=%llu total_xs=%llu " - "free=%.2fGB total=%.2fGB\n", - k, strength, (unsigned long long)cap, (unsigned long long)total_xs, - free_b/1e9, total_b/1e9); - std::fprintf(stderr, - "[pool] sizes: storage=%.2fGB pair=%.2fGB xs_temp(alias)=%.2fGB " - "sort_scratch=%.2fGB pinned=%.2fGB\n", - storage_bytes/1e9, pair_bytes/1e9, xs_temp_bytes/1e9, - sort_scratch_bytes/1e9, pinned_bytes/1e9); - } - - POOL_CHECK(cudaMalloc(&d_storage, storage_bytes)); - POOL_CHECK(cudaMalloc(&d_pair_a, pair_bytes)); - POOL_CHECK(cudaMalloc(&d_pair_b, pair_bytes)); - POOL_CHECK(cudaMalloc(&d_sort_scratch, sort_scratch_bytes)); - POOL_CHECK(cudaMalloc(&d_counter, sizeof(uint64_t))); - POOL_CHECK(cudaMallocHost(&h_pinned_t3[0], pinned_bytes)); - POOL_CHECK(cudaMallocHost(&h_pinned_t3[1], pinned_bytes)); -} - -GpuBufferPool::~GpuBufferPool() -{ - if (d_storage) cudaFree(d_storage); - if (d_pair_a) cudaFree(d_pair_a); - if (d_pair_b) cudaFree(d_pair_b); - if (d_sort_scratch) cudaFree(d_sort_scratch); - if (d_counter) cudaFree(d_counter); - if (h_pinned_t3[0]) cudaFreeHost(h_pinned_t3[0]); - if (h_pinned_t3[1]) cudaFreeHost(h_pinned_t3[1]); -} - -} // namespace pos2gpu diff --git a/src/host/GpuBufferPool.hpp b/src/host/GpuBufferPool.hpp index 834f520..fd404c6 100644 --- a/src/host/GpuBufferPool.hpp +++ b/src/host/GpuBufferPool.hpp @@ -7,36 +7,69 @@ // between device time (~2.75 s) and producer wall time (~5.1 s). // // Memory layout with aliasing (k=28 worst-case sizes in parens): -// d_storage (4.36 GB) — Xs candidates during Xs phase, -// then 4×uint32[cap] sort keys/vals during sorts -// d_pair_a (4.36 GB) — T1/T2/T3 match output (reused across phases); -// also serves as Xs phase scratch before T1 -// d_pair_b (4.36 GB) — *_sorted / frags_out (reused across phases); -// also serves as Xs phase scratch before T1 -// d_sort_scratch (~2.3 GB) — CUB radix-sort scratch (largest across phases) +// d_storage (~3.3 GB) — Xs candidates during Xs phase (2.1 GB), +// then 3×uint32[cap] sort keys_out/vals_in/ +// vals_out during sorts. The fourth +// (keys_in) slot the sort API would want +// is ALWAYS the SoA match-info stream +// from d_pair_a (d_t1_mi / d_t2_mi), so +// d_storage doesn't allocate for it — +// saves cap·4 B (~1.09 GiB at k=28) vs +// the old 4-slot layout. +// d_pair_a (~4.4 GB) — T1/T2/T3 match output (reused across phases). +// Sized to the largest match-output: cap·16 B +// for T2 (meta+mi+xbits SoA). Does NOT alias the +// Xs phase scratch — that lives in d_pair_b. +// d_pair_b (~4.4 GB) — *_sorted / frags_out (reused across phases), +// AND the Xs construction scratch. Sized to +// max(largest sorted-output, xs_temp_bytes); +// at k=28 xs_temp dominates. +// d_sort_scratch (~MB) — Radix sort scratch. After ping-pong refactor: +// CUB DoubleBuffer mode shrinks this from ~2 GB +// to ~MB; SortSycl already ping-pongs over the +// caller's keys_in/keys_out buffers. // d_counter (8 B) — reused uint64_t count output -// h_pinned_t3[2] (2.18 GB ea) — double-buffered final fragments DMA target. -// Producer writes plot N to buffer (N%2) while -// consumer reads plot N-1 from the other slot. -// With a depth-1 channel + producer being -// slower than consumer, this is race-free. +// h_pinned_t3[N] (~2.2 GB ea) — rotating final-fragments DMA targets. 
+// Producer writes plot K into slot K mod N +// while consumer reads earlier plots from +// the other slots; channel depth N-1 keeps +// the producer from overwriting in-flight +// reads. N defaults to 3 (see kNumPinnedBuffers). // -// Total ~15 GB device + ~4.36 GB pinned host — fits in 17 GB free VRAM on a -// 24 GB 4090. +// Total ~12 GB device + ~6.6 GB pinned host at k=28 — fits (just) in the +// 11.98 GiB free VRAM of a Navi 22 (RX 6700 XT) after the d_storage +// slot-trim above. Pre-trim the total was ~13.1 GB and overshot this +// card's budget by ~0.7 GiB, forcing a fallback to the streaming +// pipeline which costs an extra ~5 s at k=28. // // Note: T1/T2/T3 match kernels report temp_bytes = 0 (no scratch needed). -// Only the Xs phase wants ~4.34 GB of scratch, so we alias d_pair_b for that. +// Only the Xs phase wants ~4.4 GB of scratch, and we alias d_pair_b for that. #pragma once #include #include +#include +#include namespace pos2gpu { +// Typed exception for the "pool sizing exceeds available device VRAM" +// case. Callers that want to fall back to the streaming pipeline when +// the pool does not fit should catch this specifically rather than +// string-matching a generic std::runtime_error. +struct InsufficientVramError : std::runtime_error { + using std::runtime_error::runtime_error; + size_t required_bytes = 0; + size_t free_bytes = 0; + size_t total_bytes = 0; +}; + struct GpuBufferPool { - // Allocates all buffers sized for (k, strength, testnet). Throws on any - // CUDA allocation failure. + // Allocates all buffers sized for (k, strength, testnet). Throws + // InsufficientVramError when the sized pool will not fit in free + // device VRAM; throws std::runtime_error on any other CUDA + // allocation or API failure. GpuBufferPool(int k, int strength, bool testnet); ~GpuBufferPool(); @@ -52,7 +85,8 @@ struct GpuBufferPool { uint64_t total_xs = 0; uint64_t cap = 0; size_t storage_bytes = 0; - size_t pair_bytes = 0; + size_t pair_a_bytes = 0; // max(T1/T2/T3 match-output footprints) + size_t pair_b_bytes = 0; // max(*_sorted footprints, xs_temp_bytes) size_t xs_temp_bytes = 0; // scratch size the Xs phase asks for size_t sort_scratch_bytes = 0; size_t pinned_bytes = 0; // per pinned buffer @@ -65,10 +99,93 @@ struct GpuBufferPool { void* d_sort_scratch = nullptr; uint64_t* d_counter = nullptr; - // Pinned host buffers for final T3 fragment D2H. Double-buffered so the - // consumer can read plot N directly from one slot while producer writes - // plot N+1 into the other — no intermediate ~2 GB heap copy per plot. - uint64_t* h_pinned_t3[2] = {nullptr, nullptr}; + // Number of rotating pinned slots for the final T3-fragment D2H. + // Set to 3 so the channel can hold depth-2 of in-flight plots + // without the producer ever overwriting a slot the consumer is + // still reading — useful when consumer wall > producer wall + // (slow disk / FSE-heavy strengths). 2 was enough for the + // previously measured producer-slower-than-consumer case, but + // 3 costs only ~2 GB of host pinned at k=28 and widens the + // "safe" consumer/producer ratio. + // + // Pinned slots are allocated LAZILY on first use via + // ensure_pinned(idx). The ctor no longer pays ~1.8 s at k=28 + // for the 3 × 2.2 GB malloc_host calls; single-plot runs + // (plot -n 1) only ever allocate slot 0, saving ~1.2 s of + // ctor time. Batch runs (plot -n N, N ≥ 3) amortise the + // allocation cost across the first three plots' D2H phases + // instead of the ctor — identical total batch time. 
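// Worked numbers for the figures above, not part of the patch: at k = 28
// each pinned slot is cap * 8 B = 272,629,760 * 8 = 2,181,038,080 B
// (~2.18 GB), so three slots come to ~6.54 GB — the "~6.6 GB pinned
// host" quoted in the layout comment. With the lazy ensure_pinned()
// described above, a single-plot run only ever allocates slot 0
// (~2.2 GB of pinned host); a batch first touches slots 1 and 2 during
// its second and third D2H phases.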
+ static constexpr int kNumPinnedBuffers = 3; + uint64_t* h_pinned_t3[kNumPinnedBuffers] = {}; + + // Returns pool.h_pinned_t3[idx], allocating the slot if it + // hasn't been used yet. Thread-safe via a per-slot mutex + // (concurrent callers with the same idx cooperate through + // double-checked locking; different idx values proceed + // independently). Throws std::runtime_error on host alloc + // failure. + uint64_t* ensure_pinned(int idx); + + // Returns pool.d_pair_a, allocating it on first use. Deferred + // from ctor so run_gpu_pipeline can submit Xs gen *before* + // paying this 4.36 GB malloc_device. Thread-safe via double- + // checked locking on pair_a_mu_. + // + // Measured on RX 6700 XT / ROCm 6.2 / AdaptiveCpp HIP: + // sycl::malloc_device of 4.36 GB takes ~5 ms (the driver + // almost certainly just reserves virtual-address space and + // defers physical commit to first write). Overlap benefit + // vs eager alloc is therefore ~5 ms in practice, below noise. + // The lazy pattern is kept because (a) it's a drop-in + // replacement with zero regression, (b) it mirrors + // ensure_pinned, and (c) it enables release_pair_a() below. + void* ensure_pair_a(); + + // Frees d_pair_a if it's allocated, so a subsequent + // ensure_pair_a() will re-allocate. Called by the pool path + // at the end of each plot in a batch to shrink the + // inter-plot VRAM peak. With ~5 ms malloc on AMD, the + // release-and-realloc cost is below noise per plot, while + // the 4.36 GB VRAM freed during file-write / D2H-consume + // phases lets the pool path fit cards with ~7-8 GiB free + // that would otherwise hit the InsufficientVramError path + // and fall back to streaming. + // + // Thread-safe via pair_a_mu_; lock-order is + // (pair_a_mu_ → sycl::free) so release can run concurrently + // with a future ensure_pair_a from a different thread + // without deadlock. In practice run_batch is single-producer + // so contention is zero. + void release_pair_a(); + +private: + std::mutex pinned_mu_[kNumPinnedBuffers]; + std::mutex pair_a_mu_; +}; + +// Free + total device VRAM at call time. On SYCL backends without a +// portable free-memory query, free_bytes is approximated as +// total_bytes (AdaptiveCpp's global_mem_size = device total). Used as +// a preflight signal; sycl::malloc_device remains the source of +// truth. POS2GPU_MAX_VRAM_MB caps both fields when set. +struct DeviceMemInfo { + size_t free_bytes = 0; + size_t total_bytes = 0; }; +DeviceMemInfo query_device_memory(); + +// Upper bound on streaming-pipeline peak device VRAM at given k. +// streaming_peak_bytes: compact tier (anchored at 5200 MB at k=28). +// streaming_plain_peak_bytes: plain tier (anchored at 7290 MB at k=28, +// pre-park pipeline — saves ~400 ms/plot over compact via fewer PCIe +// round-trips, at the cost of the higher peak). +// streaming_minimal_peak_bytes: minimal tier (anchored at 3700 MB at +// k=28). Same parks as compact plus N=8 T2 match staging (cap/8 vs +// compact's cap/2) — targets 4 GiB cards at the cost of more PCIe +// round-trips during T2 match. +// Dominant terms scale with 2^k, so other k extrapolate linearly. 
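// Rough extrapolation example (sketch only; the anchored constants and the
// exact sizing live in the .cpp): with the dominant terms proportional to
// 2^k, a compact-tier estimate for another k is approximately
//   approx_mb(k) = 5200 * 2^(k - 28)
// i.e. ~1300 MB at k=26 and ~20800 MB at k=30. Only the k=28 figures above
// are measured anchors; other k are extrapolated linearly in 2^k, as the
// comment above notes.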
+size_t streaming_peak_bytes(int k); +size_t streaming_plain_peak_bytes(int k); +size_t streaming_minimal_peak_bytes(int k); } // namespace pos2gpu diff --git a/src/host/GpuPipeline.cpp b/src/host/GpuPipeline.cpp new file mode 100644 index 0000000..9263084 --- /dev/null +++ b/src/host/GpuPipeline.cpp @@ -0,0 +1,2461 @@ +// GpuPipeline.cu — orchestrates Xs → T1 → T2 → T3 on the device, with +// CUB radix sort between phases (each phase consumes sorted-by-match_info +// input). Final T3 output is sorted by proof_fragment (low 2k bits) to +// match pos2-chip Table3Constructor::post_construct_span. +// +// Two overloads live here: +// run_gpu_pipeline(cfg) — transient pool, one-shot. +// run_gpu_pipeline(cfg, pool) — shared pool, batch-friendly. This is the +// real implementation; the one-shot form +// just wraps it in a temporary pool. + +#include "host/GpuPipeline.hpp" +#include "host/GpuBufferPool.hpp" +#include "host/PoolSizing.hpp" + +#include "gpu/AesGpu.cuh" +#include "gpu/XsKernel.cuh" +#include "gpu/XsKernels.cuh" // launch_xs_gen / launch_xs_pack (stage 4e) +#include "gpu/T1Kernel.cuh" +#include "gpu/T2Kernel.cuh" +#include "gpu/T3Kernel.cuh" +#include "gpu/PipelineKernels.cuh" +#include "gpu/Sort.cuh" +#include "gpu/SyclBackend.hpp" + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pos2gpu { + +namespace { + + +// ===================================================================== +// T1 sort: by match_info, low k bits, stable. Uses CUB SortPairs with +// (key=match_info, value=index) then permutes T1Pairings. +// ===================================================================== +// T2 sort: same shape — sort indices by match_info. +// ===================================================================== +// Streaming allocation tracker. +// +// Wraps cudaMalloc / cudaFree so we can: (a) account for live/peak VRAM +// used by the streaming pipeline, (b) honour a soft device-memory cap +// set via POS2GPU_MAX_VRAM_MB (throws before the underlying cudaMalloc +// when an alloc would push live past the cap), and (c) emit a per-alloc +// trace under POS2GPU_STREAMING_STATS=1 for manual audits. +// +// Pinned host allocations are NOT counted — the cap is specifically for +// device VRAM, and the pinned D2H staging buffer is host-resident. +// ===================================================================== +struct StreamingStats { + size_t cap = 0; // 0 = no cap + size_t live = 0; + size_t peak = 0; + std::unordered_map sizes; + bool verbose = false; + char const* phase = "(init)"; + + // Free any allocations still alive on destruction. If the streaming + // pipeline throws partway (e.g. d_xs_temp OOM after d_xs already + // succeeded), this dtor releases the still-live device buffers + // instead of leaking them across batch iterations. + ~StreamingStats() { + if (sizes.empty()) return; + auto& q = sycl_backend::queue(); + for (auto& [ptr, _bytes] : sizes) { + if (ptr) sycl::free(ptr, q); + } + sizes.clear(); + } +}; + +inline void s_init_from_env(StreamingStats& s) +{ + if (char const* v = std::getenv("POS2GPU_MAX_VRAM_MB"); v && v[0]) { + s.cap = size_t(std::strtoull(v, nullptr, 10)) * (1ULL << 20); + } + if (char const* v = std::getenv("POS2GPU_STREAMING_STATS"); v && v[0] == '1') { + s.verbose = true; + } +} + +// Format a byte count as both raw bytes and decimal MB. 
The previous +// `bytes >> 20` form (integer right-shift = truncating divide by 1 MiB) +// rounded any sub-MiB request down to "0 MB", which masked both the +// real allocation size and any genuine zero-byte sizing bug at the +// call site. Use this helper in every error path so a future +// `requested=0` is unambiguous (raw bytes settles it). +inline std::string s_fmt_bytes(size_t bytes) { + char buf[64]; + std::snprintf(buf, sizeof(buf), + "%zu bytes (%.2f MB)", bytes, bytes / 1048576.0); + return std::string(buf); +} + +template +inline void s_malloc(StreamingStats& s, T*& out, size_t bytes, char const* reason) +{ + // Zero-byte requests come from sizing queries that returned 0, + // which downstream callers honour as "skip this alloc" only by + // accident (sycl::malloc_device(0) returns null on HIP). Surface + // the actual upstream cause instead of triggering the misleading + // "Card likely too small" path below. + if (bytes == 0) { + throw std::runtime_error( + std::string("internal: s_malloc('") + reason + "') called with " + "bytes=0 — an upstream sizing query returned 0 (count=0). On " + "AMD/HIP this most often indicates a kernel correctness issue " + "on an unvalidated device — either an AOT target outside the " + "validated set (the gfx1013/RDNA1 community spoof is the known " + "case) or AdaptiveCpp's generic SSCP JIT miscompiling a kernel " + "for the actual gfx ISA. Run the parity tests on this device " + "to localise: sycl_g_x_parity, sycl_sort_parity, " + "sycl_bucket_offsets_parity, sycl_t1_parity."); + } + if (s.cap && s.live + bytes > s.cap) { + throw std::runtime_error( + std::string("streaming VRAM cap: phase=") + s.phase + + " alloc=" + reason + + " live=" + s_fmt_bytes(s.live) + + " + new=" + s_fmt_bytes(bytes) + + " would exceed cap=" + s_fmt_bytes(s.cap)); + } + void* p = sycl::malloc_device(bytes, sycl_backend::queue()); + if (!p) { + throw std::runtime_error( + std::string("sycl::malloc_device(") + reason + "): null — phase=" + + s.phase + " requested=" + s_fmt_bytes(bytes) + + " live=" + s_fmt_bytes(s.live) + + ". Card likely too small for this k via the streaming " + "pipeline; try a smaller k or a card with more VRAM."); + } + out = static_cast(p); + s.live += bytes; + if (s.live > s.peak) s.peak = s.live; + s.sizes[p] = bytes; + if (s.verbose) { + std::fprintf(stderr, + "[stream %-8s] +%7.2f MB %-20s live=%8.2f peak=%8.2f\n", + s.phase, bytes / 1048576.0, reason, + s.live / 1048576.0, s.peak / 1048576.0); + } +} + +template +inline void s_free(StreamingStats& s, T*& ptr) +{ + if (!ptr) return; + void* raw = static_cast(ptr); + auto it = s.sizes.find(raw); + if (it != s.sizes.end()) { + s.live -= it->second; + if (s.verbose) { + std::fprintf(stderr, + "[stream %-8s] -%7.2f MB %-20s live=%8.2f peak=%8.2f\n", + s.phase, it->second / 1048576.0, "(free)", + s.live / 1048576.0, s.peak / 1048576.0); + } + s.sizes.erase(it); + } + sycl::free(raw, sycl_backend::queue()); + ptr = nullptr; +} + +// Sanity-check t1_count after T1 match. Healthy plots produce ~2^k +// entries; anything below total_xs/64 (= 2^(k-6)) — let alone literal +// zero — points at kernel correctness on the device, not a VRAM +// shortfall. Catching this here surfaces a clear diagnostic instead of +// letting downstream sort-scratch alloc fail with the misleading +// "Card likely too small" message. 
Two AMD/HIP cases produce 0 T1 +// matches at k=28: the gfx1013/RDNA1 community spoof on a W5700, and +// AdaptiveCpp's generic SSCP JIT on the same RDNA1 silicon (the JIT +// path is theoretically more compatible than the AOT spoof but has +// been observed to miscompile the matcher). Only the OOM further down +// was visible before this check. +inline void validate_t1_count(uint64_t t1_count, int k) +{ + uint64_t const min_plausible = (1ULL << k) >> 6; + if (t1_count >= min_plausible) return; + + throw std::runtime_error( + "T1 match produced " + std::to_string(t1_count) + " entries " + "(expected ~2^" + std::to_string(k) + " = " + + std::to_string(1ULL << k) + " for k=" + std::to_string(k) + + "). This indicates a kernel correctness issue on this device, " + "not a VRAM shortfall. On AMD/HIP this most often means the " + "AdaptiveCpp target produced wrong output for the actual gfx " + "ISA — either the gfx1013/RDNA1 community AOT spoof or the " + "generic SSCP JIT path on an unvalidated card. Build the " + "parity tests via cmake and verify on this device: " + "sycl_g_x_parity, sycl_sort_parity, sycl_bucket_offsets_parity, " + "sycl_t1_parity. The first three exercise individual kernels at " + "small N; sycl_t1_parity runs the full T1 matcher against the " + "pos2-chip CPU reference and is the closest reproducer of the " + "k=28 failure. README's 'Community-tested, not parity-validated' " + "caveat applies."); +} + +} // namespace + +GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg, + GpuBufferPool& pool, + int pinned_index) +{ + + sycl::queue& q = sycl_backend::queue(); + if (cfg.k < 18 || cfg.k > 32 || (cfg.k & 1) != 0) { + throw std::runtime_error("k must be even in [18, 32]"); + } + if (cfg.strength < 2) { + throw std::runtime_error("strength must be >= 2"); + } + if (pool.k != cfg.k || pool.strength != cfg.strength + || pool.testnet != cfg.testnet) + { + throw std::runtime_error( + "GpuBufferPool was sized for different (k, strength, testnet)"); + } + if (pinned_index < 0 || pinned_index >= GpuBufferPool::kNumPinnedBuffers) { + throw std::runtime_error( + "pinned_index must be in [0, GpuBufferPool::kNumPinnedBuffers)"); + } + + uint64_t const total_xs = pool.total_xs; + uint64_t const cap = pool.cap; + + constexpr int kThreads = 256; + auto blocks = [&](uint64_t n) { + return unsigned((n + kThreads - 1) / kThreads); + }; + + // ---- pool aliases ---- + // d_pair_a carries the "current phase match output": T1, then T2, then T3. + // d_pair_b carries the "current phase sort output": sorted T1, sorted T2, + // then final uint64_t fragments. Each subsequent phase's output overwrites + // the previous (consumed) contents in the same slot. + XsCandidateGpu* d_xs = static_cast(pool.d_storage); + // d_pair_a-derived aliases (d_t1_meta, d_t1_mi, d_t2_meta, d_t2_mi, + // d_t2_xbits, d_t3) are NOT declared here. They're declared inside + // the Xs phase block below, right after pool.ensure_pair_a() + // performs the lazy malloc_device for d_pair_a. Deferring that + // alloc until after Xs gen has been submitted to the queue lets + // the ~400-500 ms CPU-side malloc_device overlap with Xs's + // ~750 ms GPU execution — saves ~400-500 ms off first-plot wall; + // batch plots 2+ hit ensure_pair_a's cached-pointer fast path + // so the alloc cost is paid exactly once per pool. + // + // d_pair_b-derived aliases stay up here because d_pair_b is + // eager-allocated by the pool ctor: Xs gen needs it as scratch + // from the start of the pipeline. 
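// The lazy-alloc overlap described above, in isolation (generic SYCL
// sketch, not this function's exact calls or buffer names):
//   q.parallel_for(range, xs_kernel);           // returns with work in flight
//   void* p = sycl::malloc_device(bytes, q);    // CPU-side cost hidden behind the kernel
//   q.wait();                                   // both the kernel and the alloc are done here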
+ uint64_t* d_t1_meta_sorted = static_cast (pool.d_pair_b); + uint64_t* d_t2_meta_sorted = static_cast (pool.d_pair_b); + uint32_t* d_t2_xbits_sorted = reinterpret_cast( + static_cast(pool.d_pair_b) + pool.cap * sizeof(uint64_t)); + uint64_t* d_frags_out = static_cast (pool.d_pair_b); + + uint64_t* d_count = pool.d_counter; + // Xs phase needs ~3.22 GB scratch at k=28 in split-keys_a mode + // (3 × total_xs × u32 + cub); d_pair_b is idle through the whole + // Xs phase (not touched until T1 sort permute writes to it), so + // we alias it rather than allocating separately. + // + // Split-keys_a: the Xs sort's keys_a (total_xs · u32 = 1 GiB at + // k=28) lives in d_storage's tail — bytes [total_xs·8, storage_bytes) + // which is idle during Xs gen+sort. The final pack phase writes + // d_storage[0..total_xs·8) only, leaving keys_a's memory region + // undisturbed (and its contents unread after the sort anyway, so + // the overlap on T1/T2/T3-sort aliases in d_storage after pack is + // a pure write-without-read of stale bytes). Saves ~1 GiB off the + // pair_b xs-scratch region — see GpuBufferPool.cpp for sizing. + void* const d_xs_split_keys_a = static_cast(pool.d_storage) + + pool.total_xs * sizeof(XsCandidateGpu); + void* d_xs_temp = pool.d_pair_b; + void* d_sort_scratch = pool.d_sort_scratch; + // Lazy pinned-host alloc: skips ~600 ms × (kNumPinnedBuffers-1) + // on single-plot runs (only slot 0 gets allocated). See + // GpuBufferPool::ensure_pinned header comment for rationale. + uint64_t* h_pinned_t3 = pool.ensure_pinned(pinned_index); + // T1/T2/T3 match kernels report 0 scratch bytes, but some CUDA paths + // reject a nullptr d_temp_storage with cudaErrorInvalidArgument even + // when bytes==0. Point them at d_sort_scratch (idle during match) to + // give the kernel a valid non-null handle. + void* d_match_temp = pool.d_sort_scratch; + + // Sort key/val arrays alias d_storage. Safe because Xs is fully consumed + // by T1 match (stream-synchronised) before we enter T1 sort. + // + // Only three slots live here — keys_out, vals_in, vals_out. The + // sort's keys_input is always the SoA match-info stream from + // d_pair_a (d_t1_mi / d_t2_mi), so the fourth slot that would + // have hosted "d_keys_in" is neither allocated nor used. See + // GpuBufferPool.cpp for the matching storage_bytes shrink. + auto storage_u32 = static_cast(pool.d_storage); + uint32_t* d_keys_out = storage_u32 + 0 * cap; + uint32_t* d_vals_in = storage_u32 + 1 * cap; + uint32_t* d_vals_out = storage_u32 + 2 * cap; + + // ---- per-phase wall-time profiling ---- + // Enabled when either cfg.profile is set (xchplot2 -P / --profile) or + // POS2GPU_PHASE_TIMING=1 is in the env. Each phase's wall is measured + // around q.wait()s so launches actually drain to the device before the + // next start sample — adds a sync point but gives an honest breakdown. + // When disabled, begin/end/report are early-out and add ~zero cost. 
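// Shape of the emitted breakdown, per the fprintf format in report_phases
// below (the millisecond values here are invented for illustration):
//   [phase-timing] Xs gen+sort=751.2ms(27%) T1 match=312.0ms(11%) ... total=2748.5ms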
+ bool const phase_timing = cfg.profile || [] { + char const* v = std::getenv("POS2GPU_PHASE_TIMING"); + return v && v[0] == '1'; + }(); + using phase_clock = std::chrono::steady_clock; + std::vector> phase_starts; + std::vector> phase_records; + auto begin_phase = [&](char const* label) -> int { + if (!phase_timing) return -1; + q.wait(); + phase_starts.emplace_back(label, phase_clock::now()); + return static_cast(phase_starts.size() - 1); + }; + auto end_phase = [&](int idx) { + if (idx < 0) return; + q.wait(); + auto const t1 = phase_clock::now(); + auto const& [name, t0] = phase_starts[idx]; + double const ms = std::chrono::duration(t1 - t0).count(); + phase_records.emplace_back(name, ms); + }; + auto report_phases = [&]() { + if (!phase_timing || phase_records.empty()) return; + double total = 0.0; + for (auto const& [_n, ms] : phase_records) total += ms; + std::fprintf(stderr, "[phase-timing]"); + for (auto const& [name, ms] : phase_records) { + std::fprintf(stderr, " %s=%.1fms(%.0f%%)", + name, ms, total > 0.0 ? 100.0 * ms / total : 0.0); + } + std::fprintf(stderr, " total=%.1fms\n", total); + }; + + // ---------- Phase Xs ---------- + size_t xs_temp_bytes = 0; + launch_construct_xs(cfg.plot_id.data(), cfg.k, cfg.testnet, + nullptr, nullptr, &xs_temp_bytes, q, + d_xs_split_keys_a); + int p_xs = begin_phase("Xs gen+sort"); + // Xs phase events stubbed in slice 17b — pass nullptr for the (no-op) + // profiling event slots. The launch_construct_xs_profiled signature still + // accepts cudaEvent_t for API compatibility but ignores the values. + launch_construct_xs_profiled(cfg.plot_id.data(), cfg.k, cfg.testnet, + d_xs, d_xs_temp, &xs_temp_bytes, + nullptr, nullptr, q, + d_xs_split_keys_a); + // Overlap d_pair_a's lazy malloc_device (~400-500 ms for 4.36 GB at + // k=28) with Xs gen's GPU execution. In production + // (POS2GPU_PHASE_TIMING unset), launch_construct_xs_profiled returns + // immediately with the kernel in-flight on the queue; this CPU-side + // alloc then runs in parallel and its wall is hidden behind Xs's + // ~750 ms GPU work. In phase_timing mode xs-timing's internal + // q.waits serialise Xs first, then this alloc pays full wall — a + // diagnostic-mode trade-off. + void* const d_pair_a_raw = pool.ensure_pair_a(); + end_phase(p_xs); + + // d_pair_a-derived aliases, now that the lazy alloc has resolved. + // Same layout as the old eager version — just computed from the + // local d_pair_a_raw instead of pool.d_pair_a so there's no + // confusion about when the pointer became valid. + // + // T1 match output is SoA, carved out of d_pair_a. Layout: meta[cap] + // (cap·8 B) then mi[cap] (cap·4 B). Total cap·12 B, fits in d_pair_a's + // cap·16 B budget. + uint64_t* d_t1_meta = static_cast(d_pair_a_raw); + uint32_t* d_t1_mi = reinterpret_cast( + static_cast(d_pair_a_raw) + pool.cap * sizeof(uint64_t)); + // T2 match output is SoA, carved out of d_pair_a. Layout: meta[cap] + // (cap·8 B), then mi[cap] (cap·4 B), then xbits[cap] (cap·4 B). Total + // cap·16 B, matching d_pair_a's size. 
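// Byte offsets of that carve, in units of cap (this mirrors the pointer
// arithmetic just below):
//   meta  : [0,       cap*8 )
//   mi    : [cap*8,   cap*12)
//   xbits : [cap*12,  cap*16)   (T2 only; the T1 carve stops at cap*12)
// T3 later reinterprets offset 0 as a plain uint64_t fragment stream.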
+ uint64_t* d_t2_meta = static_cast(d_pair_a_raw); + uint32_t* d_t2_mi = reinterpret_cast( + static_cast(d_pair_a_raw) + pool.cap * sizeof(uint64_t)); + uint32_t* d_t2_xbits = reinterpret_cast( + static_cast(d_pair_a_raw) + pool.cap * (sizeof(uint64_t) + sizeof(uint32_t))); + T3PairingGpu* d_t3 = static_cast(d_pair_a_raw); + + // ---------- Phase T1 ---------- + auto t1p = make_t1_params(cfg.k, cfg.strength); + size_t t1_temp_bytes = 0; + launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, + nullptr, nullptr, d_count, cap, + nullptr, &t1_temp_bytes, q); + q.memset(d_count, 0, sizeof(uint64_t)); + int p_t1 = begin_phase("T1 match"); + launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, + d_t1_meta, d_t1_mi, d_count, cap, + d_match_temp, &t1_temp_bytes, q); + end_phase(p_t1); + + // No explicit sync: the next cudaMemcpy (non-async, default stream) + // implicitly drains prior stream work before the host reads t1_count. + uint64_t t1_count = 0; + q.memcpy(&t1_count, d_count, sizeof(uint64_t)).wait(); + if (t1_count > cap) throw std::runtime_error("T1 overflow"); + validate_t1_count(t1_count, cfg.k); + + + // Sort T1 by match_info (low k bits). d_storage is now repurposed + // as (keys_in, keys_out, vals_in, vals_out), Xs having been fully + // consumed by T1 match above. T1 match emits match_info in a SoA + // stream (d_t1_mi), so we feed that directly to CUB as the sort key + // input rather than extracting from a packed struct. + int p_t1_sort = begin_phase("T1 sort"); + { + launch_init_u32_identity(d_vals_in, t1_count, q); + size_t sort_bytes = pool.sort_scratch_bytes; + launch_sort_pairs_u32_u32( + d_sort_scratch, sort_bytes, + d_t1_mi, d_keys_out, d_vals_in, d_vals_out, + t1_count, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + + launch_gather_u64(d_t1_meta, d_vals_out, d_t1_meta_sorted, t1_count, q); + } + end_phase(p_t1_sort); + + // ---------- Phase T2 ---------- + // Sorted T1 = (d_t1_meta_sorted: uint64 meta, d_keys_out: uint32 match_info). + // No AoS struct anymore — saves 33 % of sorted-T1 bandwidth on both the + // permute write and the match-kernel hot path. + auto t2p = make_t2_params(cfg.k, cfg.strength); + size_t t2_temp_bytes = 0; + launch_t2_match(cfg.plot_id.data(), t2p, nullptr, nullptr, t1_count, + nullptr, nullptr, nullptr, d_count, cap, + nullptr, &t2_temp_bytes, q); + q.memset(d_count, 0, sizeof(uint64_t)); + int p_t2 = begin_phase("T2 match"); + launch_t2_match(cfg.plot_id.data(), t2p, d_t1_meta_sorted, d_keys_out, t1_count, + d_t2_meta, d_t2_mi, d_t2_xbits, d_count, cap, + d_match_temp, &t2_temp_bytes, q); + end_phase(p_t2); + + uint64_t t2_count = 0; + q.memcpy(&t2_count, d_count, sizeof(uint64_t)).wait(); + if (t2_count > cap) throw std::runtime_error("T2 overflow"); + + int p_t2_sort = begin_phase("T2 sort"); + { + // T2 match emitted match_info as a SoA stream (d_t2_mi) — feed + // it straight into CUB as the sort key input rather than + // re-extracting from a packed struct. vals_in just needs a + // 0..n-1 identity fill. 
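// The identity-fill -> key/value sort -> permute sequence here (and in the
// T1 sort above) is effectively an argsort: vals_in[i] = i, sorting
// (mi[i], vals_in[i]) by mi yields vals_out = argsort(mi), and the permute
// then applies out[i] = in[vals_out[i]], so the wide 8-16 B payload is
// gathered once rather than sorted directly.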
+ launch_init_u32_identity(d_vals_in, t2_count, q); + size_t sort_bytes = pool.sort_scratch_bytes; + launch_sort_pairs_u32_u32( + d_sort_scratch, sort_bytes, + d_t2_mi, d_keys_out, d_vals_in, d_vals_out, + t2_count, 0, cfg.k, q); + + launch_permute_t2(d_t2_meta, d_t2_xbits, d_vals_out, + d_t2_meta_sorted, d_t2_xbits_sorted, t2_count, q); + } + end_phase(p_t2_sort); + + // ---------- Phase T3 ---------- + // d_keys_out now holds the T2 sorted match_info (T1's was overwritten by + // the T2 sort above) — pass as the slim stream for binary search in T3. + auto t3p = make_t3_params(cfg.k, cfg.strength); + size_t t3_temp_bytes = 0; + launch_t3_match(cfg.plot_id.data(), t3p, + d_t2_meta_sorted, d_t2_xbits_sorted, + nullptr, t2_count, + d_t3, d_count, cap, + nullptr, &t3_temp_bytes, q); + q.memset(d_count, 0, sizeof(uint64_t)); + int p_t3 = begin_phase("T3 match + Feistel"); + launch_t3_match(cfg.plot_id.data(), t3p, + d_t2_meta_sorted, d_t2_xbits_sorted, + d_keys_out, t2_count, + d_t3, d_count, cap, + d_match_temp, &t3_temp_bytes, q); + end_phase(p_t3); + + uint64_t t3_count = 0; + q.memcpy(&t3_count, d_count, sizeof(uint64_t)).wait(); + if (t3_count > cap) throw std::runtime_error("T3 overflow"); + + // Sort T3 by proof_fragment (low 2k bits). T3PairingGpu is just a + // uint64_t, so reinterpret the d_pair_a slot directly. + uint64_t* d_frags_in = reinterpret_cast(d_t3); + int p_t3_sort = begin_phase("T3 sort"); + { + size_t sort_bytes = pool.sort_scratch_bytes; + launch_sort_keys_u64( + d_sort_scratch, sort_bytes, + d_frags_in, d_frags_out, + t3_count, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, q); + } + end_phase(p_t3_sort); + + // ---------- D2H ---------- + int p_d2h = begin_phase("D2H copy T3 fragments (pinned)"); + GpuPipelineResult result; + result.t1_count = t1_count; + result.t2_count = t2_count; + result.t3_count = t3_count; + + if (t3_count > 0) { + q.memcpy(h_pinned_t3, d_frags_out, sizeof(uint64_t) * t3_count); + q.wait(); + } + end_phase(p_d2h); + + if (t3_count > 0) { + // Borrow: caller (batch producer) promises to finish consuming this + // pinned slot before reusing it for another plot. + result.external_fragments_ptr = h_pinned_t3; + result.external_fragments_count = t3_count; + } + + // Xs gen / sort per-phase timings stubbed in slice 17b — see profiling + // notes above. + + // Release d_pair_a so it isn't held between plots in a batch run. + // At ~5 ms/alloc on amdgcn (sycl::malloc_device effectively just + // reserves virtual address space), the per-plot realloc cost is + // below noise, but freeing 4.36 GB during the inter-plot gap means + // the pool path is viable on cards with ~7-8 GiB free that would + // otherwise hit InsufficientVramError and fall back to streaming. + // The final q.wait() inside the D2H block above has already drained + // T3 sort so the buffer is safe to free. + pool.release_pair_a(); + + report_phases(); + return result; +} + +GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg) +{ + // Explicit override for callers that want the streaming path without + // having to rebuild anything. Handy for testing and for users who know + // their hardware won't fit the pool. + if (char const* env = std::getenv("XCHPLOT2_STREAMING"); + env && env[0] == '1') + { + return run_gpu_pipeline_streaming(cfg); + } + + // Default: build a transient pool and run through it. Pays the full + // per-call allocator overhead (~2.4 s for k=28) — batch callers should + // construct a pool once and reuse it via the 3-arg overload. 
+ // + // On insufficient device VRAM the pool ctor throws + // InsufficientVramError; catch it specifically and fall back to + // streaming so users on small-VRAM cards get a working plot with no + // flags. Other CUDA errors propagate. + try { + GpuBufferPool pool(cfg.k, cfg.strength, cfg.testnet); + GpuPipelineResult r = run_gpu_pipeline(cfg, pool, /*pinned_index=*/0); + // Pool (and its pinned buffer) is about to be destroyed, so + // materialise a self-contained copy before returning. + if (r.external_fragments_ptr && r.external_fragments_count > 0) { + r.t3_fragments_storage.resize(r.external_fragments_count); + std::memcpy(r.t3_fragments_storage.data(), + r.external_fragments_ptr, + sizeof(uint64_t) * r.external_fragments_count); + } + r.external_fragments_ptr = nullptr; + r.external_fragments_count = 0; + return r; + } catch (InsufficientVramError const& e) { + std::fprintf(stderr, + "[xchplot2] pool needs %.2f GiB, only %.2f GiB free of " + "%.2f GiB — falling back to streaming pipeline\n", + e.required_bytes / double(1ULL << 30), + e.free_bytes / double(1ULL << 30), + e.total_bytes / double(1ULL << 30)); + return run_gpu_pipeline_streaming(cfg); + } +} + +// ===================================================================== +// Streaming pipeline — per-phase cudaMalloc / cudaFree, no persistent pool. +// +// Only buffers required for the CURRENT and NEXT phase are resident at any +// point. Tiled sorts + SoA emission drive the peak down under 8 GB at +// k=28, so an 8 GB card can run this path. +// +// The implementation body below accepts an optional caller-provided +// pinned D2H buffer — used by BatchPlotter to amortise cudaMallocHost +// across plots and double-buffer the D2H with the FSE consumer. +// +// Exception safety: device allocations are routed through StreamingStats, +// whose destructor frees anything still live if the pipeline throws +// partway, so device buffers are not leaked across batch iterations. +// Host-side USM staging buffers allocated outside that tracker can still +// leak on a mid-pipeline throw; the CLI terminates on exception anyway, +// so the OS reclaims them at exit. If we later embed this in a long-lived +// process those can grow RAII owners without changing the public surface. +// ===================================================================== +namespace { // anon: shared impl, not part of the public API. 
+ +GpuPipelineResult run_gpu_pipeline_streaming_impl( + GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, // nullable + size_t pinned_capacity, // count, not bytes; ignored if pinned_dst null + StreamingPinnedScratch const& scratch); // any field nullptr → per-plot malloc_host fallback + +} // namespace + +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg) +{ + + sycl::queue& q = sycl_backend::queue(); + return run_gpu_pipeline_streaming_impl(cfg, /*pinned_dst=*/nullptr, + /*pinned_capacity=*/0, + StreamingPinnedScratch{}); +} + +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity) +{ + if (!pinned_dst || pinned_capacity == 0) { + throw std::runtime_error( + "run_gpu_pipeline_streaming(cfg, pinned, cap): pinned buffer must be non-null"); + } + return run_gpu_pipeline_streaming_impl(cfg, pinned_dst, pinned_capacity, + StreamingPinnedScratch{}); +} + +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity, + StreamingPinnedScratch const& scratch) +{ + if (!pinned_dst || pinned_capacity == 0) { + throw std::runtime_error( + "run_gpu_pipeline_streaming(cfg, pinned, cap, scratch): pinned buffer must be non-null"); + } + return run_gpu_pipeline_streaming_impl(cfg, pinned_dst, pinned_capacity, scratch); +} + +namespace { + +GpuPipelineResult run_gpu_pipeline_streaming_impl( + GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity, + StreamingPinnedScratch const& scratch) +{ + + sycl::queue& q = sycl_backend::queue(); + if (cfg.k < 18 || cfg.k > 32 || (cfg.k & 1) != 0) { + throw std::runtime_error("k must be even in [18, 32]"); + } + if (cfg.strength < 2) { + throw std::runtime_error("strength must be >= 2"); + } + + int const num_section_bits = (cfg.k < 28) ? 2 : (cfg.k - 26); + uint64_t const total_xs = 1ULL << cfg.k; + uint64_t const cap = + max_pairs_per_section(cfg.k, num_section_bits) * + (1ULL << num_section_bits); + + constexpr int kThreads = 256; + auto blocks = [&](uint64_t n) { + return unsigned((n + kThreads - 1) / kThreads); + }; + + StreamingStats stats; + s_init_from_env(stats); + + // ---- per-phase wall-time profiling ---- + // Identical shape to the pool path (run_gpu_pipeline above); the + // [phase-timing] output format matches so POS2GPU_PHASE_TIMING=1 now + // produces the same breakdown whether the pipeline runs the pool path or + // falls back to streaming. On 12 GiB cards at k=28 (where the pool + // overflows and we always stream) this is the only way to see + // which phase is eating the wall. 
+ bool const phase_timing = cfg.profile || [] { + char const* v = std::getenv("POS2GPU_PHASE_TIMING"); + return v && v[0] == '1'; + }(); + using phase_clock = std::chrono::steady_clock; + std::vector> phase_starts; + std::vector> phase_records; + auto begin_phase = [&](char const* label) -> int { + if (!phase_timing) return -1; + q.wait(); + phase_starts.emplace_back(label, phase_clock::now()); + return static_cast(phase_starts.size() - 1); + }; + auto end_phase = [&](int idx) { + if (idx < 0) return; + q.wait(); + auto const t1 = phase_clock::now(); + auto const& [name, t0] = phase_starts[idx]; + double const ms = std::chrono::duration(t1 - t0).count(); + phase_records.emplace_back(name, ms); + }; + auto report_phases = [&]() { + if (!phase_timing || phase_records.empty()) return; + double total = 0.0; + for (auto const& [_n, ms] : phase_records) total += ms; + std::fprintf(stderr, "[phase-timing]"); + for (auto const& [name, ms] : phase_records) { + std::fprintf(stderr, " %s=%.1fms(%.0f%%)", + name, ms, total > 0.0 ? 100.0 * ms / total : 0.0); + } + std::fprintf(stderr, " total=%.1fms\n", total); + }; + + // --- pipeline-wide tiny allocations --- + // d_counter: per-phase uint64 count output (reused). + // The match kernels each need their own temp-storage buffer sized via + // their size query; we allocate it per-phase rather than globally so + // that the peak VRAM is the phase's alone. + stats.phase = "init"; + uint64_t* d_counter = nullptr; + s_malloc(stats, d_counter, sizeof(uint64_t), "d_counter"); + + // ---------- Phase Xs (stage 4e: inlined gen+sort+pack) ---------- + // launch_construct_xs lumps keys_a/keys_b/vals_a/vals_b into a single + // d_xs_temp blob (~4 GB at k=28). keys_a+vals_a are dead after the + // CUB sort but can't be freed because they're interior slices of a + // single allocation. Inline the three sub-kernels so we can: + // 1. alloc cub_scratch + keys_a + vals_a + // 2. gen fills keys_a, vals_a + // 3. alloc keys_b + vals_b + // 4. CUB sort keys_a/vals_a -> keys_b/vals_b; keys_a/vals_a now dead + // 5. free cub_scratch + keys_a + vals_a <- 2078 MB freed + // 6. alloc d_xs + // 7. pack keys_b/vals_b -> d_xs + // 8. free keys_b + vals_b + // Phase peak at k=28 drops from d_xs (2048) + d_xs_temp (4128) = + // 6176 MB to max(sort 4126 MB, pack 4096 MB) = 4126 MB. + stats.phase = "Xs"; + + AesHashKeys const xs_keys = make_keys(cfg.plot_id.data()); + uint32_t const xs_xor_const = cfg.testnet ? 0xA3B1C4D7u : 0u; + + XsCandidateGpu* d_xs = nullptr; + uint32_t* d_xs_keys_b = nullptr; + uint32_t* d_xs_vals_b = nullptr; + + bool const xs_sliced = !scratch.plain_mode && scratch.gather_tile_count > 1; + + if (!xs_sliced) { + // Compact / plain — full-cap gen+sort+pack (4128 MB sort peak). + size_t xs_cub_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, xs_cub_bytes, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + total_xs, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + + void* d_xs_cub_scratch = nullptr; + uint32_t* d_xs_keys_a = nullptr; + uint32_t* d_xs_vals_a = nullptr; + s_malloc(stats, d_xs_cub_scratch, xs_cub_bytes, "d_xs_cub"); + s_malloc(stats, d_xs_keys_a, total_xs * sizeof(uint32_t), "d_xs_keys_a"); + s_malloc(stats, d_xs_vals_a, total_xs * sizeof(uint32_t), "d_xs_vals_a"); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + // Sentinel-fill keys_a / vals_a head/mid/tail with 0xCD. + uint64_t const off_mid = total_xs / 2; + uint64_t const off_tail = (total_xs >= 16ULL) ? 
total_xs - 16ULL : 0ULL; + q.memset(d_xs_keys_a, 0xCD, 64).wait(); + q.memset(d_xs_keys_a + off_mid, 0xCD, 64).wait(); + q.memset(d_xs_keys_a + off_tail, 0xCD, 64).wait(); + q.memset(d_xs_vals_a, 0xCD, 64).wait(); + q.memset(d_xs_vals_a + off_mid, 0xCD, 64).wait(); + q.memset(d_xs_vals_a + off_tail, 0xCD, 64).wait(); + + // Trivial-kernel sanity: writes 0xDEADBEEF to keys_a[0..16] + // with no LDS / no captured struct / no AES. If this + // produces 0xCDCDCDCD post-launch, AdaptiveCpp's HIP + // submission path is producing no-op stubs for ANY kernel + // — the problem is below our level. If it produces + // 0xDEADBEEF, simple kernels work and the issue is + // specific to the cooperative-LDS / AES kernel pattern. + { + uint32_t* p = d_xs_keys_a; + q.parallel_for( + sycl::nd_range<1>{256, 256}, + [=](sycl::nd_item<1> it) { + size_t idx = it.get_global_id(0); + if (idx < 16) p[idx] = 0xDEADBEEFu; + }).wait(); + uint32_t check[16] = {}; + q.memcpy(check, d_xs_keys_a, 16 * sizeof(uint32_t)).wait(); + bool const ok = (check[0] == 0xDEADBEEFu); + std::fprintf(stderr, + "[t1-debug] trivial kernel test: %s (keys_a[0]=0x%08x)\n", + ok ? "PASS — simple kernels can write" + : "FAIL — kernel writes are not landing", + check[0]); + // Restore sentinel since the trivial kernel overwrote + // the head region. + q.memset(d_xs_keys_a, 0xCD, 64).wait(); + } + + // Dump d_aes_tables[0..16]. Standard AES T0[0] = 0xC66363A5. + // If we see 0xBE / 0xCD here, the T-table USM buffer was + // never populated by aes_tables_device's q.memcpy — kernels + // would then read garbage and produce nothing useful. + { + uint32_t* d_tables = sycl_backend::aes_tables_device(q); + uint32_t aes_check[16] = {}; + q.memcpy(aes_check, d_tables, 16 * sizeof(uint32_t)).wait(); + std::fprintf(stderr, + "[t1-debug] d_aes_tables[0..16] (T0[a] = (2S[a],S[a],S[a],3S[a]) packed LE; T0[0] = 0xa56363c6):\n"); + for (int i = 0; i < 16; ++i) { + std::fprintf(stderr, " [%2d] 0x%08x\n", i, aes_check[i]); + } + } + } + + int p_xs = begin_phase("Xs gen+sort"); + launch_xs_gen(xs_keys, d_xs_keys_a, d_xs_vals_a, total_xs, + cfg.k, xs_xor_const, q); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + uint64_t const sn = (total_xs < 16ULL) ? total_xs : 16ULL; + uint64_t const off_mid = total_xs / 2; + uint64_t const off_tail = (total_xs >= 16ULL) ? 
total_xs - 16ULL : 0ULL; + uint32_t ka_h[16] = {}, va_h[16] = {}; + uint32_t ka_m[16] = {}, va_m[16] = {}; + uint32_t ka_t[16] = {}, va_t[16] = {}; + q.memcpy(ka_h, d_xs_keys_a, sn * sizeof(uint32_t)).wait(); + q.memcpy(va_h, d_xs_vals_a, sn * sizeof(uint32_t)).wait(); + q.memcpy(ka_m, d_xs_keys_a + off_mid, sn * sizeof(uint32_t)).wait(); + q.memcpy(va_m, d_xs_vals_a + off_mid, sn * sizeof(uint32_t)).wait(); + q.memcpy(ka_t, d_xs_keys_a + off_tail, sn * sizeof(uint32_t)).wait(); + q.memcpy(va_t, d_xs_vals_a + off_tail, sn * sizeof(uint32_t)).wait(); + std::fprintf(stderr, + "[t1-debug] post-xs_gen total_xs=%llu (head idx=0, mid idx=%llu, tail idx=%llu):\n", + (unsigned long long)total_xs, + (unsigned long long)off_mid, (unsigned long long)off_tail); + for (uint64_t i = 0; i < sn; ++i) { + std::fprintf(stderr, + " H[%2llu] ka=0x%08x va=0x%08x M[%2llu] ka=0x%08x va=0x%08x T[%2llu] ka=0x%08x va=0x%08x\n", + (unsigned long long)i, ka_h[i], va_h[i], + (unsigned long long)(off_mid + i), ka_m[i], va_m[i], + (unsigned long long)(off_tail + i), ka_t[i], va_t[i]); + } + } + + s_malloc(stats, d_xs_keys_b, total_xs * sizeof(uint32_t), "d_xs_keys_b"); + s_malloc(stats, d_xs_vals_b, total_xs * sizeof(uint32_t), "d_xs_vals_b"); + + launch_sort_pairs_u32_u32( + d_xs_cub_scratch, xs_cub_bytes, + d_xs_keys_a, d_xs_keys_b, + d_xs_vals_a, d_xs_vals_b, + total_xs, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + end_phase(p_xs); + + s_free(stats, d_xs_cub_scratch); + s_free(stats, d_xs_keys_a); + s_free(stats, d_xs_vals_a); + + s_malloc(stats, d_xs, total_xs * sizeof(XsCandidateGpu), "d_xs"); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + uint64_t const sn = (total_xs < 16ULL) ? total_xs : 16ULL; + uint64_t const off_mid = total_xs / 2; + uint64_t const off_tail = (total_xs >= 16ULL) ? total_xs - 16ULL : 0ULL; + uint32_t kb_h[16] = {}, vb_h[16] = {}; + uint32_t kb_m[16] = {}, vb_m[16] = {}; + uint32_t kb_t[16] = {}, vb_t[16] = {}; + q.memcpy(kb_h, d_xs_keys_b, sn * sizeof(uint32_t)).wait(); + q.memcpy(vb_h, d_xs_vals_b, sn * sizeof(uint32_t)).wait(); + q.memcpy(kb_m, d_xs_keys_b + off_mid, sn * sizeof(uint32_t)).wait(); + q.memcpy(vb_m, d_xs_vals_b + off_mid, sn * sizeof(uint32_t)).wait(); + q.memcpy(kb_t, d_xs_keys_b + off_tail, sn * sizeof(uint32_t)).wait(); + q.memcpy(vb_t, d_xs_vals_b + off_tail, sn * sizeof(uint32_t)).wait(); + std::fprintf(stderr, + "[t1-debug] post-xs_sort total_xs=%llu (head idx=0, mid idx=%llu, tail idx=%llu):\n", + (unsigned long long)total_xs, + (unsigned long long)off_mid, (unsigned long long)off_tail); + for (uint64_t i = 0; i < sn; ++i) { + std::fprintf(stderr, + " H[%2llu] kb=0x%08x vb=0x%08x M[%2llu] kb=0x%08x vb=0x%08x T[%2llu] kb=0x%08x vb=0x%08x\n", + (unsigned long long)i, kb_h[i], vb_h[i], + (unsigned long long)(off_mid + i), kb_m[i], vb_m[i], + (unsigned long long)(off_tail + i), kb_t[i], vb_t[i]); + } + } + + int p_xs_pack = begin_phase("Xs pack"); + launch_xs_pack(d_xs_keys_b, d_xs_vals_b, d_xs, total_xs, q); + end_phase(p_xs_pack); + + s_free(stats, d_xs_keys_b); + s_free(stats, d_xs_vals_b); + } else { + // Sliced (minimal). Tile gen+sort in N=2 position halves into + // cap/2 device buffers, D2H per tile to USM-host. Then merge + // host-pinned tile outputs into device d_xs_keys_b + d_xs_vals_b + // (full cap). Then pack in N=2 halves with D2H per tile to a + // host-pinned XsCandidateGpu accumulator. Finally rehydrate + // d_xs from host pinned. Drops sort peak from 4128 MB → 2056 MB + // and pack peak from 4096 MB → 3072 MB at k=28. 
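// Arithmetic behind those peak figures at k=28 (total_xs = 2^28, u32 = 4 B,
// XsCandidateGpu = 8 B; the small remainders are the implied CUB scratch):
//   full sort : 4 x total_xs*4 B                         = 4096 MB, ~4128 MB with scratch
//   tiled sort: 4 x (total_xs/2)*4 B                     = 2048 MB, ~2056 MB with scratch
//   full pack : d_xs 2048 MB + keys_b/vals_b 2048 MB     = 4096 MB
//   tiled pack: keys_b/vals_b 2048 MB + pack tile 1024 MB = 3072 MB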
+ uint64_t const xs_tile_n0 = total_xs / 2; + uint64_t const xs_tile_n1 = total_xs - xs_tile_n0; + uint64_t const xs_tile_max = (xs_tile_n0 > xs_tile_n1) ? xs_tile_n0 : xs_tile_n1; + + size_t xs_cub_tile_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, xs_cub_tile_bytes, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + xs_tile_max, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + + void* d_xs_cub_scratch = nullptr; + uint32_t* d_xs_keys_a_tile = nullptr; + uint32_t* d_xs_vals_a_tile = nullptr; + uint32_t* d_xs_keys_b_tile = nullptr; + uint32_t* d_xs_vals_b_tile = nullptr; + s_malloc(stats, d_xs_keys_a_tile, xs_tile_max * sizeof(uint32_t), "d_xs_keys_a_tile"); + s_malloc(stats, d_xs_vals_a_tile, xs_tile_max * sizeof(uint32_t), "d_xs_vals_a_tile"); + s_malloc(stats, d_xs_keys_b_tile, xs_tile_max * sizeof(uint32_t), "d_xs_keys_b_tile"); + s_malloc(stats, d_xs_vals_b_tile, xs_tile_max * sizeof(uint32_t), "d_xs_vals_b_tile"); + s_malloc(stats, d_xs_cub_scratch, xs_cub_tile_bytes, "d_xs_cub"); + + uint32_t* h_xs_keys = static_cast( + sycl::malloc_host(total_xs * sizeof(uint32_t), q)); + if (!h_xs_keys) throw std::runtime_error("sycl::malloc_host(h_xs_keys) failed"); + uint32_t* h_xs_vals = static_cast( + sycl::malloc_host(total_xs * sizeof(uint32_t), q)); + if (!h_xs_vals) throw std::runtime_error("sycl::malloc_host(h_xs_vals) failed"); + + int p_xs = begin_phase("Xs gen+sort"); + auto run_tile = [&](uint64_t pos_begin, uint64_t pos_end, uint64_t out_offset) { + uint64_t tile_n = pos_end - pos_begin; + if (tile_n == 0) return; + launch_xs_gen_range( + xs_keys, d_xs_keys_a_tile, d_xs_vals_a_tile, + pos_begin, pos_end, cfg.k, xs_xor_const, q); + launch_sort_pairs_u32_u32( + d_xs_cub_scratch, xs_cub_tile_bytes, + d_xs_keys_a_tile, d_xs_keys_b_tile, + d_xs_vals_a_tile, d_xs_vals_b_tile, + tile_n, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + q.memcpy(h_xs_keys + out_offset, d_xs_keys_b_tile, + tile_n * sizeof(uint32_t)).wait(); + q.memcpy(h_xs_vals + out_offset, d_xs_vals_b_tile, + tile_n * sizeof(uint32_t)).wait(); + }; + run_tile(0, xs_tile_n0, 0); + run_tile(xs_tile_n0, total_xs, xs_tile_n0); + end_phase(p_xs); + + s_free(stats, d_xs_cub_scratch); + s_free(stats, d_xs_vals_b_tile); + s_free(stats, d_xs_keys_b_tile); + s_free(stats, d_xs_vals_a_tile); + s_free(stats, d_xs_keys_a_tile); + + // Full-cap merge outputs on device. Merge from USM-host inputs. + s_malloc(stats, d_xs_keys_b, total_xs * sizeof(uint32_t), "d_xs_keys_b"); + s_malloc(stats, d_xs_vals_b, total_xs * sizeof(uint32_t), "d_xs_vals_b"); + launch_merge_pairs_stable_2way_u32_u32( + h_xs_keys + 0, h_xs_vals + 0, xs_tile_n0, + h_xs_keys + xs_tile_n0, h_xs_vals + xs_tile_n0, xs_tile_n1, + d_xs_keys_b, d_xs_vals_b, total_xs, q); + sycl::free(h_xs_keys, q); + sycl::free(h_xs_vals, q); + + // Tiled pack. d_xs_pack_tile (cap/2 × XsCandidate = 1024 MB + // at k=28) reuses across tiles; the packed output collects on + // host pinned h_xs (cap × XsCandidate = 2048 MB host). + uint64_t const pack_tile_n0 = total_xs / 2; + uint64_t const pack_tile_n1 = total_xs - pack_tile_n0; + uint64_t const pack_tile_max = (pack_tile_n0 > pack_tile_n1) ? 
pack_tile_n0 : pack_tile_n1; + + XsCandidateGpu* d_xs_pack_tile = nullptr; + s_malloc(stats, d_xs_pack_tile, pack_tile_max * sizeof(XsCandidateGpu), "d_xs_pack_tile"); + + XsCandidateGpu* h_xs = static_cast( + sycl::malloc_host(total_xs * sizeof(XsCandidateGpu), q)); + if (!h_xs) throw std::runtime_error("sycl::malloc_host(h_xs) failed"); + + int p_xs_pack = begin_phase("Xs pack"); + if (pack_tile_n0 > 0) { + launch_xs_pack_range(d_xs_keys_b + 0, d_xs_vals_b + 0, + d_xs_pack_tile, pack_tile_n0, q); + q.memcpy(h_xs + 0, d_xs_pack_tile, + pack_tile_n0 * sizeof(XsCandidateGpu)).wait(); + } + if (pack_tile_n1 > 0) { + launch_xs_pack_range(d_xs_keys_b + pack_tile_n0, + d_xs_vals_b + pack_tile_n0, + d_xs_pack_tile, pack_tile_n1, q); + q.memcpy(h_xs + pack_tile_n0, d_xs_pack_tile, + pack_tile_n1 * sizeof(XsCandidateGpu)).wait(); + } + end_phase(p_xs_pack); + + s_free(stats, d_xs_pack_tile); + s_free(stats, d_xs_keys_b); + s_free(stats, d_xs_vals_b); + d_xs_keys_b = nullptr; + d_xs_vals_b = nullptr; + + // Re-hydrate full d_xs on device from host pinned. + s_malloc(stats, d_xs, total_xs * sizeof(XsCandidateGpu), "d_xs"); + q.memcpy(d_xs, h_xs, total_xs * sizeof(XsCandidateGpu)).wait(); + sycl::free(h_xs, q); + } + + // ---------- Phase T1 match ---------- + // SoA output: meta (uint64) + mi (uint32). Same 12 B/pair as the old + // AoS struct, but the two streams can be freed independently — we + // drop d_t1_mi as soon as CUB consumes it in the T1 sort phase. + // + // Minimal mode (gather_tile_count > 1) splits T1 match into N= + // num_sections passes (one per section_l) with cap/N staging + // outputs that are D2H'd to host pinned per pass — keeps d_xs + + // d_t1_meta + d_t1_mi from being co-resident at full-cap. Drops + // the T1 match peak from + // d_xs (2048) + d_t1_meta (2080) + d_t1_mi (1040) = 5168 MB + // to + // d_xs (2048) + d_t1_meta_stage (cap/N × 8) + + // d_t1_mi_stage (cap/N × 4) = ~2870 MB at k=28 N=4. + // + // d_t1_meta + d_t1_mi (full cap) are then re-allocated on device + // for T1 sort, with the data H2D'd from host pinned. d_t1_meta + // stays parked on h_t1_meta across T1 sort exactly as in compact + // mode (the existing park dance is skipped — data is already on + // host). + bool const t1_match_sliced = !scratch.plain_mode && scratch.gather_tile_count > 1; + + stats.phase = "T1 match"; + auto t1p = make_t1_params(cfg.k, cfg.strength); + size_t t1_temp_bytes = 0; + launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, + nullptr, nullptr, d_counter, cap, + nullptr, &t1_temp_bytes, q); + + uint64_t* d_t1_meta = nullptr; + uint32_t* d_t1_mi = nullptr; + void* d_t1_match_temp = nullptr; + + // Lift h_t1_meta / h_t1_mi out of the T1 sort scope so the sliced + // T1 match path can populate them directly. h_t1_mi is sliced-only + // — it's freed in T1 sort once CUB has consumed the H2D'd copy. + bool const h_meta_owned = (!scratch.plain_mode && scratch.h_meta == nullptr); + uint64_t* h_t1_meta = nullptr; + bool h_t1_mi_owned = false; + uint32_t* h_t1_mi = nullptr; + + uint64_t t1_count = 0; + + if (!t1_match_sliced) { + // Single-shot path (compact / plain): d_t1_meta + d_t1_mi + // allocated full-cap on device. + s_malloc(stats, d_t1_meta, cap * sizeof(uint64_t), "d_t1_meta"); + s_malloc(stats, d_t1_mi, cap * sizeof(uint32_t), "d_t1_mi"); + s_malloc(stats, d_t1_match_temp, t1_temp_bytes, "d_t1_match_temp"); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + uint64_t const sample_n = (total_xs < 16ULL) ? 
total_xs : 16ULL; + XsCandidateGpu sample[16] = {}; + q.memcpy(sample, d_xs, sample_n * sizeof(XsCandidateGpu)).wait(); + std::fprintf(stderr, + "[t1-debug] plain pre-launch k=%d total_xs=%llu cap=%llu d_xs[0..%llu]:\n", + cfg.k, (unsigned long long)total_xs, + (unsigned long long)cap, (unsigned long long)sample_n); + for (uint64_t i = 0; i < sample_n; ++i) { + std::fprintf(stderr, + " [%2llu] match_info=0x%08x x=0x%08x\n", + (unsigned long long)i, sample[i].match_info, sample[i].x); + } + } + + int p_t1 = begin_phase("T1 match"); + q.memset(d_counter, 0, sizeof(uint64_t)); + launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, + d_t1_meta, d_t1_mi, d_counter, cap, + d_t1_match_temp, &t1_temp_bytes, q); + end_phase(p_t1); + + q.memcpy(&t1_count, d_counter, sizeof(uint64_t)).wait(); + if (t1_count > cap) throw std::runtime_error("T1 overflow"); + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + std::fprintf(stderr, + "[t1-debug] plain post-launch t1_count=%llu\n", + (unsigned long long)t1_count); + } + validate_t1_count(t1_count, cfg.k); + + s_free(stats, d_t1_match_temp); + s_free(stats, d_xs); + } else { + // Sliced path (minimal): N=num_sections passes with cap/N + // staging buffers. Output accumulates on host pinned, then + // d_t1_mi + h_t1_meta receive their final populations after + // d_xs is freed. + uint32_t const t1_num_sections = 1u << t1p.num_section_bits; + uint32_t const t1_num_match_keys = 1u << t1p.num_match_key_bits; + // 25% safety over the per-section average expected output. + uint64_t const t1_section_cap = + ((cap + t1_num_sections - 1) / t1_num_sections) * 5ULL / 4ULL; + + s_malloc(stats, d_t1_match_temp, t1_temp_bytes, "d_t1_match_temp"); + + // Compute bucket + fine-bucket offsets once; passes share them. + // Also zeros d_counter. + launch_t1_match_prepare(cfg.plot_id.data(), t1p, d_xs, total_xs, + d_counter, d_t1_match_temp, &t1_temp_bytes, q); + + // Host pinned full-cap accumulators for meta + mi. + h_t1_meta = h_meta_owned + ? static_cast(sycl::malloc_host(cap * sizeof(uint64_t), q)) + : scratch.h_meta; + if (!h_t1_meta) throw std::runtime_error("sycl::malloc_host(h_t1_meta) failed"); + h_t1_mi_owned = true; + h_t1_mi = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_t1_mi) throw std::runtime_error("sycl::malloc_host(h_t1_mi) failed"); + + // Per-pass staging device buffers (cap/N). + uint64_t* d_t1_meta_stage = nullptr; + uint32_t* d_t1_mi_stage = nullptr; + s_malloc(stats, d_t1_meta_stage, t1_section_cap * sizeof(uint64_t), "d_t1_meta_stage"); + s_malloc(stats, d_t1_mi_stage, t1_section_cap * sizeof(uint32_t), "d_t1_mi_stage"); + + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + uint64_t const sample_n = (total_xs < 16ULL) ? 
total_xs : 16ULL; + XsCandidateGpu sample[16] = {}; + q.memcpy(sample, d_xs, sample_n * sizeof(XsCandidateGpu)).wait(); + std::fprintf(stderr, + "[t1-debug] sliced pre-launch k=%d total_xs=%llu cap=%llu d_xs[0..%llu]:\n", + cfg.k, (unsigned long long)total_xs, + (unsigned long long)cap, (unsigned long long)sample_n); + for (uint64_t i = 0; i < sample_n; ++i) { + std::fprintf(stderr, + " [%2llu] match_info=0x%08x x=0x%08x\n", + (unsigned long long)i, sample[i].match_info, sample[i].x); + } + } + + int p_t1 = begin_phase("T1 match"); + uint64_t host_offset = 0; + for (uint32_t section_l = 0; section_l < t1_num_sections; ++section_l) { + uint32_t const bucket_begin = section_l * t1_num_match_keys; + uint32_t const bucket_end = (section_l + 1) * t1_num_match_keys; + + launch_t1_match_range( + cfg.plot_id.data(), t1p, d_xs, total_xs, + d_t1_meta_stage, d_t1_mi_stage, d_counter, t1_section_cap, + d_t1_match_temp, bucket_begin, bucket_end, q); + + uint64_t pass_count = 0; + q.memcpy(&pass_count, d_counter, sizeof(uint64_t)).wait(); + if (pass_count > t1_section_cap) { + throw std::runtime_error( + "T1 match (sliced) section_l=" + std::to_string(section_l) + + " produced " + std::to_string(pass_count) + + " pairs, staging holds " + std::to_string(t1_section_cap) + + ". Increase t1_section_cap safety factor."); + } + q.memcpy(h_t1_meta + host_offset, d_t1_meta_stage, + pass_count * sizeof(uint64_t)).wait(); + q.memcpy(h_t1_mi + host_offset, d_t1_mi_stage, + pass_count * sizeof(uint32_t)).wait(); + host_offset += pass_count; + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + } + end_phase(p_t1); + + t1_count = host_offset; + if (t1_count > cap) throw std::runtime_error("T1 overflow"); + if (char const* v = std::getenv("POS2GPU_T1_DEBUG"); v && v[0] == '1') { + std::fprintf(stderr, + "[t1-debug] sliced post-launch t1_count=%llu (sum across %u sections)\n", + (unsigned long long)t1_count, t1_num_sections); + } + validate_t1_count(t1_count, cfg.k); + + s_free(stats, d_t1_meta_stage); + s_free(stats, d_t1_mi_stage); + s_free(stats, d_t1_match_temp); + + // Xs fully consumed. + s_free(stats, d_xs); + + // Re-hydrate d_t1_mi full-cap on device for T1 sort (CUB + // sort key input). h_t1_meta stays on host across T1 sort. + s_malloc(stats, d_t1_mi, cap * sizeof(uint32_t), "d_t1_mi"); + q.memcpy(d_t1_mi, h_t1_mi, t1_count * sizeof(uint32_t)).wait(); + if (h_t1_mi_owned) sycl::free(h_t1_mi, q); + h_t1_mi = nullptr; + // d_t1_meta stays nullptr — h_t1_meta has the data; the + // existing T1-sort park block will see d_t1_meta == nullptr + // and skip the d_t1_meta → h_t1_meta memcpy. + } + + // Stage 4b (compact only): park d_t1_meta on pinned host across + // the T1 sort phase. d_t1_meta is only needed again for + // launch_gather_u64 at the end of T1 sort — holding it alive + // through CUB setup was responsible for the 6256 MB overall + // streaming peak (d_t1_meta 2080 + d_t1_mi 1040 + CUB working 3120 + // + scratch). JIT H2D before the gather below, free right after. + // Mirror of stage 4a for T2. + // + // Stage 4f: use caller-provided scratch when present (amortised + // across batch); fall back to per-plot malloc_host otherwise. Same + // pattern applied to h_t1_keys_merged, h_t2_*, h_t3 below. + // + // Plain mode skips the park entirely: d_t1_meta stays live through + // T1 sort. Costs ~2 GB peak but saves a PCIe round-trip. + // + // Sliced mode: h_t1_meta was already populated by the T1 match + // passes — d_t1_meta is nullptr and the park dance is skipped + // here. 
h_meta_owned + h_t1_meta were declared above (lifted out + // of the original T1-sort scope) so the rest of T1 sort sees the + // same variables in both paths. + if (!scratch.plain_mode && !t1_match_sliced) { + h_t1_meta = h_meta_owned + ? static_cast(sycl::malloc_host(cap * sizeof(uint64_t), q)) + : scratch.h_meta; + if (!h_t1_meta) throw std::runtime_error("sycl::malloc_host(h_t1_meta) failed"); + q.memcpy(h_t1_meta, d_t1_meta, t1_count * sizeof(uint64_t)).wait(); + s_free(stats, d_t1_meta); + d_t1_meta = nullptr; + } + + // ---------- Phase T1 sort (tiled, N=2) ---------- + // Partition T1 into two halves by index, CUB-sort each with scratch + // sized for the larger half, then stable 2-way merge the sorted runs + // back into the extract-input slot (d_keys_in / d_vals_in) — that + // slot is free because the CUB sort has already consumed it. + // + // N=2 is the minimal case that exercises the tile + merge path; a + // larger N shrinks per-tile CUB scratch further but needs a multi- + // way merge or a tree of pairwise merges. Phase 6 can bump N once + // Phase 4's k=28 VRAM measurement shows how tight the budget is. + uint64_t const t1_tile_n0 = t1_count / 2; + uint64_t const t1_tile_n1 = t1_count - t1_tile_n0; + uint64_t const t1_tile_max = (t1_tile_n0 > t1_tile_n1) ? t1_tile_n0 : t1_tile_n1; + + size_t t1_sort_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, t1_sort_bytes, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + t1_tile_max, 0, cfg.k, q); + + stats.phase = "T1 sort"; + // With T1 SoA emission, d_t1_mi IS the CUB key input. We only need + // d_keys_out (CUB sort output), d_vals_in (identity) + d_vals_out + // (sorted vals). d_t1_mi is freed as soon as CUB consumes it. + // + // Compact / plain: full-cap d_keys_out + d_vals_in + d_vals_out + // (1040 MB each at k=28); plus d_t1_mi (1040, full-cap input) + + // scratch ≈ 4176 MB peak. + // + // Minimal: per-tile cap/2 output buffers (520 each) instead of + // full-cap + USM-host h_keys/h_vals to collect tile outputs + + // launch_merge_pairs_stable_2way_u32_u32 reading USM-host inputs. + // Drops T1 sort CUB peak to: + // d_t1_mi (1040) + 3 × cap/2 u32 (1560) + scratch ≈ 2616 MB. + void* d_sort_scratch = nullptr; + uint32_t* d_keys_out = nullptr; // populated in compact path; minimal uses h_keys instead + uint32_t* d_vals_in = nullptr; // T2 sort below also uses this; declared at wider scope + uint32_t* d_vals_out = nullptr; // populated in compact path; minimal uses h_vals instead + uint32_t* h_keys = nullptr; // USM-host, sliced path only + uint32_t* h_vals = nullptr; // USM-host, sliced path only + + int p_t1_sort = begin_phase("T1 sort"); + + if (!t1_match_sliced) { + // Compact / plain — existing full-cap path. 
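// Live set in this branch at k=28, matching the ~4176 MB figure quoted
// above: d_t1_mi (1040 MB, the sort key input) plus d_keys_out, d_vals_in
// and d_vals_out (1040 MB each) plus the per-tile CUB scratch; d_vals_in
// and d_t1_mi are freed as soon as both tile sorts have consumed them.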
+ s_malloc(stats, d_keys_out, cap * sizeof(uint32_t), "d_keys_out"); + s_malloc(stats, d_vals_in, cap * sizeof(uint32_t), "d_vals_in"); + s_malloc(stats, d_vals_out, cap * sizeof(uint32_t), "d_vals_out"); + s_malloc(stats, d_sort_scratch, t1_sort_bytes, "d_sort_scratch(t1)"); + + launch_init_u32_identity(d_vals_in, t1_count, q); + if (t1_tile_n0 > 0) { + launch_sort_pairs_u32_u32( + d_sort_scratch, t1_sort_bytes, + d_t1_mi + 0, d_keys_out + 0, + d_vals_in + 0, d_vals_out + 0, + t1_tile_n0, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + } + if (t1_tile_n1 > 0) { + launch_sort_pairs_u32_u32( + d_sort_scratch, t1_sort_bytes, + d_t1_mi + t1_tile_n0, d_keys_out + t1_tile_n0, + d_vals_in + t1_tile_n0, d_vals_out + t1_tile_n0, + t1_tile_n1, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + } + + s_free(stats, d_sort_scratch); + s_free(stats, d_vals_in); + s_free(stats, d_t1_mi); + } else { + // Sliced — per-tile cap/2 output buffers, D2H to USM-host. + uint32_t* d_keys_out_tile = nullptr; + uint32_t* d_vals_in_tile = nullptr; + uint32_t* d_vals_out_tile = nullptr; + s_malloc(stats, d_keys_out_tile, t1_tile_max * sizeof(uint32_t), "d_t1_keys_out_tile"); + s_malloc(stats, d_vals_in_tile, t1_tile_max * sizeof(uint32_t), "d_t1_vals_in_tile"); + s_malloc(stats, d_vals_out_tile, t1_tile_max * sizeof(uint32_t), "d_t1_vals_out_tile"); + s_malloc(stats, d_sort_scratch, t1_sort_bytes, "d_sort_scratch(t1)"); + + h_keys = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_keys) throw std::runtime_error("sycl::malloc_host(h_keys t1) failed"); + h_vals = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_vals) throw std::runtime_error("sycl::malloc_host(h_vals t1) failed"); + + auto run_tile = [&](uint64_t tile_off, uint64_t tile_n) { + if (tile_n == 0) return; + uint32_t const off32 = static_cast(tile_off); + uint32_t* d_vals_in_tile_local = d_vals_in_tile; + q.parallel_for( + sycl::range<1>{ static_cast(tile_n) }, + [=](sycl::id<1> i) { + d_vals_in_tile_local[i] = off32 + uint32_t(i); + }).wait(); + launch_sort_pairs_u32_u32( + d_sort_scratch, t1_sort_bytes, + d_t1_mi + tile_off, d_keys_out_tile, + d_vals_in_tile, d_vals_out_tile, + tile_n, /*begin_bit=*/0, /*end_bit=*/cfg.k, q); + q.memcpy(h_keys + tile_off, d_keys_out_tile, + tile_n * sizeof(uint32_t)).wait(); + q.memcpy(h_vals + tile_off, d_vals_out_tile, + tile_n * sizeof(uint32_t)).wait(); + }; + run_tile(0, t1_tile_n0); + run_tile(t1_tile_n0, t1_tile_n1); + + s_free(stats, d_sort_scratch); + s_free(stats, d_vals_out_tile); + s_free(stats, d_vals_in_tile); + s_free(stats, d_keys_out_tile); + s_free(stats, d_t1_mi); + } + + // 3-pass post-CUB (merge → gather meta) — same shape as T2 sort, + // but T1 only has one gather stream (meta) so it's 2 passes here. + uint32_t* d_t1_keys_merged = nullptr; + uint32_t* d_t1_merged_vals = nullptr; + s_malloc(stats, d_t1_keys_merged, cap * sizeof(uint32_t), "d_t1_keys_merged"); + s_malloc(stats, d_t1_merged_vals, cap * sizeof(uint32_t), "d_t1_merged_vals"); + + if (!t1_match_sliced) { + launch_merge_pairs_stable_2way_u32_u32( + d_keys_out + 0, d_vals_out + 0, t1_tile_n0, + d_keys_out + t1_tile_n0, d_vals_out + t1_tile_n0, t1_tile_n1, + d_t1_keys_merged, d_t1_merged_vals, t1_count, q); + s_free(stats, d_keys_out); + s_free(stats, d_vals_out); + } else { + // Merge inputs are USM-host; the kernel reads via PCIe (sequential + // 2-way merge → bandwidth-bound, ~3.27 GB at k=28 / ~25 GB/s ≈ + // 130 ms). 
Live device set during merge is just the two cap-sized + // output buffers (d_t1_keys_merged + d_t1_merged_vals = 2080 MB). + launch_merge_pairs_stable_2way_u32_u32( + h_keys + 0, h_vals + 0, t1_tile_n0, + h_keys + t1_tile_n0, h_vals + t1_tile_n0, t1_tile_n1, + d_t1_keys_merged, d_t1_merged_vals, t1_count, q); + sycl::free(h_keys, q); h_keys = nullptr; + sycl::free(h_vals, q); h_vals = nullptr; + } + + // Stage 4c (compact only): d_t1_keys_merged is not used by the + // gather below (gather uses d_t1_merged_vals for indices); it is + // only consumed by T2 match as the "d_sorted_mi" input. Park it on + // pinned host across the gather peak so the 1040 MB doesn't coexist + // with d_t1_merged_vals + d_t1_meta + d_t1_meta_sorted. H2D'd back + // at T2 match entry. + // + // Plain mode keeps d_t1_keys_merged live across the gather peak. + bool const h_keys_owned = (!scratch.plain_mode && scratch.h_keys_merged == nullptr); + uint32_t* h_t1_keys_merged = nullptr; + if (!scratch.plain_mode) { + h_t1_keys_merged = h_keys_owned + ? static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)) + : scratch.h_keys_merged; + if (!h_t1_keys_merged) throw std::runtime_error("sycl::malloc_host(h_t1_keys_merged) failed"); + q.memcpy(h_t1_keys_merged, d_t1_keys_merged, t1_count * sizeof(uint32_t)).wait(); + s_free(stats, d_t1_keys_merged); + d_t1_keys_merged = nullptr; + } + + // Stage 4b (compact only): JIT H2D d_t1_meta back onto the device + // for the gather, then free it immediately. Peak during this window: + // d_t1_keys_merged (1040) + d_t1_merged_vals (1040) + // + d_t1_meta (2080 H2D) + d_t1_meta_sorted (2080 populated) + // = 6240 MB — same as T2 sort's gather peak, and no longer the + // overall bottleneck on its own. + // + // Plain mode: d_t1_meta is already live (never parked). + int const t1_gather_N = scratch.plain_mode ? 1 : scratch.gather_tile_count; + if (!scratch.plain_mode) { + s_malloc(stats, d_t1_meta, cap * sizeof(uint64_t), "d_t1_meta"); + q.memcpy(d_t1_meta, h_t1_meta, t1_count * sizeof(uint64_t)).wait(); + // With gather_tile_count > 1 we reuse h_t1_meta to stage the + // sorted output (overwriting the unsorted data we just + // rehydrated from); defer the free until after the H2D rebuild. + if (t1_gather_N <= 1) { + if (h_meta_owned) sycl::free(h_t1_meta, q); + h_t1_meta = nullptr; + } + } + + uint64_t* d_t1_meta_sorted = nullptr; + if (t1_gather_N <= 1) { + s_malloc(stats, d_t1_meta_sorted, cap * sizeof(uint64_t), "d_t1_meta_sorted"); + launch_gather_u64(d_t1_meta, d_t1_merged_vals, d_t1_meta_sorted, t1_count, q); + end_phase(p_t1_sort); + s_free(stats, d_t1_meta); + s_free(stats, d_t1_merged_vals); + } else { + // Tiled-output gather (minimal tier). Produce the sorted output + // in N tiles, D2H each tile to h_t1_meta (overwriting the + // unsorted data we just rehydrated from), then free the inputs + // and rebuild the full d_t1_meta_sorted on device. Peak during + // gather drops from + // d_t1_meta (2080) + d_t1_merged_vals (1040) + // + d_t1_meta_sorted (2080) = 5200 MB + // to + // d_t1_meta (2080) + d_t1_merged_vals (1040) + // + d_tile (cap/N × u64 = 520 at N=4) = ~3640 MB. 
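+      // tile_max is a ceil-divide so the t1_gather_N tiles always cover
+      // t1_count; the last tile may be short (t1_count - tile_off) and
+      // the loop breaks early once tile_off runs past t1_count (can
+      // happen for tiny test plots where t1_count < t1_gather_N).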
+ uint64_t const tile_max = + (t1_count + uint64_t(t1_gather_N) - 1) / uint64_t(t1_gather_N); + uint64_t* d_tile = nullptr; + s_malloc(stats, d_tile, tile_max * sizeof(uint64_t), "d_t1_meta_sorted_tile"); + for (int n = 0; n < t1_gather_N; ++n) { + uint64_t const tile_off = uint64_t(n) * tile_max; + if (tile_off >= t1_count) break; + uint64_t const tile_n = std::min(tile_max, t1_count - tile_off); + launch_gather_u64( + d_t1_meta, d_t1_merged_vals + tile_off, + d_tile, tile_n, q); + q.memcpy(h_t1_meta + tile_off, d_tile, + tile_n * sizeof(uint64_t)).wait(); + } + s_free(stats, d_tile); + s_free(stats, d_t1_meta); + s_free(stats, d_t1_merged_vals); + s_malloc(stats, d_t1_meta_sorted, cap * sizeof(uint64_t), "d_t1_meta_sorted"); + q.memcpy(d_t1_meta_sorted, h_t1_meta, t1_count * sizeof(uint64_t)).wait(); + end_phase(p_t1_sort); + if (h_meta_owned) sycl::free(h_t1_meta, q); + h_t1_meta = nullptr; + } + + // Stage 4c (compact only): H2D d_t1_keys_merged back now that T2 + // match (its consumer) is about to start. Pinned host freed after + // H2D. Plain mode: d_t1_keys_merged is already live. + if (!scratch.plain_mode) { + s_malloc(stats, d_t1_keys_merged, cap * sizeof(uint32_t), "d_t1_keys_merged"); + q.memcpy(d_t1_keys_merged, h_t1_keys_merged, t1_count * sizeof(uint32_t)).wait(); + if (h_keys_owned) sycl::free(h_t1_keys_merged, q); + h_t1_keys_merged = nullptr; + } + + // ---------- Phase T2 match ---------- + // Plain mode: single-pass full-cap N=1 match. Device live set + // during match is T1 sorted (3.07 GB at k=28) + full-cap T2 output + // (4.16 GB) ≈ 7.23 GB. No PCIe round-trips. + // + // Compact mode (tiled N=2, D2H per pass): two bucket-range passes + // through half-cap device staging + pinned host accumulators. Match + // live set drops to T1 sorted + half-cap staging ≈ 5.15 GB, at the + // cost of ~70 ms of PCIe per pass. This is stage 3 of C (see + // docs/t2-match-tiling-plan.md). Pool path uses the single-shot + // launch_t2_match — it has the VRAM and doesn't pay the staging + // round-trip cost. + // + // Per-pass compact safety: we expect each half to produce ≤ cap/2 + // pairs because the match output is roughly uniform across bucket + // ids. cap itself has a built-in safety margin (see + // extra_margin_bits in PoolSizing), and typical actual utilisation + // is well under 100 %. If a pass ever exceeds staging capacity we + // throw rather than silently dropping pairs. + stats.phase = "T2 match"; + auto t2p = make_t2_params(cfg.k, cfg.strength); + + // Shared outputs. In plain mode d_t2_meta / d_t2_xbits / d_t2_mi + // all become live full-cap buffers here; the T2 sort / gather + // sections below skip the JIT H2D re-hydrations. In compact mode + // only d_t2_mi is live here (hydrated from the per-plot h_t2_mi), + // and h_t2_meta / h_t2_xbits hold the concatenated outputs on + // pinned host until JIT H2D at the gather site. + uint64_t* d_t2_meta = nullptr; + uint32_t* d_t2_mi = nullptr; + uint32_t* d_t2_xbits = nullptr; + uint64_t t2_count = 0; + uint64_t* h_t2_meta = nullptr; + uint32_t* h_t2_xbits = nullptr; + bool h_xbits_owned = false; + + if (scratch.plain_mode) { + // Plain: one-shot launch_t2_match into full-cap device buffers. 
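+    // launch_t2_match follows the usual two-call size-query protocol:
+    // the first call below passes null in/out pointers and only fills
+    // t2_temp_bytes; the second call, after the full-cap buffers are
+    // allocated, performs the actual match.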
+ size_t t2_temp_bytes = 0; + launch_t2_match(cfg.plot_id.data(), t2p, nullptr, nullptr, t1_count, + nullptr, nullptr, nullptr, d_counter, cap, + nullptr, &t2_temp_bytes, q); + + void* d_t2_match_temp = nullptr; + s_malloc(stats, d_t2_meta, cap * sizeof(uint64_t), "d_t2_meta"); + s_malloc(stats, d_t2_mi, cap * sizeof(uint32_t), "d_t2_mi"); + s_malloc(stats, d_t2_xbits, cap * sizeof(uint32_t), "d_t2_xbits"); + s_malloc(stats, d_t2_match_temp, t2_temp_bytes, "d_t2_match_temp"); + + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + int p_t2 = begin_phase("T2 match"); + launch_t2_match(cfg.plot_id.data(), t2p, + d_t1_meta_sorted, d_t1_keys_merged, t1_count, + d_t2_meta, d_t2_mi, d_t2_xbits, + d_counter, cap, + d_t2_match_temp, &t2_temp_bytes, q); + end_phase(p_t2); + + q.memcpy(&t2_count, d_counter, sizeof(uint64_t)).wait(); + if (t2_count > cap) throw std::runtime_error("T2 overflow"); + + s_free(stats, d_t2_match_temp); + s_free(stats, d_t1_meta_sorted); + s_free(stats, d_t1_keys_merged); + } else { + // Compact: N-tile cap/N staging with pinned-host accumulators. + // N = scratch.t2_tile_count: 2 = compact (~2.3 GB staging at + // k=28); 8 = minimal (~570 MB) for 4 GiB cards. Must be a power + // of 2 ≤ t2_num_buckets so even bucket distribution is exact. + uint32_t const t2_num_buckets = + (1u << t2p.num_section_bits) * (1u << t2p.num_match_key_bits); + int const N = scratch.t2_tile_count; + if (N < 2 || (N & (N - 1)) != 0) { + throw std::runtime_error( + "scratch.t2_tile_count must be a power of 2 ≥ 2 (got " + + std::to_string(N) + ")"); + } + if (static_cast(N) > t2_num_buckets) { + throw std::runtime_error( + "scratch.t2_tile_count " + std::to_string(N) + + " exceeds t2_num_buckets " + std::to_string(t2_num_buckets)); + } + uint64_t const t2_tile_cap = (cap + uint64_t(N) - 1) / uint64_t(N); + + size_t t2_temp_bytes = 0; + launch_t2_match_prepare(cfg.plot_id.data(), t2p, nullptr, t1_count, + d_counter, nullptr, &t2_temp_bytes, q); + + // Tile-cap device staging (reused across all N passes). + uint64_t* d_t2_meta_stage = nullptr; + uint32_t* d_t2_mi_stage = nullptr; + uint32_t* d_t2_xbits_stage = nullptr; + void* d_t2_match_temp = nullptr; + s_malloc(stats, d_t2_meta_stage, t2_tile_cap * sizeof(uint64_t), "d_t2_meta_stage"); + s_malloc(stats, d_t2_mi_stage, t2_tile_cap * sizeof(uint32_t), "d_t2_mi_stage"); + s_malloc(stats, d_t2_xbits_stage, t2_tile_cap * sizeof(uint32_t), "d_t2_xbits_stage"); + s_malloc(stats, d_t2_match_temp, t2_temp_bytes, "d_t2_match_temp"); + + // Full-cap pinned host that will hold the concatenated T2 output. + // Stage 4f: reuse the caller-provided scratch for h_meta / h_xbits + // (amortised across batch). h_t2_mi is still allocated per-plot. + auto alloc_pinned_or_throw = [&](size_t bytes, char const* what) { + void* p = sycl::malloc_host(bytes, q); + if (!p) throw std::runtime_error(std::string("sycl::malloc_host(") + + what + ") failed"); + return p; + }; + h_t2_meta = h_meta_owned + ? static_cast(alloc_pinned_or_throw(cap * sizeof(uint64_t), "h_t2_meta")) + : scratch.h_meta; + uint32_t* h_t2_mi = static_cast( + alloc_pinned_or_throw(cap * sizeof(uint32_t), "h_t2_mi")); + h_xbits_owned = (scratch.h_t2_xbits == nullptr); + h_t2_xbits = h_xbits_owned + ? static_cast(alloc_pinned_or_throw(cap * sizeof(uint32_t), "h_t2_xbits")) + : scratch.h_t2_xbits; + + // Compute bucket + fine-bucket offsets once; both passes share + // them. Also zeroes d_counter. 
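+    // Prepare once, match N times: every launch_t2_match_range pass
+    // below reuses the same offsets (handed around via d_t2_match_temp),
+    // so only d_counter needs resetting between passes.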
+ launch_t2_match_prepare(cfg.plot_id.data(), t2p, + d_t1_keys_merged, t1_count, + d_counter, d_t2_match_temp, &t2_temp_bytes, q); + + auto run_pass_and_stage = [&](uint32_t bucket_begin, uint32_t bucket_end, + uint64_t host_offset) -> uint64_t + { + launch_t2_match_range(cfg.plot_id.data(), t2p, + d_t1_meta_sorted, d_t1_keys_merged, t1_count, + d_t2_meta_stage, d_t2_mi_stage, d_t2_xbits_stage, + d_counter, t2_tile_cap, d_t2_match_temp, + bucket_begin, bucket_end, q); + uint64_t pass_count = 0; + q.memcpy(&pass_count, d_counter, sizeof(uint64_t)).wait(); + if (pass_count > t2_tile_cap) { + throw std::runtime_error( + "T2 match pass overflow: bucket range [" + + std::to_string(bucket_begin) + "," + std::to_string(bucket_end) + + ") produced " + std::to_string(pass_count) + + " pairs, staging holds " + std::to_string(t2_tile_cap) + + " (consider lower N or fall back to compact tier)."); + } + q.memcpy(h_t2_meta + host_offset, d_t2_meta_stage, pass_count * sizeof(uint64_t)); + q.memcpy(h_t2_mi + host_offset, d_t2_mi_stage, pass_count * sizeof(uint32_t)); + q.memcpy(h_t2_xbits + host_offset, d_t2_xbits_stage, pass_count * sizeof(uint32_t)); + q.wait(); + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + return pass_count; + }; + + int p_t2 = begin_phase("T2 match"); + // N evenly-spaced bucket ranges. host_offset accumulates so each + // pass appends to the pinned host buffer behind the prior pass. + t2_count = 0; + for (int pass = 0; pass < N; ++pass) { + uint32_t const bucket_begin = + uint32_t(uint64_t(pass) * t2_num_buckets / uint64_t(N)); + uint32_t const bucket_end = + uint32_t(uint64_t(pass + 1) * t2_num_buckets / uint64_t(N)); + t2_count += run_pass_and_stage(bucket_begin, bucket_end, + /*host_offset=*/t2_count); + } + end_phase(p_t2); + + if (t2_count > cap) throw std::runtime_error("T2 overflow"); + + // Free device staging + T1 sorted + match temp before + // re-allocating the full-cap d_t2_mi that T2 sort expects. + s_free(stats, d_t2_match_temp); + s_free(stats, d_t2_meta_stage); + s_free(stats, d_t2_mi_stage); + s_free(stats, d_t2_xbits_stage); + s_free(stats, d_t1_meta_sorted); + s_free(stats, d_t1_keys_merged); + + // Stage 4a: hydrate full-cap d_t2_mi from h_t2_mi. d_t2_meta + // and d_t2_xbits are NOT hydrated yet — they stay on pinned + // host until their gather calls at the end of T2 sort. + s_malloc(stats, d_t2_mi, cap * sizeof(uint32_t), "d_t2_mi"); + q.memcpy(d_t2_mi, h_t2_mi, t2_count * sizeof(uint32_t)); + q.wait(); + sycl::free(h_t2_mi, q); + } + + // ---------- Phase T2 sort (tiled, N=2) ---------- + // Mirror of T1 sort above — same tile-and-merge shape, but permute + // writes a meta-xbits pair (T2 match output is 16 B, split SoA for + // T3's L1-bound read pattern) instead of plain meta. + // N=4 tiling halves the CUB scratch peak (~1044 MB → ~522 MB at + // k=28), bringing the T2 CUB-alloc peak under 8 GB. Merge is done + // as a tree of three 2-way merges: (0+1)→AB, (2+3)→CD, (AB+CD)→final. + constexpr int kNumT2Tiles = 4; + uint64_t t2_tile_n [kNumT2Tiles]; + uint64_t t2_tile_off[kNumT2Tiles + 1]; + uint64_t const t2_base_tile = t2_count / kNumT2Tiles; + uint64_t t2_rem = t2_count % kNumT2Tiles; + t2_tile_off[0] = 0; + for (int t = 0; t < kNumT2Tiles; ++t) { + t2_tile_n[t] = t2_base_tile + (t2_rem > 0 ? 
1 : 0); + if (t2_rem > 0) --t2_rem; + t2_tile_off[t+1] = t2_tile_off[t] + t2_tile_n[t]; + } + uint64_t t2_tile_max = 0; + for (int t = 0; t < kNumT2Tiles; ++t) + if (t2_tile_n[t] > t2_tile_max) t2_tile_max = t2_tile_n[t]; + + size_t t2_sort_bytes = 0; + launch_sort_pairs_u32_u32( + nullptr, t2_sort_bytes, + static_cast(nullptr), static_cast(nullptr), + static_cast(nullptr), static_cast(nullptr), + t2_tile_max, 0, cfg.k, q); + + stats.phase = "T2 sort"; + // CUB sort key input = d_t2_mi (emitted SoA by T2 match); no extract + // needed, so d_keys_in only needs to hold the merged sorted-MI output + // that downstream T3 match will consume. Allocate it AFTER the CUB + // tile-sort has freed d_t2_mi to keep peak narrow. + // + // Compact / plain: full-cap d_keys_out + d_vals_in + d_vals_out + // (~4168 MB peak with d_t2_mi during tile sort). + // + // Sliced (minimal): per-tile cap/N output buffers + USM-host + // accumulators, then USM-host parking of AB / CD between merge + // tree steps so the final merge sees only its own outputs + + // USM-host inputs (live device ~2080 MB at k=28). Peaks under + // 4 GiB at every step. + + uint64_t const ab_count = t2_tile_n[0] + t2_tile_n[1]; + uint64_t const cd_count = t2_tile_n[2] + t2_tile_n[3]; + + int p_t2_sort = begin_phase("T2 sort"); + + if (!t1_match_sliced) { + // Compact / plain — existing full-cap CUB tile sort. + s_malloc(stats, d_keys_out, cap * sizeof(uint32_t), "d_keys_out"); + s_malloc(stats, d_vals_in, cap * sizeof(uint32_t), "d_vals_in"); + s_malloc(stats, d_vals_out, cap * sizeof(uint32_t), "d_vals_out"); + s_malloc(stats, d_sort_scratch, t2_sort_bytes, "d_sort_scratch(t2)"); + + launch_init_u32_identity(d_vals_in, t2_count, q); + for (int t = 0; t < kNumT2Tiles; ++t) { + if (t2_tile_n[t] == 0) continue; + uint64_t off = t2_tile_off[t]; + launch_sort_pairs_u32_u32( + d_sort_scratch, t2_sort_bytes, + d_t2_mi + off, d_keys_out + off, + d_vals_in + off, d_vals_out + off, + t2_tile_n[t], 0, cfg.k, q); + } + + s_free(stats, d_sort_scratch); + s_free(stats, d_vals_in); + s_free(stats, d_t2_mi); + } else { + // Sliced — per-tile cap/N output, D2H to USM-host h_keys/h_vals. 
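+      // The per-tile identity fill seeds d_vals_in_tile with the tile's
+      // global offset (off32 + i), so the values staged to h_vals are
+      // already global source indices and the merge tree below yields a
+      // permutation over the full (untiled) T2 match output.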
+ uint32_t* d_keys_out_tile = nullptr; + uint32_t* d_vals_in_tile = nullptr; + uint32_t* d_vals_out_tile = nullptr; + s_malloc(stats, d_keys_out_tile, t2_tile_max * sizeof(uint32_t), "d_t2_keys_out_tile"); + s_malloc(stats, d_vals_in_tile, t2_tile_max * sizeof(uint32_t), "d_t2_vals_in_tile"); + s_malloc(stats, d_vals_out_tile, t2_tile_max * sizeof(uint32_t), "d_t2_vals_out_tile"); + s_malloc(stats, d_sort_scratch, t2_sort_bytes, "d_sort_scratch(t2)"); + + h_keys = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_keys) throw std::runtime_error("sycl::malloc_host(h_keys t2) failed"); + h_vals = static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)); + if (!h_vals) throw std::runtime_error("sycl::malloc_host(h_vals t2) failed"); + + for (int t = 0; t < kNumT2Tiles; ++t) { + uint64_t const tile_n = t2_tile_n[t]; + if (tile_n == 0) continue; + uint64_t const tile_off = t2_tile_off[t]; + uint32_t const off32 = static_cast(tile_off); + uint32_t* d_vals_in_tile_local = d_vals_in_tile; + q.parallel_for( + sycl::range<1>{ static_cast(tile_n) }, + [=](sycl::id<1> i) { + d_vals_in_tile_local[i] = off32 + uint32_t(i); + }).wait(); + launch_sort_pairs_u32_u32( + d_sort_scratch, t2_sort_bytes, + d_t2_mi + tile_off, d_keys_out_tile, + d_vals_in_tile, d_vals_out_tile, + tile_n, 0, cfg.k, q); + q.memcpy(h_keys + tile_off, d_keys_out_tile, + tile_n * sizeof(uint32_t)).wait(); + q.memcpy(h_vals + tile_off, d_vals_out_tile, + tile_n * sizeof(uint32_t)).wait(); + } + + s_free(stats, d_sort_scratch); + s_free(stats, d_vals_out_tile); + s_free(stats, d_vals_in_tile); + s_free(stats, d_keys_out_tile); + s_free(stats, d_t2_mi); + } + + // Tree-of-2-way-merges: (tile 0 + tile 1) → AB, (tile 2 + tile 3) → CD, + // then (AB + CD) → final merged stream. + // + // Compact: AB + CD live across the final merge → peak ~4160 MB. + // Sliced: AB and CD parked to USM-host between tree steps so the + // final merge sees only itself + USM-host inputs (~2080 MB peak). + uint32_t* d_AB_keys = nullptr; + uint32_t* d_AB_vals = nullptr; + uint32_t* d_CD_keys = nullptr; + uint32_t* d_CD_vals = nullptr; + uint32_t* h_AB_keys = nullptr; + uint32_t* h_AB_vals = nullptr; + uint32_t* h_CD_keys = nullptr; + uint32_t* h_CD_vals = nullptr; + + if (!t1_match_sliced) { + s_malloc(stats, d_AB_keys, ab_count * sizeof(uint32_t), "d_t2_AB_keys"); + s_malloc(stats, d_AB_vals, ab_count * sizeof(uint32_t), "d_t2_AB_vals"); + s_malloc(stats, d_CD_keys, cd_count * sizeof(uint32_t), "d_t2_CD_keys"); + s_malloc(stats, d_CD_vals, cd_count * sizeof(uint32_t), "d_t2_CD_vals"); + + if (ab_count > 0) { + launch_merge_pairs_stable_2way_u32_u32( + d_keys_out + t2_tile_off[0], d_vals_out + t2_tile_off[0], t2_tile_n[0], + d_keys_out + t2_tile_off[1], d_vals_out + t2_tile_off[1], t2_tile_n[1], + d_AB_keys, d_AB_vals, ab_count, q); + } + if (cd_count > 0) { + launch_merge_pairs_stable_2way_u32_u32( + d_keys_out + t2_tile_off[2], d_vals_out + t2_tile_off[2], t2_tile_n[2], + d_keys_out + t2_tile_off[3], d_vals_out + t2_tile_off[3], t2_tile_n[3], + d_CD_keys, d_CD_vals, cd_count, q); + } + + s_free(stats, d_keys_out); + s_free(stats, d_vals_out); + } else { + // AB merge: read USM-host slices, write device d_AB. Then D2H + // to USM-host and free device. 
+ s_malloc(stats, d_AB_keys, ab_count * sizeof(uint32_t), "d_t2_AB_keys"); + s_malloc(stats, d_AB_vals, ab_count * sizeof(uint32_t), "d_t2_AB_vals"); + if (ab_count > 0) { + launch_merge_pairs_stable_2way_u32_u32( + h_keys + t2_tile_off[0], h_vals + t2_tile_off[0], t2_tile_n[0], + h_keys + t2_tile_off[1], h_vals + t2_tile_off[1], t2_tile_n[1], + d_AB_keys, d_AB_vals, ab_count, q); + } + h_AB_keys = static_cast(sycl::malloc_host(ab_count * sizeof(uint32_t), q)); + h_AB_vals = static_cast(sycl::malloc_host(ab_count * sizeof(uint32_t), q)); + if (!h_AB_keys || !h_AB_vals) throw std::runtime_error("sycl::malloc_host(h_AB) failed"); + if (ab_count > 0) { + q.memcpy(h_AB_keys, d_AB_keys, ab_count * sizeof(uint32_t)); + q.memcpy(h_AB_vals, d_AB_vals, ab_count * sizeof(uint32_t)).wait(); + } + s_free(stats, d_AB_vals); + s_free(stats, d_AB_keys); + + // CD merge: same shape. + s_malloc(stats, d_CD_keys, cd_count * sizeof(uint32_t), "d_t2_CD_keys"); + s_malloc(stats, d_CD_vals, cd_count * sizeof(uint32_t), "d_t2_CD_vals"); + if (cd_count > 0) { + launch_merge_pairs_stable_2way_u32_u32( + h_keys + t2_tile_off[2], h_vals + t2_tile_off[2], t2_tile_n[2], + h_keys + t2_tile_off[3], h_vals + t2_tile_off[3], t2_tile_n[3], + d_CD_keys, d_CD_vals, cd_count, q); + } + h_CD_keys = static_cast(sycl::malloc_host(cd_count * sizeof(uint32_t), q)); + h_CD_vals = static_cast(sycl::malloc_host(cd_count * sizeof(uint32_t), q)); + if (!h_CD_keys || !h_CD_vals) throw std::runtime_error("sycl::malloc_host(h_CD) failed"); + if (cd_count > 0) { + q.memcpy(h_CD_keys, d_CD_keys, cd_count * sizeof(uint32_t)); + q.memcpy(h_CD_vals, d_CD_vals, cd_count * sizeof(uint32_t)).wait(); + } + s_free(stats, d_CD_vals); + s_free(stats, d_CD_keys); + + // h_keys + h_vals consumed by AB/CD merges — free. + sycl::free(h_keys, q); h_keys = nullptr; + sycl::free(h_vals, q); h_vals = nullptr; + } + + uint32_t* d_t2_keys_merged = nullptr; // merged sorted MI for T3. + uint32_t* d_merged_vals = nullptr; // merged sorted src indices. + s_malloc(stats, d_t2_keys_merged, cap * sizeof(uint32_t), "d_t2_keys_merged"); + s_malloc(stats, d_merged_vals, cap * sizeof(uint32_t), "d_merged_vals"); + + if (!t1_match_sliced) { + launch_merge_pairs_stable_2way_u32_u32( + d_AB_keys, d_AB_vals, ab_count, + d_CD_keys, d_CD_vals, cd_count, + d_t2_keys_merged, d_merged_vals, t2_count, q); + s_free(stats, d_AB_keys); + s_free(stats, d_AB_vals); + s_free(stats, d_CD_keys); + s_free(stats, d_CD_vals); + } else { + // Final merge from USM-host inputs into device outputs. + launch_merge_pairs_stable_2way_u32_u32( + h_AB_keys, h_AB_vals, ab_count, + h_CD_keys, h_CD_vals, cd_count, + d_t2_keys_merged, d_merged_vals, t2_count, q); + sycl::free(h_AB_keys, q); h_AB_keys = nullptr; + sycl::free(h_AB_vals, q); h_AB_vals = nullptr; + sycl::free(h_CD_keys, q); h_CD_keys = nullptr; + sycl::free(h_CD_vals, q); h_CD_vals = nullptr; + } + + // Stage 4c (compact only): d_t2_keys_merged is not consumed by the + // gather calls below (they use d_merged_vals for indices) — it's + // only needed later by T3 match as the sorted-MI input. Park it on + // pinned host across the gather peak so the 1040 MB doesn't coexist + // with d_merged_vals + d_t2_meta + d_t2_meta_sorted. H2D'd back + // before T3 match. + // + // Plain mode keeps d_t2_keys_merged live across the gather peak. + uint32_t* h_t2_keys_merged = nullptr; + if (!scratch.plain_mode) { + h_t2_keys_merged = h_keys_owned // reuse t1_keys flag: same scratch + ? 
static_cast(sycl::malloc_host(cap * sizeof(uint32_t), q)) + : scratch.h_keys_merged; + if (!h_t2_keys_merged) throw std::runtime_error("sycl::malloc_host(h_t2_keys_merged) failed"); + q.memcpy(h_t2_keys_merged, d_t2_keys_merged, t2_count * sizeof(uint32_t)).wait(); + s_free(stats, d_t2_keys_merged); + d_t2_keys_merged = nullptr; + } + + // Stage 4a (compact only): JIT H2D the gather source buffers. + // d_t2_meta is alive only for the duration of its gather (2080 MB + // at k=28), then freed before d_t2_xbits is H2D'd. With stage 4c + // the gather peak drops to d_merged_vals (1040) + d_t2_meta (2080) + // + d_t2_meta_sorted (2080) = 5200 MB (no more d_t2_keys_merged). + // + // Plain mode: d_t2_meta and d_t2_xbits are already live from T2 + // match (never parked). Gather reads them directly and frees after. + int const t2_gather_N = scratch.plain_mode ? 1 : scratch.gather_tile_count; + uint64_t* d_t2_meta_sorted = nullptr; + uint32_t* d_t2_xbits_sorted = nullptr; + + if (t2_gather_N <= 1) { + // Single-shot path (compact / plain). + if (!scratch.plain_mode) { + s_malloc(stats, d_t2_meta, cap * sizeof(uint64_t), "d_t2_meta"); + q.memcpy(d_t2_meta, h_t2_meta, t2_count * sizeof(uint64_t)); + q.wait(); + if (h_meta_owned) sycl::free(h_t2_meta, q); + h_t2_meta = nullptr; + } + + s_malloc(stats, d_t2_meta_sorted, cap * sizeof(uint64_t), "d_t2_meta_sorted"); + launch_gather_u64(d_t2_meta, d_merged_vals, d_t2_meta_sorted, t2_count, q); + q.wait(); + s_free(stats, d_t2_meta); + + if (!scratch.plain_mode) { + s_malloc(stats, d_t2_xbits, cap * sizeof(uint32_t), "d_t2_xbits"); + q.memcpy(d_t2_xbits, h_t2_xbits, t2_count * sizeof(uint32_t)); + q.wait(); + if (h_xbits_owned) sycl::free(h_t2_xbits, q); + h_t2_xbits = nullptr; + } + + s_malloc(stats, d_t2_xbits_sorted, cap * sizeof(uint32_t), "d_t2_xbits_sorted"); + launch_gather_u32(d_t2_xbits, d_merged_vals, d_t2_xbits_sorted, t2_count, q); + end_phase(p_t2_sort); + s_free(stats, d_t2_xbits); + s_free(stats, d_merged_vals); + } else { + // Tiled-output gather (minimal tier). Both gathers stage their + // sorted outputs to host pinned (reusing h_t2_meta and + // h_t2_xbits — same buffers that just held the parked unsorted + // data) one tile at a time. Crucially, d_t2_meta_sorted is NOT + // re-allocated on device until BOTH gathers and d_merged_vals + // are done — otherwise the xbits gather peak (d_t2_meta_sorted + // 2080 + d_merged_vals 1040 + d_t2_xbits 1040 + tile 260) would + // still hit ~4420 MB. Deferring the rehydrate keeps the xbits + // gather peak at d_merged_vals (1040) + d_t2_xbits (1040) + + // tile (260 at N=4) = ~2340 MB. Final rehydrate peak: + // d_t2_meta_sorted (2080) + d_t2_xbits_sorted (1040) = 3120 MB. 
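+    // Overwriting h_t2_meta / h_t2_xbits in place is safe: each parked
+    // buffer is H2D'd into its device copy and that copy is waited on
+    // before the first sorted tile is written back over the host buffer.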
+ uint64_t const tile_max = + (t2_count + uint64_t(t2_gather_N) - 1) / uint64_t(t2_gather_N); + + // --- Meta gather (tiled output → h_t2_meta) --- + s_malloc(stats, d_t2_meta, cap * sizeof(uint64_t), "d_t2_meta"); + q.memcpy(d_t2_meta, h_t2_meta, t2_count * sizeof(uint64_t)).wait(); + { + uint64_t* d_meta_tile = nullptr; + s_malloc(stats, d_meta_tile, tile_max * sizeof(uint64_t), "d_t2_meta_sorted_tile"); + for (int n = 0; n < t2_gather_N; ++n) { + uint64_t const tile_off = uint64_t(n) * tile_max; + if (tile_off >= t2_count) break; + uint64_t const tile_n = std::min(tile_max, t2_count - tile_off); + launch_gather_u64( + d_t2_meta, d_merged_vals + tile_off, + d_meta_tile, tile_n, q); + q.memcpy(h_t2_meta + tile_off, d_meta_tile, + tile_n * sizeof(uint64_t)).wait(); + } + s_free(stats, d_meta_tile); + } + s_free(stats, d_t2_meta); + + // --- Xbits gather (tiled output → h_t2_xbits) --- + s_malloc(stats, d_t2_xbits, cap * sizeof(uint32_t), "d_t2_xbits"); + q.memcpy(d_t2_xbits, h_t2_xbits, t2_count * sizeof(uint32_t)).wait(); + { + uint32_t* d_xbits_tile = nullptr; + s_malloc(stats, d_xbits_tile, tile_max * sizeof(uint32_t), "d_t2_xbits_sorted_tile"); + for (int n = 0; n < t2_gather_N; ++n) { + uint64_t const tile_off = uint64_t(n) * tile_max; + if (tile_off >= t2_count) break; + uint64_t const tile_n = std::min(tile_max, t2_count - tile_off); + launch_gather_u32( + d_t2_xbits, d_merged_vals + tile_off, + d_xbits_tile, tile_n, q); + q.memcpy(h_t2_xbits + tile_off, d_xbits_tile, + tile_n * sizeof(uint32_t)).wait(); + } + s_free(stats, d_xbits_tile); + } + s_free(stats, d_t2_xbits); + + // d_merged_vals dead now that both gathers have produced their + // sorted outputs on host. + s_free(stats, d_merged_vals); + + // Rehydrate d_t2_xbits_sorted to device (1040 MB at k=28). The + // T3 match kernel reads d_sorted_xbits[l] / d_sorted_xbits[r] + // by index and the random-access pattern would be too slow via + // PCIe with USM-host. + s_malloc(stats, d_t2_xbits_sorted, cap * sizeof(uint32_t), "d_t2_xbits_sorted"); + q.memcpy(d_t2_xbits_sorted, h_t2_xbits, t2_count * sizeof(uint32_t)).wait(); + if (h_xbits_owned) sycl::free(h_t2_xbits, q); + h_t2_xbits = nullptr; + + // Site 4: do NOT rehydrate d_t2_meta_sorted to device. h_t2_meta + // (now containing the sorted meta) stays alive across T3 match; + // the sliced T3 match path H2Ds a section_l + section_r pair of + // slices per pass, dropping T3 match peak from + // d_t2_meta_sorted (2080) + d_t2_xbits_sorted (1040) + + // d_t2_keys_merged (1040) + d_t3_stage (1040) = 5200 MB + // to + // d_meta_l (cap/N_sections × u64 = 520) + d_meta_r (520) + + // d_t2_xbits_sorted (1040) + d_t2_keys_merged (1040) + + // d_t3_stage (cap/N_sections × u64 = 520) = ~3640 MB at k=28. + // h_t2_meta is freed inside the T3 match block once all + // section-pair passes complete. + + end_phase(p_t2_sort); + } + + // ---------- Phase T3 match ---------- + // Plain mode: one-shot launch_t3_match writing directly into + // full-cap d_t3. No pinned-host staging, no round-trips — saves + // the per-plot sycl::malloc_host(2 GB) (~500 ms on NVIDIA) plus + // the two D2H halves + H2D re-hydration. Match live set: + // d_t2_keys_merged (1040) + d_t2_meta_sorted (2080) + // + d_t2_xbits_sorted (1040) + d_t3 (2080) + temp + // = ~6240 MB — fits under plain's 7290 MB T2-match floor. + // + // Compact mode (stage 4d.3, N=2 tiled): half-cap d_t3 staging + + // D2H-to-pinned-host between passes, then full-cap d_t3 + H2D + // before T3 sort. Keeps T3 match peak at 5200 MB. 
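+  // Dispatch below mirrors the tier split: plain_mode → one-shot
+  // full-cap match; gather_tile_count > 1 (minimal) → per-section
+  // slices read from h_t2_meta; otherwise (compact) → N=2 bucket-range
+  // passes through half-cap staging.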
+ stats.phase = "T3 match"; + auto t3p = make_t3_params(cfg.k, cfg.strength); + size_t t3_temp_bytes = 0; + launch_t3_match_prepare(cfg.plot_id.data(), t3p, nullptr, t2_count, + d_counter, nullptr, &t3_temp_bytes, q); + + // Stage 4c (compact only): H2D d_t2_keys_merged back from pinned + // host now that we're about to enter T3 match (its consumer). + // Pinned host freed after H2D. Plain mode: d_t2_keys_merged is + // already live (never parked). + if (!scratch.plain_mode) { + s_malloc(stats, d_t2_keys_merged, cap * sizeof(uint32_t), "d_t2_keys_merged"); + q.memcpy(d_t2_keys_merged, h_t2_keys_merged, t2_count * sizeof(uint32_t)).wait(); + if (h_keys_owned) sycl::free(h_t2_keys_merged, q); + h_t2_keys_merged = nullptr; + } + + T3PairingGpu* d_t3 = nullptr; + uint64_t t3_count = 0; + + if (scratch.plain_mode) { + // Plain: one-shot full-cap T3 match. + void* d_t3_match_temp = nullptr; + s_malloc(stats, d_t3, cap * sizeof(T3PairingGpu), "d_t3"); + s_malloc(stats, d_t3_match_temp, t3_temp_bytes, "d_t3_match_temp"); + + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + int p_t3 = begin_phase("T3 match + Feistel"); + launch_t3_match(cfg.plot_id.data(), t3p, + d_t2_meta_sorted, d_t2_xbits_sorted, + d_t2_keys_merged, t2_count, + d_t3, d_counter, cap, + d_t3_match_temp, &t3_temp_bytes, q); + end_phase(p_t3); + + q.memcpy(&t3_count, d_counter, sizeof(uint64_t)).wait(); + if (t3_count > cap) throw std::runtime_error("T3 overflow"); + + s_free(stats, d_t3_match_temp); + s_free(stats, d_t2_meta_sorted); + s_free(stats, d_t2_xbits_sorted); + s_free(stats, d_t2_keys_merged); + } else if (scratch.gather_tile_count > 1) { + // Minimal (sliced T3 match — site 4). d_t2_meta_sorted is NOT + // on device in this path; the sorted meta is parked on + // h_t2_meta (from the T2 sort tiled gather). For each section_l + // we H2D the matching pair of sections (l + r) into small + // device slices, run the kernel against those slices, D2H the + // stage output to h_t3, then free the slices. Drops T3 match + // peak from ~5200 MB (compact) to ~3665 MB at k=28. + uint32_t const num_sections = 1u << t3p.num_section_bits; + uint32_t const num_match_keys = 1u << t3p.num_match_key_bits; + uint32_t const num_buckets_t3 = num_sections * num_match_keys; + // Per-pass output capacity sized at cap/N × 1.25 (25% safety + // margin over the expected uniform-distribution average). + uint64_t const t3_section_cap = + ((cap + num_sections - 1) / num_sections) * 5ULL / 4ULL; + + T3PairingGpu* d_t3_stage = nullptr; + void* d_t3_match_temp = nullptr; + s_malloc(stats, d_t3_stage, t3_section_cap * sizeof(T3PairingGpu), "d_t3_stage"); + s_malloc(stats, d_t3_match_temp, t3_temp_bytes, "d_t3_match_temp"); + + bool const h_t3_owned = (scratch.h_t3 == nullptr); + T3PairingGpu* h_t3 = h_t3_owned + ? static_cast(sycl::malloc_host(cap * sizeof(T3PairingGpu), q)) + : reinterpret_cast(scratch.h_t3); + if (!h_t3) throw std::runtime_error("sycl::malloc_host(h_t3) failed"); + + // Compute bucket + fine-bucket offsets in d_t3_match_temp; also + // zero d_counter. Same call shape as compact path. + launch_t3_match_prepare(cfg.plot_id.data(), t3p, + d_t2_keys_merged, t2_count, + d_counter, d_t3_match_temp, &t3_temp_bytes, q); + + // D2H the bucket-offsets table (small: 17 × u64 at k=28 + // strength=2) so we can compute each section's global row range + // host-side. 
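+      // h_t3_offsets has num_buckets_t3 + 1 entries; section s covers
+      // global sorted-T2 rows
+      //   [ h_t3_offsets[s * num_match_keys],
+      //     h_t3_offsets[(s + 1) * num_match_keys] )
+      // which is how the section_l / section_r row ranges are derived
+      // in the loop below.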
+ std::vector h_t3_offsets(num_buckets_t3 + 1); + q.memcpy(h_t3_offsets.data(), d_t3_match_temp, + (num_buckets_t3 + 1) * sizeof(uint64_t)).wait(); + + auto compute_section_r = [&](uint32_t section_l) -> uint32_t { + // Mirror the kernel's section_l → section_r permutation. + uint32_t const mask = num_sections - 1u; + uint32_t const rl = ((section_l << 1) | + (section_l >> (t3p.num_section_bits - 1))) & mask; + uint32_t const rl1 = (rl + 1u) & mask; + return ((rl1 >> 1) | + (rl1 << (t3p.num_section_bits - 1))) & mask; + }; + + int p_t3 = begin_phase("T3 match + Feistel"); + uint64_t host_offset = 0; + for (uint32_t section_l = 0; section_l < num_sections; ++section_l) { + uint32_t const section_r = compute_section_r(section_l); + uint64_t const section_l_row_start = h_t3_offsets[section_l * num_match_keys]; + uint64_t const section_l_row_end = h_t3_offsets[(section_l + 1) * num_match_keys]; + uint64_t const section_l_count = section_l_row_end - section_l_row_start; + uint64_t const section_r_row_start = h_t3_offsets[section_r * num_match_keys]; + uint64_t const section_r_row_end = h_t3_offsets[(section_r + 1) * num_match_keys]; + uint64_t const section_r_count = section_r_row_end - section_r_row_start; + + // Skip empty sections — happens for tiny test plots where + // a section has zero rows. The kernel would early-return + // anyway but the slice malloc rejects bytes==0 since f1d3c67. + if (section_l_count == 0) continue; + + uint64_t* d_meta_l_slice = nullptr; + uint64_t* d_meta_r_slice = nullptr; + s_malloc(stats, d_meta_l_slice, section_l_count * sizeof(uint64_t), "d_t3_meta_l_slice"); + if (section_r_count > 0) { + s_malloc(stats, d_meta_r_slice, section_r_count * sizeof(uint64_t), "d_t3_meta_r_slice"); + } + + q.memcpy(d_meta_l_slice, h_t2_meta + section_l_row_start, + section_l_count * sizeof(uint64_t)).wait(); + if (section_r_count > 0) { + q.memcpy(d_meta_r_slice, h_t2_meta + section_r_row_start, + section_r_count * sizeof(uint64_t)).wait(); + } + + uint32_t const bucket_begin = section_l * num_match_keys; + uint32_t const bucket_end = (section_l + 1) * num_match_keys; + launch_t3_match_section_pair_range( + cfg.plot_id.data(), t3p, + d_meta_l_slice, section_l_row_start, + d_meta_r_slice, section_r_row_start, + d_t2_xbits_sorted, d_t2_keys_merged, t2_count, + d_t3_stage, d_counter, t3_section_cap, + d_t3_match_temp, bucket_begin, bucket_end, q); + + uint64_t pass_count = 0; + q.memcpy(&pass_count, d_counter, sizeof(uint64_t)).wait(); + if (pass_count > t3_section_cap) { + throw std::runtime_error( + "T3 match (sliced) section_l=" + std::to_string(section_l) + + " produced " + std::to_string(pass_count) + + " pairs, staging holds " + std::to_string(t3_section_cap) + + ". Lower N or widen t3_section_cap safety factor."); + } + q.memcpy(h_t3 + host_offset, d_t3_stage, + pass_count * sizeof(T3PairingGpu)).wait(); + host_offset += pass_count; + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + + if (section_r_count > 0) s_free(stats, d_meta_r_slice); + s_free(stats, d_meta_l_slice); + } + end_phase(p_t3); + + t3_count = host_offset; + if (t3_count > cap) throw std::runtime_error("T3 overflow"); + + // d_t2_meta_sorted is null in this path (never allocated) — skip + // its s_free. Free everything else that was alive across T3 match. + s_free(stats, d_t3_match_temp); + s_free(stats, d_t3_stage); + s_free(stats, d_t2_xbits_sorted); + s_free(stats, d_t2_keys_merged); + + // h_t2_meta was kept alive across T3 match for slicing; free now + // that all section pairs have been H2D'd. 
+ if (h_meta_owned) sycl::free(h_t2_meta, q); + h_t2_meta = nullptr; + + // Re-hydrate full-cap d_t3 on device for T3 sort. + s_malloc(stats, d_t3, cap * sizeof(T3PairingGpu), "d_t3"); + q.memcpy(d_t3, h_t3, t3_count * sizeof(T3PairingGpu)).wait(); + if (h_t3_owned) sycl::free(h_t3, q); + } else { + // Compact: N=2 half-cap staging with pinned-host h_t3 accumulator. + uint64_t const t3_half_cap = (cap + 1) / 2; + + T3PairingGpu* d_t3_stage = nullptr; + void* d_t3_match_temp = nullptr; + s_malloc(stats, d_t3_stage, t3_half_cap * sizeof(T3PairingGpu), "d_t3_stage"); + s_malloc(stats, d_t3_match_temp, t3_temp_bytes, "d_t3_match_temp"); + + // Full-cap pinned host that will hold the concatenated T3 output. + // Stage 4f: reuse scratch.h_t3 when provided (amortised across + // batch). T3PairingGpu is just a uint64 proof_fragment, so the + // scratch buffer is declared as uint64_t* and reinterpret-cast. + bool const h_t3_owned = (scratch.h_t3 == nullptr); + T3PairingGpu* h_t3 = h_t3_owned + ? static_cast(sycl::malloc_host(cap * sizeof(T3PairingGpu), q)) + : reinterpret_cast(scratch.h_t3); + if (!h_t3) throw std::runtime_error("sycl::malloc_host(h_t3) failed"); + + // Compute bucket + fine-bucket offsets once; both match passes + // share them. Also zeroes d_counter. + launch_t3_match_prepare(cfg.plot_id.data(), t3p, + d_t2_keys_merged, t2_count, + d_counter, d_t3_match_temp, &t3_temp_bytes, q); + + uint32_t const t3_num_buckets = + (1u << t3p.num_section_bits) * (1u << t3p.num_match_key_bits); + uint32_t const t3_bucket_mid = t3_num_buckets / 2; + + auto run_t3_pass = [&](uint32_t bucket_begin, uint32_t bucket_end, + uint64_t host_offset) -> uint64_t + { + launch_t3_match_range(cfg.plot_id.data(), t3p, + d_t2_meta_sorted, d_t2_xbits_sorted, + d_t2_keys_merged, t2_count, + d_t3_stage, d_counter, t3_half_cap, + d_t3_match_temp, bucket_begin, bucket_end, q); + uint64_t pass_count = 0; + q.memcpy(&pass_count, d_counter, sizeof(uint64_t)).wait(); + if (pass_count > t3_half_cap) { + throw std::runtime_error( + "T3 match pass overflow: bucket range [" + + std::to_string(bucket_begin) + "," + std::to_string(bucket_end) + + ") produced " + std::to_string(pass_count) + + " pairs, staging holds " + std::to_string(t3_half_cap) + + ". Lower N or widen staging."); + } + q.memcpy(h_t3 + host_offset, d_t3_stage, + pass_count * sizeof(T3PairingGpu)).wait(); + // Reset counter so the next pass writes at stage index 0. + q.memset(d_counter, 0, sizeof(uint64_t)).wait(); + return pass_count; + }; + + int p_t3 = begin_phase("T3 match + Feistel"); + uint64_t const t3_count1 = run_t3_pass(0, t3_bucket_mid, /*host_offset=*/0); + uint64_t const t3_count2 = run_t3_pass(t3_bucket_mid, t3_num_buckets, /*host_offset=*/t3_count1); + end_phase(p_t3); + + t3_count = t3_count1 + t3_count2; + if (t3_count > cap) throw std::runtime_error("T3 overflow"); + + // Free everything that was alive across T3 match: staging, temp, + // sorted T2 inputs, keys_merged. + s_free(stats, d_t3_match_temp); + s_free(stats, d_t3_stage); + s_free(stats, d_t2_meta_sorted); + s_free(stats, d_t2_xbits_sorted); + s_free(stats, d_t2_keys_merged); + + // Re-hydrate full-cap d_t3 on device for T3 sort. 
+ s_malloc(stats, d_t3, cap * sizeof(T3PairingGpu), "d_t3"); + q.memcpy(d_t3, h_t3, t3_count * sizeof(T3PairingGpu)).wait(); + if (h_t3_owned) sycl::free(h_t3, q); + } + + // ---------- Phase T3 sort ---------- + // Compact / plain: full-cap CUB sort_keys with separate keys_in + // (= d_t3) and keys_out (= d_frags_out) buffers — peaks at + // 2 × cap × u64 + scratch ≈ 4228 MB at k=28. + // + // Minimal: tile the sort in halves with a single cap/2 output + // buffer, D2H each tile to host pinned, std::inplace_merge on + // host, then H2D the merged result back into the full-cap + // d_frags_out the D2H phase below expects. Drops T3 sort peak to + // ~3152 MB at k=28 (d_t3 2080 + tile output 1040 + sort scratch + // sized for cap/2 ≈ 32). Adds one cap-sized PCIe round-trip per + // plot. + stats.phase = "T3 sort"; + uint64_t* d_frags_in = reinterpret_cast(d_t3); + uint64_t* d_frags_out = nullptr; + + if (!t1_match_sliced) { + size_t t3_sort_bytes = 0; + launch_sort_keys_u64( + nullptr, t3_sort_bytes, + static_cast(nullptr), static_cast(nullptr), + cap, 0, 2 * cfg.k, q); + + s_malloc(stats, d_frags_out, cap * sizeof(uint64_t), "d_frags_out"); + s_malloc(stats, d_sort_scratch, t3_sort_bytes, "d_sort_scratch(t3)"); + + int p_t3_sort = begin_phase("T3 sort"); + launch_sort_keys_u64( + d_sort_scratch, t3_sort_bytes, + d_frags_in, d_frags_out, + t3_count, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, q); + end_phase(p_t3_sort); + + s_free(stats, d_t3); + s_free(stats, d_sort_scratch); + } else { + // Tiled sort + host merge. + uint64_t const tile_max = (cap + 1) / 2; + uint64_t const tile_n0 = t3_count / 2; + uint64_t const tile_n1 = t3_count - tile_n0; + + size_t t3_tile_sort_bytes = 0; + launch_sort_keys_u64( + nullptr, t3_tile_sort_bytes, + static_cast(nullptr), static_cast(nullptr), + tile_max, 0, 2 * cfg.k, q); + + uint64_t* d_frags_out_tile = nullptr; + void* d_sort_scratch_tile = nullptr; + s_malloc(stats, d_frags_out_tile, tile_max * sizeof(uint64_t), "d_frags_out_tile"); + s_malloc(stats, d_sort_scratch_tile, t3_tile_sort_bytes, "d_sort_scratch(t3_tile)"); + + uint64_t* h_frags = static_cast( + sycl::malloc_host(cap * sizeof(uint64_t), q)); + if (!h_frags) throw std::runtime_error("sycl::malloc_host(h_frags) failed"); + + int p_t3_sort = begin_phase("T3 sort"); + if (tile_n0 > 0) { + launch_sort_keys_u64( + d_sort_scratch_tile, t3_tile_sort_bytes, + d_frags_in, d_frags_out_tile, + tile_n0, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, q); + q.memcpy(h_frags, d_frags_out_tile, + tile_n0 * sizeof(uint64_t)).wait(); + } + if (tile_n1 > 0) { + launch_sort_keys_u64( + d_sort_scratch_tile, t3_tile_sort_bytes, + d_frags_in + tile_n0, d_frags_out_tile, + tile_n1, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, q); + q.memcpy(h_frags + tile_n0, d_frags_out_tile, + tile_n1 * sizeof(uint64_t)).wait(); + } + end_phase(p_t3_sort); + + s_free(stats, d_frags_out_tile); + s_free(stats, d_sort_scratch_tile); + s_free(stats, d_t3); + + // Stable in-place merge of [0, tile_n0) and [tile_n0, t3_count) + // — both halves are individually sorted by launch_sort_keys_u64. + std::inplace_merge(h_frags, h_frags + tile_n0, h_frags + t3_count); + + // Re-hydrate full-cap d_frags_out for the existing D2H phase. 
+ s_malloc(stats, d_frags_out, cap * sizeof(uint64_t), "d_frags_out"); + if (t3_count > 0) { + q.memcpy(d_frags_out, h_frags, t3_count * sizeof(uint64_t)).wait(); + } + sycl::free(h_frags, q); + } + + // ---------- D2H ---------- + // Two destination modes: + // caller-supplied pinned_dst (batch): copy D2H into pinned_dst and + // return a BORROWING result (external_fragments_ptr). Consumer + // must finish reading pinned_dst before the caller reuses it. + // no pinned_dst (one-shot): alloc a temp pinned region sized to + // t3_count, D2H, copy to an OWNING vector, free the temp. + stats.phase = "D2H"; + GpuPipelineResult result; + result.t1_count = t1_count; + result.t2_count = t2_count; + result.t3_count = t3_count; + + int p_d2h = begin_phase("D2H copy T3 fragments (pinned)"); + if (t3_count > 0) { + if (pinned_dst) { + if (pinned_capacity < t3_count) { + throw std::runtime_error( + "run_gpu_pipeline_streaming: pinned_capacity " + + std::to_string(pinned_capacity) + + " < t3_count " + std::to_string(t3_count)); + } + q.memcpy(pinned_dst, d_frags_out, sizeof(uint64_t) * t3_count); + q.wait(); + result.external_fragments_ptr = pinned_dst; + result.external_fragments_count = t3_count; + } else { + uint64_t* h_pinned = nullptr; + h_pinned = static_cast( + sycl::malloc_host(sizeof(uint64_t) * t3_count, sycl_backend::queue())); + if (!h_pinned) throw std::runtime_error("sycl::malloc_host(h_pinned) failed"); + q.memcpy(h_pinned, d_frags_out, sizeof(uint64_t) * t3_count); + q.wait(); + result.t3_fragments_storage.resize(t3_count); + std::memcpy(result.t3_fragments_storage.data(), h_pinned, + sizeof(uint64_t) * t3_count); + sycl::free(h_pinned, sycl_backend::queue()); + } + } + end_phase(p_d2h); + + s_free(stats, d_frags_out); + s_free(stats, d_counter); + + if (stats.verbose) { + std::fprintf(stderr, + "[streaming] k=%d strength=%d peak device VRAM = %.2f MB\n", + cfg.k, cfg.strength, stats.peak / 1048576.0); + } + report_phases(); + return result; +} + +} // namespace (anon — streaming impl) + +uint64_t* streaming_alloc_pinned_uint64(size_t count) +{ + uint64_t* p = nullptr; + p = static_cast( + sycl::malloc_host(count * sizeof(uint64_t), sycl_backend::queue())); + if (!p) return nullptr; + return p; +} + +uint32_t* streaming_alloc_pinned_uint32(size_t count) +{ + uint32_t* p = static_cast( + sycl::malloc_host(count * sizeof(uint32_t), sycl_backend::queue())); + return p; // nullptr on failure +} + +void streaming_free_pinned_uint32(uint32_t* ptr) +{ + if (ptr) sycl::free(ptr, sycl_backend::queue()); +} + +void streaming_free_pinned_uint64(uint64_t* ptr) +{ + if (ptr) sycl::free(ptr, sycl_backend::queue()); +} + +void bind_current_device(int device_id) +{ + sycl_backend::set_current_device_id(device_id); +} + +int gpu_device_count() +{ + try { + return sycl_backend::get_gpu_device_count(); + } catch (...) { + return 0; + } +} + +} // namespace pos2gpu diff --git a/src/host/GpuPipeline.cu b/src/host/GpuPipeline.cu deleted file mode 100644 index 2b28b7d..0000000 --- a/src/host/GpuPipeline.cu +++ /dev/null @@ -1,411 +0,0 @@ -// GpuPipeline.cu — orchestrates Xs → T1 → T2 → T3 on the device, with -// CUB radix sort between phases (each phase consumes sorted-by-match_info -// input). Final T3 output is sorted by proof_fragment (low 2k bits) to -// match pos2-chip Table3Constructor::post_construct_span. -// -// Two overloads live here: -// run_gpu_pipeline(cfg) — transient pool, one-shot. -// run_gpu_pipeline(cfg, pool) — shared pool, batch-friendly. 
This is the -// real implementation; the one-shot form -// just wraps it in a temporary pool. - -#include "host/GpuPipeline.hpp" -#include "host/GpuBufferPool.hpp" - -#include "gpu/AesGpu.cuh" -#include "gpu/XsKernel.cuh" -#include "gpu/T1Kernel.cuh" -#include "gpu/T2Kernel.cuh" -#include "gpu/T3Kernel.cuh" - -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace pos2gpu { - -namespace { - -#define CHECK(call) do { \ - cudaError_t err = (call); \ - if (err != cudaSuccess) { \ - throw std::runtime_error(std::string("CUDA: ") + \ - cudaGetErrorString(err)); \ - } \ -} while (0) - -// ===================================================================== -// T1 sort: by match_info, low k bits, stable. Uses CUB SortPairs with -// (key=match_info, value=index) then permutes T1Pairings. -// ===================================================================== - -// Permute the T1 match output by sort indices, writing only the 8-byte -// meta (meta_hi << 32 | meta_lo). match_info already lives in the sort's -// key-output stream so we don't rematerialise it; the T2 match kernel -// consumes (sorted_meta, sorted_mi) directly. -__global__ void permute_t1( - T1PairingGpu const* __restrict__ src, - uint32_t const* __restrict__ indices, - uint64_t* __restrict__ dst_meta, - uint64_t count) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= count) return; - T1PairingGpu s = src[indices[idx]]; - dst_meta[idx] = (uint64_t(s.meta_hi) << 32) | uint64_t(s.meta_lo); -} - -__global__ void extract_t1_keys( - T1PairingGpu const* __restrict__ src, - uint32_t* __restrict__ keys_out, - uint32_t* __restrict__ vals_out, - uint64_t count) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= count) return; - keys_out[idx] = src[idx].match_info; - vals_out[idx] = uint32_t(idx); -} - -// ===================================================================== -// T2 sort: same shape — sort indices by match_info. -// ===================================================================== - -// T3 match reads meta (8 B) and x_bits (4 B) from sorted_t2 but does not -// touch match_info (passed as the parallel sorted_mi stream). Splitting -// the sort output into meta[] and xbits[] arrays drops the per-access -// line footprint from 16 B to 12 B, cutting L1/TEX line fetches on an -// L1-throughput-bound kernel. 
-__global__ void permute_t2( - T2PairingGpu const* __restrict__ src, - uint32_t const* __restrict__ indices, - uint64_t* __restrict__ dst_meta, - uint32_t* __restrict__ dst_xbits, - uint64_t count) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= count) return; - T2PairingGpu p = src[indices[idx]]; - dst_meta[idx] = p.meta; - dst_xbits[idx] = p.x_bits; -} - -__global__ void extract_t2_keys( - T2PairingGpu const* __restrict__ src, - uint32_t* __restrict__ keys_out, - uint32_t* __restrict__ vals_out, - uint64_t count) -{ - uint64_t idx = blockIdx.x * uint64_t(blockDim.x) + threadIdx.x; - if (idx >= count) return; - keys_out[idx] = src[idx].match_info; - vals_out[idx] = uint32_t(idx); -} - -} // namespace - -GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg, - GpuBufferPool& pool, - int pinned_index) -{ - if (cfg.k < 18 || cfg.k > 32 || (cfg.k & 1) != 0) { - throw std::runtime_error("k must be even in [18, 32]"); - } - if (cfg.strength < 2) { - throw std::runtime_error("strength must be >= 2"); - } - if (pool.k != cfg.k || pool.strength != cfg.strength - || pool.testnet != cfg.testnet) - { - throw std::runtime_error( - "GpuBufferPool was sized for different (k, strength, testnet)"); - } - if (pinned_index < 0 || pinned_index > 1) { - throw std::runtime_error("pinned_index must be 0 or 1"); - } - - uint64_t const total_xs = pool.total_xs; - uint64_t const cap = pool.cap; - - constexpr int kThreads = 256; - auto blocks = [&](uint64_t n) { - return unsigned((n + kThreads - 1) / kThreads); - }; - - cudaStream_t stream = nullptr; // default stream - - // ---- pool aliases ---- - // d_pair_a carries the "current phase match output": T1, then T2, then T3. - // d_pair_b carries the "current phase sort output": sorted T1, sorted T2, - // then final uint64_t fragments. Each subsequent phase's output overwrites - // the previous (consumed) contents in the same slot. - XsCandidateGpu* d_xs = static_cast(pool.d_storage); - T1PairingGpu* d_t1 = static_cast (pool.d_pair_a); - // Sorted T1 is now just meta (8 B/entry) — match_info comes from sort keys. - uint64_t* d_t1_meta_sorted = static_cast (pool.d_pair_b); - T2PairingGpu* d_t2 = static_cast (pool.d_pair_a); - // Sorted T2 is SoA-split across d_pair_b: meta[cap] then xbits[cap], - // 12 B total per entry (fits in d_pair_b's 16 B/entry budget). T3 - // match reads both; frags_out later reuses d_pair_b from offset 0. - uint64_t* d_t2_meta_sorted = static_cast (pool.d_pair_b); - uint32_t* d_t2_xbits_sorted = reinterpret_cast( - static_cast(pool.d_pair_b) + pool.cap * sizeof(uint64_t)); - T3PairingGpu* d_t3 = static_cast (pool.d_pair_a); - uint64_t* d_frags_out = static_cast (pool.d_pair_b); - - uint64_t* d_count = pool.d_counter; - // Xs phase needs ~4.34 GB scratch at k=28; d_pair_b is idle through - // the whole Xs phase (not touched until T1 sort permute writes to it), - // so we alias it rather than allocating separately. - void* d_xs_temp = pool.d_pair_b; - void* d_sort_scratch = pool.d_sort_scratch; - uint64_t* h_pinned_t3 = pool.h_pinned_t3[pinned_index]; - // T1/T2/T3 match kernels report 0 scratch bytes, but some CUDA paths - // reject a nullptr d_temp_storage with cudaErrorInvalidArgument even - // when bytes==0. Point them at d_sort_scratch (idle during match) to - // give the kernel a valid non-null handle. - void* d_match_temp = pool.d_sort_scratch; - - // Sort key/val arrays alias d_storage. Safe because Xs is fully consumed - // by T1 match (stream-synchronised) before we enter T1 sort. 
- auto storage_u32 = static_cast(pool.d_storage); - uint32_t* d_keys_in = storage_u32 + 0 * cap; - uint32_t* d_keys_out = storage_u32 + 1 * cap; - uint32_t* d_vals_in = storage_u32 + 2 * cap; - uint32_t* d_vals_out = storage_u32 + 3 * cap; - - // ---- profiling: cudaEvent helpers ---- - struct PhaseTimer { - cudaEvent_t start, stop; - std::string label; - }; - std::vector phases; - auto begin_phase = [&](char const* label) -> int { - if (!cfg.profile) return -1; - PhaseTimer pt; - pt.label = label; - cudaEventCreate(&pt.start); - cudaEventCreate(&pt.stop); - cudaEventRecord(pt.start, stream); - phases.push_back(pt); - return int(phases.size()) - 1; - }; - auto end_phase = [&](int idx) { - if (!cfg.profile || idx < 0) return; - cudaEventRecord(phases[idx].stop, stream); - }; - auto report_phases = [&]() { - if (!cfg.profile) return; - cudaDeviceSynchronize(); - std::fprintf(stderr, "=== gpu_pipeline phase breakdown ===\n"); - float total_ms = 0; - for (auto& pt : phases) { - float ms = 0; - cudaEventElapsedTime(&ms, pt.start, pt.stop); - std::fprintf(stderr, " %-30s %8.2f ms\n", pt.label.c_str(), ms); - total_ms += ms; - cudaEventDestroy(pt.start); - cudaEventDestroy(pt.stop); - } - std::fprintf(stderr, " %-30s %8.2f ms\n", "TOTAL device time:", total_ms); - }; - - // ---------- Phase Xs ---------- - size_t xs_temp_bytes = 0; - CHECK(launch_construct_xs(cfg.plot_id.data(), cfg.k, cfg.testnet, - nullptr, nullptr, &xs_temp_bytes)); - cudaEvent_t e_xs_start = nullptr, e_xs_gen_done = nullptr, e_xs_sort_done = nullptr; - if (cfg.profile) { - cudaEventCreate(&e_xs_start); - cudaEventCreate(&e_xs_gen_done); - cudaEventCreate(&e_xs_sort_done); - cudaEventRecord(e_xs_start, stream); - } - CHECK(launch_construct_xs_profiled(cfg.plot_id.data(), cfg.k, cfg.testnet, - d_xs, d_xs_temp, &xs_temp_bytes, - e_xs_gen_done, e_xs_sort_done, stream)); - - // ---------- Phase T1 ---------- - auto t1p = make_t1_params(cfg.k, cfg.strength); - size_t t1_temp_bytes = 0; - CHECK(launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, - d_t1, d_count, cap, - nullptr, &t1_temp_bytes)); - CHECK(cudaMemsetAsync(d_count, 0, sizeof(uint64_t), stream)); - int p_t1 = begin_phase("T1 match"); - CHECK(launch_t1_match(cfg.plot_id.data(), t1p, d_xs, total_xs, - d_t1, d_count, cap, - d_match_temp, &t1_temp_bytes, stream)); - end_phase(p_t1); - - // No explicit sync: the next cudaMemcpy (non-async, default stream) - // implicitly drains prior stream work before the host reads t1_count. - uint64_t t1_count = 0; - CHECK(cudaMemcpy(&t1_count, d_count, sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - if (t1_count > cap) throw std::runtime_error("T1 overflow"); - - // Sort T1 by match_info (low k bits). d_storage is now repurposed - // as (keys_in, keys_out, vals_in, vals_out), Xs having been fully - // consumed by T1 match above. - int p_t1_sort = begin_phase("T1 sort"); - { - extract_t1_keys<<>>( - d_t1, d_keys_in, d_vals_in, t1_count); - CHECK(cudaGetLastError()); - - size_t sort_bytes = pool.sort_scratch_bytes; - CHECK(cub::DeviceRadixSort::SortPairs( - d_sort_scratch, sort_bytes, - d_keys_in, d_keys_out, d_vals_in, d_vals_out, - t1_count, /*begin_bit=*/0, /*end_bit=*/cfg.k, stream)); - - permute_t1<<>>( - d_t1, d_vals_out, d_t1_meta_sorted, t1_count); - CHECK(cudaGetLastError()); - } - end_phase(p_t1_sort); - - // ---------- Phase T2 ---------- - // Sorted T1 = (d_t1_meta_sorted: uint64 meta, d_keys_out: uint32 match_info). 
- // No AoS struct anymore — saves 33 % of sorted-T1 bandwidth on both the - // permute write and the match-kernel hot path. - auto t2p = make_t2_params(cfg.k, cfg.strength); - size_t t2_temp_bytes = 0; - CHECK(launch_t2_match(cfg.plot_id.data(), t2p, nullptr, nullptr, t1_count, - d_t2, d_count, cap, - nullptr, &t2_temp_bytes)); - CHECK(cudaMemsetAsync(d_count, 0, sizeof(uint64_t), stream)); - int p_t2 = begin_phase("T2 match"); - CHECK(launch_t2_match(cfg.plot_id.data(), t2p, d_t1_meta_sorted, d_keys_out, t1_count, - d_t2, d_count, cap, - d_match_temp, &t2_temp_bytes, stream)); - end_phase(p_t2); - - uint64_t t2_count = 0; - CHECK(cudaMemcpy(&t2_count, d_count, sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - if (t2_count > cap) throw std::runtime_error("T2 overflow"); - - int p_t2_sort = begin_phase("T2 sort"); - { - extract_t2_keys<<>>( - d_t2, d_keys_in, d_vals_in, t2_count); - CHECK(cudaGetLastError()); - - size_t sort_bytes = pool.sort_scratch_bytes; - CHECK(cub::DeviceRadixSort::SortPairs( - d_sort_scratch, sort_bytes, - d_keys_in, d_keys_out, d_vals_in, d_vals_out, - t2_count, 0, cfg.k, stream)); - - permute_t2<<>>( - d_t2, d_vals_out, d_t2_meta_sorted, d_t2_xbits_sorted, t2_count); - CHECK(cudaGetLastError()); - } - end_phase(p_t2_sort); - - // ---------- Phase T3 ---------- - // d_keys_out now holds the T2 sorted match_info (T1's was overwritten by - // the T2 sort above) — pass as the slim stream for binary search in T3. - auto t3p = make_t3_params(cfg.k, cfg.strength); - size_t t3_temp_bytes = 0; - CHECK(launch_t3_match(cfg.plot_id.data(), t3p, - d_t2_meta_sorted, d_t2_xbits_sorted, - nullptr, t2_count, - d_t3, d_count, cap, - nullptr, &t3_temp_bytes)); - CHECK(cudaMemsetAsync(d_count, 0, sizeof(uint64_t), stream)); - int p_t3 = begin_phase("T3 match + Feistel"); - CHECK(launch_t3_match(cfg.plot_id.data(), t3p, - d_t2_meta_sorted, d_t2_xbits_sorted, - d_keys_out, t2_count, - d_t3, d_count, cap, - d_match_temp, &t3_temp_bytes, stream)); - end_phase(p_t3); - - uint64_t t3_count = 0; - CHECK(cudaMemcpy(&t3_count, d_count, sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - if (t3_count > cap) throw std::runtime_error("T3 overflow"); - - // Sort T3 by proof_fragment (low 2k bits). T3PairingGpu is just a - // uint64_t, so reinterpret the d_pair_a slot directly. - uint64_t* d_frags_in = reinterpret_cast(d_t3); - int p_t3_sort = begin_phase("T3 sort"); - { - size_t sort_bytes = pool.sort_scratch_bytes; - CHECK(cub::DeviceRadixSort::SortKeys( - d_sort_scratch, sort_bytes, - d_frags_in, d_frags_out, - t3_count, /*begin_bit=*/0, /*end_bit=*/2 * cfg.k, stream)); - } - end_phase(p_t3_sort); - - // ---------- D2H ---------- - int p_d2h = begin_phase("D2H copy T3 fragments (pinned)"); - GpuPipelineResult result; - result.t1_count = t1_count; - result.t2_count = t2_count; - result.t3_count = t3_count; - - if (t3_count > 0) { - CHECK(cudaMemcpyAsync(h_pinned_t3, d_frags_out, - sizeof(uint64_t) * t3_count, - cudaMemcpyDeviceToHost, stream)); - CHECK(cudaStreamSynchronize(stream)); - } - end_phase(p_d2h); - - if (t3_count > 0) { - // Borrow: caller (batch producer) promises to finish consuming this - // pinned slot before reusing it for another plot. - result.external_fragments_ptr = h_pinned_t3; - result.external_fragments_count = t3_count; - } - - // Inject Xs gen / sort timings before reporting (avoids the double-event - // ownership headache by handling them out-of-band here). 
- if (cfg.profile) { - cudaDeviceSynchronize(); - float gen_ms = 0, sort_ms = 0; - cudaEventElapsedTime(&gen_ms, e_xs_start, e_xs_gen_done); - cudaEventElapsedTime(&sort_ms, e_xs_gen_done, e_xs_sort_done); - std::fprintf(stderr, " %-30s %8.2f ms\n", "Xs gen (g_x)", gen_ms); - std::fprintf(stderr, " %-30s %8.2f ms\n", "Xs sort", sort_ms); - cudaEventDestroy(e_xs_start); - cudaEventDestroy(e_xs_gen_done); - cudaEventDestroy(e_xs_sort_done); - } - - report_phases(); - return result; -} - -GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg) -{ - // One-shot convenience path: build a transient pool and run through it. - // Pays the full per-call allocator overhead (~2.4 s for k=28). Batch - // callers should construct a pool once and reuse it via the overload. - GpuBufferPool pool(cfg.k, cfg.strength, cfg.testnet); - GpuPipelineResult r = run_gpu_pipeline(cfg, pool, /*pinned_index=*/0); - // Pool (and its pinned buffer) is about to be destroyed, so materialise - // a self-contained copy before returning. - if (r.external_fragments_ptr && r.external_fragments_count > 0) { - r.t3_fragments_storage.resize(r.external_fragments_count); - std::memcpy(r.t3_fragments_storage.data(), - r.external_fragments_ptr, - sizeof(uint64_t) * r.external_fragments_count); - } - r.external_fragments_ptr = nullptr; - r.external_fragments_count = 0; - return r; -} - -} // namespace pos2gpu diff --git a/src/host/GpuPipeline.hpp b/src/host/GpuPipeline.hpp index ae8fabd..f70037e 100644 --- a/src/host/GpuPipeline.hpp +++ b/src/host/GpuPipeline.hpp @@ -62,6 +62,10 @@ struct GpuPipelineResult { // One-shot path: allocates a transient pool, runs the pipeline, then copies // the pinned T3 fragments into t3_fragments_storage so the result is // self-contained after the pool is destroyed. +// +// If XCHPLOT2_STREAMING=1 is set in the environment, this routes through +// run_gpu_pipeline_streaming() instead — useful for exercising the low-VRAM +// path from unchanged call sites. GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg); // Batch path: runs the pipeline writing D2H into pool.h_pinned_t3[pinned_index] @@ -74,4 +78,114 @@ GpuPipelineResult run_gpu_pipeline(GpuPipelineConfig const& cfg, GpuBufferPool& pool, int pinned_index); +// Streaming path: per-phase cudaMalloc / cudaFree instead of a persistent +// pool. Targets GPUs where the full pool (~15 GB at k=28) will not fit. +// +// Two overloads: +// run_gpu_pipeline_streaming(cfg) +// Allocates an internal pinned staging buffer for the final D2H, +// copies fragments into an owning std::vector, frees the pinned +// buffer. Self-contained result. Simplest for one-shot callers. +// +// run_gpu_pipeline_streaming(cfg, pinned_dst, pinned_capacity) +// Caller supplies a pinned host buffer (size ≥ cap × sizeof(uint64_t)) +// that the pipeline uses as the D2H target. Result borrows into +// pinned_dst via external_fragments_ptr; caller must not overwrite +// pinned_dst while the consumer is still reading it. Use this from +// BatchPlotter's streaming fallback to amortise the ~600 ms +// cudaMallocHost cost across plots and double-buffer D2H with the +// FSE consumer thread the same way the pool path does. +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg); +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity); + +// Caller-provided pinned-host scratch buffers for the streaming path. 
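The header comments above spell out the borrow contract for the pinned-destination overload; a caller-side sketch may make it concrete. This is an illustrative batch loop under stated assumptions — `consume_fragments` and the surrounding loop are hypothetical, not BatchPlotter's actual code, and `cap` is whatever fragment capacity the caller has computed for its k:

    #include "host/GpuPipeline.hpp"
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Hypothetical batch loop: one pinned D2H target reused for every plot,
    // so the pinned-allocation cost is paid once instead of per plot.
    void plot_batch(std::vector<pos2gpu::GpuPipelineConfig> const& plots, size_t cap)
    {
        uint64_t* pinned = pos2gpu::streaming_alloc_pinned_uint64(cap);
        if (!pinned) throw std::runtime_error("pinned alloc failed");

        for (auto const& cfg : plots) {
            auto r = pos2gpu::run_gpu_pipeline_streaming(cfg, pinned, cap);
            // r.external_fragments_ptr borrows into `pinned`: finish consuming
            // (FSE encode / write) before the next iteration reuses the slot.
            consume_fragments(r.external_fragments_ptr,   // hypothetical consumer
                              r.external_fragments_count);
        }
        pos2gpu::streaming_free_pinned_uint64(pinned);
    }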
+// Allocate once per batch in BatchPlotter, reuse across all plots — +// avoids paying the ~300–600 ms sycl::malloc_host cost per plot per +// buffer on NVIDIA (measured as the dominant per-plot overhead in +// stages 4b-4e streaming runs). Lifetime analysis shows that phases +// using these buffers do not overlap, so two pairs can share a single +// allocation each: +// h_meta (cap × u64): T1 meta park → T2 meta park +// h_keys_merged (cap × u32): T1 keys_merged park → T2 keys_merged park +// h_t2_xbits (cap × u32): T2 xbits park (distinct) +// h_t3 (cap × T3PairingGpu = u64): T3 staging (distinct) +// +// Any field left nullptr makes the streaming pipeline allocate-on- +// demand for that buffer (one-shot `test` mode). A fully-populated +// StreamingPinnedScratch saves all 6 sycl::malloc_host calls per plot. +struct StreamingPinnedScratch { + uint64_t* h_meta = nullptr; + uint32_t* h_keys_merged = nullptr; + uint32_t* h_t2_xbits = nullptr; + uint64_t* h_t3 = nullptr; // reinterpreted as T3PairingGpu* + + // Plain mode: skip all parks and use single-pass T2 match. Higher + // peak (~7.3 GB at k=28) than compact (~5.2 GB) but ~400 ms/plot + // faster because there are no PCIe round-trips for T1 meta / T1 + // keys_merged / T2 meta / T2 xbits / T2 keys_merged parks. The + // BatchPlotter picks this tier when free VRAM fits the plain peak + // but not the pool (12-14 GB cards). When true, the h_* pointers + // above are ignored — plain mode does not park anything. + bool plain_mode = false; + + // T2 match staging tile count (compact path only — ignored when + // plain_mode is true). compact uses 2 (cap/2 staging, ~2.3 GB at + // k=28); minimal sets it to 8 (cap/8 staging, ~570 MB) to fit 4 + // GiB cards at the cost of more PCIe round-trips during T2 match. + // Must be a power of 2 in [2, t2_num_buckets] — at k=28 strength=2 + // that's [2, 16]. BatchPlotter's tier selection sets it. + int t2_tile_count = 2; + + // Sort-gather tile count (compact path only — ignored when + // plain_mode is true). Each of T1-sort gather, T2-sort meta gather, + // and T2-sort xbits gather peaks at ~5200 MB at k=28 because the + // input meta + indices + output buffer are all cap-sized and live + // simultaneously. With gather_tile_count = N > 1, the gather runs + // in N tiles, D2H'ing each tile to a host pinned staging buffer + // (reusing the parking scratch h_meta / h_t2_xbits) and + // re-allocating the full sorted output afterward via H2D. Drops + // each gather peak from 5200 to ~3640 MB at N=4 (peak = full input + // 2080 + indices 1040 + tile output 520). Default 1 = no tiling + // (compact / plain). Minimal tier sets it to 4. Adds ~3 PCIe round + // trips of cap-sized data per plot. + int gather_tile_count = 1; +}; + +GpuPipelineResult run_gpu_pipeline_streaming(GpuPipelineConfig const& cfg, + uint64_t* pinned_dst, + size_t pinned_capacity, + StreamingPinnedScratch const& scratch); + +// Allocate / free host-pinned memory — thin wrappers around +// cudaMallocHost / cudaFreeHost, exposed so plain .cpp consumers (which +// do not have cuda_runtime.h on the include path) can own the pinned +// buffers the streaming overload expects. Returns nullptr on failure. +uint64_t* streaming_alloc_pinned_uint64(size_t count); +void streaming_free_pinned_uint64(uint64_t* ptr); + +uint32_t* streaming_alloc_pinned_uint32(size_t count); +void streaming_free_pinned_uint32(uint32_t* ptr); + +// Multi-GPU device binding. 
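To make the tier knobs above concrete, here is a sketch of how a low-VRAM caller might populate the scratch struct for the minimal tier. The tier choice, `cap`, `pinned_dst`, and `cfg` are illustrative assumptions taken from the comments above — BatchPlotter's real tier-selection code is not shown in this diff:

    // Illustrative minimal-tier setup: all four pinned park buffers
    // pre-allocated once, T2 match split into 8 staging tiles and the
    // sort gathers into 4 tiles (the values the comments above quote
    // for the minimal tier).
    pos2gpu::StreamingPinnedScratch scratch;
    scratch.h_meta            = pos2gpu::streaming_alloc_pinned_uint64(cap);
    scratch.h_keys_merged     = pos2gpu::streaming_alloc_pinned_uint32(cap);
    scratch.h_t2_xbits        = pos2gpu::streaming_alloc_pinned_uint32(cap);
    scratch.h_t3              = pos2gpu::streaming_alloc_pinned_uint64(cap);
    scratch.plain_mode        = false; // minimal tier parks, so not plain
    scratch.t2_tile_count     = 8;     // cap/8 staging (~570 MB at k=28)
    scratch.gather_tile_count = 4;     // gather peak ~3640 MB at k=28

    auto r = pos2gpu::run_gpu_pipeline_streaming(cfg, pinned_dst, cap, scratch);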
bind_current_device() sets a thread-local +// target device id that sycl_backend::queue() reads when lazily +// constructing the worker thread's queue. Must be called on the worker +// thread BEFORE any kernel launch on that thread — ideally as the very +// first statement of the worker lambda. +// +// device_id < 0 → use the default SYCL gpu_selector_v (single-device, +// pre-multi-GPU behavior). Calling with -1 from the main thread is a +// no-op and is always safe. +// +// gpu_device_count() returns the number of SYCL GPU devices the runtime +// can enumerate, or 0 on error. BatchPlotter uses it to expand +// `--devices all` into an explicit id list. +// +// Declared here (instead of in SyclBackend.hpp) so plain .cpp consumers +// like BatchPlotter.cpp can call them without pulling +// onto their include path. +void bind_current_device(int device_id); +int gpu_device_count(); + } // namespace pos2gpu diff --git a/src/host/PlotFileWriterParallel.cpp b/src/host/PlotFileWriterParallel.cpp index 9f7c18f..5485888 100644 --- a/src/host/PlotFileWriterParallel.cpp +++ b/src/host/PlotFileWriterParallel.cpp @@ -18,11 +18,18 @@ #include "plot/PlotIO.hpp" #include "plot/Plotter.hpp" #include "pos/ProofParams.hpp" +#include "pos/ProofValidator.hpp" +#include "prove/Prover.hpp" #include +#include +#include +#include #include #include +#include #include +#include #include #include @@ -141,8 +148,23 @@ size_t write_plot_file_parallel( for (auto& f : tasks) f.get(); } - // Serial write phase — file I/O is sequential anyway. - std::ofstream out(filename, std::ios::binary); + // Serial write phase — file I/O is sequential anyway. Write to + // .partial and rename on success so SIGINT / crash / ENOSPC + // never leaves a malformed .plot2 at the destination. The guard + // unlinks the partial on early exit. + std::string const partial = filename + ".partial"; + struct PartialGuard { + std::string const& path; + bool committed = false; + ~PartialGuard() { + if (!committed) { + std::error_code ec; + std::filesystem::remove(path, ec); + } + } + } guard{partial}; + + std::ofstream out(partial, std::ios::binary | std::ios::trunc); if (!out) throw std::runtime_error("Failed to open " + filename); out.write("pos2", 4); @@ -191,9 +213,50 @@ size_t write_plot_file_parallel( if (!out) throw std::runtime_error("Failed to write chunk offsets to " + filename); out.seekp(0, std::ios::end); + // Close before rename so buffered writes are flushed and the destination + // sees the final byte image. + out.close(); + if (!out) throw std::runtime_error("Failed to close " + partial); + + std::error_code ec; + std::filesystem::rename(partial, filename, ec); + if (ec) { + throw std::runtime_error( + "Failed to rename " + partial + " -> " + filename + ": " + ec.message()); + } + guard.committed = true; + return bytes_written; } +VerifyResult verify_plot_file(std::string const& filename, size_t n_trials) +{ + VerifyResult res; + if (n_trials == 0) return res; + + Prover prover(filename); + + // Fresh entropy per call; the result only depends on the plot content, + // not the specific challenges, beyond being a uniform sample. 
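The multi-GPU contract described a few hunks up (bind the device on the worker thread before any kernel launch) is easiest to see as a sketch. The thread-per-device layout below is illustrative only, not BatchPlotter's actual scheduler:

    #include "host/GpuPipeline.hpp"
    #include <thread>
    #include <vector>

    // One worker thread per visible GPU; each binds its device id first,
    // so the lazily constructed per-thread queue targets the right card.
    void run_on_all_gpus()
    {
        int const n = pos2gpu::gpu_device_count();   // 0 on enumeration error
        std::vector<std::thread> workers;
        for (int dev = 0; dev < n; ++dev) {
            workers.emplace_back([dev] {
                pos2gpu::bind_current_device(dev);   // first statement, before any launch
                // ... run the pipeline for this worker's share of plots ...
            });
        }
        for (auto& w : workers) w.join();
    }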
+ std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + + for (size_t i = 0; i < n_trials; ++i) { + std::array challenge{}; + for (size_t j = 0; j < 32; j += 8) { + uint64_t const v = dist(gen); + std::memcpy(challenge.data() + j, &v, 8); + } + auto const chains = prover.prove( + std::span(challenge.data(), 32)); + res.trials++; + res.proofs_found += chains.size(); + if (!chains.empty()) res.challenges_with_proof++; + } + return res; +} + std::vector read_plot_file_fragments(std::string const& filename) { PlotFile::PlotFileContents contents = PlotFile::readAllChunkedData(filename); diff --git a/src/host/PlotFileWriterParallel.hpp b/src/host/PlotFileWriterParallel.hpp index f066ad5..70acfdb 100644 --- a/src/host/PlotFileWriterParallel.hpp +++ b/src/host/PlotFileWriterParallel.hpp @@ -64,4 +64,21 @@ std::vector run_cpu_plotter_to_fragments( // plot/PlotFile.hpp to other TUs. std::vector read_plot_file_fragments(std::string const& filename); +// Result of a `verify_plot_file` call. +// trials — how many random challenges were tried +// challenges_with_proof — challenges that produced ≥ 1 proof +// proofs_found — total proofs summed across all trials +struct VerifyResult { + size_t trials = 0; + size_t challenges_with_proof = 0; + size_t proofs_found = 0; +}; + +// Opens `filename` via pos2-chip's `Prover` and runs `n_trials` random +// challenges. Each proof is internally validated by the prover; a result +// with zero proofs across a sensible sample (>= 100) strongly suggests +// the plot is corrupt. Lives here because Prover.hpp transitively pulls +// in pos2-chip plot/pos headers (see top-of-file comment in the .cpp). +VerifyResult verify_plot_file(std::string const& filename, size_t n_trials); + } // namespace pos2gpu diff --git a/src/host/PoolSizing.hpp b/src/host/PoolSizing.hpp new file mode 100644 index 0000000..abf7054 --- /dev/null +++ b/src/host/PoolSizing.hpp @@ -0,0 +1,26 @@ +// PoolSizing.hpp — inline helpers shared by the buffer pool, the +// pipeline orchestrator, and the match-kernel wrappers. Kept here so a +// single formula change updates every consumer. + +#pragma once + +#include +#include + +namespace pos2gpu { + +// Maximum L-side rows that can fall into any single (section, match_key) +// bucket at the given (k, section_bits). Used to size the persistent +// pool AND as the safe over-launch upper bound for the match kernels' +// `blocks_x` dimension. Over-launched threads early-exit on the +// `l >= l_end` guard at the top of the match body, so slight +// over-launch is free on the GPU. +// +// Formula mirrors pos2-chip's TableConstructorGeneric.hpp:23. +inline std::size_t max_pairs_per_section(int k, int num_section_bits) noexcept +{ + int const extra_margin_bits = 8 - ((28 - k) / 2); + return (1ULL << (k - num_section_bits)) + (1ULL << (k - extra_margin_bits)); +} + +} // namespace pos2gpu diff --git a/tools/parity/ParityCommon.hpp b/tools/parity/ParityCommon.hpp new file mode 100644 index 0000000..9e0660c --- /dev/null +++ b/tools/parity/ParityCommon.hpp @@ -0,0 +1,83 @@ +// ParityCommon.hpp — shared harness helpers for the parity tests. +// +// Keeps the PRNG seed shape, mismatch-reporting format, and the CUDA +// error-check macro consistent across every `*_parity` / `*_bench` +// binary in this directory. The audit that motivated this header +// found ~170 lines of verbatim copy-paste across 7-9 files (same +// derive_plot_id, same Stats/compare shape, same CHECK macro). 
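Since max_pairs_per_section (a few hunks back) is a one-line formula that doubles as both the pool size and the match kernels' safe over-launch bound, a worked instance may help. The num_section_bits value below is an assumption for illustration — the real value comes from ProofParams — and the arithmetic is just the formula evaluated by hand:

    #include "host/PoolSizing.hpp"
    #include <cstdio>

    // Worked instances (num_section_bits assumed = 2, i.e. 4 sections):
    //   k=28: extra_margin_bits = 8 - (28-28)/2 = 8
    //         2^(28-2) + 2^(28-8) = 67,108,864 + 1,048,576 = 68,157,440 rows
    //   k=18: extra_margin_bits = 8 - (28-18)/2 = 3
    //         2^(18-2) + 2^(18-3) =     65,536 +    32,768 =     98,304 rows
    int main()
    {
        std::printf("%zu\n", pos2gpu::max_pairs_per_section(28, 2)); // 68157440
        std::printf("%zu\n", pos2gpu::max_pairs_per_section(18, 2)); // 98304
    }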
+// +// Plain-header (inline) so .cu and .cpp TUs can both include it +// without changing the existing CMake layout. No library target +// needed. + +#pragma once + +#include +#include +#include +#include + +// CUDA error-check macro. Only meaningful inside a .cu TU (where +// cuda_runtime.h is in scope). Guarded behind __CUDACC__ so the +// header can still be included from plain .cpp parity tests for +// derive_plot_id / Stats / compare without pulling in CUDA. +#ifdef __CUDACC__ +#include +#define PARITY_CHECK(call) do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::fprintf(stderr, "CUDA error at %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(err)); \ + std::exit(2); \ + } \ +} while (0) +#endif + +namespace pos2gpu::parity { + +// Deterministic mixing from a 32-bit seed to a 32-byte plot_id. Not +// cryptographic — just spreads bits so parity tests for distinct seeds +// exercise non-trivially different plot_ids. Golden-ratio + splitmix- +// style step. +inline std::array derive_plot_id(uint32_t seed) +{ + std::array id{}; + uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; + for (std::size_t i = 0; i < id.size(); ++i) { + s = s * 6364136223846793005ULL + 1442695040888963407ULL; + id[i] = static_cast(s >> 56); + } + return id; +} + +// Mismatch counter with pretty-print of the first 5 errors per +// (seed, label). Keeps test output useful when a regression lands: +// you see which labelled comparison first diverges and at what +// index, without a multi-thousand-line fault log. +struct Stats { + uint64_t total = 0; + uint64_t mismatches = 0; + bool ok() const { return mismatches == 0; } +}; + +// Cmp is any `bool(uint64_t i)` — returns true when host index i +// agrees between CPU reference and GPU result. +template +Stats compare(uint64_t n, Cmp const& cmp, char const* label, uint32_t seed) +{ + Stats s; + s.total = n; + for (uint64_t i = 0; i < n; ++i) { + if (!cmp(i)) { + if (s.mismatches < 5) { + std::printf(" [seed=%u %s] MISMATCH at i=%llu\n", + seed, label, + static_cast(i)); + } + ++s.mismatches; + } + } + return s; +} + +} // namespace pos2gpu::parity diff --git a/tools/parity/aes_parity.cu b/tools/parity/aes_parity.cu index e39cc2c..db37f6f 100644 --- a/tools/parity/aes_parity.cu +++ b/tools/parity/aes_parity.cu @@ -19,6 +19,8 @@ #include "pos/aes/AesHash.hpp" #include "pos/aes/intrin_portable.h" +#include "ParityCommon.hpp" + #include #include #include @@ -29,6 +31,10 @@ namespace { +using pos2gpu::parity::derive_plot_id; +using pos2gpu::parity::Stats; +using pos2gpu::parity::compare; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -122,40 +128,6 @@ std::vector launch_and_collect( return out; \ }() -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - // Deterministic mixing — not crypto, just spreads bits across all 32 bytes. 
- uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - -struct Stats { - uint64_t total = 0; - uint64_t mismatches = 0; - bool ok() const { return mismatches == 0; } -}; - -template -Stats compare(uint64_t n, Cmp const& cmp, char const* label, uint32_t seed) -{ - Stats s; s.total = n; - for (uint64_t i = 0; i < n; ++i) { - if (!cmp(i)) { - if (s.mismatches < 5) { - std::printf(" [seed=%u %s] MISMATCH at i=%llu\n", seed, label, - static_cast(i)); - } - ++s.mismatches; - } - } - return s; -} - // Per-plot-id full sweep. bool run_for_plot_id(uint32_t seed) { diff --git a/tools/parity/sycl_bucket_offsets_parity.cpp b/tools/parity/sycl_bucket_offsets_parity.cpp new file mode 100644 index 0000000..e48730c --- /dev/null +++ b/tools/parity/sycl_bucket_offsets_parity.cpp @@ -0,0 +1,168 @@ +// sycl_bucket_offsets_parity — SYCL port of compute_bucket_offsets +// (src/gpu/T1Kernel.cu:58) verified against a CPU reference on synthetic +// input. First slice of the SYCL backend port: proves the AdaptiveCpp +// toolchain works end-to-end before we touch the production pipeline. +// +// The kernel is "for each bucket b in [0, num_buckets), find the lowest +// index i in `sorted` such that (sorted[i].match_info >> shift) >= b" — +// one thread per bucket runs a binary search and writes offsets[b]. +// Thread num_buckets writes the sentinel offsets[num_buckets] = total. +// +// Synthetic input: a sorted random XsCandidateGpu[] with match_info +// drawn uniformly from [0, num_buckets << shift) so every bucket is +// non-trivially populated. Reference is std::lower_bound on the same +// shifted key. Pass criterion: byte-for-byte memcmp of offsets[]. + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +// Local copy of pos2gpu::XsCandidateGpu — keeps this TU free of the +// CUDA-laden gpu/XsKernel.cuh include chain. Layout-checked below. 
+struct XsCandidateGpu { + uint32_t match_info; + uint32_t x; +}; +static_assert(sizeof(XsCandidateGpu) == 8, "must match pos2-chip Xs_Candidate layout"); + +std::vector make_sorted_input(uint64_t total, uint64_t value_range, uint32_t seed) +{ + std::mt19937_64 rng(seed); + std::vector v(total); + for (uint64_t i = 0; i < total; ++i) { + v[i].match_info = static_cast(rng() % value_range); + v[i].x = static_cast(rng()); + } + std::sort(v.begin(), v.end(), + [](XsCandidateGpu const& a, XsCandidateGpu const& b) { + return a.match_info < b.match_info; + }); + return v; +} + +std::vector reference_offsets( + std::vector const& sorted, + int num_match_target_bits, + uint32_t num_buckets) +{ + std::vector offsets(num_buckets + 1); + uint32_t const shift = static_cast(num_match_target_bits); + uint64_t const total = sorted.size(); + for (uint32_t b = 0; b < num_buckets; ++b) { + uint64_t lo = 0, hi = total; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t v = sorted[mid].match_info >> shift; + if (v < b) lo = mid + 1; + else hi = mid; + } + offsets[b] = lo; + } + offsets[num_buckets] = total; + return offsets; +} + +std::vector sycl_offsets( + sycl::queue& q, + std::vector const& sorted, + int num_match_target_bits, + uint32_t num_buckets) +{ + uint64_t const total = sorted.size(); + size_t const out_count = static_cast(num_buckets) + 1; + constexpr size_t threads = 256; + size_t const groups = (out_count + threads - 1) / threads; + + XsCandidateGpu* d_sorted = sycl::malloc_device(total, q); + uint64_t* d_offsets = sycl::malloc_device(out_count, q); + + q.memcpy(d_sorted, sorted.data(), sizeof(XsCandidateGpu) * total).wait(); + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=](sycl::nd_item<1> it) { + uint32_t b = static_cast(it.get_global_id(0)); + if (b > num_buckets) return; + if (b == num_buckets) { d_offsets[num_buckets] = total; return; } + + uint32_t bucket_shift = static_cast(num_match_target_bits); + uint64_t lo = 0, hi = total; + while (lo < hi) { + uint64_t mid = lo + ((hi - lo) >> 1); + uint32_t v = d_sorted[mid].match_info >> bucket_shift; + if (v < b) lo = mid + 1; + else hi = mid; + } + d_offsets[b] = lo; + }).wait(); + + std::vector out(out_count); + q.memcpy(out.data(), d_offsets, sizeof(uint64_t) * out_count).wait(); + + sycl::free(d_sorted, q); + sycl::free(d_offsets, q); + return out; +} + +bool run_for(sycl::queue& q, uint32_t seed, uint64_t total, + int num_match_target_bits, uint32_t num_buckets) +{ + uint64_t const value_range = uint64_t(num_buckets) << num_match_target_bits; + auto sorted = make_sorted_input(total, value_range, seed); + auto reference = reference_offsets(sorted, num_match_target_bits, num_buckets); + auto actual = sycl_offsets(q, sorted, num_match_target_bits, num_buckets); + + if (std::memcmp(reference.data(), actual.data(), + sizeof(uint64_t) * reference.size()) == 0) { + std::printf("PASS seed=%u total=%llu shift=%d buckets=%u\n", + seed, (unsigned long long)total, + num_match_target_bits, num_buckets); + return true; + } + for (size_t i = 0; i < reference.size(); ++i) { + if (reference[i] != actual[i]) { + std::fprintf(stderr, + "FAIL seed=%u bucket=%zu ref=%llu actual=%llu\n", + seed, i, + (unsigned long long)reference[i], + (unsigned long long)actual[i]); + break; + } + } + return false; +} + +} // namespace + +int main() +{ + sycl::queue q{ sycl::default_selector_v }; + std::printf("device: %s\n", + q.get_device().get_info().c_str()); + + // Sizes representative of T1 at small k (slice 1 is correctness, 
not perf). + // num_buckets = num_sections (4) * num_match_keys (4) = 16 for k<28. + struct Case { uint64_t total; int shift; uint32_t buckets; }; + Case const cases[] = { + { 1ull << 18, 14, 16 }, // k=18 + { 1ull << 20, 16, 16 }, // k=20 + { 1ull << 22, 18, 16 }, // k=22 + { 1ull << 24, 20, 16 }, // k=24 + }; + + bool all_pass = true; + for (uint32_t seed : { 1u, 7u, 31u }) { + for (auto const& c : cases) { + if (!run_for(q, seed, c.total, c.shift, c.buckets)) all_pass = false; + } + } + return all_pass ? 0 : 1; +} diff --git a/tools/parity/sycl_g_x_parity.cpp b/tools/parity/sycl_g_x_parity.cpp new file mode 100644 index 0000000..1389007 --- /dev/null +++ b/tools/parity/sycl_g_x_parity.cpp @@ -0,0 +1,120 @@ +// sycl_g_x_parity — validates the SYCL-compiled AES g_x_smem against the +// same function run on the host. Both compile from the same C++ source in +// AesHashGpu.cuh (the _smem family, now fully portable behind the +// PortableAttrs macros), but one goes through acpp's SSCP backend into a +// device kernel and the other through the host C++ compiler. Any +// codegen-introduced divergence shows up byte-by-byte here. +// +// For x in [0, 1< + +#include +#include +#include +#include +#include +#include + +namespace { + +std::array derive_plot_id(uint32_t seed) +{ + std::array id{}; + uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; + for (size_t i = 0; i < id.size(); ++i) { + s = s * 6364136223846793005ULL + 1442695040888963407ULL; + id[i] = static_cast(s >> 56); + } + return id; +} + +// Build the 4×256 uint32_t sT layout the _smem AES functions expect, +// pulling the values from AesTables.inl so the same data feeds both +// the host reference and the device buffer. +std::vector build_sT() +{ + std::vector sT(4 * 256); + for (int i = 0; i < 256; ++i) { + sT[0 * 256 + i] = pos2gpu::aes_tables::T0[i]; + sT[1 * 256 + i] = pos2gpu::aes_tables::T1[i]; + sT[2 * 256 + i] = pos2gpu::aes_tables::T2[i]; + sT[3 * 256 + i] = pos2gpu::aes_tables::T3[i]; + } + return sT; +} + +bool run_for(sycl::queue& q, uint32_t seed, int k) +{ + uint64_t const N = 1ull << k; + auto plot_id = derive_plot_id(seed); + auto keys = pos2gpu::make_keys(plot_id.data()); + auto sT_host = build_sT(); + + std::vector ref(N); + for (uint64_t x = 0; x < N; ++x) { + ref[x] = pos2gpu::g_x_smem(keys, static_cast(x), k, sT_host.data()); + } + + uint32_t* d_sT = sycl::malloc_device(4 * 256, q); + uint32_t* d_out = sycl::malloc_device(N, q); + q.memcpy(d_sT, sT_host.data(), sizeof(uint32_t) * 4 * 256).wait(); + + constexpr size_t threads = 256; + size_t const groups = (N + threads - 1) / threads; + + q.parallel_for( + sycl::nd_range<1>{ groups * threads, threads }, + [=, keys_copy = keys](sycl::nd_item<1> it) { + uint64_t x = it.get_global_id(0); + if (x >= N) return; + d_out[x] = pos2gpu::g_x_smem(keys_copy, static_cast(x), k, d_sT); + }).wait(); + + std::vector actual(N); + q.memcpy(actual.data(), d_out, sizeof(uint32_t) * N).wait(); + sycl::free(d_sT, q); + sycl::free(d_out, q); + + if (std::memcmp(ref.data(), actual.data(), sizeof(uint32_t) * N) == 0) { + std::printf("PASS seed=%u k=%d N=%llu\n", + seed, k, (unsigned long long)N); + return true; + } + for (uint64_t x = 0; x < N; ++x) { + if (ref[x] != actual[x]) { + std::fprintf(stderr, + "FAIL seed=%u k=%d x=%llu ref=0x%08x actual=0x%08x\n", + seed, k, (unsigned long long)x, ref[x], actual[x]); + break; + } + } + return false; +} + +} // namespace + +int main() +{ + sycl::queue q{ sycl::gpu_selector_v }; + std::printf("device: %s\n", + 
q.get_device().get_info().c_str()); + + bool all_pass = true; + for (uint32_t seed : { 1u, 7u, 31u }) { + for (int k : { 14, 16, 18 }) { + if (!run_for(q, seed, k)) all_pass = false; + } + } + return all_pass ? 0 : 1; +} diff --git a/tools/parity/sycl_sort_parity.cpp b/tools/parity/sycl_sort_parity.cpp new file mode 100644 index 0000000..ff36235 --- /dev/null +++ b/tools/parity/sycl_sort_parity.cpp @@ -0,0 +1,176 @@ +// sycl_sort_parity — exercises launch_sort_pairs_u32_u32 and +// launch_sort_keys_u64 on synthetic input and compares against a +// std::sort reference. Built always (independent of XCHPLOT2_BUILD_CUDA), +// so it validates whichever Sort backend is wired into pos2_gpu: +// CUB on the NVIDIA build, oneDPL on the SYCL/AdaptiveCpp build. +// +// Pass criterion: byte-identical sorted streams. + +#include "gpu/Sort.cuh" +#include "gpu/SyclBackend.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +bool run_pairs(uint32_t seed, uint64_t count) +{ + auto& q = pos2gpu::sycl_backend::queue(); + + // Use unique keys (shuffled 0..count-1) so stable and unstable sorts + // produce byte-identical output — lets us test both CUB (stable) and + // the hand-rolled SYCL radix (unstable within equal keys) the same way. + std::mt19937_64 rng(seed); + std::vector h_keys(count), h_vals(count); + for (uint64_t i = 0; i < count; ++i) { + h_keys[i] = static_cast(i); + h_vals[i] = static_cast(i); + } + std::shuffle(h_keys.begin(), h_keys.end(), rng); + + // Reference: std::sort over indices by key. + std::vector ref_keys = h_keys; + std::vector ref_vals = h_vals; + { + std::vector idx(count); + for (uint64_t i = 0; i < count; ++i) idx[i] = static_cast(i); + std::sort(idx.begin(), idx.end(), + [&](uint32_t a, uint32_t b) { return h_keys[a] < h_keys[b]; }); + for (uint64_t i = 0; i < count; ++i) { + ref_keys[i] = h_keys[idx[i]]; + ref_vals[i] = h_vals[idx[i]]; + } + } + + uint32_t* d_keys_in = sycl::malloc_device(count, q); + uint32_t* d_keys_out = sycl::malloc_device(count, q); + uint32_t* d_vals_in = sycl::malloc_device(count, q); + uint32_t* d_vals_out = sycl::malloc_device(count, q); + q.memcpy(d_keys_in, h_keys.data(), sizeof(uint32_t) * count); + q.memcpy(d_vals_in, h_vals.data(), sizeof(uint32_t) * count).wait(); + + size_t scratch_bytes = 0; + pos2gpu::launch_sort_pairs_u32_u32( + nullptr, scratch_bytes, + nullptr, nullptr, nullptr, nullptr, + count, 0, 32, q); + + void* d_scratch = scratch_bytes ? sycl::malloc_device(scratch_bytes, q) : nullptr; + + auto const t0 = std::chrono::steady_clock::now(); + pos2gpu::launch_sort_pairs_u32_u32( + d_scratch ? 
d_scratch : reinterpret_cast(uintptr_t{1}), // any non-null + scratch_bytes, + d_keys_in, d_keys_out, + d_vals_in, d_vals_out, + count, 0, 32, q); + q.wait(); + auto const t1 = std::chrono::steady_clock::now(); + double const ms = std::chrono::duration(t1 - t0).count(); + + std::vector h_sorted_keys(count), h_sorted_vals(count); + q.memcpy(h_sorted_keys.data(), d_keys_out, sizeof(uint32_t) * count); + q.memcpy(h_sorted_vals.data(), d_vals_out, sizeof(uint32_t) * count).wait(); + + if (d_scratch) sycl::free(d_scratch, q); + sycl::free(d_keys_in, q); + sycl::free(d_keys_out, q); + sycl::free(d_vals_in, q); + sycl::free(d_vals_out, q); + + bool const keys_ok = std::memcmp(ref_keys.data(), h_sorted_keys.data(), + sizeof(uint32_t) * count) == 0; + bool const vals_ok = std::memcmp(ref_vals.data(), h_sorted_vals.data(), + sizeof(uint32_t) * count) == 0; + bool const sorted = std::is_sorted(h_sorted_keys.begin(), + h_sorted_keys.end()); + bool const ok = keys_ok && vals_ok; + std::printf("%s pairs seed=%u count=%llu [keys=%d vals=%d sorted=%d %.2fms]\n", + ok ? "PASS" : "FAIL", seed, (unsigned long long)count, + keys_ok, vals_ok, sorted, ms); + if (!ok) { + uint64_t const show = std::min(count, 16); + std::printf(" got [0..%llu): ", (unsigned long long)show); + for (uint64_t i = 0; i < show; ++i) std::printf("%u ", h_sorted_keys[i]); + std::printf("\n ref [0..%llu): ", (unsigned long long)show); + for (uint64_t i = 0; i < show; ++i) std::printf("%u ", ref_keys[i]); + std::printf("\n got [N-%llu..N): ", (unsigned long long)show); + for (uint64_t i = count - show; i < count; ++i) std::printf("%u ", h_sorted_keys[i]); + std::printf("\n"); + } + return ok; +} + +bool run_keys(uint32_t seed, uint64_t count) +{ + auto& q = pos2gpu::sycl_backend::queue(); + + std::mt19937_64 rng(seed); + std::vector h_keys(count); + for (uint64_t i = 0; i < count; ++i) { + h_keys[i] = rng() & 0x0000FFFFFFFFFFFFull; // ~48-bit keys + } + + std::vector ref = h_keys; + std::sort(ref.begin(), ref.end()); + + uint64_t* d_in = sycl::malloc_device(count, q); + uint64_t* d_out = sycl::malloc_device(count, q); + q.memcpy(d_in, h_keys.data(), sizeof(uint64_t) * count).wait(); + + size_t scratch_bytes = 0; + pos2gpu::launch_sort_keys_u64(nullptr, scratch_bytes, nullptr, nullptr, + count, 0, 48, q); + void* d_scratch = scratch_bytes ? sycl::malloc_device(scratch_bytes, q) : nullptr; + auto const t0 = std::chrono::steady_clock::now(); + pos2gpu::launch_sort_keys_u64( + d_scratch ? d_scratch : reinterpret_cast(uintptr_t{1}), + scratch_bytes, + d_in, d_out, + count, 0, 48, q); + q.wait(); + auto const t1 = std::chrono::steady_clock::now(); + double const ms = std::chrono::duration(t1 - t0).count(); + + std::vector h_sorted(count); + q.memcpy(h_sorted.data(), d_out, sizeof(uint64_t) * count).wait(); + + if (d_scratch) sycl::free(d_scratch, q); + sycl::free(d_in, q); + sycl::free(d_out, q); + + bool const ok = std::memcmp(ref.data(), h_sorted.data(), + sizeof(uint64_t) * count) == 0; + bool const sorted = std::is_sorted(h_sorted.begin(), h_sorted.end()); + std::printf("%s keys seed=%u count=%llu [match=%d sorted=%d %.2fms]\n", + ok ? 
"PASS" : "FAIL", seed, (unsigned long long)count, + ok, sorted, ms); + return ok; +} + +} // namespace + +int main() +{ + auto& q = pos2gpu::sycl_backend::queue(); + std::printf("device: %s\n", + q.get_device().get_info().c_str()); + + bool all_pass = true; + for (uint32_t seed : { 1u, 7u, 31u }) { + for (uint64_t n : { 16ull, 1ull << 14, 1ull << 18, 1ull << 20 }) { + if (!run_pairs(seed, n)) all_pass = false; + if (!run_keys (seed, n)) all_pass = false; + } + } + return all_pass ? 0 : 1; +} diff --git a/tools/parity/sycl_t1_parity.cpp b/tools/parity/sycl_t1_parity.cpp new file mode 100644 index 0000000..9ddb4ad --- /dev/null +++ b/tools/parity/sycl_t1_parity.cpp @@ -0,0 +1,317 @@ +// sycl_t1_parity — SYCL-native sibling of t1_parity.cu. Builds on every +// backend (CUDA / HIP / Level Zero / OMP) so the T1 matcher can be +// validated against the pos2-chip CPU reference on AMD and Intel +// devices, where the .cu version isn't compiled. +// +// Same comparison semantics as t1_parity.cu: both CPU and GPU outputs +// are sorted by (match_info, meta_hi, meta_lo) and compared as a set. +// Bit-exactness of the SET is what determines correctness for the +// downstream T2/T3/proof pipeline — the post-construct sort by +// match_info collapses the order in which matches were emitted. +// +// Usage: +// ./sycl_t1_parity # default sweep +// ./sycl_t1_parity --k 20 # single-k smoke test +// ./sycl_t1_parity --k 20 --strength 4 # custom strength +// +// The default sweep stays small (k <= 18) so it fits on 8 GiB cards +// and so the CPU reference completes in seconds. --k lets a triage +// session push the matcher to the largest k that fits on the device. + +#include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" +#include "gpu/XsKernel.cuh" +#include "gpu/T1Kernel.cuh" + +#include "plot/PlotLayout.hpp" +#include "plot/TableConstructorGeneric.hpp" +#include "pos/ProofCore.hpp" +#include "pos/ProofParams.hpp" + +#include "ParityCommon.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using pos2gpu::parity::derive_plot_id; + +struct PairKey { + uint32_t mi; + uint32_t lo; + uint32_t hi; + bool operator<(PairKey const& o) const noexcept { + if (mi != o.mi) return mi < o.mi; + if (hi != o.hi) return hi < o.hi; + return lo < o.lo; + } + bool operator==(PairKey const& o) const noexcept { + return mi == o.mi && lo == o.lo && hi == o.hi; + } +}; + +template +T* sycl_alloc_device(sycl::queue& q, std::size_t n, char const* what) +{ + T* p = sycl::malloc_device(n, q); + if (!p) { + std::fprintf(stderr, " FAIL: sycl::malloc_device(%s, %zu * %zu B)\n", + what, n, sizeof(T)); + std::exit(2); + } + return p; +} + +bool run_for_id(sycl::queue& q, + std::array const& plot_id, + char const* label, + int k, + int strength) +{ + uint64_t const total = 1ULL << k; + std::printf("[%s k=%d strength=%d N=%llu]\n", + label, k, strength, static_cast(total)); + + ProofParams params(plot_id.data(), + static_cast(k), + static_cast(strength), + /*testnet=*/uint8_t{0}); + + // ---- CPU reference (XsConstructor → Table1Constructor::construct) ---- + std::size_t max_section_pairs = max_pairs_per_section_possible(params); + std::size_t num_sections = static_cast(params.get_num_sections()); + std::size_t max_pairs = max_section_pairs * num_sections; + std::size_t max_element_bytes = std::max({sizeof(Xs_Candidate), sizeof(T1Pairing), + sizeof(T2Pairing), sizeof(T3Pairing)}); + PlotLayout layout(max_section_pairs, num_sections, max_element_bytes, + 
/*minor_scratch_bytes=*/2 * 1024 * 1024); + + auto xsV = layout.xs(); + XsConstructor xs_ctor(params); + auto xs_sorted = xs_ctor.construct(xsV.out, xsV.post_sort_tmp, xsV.minor); + + // Mirror t1_parity.cu: if XsConstructor returned its output in the + // PrimaryOut slot, copy aside so T1's construct (which writes its + // output into PrimaryOut) doesn't corrupt the input. + if (xs_sorted.data() == xsV.out.data()) { + std::copy(xsV.out.begin(), xsV.out.end(), xsV.post_sort_tmp.begin()); + xs_sorted = xsV.post_sort_tmp.first(xs_sorted.size()); + } + + auto t1V = layout.t1(); + Table1Constructor t1_ctor(params, t1V.target, t1V.minor); + auto t1_pairs = t1_ctor.construct(xs_sorted, t1V.out, t1V.post_sort_tmp); + + std::vector cpu_keys; + cpu_keys.reserve(t1_pairs.size()); + for (auto const& p : t1_pairs) { + cpu_keys.push_back({p.match_info, p.meta_lo, p.meta_hi}); + } + std::sort(cpu_keys.begin(), cpu_keys.end()); + std::printf(" CPU produced %zu T1Pairings\n", cpu_keys.size()); + + // ---- GPU pipeline: launch_construct_xs, then launch_t1_match ---- + auto* d_xs = sycl_alloc_device(q, total, "d_xs"); + + std::size_t xs_temp_bytes = 0; + pos2gpu::launch_construct_xs(plot_id.data(), k, /*testnet=*/false, + nullptr, nullptr, &xs_temp_bytes, q); + void* d_xs_temp = sycl_alloc_device(q, xs_temp_bytes, "d_xs_temp"); + pos2gpu::launch_construct_xs(plot_id.data(), k, /*testnet=*/false, + d_xs, d_xs_temp, &xs_temp_bytes, q); + q.wait(); + + auto t1p = pos2gpu::make_t1_params(k, strength); + uint64_t const capacity = static_cast(max_pairs); + + auto* d_t1_meta = sycl_alloc_device(q, capacity, "d_t1_meta"); + auto* d_t1_mi = sycl_alloc_device(q, capacity, "d_t1_mi"); + auto* d_t1_count = sycl_alloc_device(q, 1, "d_t1_count"); + + // Mirror GpuPipeline.cpp: the streaming pipeline always memsets + // d_counter to 0 before the real launch_t1_match call. The size- + // query call below doesn't touch d_t1_count, but the real call's + // launch_t1_match_prepare also memsets it — keep the explicit + // pre-zero to make the test a one-shot if the prepare path ever + // changes. 
+ q.memset(d_t1_count, 0, sizeof(uint64_t)).wait(); + + std::size_t t1_temp_bytes = 0; + pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, + nullptr, nullptr, d_t1_count, capacity, + nullptr, &t1_temp_bytes, q); + void* d_t1_temp = sycl_alloc_device(q, t1_temp_bytes, "d_t1_temp"); + pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, + d_t1_meta, d_t1_mi, d_t1_count, capacity, + d_t1_temp, &t1_temp_bytes, q); + q.wait(); + + uint64_t gpu_count = 0; + q.memcpy(&gpu_count, d_t1_count, sizeof(uint64_t)).wait(); + + auto free_all = [&]() { + sycl::free(d_t1_temp, q); + sycl::free(d_t1_count, q); + sycl::free(d_t1_mi, q); + sycl::free(d_t1_meta, q); + sycl::free(d_xs_temp, q); + sycl::free(d_xs, q); + }; + + if (gpu_count > capacity) { + std::printf(" GPU OVERFLOW: emitted %llu but capacity %llu\n", + static_cast(gpu_count), + static_cast(capacity)); + free_all(); + return false; + } + + std::vector h_meta(gpu_count); + std::vector h_mi (gpu_count); + if (gpu_count > 0) { + q.memcpy(h_meta.data(), d_t1_meta, sizeof(uint64_t) * gpu_count).wait(); + q.memcpy(h_mi.data(), d_t1_mi, sizeof(uint32_t) * gpu_count).wait(); + } + free_all(); + + std::vector gpu_keys; + gpu_keys.reserve(gpu_count); + for (uint64_t i = 0; i < gpu_count; ++i) { + uint32_t meta_lo = static_cast(h_meta[i]); + uint32_t meta_hi = static_cast(h_meta[i] >> 32); + gpu_keys.push_back({h_mi[i], meta_lo, meta_hi}); + } + std::sort(gpu_keys.begin(), gpu_keys.end()); + std::printf(" GPU produced %zu T1Pairings\n", gpu_keys.size()); + + if (cpu_keys.size() != gpu_keys.size()) { + std::printf(" count mismatch (CPU %zu vs GPU %zu) — analysing overlap\n", + cpu_keys.size(), gpu_keys.size()); + std::size_t in_cpu_only = 0, in_gpu_only = 0, common = 0; + std::vector only_in_gpu; + std::size_t i = 0, j = 0; + while (i < cpu_keys.size() && j < gpu_keys.size()) { + if (cpu_keys[i] == gpu_keys[j]) { ++common; ++i; ++j; } + else if (cpu_keys[i] < gpu_keys[j]) { ++in_cpu_only; ++i; } + else { + if (only_in_gpu.size() < 5) only_in_gpu.push_back(gpu_keys[j]); + ++in_gpu_only; ++j; + } + } + in_cpu_only += cpu_keys.size() - i; + while (j < gpu_keys.size()) { + if (only_in_gpu.size() < 5) only_in_gpu.push_back(gpu_keys[j]); + ++in_gpu_only; + ++j; + } + std::printf(" common=%zu cpu_only=%zu gpu_only=%zu\n", + common, in_cpu_only, in_gpu_only); + for (auto const& p : only_in_gpu) { + uint64_t meta = (uint64_t(p.hi) << 32) | uint64_t(p.lo); + uint32_t x_l = static_cast(meta >> static_cast(k)); + uint32_t x_r = static_cast(meta & ((1ULL << k) - 1)); + std::printf(" GPU-only sample: x_l=%u x_r=%u match_info=0x%08x\n", + x_l, x_r, p.mi); + } + return false; + } + + uint64_t mismatches = 0; + for (std::size_t i = 0; i < cpu_keys.size(); ++i) { + if (!(cpu_keys[i] == gpu_keys[i])) { + if (mismatches < 5) { + std::printf(" MISMATCH at i=%zu cpu=(mi=0x%08x lo=0x%08x hi=0x%08x) " + "gpu=(mi=0x%08x lo=0x%08x hi=0x%08x)\n", + i, + cpu_keys[i].mi, cpu_keys[i].lo, cpu_keys[i].hi, + gpu_keys[i].mi, gpu_keys[i].lo, gpu_keys[i].hi); + } + ++mismatches; + } + } + if (mismatches == 0) { + std::printf(" OK %zu / %zu T1Pairings match (sorted set comparison)\n", + cpu_keys.size(), cpu_keys.size()); + return true; + } + std::printf(" FAIL %llu mismatches / %zu\n", + static_cast(mismatches), cpu_keys.size()); + return false; +} + +bool parse_int_arg(std::string_view sv, int& out) +{ + auto const* first = sv.data(); + auto const* last = sv.data() + sv.size(); + auto r = std::from_chars(first, last, out); + return r.ec == std::errc{} && r.ptr == last; +} + +} 
// namespace + +int main(int argc, char** argv) +{ + pos2gpu::initialize_aes_tables(); + + int k_override = -1; + int strength_override = -1; + for (int i = 1; i + 1 < argc; ++i) { + std::string_view a = argv[i]; + if (a == "--k") { (void)parse_int_arg(argv[++i], k_override); } + else if (a == "--strength") { (void)parse_int_arg(argv[++i], strength_override); } + } + + sycl::queue q{ sycl::gpu_selector_v }; + std::printf("device: %s\n", + q.get_device().get_info().c_str()); + + bool all_ok = true; + + if (k_override > 0) { + int const s = (strength_override > 0) ? strength_override : 2; + // Use the same fixed plot_id family as the default sweep so a + // user-driven --k 22 run is reproducible alongside the seed=1 + // baseline. + std::string label = "k=" + std::to_string(k_override) + + " strength=" + std::to_string(s); + all_ok = run_for_id(q, derive_plot_id(/*seed=*/1u), + label.c_str(), k_override, s) && all_ok; + } else { + // Default sweep — k=18 only, since launch_t1_match_prepare rejects + // k < 18 (smallest size for which num_match_target_bits exceeds the + // FINE_BITS=8 floor with sensible margin). Seed and strength + // coverage is deliberately narrower than t1_parity.cu because + // this binary is meant to be run as a quick-triage check on + // AMD/Intel hardware where the CUDA test isn't available — the + // full coverage is in t1_parity.cu on the CUDA build path. + for (uint32_t seed : { 1u, 7u, 31u, 0xCAFEBABEu, 0xDEADBEEFu }) { + std::string label = "seed=" + std::to_string(seed); + all_ok = run_for_id(q, derive_plot_id(seed), + label.c_str(), /*k=*/18, /*strength=*/2) + && all_ok; + } + // Strength sweep at k=18 — exercises the test_mask path through + // the matcher which scales with strength. strength=7 leaves + // num_match_target_bits=9, still above the FINE_BITS=8 floor. + for (int strength : { 3, 4, 5, 6, 7 }) { + std::string label = "seed=1 strength=" + std::to_string(strength); + all_ok = run_for_id(q, derive_plot_id(1u), + label.c_str(), /*k=*/18, strength) + && all_ok; + } + } + + std::printf("\n==> %s\n", all_ok ? "ALL OK" : "FAIL"); + return all_ok ? 0 : 1; +} diff --git a/tools/parity/t1_debug.cu b/tools/parity/t1_debug.cu index a44606c..01c2e04 100644 --- a/tools/parity/t1_debug.cu +++ b/tools/parity/t1_debug.cu @@ -9,6 +9,8 @@ #include "pos/ProofParams.hpp" #include "pos/ProofCore.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -19,16 +21,7 @@ namespace { -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} +using pos2gpu::parity::derive_plot_id; __global__ void test_kernel( pos2gpu::AesHashKeys keys, diff --git a/tools/parity/t1_parity.cu b/tools/parity/t1_parity.cu index 71c9652..8195ba9 100644 --- a/tools/parity/t1_parity.cu +++ b/tools/parity/t1_parity.cu @@ -7,6 +7,7 @@ // downstream T2/T3/proof pipeline. 
#include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/XsKernel.cuh" #include "gpu/T1Kernel.cuh" @@ -16,6 +17,8 @@ #include "pos/ProofCore.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -27,6 +30,8 @@ namespace { +using pos2gpu::parity::derive_plot_id; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -36,17 +41,6 @@ namespace { } \ } while (0) -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - struct PairKey { uint32_t mi; // match_info uint32_t lo; // meta_lo @@ -111,10 +105,10 @@ bool run_for_id(std::array const& plot_id, char const* label, int k pos2gpu::XsCandidateGpu* d_xs = nullptr; CHECK(cudaMalloc(&d_xs, sizeof(pos2gpu::XsCandidateGpu) * total)); size_t xs_temp_bytes = 0; - CHECK(pos2gpu::launch_construct_xs(plot_id.data(), k, false, nullptr, nullptr, &xs_temp_bytes)); + pos2gpu::launch_construct_xs(plot_id.data(), k, false, nullptr, nullptr, &xs_temp_bytes, pos2gpu::sycl_backend::queue()); void* d_xs_temp = nullptr; CHECK(cudaMalloc(&d_xs_temp, xs_temp_bytes)); - CHECK(pos2gpu::launch_construct_xs(plot_id.data(), k, false, d_xs, d_xs_temp, &xs_temp_bytes)); + pos2gpu::launch_construct_xs(plot_id.data(), k, false, d_xs, d_xs_temp, &xs_temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); auto t1p = pos2gpu::make_t1_params(k, strength); @@ -122,46 +116,55 @@ bool run_for_id(std::array const& plot_id, char const* label, int k // re-use it. uint64_t capacity = static_cast(max_pairs); - pos2gpu::T1PairingGpu* d_t1 = nullptr; - CHECK(cudaMalloc(&d_t1, sizeof(pos2gpu::T1PairingGpu) * capacity)); + // T1 match emits SoA: (uint64 meta, uint32 mi) parallel streams. 
+ uint64_t* d_t1_meta = nullptr; + uint32_t* d_t1_mi = nullptr; + CHECK(cudaMalloc(&d_t1_meta, sizeof(uint64_t) * capacity)); + CHECK(cudaMalloc(&d_t1_mi, sizeof(uint32_t) * capacity)); uint64_t* d_t1_count = nullptr; CHECK(cudaMalloc(&d_t1_count, sizeof(uint64_t))); size_t t1_temp_bytes = 0; - CHECK(pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, - d_t1, d_t1_count, capacity, - nullptr, &t1_temp_bytes)); + pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, + nullptr, nullptr, d_t1_count, capacity, + nullptr, &t1_temp_bytes, pos2gpu::sycl_backend::queue()); void* d_t1_temp = nullptr; CHECK(cudaMalloc(&d_t1_temp, t1_temp_bytes)); - CHECK(pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, - d_t1, d_t1_count, capacity, - d_t1_temp, &t1_temp_bytes)); + pos2gpu::launch_t1_match(plot_id.data(), t1p, d_xs, total, + d_t1_meta, d_t1_mi, d_t1_count, capacity, + d_t1_temp, &t1_temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); uint64_t gpu_count = 0; CHECK(cudaMemcpy(&gpu_count, d_t1_count, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + auto free_all = [&]() { + cudaFree(d_t1_temp); cudaFree(d_t1_count); + cudaFree(d_t1_meta); cudaFree(d_t1_mi); + cudaFree(d_xs_temp); cudaFree(d_xs); + }; + if (gpu_count > capacity) { std::printf(" GPU OVERFLOW: emitted %llu but capacity %llu\n", (unsigned long long)gpu_count, (unsigned long long)capacity); - cudaFree(d_t1_temp); cudaFree(d_t1_count); cudaFree(d_t1); - cudaFree(d_xs_temp); cudaFree(d_xs); + free_all(); return false; } - std::vector gpu_pairs(gpu_count); + std::vector h_meta(gpu_count); + std::vector h_mi (gpu_count); if (gpu_count > 0) { - CHECK(cudaMemcpy(gpu_pairs.data(), d_t1, - sizeof(pos2gpu::T1PairingGpu) * gpu_count, - cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_meta.data(), d_t1_meta, sizeof(uint64_t) * gpu_count, cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_mi.data(), d_t1_mi, sizeof(uint32_t) * gpu_count, cudaMemcpyDeviceToHost)); } - cudaFree(d_t1_temp); cudaFree(d_t1_count); cudaFree(d_t1); - cudaFree(d_xs_temp); cudaFree(d_xs); + free_all(); std::vector gpu_keys; - gpu_keys.reserve(gpu_pairs.size()); - for (auto const& p : gpu_pairs) { - gpu_keys.push_back({p.match_info, p.meta_lo, p.meta_hi}); + gpu_keys.reserve(gpu_count); + for (uint64_t i = 0; i < gpu_count; ++i) { + uint32_t meta_lo = uint32_t(h_meta[i]); + uint32_t meta_hi = uint32_t(h_meta[i] >> 32); + gpu_keys.push_back({h_mi[i], meta_lo, meta_hi}); } std::sort(gpu_keys.begin(), gpu_keys.end()); diff --git a/tools/parity/t2_parity.cu b/tools/parity/t2_parity.cu index dcb8550..4d7e80e 100644 --- a/tools/parity/t2_parity.cu +++ b/tools/parity/t2_parity.cu @@ -6,6 +6,7 @@ // correctness, which is already validated by t1_parity. 
#include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/T1Kernel.cuh" #include "gpu/T2Kernel.cuh" @@ -15,6 +16,8 @@ #include "pos/ProofCore.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -26,6 +29,8 @@ namespace { +using pos2gpu::parity::derive_plot_id; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -35,17 +40,6 @@ namespace { } \ } while (0) -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - // Sort key for T2Pairing: (match_info, x_bits, meta) — fully canonicalises // the pair regardless of emission order. struct T2Key { @@ -149,44 +143,59 @@ bool run_for_id(std::array const& plot_id, char const* label, int k auto t2p = pos2gpu::make_t2_params(k, strength); uint64_t capacity = static_cast(max_pairs); - pos2gpu::T2PairingGpu* d_t2 = nullptr; - CHECK(cudaMalloc(&d_t2, sizeof(pos2gpu::T2PairingGpu) * capacity)); + // T2 match emits SoA: three parallel streams. + uint64_t* d_t2_meta = nullptr; + uint32_t* d_t2_mi = nullptr; + uint32_t* d_t2_xbits = nullptr; + CHECK(cudaMalloc(&d_t2_meta, sizeof(uint64_t) * capacity)); + CHECK(cudaMalloc(&d_t2_mi, sizeof(uint32_t) * capacity)); + CHECK(cudaMalloc(&d_t2_xbits, sizeof(uint32_t) * capacity)); uint64_t* d_t2_count = nullptr; CHECK(cudaMalloc(&d_t2_count, sizeof(uint64_t))); size_t t2_temp_bytes = 0; - CHECK(pos2gpu::launch_t2_match(plot_id.data(), t2p, nullptr, nullptr, t1_snapshot.size(), - d_t2, d_t2_count, capacity, - nullptr, &t2_temp_bytes)); + pos2gpu::launch_t2_match(plot_id.data(), t2p, nullptr, nullptr, t1_snapshot.size(), + nullptr, nullptr, nullptr, + d_t2_count, capacity, + nullptr, &t2_temp_bytes, pos2gpu::sycl_backend::queue()); void* d_t2_temp = nullptr; CHECK(cudaMalloc(&d_t2_temp, t2_temp_bytes)); - CHECK(pos2gpu::launch_t2_match(plot_id.data(), t2p, d_t1_meta, d_t1_mi, t1_snapshot.size(), - d_t2, d_t2_count, capacity, - d_t2_temp, &t2_temp_bytes)); + pos2gpu::launch_t2_match(plot_id.data(), t2p, d_t1_meta, d_t1_mi, t1_snapshot.size(), + d_t2_meta, d_t2_mi, d_t2_xbits, + d_t2_count, capacity, + d_t2_temp, &t2_temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); uint64_t gpu_count = 0; CHECK(cudaMemcpy(&gpu_count, d_t2_count, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + auto free_all = [&]() { + cudaFree(d_t2_temp); cudaFree(d_t2_count); + cudaFree(d_t2_meta); cudaFree(d_t2_mi); cudaFree(d_t2_xbits); + cudaFree(d_t1_mi); cudaFree(d_t1_meta); cudaFree(d_t1); + }; + if (gpu_count > capacity) { std::printf(" GPU OVERFLOW: %llu / %llu\n", (unsigned long long)gpu_count, (unsigned long long)capacity); - cudaFree(d_t2_temp); cudaFree(d_t2_count); cudaFree(d_t2); cudaFree(d_t1_mi); cudaFree(d_t1_meta); cudaFree(d_t1); + free_all(); return false; } - std::vector gpu_pairs(gpu_count); + std::vector h_meta (gpu_count); + std::vector h_mi (gpu_count); + std::vector h_xbits(gpu_count); if (gpu_count > 0) { - CHECK(cudaMemcpy(gpu_pairs.data(), d_t2, - sizeof(pos2gpu::T2PairingGpu) * gpu_count, - cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_meta.data(), d_t2_meta, sizeof(uint64_t) * gpu_count, cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_mi.data(), d_t2_mi, sizeof(uint32_t) * gpu_count, cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h_xbits.data(), d_t2_xbits, 
sizeof(uint32_t) * gpu_count, cudaMemcpyDeviceToHost)); } - cudaFree(d_t2_temp); cudaFree(d_t2_count); cudaFree(d_t2); cudaFree(d_t1_mi); cudaFree(d_t1_meta); cudaFree(d_t1); + free_all(); std::vector gpu_keys; - gpu_keys.reserve(gpu_pairs.size()); - for (auto const& p : gpu_pairs) { - gpu_keys.push_back({p.match_info, p.x_bits, p.meta}); + gpu_keys.reserve(gpu_count); + for (uint64_t i = 0; i < gpu_count; ++i) { + gpu_keys.push_back({h_mi[i], h_xbits[i], h_meta[i]}); } std::sort(gpu_keys.begin(), gpu_keys.end()); diff --git a/tools/parity/t3_parity.cu b/tools/parity/t3_parity.cu index 3fb606b..0085dff 100644 --- a/tools/parity/t3_parity.cu +++ b/tools/parity/t3_parity.cu @@ -5,6 +5,7 @@ // from upstream phases (already validated by t1_parity / t2_parity). #include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/T2Kernel.cuh" #include "gpu/T3Kernel.cuh" @@ -14,6 +15,8 @@ #include "pos/ProofCore.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -25,6 +28,8 @@ namespace { +using pos2gpu::parity::derive_plot_id; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -34,17 +39,6 @@ namespace { } \ } while (0) -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - bool run_for_id(std::array const& plot_id, char const* label, int k, int strength) { uint64_t const total = 1ULL << k; @@ -145,18 +139,18 @@ bool run_for_id(std::array const& plot_id, char const* label, int k CHECK(cudaMalloc(&d_t3_count, sizeof(uint64_t))); size_t t3_temp_bytes = 0; - CHECK(pos2gpu::launch_t3_match(plot_id.data(), t3p, + pos2gpu::launch_t3_match(plot_id.data(), t3p, d_t2_meta, d_t2_xbits, nullptr, t2_snapshot.size(), d_t3, d_t3_count, capacity, - nullptr, &t3_temp_bytes)); + nullptr, &t3_temp_bytes, pos2gpu::sycl_backend::queue()); void* d_t3_temp = nullptr; CHECK(cudaMalloc(&d_t3_temp, t3_temp_bytes)); - CHECK(pos2gpu::launch_t3_match(plot_id.data(), t3p, + pos2gpu::launch_t3_match(plot_id.data(), t3p, d_t2_meta, d_t2_xbits, d_t2_mi, t2_snapshot.size(), d_t3, d_t3_count, capacity, - d_t3_temp, &t3_temp_bytes)); + d_t3_temp, &t3_temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); uint64_t gpu_count = 0; diff --git a/tools/parity/xs_bench.cu b/tools/parity/xs_bench.cu index b0fd563..1dad15e 100644 --- a/tools/parity/xs_bench.cu +++ b/tools/parity/xs_bench.cu @@ -4,11 +4,14 @@ // chase further down the pipeline. 
#include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/XsKernel.cuh" #include "plot/TableConstructorGeneric.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -26,16 +29,7 @@ } \ } while (0) -static std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} +using pos2gpu::parity::derive_plot_id; static double bench_cpu(uint8_t const* plot_id, int k) { @@ -62,16 +56,16 @@ static double bench_gpu(uint8_t const* plot_id, int k) CHECK(cudaMalloc(&d_out, sizeof(pos2gpu::XsCandidateGpu) * total)); size_t temp_bytes = 0; - CHECK(pos2gpu::launch_construct_xs(plot_id, k, false, nullptr, nullptr, &temp_bytes)); + pos2gpu::launch_construct_xs(plot_id, k, false, nullptr, nullptr, &temp_bytes, pos2gpu::sycl_backend::queue()); void* d_temp = nullptr; CHECK(cudaMalloc(&d_temp, temp_bytes)); // Warm up to amortise context init. - CHECK(pos2gpu::launch_construct_xs(plot_id, k, false, d_out, d_temp, &temp_bytes)); + pos2gpu::launch_construct_xs(plot_id, k, false, d_out, d_temp, &temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); auto t0 = std::chrono::steady_clock::now(); - CHECK(pos2gpu::launch_construct_xs(plot_id, k, false, d_out, d_temp, &temp_bytes)); + pos2gpu::launch_construct_xs(plot_id, k, false, d_out, d_temp, &temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); auto t1 = std::chrono::steady_clock::now(); diff --git a/tools/parity/xs_parity.cu b/tools/parity/xs_parity.cu index f743bdd..b06d922 100644 --- a/tools/parity/xs_parity.cu +++ b/tools/parity/xs_parity.cu @@ -6,12 +6,15 @@ // (match_info, x) pair matches in order. #include "gpu/AesGpu.cuh" +#include "gpu/SyclBackend.hpp" #include "gpu/XsKernel.cuh" // pos2-chip headers for the CPU reference. 
#include "plot/TableConstructorGeneric.hpp" #include "pos/ProofParams.hpp" +#include "ParityCommon.hpp" + #include #include #include @@ -23,6 +26,8 @@ namespace { +using pos2gpu::parity::derive_plot_id; + #define CHECK(call) do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ @@ -32,17 +37,6 @@ namespace { } \ } while (0) -std::array derive_plot_id(uint32_t seed) -{ - std::array id{}; - uint64_t s = 0x9E3779B97F4A7C15ULL ^ uint64_t(seed) * 0x100000001B3ULL; - for (size_t i = 0; i < id.size(); ++i) { - s = s * 6364136223846793005ULL + 1442695040888963407ULL; - id[i] = static_cast(s >> 56); - } - return id; -} - bool run_for(uint32_t seed, int k, bool testnet) { auto plot_id = derive_plot_id(seed); @@ -84,26 +78,16 @@ bool run_for(uint32_t seed, int k, bool testnet) CHECK(cudaMalloc(&d_out, sizeof(pos2gpu::XsCandidateGpu) * total)); size_t temp_bytes = 0; - auto err = pos2gpu::launch_construct_xs( + pos2gpu::launch_construct_xs( plot_id.data(), k, testnet, /*d_out=*/nullptr, /*d_temp_storage=*/nullptr, - &temp_bytes); - if (err != cudaSuccess) { - std::fprintf(stderr, " query temp_bytes failed: %s\n", cudaGetErrorString(err)); - return false; - } - + &temp_bytes, pos2gpu::sycl_backend::queue()); void* d_temp = nullptr; CHECK(cudaMalloc(&d_temp, temp_bytes)); - err = pos2gpu::launch_construct_xs( - plot_id.data(), k, testnet, d_out, d_temp, &temp_bytes); - if (err != cudaSuccess) { - std::fprintf(stderr, " launch failed: %s\n", cudaGetErrorString(err)); - cudaFree(d_temp); cudaFree(d_out); - return false; - } + pos2gpu::launch_construct_xs( + plot_id.data(), k, testnet, d_out, d_temp, &temp_bytes, pos2gpu::sycl_backend::queue()); CHECK(cudaDeviceSynchronize()); std::vector gpu_out(total); diff --git a/tools/sanity/hellosycl.cpp b/tools/sanity/hellosycl.cpp new file mode 100644 index 0000000..11cf500 --- /dev/null +++ b/tools/sanity/hellosycl.cpp @@ -0,0 +1,80 @@ +// hellosycl.cpp — minimal SYCL kernel-dispatch sanity check. +// +// Allocates 16 uint32_t on device, sentinel-fills via memset, runs a +// trivial parallel_for that writes a known pattern, copies back, prints +// pass/fail per slot. Exit 0 if all slots match expected values, else +// non-zero with a "FAIL" line for each mismatch. +// +// Used to localize "is AdaptiveCpp's HIP / CUDA backend actually +// dispatching kernels on this host?" before climbing the abstraction +// stack to sycl_t1_parity / xchplot2. If hellosycl FAILs, no +// xchplot2-level fix can recover the device — the issue is below our +// level (driver mismatch, missing libcudart / libamdhip64, AdaptiveCpp +// JIT producing no-op stubs, ACPP_TARGETS pointing at an ISA the +// installed AdaptiveCpp can't lower for, …). 
+// +// Compile via the project CMake build (rpath + includes set up +// automatically): +// +// cmake --build build --target hellosycl +// ./build/tools/sanity/hellosycl +// +// Or standalone, mirroring whatever ACPP_TARGETS the production binary +// is using (see the cargo:warning lines from `cargo install`): +// +// ACPP_TARGETS=hip:gfx1013 /opt/adaptivecpp/bin/acpp -O2 hellosycl.cpp -o hellosycl +// LD_LIBRARY_PATH=/opt/rocm/lib ./hellosycl + +#include <sycl/sycl.hpp> + +#include <cstdint> +#include <cstdio> + +int main() +{ + sycl::queue q; + std::printf("Device: %s\n", + q.get_device().get_info<sycl::info::device::name>().c_str()); + + constexpr std::size_t N = 16; + constexpr std::uint32_t kPattern = 0x12340000u; + + std::uint32_t* d = sycl::malloc_device<std::uint32_t>(N, q); + if (!d) { + std::printf("FAIL: sycl::malloc_device returned null\n"); + return 1; + } + + // Sentinel-fill (0xABABABAB): a "kernel didn't write" outcome shows + // 0xAB, distinct from "kernel wrote a wrong value" (shows something + // else) and from random uninitialised bytes that might happen to + // match the expected pattern by coincidence. + q.memset(d, 0xAB, N * sizeof(std::uint32_t)).wait(); + q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> it) { + std::size_t idx = it.get_global_id(0); + d[idx] = kPattern | static_cast<std::uint32_t>(idx); + }).wait(); + + std::uint32_t h[N]; + q.memcpy(h, d, N * sizeof(std::uint32_t)).wait(); + sycl::free(d, q); + + int fails = 0; + for (std::size_t i = 0; i < N; ++i) { + std::uint32_t want = kPattern | static_cast<std::uint32_t>(i); + std::printf("[%2zu] got=0x%08x want=0x%08x %s\n", + i, h[i], want, h[i] == want ? "OK" : "FAIL"); + if (h[i] != want) ++fails; + } + + if (fails == 0) { + std::printf("\nALL OK — AdaptiveCpp can dispatch trivial kernels on this device.\n"); + } else { + std::printf("\nFAIL — %d/%zu slot(s) wrong. Common causes:\n" + " - libcudart / libamdhip64 not in rpath (check ldd of this binary)\n" + " - AdaptiveCpp JIT producing no-op stubs (ACPP_DEBUG_LEVEL=2 to see)\n" + " - ACPP_TARGETS picks an ISA the installed AdaptiveCpp can't lower\n", + fails, N); + } + return fails == 0 ? 0 : 1; +} diff --git a/tools/xchplot2/cli.cpp b/tools/xchplot2/cli.cpp index 6cfa62f..de7a5c9 100644 --- a/tools/xchplot2/cli.cpp +++ b/tools/xchplot2/cli.cpp @@ -6,20 +6,29 @@ // BLS keys via the keygen-rs Rust shim, then dispatches through // batch internally. The "real" entrypoint for users. +#include "gpu/SyclDeviceList.hpp" // list_gpu_devices() — backs the + // `devices` subcommand below. Plain + // types only; the SYCL include lives + // in SyclDeviceList.cpp (acpp-built). #include "host/GpuPlotter.hpp" #include "host/BatchPlotter.hpp" +#include "host/Cancel.hpp" +#include "host/PlotFileWriterParallel.hpp" #include "pos2_keygen.h" // Rust shim for plot_id + memo derivation #include #include +#include #include #include #include #include +#include #include #include #include #include +#include #include namespace { @@ -32,12 +41,15 @@ void print_usage(char const* prog) << " [-T|--testnet] [-o|--out DIR] [-m|--memo HEX] [-N|--out-name NAME]\n" << " [--gpu-t1] [--gpu-t2] [--gpu-t3] [-G|--gpu-all] [-P|--profile]\n" << " " << prog << " batch <manifest> [-v|--verbose]\n" + << " [--skip-existing] [--continue-on-error]\n" + << " [--devices SPEC]\n" << " Manifest: one plot per non-empty/non-# line, whitespace-separated:\n" << " k strength plot_index meta_group testnet plot_id_hex memo_hex out_dir out_name\n" << " Runs GPU compute and CPU FSE in a producer/consumer pipeline so they overlap\n" << " across consecutive plots.
~2x throughput vs separate `test` invocations.\n" << " " << prog << " plot -k K -n N -f HEX ( -p HEX | --pool-ph HEX | -c xch1... )\n" << " [-s S] [-o DIR] [-T] [-i N] [-g N] [-S HEX] [-v]\n" + << " [--skip-existing] [--continue-on-error]\n" << " Standalone farmable plot(s): derives plot_id + memo internally\n" << " from the keys via chia-rs, then batches through the GPU pipeline.\n" << " -f, --farmer-pk HEX : 96 hex chars (48 B G1 public key).\n" @@ -57,6 +69,51 @@ void print_usage(char const* prog) << " fresh /dev/urandom per plot.\n" << " -T, --testnet : testnet proof parameters.\n" << " -v, --verbose : per-plot progress on stderr.\n" + << " --skip-existing : skip plots whose output file is already a\n" + << " complete .plot2 (magic + non-trivial size).\n" + << " --continue-on-error : log per-plot failures and keep going\n" + << " instead of aborting the batch.\n" + << " --devices SPEC : multi-device. SPEC is a comma\n" + << " list mixing any of:\n" + << " all — every GPU + CPU\n" + << " gpu — every visible GPU\n" + << " cpu — CPU worker only (slow)\n" + << " 0,1,3 — explicit GPU ids\n" + << " e.g. gpu,cpu == all.\n" + << " Omitted = single device via default\n" + << " SYCL selector (zero-config).\n" + << " --cpu : add a CPU worker alongside the\n" + << " selected GPUs (or use CPU only when\n" + << " no GPU is selected). Plotting on CPU\n" + << " is 1-2 orders of magnitude slower\n" + << " than GPU; intended for GPU-less\n" + << " hosts or as an extra worker.\n" + << " --tier plain|compact|minimal|auto : force streaming pipeline tier\n" + << " when GPU pool doesn't fit. plain =\n" + << " ~7.24 GB floor (k=28), faster.\n" + << " compact = ~5.33 GB floor, fits on\n" + << " tight 8 GB cards. minimal = ~3.83 GB\n" + << " floor, fits on 4 GiB cards (extra\n" + << " PCIe round-trips during T2 match).\n" + << " auto (default) = pick the largest\n" + << " tier that fits. Equivalent to\n" + << " XCHPLOT2_STREAMING_TIER env var;\n" + << " CLI flag wins if both set.\n" + << " " << prog << " verify [--trials N]\n" + << " Open and run N random challenges through the CPU prover.\n" + << " Zero proofs across a sensible sample (>=100) strongly indicates a\n" + << " corrupt plot. Default N=100.\n" + << " " << prog << " parity-check [--dir PATH]\n" + << " Run every *_parity binary in PATH and summarize PASS/FAIL.\n" + << " Default PATH is ./build/tools/parity. Build the tests with\n" + << " `cmake --build ` first. 
Useful for post-refactor\n" + << " regression screening.\n" + << " " << prog << " devices\n" + << " List every visible SYCL GPU device + the host CPU plotter\n" + << " with id, name, backend, capacity, and which sort path the\n" + << " runtime dispatcher will route a worker to (CUB on cuda-\n" + << " backend devices when this build links CUB, otherwise SortSycl).\n" + << " Use the printed [N] / [cpu] index with --devices in plot/batch.\n" << "\n" << " test-mode positional args:\n" << " : even integer in [18, 32]\n" @@ -72,7 +129,18 @@ void print_usage(char const* prog) << " -N, --out-name NAME: override output filename (basename only)\n" << " --gpu-tN : run phase N on GPU (T1/T2/T3); default CPU\n" << " -G, --gpu-all : run all phases on GPU (where implemented)\n" - << " -P, --profile : print per-phase device-time breakdown\n"; + << " -P, --profile : print per-phase device-time breakdown\n" + << "\n" + << " Environment variables:\n" + << " XCHPLOT2_STREAMING=1 force the low-VRAM streaming pipeline even\n" + << " when the persistent pool would fit.\n" + << " POS2GPU_MAX_VRAM_MB=N cap the pool/streaming VRAM query to N MB\n" + << " (useful for testing the streaming fallback).\n" + << " POS2GPU_STREAMING_STATS=1 log every streaming-path alloc / free.\n" + << " POS2GPU_POOL_DEBUG=1 log pool allocation sizes at construction.\n" + << " POS2GPU_PHASE_TIMING=1 per-phase wall-time breakdown on stderr.\n" + << " ACPP_GFX=gfxXXXX AMD only — required at build time to AOT\n" + << " for the right amdgcn ISA (see README).\n"; } bool parse_hex_bytes(std::string const& s, std::vector& out) @@ -124,6 +192,67 @@ void read_urandom(uint8_t* out, size_t n) } } +// Parse a --devices value into BatchOptions. +// +// Accepted forms: +// "all" → use every GPU visible at runtime (sets +// use_all_devices; device_ids stays empty). +// "0" → use only GPU id 0. +// "0,2,3" → use these specific device ids, in sorted order. +// +// Zero-configuration default (no flag) produces device_ids.empty() and +// use_all_devices=false — which triggers the single-device +// gpu_selector_v path, identical to pre-multi-GPU behavior. +// +// Returns false on malformed input (caller prints usage + exits 1). +bool parse_devices_arg(std::string const& s, pos2gpu::BatchOptions& opts) +{ + // Accept comma-separated mix of: + // "all" → every GPU + the CPU worker + // "gpu" → every visible GPU only + // "cpu" → the CPU worker only + // "" → opts.device_ids.push_back(int) (real GPU index) + // "cpu" alone is OK; otherwise at least one GPU token is required. + opts.device_ids.clear(); + bool any_token = false; + bool any_gpu_token = false; + size_t start = 0; + while (start <= s.size()) { + size_t const end = s.find(',', start); + std::string const tok = s.substr( + start, end == std::string::npos ? 
std::string::npos : end - start); + if (tok.empty()) return false; + any_token = true; + if (tok == "all") { + opts.use_all_devices = true; + opts.include_cpu = true; + any_gpu_token = true; + } else if (tok == "gpu") { + opts.use_all_devices = true; + any_gpu_token = true; + } else if (tok == "cpu") { + opts.include_cpu = true; + } else { + char* endp = nullptr; + long const v = std::strtol(tok.c_str(), &endp, 10); + if (endp == tok.c_str() || *endp != '\0' || v < 0 || v > 1023) { + return false; + } + opts.device_ids.push_back(static_cast(v)); + any_gpu_token = true; + } + if (end == std::string::npos) break; + start = end + 1; + } + if (!any_token) return false; + if (!any_gpu_token && !opts.include_cpu) return false; + std::sort(opts.device_ids.begin(), opts.device_ids.end()); + opts.device_ids.erase( + std::unique(opts.device_ids.begin(), opts.device_ids.end()), + opts.device_ids.end()); + return true; +} + std::string plot_id_to_filename(int k, std::array const& plot_id) { // Match chia plots create's v2 filename scheme: plot-k{size}-{id}.plot2 @@ -142,6 +271,8 @@ std::string plot_id_to_filename(int k, std::array const& plot_id) extern "C" int xchplot2_main(int argc, char* argv[]) { + pos2gpu::install_cancel_signal_handlers(); + if (argc < 2) { print_usage(argv[0]); return 1; @@ -149,29 +280,218 @@ extern "C" int xchplot2_main(int argc, char* argv[]) std::string mode = argv[1]; + if (mode == "devices") { + // Enumerate every visible SYCL GPU device + the CPU plotter + // (always available via AdaptiveCpp's OpenMP host backend). + // Reports id, name, backend, capacity, and which sort path + // the runtime dispatcher will route a worker on this device + // to (CUB on cuda-backend queues when this build links the + // CUB sort path; SortSycl otherwise — see SortDispatch.cpp). + // Use the printed `[N]` / `[cpu]` index with `--devices`. + auto devices = pos2gpu::list_gpu_devices(); + std::printf("Visible devices (%zu GPU + 1 CPU):\n", devices.size()); + for (auto const& d : devices) { + std::size_t vram_mb = + static_cast(d.vram_bytes / (1024ull * 1024ull)); +#ifdef XCHPLOT2_HAVE_CUB + char const* sort_hint = d.is_cuda_backend ? "CUB" : "SYCL"; +#else + char const* sort_hint = "SYCL"; +#endif + std::printf(" [%zu] %-32s backend=%-10s vram=%5zu MB CUs=%-4u sort:%s\n", + d.id, d.name.c_str(), d.backend.c_str(), + vram_mb, d.cu_count, sort_hint); + } + // CPU row. hardware_concurrency() returns 0 when it can't + // figure out the count (rare), in which case print "?". + unsigned threads = std::thread::hardware_concurrency(); + if (threads == 0) { + std::printf(" [cpu] %-32s backend=%-10s threads= ? sort:SYCL (1-2 orders slower than GPU)\n", + "Host CPU plotter", "omp"); + } else { + std::printf(" [cpu] %-32s backend=%-10s threads=%-4u sort:SYCL (1-2 orders slower than GPU)\n", + "Host CPU plotter", "omp", threads); + } + if (devices.empty()) { + std::printf("\nNo GPU devices visible to AdaptiveCpp / SYCL.\n" + "Check rocminfo / nvidia-smi, ACPP_VISIBILITY_MASK, and that the\n" + "relevant SYCL backend was built into AdaptiveCpp.\n" + "The CPU plotter is always available via `--devices cpu` or `--cpu`.\n"); + } else { + std::printf("\nUse `--devices N` (id) for a specific GPU,\n" + " `--devices gpu` for every GPU,\n" + " `--devices cpu` for the host CPU only,\n" + " `--devices all` for every GPU + CPU,\n" + " or any comma combination (e.g. 
`0,2,cpu`).\n"); + } + return 0; + } + if (mode == "batch") { if (argc < 3) { print_usage(argv[0]); return 1; } std::string manifest = argv[2]; - bool verbose = false; + pos2gpu::BatchOptions opts{}; for (int i = 3; i < argc; ++i) { std::string a = argv[i]; - if (a == "-v" || a == "--verbose") verbose = true; + if (a == "-v" || a == "--verbose") opts.verbose = true; + else if (a == "--skip-existing") opts.skip_existing = true; + else if (a == "--continue-on-error") opts.continue_on_error = true; + else if (a == "--cpu") opts.include_cpu = true; + else if (a == "--tier" && i + 1 < argc) { + std::string t = argv[++i]; + if (t != "plain" && t != "compact" && t != "minimal" && t != "auto") { + std::cerr << "Error: --tier expects 'plain', 'compact', " + "'minimal', or 'auto' (got '" << t << "')\n"; + return 1; + } + opts.streaming_tier = (t == "auto") ? "" : t; + } + else if (a == "--devices" && i + 1 < argc) { + if (!parse_devices_arg(argv[++i], opts)) { + std::cerr << "Error: --devices expects 'all', 'cpu', or a " + "comma-separated list of device ids " + "(got '" << argv[i] << "')\n"; + return 1; + } + } + else { + std::cerr << "Error: unknown argument: " << a << "\n"; + print_usage(argv[0]); + return 1; + } } try { auto entries = pos2gpu::parse_manifest(manifest); std::cerr << "[batch] " << entries.size() << " plots queued\n"; - auto res = pos2gpu::run_batch(entries, verbose); - double per = res.plots_written ? res.total_wall_seconds / res.plots_written : 0; + auto res = pos2gpu::run_batch(entries, opts); + double per = res.plots_written + ? res.total_wall_seconds / double(res.plots_written) : 0; std::cerr << "[batch] wrote " << res.plots_written << " plots in " << res.total_wall_seconds << " s (" - << per << " s/plot)\n"; - return 0; + << per << " s/plot)"; + if (res.plots_skipped) std::cerr << "; skipped " << res.plots_skipped; + if (res.plots_failed) std::cerr << "; failed " << res.plots_failed; + std::cerr << "\n"; + return (res.plots_failed > 0) ? 
3 : 0; } catch (std::exception const& e) { std::cerr << "[batch] FAILED: " << e.what() << "\n"; return 2; } } + if (mode == "verify") { + if (argc < 3) { print_usage(argv[0]); return 1; } + std::string plotfile = argv[2]; + size_t trials = 100; + for (int i = 3; i < argc; ++i) { + std::string a = argv[i]; + if ((a == "--trials" || a == "-n") && i + 1 < argc) { + long v = std::atol(argv[++i]); + if (v <= 0) { + std::cerr << "Error: --trials must be > 0\n"; + return 1; + } + trials = static_cast(v); + } else { + std::cerr << "Error: unknown argument: " << a << "\n"; + print_usage(argv[0]); + return 1; + } + } + try { + std::cerr << "[verify] " << plotfile << ": running " << trials + << " random challenges\n"; + auto res = pos2gpu::verify_plot_file(plotfile, trials); + std::cerr << "[verify] " << res.trials << " trials, " + << res.challenges_with_proof << " with >=1 proof, " + << res.proofs_found << " proofs total\n"; + if (res.proofs_found == 0) { + std::cerr << "[verify] FAIL: no proofs produced — plot is " + "likely corrupt\n"; + return 4; + } + std::cerr << "[verify] OK\n"; + return 0; + } catch (std::exception const& e) { + std::cerr << "[verify] FAILED: " << e.what() << "\n"; + return 2; + } + } + + if (mode == "parity-check") { + std::string dir = "./build/tools/parity"; + for (int i = 2; i < argc; ++i) { + std::string a = argv[i]; + if ((a == "--dir" || a == "-d") && i + 1 < argc) { + dir = argv[++i]; + } else { + std::cerr << "Error: unknown argument: " << a << "\n"; + print_usage(argv[0]); + return 1; + } + } + + // Glob every *_parity binary in `dir`. Same code path works for + // both branches — main ships sycl_*_parity extras that cuda-only + // doesn't, and the wildcard picks up whichever actually exists. + std::vector tests; + std::error_code ec; + if (std::filesystem::is_directory(dir, ec)) { + for (auto const& entry : + std::filesystem::directory_iterator(dir, ec)) + { + auto const name = entry.path().filename().string(); + constexpr char const kSuffix[] = "_parity"; + constexpr size_t kLen = sizeof(kSuffix) - 1; + bool const ends = + name.size() >= kLen && + name.compare(name.size() - kLen, kLen, kSuffix) == 0; + if (ends && entry.is_regular_file(ec)) { + tests.push_back(entry.path()); + } + } + } + if (tests.empty()) { + std::cerr << "No `*_parity` binaries found under " << dir << ".\n" + "Build them first:\n" + " cmake -B build -S . -DCMAKE_BUILD_TYPE=Release\n" + " cmake --build build --parallel\n" + "Then re-run from the repo root, or pass --dir .\n"; + return 2; + } + std::sort(tests.begin(), tests.end()); + + int pass = 0, fail = 0; + std::cerr << "==> parity tests (" << tests.size() << " found in " + << dir << ")\n"; + for (auto const& test : tests) { + auto const name = test.filename().string(); + std::string const log_path = + "/tmp/xchplot2-parity-" + name + ".log"; + // Redirecting through the shell: `test` is a path we + // generated ourselves from a directory listing — no user- + // controlled shell metachars reach this string. 
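+ // On POSIX, std::system() returns the raw wait() status rather than the + // child's exit code, so the "exit %d" reported below prints e.g. 256 for + // a test that exits with code 1; rc == 0 remains the only PASS condition.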
+ std::string const cmd = + test.string() + " >" + log_path + " 2>&1"; + auto const t0 = std::chrono::steady_clock::now(); + int const rc = std::system(cmd.c_str()); + auto const ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0).count(); + if (rc == 0) { + std::fprintf(stderr, " PASS %-32s (%.1f ms)\n", + name.c_str(), ms); + ++pass; + } else { + std::fprintf(stderr, + " FAIL %-32s (exit %d; log: %s)\n", + name.c_str(), rc, log_path.c_str()); + ++fail; + } + } + std::fprintf(stderr, "\n==> %d passed, %d failed\n", pass, fail); + return fail > 0 ? 1 : 0; + } + if (mode == "plot") { // Standalone farmable-plot path: derive plot_id + memo internally. int k = 28; @@ -181,9 +501,15 @@ extern "C" int xchplot2_main(int argc, char* argv[]) int meta_group = 0; bool testnet = false; bool verbose = false; + bool skip_existing = false; + bool continue_on_error = false; std::string out_dir = "."; std::string farmer_pk_hex, pool_pk_hex, pool_ph_hex, pool_addr; std::string seed_hex; + std::vector plot_device_ids; + bool plot_use_all_devices = false; + bool plot_include_cpu = false; + std::string plot_streaming_tier; for (int i = 2; i < argc; ++i) { std::string a = argv[i]; @@ -207,6 +533,30 @@ extern "C" int xchplot2_main(int argc, char* argv[]) else if ((a == "--seed" || a == "-S") && need(1)) seed_hex = argv[++i]; else if (a == "--testnet" || a == "-T") testnet = true; else if (a == "-v" || a == "--verbose") verbose = true; + else if (a == "--skip-existing") skip_existing = true; + else if (a == "--continue-on-error") continue_on_error = true; + else if (a == "--cpu") plot_include_cpu = true; + else if (a == "--tier" && need(1)) { + std::string t = argv[++i]; + if (t != "plain" && t != "compact" && t != "minimal" && t != "auto") { + std::cerr << "Error: --tier expects 'plain', 'compact', " + "'minimal', or 'auto' (got '" << t << "')\n"; + return 1; + } + plot_streaming_tier = (t == "auto") ? 
"" : t; + } + else if (a == "--devices" && need(1)) { + pos2gpu::BatchOptions tmp; + if (!parse_devices_arg(argv[++i], tmp)) { + std::cerr << "Error: --devices expects 'all', 'cpu', or a " + "comma-separated list of device ids " + "(got '" << argv[i] << "')\n"; + return 1; + } + plot_device_ids = std::move(tmp.device_ids); + plot_use_all_devices = tmp.use_all_devices; + if (tmp.include_cpu) plot_include_cpu = true; + } else { std::cerr << "Error: unknown argument: " << a << "\n"; print_usage(argv[0]); @@ -222,9 +572,14 @@ extern "C" int xchplot2_main(int argc, char* argv[]) int const pool_specs = int(!pool_pk_hex.empty()) + int(!pool_ph_hex.empty()) + int(!pool_addr.empty()); - if (pool_specs != 1) { - std::cerr << "Error: exactly one of --pool-pk, --pool-ph, " - "--pool-contract-address is required\n"; + if (pool_specs == 0) { + std::cerr << "Error: a pool destination is required — pick one of " + "--pool-pk, --pool-ph, --pool-contract-address\n"; + return 1; + } + if (pool_specs > 1) { + std::cerr << "Error: --pool-pk, --pool-ph, and --pool-contract-address " + "are mutually exclusive (saw " << pool_specs << ")\n"; return 1; } if (num < 1) { @@ -350,16 +705,27 @@ extern "C" int xchplot2_main(int argc, char* argv[]) } } - auto res = pos2gpu::run_batch(entries, verbose); + pos2gpu::BatchOptions opts{}; + opts.verbose = verbose; + opts.skip_existing = skip_existing; + opts.continue_on_error = continue_on_error; + opts.device_ids = plot_device_ids; + opts.use_all_devices = plot_use_all_devices; + opts.include_cpu = plot_include_cpu; + opts.streaming_tier = plot_streaming_tier; + auto res = pos2gpu::run_batch(entries, opts); double per = res.plots_written ? res.total_wall_seconds / double(res.plots_written) : 0; std::cerr << "[plot] wrote " << res.plots_written << " plots in " << res.total_wall_seconds << " s (" - << per << " s/plot)\n"; + << per << " s/plot)"; + if (res.plots_skipped) std::cerr << "; skipped " << res.plots_skipped; + if (res.plots_failed) std::cerr << "; failed " << res.plots_failed; + std::cerr << "\n"; for (auto const& e : entries) { std::cout << out_dir << "/" << e.out_name << "\n"; } - return 0; + return (res.plots_failed > 0) ? 3 : 0; } catch (std::exception const& e) { std::cerr << "[plot] FAILED: " << e.what() << "\n"; return 2; diff --git a/tools/xchplot2/cli_devlink.cu b/tools/xchplot2/cli_devlink.cu new file mode 100644 index 0000000..f5c9054 --- /dev/null +++ b/tools/xchplot2/cli_devlink.cu @@ -0,0 +1,37 @@ +// cli_devlink.cu — exists only to make xchplot2_cli a CUDA-language +// target so CMake's CUDA_RESOLVE_DEVICE_SYMBOLS=ON actually triggers +// nvcc --device-link at static-archive creation time. +// +// xchplot2_cli is the static lib that build.rs hands to Rust's +// linker (cargo install). It depends on pos2_gpu (the CUDA library +// with separable compilation) but has no CUDA sources of its own. +// Without this stub, CMake silently treats xchplot2_cli as a pure- +// C++ static lib, skips the device-link step regardless of +// CUDA_RESOLVE_DEVICE_SYMBOLS, and the resulting libxchplot2_cli.a +// has every per-TU `__sti____cudaRegisterAll()` constructor +// referencing an undefined `__cudaRegisterLinkedBinary_*` stub. +// Rust's `cc` host linker has no way to provide those — it doesn't +// know to invoke nvcc — so the final link fails. +// +// Touching this file via add_library(... 
cli_devlink.cu) flips +// xchplot2_cli to a CUDA-language target, the device-link runs at +// archive creation, the resolution stubs land inside the .a, and +// the host linker finds them with no extra work. +// +// First reported on a Debian/Ubuntu host with a real GTX 1060 + +// `CUDA_ARCHITECTURES=61 cargo install` — the symptom was a cascade +// of "undefined reference to __cudaRegisterLinkedBinary_*" on every +// .cu TU in pos2_gpu. + +namespace { + +// Anonymous-namespace `__device__` function — nvcc emits it into the +// per-TU device fatbinary, which gives the device-link step at least +// one input from this TU. Never called from anywhere; marked +// __device__ so it's compiled into the device-side fatbinary, not +// the host-side .o. +__device__ int xchplot2_cli_device_link_anchor() noexcept { + return 0; +} + +} // namespace
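The CMakeLists.txt side that consumes this stub is not shown in this patch; the mechanism described above corresponds roughly to wiring like the following (a sketch under assumptions: only the xchplot2_cli / pos2_gpu target names and the cli.cpp / cli_devlink.cu filenames come from the sources above, everything else is illustrative):

    # Sketch: make the static CLI archive a CUDA-language target so CMake
    # runs the device link (nvcc --device-link) at archive-creation time.
    add_library(xchplot2_cli STATIC
        cli.cpp
        cli_devlink.cu      # the stub TU above; flips the target to CUDA
    )
    set_target_properties(xchplot2_cli PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON
        # Embed the __cudaRegisterLinkedBinary_* resolution objects in the
        # .a so a non-nvcc host linker (Rust's cc) can consume it as-is.
        CUDA_RESOLVE_DEVICE_SYMBOLS ON
    )
    target_link_libraries(xchplot2_cli PRIVATE pos2_gpu)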