diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index cb5dbc4c866..8846a70465d 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -381,6 +381,13 @@ jobs: CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS: all_must_work run: run-tests pathfinder + - name: Run samples tests + if: ${{ inputs.test-mode == 'standard' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + run: run-tests samples + # ── Nightly: install wheels + optional dep together ── - name: Install cuda-python wheels + PyTorch if: ${{ inputs.test-mode == 'nightly-pytorch' }} diff --git a/.spdx-ignore b/.spdx-ignore index 3e2cca9446d..084e3f5bd02 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -8,6 +8,10 @@ LICENSE requirements*.txt cuda_bindings/examples/* +# Samples are synced to NVIDIA/cuda-samples on release and carry the upstream +# verbose BSD-style copyright header instead of the SPDX identifiers. +samples/**/*.py + # Vendored cuda_core/cuda/core/_include/dlpack.h cuda_core/cuda/core/_include/aoti_shim.h diff --git a/ci/tools/run-tests b/ci/tools/run-tests index 1ca54ba8207..9271a19341a 100755 --- a/ci/tools/run-tests +++ b/ci/tools/run-tests @@ -13,8 +13,8 @@ if [[ ${#} -ne 1 ]]; then echo "Error: This script requires exactly 1 argument. You provided ${#}" exit 1 fi -if [[ "${1}" != "bindings" && "${1}" != "core" && "${1}" != "pathfinder" && "${1}" != nightly-* ]]; then - echo "Error: Invalid test module '${1}'. Must be 'bindings', 'core', 'pathfinder', or 'nightly-*'" +if [[ "${1}" != "bindings" && "${1}" != "core" && "${1}" != "pathfinder" && "${1}" != "samples" && "${1}" != nightly-* ]]; then + echo "Error: Invalid test module '${1}'. 
Must be 'bindings', 'core', 'pathfinder', 'samples', or 'nightly-*'" exit 1 fi @@ -56,6 +56,20 @@ elif [[ "${test_module}" == "bindings" ]]; then ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython fi popd +elif [[ "${test_module}" == "samples" ]]; then + # Samples re-use whatever cuda-bindings + cuda-core packages are already + # installed by an earlier ``core`` invocation. Best-effort install the optional + # deps (cupy, which most samples need, plus nvtx and pillow), then run pytest. + TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< "${CUDA_VER}")" + echo "Installing optional sample deps (cupy-cuda${TEST_CUDA_MAJOR}x, nvtx, pillow)" + pip install --upgrade pip + pip install \ + "cupy-cuda${TEST_CUDA_MAJOR}x" \ + nvtx \ + pillow \ + || echo "Warning: optional sample deps install failed; affected samples will be waived" + echo "Running sample tests" + pytest -rxXs -v --durations=0 tests/samples/ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then # Shared setup for core and nightly modes. TEST_CUDA_MAJOR="$(cut -d '.' 
-f 1 <<< ${CUDA_VER})" diff --git a/conftest.py b/conftest.py index 7a0c59065d5..2ce0aef9de5 100644 --- a/conftest.py +++ b/conftest.py @@ -41,6 +41,10 @@ def pytest_collection_modifyitems(config, items): # noqa: ARG001 if nodeid.startswith("tests/integration/") or "/tests/integration/" in nodeid: item.add_marker(pytest.mark.smoke) + # Sample tests (orchestrator under tests/samples/, sample sources under samples/) + if nodeid.startswith("tests/samples/") or "/tests/samples/" in nodeid: + item.add_marker(pytest.mark.samples) + # Cython tests (any tests/cython subtree) if ( "/tests/cython/" in nodeid diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index b53c44c0ef9..ce2a9541ce5 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -2804,6 +2804,612 @@ environments: - conda: ../cuda_bindings build: py314hd7f1909_0 - conda: ../cuda_pathfinder + samples: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-7_kmp_llvm.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.16.1-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aom-3.14.1-pl5321h039972f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/backports.strenum-1.3.1-haf276df_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.6.17-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-he90730b_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-15.2.0-h53410ce_19.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.6-py314hd8ed1ab_100.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.3.3.3.1-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-crt-tools-13.3.33-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.3.29-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.3.29-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.3.29-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.3.29-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cuobjdump-13.3.29-hffce074_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cupti-13.3.35-h676940d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvcc-tools-13.3.33-he02047a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvdisasm-13.3.29-hffce074_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.3.33-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvtx-13.3.29-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.3.33-h69a702a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.3.33-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.3.33-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.3.33-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.3-hcbadf70_3.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/cupy-14.0.1-py314h31ce861_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cupy-core-14.0.1-py314hed3c566_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ffmpeg-8.1.2-gpl_h1bf8424_901.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/fmt-12.1.0-hff5e90c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.18.1-h27c8c51_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.14.3-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/fribidi-1.0.16-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.6.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-15.2.0-h0dff253_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-15.2.0-he0086c7_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gdk-pixbuf-2.44.6-h2b0a6b4_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/glslang-16.3.0-h96af755_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.3.0-py314h28848ee_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.15-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-15.2.0-h76987e4_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-15.2.0-hda75c37_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-14.2.1-h6083320_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/intel-gmmlib-22.10.0-hb700be7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/intel-media-driver-26.1.6-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-4.18.0-he073ed8_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/lame-3.100-h166bdaf_1003.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.19.1-h0c24ade_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.1.0-hdb68285_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/level-zero-1.29.0-hb700be7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20260526.0-cxx17_h7b12aa8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libass-0.17.4-h96ad9f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-8_h5875eb1_mkl.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.2.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.2.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.2.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.78-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-8_hfef963f_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcublas-13.5.1.27-h676940d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcudnn-9.23.1.3-ha4b6413_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcudss-0.8.0.10-h7bcfba5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufft-12.3.0.29-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.18.0.66-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurand-10.4.3.29-h676940d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcusolver-12.2.2.18-h676940d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcusparse-12.8.1.7-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libdovi-3.3.2-ha23c83e_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.127-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.8.1-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libflac-1.5.0-he200343_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.14.3-ha770c72_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.14.3-h73754d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_19.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-15.2.0-hcc6f6b0_119.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-devel-1.7.0-ha4b6fd6_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.88.1-h0d30a3d_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-devel-1.7.0-ha4b6fd6_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.13.0-default_he001693_1000.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libhwy-1.4.0-h10be129_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.4.1-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libjxl-0.11.2-h174a0a3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-8_h5e43f62_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmagma-2.10.0-hd93470c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.3.29-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.3.33-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-2026.2.1-h1f0fae8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-auto-batch-plugin-2026.2.1-h7e124b3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-auto-plugin-2026.2.1-h7e124b3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-hetero-plugin-2026.2.1-hd41364c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-cpu-plugin-2026.2.1-h1f0fae8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-gpu-plugin-2026.2.1-h1f0fae8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-npu-plugin-2026.2.1-h1f0fae8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-ir-frontend-2026.2.1-hd41364c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-onnx-frontend-2026.2.1-h607c73d_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-paddle-frontend-2026.2.1-h607c73d_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-pytorch-frontend-2026.2.1-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-tensorflow-frontend-2026.2.1-h21c0c73_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-tensorflow-lite-frontend-2026.2.1-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopus-1.6.1-h280c20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.19-hb03c661_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libplacebo-7.360.1-h9eeb4b2_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.58-h421ea60_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-7.35.1-h3a69515_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/librsvg-2.62.3-h4c96295_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-15.2.0-h90f66d4_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsndfile-1.2.2-hc7d488a_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.2-hf4e2dac_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_19.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-15.2.0-hd446a21_119.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.2.0-hdf11a46_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-h084b8d7_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.1-h9d88235_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.12.1-cuda130_mkl_h5535f43_300.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-h084b8d7_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libunwind-1.8.3-h65a8314_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liburing-2.14-hb700be7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libusb-1.0.29-h73b1eb8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42.2-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.52.1-h280c20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libva-2.23.0-he1eb515_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libvorbis-1.3.7-h54a6638_2.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libvpl-2.16.0-h54a6638_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libvpx-1.15.2-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libvulkan-loader-1.4.341.0-h5279c79_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.6.0-hd42ef1d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.13.2-hca5e8e5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-16-2.15.3-hca6bf5a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.15.3-h49c6c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-22.1.8-h4922eb0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/mkl-2026.0.0-hecca717_915.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/mpc-1.4.0-he0a73b1_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.2-he0a73b1_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.9-hc50e24c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/nccl-2.30.7.1-h1aa9b5a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.5.0-py314h2b28147_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.4-hb03c661_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/onednn-3.12-omp_h83de36e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/onemkl-license-2026.0.0-ha770c72_915.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/opencl-headers-2025.06.13-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openh264-2.6.0-hc22cd8d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.4-h55fea9a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.3-h35e630c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/optree-0.19.1-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pango-1.56.4-hda50119_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.47-haa7fec5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-12.2.0-py314h8ec4b1a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.4-h54a6638_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pugixml-1.15-h3f63f65_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pulseaudio-client-17.0-h9a6aba3_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-3.0.3-pyhfe8187e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-abi-11-hc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-global-3.0.3-pyh648e204_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pyglet-2.1.14-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.6-habeac84_100_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.12.1-cuda130_mkl_py314_h5d99997_300.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-gpu-2.12.1-cuda129_mkl_h0d04637_300.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-63.0-h192683f_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/sdl2-2.32.56-h54a6638_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/sdl3-3.4.10-hdeec2a5_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-81.0.0-pyh332efcf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/shaderc-2026.2-h718be3e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/sleef-3.9.0-ha0421bc_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.2-h03e3b7b_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/spirv-tools-2026.2-hb700be7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/svt-av1-4.0.1-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sympy-1.14.0-pyh2585a3b_106.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tbb-2023.0.0-hab88423_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.1-pyhcf101f3_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/triton-3.7.1-cuda130py314h1cdc6f0_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/wayland-1.25.0-hd6090a7_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/wayland-protocols-1.49-hd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/x264-1!164.3095-h166bdaf_2.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/x265-3.5-h924138e_3.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.48-h280c20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.13-he1eb515_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.7-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.2-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.5-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxscrnsaver-1.2.4-hb9d3cd8_0.conda + 
- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-xorgproto-2025.1-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-ng-2.3.3-hceb46e0_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - conda: . + build: py314hd3a1e81_0 + - conda: ../cuda_bindings + build: py314hd3a1e81_0 + - conda: ../cuda_pathfinder + - pypi: https://files.pythonhosted.org/packages/f7/e1/e02fafc01c18f1868a2d2c030953f49e38d65f2d95884789a6c46ff308f1/nvtx-0.2.15-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + linux-aarch64: + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-7_kmp_llvm.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/alsa-lib-1.2.16.1-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/aom-3.14.1-pl5321h8fffa31_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/arm-variant-1.2.0-sbsa.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/backports.strenum-1.3.1-haf276df_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/binutils_impl_linux-aarch64-2.45.1-default_h5f4c503_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h4777abc_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.6.17-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cairo-1.18.4-h0b6afd8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cffi-2.0.0-py314h0bd77cf_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.6-py314hd8ed1ab_100.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-aarch64-13.3.3.3.1-h579c4fd_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-crt-tools-13.3.33-h579c4fd_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-13.3.29-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-aarch64-13.3.29-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.3.29-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.3.29-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cuobjdump-13.3.29-h2079400_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cupti-13.3.35-he38c790_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvcc-tools-13.3.33-h614329b_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvdisasm-13.3.29-h40ab4d6_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.3.33-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvtx-13.3.29-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.3.33-he9431aa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.3.33-h579c4fd_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.3.33-h7b14b0b_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-tools-13.3.33-h7b14b0b_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.3-hcbadf70_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cupy-14.0.1-py314h8e5308c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cupy-core-14.0.1-py314h1d6db3a_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/dav1d-1.2.1-h31becfc_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/dbus-1.16.2-h70963c4_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ffmpeg-8.1.2-gpl_h0327ddc_901.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fmt-12.1.0-h20c602a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fontconfig-2.18.1-hba86a56_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/freetype-2.14.3-h8af1aa0_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fribidi-1.0.16-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.6.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gcc-15.2.0-h24a549f_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gcc_impl_linux-aarch64-15.2.0-h3530432_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gdk-pixbuf-2.44.6-h90308e0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/glslang-16.3.0-h124e036_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/gmp-6.3.0-h0a1ffab_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gmpy2-2.3.0-py314h887ad84_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/graphite2-1.3.15-hfae3067_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gxx-15.2.0-ha384071_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gxx_impl_linux-aarch64-15.2.0-h03e2352_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/harfbuzz-14.2.1-h1134a53_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/icu-78.3-hcab7f73_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-aarch64-4.18.0-h05a177a_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lame-3.100-h4e544f5_1003.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lcms2-2.19.1-h9d5b58d_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.45.1-default_h1979696_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lerc-4.1.0-h52b7260_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libabseil-20260526.0-cxx17_h6983b43_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libass-0.17.4-hcfe818d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-8_haddc8a3_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlicommon-1.2.0-he30d5cf_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlidec-1.2.0-he30d5cf_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlienc-1.2.0-he30d5cf_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.78-hf9559e3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-8_hd72aa62_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcublas-13.5.1.27-he38c790_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcudnn-9.23.1.3-h0bf6004_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcudss-0.8.0.10-he387df4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufft-12.3.0.29-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.18.0.66-h4243460_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcurand-10.4.3.29-he38c790_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcusolver-12.2.2.18-he38c790_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcusparse-12.8.1.7-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdovi-3.3.2-hf71c8f5_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.127-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.8.1-hfae3067_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.5.2-h376a255_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libflac-1.5.0-he9c94f4_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype-2.14.3-h8af1aa0_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype6-2.14.3-hdae7a39_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_19.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-aarch64-15.2.0-h55c397f_119.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-15.2.0-he9431aa_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran-15.2.0-he9431aa_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran5-15.2.0-h1b7bec0_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgl-1.7.0-hd24410f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgl-devel-1.7.0-hd24410f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglib-2.88.1-h96a7f82_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglvnd-1.7.0-hd24410f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglx-1.7.0-hd24410f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglx-devel-1.7.0-hd24410f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libhwloc-2.13.0-default_ha95e27d_1000.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libhwy-1.4.0-h0626a34_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libiconv-1.18-h90929bb_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjpeg-turbo-3.1.4.1-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjxl-0.11.2-hbae46ee_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.11.0-8_h88aeb00_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.3-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmagma-2.10.0-he3ecef4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmpdec-4.0.0-he30d5cf_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libnl-3.11.0-h86ecc28_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libnvfatbin-13.3.29-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libnvjitlink-13.3.33-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libogg-1.3.5-h86ecc28_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.33-openmp_h1a8b088_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-2026.2.1-h9a8427e_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-arm-cpu-plugin-2026.2.1-h9a8427e_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-auto-batch-plugin-2026.2.1-he6b9e7b_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-auto-plugin-2026.2.1-he6b9e7b_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-hetero-plugin-2026.2.1-he07c6df_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-ir-frontend-2026.2.1-he07c6df_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-onnx-frontend-2026.2.1-h6fc7987_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-paddle-frontend-2026.2.1-h6fc7987_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-pytorch-frontend-2026.2.1-hfae3067_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-tensorflow-frontend-2026.2.1-h9dfe790_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-tensorflow-lite-frontend-2026.2.1-hfae3067_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopus-1.6.1-h80f16a2_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpciaccess-0.19-he30d5cf_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libplacebo-7.360.1-h07e46df_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.58-h1abf092_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libprotobuf-7.35.1-h38371b1_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/librsvg-2.62.3-hf685517_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsanitizer-15.2.0-he19c465_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsndfile-1.2.2-h30591a0_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.53.2-h10b116e_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-15.2.0-hef695bb_19.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-aarch64-15.2.0-ha7b1723_119.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-ng-15.2.0-hdbbeba8_19.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsystemd0-257.13-hfcc8634_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtiff-4.7.1-hdb009f0_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtorch-2.12.1-cuda130_generic_h4328193_200.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libudev1-257.13-hfcc8634_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libunwind-1.8.3-h6470e1d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liburing-2.14-hfefdfc9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libusb-1.0.29-h06eaf92_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.42.2-h1022ec0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuv-1.52.1-h80f16a2_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libvorbis-1.3.7-h7ac5ae9_2.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libvpx-1.15.2-hfae3067_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libvulkan-loader-1.4.341.0-h8b8848b_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libwebp-base-1.6.0-ha2e29f5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxcb-1.17.0-h262b8f6_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxkbcommon-1.13.2-h3c6a4c8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxml2-16-2.15.3-h79dcc73_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxml2-2.15.3-h869d058_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.2-hdc9db2a_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/llvm-openmp-22.1.8-he40846f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/markupsafe-3.0.3-py314hb76de3f_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mpc-1.4.0-he6dc3fb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mpfr-4.2.2-h3faef18_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mpg123-1.32.9-h65af167_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nccl-2.30.7.1-h2b99535_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.6-hf8d1292_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nomkl-1.0-h5ca1d4c_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/numpy-2.5.0-py314he1698a1_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/onednn-3.12-omp_h605b386_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/openh264-2.6.0-h0564a2a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openjpeg-2.5.4-h5da879a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.3-h546c87b_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/optree-0.19.1-py314hd7d8586_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pango-1.56.4-h8547ced_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pcre2-10.47-hf841c20_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-12.2.0-py314hac3e5ec_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pixman-0.46.4-h7ac5ae9_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pthread-stubs-0.4-h86ecc28_1002.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pugixml-1.15-h6ef32b0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pulseaudio-client-17.0-hcf98165_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-3.0.3-pyhfe8187e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-abi-11-hc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-global-3.0.3-pyh648e204_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pyglet-2.1.14-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.14.6-hc679e19_100_cp314.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pytorch-2.12.1-cuda130_generic_py314_h1da07bd_200.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pytorch-gpu-2.12.1-cuda129_generic_hda344be_200.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rdma-core-63.0-h1f0f388_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.3-hb682ff5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sdl2-2.32.56-h7ac5ae9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sdl3-3.4.10-had2c13b_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-81.0.0-pyh332efcf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/shaderc-2026.2-hfeb5c2c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sleef-3.9.0-h5bb93e2_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/snappy-1.2.2-he774c54_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/spirv-tools-2026.2-hfefdfc9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/svt-av1-4.0.1-hfae3067_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sympy-1.14.0-pyh2585a3b_106.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-aarch64-2.28-h585391f_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tbb-2023.0.0-h57272ed_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h0dc03b3_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/triton-3.7.1-cuda130py314ha788bc0_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/wayland-1.25.0-h4f8a99f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/x264-1!164.3095-h4e544f5_2.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/x265-3.5-hdd96247_3.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xkeyboard-config-2.48-h80f16a2_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libice-1.1.2-h86ecc28_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libsm-1.2.6-h0808dbd_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libx11-1.8.13-h63a1b12_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxau-1.0.12-he30d5cf_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxcursor-1.2.3-h86ecc28_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxdmcp-1.1.5-he30d5cf_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxext-1.3.7-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxfixes-6.0.2-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxi-1.8.3-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxrandr-1.5.5-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxrender-0.9.12-h86ecc28_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxscrnsaver-1.2.4-h86ecc28_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxtst-1.2.5-h57736b2_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-xorgproto-2025.1-he30d5cf_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/zlib-ng-2.3.3-ha7cb516_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-h85ac4a6_6.conda + - conda: . + build: py314h3ff45e1_0 + - conda: ../cuda_bindings + build: py314h3ff45e1_0 + - conda: ../cuda_pathfinder + - pypi: https://files.pythonhosted.org/packages/e0/5b/ca0ba6fa769d08174b7a5b4775c279e2e26611cdd5e7833aa699187871c7/nvtx-0.2.15-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl + win-64: + - conda: https://conda.anaconda.org/conda-forge/win-64/aom-3.14.1-pl5321h06fc181_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/backports.strenum-1.3.1-haf276df_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h0ad9c76_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.6.17-h4c7d964_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cffi-2.0.0-py314h5a2d7ad_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.3.33-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.3.33-h719f0c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.3.33-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.3.33-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.3.33-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.3-hcbadf70_3.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/dav1d-1.2.1-hcfcfb64_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/win-64/ffmpeg-8.1.2-gpl_h6d5d71d_901.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.3-h57928b3_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/fribidi-1.0.16-hfd05255_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/gdk-pixbuf-2.44.6-h1f5b9c4_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/glslang-16.3.0-h294ba9c_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/graphite2-1.3.15-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/harfbuzz-14.2.1-h5a1b470_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/icu-78.3-h637d24d_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/lame-3.100-hcfcfb64_1003.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/win-64/lerc-4.1.0-hd936e49_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libblas-3.11.0-8_h8455456_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlicommon-1.2.0-hfd05255_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlidec-1.2.0-hfd05255_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/win-64/libbrotlienc-1.2.0-hfd05255_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libcblas-3.11.0-8_h2a3cdd5_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libdeflate-1.25-h51727cc_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.8.1-hac47afa_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.5.2-h3d046cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libfreetype-2.14.3-h57928b3_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libfreetype6-2.14.3-hdbac1cb_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libglib-2.88.1-h7ce1215_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libhwloc-2.13.0-default_h049141e_1000.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libhwy-1.4.0-h172a326_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libiconv-1.18-hc1393d2_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libintl-0.22.5-h5728263_3.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libjpeg-turbo-3.1.4.1-hfd05255_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libjxl-0.11.2-h932607e_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/liblapack-3.11.0-8_hf9ab0e9_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.3-hfd05255_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libmpdec-4.0.0-hfd05255_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-13.3.29-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-13.3.33-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libogg-1.3.5-h2466b09_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libopus-1.6.1-h6a83c73_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.58-h7351971_0.conda + - 
conda: https://conda.anaconda.org/conda-forge/win-64/librsvg-2.62.3-h15cfe45_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.53.2-hf5d6505_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libtiff-4.7.1-h8f73337_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libusb-1.0.29-h1839187_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libvorbis-1.3.7-h5112557_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libvulkan-loader-1.4.341.0-h477610d_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libwebp-base-1.6.0-h4d5522a_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libwinpthread-12.0.0.r4.gg4f2fc60ca-h57928b3_10.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libxml2-16-2.15.3-h3cfd58e_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libxml2-2.15.3-h8ef44ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.2-hfd05255_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/llvm-openmp-22.1.8-h4fa8253_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/mkl-2026.0.0-hac47afa_908.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/numpy-2.5.0-py314h02f10f6_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/onemkl-license-2026.0.0-h57928b3_908.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/openh264-2.6.0-hb17fa0b_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.6.3-hf411b9b_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/pango-1.56.4-h13911b6_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/pcre2-10.47-hd2b5f0e_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/pixman-0.46.4-h5112557_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pyglet-2.1.14-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.14.6-h4b44e0e_100_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/sdl2-2.32.56-h5112557_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/sdl3-3.4.10-h5112557_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/shaderc-2026.2-h8fa7867_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/spirv-tools-2026.2-h49e36cd_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/svt-av1-4.0.1-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/tbb-2023.0.0-hd3d4ead_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h6ed50ae_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.26100.0-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.5-h1b7c187_39.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.51.36231-h1b9f54f_39.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/vcomp14-14.51.36231-h1b9f54f_39.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.51.36231-h84cd919_39.conda + - conda: 
https://conda.anaconda.org/conda-forge/win-64/x264-1!164.3095-h8ffe710_2.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/win-64/x265-3.5-h2d74725_3.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/win-64/zlib-1.3.2-hfd05255_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-h534d264_6.conda + - conda: . + build: py314hd7f1909_0 + - conda: ../cuda_bindings + build: py314hd7f1909_0 + - conda: ../cuda_pathfinder + - pypi: https://files.pythonhosted.org/packages/20/77/a2b64335bab7c75fe1c054cc4ebe2d3b3234cbdb04d2e1d6ca73551c54f5/nvtx-0.2.15-cp314-cp314-win_amd64.whl packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda build_number: 20 @@ -2827,6 +3433,7 @@ packages: - llvm-openmp >=9.0.1 license: BSD-3-Clause license_family: BSD + purls: [] size: 8244 timestamp: 1764092331208 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda @@ -2850,6 +3457,7 @@ packages: - llvm-openmp >=9.0.1 license: BSD-3-Clause license_family: BSD + purls: [] size: 8293 timestamp: 1764092286102 - conda: https://conda.anaconda.org/conda-forge/win-64/_openmp_mutex-4.5-20_gnu.conda @@ -2910,6 +3518,17 @@ packages: license_family: GPL size: 584660 timestamp: 1768327524772 +- conda: https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.16.1-hb03c661_0.conda + sha256: cf93ca0f1f107e95a35969a4622684e08fcb8cf37f8cf4a1e9e424828386c921 + md5: 8904e09bda369377b3dd07e2ac828c5d + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: LGPL-2.1-or-later + license_family: LGPL + purls: [] + size: 592377 + timestamp: 1781521980743 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/alsa-lib-1.2.15.3-he30d5cf_0.conda sha256: ea2233e2db9908c2e5f29d3ca420a546b4583253f4f70abb5494cdd676866d42 md5: 4a98cbc4ade694520227402ff8880630 @@ -2919,6 +3538,28 @@ packages: license_family: GPL size: 615729 timestamp: 1768327548407 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/alsa-lib-1.2.16.1-he30d5cf_0.conda + sha256: 105e4c19cfa770affcb9a64b9d2451f406914cd09a67664009910869fa01a639 + md5: 5427b5dcb268bddf1a69c16d1cb77a47 + depends: + - libgcc >=14 + license: LGPL-2.1-or-later + license_family: LGPL + purls: [] + size: 621865 + timestamp: 1781522013595 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aom-3.14.1-pl5321h039972f_1.conda + sha256: b1d972a9b949a88babee681437535550b3ca5dbca6a23a40dffeb7900fec19fd + md5: 5a78a69eb3b50f24b379e9d2a93163ae + depends: + - __glibc >=2.17,<3.0.a0 + - libstdcxx >=14 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 3103347 + timestamp: 1780752473089 - conda: https://conda.anaconda.org/conda-forge/linux-64/aom-3.9.1-hac33072_0.conda sha256: b08ef033817b5f9f76ce62dfcac7694e7b6b4006420372de22494503decac855 md5: 346722a0be40f6edc53f12640d301338 @@ -2929,6 +3570,17 @@ packages: license_family: BSD size: 2706396 timestamp: 1718551242397 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/aom-3.14.1-pl5321h8fffa31_1.conda + sha256: a228f46f68fa3e2e50a09b5a4cefd1ee2c1ce868bfa2a288867b3d44b6e77427 + md5: a3c86229b531656c2bce99e8a6c6de4a + depends: + - libstdcxx >=14 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 4091040 + timestamp: 1780752489693 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/aom-3.9.1-hcccb83c_0.conda sha256: ac438ce5d3d3673a9188b535fc7cda413b479f0d52536aeeac1bd82faa656ea0 md5: cc744ac4efe5bcaa8cca51ff5b850df0 @@ -2939,6 +3591,18 @@ packages: license_family: BSD size: 3250813 timestamp: 1718551360260 +- conda: https://conda.anaconda.org/conda-forge/win-64/aom-3.14.1-pl5321h06fc181_1.conda + sha256: 3033fa8953f7f0c1bb5b89b5af77253badc14a89ba94d743dde3c9159e10fd5e + md5: 7a8ace8100a48355a34d87386012c57b + depends: + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + - ucrt >=10.0.20348.0 + license: BSD-2-Clause + license_family: BSD + purls: 
[] + size: 2214571 + timestamp: 1780752497150 - conda: https://conda.anaconda.org/conda-forge/win-64/aom-3.9.1-he0c23c2_0.conda sha256: 0524d0c0b61dacd0c22ac7a8067f977b1d52380210933b04141f5099c5b6fec7 md5: 3d7c14285d3eb3239a76ff79063f27a5 @@ -3103,6 +3767,18 @@ packages: license_family: GPL size: 3744895 timestamp: 1770267152681 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.45.1-default_hfdba357_102.conda + sha256: 0a7d405064f53b9d91d92515f1460f7906ee5e8523f3cd8973430e81219f4917 + md5: 8165352fdce2d2025bf884dc0ee85700 + depends: + - ld_impl_linux-64 2.45.1 default_hbd61a6d_102 + - sysroot_linux-64 + - zstd >=1.5.7,<1.6.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 3661455 + timestamp: 1774197460085 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/binutils_impl_linux-aarch64-2.45.1-default_h5f4c503_101.conda sha256: e90ab42a5225dc1eaa6e4e7201cd7b8ed52dad6ec46814be7e5a4039433ae85c md5: df6e1dc38cbe5642350fa09d4a1d546b @@ -3114,6 +3790,18 @@ packages: license_family: GPL size: 4741684 timestamp: 1770267224406 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/binutils_impl_linux-aarch64-2.45.1-default_h5f4c503_102.conda + sha256: 7fd4ddde2f0150d015dfa9f2db5f428bd1570078f270e4bd4f116487a52de169 + md5: 56a04d796d7e3cdc9f8d2e1278e91bff + depends: + - ld_impl_linux-aarch64 2.45.1 default_h1979696_102 + - sysroot_linux-aarch64 + - zstd >=1.5.7,<1.6.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 4683754 + timestamp: 1774197535605 - conda: https://conda.anaconda.org/conda-forge/win-64/binutils_impl_win-64-2.45.1-default_ha84baeb_101.conda sha256: 31211bd89e77203f731f31871ff13b5828fbd99f02ae2fc56ae15fcd568c4466 md5: 84d2e3fd656b05705b7cfe7a92a8c840 @@ -3227,6 +3915,24 @@ packages: purls: [] size: 147413 timestamp: 1772006283803 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.6.17-h4c7d964_0.conda + sha256: 
7f458e4a82514d7bebbfef23d92817794a16aaf1c748a15f04870d4fb49aeab2 + md5: b9696b2cf00dfeec138c70cee38ed192 + depends: + - __win + license: ISC + purls: [] + size: 129352 + timestamp: 1781709016515 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.6.17-hbd8a1cb_0.conda + sha256: f8e3c730fa14ee3f170493779f06522c4acf89169f43db4f039727709b6419cf + md5: a9965dd99f683c5f444428f896635716 + depends: + - __unix + license: ISC + purls: [] + size: 128866 + timestamp: 1781708962055 - conda: https://conda.anaconda.org/conda-forge/noarch/cachecontrol-0.14.3-pyha770c72_0.conda sha256: ec791bb6f1ef504411f87b28946a7ae63ed1f3681cefc462cf1dfdaf0790b6a9 md5: 241ef6e3db47a143ac34c21bfba510f1 @@ -3264,6 +3970,7 @@ packages: - xorg-libxext >=1.3.6,<2.0a0 - xorg-libxrender >=0.9.12,<0.10.0a0 license: LGPL-2.1-only or MPL-1.1 + purls: [] size: 989514 timestamp: 1766415934926 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cairo-1.18.4-h0b6afd8_1.conda @@ -3289,6 +3996,7 @@ packages: - xorg-libxext >=1.3.6,<2.0a0 - xorg-libxrender >=0.9.12,<0.10.0a0 license: LGPL-2.1-only or MPL-1.1 + purls: [] size: 927045 timestamp: 1766416003626 - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda @@ -3309,6 +4017,7 @@ packages: - vc >=14.3,<15 - vc14_runtime >=14.44.35208 license: LGPL-2.1-only or MPL-1.1 + purls: [] size: 1537783 timestamp: 1766416059188 - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda @@ -3333,6 +4042,8 @@ packages: - python_abi 3.14.* *_cp314 license: MIT license_family: MIT + purls: + - pkg:pypi/cffi?source=hash-mapping size: 300271 timestamp: 1761203085220 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cffi-2.0.0-py314h0bd77cf_1.conda @@ -3346,6 +4057,8 @@ packages: - python_abi 3.14.* *_cp314 license: MIT license_family: MIT + purls: + - pkg:pypi/cffi?source=hash-mapping size: 318357 timestamp: 1761203973223 - conda: 
https://conda.anaconda.org/conda-forge/win-64/cffi-2.0.0-py314h5a2d7ad_1.conda @@ -3360,6 +4073,8 @@ packages: - vc14_runtime >=14.44.35208 license: MIT license_family: MIT + purls: + - pkg:pypi/cffi?source=hash-mapping size: 294731 timestamp: 1761203441365 - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.7-pyhd8ed1ab_0.conda @@ -3442,6 +4157,16 @@ packages: license_family: GPL size: 31705 timestamp: 1771378159534 +- conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-15.2.0-h53410ce_19.conda + sha256: 1a53d0bd9d8197a7dc57f9b154e24d908ade29934e0a450ee6e40294d0a237f9 + md5: 3b482cadfc77f094c8b3016166292dfb + depends: + - gcc_impl_linux-64 >=15.2.0,<15.2.1.0a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 31857 + timestamp: 1778269225076 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/conda-gcc-specs-14.3.0-hadff5d6_18.conda sha256: 7b018e74d2f828e887faabc9d5c5bef6d432c3356dcac3e691ee6b24bc82ef52 md5: 184c1aba41c40e6bc59fa91b37cd7c3f @@ -3462,15 +4187,26 @@ packages: timestamp: 1771382417485 - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda noarch: generic - sha256: 91b06300879df746214f7363d6c27c2489c80732e46a369eb2afc234bcafb44c - md5: 3bb89e4f795e5414addaa531d6b1500a + sha256: 91b06300879df746214f7363d6c27c2489c80732e46a369eb2afc234bcafb44c + md5: 3bb89e4f795e5414addaa531d6b1500a + depends: + - python >=3.14,<3.15.0a0 + - python_abi * *_cp314 + license: Python-2.0 + purls: [] + size: 50078 + timestamp: 1770674447292 +- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.6-py314hd8ed1ab_100.conda + noarch: generic + sha256: 7a548856ef5307890a8cadfc196655117658f8c24589ce175caa4c1c2ded9d13 + md5: b28fe35fd43d5f425c0dccbe5b5039fd depends: - python >=3.14,<3.15.0a0 - python_abi * *_cp314 license: Python-2.0 purls: [] - size: 50078 - timestamp: 1770674447292 + size: 49333 + timestamp: 1781254618863 - conda: 
https://conda.anaconda.org/conda-forge/noarch/cssutils-2.11.1-pyhd8ed1ab_0.conda sha256: b9006cbd28ed63a6461717cb9234e1d1f39441d9db0493f55ee0ca72f3577833 md5: 99cf98eea444365238fb6ee8f518ef19 @@ -3643,6 +4379,7 @@ packages: depends: - cuda-version >=13.3,<13.4.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 1472271 timestamp: 1779895496841 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-aarch64-12.9.27-h579c4fd_0.conda @@ -3661,6 +4398,7 @@ packages: - arm-variant * sbsa - cuda-version >=13.3,<13.4.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 1481900 timestamp: 1779895522474 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-12.9.27-h57928b3_0.conda @@ -3884,6 +4622,7 @@ packages: depends: - cuda-version >=13.3,<13.4.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 30512 timestamp: 1779905082733 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-crt-tools-12.9.86-h579c4fd_2.conda @@ -3902,6 +4641,7 @@ packages: - arm-variant * sbsa - cuda-version >=13.3,<13.4.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 30727 timestamp: 1779905123621 - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-crt-tools-12.9.86-h57928b3_2.conda @@ -4064,6 +4804,7 @@ packages: - cuda-cudart_linux-64 - cuda-version >=13.3,<13.4.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 405020 timestamp: 1779898430134 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-aarch64-12.9.79-h3ae8b8a_0.conda @@ -4088,6 +4829,7 @@ packages: - cuda-cudart_linux-aarch64 - cuda-version >=13.3,<13.4.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 403650 timestamp: 1779898443931 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-12.9.79-he0c23c2_0.conda @@ -4186,6 +4928,7 @@ packages: depends: - cuda-version >=13.3,<13.4.0a0 
license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 1126340 timestamp: 1779898412056 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-12.9.79-h3ae8b8a_0.conda @@ -4204,6 +4947,7 @@ packages: - arm-variant * sbsa - cuda-version >=13.3,<13.4.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 1133087 timestamp: 1779898428591 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-12.9.79-he0c23c2_0.conda @@ -4284,6 +5028,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 312247 timestamp: 1779911081668 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cuobjdump-13.3.29-h2079400_0.conda @@ -4295,6 +5040,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 318659 timestamp: 1779911096155 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cupti-13.3.35-h676940d_0.conda @@ -4306,6 +5052,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 1600030 timestamp: 1779895779561 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cupti-13.3.35-he38c790_0.conda @@ -4320,6 +5067,7 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 1448548 timestamp: 1779895823294 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvcc-dev_linux-64-12.9.86-he91c749_2.conda @@ -4439,6 +5187,7 @@ packages: constrains: - gcc_impl_linux-64 >=6,<16.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 34635831 timestamp: 1779905180976 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvcc-tools-12.9.86-h614329b_2.conda @@ -4469,6 +5218,7 @@ packages: constrains: - gcc_impl_linux-aarch64 >=6,<16.0a0 license: 
LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 30810865 timestamp: 1779905234807 - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvcc-tools-12.9.86-he0c23c2_2.conda @@ -4493,6 +5243,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 4704236 timestamp: 1779896392544 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvdisasm-13.3.29-h40ab4d6_0.conda @@ -4506,6 +5257,7 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 4673704 timestamp: 1779896419537 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-12.9.86-hecca717_1.conda @@ -4668,6 +5420,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 33852 timestamp: 1779896656406 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvtx-13.3.29-h8f3c8d4_0.conda @@ -4681,6 +5434,7 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 35014 timestamp: 1779896683393 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.3.33-h69a702a_0.conda @@ -4999,6 +5753,26 @@ packages: license_family: MIT size: 385283 timestamp: 1771604567478 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cupy-14.0.1-py314h31ce861_1.conda + sha256: 645c77e7d3f3986d48fa5bb08d14a72d0f14e11ea276f28242052bb9b073d83b + md5: e8f7956463e9340710e9895f0d0418f3 + depends: + - cuda-cudart-dev_linux-64 + - cuda-nvrtc + - cuda-version >=13,<14.0a0 + - cupy-core 14.0.1 py314hed3c566_1 + - libcublas + - libcufft + - libcurand + - libcusolver + - libcusparse + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + purls: [] + size: 384966 + timestamp: 1779504073573 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cupy-14.0.1-py314h8e5308c_0.conda sha256: 
10608ecb57bf7c2295a8ce5ed538305553290ec53fa960a91794559e42a204ba md5: f4200ad5b954b43fac5ae43415e82317 @@ -5018,6 +5792,26 @@ packages: license_family: MIT size: 385353 timestamp: 1771605185463 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cupy-14.0.1-py314h8e5308c_1.conda + sha256: e10e1284b4f7ba61bac287e0621ffe142914d78008ed96a480cdb727c11326e2 + md5: 6b68146846d55143a9304ad3ca0333ea + depends: + - cuda-cudart-dev_linux-aarch64 + - cuda-nvrtc + - cuda-version >=13,<14.0a0 + - cupy-core 14.0.1 py314h1d6db3a_1 + - libcublas + - libcufft + - libcurand + - libcusolver + - libcusparse + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + purls: [] + size: 384462 + timestamp: 1779504126757 - conda: https://conda.anaconda.org/conda-forge/linux-64/cupy-core-14.0.1-py314hed3c566_0.conda sha256: f048dbdee55577fad61221b87b0be44fc64de532138cf80b0e65fe0955d13b27 md5: 494ca91005c44e23bd74d6fd086228d2 @@ -5048,6 +5842,38 @@ packages: license_family: MIT size: 33970282 timestamp: 1771604499034 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cupy-core-14.0.1-py314hed3c566_1.conda + sha256: eb9af273306d14002d887f41a1966a741b53baf859de853e14cd6cca285d92d5 + md5: 7586c72642ed8c79b84aa66219a4cfca + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-pathfinder >=1.3.3,<2.0a0 + - libgcc >=14 + - libstdcxx >=14 + - numpy >=1.23,<3 + - numpy >=2.0 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + constrains: + - optuna ~=3.0 + - libcublas >=13,<14.0a0 + - nccl >=2.30.4.1,<3.0a0 + - libcusparse >=12,<13.0a0 + - cutensor >=2.6.0.4,<3.0a0 + - cupy >=14.0.1,<14.1.0a0 + - scipy >=1.10,<1.17 + - libcurand >=10,<11.0a0 + - __cuda >=13.0 + - libcufft >=12,<13.0a0 + - libcusolver >=12,<13.0a0 + - cuda-version >=13,<14.0a0 + - cuda-nvrtc >=13,<14.0a0 + license: MIT + license_family: MIT + purls: + - pkg:pypi/cupy?source=hash-mapping + size: 33860499 + timestamp: 1779504045871 - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/cupy-core-14.0.1-py314h1d6db3a_0.conda sha256: 125c488f2ac15a216575449baf03a9e16644c3e7e6773fd9e53ba68893a20396 md5: 0bd337aec5a1d18ecf0bf16a2d0d3ce8 @@ -5079,6 +5905,39 @@ packages: license_family: MIT size: 39128286 timestamp: 1771605119782 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cupy-core-14.0.1-py314h1d6db3a_1.conda + sha256: 50eeca5279bd44c7977e0babf0eaf4a78de8e448e34804d26f45277e93cda354 + md5: 9d5af704cc69aeb6e92547b0b68ccda4 + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-pathfinder >=1.3.3,<2.0a0 + - libgcc >=14 + - libstdcxx >=14 + - numpy >=1.23,<3 + - numpy >=2.0 + - python >=3.14,<3.15.0a0 + - python >=3.14,<3.15.0a0 *_cp314 + - python_abi 3.14.* *_cp314 + constrains: + - libcusolver >=12,<13.0a0 + - libcusparse >=12,<13.0a0 + - __cuda >=13.0 + - nccl >=2.30.4.1,<3.0a0 + - cupy >=14.0.1,<14.1.0a0 + - cutensor >=2.6.0.4,<3.0a0 + - scipy >=1.10,<1.17 + - cuda-nvrtc >=13,<14.0a0 + - cuda-version >=13,<14.0a0 + - optuna ~=3.0 + - libcurand >=10,<11.0a0 + - libcublas >=13,<14.0a0 + - libcufft >=12,<13.0a0 + license: MIT + license_family: MIT + purls: + - pkg:pypi/cupy?source=hash-mapping + size: 39191318 + timestamp: 1779504103703 - conda: https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.4-py314h1807b08_0.conda sha256: f700d10c2a794710a1656a6fdb8908fb04f3c7812ac4f17187777646ede1a3d9 md5: 866fd3d25b767bccb4adc8476f4035cd @@ -5131,6 +5990,7 @@ packages: - libgcc-ng >=12 license: BSD-2-Clause license_family: BSD + purls: [] size: 760229 timestamp: 1685695754230 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/dav1d-1.2.1-h31becfc_0.conda @@ -5140,6 +6000,7 @@ packages: - libgcc-ng >=12 license: BSD-2-Clause license_family: BSD + purls: [] size: 347363 timestamp: 1685696690003 - conda: https://conda.anaconda.org/conda-forge/win-64/dav1d-1.2.1-hcfcfb64_0.conda @@ -5151,6 +6012,7 @@ packages: - vc14_runtime >=14.29.30139 license: BSD-2-Clause license_family: BSD + 
purls: [] size: 618643 timestamp: 1685696352968 - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda @@ -5164,6 +6026,7 @@ packages: - libglib >=2.86.2,<3.0a0 - libexpat >=2.7.3,<3.0a0 license: AFL-2.1 OR GPL-2.0-or-later + purls: [] size: 447649 timestamp: 1764536047944 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/dbus-1.16.2-h70963c4_1.conda @@ -5176,6 +6039,7 @@ packages: - libzlib >=1.3.1,<2.0a0 - libexpat >=2.7.3,<3.0a0 license: AFL-2.1 OR GPL-2.0-or-later + purls: [] size: 480416 timestamp: 1764536098891 - conda: https://conda.anaconda.org/conda-forge/linux-64/debugpy-1.8.20-py314h42812f9_0.conda @@ -5373,6 +6237,71 @@ packages: license_family: GPL size: 12485347 timestamp: 1773008832077 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ffmpeg-8.1.2-gpl_h1bf8424_901.conda + sha256: 2d56b0d90a1e581e296a41aa0a0443c85f918a94780e779a23005be9128627be + md5: 0c457f1b2384bb0aa984831a79021a66 + depends: + - __glibc >=2.17,<3.0.a0 + - alsa-lib >=1.2.16.1,<1.3.0a0 + - aom >=3.14.1,<3.15.0a0 + - bzip2 >=1.0.8,<2.0a0 + - dav1d >=1.2.1,<1.2.2.0a0 + - fontconfig >=2.18.1,<3.0a0 + - fonts-conda-ecosystem + - gmp >=6.3.0,<7.0a0 + - harfbuzz >=14.2.1 + - lame >=3.100,<3.101.0a0 + - libass >=0.17.4,<0.17.5.0a0 + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - libjxl >=0.11,<1.0a0 + - liblzma >=5.8.3,<6.0a0 + - libopenvino >=2026.2.1,<2026.2.2.0a0 + - libopenvino-auto-batch-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-auto-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-hetero-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-intel-cpu-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-intel-gpu-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-intel-npu-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-ir-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-onnx-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-paddle-frontend >=2026.2.1,<2026.2.2.0a0 + 
- libopenvino-pytorch-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-tensorflow-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-tensorflow-lite-frontend >=2026.2.1,<2026.2.2.0a0 + - libopus >=1.6.1,<2.0a0 + - libplacebo >=7.360.1,<7.361.0a0 + - librsvg >=2.62.3,<3.0a0 + - libstdcxx >=14 + - libva >=2.23.0,<3.0a0 + - libvorbis >=1.3.7,<1.4.0a0 + - libvpl >=2.16.0,<2.17.0a0 + - libvpx >=1.15.2,<1.16.0a0 + - libvulkan-loader >=1.4.341.0,<2.0a0 + - libwebp-base >=1.6.0,<2.0a0 + - libxcb >=1.17.0,<2.0a0 + - libxml2 + - libxml2-16 >=2.14.6 + - libzlib >=1.3.2,<2.0a0 + - openh264 >=2.6.0,<2.6.1.0a0 + - openssl >=3.5.7,<4.0a0 + - pulseaudio-client >=17.0,<17.1.0a0 + - sdl2 >=2.32.56,<3.0a0 + - shaderc >=2026.2,<2026.3.0a0 + - svt-av1 >=4.0.1,<4.0.2.0a0 + - x264 >=1!164.3095,<1!165 + - x265 >=3.5,<3.6.0a0 + - xorg-libx11 >=1.8.13,<2.0a0 + constrains: + - __cuda >=12.8 + license: GPL-2.0-or-later + license_family: GPL + purls: [] + size: 13078770 + timestamp: 1782260267617 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ffmpeg-8.0.1-gpl_h62efc85_914.conda sha256: a2816bcef9d7b072597192fcb15b851eaee1ef358c0a3890ab255070d41b64cb md5: e9f109db13b0fad0c1f2f92d9770c8c3 @@ -5431,6 +6360,66 @@ packages: license_family: GPL size: 12035194 timestamp: 1773008913159 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ffmpeg-8.1.2-gpl_h0327ddc_901.conda + sha256: 12f1d33ba6975ae1f2f799111665bd3e527d5aefd29adec27bf731f4e59a9eeb + md5: 00e3601ba812380631f089f650625c61 + depends: + - alsa-lib >=1.2.16.1,<1.3.0a0 + - aom >=3.14.1,<3.15.0a0 + - bzip2 >=1.0.8,<2.0a0 + - dav1d >=1.2.1,<1.2.2.0a0 + - fontconfig >=2.18.1,<3.0a0 + - fonts-conda-ecosystem + - gmp >=6.3.0,<7.0a0 + - harfbuzz >=14.2.1 + - lame >=3.100,<3.101.0a0 + - libass >=0.17.4,<0.17.5.0a0 + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - libjxl >=0.11,<1.0a0 + - liblzma >=5.8.3,<6.0a0 + - libopenvino 
>=2026.2.1,<2026.2.2.0a0 + - libopenvino-arm-cpu-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-auto-batch-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-auto-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-hetero-plugin >=2026.2.1,<2026.2.2.0a0 + - libopenvino-ir-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-onnx-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-paddle-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-pytorch-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-tensorflow-frontend >=2026.2.1,<2026.2.2.0a0 + - libopenvino-tensorflow-lite-frontend >=2026.2.1,<2026.2.2.0a0 + - libopus >=1.6.1,<2.0a0 + - libplacebo >=7.360.1,<7.361.0a0 + - librsvg >=2.62.3,<3.0a0 + - libstdcxx >=14 + - libvorbis >=1.3.7,<1.4.0a0 + - libvpx >=1.15.2,<1.16.0a0 + - libvulkan-loader >=1.4.341.0,<2.0a0 + - libwebp-base >=1.6.0,<2.0a0 + - libxcb >=1.17.0,<2.0a0 + - libxml2 + - libxml2-16 >=2.14.6 + - libzlib >=1.3.2,<2.0a0 + - openh264 >=2.6.0,<2.6.1.0a0 + - openssl >=3.5.7,<4.0a0 + - pulseaudio-client >=17.0,<17.1.0a0 + - sdl2 >=2.32.56,<3.0a0 + - shaderc >=2026.2,<2026.3.0a0 + - svt-av1 >=4.0.1,<4.0.2.0a0 + - x264 >=1!164.3095,<1!165 + - x265 >=3.5,<3.6.0a0 + - xorg-libx11 >=1.8.13,<2.0a0 + constrains: + - __cuda >=12.8 + license: GPL-2.0-or-later + license_family: GPL + purls: [] + size: 12711897 + timestamp: 1782260316539 - conda: https://conda.anaconda.org/conda-forge/win-64/ffmpeg-8.0.1-gpl_hb2d76f6_914.conda sha256: fbe7916ed95bdc9650c9906865ab21cc04fb337548fdffec94f64a547ba3644d md5: 7cffff39ee349bddb81e1de24c780f34 @@ -5472,6 +6461,48 @@ packages: license_family: GPL size: 10417843 timestamp: 1773010275486 +- conda: https://conda.anaconda.org/conda-forge/win-64/ffmpeg-8.1.2-gpl_h6d5d71d_901.conda + sha256: 0ceea53997c09df6cda1911b53661af9e3b051989bc3780d1657a76028132057 + md5: 407eb5885e6399f8495f6796ebd71134 + depends: + - aom >=3.14.1,<3.15.0a0 + - bzip2 >=1.0.8,<2.0a0 + - dav1d >=1.2.1,<1.2.2.0a0 + - fontconfig >=2.18.1,<3.0a0 + - fonts-conda-ecosystem + - 
harfbuzz >=14.2.1 + - lame >=3.100,<3.101.0a0 + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - libiconv >=1.18,<2.0a0 + - libjxl >=0.11,<1.0a0 + - liblzma >=5.8.3,<6.0a0 + - libopus >=1.6.1,<2.0a0 + - librsvg >=2.62.3,<3.0a0 + - libvorbis >=1.3.7,<1.4.0a0 + - libvulkan-loader >=1.4.341.0,<2.0a0 + - libwebp-base >=1.6.0,<2.0a0 + - libxml2 + - libxml2-16 >=2.14.6 + - libzlib >=1.3.2,<2.0a0 + - openh264 >=2.6.0,<2.6.1.0a0 + - openssl >=3.5.7,<4.0a0 + - sdl2 >=2.32.56,<3.0a0 + - shaderc >=2026.2,<2026.3.0a0 + - svt-av1 >=4.0.1,<4.0.2.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + - x264 >=1!164.3095,<1!165 + - x265 >=3.5,<3.6.0a0 + constrains: + - __cuda >=12.8 + license: GPL-2.0-or-later + license_family: GPL + purls: [] + size: 11020618 + timestamp: 1782262495007 - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.25.2-pyhd8ed1ab_0.conda sha256: dddea9ec53d5e179de82c24569d41198f98db93314f0adae6b15195085d5567f md5: f58064cec97b12a7136ebb8a6f8a129b @@ -5482,6 +6513,16 @@ packages: - pkg:pypi/filelock?source=compressed-mapping size: 25845 timestamp: 1773314012590 +- conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.4-pyhd8ed1ab_0.conda + sha256: feb5c13cc8f256212a979783a7645abd7e27925c51ee5431babbc0efc661cdfd + md5: 66f138d7a6dffb5c959cc4bf6dc2b797 + depends: + - python >=3.10 + license: Unlicense + purls: + - pkg:pypi/filelock?source=compressed-mapping + size: 36989 + timestamp: 1781381078337 - conda: https://conda.anaconda.org/conda-forge/linux-64/fmt-12.1.0-hff5e90c_0.conda sha256: d4e92ba7a7b4965341dc0fca57ec72d01d111b53c12d11396473115585a9ead6 md5: f7d7a4104082b39e3b3473fbd4a38229 @@ -5491,6 +6532,7 @@ packages: - libstdcxx >=14 license: MIT license_family: MIT + purls: [] size: 198107 timestamp: 1767681153946 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fmt-12.1.0-h20c602a_0.conda @@ -5501,6 +6543,7 @@ packages: - libstdcxx >=14 license: MIT 
license_family: MIT + purls: [] size: 197671 timestamp: 1767681179883 - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 @@ -5508,6 +6551,7 @@ packages: md5: 0c96522c6bdaed4b1566d11387caaf45 license: BSD-3-Clause license_family: BSD + purls: [] size: 397370 timestamp: 1566932522327 - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 @@ -5515,6 +6559,7 @@ packages: md5: 34893075a5c9e55cdafac56607368fc6 license: OFL-1.1 license_family: Other + purls: [] size: 96530 timestamp: 1620479909603 - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 @@ -5522,6 +6567,7 @@ packages: md5: 4d59c254e01d9cde7957100457e2d5fb license: OFL-1.1 license_family: Other + purls: [] size: 700814 timestamp: 1620479612257 - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda @@ -5529,6 +6575,7 @@ packages: md5: 49023d73832ef61042f6a237cb2687e7 license: LicenseRef-Ubuntu-Font-Licence-Version-1.0 license_family: Other + purls: [] size: 1620504 timestamp: 1727511233259 - conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.17.1-h27c8c51_0.conda @@ -5546,6 +6593,22 @@ packages: license_family: MIT size: 270705 timestamp: 1771382710863 +- conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.18.1-h27c8c51_0.conda + sha256: 2e50bdcebdf70a865b81f2456bbc586386451ec601c60f2b6cd22b8c40a2d384 + md5: e0e050cfa9fa85fe39632ab11cb7f3e0 + depends: + - __glibc >=2.17,<3.0.a0 + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - libgcc >=14 + - libuuid >=2.42.1,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 281880 + timestamp: 1780450077431 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fontconfig-2.17.1-hba86a56_0.conda sha256: 835aff8615dd8d8fff377679710ce81b8a2c47b6404e21a92fb349fda193a15c md5: 
0fed1ff55f4938a65907f3ecf62609db @@ -5560,6 +6623,21 @@ packages: license_family: MIT size: 279044 timestamp: 1771382728182 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fontconfig-2.18.1-hba86a56_0.conda + sha256: 2ccfd118269d363a5506161c4a0d96da46d2f01beecc74e0540a54b4737d0e45 + md5: f4d29a0cd77104a683607319a542ac7e + depends: + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - libgcc >=14 + - libuuid >=2.42.1,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 290522 + timestamp: 1780450108132 - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.17.1-hd47e2ca_0.conda sha256: ff2db9d305711854de430f946dc59bd40167940a1de38db29c5a78659f219d9c md5: a0b1b87e871011ca3b783bbf410bc39f @@ -5576,6 +6654,24 @@ packages: license_family: MIT size: 195332 timestamp: 1771382820659 +- conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda + sha256: 9217184c4a8e82101b0e512b059ae3ff67e3913133b9031edad89ab5341284e4 + md5: abd79bad98c99c1a116154d6de74ea89 + depends: + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - libiconv >=1.18,<2.0a0 + - libintl >=0.22.5,<1.0a0 + - libzlib >=1.3.2,<2.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: MIT + license_family: MIT + purls: [] + size: 202630 + timestamp: 1780450217840 - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 sha256: a997f2f1921bb9c9d76e6fa2f6b408b7fa549edd349a77639c9fe7a23ea93e61 md5: fee5683a3f04bd15cbd8318b096a27ab @@ -5583,6 +6679,7 @@ packages: - fonts-conda-forge license: BSD-3-Clause license_family: BSD + purls: [] size: 3667 timestamp: 1566974674465 - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda @@ -5595,6 +6692,7 @@ packages: - font-ttf-source-code-pro license: BSD-3-Clause license_family: BSD + purls: [] size: 4059 timestamp: 
1762351264405 - conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.14.2-ha770c72_0.conda @@ -5606,6 +6704,16 @@ packages: license: GPL-2.0-only OR FTL size: 174292 timestamp: 1772757205296 +- conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.14.3-ha770c72_0.conda + sha256: c934c385889c7836f034039b43b05ccfa98f53c900db03d8411189892ced090b + md5: 8462b5322567212beeb025f3519fb3e2 + depends: + - libfreetype 2.14.3 ha770c72_0 + - libfreetype6 2.14.3 h73754d4_0 + license: GPL-2.0-only OR FTL + purls: [] + size: 173839 + timestamp: 1774298173462 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/freetype-2.14.2-h8af1aa0_0.conda sha256: ecbe6e811574fba5194b29ac3a2badea5eaa060bd9fe7f5bd48a70d16ef38e5a md5: 9cb47d7bbb36646c44d7cf1cb8047887 @@ -5615,6 +6723,16 @@ packages: license: GPL-2.0-only OR FTL size: 173437 timestamp: 1772756019067 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/freetype-2.14.3-h8af1aa0_1.conda + sha256: 1112c56bc19cbce233b30d9d31ce8eb6fcc100c9baa5145315aaa1e3a25b5178 + md5: 5e8e88bfb3fbb0df0f9f8bb890721e07 + depends: + - libfreetype 2.14.3 h8af1aa0_1 + - libfreetype6 2.14.3 hdae7a39_1 + license: GPL-2.0-only OR FTL + purls: [] + size: 174060 + timestamp: 1780933507786 - conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.2-h57928b3_0.conda sha256: 6dd4bb3862ea3d07015331059504cf3b6af1a11a6909e7a9b6e04a20e253da28 md5: c360b467564b875a9f5dc481b8726cee @@ -5624,6 +6742,17 @@ packages: license: GPL-2.0-only OR FTL size: 185633 timestamp: 1772756186241 +- conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.3-h57928b3_1.conda + sha256: a0e419e96146159f12344c870dca608d11bca36841f228092b986ffc2e1e0f02 + md5: e77293b32225b136a8be300f93d0e89f + depends: + - libfreetype 2.14.3 h57928b3_1 + - libfreetype6 2.14.3 hdbac1cb_1 + - zlib + license: GPL-2.0-only OR FTL + purls: [] + size: 185584 + timestamp: 1780934817461 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/fribidi-1.0.16-hb03c661_0.conda sha256: 858283ff33d4c033f4971bf440cebff217d5552a5222ba994c49be990dacd40d md5: f9f81ea472684d75b9dd8d0b328cf655 @@ -5631,6 +6760,7 @@ packages: - __glibc >=2.17,<3.0.a0 - libgcc >=14 license: LGPL-2.1-or-later + purls: [] size: 61244 timestamp: 1757438574066 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fribidi-1.0.16-he30d5cf_0.conda @@ -5639,6 +6769,7 @@ packages: depends: - libgcc >=14 license: LGPL-2.1-or-later + purls: [] size: 62909 timestamp: 1757438620177 - conda: https://conda.anaconda.org/conda-forge/win-64/fribidi-1.0.16-hfd05255_0.conda @@ -5649,6 +6780,7 @@ packages: - vc >=14.3,<15 - vc14_runtime >=14.44.35208 license: LGPL-2.1-or-later + purls: [] size: 64394 timestamp: 1757438741305 - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.2.0-pyhd8ed1ab_0.conda @@ -5660,6 +6792,17 @@ packages: license_family: BSD size: 148757 timestamp: 1770387898414 +- conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.6.0-pyhd8ed1ab_0.conda + sha256: fe0156e6d658be3531aad2a99e42e8ad1ee23c69837d469c44c1b6010373913d + md5: 7d7e6c826ba0743fc491ebee0e7b899c + depends: + - python >=3.10 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/fsspec?source=compressed-mapping + size: 149709 + timestamp: 1781615868173 - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-14.3.0-h0dff253_18.conda sha256: 9b34b57b06b485e33a40d430f71ac88c8f381673592507cf7161c50ff0832772 md5: 52d6457abc42e320787ada5f9033fa99 @@ -5670,6 +6813,17 @@ packages: license_family: BSD size: 29506 timestamp: 1771378321585 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-15.2.0-h0dff253_19.conda + sha256: 54a0d9ee655ba83b78b7a796f12224b26c24943d8970559ecc47ccd6c2b0fa72 + md5: 18ec2ee87e4f532afa459ce8ea9a6b02 + depends: + - conda-gcc-specs + - gcc_impl_linux-64 15.2.0 he0086c7_19 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 29561 + 
timestamp: 1778269371353 - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc-15.2.0-h6f77f03_18.conda sha256: d120a7616f8b2717fc2a9d0246b53f69ce3fb33e565d22dba44e3d6827ee4f12 md5: 094638a454410aa77586ffcc9a403aef @@ -5702,6 +6856,18 @@ packages: license_family: BSD size: 29408 timestamp: 1771378529822 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gcc-15.2.0-h24a549f_19.conda + sha256: 3d75b775f6ab3977b2b72b9940806603158d10d60ff3e66930a59b74c4305219 + md5: 6b596ff6f13f6e46365d2182c6ea1e53 + depends: + - gcc_impl_linux-aarch64 15.2.0 h3530432_19 + track_features: + - gcc_no_conda_specs + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 29538 + timestamp: 1778269115054 - conda: https://conda.anaconda.org/conda-forge/win-64/gcc-15.2.0-hd556455_18.conda sha256: 349dd70890b3bb51d8f7a7976f53711f4606c076a659ee7fdc7c32e2ffa019a1 md5: 0f295318682c2fbefbe293399fae135f @@ -5728,6 +6894,23 @@ packages: license_family: GPL size: 76302378 timestamp: 1771378056505 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-15.2.0-he0086c7_19.conda + sha256: a48400ec4b73369c1c59babe4ad35821b63a88bba0ec40a80cea5f8c53a26b83 + md5: e3be72048d3c4a78b8e27ec48ba06252 + depends: + - binutils_impl_linux-64 >=2.45 + - libgcc >=15.2.0 + - libgcc-devel_linux-64 15.2.0 hcc6f6b0_119 + - libgomp >=15.2.0 + - libsanitizer 15.2.0 h90f66d4_19 + - libstdcxx >=15.2.0 + - libstdcxx-devel_linux-64 15.2.0 hd446a21_119 + - sysroot_linux-64 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 81180457 + timestamp: 1778269124617 - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-15.2.0-he420e7e_18.conda sha256: a088cfd3ae6fa83815faa8703bc9d21cc915f17bd1b51aac9c16ddf678da21e4 md5: cf56b6d74f580b91fd527e10d9a2e324 @@ -5760,6 +6943,23 @@ packages: license_family: GPL size: 69149627 timestamp: 1771377858762 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/gcc_impl_linux-aarch64-15.2.0-h3530432_19.conda + sha256: cd23829b5fb7f3ff5f44eab2da1a993e06bdf759b681a0a7a73bb5783755b6b3 + md5: 66dfb62e7a47e2b511f9c5ee0ff1abf3 + depends: + - binutils_impl_linux-aarch64 >=2.45 + - libgcc >=15.2.0 + - libgcc-devel_linux-aarch64 15.2.0 h55c397f_119 + - libgomp >=15.2.0 + - libsanitizer 15.2.0 he19c465_19 + - libstdcxx >=15.2.0 + - libstdcxx-devel_linux-aarch64 15.2.0 ha7b1723_119 + - sysroot_linux-aarch64 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 73237372 + timestamp: 1778268860495 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gcc_impl_linux-aarch64-15.2.0-hcedddb3_18.conda sha256: 12919d985a6c6787872699c7a3c295dad07f4084f2d850e9c7fe592ee0a6806b md5: 761a75d8c098913bc1186b26588051e0 @@ -5806,6 +7006,22 @@ packages: license_family: LGPL size: 575109 timestamp: 1771530561157 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gdk-pixbuf-2.44.6-h2b0a6b4_0.conda + sha256: c5594497f0646e9079705b3199dbb2d5b13c48173cf110000fa1c8818e2b3e0c + md5: 7892f39a39ed39591a89a28eba03e987 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libglib >=2.86.4,<3.0a0 + - libjpeg-turbo >=3.1.2,<4.0a0 + - liblzma >=5.8.2,<6.0a0 + - libpng >=1.6.56,<1.7.0a0 + - libtiff >=4.7.1,<4.8.0a0 + license: LGPL-2.1-or-later + license_family: LGPL + purls: [] + size: 577414 + timestamp: 1774985848058 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gdk-pixbuf-2.44.5-h90308e0_1.conda sha256: aa95b37da0750fb93c5eeef79073b9b0d50976fa0dc02ed0301ff7bbbfc7ff36 md5: c75ae103325db056719dd51d6525e1cd @@ -5820,6 +7036,21 @@ packages: license_family: LGPL size: 584221 timestamp: 1771532437279 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gdk-pixbuf-2.44.6-h90308e0_0.conda + sha256: 53ac38045a8c0b6aa9cfaf784443a3744dc86ab4737c1479b44ae85c96926fe1 + md5: bdd860e72c5e10eb4ffa3d61f9b02ee0 + depends: + - libgcc >=14 + - 
libglib >=2.86.4,<3.0a0 + - libjpeg-turbo >=3.1.2,<4.0a0 + - liblzma >=5.8.2,<6.0a0 + - libpng >=1.6.56,<1.7.0a0 + - libtiff >=4.7.1,<4.8.0a0 + license: LGPL-2.1-or-later + license_family: LGPL + purls: [] + size: 583708 + timestamp: 1774987740322 - conda: https://conda.anaconda.org/conda-forge/win-64/gdk-pixbuf-2.44.5-h1f5b9c4_1.conda sha256: 82c725a67098c7c43dfc33ba292a48e68530135b94a8703f20566d90574acdfd md5: 4059b4975e2de5894286dbe6bd6728fb @@ -5837,6 +7068,24 @@ packages: license_family: LGPL size: 574950 timestamp: 1771530717329 +- conda: https://conda.anaconda.org/conda-forge/win-64/gdk-pixbuf-2.44.6-h1f5b9c4_0.conda + sha256: 3b8a4bdb183b3b9b70caa91498680add15fb70678ec2a21391e6860c5dfed3e7 + md5: e1ff1d17cb48f89d71f74b0c5eab3b47 + depends: + - libglib >=2.86.4,<3.0a0 + - libintl >=0.22.5,<1.0a0 + - libjpeg-turbo >=3.1.2,<4.0a0 + - liblzma >=5.8.2,<6.0a0 + - libpng >=1.6.56,<1.7.0a0 + - libtiff >=4.7.1,<4.8.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: LGPL-2.1-or-later + license_family: LGPL + purls: [] + size: 576065 + timestamp: 1774986034812 - conda: https://conda.anaconda.org/conda-forge/linux-64/glslang-16.2.0-h96af755_1.conda sha256: 88a5ad3571948bde22957d08ab01328b8a7eb04fdee66268b3125cc322dbde8b md5: ba5b655d827f263090ad2dc514810328 @@ -5849,6 +7098,19 @@ packages: license_family: BSD size: 1353008 timestamp: 1770195199411 +- conda: https://conda.anaconda.org/conda-forge/linux-64/glslang-16.3.0-h96af755_0.conda + sha256: 3c9b6a90937a96ad27d160304cdbe5e9961db613aba2b84ff673429f0c61d48e + md5: d175cb2c14104728ada04883786a309d + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - spirv-tools >=2026,<2027.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 1366082 + timestamp: 1777747028121 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/glslang-16.2.0-h124e036_1.conda sha256: a1c0db6c226b9d80e74bdd49f604eece637489c8c71e6ae63ada8db9e2359944 md5: 
3ead7f968b529f76f972621558ed2f68 @@ -5860,6 +7122,18 @@ packages: license_family: BSD size: 1348415 timestamp: 1770195275881 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/glslang-16.3.0-h124e036_0.conda + sha256: bae4806f4076cf9f91089fbeae7c9357ce4348df3657c25b249ac4487beed230 + md5: c0045dffcc3660ecd1b9123df377796f + depends: + - libgcc >=14 + - libstdcxx >=14 + - spirv-tools >=2026,<2027.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 1359428 + timestamp: 1777747105441 - conda: https://conda.anaconda.org/conda-forge/win-64/glslang-16.2.0-h294ba9c_1.conda sha256: c46afa4a43b7709e07a69d0a2d70b10f59f22e96dbf9ec80e53a42cc6551111c md5: 4b5f576265df0a05d4e47e48c50bb4e6 @@ -5872,6 +7146,19 @@ packages: license_family: BSD size: 4929181 timestamp: 1770195251565 +- conda: https://conda.anaconda.org/conda-forge/win-64/glslang-16.3.0-h294ba9c_0.conda + sha256: d80276b89d8aeab6ff0d8d7d4b9af336b368fc0b8fa28ea8cde6f6f2aa07bacf + md5: 7d6fed8a6ebeeebd6362790e22e56bb3 + depends: + - spirv-tools >=2026,<2027.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 5074630 + timestamp: 1777747167205 - conda: https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda sha256: 309cf4f04fec0c31b6771a5809a1909b4b3154a2208f52351e1ada006f4c750c md5: c94a5994ef49749880a8139cf9afcbe1 @@ -5879,6 +7166,7 @@ packages: - libgcc-ng >=12 - libstdcxx-ng >=12 license: GPL-2.0-or-later OR LGPL-3.0-or-later + purls: [] size: 460055 timestamp: 1718980856608 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gmp-6.3.0-h0a1ffab_2.conda @@ -5888,6 +7176,7 @@ packages: - libgcc-ng >=12 - libstdcxx-ng >=12 license: GPL-2.0-or-later OR LGPL-3.0-or-later + purls: [] size: 417323 timestamp: 1718980707330 - conda: https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.3.0-py314h28848ee_1.conda @@ -5903,6 +7192,8 @@ packages: - python_abi 3.14.* *_cp314 
license: LGPL-3.0-or-later license_family: LGPL + purls: + - pkg:pypi/gmpy2?source=hash-mapping size: 254716 timestamp: 1773245106880 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gmpy2-2.3.0-py314h887ad84_1.conda @@ -5918,6 +7209,8 @@ packages: - python_abi 3.14.* *_cp314 license: LGPL-3.0-or-later license_family: LGPL + purls: + - pkg:pypi/gmpy2?source=hash-mapping size: 245553 timestamp: 1773245145237 - conda: https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.14-hecca717_2.conda @@ -5931,6 +7224,18 @@ packages: license_family: LGPL size: 99596 timestamp: 1755102025473 +- conda: https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.15-hecca717_0.conda + sha256: 885fa7d1d7e2ad9ed0a700ee0d81ceb49de278253082d517959b22d6336eecce + md5: cf09e9fc938518e91d0706572cadf17a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: LGPL-2.0-or-later + license_family: LGPL + purls: [] + size: 100054 + timestamp: 1780454302233 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/graphite2-1.3.14-hfae3067_2.conda sha256: c9b1781fe329e0b77c5addd741e58600f50bef39321cae75eba72f2f381374b7 md5: 4aa540e9541cc9d6581ab23ff2043f13 @@ -5941,6 +7246,17 @@ packages: license_family: LGPL size: 102400 timestamp: 1755102000043 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/graphite2-1.3.15-hfae3067_0.conda + sha256: 3e529c517a76a1f4497c51eeedeeb927d33f732dcdb48055a020a83eb3e4e95c + md5: 4db044857ab1d09b2e8f0013c65387c1 + depends: + - libgcc >=14 + - libstdcxx >=14 + license: LGPL-2.0-or-later + license_family: LGPL + purls: [] + size: 103119 + timestamp: 1780455096710 - conda: https://conda.anaconda.org/conda-forge/win-64/graphite2-1.3.14-hac47afa_2.conda sha256: 5f1714b07252f885a62521b625898326ade6ca25fbc20727cfe9a88f68a54bfd md5: b785694dd3ec77a011ccf0c24725382b @@ -5952,6 +7268,18 @@ packages: license_family: LGPL size: 96336 timestamp: 1755102441729 +- conda: 
https://conda.anaconda.org/conda-forge/win-64/graphite2-1.3.15-hac47afa_0.conda + sha256: 88b6601f8edae59834b59b521e293ff3b58361dc1603240f5a8328c24e6936ad + md5: ff9a9bfe791f56b0227597a7651a6af0 + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: LGPL-2.0-or-later + license_family: LGPL + purls: [] + size: 97308 + timestamp: 1780454389458 - conda: https://conda.anaconda.org/conda-forge/linux-64/greenlet-3.3.2-py314h42812f9_0.conda sha256: fdeec5dbb5f964b1709f3d6f697137f0e68650e09ffa80b9b1bee2afb2373da4 md5: 511748f9debe034ff88eef99bc215fd3 @@ -6017,6 +7345,17 @@ packages: license_family: BSD size: 28723 timestamp: 1771378698305 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx-15.2.0-h76987e4_19.conda + sha256: b00817919d7b2d68e3299031c5332855576ae086ac80032aa0a78b7f6f12dae4 + md5: 327876a856b3a45001cfb9a855efa65f + depends: + - gcc 15.2.0 h0dff253_19 + - gxx_impl_linux-64 15.2.0 hda75c37_19 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 28945 + timestamp: 1778269389494 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gxx-14.3.0-ha384071_18.conda sha256: 09fb56bcb1594d667e39b1ff4fced377f1b3f6c83f5b651d500db0b4865df68a md5: 3d5380505980f8859a796af4c1b49452 @@ -6037,6 +7376,17 @@ packages: license_family: BSD size: 28780 timestamp: 1771378557194 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gxx-15.2.0-ha384071_19.conda + sha256: 23e75a54b9ff48563a3be38c1d17232cf3d95fcddc938852b33090aea6d22505 + md5: e528c6fa3071fcad03ba0e530217fbe9 + depends: + - gcc 15.2.0 h24a549f_19 + - gxx_impl_linux-aarch64 15.2.0 h03e2352_19 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 29003 + timestamp: 1778269132414 - conda: https://conda.anaconda.org/conda-forge/win-64/gxx-15.2.0-hf1b5d6d_18.conda sha256: e85f25cee7618096463f426ec4c6ddd7c93058ed71c94d894c17dcb3269d867e md5: 882c461155d96001e0611b70ab620e9b @@ -6071,6 +7421,19 @@ packages: license_family: GPL size: 
15587873 timestamp: 1771378609722 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-15.2.0-hda75c37_19.conda + sha256: 3f5288346b9fe233352443b3c2e31f1fde845e39d3e96475fc05ec2e782af158 + md5: 9d41f3899b512199af0a4bb939b83e21 + depends: + - gcc_impl_linux-64 15.2.0 he0086c7_19 + - libstdcxx-devel_linux-64 15.2.0 hd446a21_119 + - sysroot_linux-64 + - tzdata + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 16356816 + timestamp: 1778269332159 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gxx_impl_linux-aarch64-14.3.0-h0d4f5d4_18.conda sha256: 859a78ff16bef8d1d1d89d0604929c3c256ac0248b9a688e8defe9bbc027c886 md5: a12277d1ec675dbb993ad72dce735530 @@ -6095,6 +7458,19 @@ packages: license_family: GPL size: 15371317 timestamp: 1771378487467 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/gxx_impl_linux-aarch64-15.2.0-h03e2352_19.conda + sha256: afb0fc36b93539a8e43a8063c8d3e1b4bace38a5a0c3c9e1978c72792d633c62 + md5: 7214ae8a8aade7b48a2bfd8bbb4d9e79 + depends: + - gcc_impl_linux-aarch64 15.2.0 h3530432_19 + - libstdcxx-devel_linux-aarch64 15.2.0 ha7b1723_119 + - sysroot_linux-aarch64 + - tzdata + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 14640001 + timestamp: 1778269082840 - conda: https://conda.anaconda.org/conda-forge/win-64/gxx_impl_win-64-15.2.0-h22fd5bf_18.conda sha256: 55a524b1910bf26952d08aeb89b0496d423110378e991b5ff6ef2c662b884760 md5: 88379befc88f4efb16733dae4b96dac4 @@ -6140,6 +7516,26 @@ packages: license_family: MIT size: 2615630 timestamp: 1773217509651 +- conda: https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-14.2.1-h6083320_0.conda + sha256: da9901aa1e20cbc2369fda212039b294dd02bce95f005539bab840b7310bf7d0 + md5: 21ee4640b7c2d94e584349fa12b29b9a + depends: + - __glibc >=2.17,<3.0.a0 + - cairo >=1.18.4,<2.0a0 + - graphite2 >=1.3.14,<2.0a0 + - icu >=78.3,<79.0a0 + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - 
libfreetype6 >=2.14.3 + - libgcc >=14 + - libglib >=2.88.1,<3.0a0 + - libstdcxx >=14 + - libzlib >=1.3.2,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 2362258 + timestamp: 1780450503234 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/harfbuzz-13.1.0-h1134a53_0.conda sha256: 49074457bdc624c0c0f39bb4b9b7689ec6334127ed7d5312484908f48e9a8e20 md5: 811bb5384d92870a3492fab4de4ff3f6 @@ -6158,6 +7554,25 @@ packages: license_family: MIT size: 2346492 timestamp: 1773222371375 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/harfbuzz-14.2.1-h1134a53_0.conda + sha256: 17a671aa62e1f0a8750514353e3d6e9aab80598908d9b107fc7f3cf7972176b6 + md5: 5f3ec279ab7cc391b7dff69dc08298fa + depends: + - cairo >=1.18.4,<2.0a0 + - graphite2 >=1.3.14,<2.0a0 + - icu >=78.3,<79.0a0 + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - libgcc >=14 + - libglib >=2.88.1,<3.0a0 + - libstdcxx >=14 + - libzlib >=1.3.2,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 2786349 + timestamp: 1780454506157 - conda: https://conda.anaconda.org/conda-forge/win-64/harfbuzz-13.1.0-h5a1b470_0.conda sha256: 27acd845926048481a831b7321674b3f92accde49869fb95438f0a35ea89419b md5: b3a4ff5d1e21d58090cd87060eb54c2d @@ -6177,6 +7592,26 @@ packages: license_family: MIT size: 1285640 timestamp: 1773217788574 +- conda: https://conda.anaconda.org/conda-forge/win-64/harfbuzz-14.2.1-h5a1b470_0.conda + sha256: 55d6d483e089afe68bdbb38a003d7b76002e65341665b80f38e6ce4b494beef6 + md5: 0bcbb7f911590beec914555c6b82050d + depends: + - cairo >=1.18.4,<2.0a0 + - graphite2 >=1.3.14,<2.0a0 + - icu >=78.3,<79.0a0 + - libexpat >=2.8.1,<3.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - libglib >=2.88.1,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + license: MIT + license_family: MIT + purls: [] + size: 1304897 + timestamp: 1780450940279 - conda: 
https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba md5: 0a802cb9888dd14eeefc611f05c40b6e @@ -6267,6 +7702,18 @@ packages: license_family: MIT size: 13222158 timestamp: 1767970128854 +- conda: https://conda.anaconda.org/conda-forge/win-64/icu-78.3-h637d24d_0.conda + sha256: 1bda728d70a619731b278c859eda364146cb5b4b8c739a64da8128353d81d1c4 + md5: 0097b24800cb696915c3dbd1f5335d3f + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: MIT + license_family: MIT + purls: [] + size: 14954024 + timestamp: 1773822508646 - conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda sha256: ae89d0299ada2a3162c2614a9d26557a92aa6a77120ce142f8e0109bbf0342b0 md5: 53abe63df7e10a6ba605dc5f9f961d36 @@ -6349,6 +7796,18 @@ packages: - pkg:pypi/iniconfig?source=hash-mapping size: 13387 timestamp: 1760831448842 +- conda: https://conda.anaconda.org/conda-forge/linux-64/intel-gmmlib-22.10.0-hb700be7_0.conda + sha256: bc231d69eb6663db0e09738fb916c5e5507147cf1ac60f364f964004e0b29bab + md5: 10909406c1b0e4b57f9f4f0eb0999af8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + purls: [] + size: 1013714 + timestamp: 1774422680665 - conda: https://conda.anaconda.org/conda-forge/linux-64/intel-gmmlib-22.9.0-hb700be7_0.conda sha256: edad668db79c6c4899d46e1cd4a331f5d008f9ed8f7d2e39e1dfe1a2d81acec0 md5: 26311c5112b5c713f472bdfbb5ec5aa3 @@ -6373,6 +7832,20 @@ packages: license_family: MIT size: 8783533 timestamp: 1773230300873 +- conda: https://conda.anaconda.org/conda-forge/linux-64/intel-media-driver-26.1.6-hecca717_0.conda + sha256: 7cbd7fda22db70c64af64c9173434a4ede58e4f220bda52a044e469aa94c65cb + md5: aaf7c3db8c7c4533deb5449d3ba1c51f + depends: + - __glibc >=2.17,<3.0.a0 + - intel-gmmlib >=22.10.0,<23.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libva >=2.23.0,<3.0a0 + license: MIT 
+ license_family: MIT + purls: [] + size: 8782375 + timestamp: 1776080148587 - conda: https://conda.anaconda.org/conda-forge/noarch/ipykernel-7.2.0-pyh6dadd2b_1.conda sha256: 9cdadaeef5abadca4113f92f5589db19f8b7df5e1b81cb0225f7024a3aedefa3 md5: b3a7d5842f857414d9ae831a799444dd @@ -6615,6 +8088,7 @@ packages: - sysroot_linux-64 ==2.28 license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later license_family: GPL + purls: [] size: 1278712 timestamp: 1765578681495 - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-aarch64-4.18.0-h05a177a_9.conda @@ -6624,6 +8098,7 @@ packages: - sysroot_linux-aarch64 ==2.28 license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later license_family: GPL + purls: [] size: 1248134 timestamp: 1765578613607 - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda @@ -6696,6 +8171,7 @@ packages: - libgcc-ng >=12 license: LGPL-2.0-only license_family: LGPL + purls: [] size: 508258 timestamp: 1664996250081 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lame-3.100-h4e544f5_1003.tar.bz2 @@ -6705,6 +8181,7 @@ packages: - libgcc-ng >=12 license: LGPL-2.0-only license_family: LGPL + purls: [] size: 604863 timestamp: 1664997611416 - conda: https://conda.anaconda.org/conda-forge/win-64/lame-3.100-hcfcfb64_1003.tar.bz2 @@ -6716,8 +8193,34 @@ packages: - vs2015_runtime >=14.29.30139 license: LGPL-2.0-only license_family: LGPL + purls: [] size: 570583 timestamp: 1664996824680 +- conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.19.1-h0c24ade_1.conda + sha256: 112b5b9462572d970f4abd2912f76a25ee7db158b1e7260163d91dd8a630db84 + md5: 8b3ce45e929cd8e8e5f4d18586b56d8b + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libjpeg-turbo >=3.1.4.1,<4.0a0 + - libtiff >=4.7.1,<4.8.0a0 + license: MIT + license_family: MIT + purls: [] + size: 251971 + timestamp: 1780211695895 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/lcms2-2.19.1-h9d5b58d_1.conda + sha256: ed213207bbf11663181941e0931caa9ce748f0544688e8e0fbcf330bca279389 + md5: 9183fda4be2b4ee5760cdb8e540439c8 + depends: + - libgcc >=14 + - libjpeg-turbo >=3.1.4.1,<4.0a0 + - libtiff >=4.7.1,<4.8.0a0 + license: MIT + license_family: MIT + purls: [] + size: 296564 + timestamp: 1780211834883 - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda sha256: 565941ac1f8b0d2f2e8f02827cbca648f4d18cd461afc31f15604cd291b5c5f3 md5: 12bd9a3f089ee6c9266a37dab82afabd @@ -6786,6 +8289,7 @@ packages: - libstdcxx >=14 license: Apache-2.0 license_family: Apache + purls: [] size: 261513 timestamp: 1773113328888 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lerc-4.1.0-h52b7260_0.conda @@ -6796,6 +8300,7 @@ packages: - libstdcxx >=14 license: Apache-2.0 license_family: Apache + purls: [] size: 240444 timestamp: 1773114901155 - conda: https://conda.anaconda.org/conda-forge/win-64/lerc-4.1.0-hd936e49_0.conda @@ -6807,6 +8312,7 @@ packages: - vc14_runtime >=14.44.35208 license: Apache-2.0 license_family: Apache + purls: [] size: 172395 timestamp: 1773113455582 - conda: https://conda.anaconda.org/conda-forge/linux-64/level-zero-1.28.2-hb700be7_0.conda @@ -6820,6 +8326,18 @@ packages: license_family: MIT size: 858387 timestamp: 1772045965844 +- conda: https://conda.anaconda.org/conda-forge/linux-64/level-zero-1.29.0-hb700be7_0.conda + sha256: d87cfc5eaa08eefff97d891ecb49faa958fcfc32a425767796269c4100d4e516 + md5: f3c3bc77c96af553f761af0e78bc8d9d + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + purls: [] + size: 875773 + timestamp: 1780142086148 - conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20260107.1-cxx17_h7b12aa8_0.conda sha256: a7a4481a4d217a3eadea0ec489826a69070fcc3153f00443aa491ed21527d239 md5: 6f7b4302263347698fd24565fbf11310 @@ -6834,6 +8352,21 @@ 
packages: license_family: Apache size: 1384817 timestamp: 1770863194876 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20260526.0-cxx17_h7b12aa8_1.conda + sha256: 32933de2d4fa6e6ffd949052815b49cb65a0649ad70007155c533ab97ea8cefd + md5: c4393db381bffa0a83a8d9e47b238106 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - abseil-cpp =20260526.0 + - libabseil-static =20260526.0=cxx17* + license: Apache-2.0 + license_family: Apache + purls: [] + size: 1437712 + timestamp: 1780524559298 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libabseil-20260107.1-cxx17_h6983b43_0.conda sha256: 37675140819e10235a8ff342cb09f688f843ac390b64856d8e230700bbd7d5aa md5: 2a19160c13e688710dd200812fc9a6d3 @@ -6847,6 +8380,20 @@ packages: license_family: Apache size: 1401836 timestamp: 1770863223557 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libabseil-20260526.0-cxx17_h6983b43_1.conda + sha256: 51f53ae6266889f0972a10c2773465da5554fdf55ac15b9dea3e7f77520022d9 + md5: d8637f7cc7143fe4ad2eceac8e8cf033 + depends: + - libgcc >=14 + - libstdcxx >=14 + constrains: + - abseil-cpp =20260526.0 + - libabseil-static =20260526.0=cxx17* + license: Apache-2.0 + license_family: Apache + purls: [] + size: 1457025 + timestamp: 1780524543286 - conda: https://conda.anaconda.org/conda-forge/linux-64/libass-0.17.4-h96ad9f0_0.conda sha256: 035eb8b54e03e72e42ef707420f9979c7427776ea99e0f1e3c969f92eb573f19 md5: d3be7b2870bf7aff45b12ea53165babd @@ -6862,6 +8409,7 @@ packages: - fonts-conda-ecosystem - harfbuzz >=11.0.1 license: ISC + purls: [] size: 152179 timestamp: 1749328931930 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libass-0.17.4-hcfe818d_0.conda @@ -6878,6 +8426,7 @@ packages: - libfreetype6 >=2.13.3 - libzlib >=1.3.1,<2.0a0 license: ISC + purls: [] size: 171287 timestamp: 1749328949722 - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda @@ -6933,6 
+8482,25 @@ packages: purls: [] size: 18621 timestamp: 1774503034895 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-8_h5875eb1_mkl.conda + build_number: 8 + sha256: e30f7fa2a2fb6985f9ac6604575cb318b9ae44e263f6cacc282daee9dbd6127d + md5: 8ae84a87356b604a62f1aee136ef8efb + depends: + - mkl >=2026.0.0,<2027.0a0 + constrains: + - blas 2.308 mkl + - libcblas 3.11.0 8*_mkl + - liblapacke 3.11.0 8*_mkl + - liblapack 3.11.0 8*_mkl + track_features: + - blas_mkl + - blas_mkl_2 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 19257 + timestamp: 1779859078137 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda build_number: 5 sha256: 700f3c03d0fba8e687a345404a45fbabe781c1cf92242382f62cef2948745ec4 @@ -6968,6 +8536,24 @@ packages: purls: [] size: 18682 timestamp: 1774503047392 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-8_haddc8a3_openblas.conda + build_number: 8 + sha256: c897399c943168c646f659952f73a9154f9122d7e9b151649dbe075dfdcd484b + md5: 8b44dad125760faa2b3925f5a6e3112d + depends: + - libopenblas >=0.3.33,<0.3.34.0a0 + - libopenblas >=0.3.33,<1.0a0 + constrains: + - libcblas 3.11.0 8*_openblas + - liblapack 3.11.0 8*_openblas + - mkl <2027 + - blas 2.308 openblas + - liblapacke 3.11.0 8*_openblas + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 18843 + timestamp: 1779859042591 - conda: https://conda.anaconda.org/conda-forge/win-64/libblas-3.11.0-5_hf2e6a31_mkl.conda build_number: 5 sha256: f0cb7b2697461a306341f7ff32d5b361bb84f3e94478464c1e27ee01fc8f276b @@ -6999,6 +8585,22 @@ packages: purls: [] size: 68082 timestamp: 1774503684284 +- conda: https://conda.anaconda.org/conda-forge/win-64/libblas-3.11.0-8_h8455456_mkl.conda + build_number: 8 + sha256: 43a87b59e6d4c68d80b2e4de487b1b54d66fe1f9a06636909b5a5ab9eae27269 + md5: 4a0ce24b1a946ff77ae9eaa7ef015a33 + depends: + - mkl >=2026.0.0,<2027.0a0 + constrains: + - libcblas 3.11.0 
8*_mkl + - liblapacke 3.11.0 8*_mkl + - blas 2.308 mkl + - liblapack 3.11.0 8*_mkl + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 68103 + timestamp: 1779859688049 - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.2.0-hb03c661_1.conda sha256: 318f36bd49ca8ad85e6478bd8506c88d82454cc008c1ac1c6bf00a3c42fa610e md5: 72c8fd1af66bd67bf580645b426513ed @@ -7007,6 +8609,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 79965 timestamp: 1764017188531 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlicommon-1.2.0-he30d5cf_1.conda @@ -7016,6 +8619,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 80030 timestamp: 1764017273715 - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlicommon-1.2.0-hfd05255_1.conda @@ -7027,6 +8631,7 @@ packages: - vc14_runtime >=14.44.35208 license: MIT license_family: MIT + purls: [] size: 82042 timestamp: 1764017799966 - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.2.0-hb03c661_1.conda @@ -7038,6 +8643,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 34632 timestamp: 1764017199083 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlidec-1.2.0-he30d5cf_1.conda @@ -7048,6 +8654,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 33166 timestamp: 1764017282936 - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlidec-1.2.0-hfd05255_1.conda @@ -7060,6 +8667,7 @@ packages: - vc14_runtime >=14.44.35208 license: MIT license_family: MIT + purls: [] size: 34449 timestamp: 1764017851337 - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.2.0-hb03c661_1.conda @@ -7071,6 +8679,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 298378 timestamp: 1764017210931 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlienc-1.2.0-he30d5cf_1.conda @@ -7081,6 
+8690,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 309304 timestamp: 1764017292044 - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlienc-1.2.0-hfd05255_1.conda @@ -7093,6 +8703,7 @@ packages: - vc14_runtime >=14.44.35208 license: MIT license_family: MIT + purls: [] size: 252903 timestamp: 1764017901735 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda @@ -7117,6 +8728,17 @@ packages: purls: [] size: 124432 timestamp: 1774333989027 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.78-hd0affe5_0.conda + sha256: cc8c9fc6ddf0fbd3d1275b558ae9abad6cda23bced268732e2da21a87bb358cd + md5: f9f17eab7f3df1c6fd4b1a548a2f683a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 124335 + timestamp: 1775488792584 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda sha256: 154eefd8f94010d89ba76a057949b9b1f75c7379bd0d19d4657c952bedcf5904 md5: 10fe36ec0a9f7b1caae0331c9ba50f61 @@ -7137,6 +8759,16 @@ packages: purls: [] size: 109458 timestamp: 1774335293336 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.78-hf9559e3_0.conda + sha256: 14b6654d942be7a68496d4e52d6aa4e217cf82005a42c20d1880eb473e34eb3a + md5: 1503ce9f8a3df149a33ccd7c300ec1d2 + depends: + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 109192 + timestamp: 1775490102029 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda build_number: 5 sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 @@ -7180,8 +8812,25 @@ packages: license: BSD-3-Clause license_family: BSD purls: [] - size: 18622 - timestamp: 1774503050205 + size: 18622 + timestamp: 1774503050205 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-8_hfef963f_mkl.conda + build_number: 8 + sha256: 
a3ea22126a74321ddf754a0efaf998486ffb8b9ec69fc735b3f0eacb6ffc8a4e + md5: 2101410a3915785b2c1595d1ae94e32c + depends: + - libblas 3.11.0 8_h5875eb1_mkl + constrains: + - blas 2.308 mkl + - liblapacke 3.11.0 8*_mkl + - liblapack 3.11.0 8*_mkl + track_features: + - blas_mkl + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 18902 + timestamp: 1779859085492 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda build_number: 5 sha256: 3fad5c9de161dccb4e42c8b1ae8eccb33f4ed56bccbcced9cbb0956ae7869e61 @@ -7211,6 +8860,21 @@ packages: purls: [] size: 18689 timestamp: 1774503058069 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-8_hd72aa62_openblas.conda + build_number: 8 + sha256: 3ba039f0705022939d90e36c1ed2fcbafd7f5bb77563e3702202ae796b32f4d2 + md5: 76242b7ad6e43809afa8671dd609b4ed + depends: + - libblas 3.11.0 8_haddc8a3_openblas + constrains: + - liblapack 3.11.0 8*_openblas + - liblapacke 3.11.0 8*_openblas + - blas 2.308 openblas + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 18817 + timestamp: 1779859049133 - conda: https://conda.anaconda.org/conda-forge/win-64/libcblas-3.11.0-5_h2a3cdd5_mkl.conda build_number: 5 sha256: 49dc59d8e58360920314b8d276dd80da7866a1484a9abae4ee2760bc68f3e68d @@ -7240,6 +8904,21 @@ packages: purls: [] size: 68221 timestamp: 1774503722413 +- conda: https://conda.anaconda.org/conda-forge/win-64/libcblas-3.11.0-8_h2a3cdd5_mkl.conda + build_number: 8 + sha256: 2a5b6555b481df4603e44cba49a6ef727584fd2f3c5235dd4bcb3028fffbdfb5 + md5: 09f1d8e4d2675d34ad2acb115211d10c + depends: + - libblas 3.11.0 8_h8455456_mkl + constrains: + - liblapacke 3.11.0 8*_mkl + - blas 2.308 mkl + - liblapack 3.11.0 8*_mkl + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 68443 + timestamp: 1779859701498 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcublas-13.5.1.27-h676940d_0.conda sha256: 
39a1183f64d4ebff942117f7be9c0883b772ddf5796dee18bdda1d52949a9627 md5: 7bd32031313d7dca6c8250429b94bd03 @@ -7250,6 +8929,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 382200769 timestamp: 1779912294439 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcublas-13.5.1.27-he38c790_0.conda @@ -7265,6 +8945,7 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 491684476 timestamp: 1779912373471 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcudnn-9.20.0.48-ha4b6413_0.conda @@ -7283,6 +8964,23 @@ packages: license: LicenseRef-cuDNN-Software-License-Agreement size: 332092133 timestamp: 1773180273500 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcudnn-9.23.1.3-ha4b6413_0.conda + sha256: 8fdcd511eef3167db18afca483e0c668f458fbf77c1c3a2acdb3ea787e6663bc + md5: 942361a0d125cd276b2e7b69660e4c67 + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-nvrtc + - cuda-version >=13,<14.0a0 + - libcublas + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.2,<2.0a0 + constrains: + - libcudnn-jit <0a + license: LicenseRef-cuDNN-Software-License-Agreement + purls: [] + size: 447957779 + timestamp: 1781299475496 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcudnn-9.20.0.48-h0bf6004_0.conda sha256: a3993991464f6ffe81c354e6d4e4edfeef4b9cb7c12cd7e13bc08d91c1826b09 md5: 9f5f39cc3a13eaa80d7727973fda0a43 @@ -7300,6 +8998,24 @@ packages: license: LicenseRef-cuDNN-Software-License-Agreement size: 407901008 timestamp: 1773180233415 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcudnn-9.23.1.3-h0bf6004_0.conda + sha256: 3c3322771cc71928526cbae349154678f9da8043ee5bccfcb02cfa3b738aefe9 + md5: 4b232bc6c7529d335ce6c338f56af648 + depends: + - __glibc >=2.28,<3.0.a0 + - arm-variant * sbsa + - cuda-nvrtc + - cuda-version >=13,<14.0a0 + - libcublas + - libgcc >=14 + - libstdcxx >=14 + - libzlib 
>=1.3.2,<2.0a0 + constrains: + - libcudnn-jit <0a + license: LicenseRef-cuDNN-Software-License-Agreement + purls: [] + size: 544123581 + timestamp: 1781299494730 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcudss-0.7.1.4-h7bcfba5_1.conda sha256: 7d3afc0e0e5bff4d9adcf2f3454ac97a8812b5802ca04498e1f5d8db9d3fb24c md5: 6111650cfce61896d705230a878cc1a8 @@ -7317,6 +9033,24 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 62715991 timestamp: 1770671835770 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcudss-0.8.0.10-h7bcfba5_0.conda + sha256: 05c4f349df99636cae828489ce0f842bbbc7da0006ce4cf37b6d3d7433d06aef + md5: b993006a7f8c1f2d0e4adf1c9ac17201 + depends: + - __glibc >=2.28,<3.0.a0 + - _openmp_mutex >=4.5 + - cuda-version >=13,<14.0a0 + - libcublas + - libgcc >=14 + - libstdcxx >=14 + constrains: + - libcudss-commlayer-nccl 0.8.0.10 h84ff803_0 + - libcudss-commlayer-mpi 0.8.0.10 h6647138_0 + - libcudss0 <0.0.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] + size: 78215168 + timestamp: 1780355336257 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcudss-0.7.1.4-he387df4_1.conda sha256: b51195f067cb90871b0673dfe8564015513c4f81509018313efcb0a14d3f2391 md5: c53276b4f3f8eeaa91813b8a0196eb91 @@ -7335,6 +9069,25 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 62631375 timestamp: 1770671821410 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcudss-0.8.0.10-he387df4_0.conda + sha256: f23ac30a306476480b94e9a5634b695e235426d78c426c1d0f69895ac3bfcee7 + md5: a595ebcdfd12786a1e9e1ad1a7d47537 + depends: + - __glibc >=2.28,<3.0.a0 + - _openmp_mutex >=4.5 + - arm-variant * sbsa + - cuda-version >=13,<14.0a0 + - libcublas + - libgcc >=14 + - libstdcxx >=14 + constrains: + - libcudss0 <0.0.0a0 + - libcudss-commlayer-nccl 0.8.0.10 h5eac28b_0 + - libcudss-commlayer-mpi 0.8.0.10 h40415f0_0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] + 
size: 78088194 + timestamp: 1780355374368 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufft-12.3.0.29-hecca717_0.conda sha256: bd69d4b63be28c36e0fa962256666672e3f2eff5dfc06bdd545acef278f83754 md5: b347b9844eb16238c7f7b62cd2bd1e68 @@ -7344,6 +9097,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 150951336 timestamp: 1779897536120 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufft-12.3.0.29-h8f3c8d4_0.conda @@ -7357,6 +9111,7 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 151379070 timestamp: 1779897582547 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.14.1.1-hbc026e6_1.conda @@ -7422,6 +9177,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 43805393 timestamp: 1779897559895 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcurand-10.4.3.29-he38c790_0.conda @@ -7436,6 +9192,7 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 44131451 timestamp: 1779897594729 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcusolver-12.2.2.18-h676940d_0.conda @@ -7450,6 +9207,7 @@ packages: - libnvjitlink >=13.3.33,<14.0a0 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 181482480 timestamp: 1779918401910 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcusolver-12.2.2.18-he38c790_0.conda @@ -7467,6 +9225,7 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 198192690 timestamp: 1779918383247 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcusparse-12.8.1.7-hecca717_0.conda @@ -7479,6 +9238,7 @@ packages: - libnvjitlink >=13.3.33,<14.0a0 - libstdcxx >=14 license: 
LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 145249472 timestamp: 1779913723266 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcusparse-12.8.1.7-h8f3c8d4_0.conda @@ -7493,6 +9253,7 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement + purls: [] size: 161785577 timestamp: 1779913792758 - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda @@ -7503,6 +9264,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 73490 timestamp: 1761979956660 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda @@ -7512,6 +9274,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 71117 timestamp: 1761979776756 - conda: https://conda.anaconda.org/conda-forge/win-64/libdeflate-1.25-h51727cc_0.conda @@ -7523,8 +9286,34 @@ packages: - vc14_runtime >=14.44.35208 license: MIT license_family: MIT + purls: [] size: 156818 timestamp: 1761979842440 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libdovi-3.3.2-ha23c83e_4.conda + sha256: d15432f07f654583978712e034d308b103a8b4650f0fdec172b5031a8af2b6c9 + md5: b26a64dfb24fef32d3330e37ce5e4f44 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + constrains: + - __glibc >=2.17 + license: MIT + license_family: MIT + purls: [] + size: 311420 + timestamp: 1777838991858 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdovi-3.3.2-hf71c8f5_4.conda + sha256: 4e41c61b67d6f077db7364bc2911c4d81e6c8080b86605e9d0adb97a5ed654b6 + md5: 530f83c19ee601cb9cda5b305ddf02fd + depends: + - libgcc >=14 + constrains: + - __glibc >=2.17 + license: MIT + license_family: MIT + purls: [] + size: 316167 + timestamp: 1777838999692 - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda sha256: c076a213bd3676cc1ef22eeff91588826273513ccc6040d9bea68bccdc849501 md5: 9314bc5a1fe7d1044dc9dfd3ef400535 @@ -7536,6 
+9325,18 @@ packages: license_family: MIT size: 310785 timestamp: 1757212153962 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.127-hb03c661_0.conda + sha256: 7d3187c11b7ae66c5595a8afd5a7ce352a490527fdf6614cab129bc7f2c16ba3 + md5: d8d16b9b32a3c5df7e5b3350e2cbe058 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libpciaccess >=0.19,<0.20.0a0 + license: MIT + license_family: MIT + purls: [] + size: 311505 + timestamp: 1778975798004 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda sha256: 4e6cdb5dd37db794b88bec714b4418a0435b04d14e9f7afc8cc32f2a3ced12f2 md5: 2079727b538f6dd16f3fa579d4c3c53f @@ -7546,6 +9347,17 @@ packages: license_family: MIT size: 344548 timestamp: 1757212128414 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.127-he30d5cf_0.conda + sha256: 2a941ffcd6b09380344c2cb5b198d2743ce4fc30ec9a5c8c83e53368d8015aef + md5: 987d35ad350bb552a30f3d314f6c7655 + depends: + - libgcc >=14 + - libpciaccess >=0.19,<0.20.0a0 + license: MIT + license_family: MIT + purls: [] + size: 345283 + timestamp: 1778975814771 - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda sha256: d789471216e7aba3c184cd054ed61ce3f6dac6f87a50ec69291b9297f8c18724 md5: c277e0a4d549b03ac1e9d6cbbe3d017b @@ -7580,6 +9392,16 @@ packages: license: LicenseRef-libglvnd size: 44840 timestamp: 1731330973553 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_3.conda + sha256: 9a25ea93e8272785405a21d30f84e620befb1d545f6dfaae18f06103b5df0443 + md5: 75e9f795be506c96dd43cb09c7c8d557 + depends: + - __glibc >=2.17,<3.0.a0 + - libglvnd 1.7.0 ha4b6fd6_3 + license: LicenseRef-libglvnd + purls: [] + size: 46500 + timestamp: 1779728188901 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda sha256: 8962abf38a58c235611ce356b9899f6caeb0352a8bce631b0bcc59352fda455e md5: cf105bce884e4ef8c8ccdca9fe6695e7 @@ 
-7588,6 +9410,15 @@ packages: license: LicenseRef-libglvnd size: 53551 timestamp: 1731330990477 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_3.conda + sha256: b987d3874edfcd9c7ddca86c003cb04ae51160a72c173a24cd46ab9eeb8886ab + md5: ec017f25e5d01ef9dd81e95ff73ff051 + depends: + - libglvnd 1.7.0 hd24410f_3 + license: LicenseRef-libglvnd + purls: [] + size: 54600 + timestamp: 1779728234591 - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda sha256: d78f1d3bea8c031d2f032b760f36676d87929b18146351c4464c66b0869df3f5 md5: e7f7ce06ec24cfcfb9e36d28cf82ba57 @@ -7613,6 +9444,19 @@ packages: purls: [] size: 76624 timestamp: 1774719175983 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.8.1-hecca717_1.conda + sha256: 16feffd9ddbbe5b718515d38ee376c685ba95491cd901244e24671d20b952a77 + md5: b24d3c612f71e7aa74158d92106318b2 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.8.1.* + license: MIT + license_family: MIT + purls: [] + size: 77856 + timestamp: 1781203599810 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.7.4-hfae3067_0.conda sha256: 995ce3ad96d0f4b5ed6296b051a0d7b6377718f325bc0e792fbb96b0e369dad7 md5: 57f3b3da02a50a1be2a6fe847515417d @@ -7636,6 +9480,18 @@ packages: purls: [] size: 76523 timestamp: 1774719129371 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.8.1-hfae3067_1.conda + sha256: 20a5726bc8705d91437c9e6ef83b30da64a1719b869656d20a1ee818333ea5ac + md5: fac3b65a605cd253037fdf3daf2de8d9 + depends: + - libgcc >=14 + constrains: + - expat 2.8.1.* + license: MIT + license_family: MIT + purls: [] + size: 77649 + timestamp: 1781203572523 - conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.7.4-hac47afa_0.conda sha256: b31f6fb629c4e17885aaf2082fb30384156d16b48b264e454de4a06a313b533d md5: 1c1ced969021592407f16ada4573586d @@ -7663,6 +9519,20 @@ packages: purls: [] size: 70609 timestamp: 
1774719377850 +- conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.8.1-hac47afa_1.conda + sha256: 1a54d874addda73b6f7164d5f3905821277a1831bcc05edd74b3085391688571 + md5: ccc490c81ffe14181861beac0e8f3169 + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + constrains: + - expat 2.8.1.* + license: MIT + license_family: MIT + purls: [] + size: 71631 + timestamp: 1781203724164 - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 md5: a360c33a5abe61c07959e449fa1453eb @@ -7707,6 +9577,7 @@ packages: - libstdcxx >=14 license: BSD-3-Clause license_family: BSD + purls: [] size: 424563 timestamp: 1764526740626 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libflac-1.5.0-he9c94f4_1.conda @@ -7719,6 +9590,7 @@ packages: - libstdcxx >=14 license: BSD-3-Clause license_family: BSD + purls: [] size: 397272 timestamp: 1764526699497 - conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.14.2-ha770c72_0.conda @@ -7729,6 +9601,15 @@ packages: license: GPL-2.0-only OR FTL size: 8035 timestamp: 1772757210108 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.14.3-ha770c72_0.conda + sha256: 38f014a7129e644636e46064ecd6b1945e729c2140e21d75bb476af39e692db2 + md5: e289f3d17880e44b633ba911d57a321b + depends: + - libfreetype6 >=2.14.3 + license: GPL-2.0-only OR FTL + purls: [] + size: 8049 + timestamp: 1774298163029 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype-2.14.2-h8af1aa0_0.conda sha256: 23cdb94528bb4328b6f7550906dee5080952354445d8bd96241fa7d059c4af95 md5: 93bce8dee6a0a4906331db294ec250fe @@ -7737,6 +9618,15 @@ packages: license: GPL-2.0-only OR FTL size: 8108 timestamp: 1772756012710 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype-2.14.3-h8af1aa0_1.conda + sha256: db75d0fc080992dc67db8e24d7bb2a2f2a0b25bfce8870fa45a82a4b5f6111a2 + 
md5: a13e600f9d18488b1fd1257344dbfdaa + depends: + - libfreetype6 >=2.14.3 + license: GPL-2.0-only OR FTL + purls: [] + size: 8381 + timestamp: 1780933505754 - conda: https://conda.anaconda.org/conda-forge/win-64/libfreetype-2.14.2-h57928b3_0.conda sha256: 427c3072b311e65bd3eae3fcb78f6847b15b2dbb173a8546424de56550b2abfb md5: 153d52fd0e4ba2a5bd5bb4f4afa41417 @@ -7745,6 +9635,15 @@ packages: license: GPL-2.0-only OR FTL size: 8404 timestamp: 1772756167212 +- conda: https://conda.anaconda.org/conda-forge/win-64/libfreetype-2.14.3-h57928b3_1.conda + sha256: 035d0c67bf9f7a16f4a1764f420c120f1a995d071bb265fcc66ef688ef709d7b + md5: e45b52fb9a81c9e2708465a706e05952 + depends: + - libfreetype6 >=2.14.3 + license: GPL-2.0-only OR FTL + purls: [] + size: 8711 + timestamp: 1780934891782 - conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.14.2-h73754d4_0.conda sha256: aba65b94bdbed52de17ec3d0c6f2ebac2ef77071ad22d6900d1614d0dd702a0c md5: 8eaba3d1a4d7525c6814e861614457fd @@ -7758,6 +9657,20 @@ packages: license: GPL-2.0-only OR FTL size: 386316 timestamp: 1772757193822 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.14.3-h73754d4_0.conda + sha256: 16f020f96da79db1863fcdd8f2b8f4f7d52f177dd4c58601e38e9182e91adf1d + md5: fb16b4b69e3f1dcfe79d80db8fd0c55d + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libpng >=1.6.55,<1.7.0a0 + - libzlib >=1.3.2,<2.0a0 + constrains: + - freetype >=2.14.3 + license: GPL-2.0-only OR FTL + purls: [] + size: 384575 + timestamp: 1774298162622 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype6-2.14.2-hdae7a39_0.conda sha256: a2e9efb033f7519bbc0a54558d7c9bb96252adc22c6e09df2daee7615265fbb1 md5: 69d1cdfdabb66464cbde17890e8be3b9 @@ -7770,6 +9683,19 @@ packages: license: GPL-2.0-only OR FTL size: 423372 timestamp: 1772756012086 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype6-2.14.3-hdae7a39_1.conda + sha256: 
34fe8276befd6c42956c4acd969caeddbdc7ea8e6ed054b8388709b6c3e94ba4 + md5: 426cc33f8745ce11a73baf73db3954a7 + depends: + - libgcc >=14 + - libpng >=1.6.58,<1.7.0a0 + - libzlib >=1.3.2,<2.0a0 + constrains: + - freetype >=2.14.3 + license: GPL-2.0-only OR FTL + purls: [] + size: 424236 + timestamp: 1780933505195 - conda: https://conda.anaconda.org/conda-forge/win-64/libfreetype6-2.14.2-hdbac1cb_0.conda sha256: 1e80e01e5662bd3a0c0e094fbeaec449dbb2288949ca55ca80345e7812904e67 md5: c21a474a38982cdb56b3454cf4f78389 @@ -7784,6 +9710,21 @@ packages: license: GPL-2.0-only OR FTL size: 340155 timestamp: 1772756166648 +- conda: https://conda.anaconda.org/conda-forge/win-64/libfreetype6-2.14.3-hdbac1cb_1.conda + sha256: 0bbd19c9f7c4d0232b31892e6a4d1f82b8d19d1b84d89725f1f491b336447758 + md5: 4e4d54f9f98383d977ba56ef39ebf46d + depends: + - libpng >=1.6.58,<1.7.0a0 + - libzlib >=1.3.2,<2.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + constrains: + - freetype >=2.14.3 + license: GPL-2.0-only OR FTL + purls: [] + size: 340411 + timestamp: 1780934813224 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 md5: 0aa00f03f9e39fb9876085dee11a85d4 @@ -7798,6 +9739,20 @@ packages: purls: [] size: 1041788 timestamp: 1771378212382 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_19.conda + sha256: 8e0a3b5e41272e5678499b5dfc4cddb673f9e935de01eb0767ce857001229f46 + md5: 57736f29cc2b0ec0b6c2952d3f101b6a + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_19 + - libgomp 15.2.0 he0feb66_19 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 1041084 + timestamp: 1778269013026 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_18.conda sha256: 43df385bedc1cab11993c4369e1f3b04b4ca5d0ea16cba6a0e7f18dbc129fcc9 md5: 
552567ea2b61e3a3035759b2fdb3f9a6 @@ -7811,6 +9766,19 @@ packages: purls: [] size: 622900 timestamp: 1771378128706 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_19.conda + sha256: 4592b096e553f67799ae70d4b6167eeda3ec74587d68c7aecbf4e7b1df136681 + md5: f35b3f52d0a2ec4ffe3c89ba135cdb9a + depends: + - _openmp_mutex >=4.5 + constrains: + - libgomp 15.2.0 h8acb6b2_19 + - libgcc-ng ==15.2.0=*_19 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 622462 + timestamp: 1778268755949 - conda: https://conda.anaconda.org/conda-forge/win-64/libgcc-15.2.0-h8ee18e1_18.conda sha256: da2c96563c76b8c601746f03e03ac75d2b4640fa2ee017cb23d6c9fc31f1b2c6 md5: b085746891cca3bd2704a450a7b4b5ce @@ -7843,6 +9811,16 @@ packages: license_family: GPL size: 3085932 timestamp: 1771378098166 +- conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-15.2.0-hcc6f6b0_119.conda + sha256: 38a557eba305468ac1f90ac85e50d8defd76141cb0b8a43b2fc1aca71dd5d5f2 + md5: 683fcb168e1df9a21fa80d5aa2d9330b + depends: + - __unix + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 3095909 + timestamp: 1778268932148 - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-aarch64-14.3.0-h25ba3ff_118.conda sha256: 058fab0156cb13897f7e4a2fc9d63c922d3de09b6429390365f91b62f1dddb0e md5: 3733752e5a7a0737c8c4f1897f2074f9 @@ -7861,6 +9839,16 @@ packages: license_family: GPL size: 2364690 timestamp: 1771378032404 +- conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-aarch64-15.2.0-h55c397f_119.conda + sha256: fe600a63a39281e6994e27fe79360cd6bd8e576c3ce1af32ce8673b011f46c21 + md5: 18ad0f0b94071d91fa962a1bf3983a78 + depends: + - __unix + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 2353893 + timestamp: 1778268665954 - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_win-64-15.2.0-hbb59886_118.conda sha256: 
e43ffa48a88a7d77a0dc0d3ccfa3acc55702e9d964e8564e86927f5a389a6c51 md5: 1e020780767f809769807a442f5d6f6a @@ -7879,6 +9867,16 @@ packages: license_family: GPL size: 27526 timestamp: 1771378224552 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_19.conda + sha256: 9dcf54adfaa5e861123c2da4f2f0451a685464ea7e5a41ad91cf67b31d658d98 + md5: 331ee9b72b9dff570d56b1302c5ab37d + depends: + - libgcc 15.2.0 he0feb66_19 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 27694 + timestamp: 1778269016987 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-15.2.0-he9431aa_18.conda sha256: 83bb0415f59634dccfa8335d4163d1f6db00a27b36666736f9842b650b92cf2f md5: 4feebd0fbf61075a1a9c2e9b3936c257 @@ -7888,6 +9886,16 @@ packages: license_family: GPL size: 27568 timestamp: 1771378136019 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-15.2.0-he9431aa_19.conda + sha256: 1137f93f477f56199ded24117430045a0c02cbe8b10031beac3b9ad2138539d3 + md5: 770cf892e5530f43e63cadc673e85653 + depends: + - libgcc 15.2.0 h8acb6b2_19 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 27738 + timestamp: 1778268759211 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee md5: 9063115da5bc35fdc3e1002e69b9ef6e @@ -7912,6 +9920,18 @@ packages: purls: [] size: 27587 timestamp: 1771378169244 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran-15.2.0-he9431aa_19.conda + sha256: e5ad94be72634233510b33ba792a3339921bd468f0b8bc6961ea05eded251d9b + md5: c7a5b5decf969ead5ecada83654164cf + depends: + - libgfortran5 15.2.0 h1b7bec0_19 + constrains: + - libgfortran-ng ==15.2.0=*_19 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 27728 + timestamp: 1778268784621 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 md5: 646855f357199a12f02a87382d429b75 @@ -7937,6 +9957,18 @@ packages: purls: [] size: 1486341 timestamp: 1771378148102 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran5-15.2.0-h1b7bec0_19.conda + sha256: af8e9bdcaa77f133a8ee4c1ef57ef564d9c45aa262abf9f5ef9b50eb99d96407 + md5: 779dbb494de6d3d6477cab52eb34285a + depends: + - libgcc >=15.2.0 + constrains: + - libgfortran 15.2.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 1487244 + timestamp: 1778268767295 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda sha256: dc2752241fa3d9e40ce552c1942d0a4b5eeb93740c9723873f6fcf8d39ef8d2d md5: 928b8be80851f5d8ffb016f9c81dae7a @@ -7947,6 +9979,17 @@ packages: license: LicenseRef-libglvnd size: 134712 timestamp: 1731330998354 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_3.conda + sha256: ec353b3076ed8e357ed961d0e9ff6997491cade0e603de5bd18a2e301ac78ebd + md5: f25206d7322c0e9648e8b83694d143ab + depends: + - __glibc >=2.17,<3.0.a0 + - libglvnd 1.7.0 ha4b6fd6_3 + - libglx 1.7.0 ha4b6fd6_3 + license: LicenseRef-libglvnd + purls: [] + size: 133469 + timestamp: 1779728207669 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgl-1.7.0-hd24410f_2.conda sha256: 3e954380f16255d1c8ae5da3bd3044d3576a0e1ac2e3c3ff2fe8f2f1ad2e467a md5: 0d00176464ebb25af83d40736a2cd3bb @@ -7956,6 +9999,16 @@ packages: license: LicenseRef-libglvnd size: 145442 timestamp: 1731331005019 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgl-1.7.0-hd24410f_3.conda + sha256: 05c75a2034bdbca29bab467d02ad770ed5e524e4f0670432258f2d8487c95348 + md5: 6e893c36f31502dd195d3d58f455fdbd + depends: + - libglvnd 1.7.0 hd24410f_3 + - libglx 1.7.0 hd24410f_3 + license: LicenseRef-libglvnd + purls: [] + size: 
148112 + timestamp: 1779728248678 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-devel-1.7.0-ha4b6fd6_2.conda sha256: e281356c0975751f478c53e14f3efea6cd1e23c3069406d10708d6c409525260 md5: 53e7cbb2beb03d69a478631e23e340e9 @@ -7966,6 +10019,17 @@ packages: license: LicenseRef-libglvnd size: 113911 timestamp: 1731331012126 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-devel-1.7.0-ha4b6fd6_3.conda + sha256: 41d7d864ad1f199bdb06ff6cc3931455c8af62f1d2071a08c6fa08affbcb678f + md5: 63e43d278ee5084813fe3c2edf4834ce + depends: + - __glibc >=2.17,<3.0.a0 + - libgl 1.7.0 ha4b6fd6_3 + - libglx-devel 1.7.0 ha4b6fd6_3 + license: LicenseRef-libglvnd + purls: [] + size: 115664 + timestamp: 1779728218325 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgl-devel-1.7.0-hd24410f_2.conda sha256: ec5c3125b38295bad8acc80f793b8ee217ccb194338d73858be278db50ea82f1 md5: 5d8323dff6a93596fb6f985cf6e8521a @@ -7975,6 +10039,16 @@ packages: license: LicenseRef-libglvnd size: 113925 timestamp: 1731331014056 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgl-devel-1.7.0-hd24410f_3.conda + sha256: b7483884e5e8df362f113d7d7694f0a37ecf6409f1acaaa889f312688917c067 + md5: 3a0adce33b3b8a52c76389db1edfec1b + depends: + - libgl 1.7.0 hd24410f_3 + - libglx-devel 1.7.0 hd24410f_3 + license: LicenseRef-libglvnd + purls: [] + size: 116084 + timestamp: 1779728257534 - conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.86.4-h6548e54_1.conda sha256: a27e44168a1240b15659888ce0d9b938ed4bdb49e9ea68a7c1ff27bcea8b55ce md5: bb26456332b07f68bf3b7622ed71c0da @@ -7990,6 +10064,22 @@ packages: license: LGPL-2.1-or-later size: 4398701 timestamp: 1771863239578 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.88.1-h0d30a3d_2.conda + sha256: 33eb5d5310a5c2c0a4707a0afa644801c2e08c8f70c45e1f62f354116dfe0970 + md5: 17d484ab9c8179c6a6e5b7dbb5065afc + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libffi >=3.5.2,<3.6.0a0 + - 
pcre2 >=10.47,<10.48.0a0 + - libzlib >=1.3.2,<2.0a0 + - libiconv >=1.18,<2.0a0 + constrains: + - glib >2.66 + license: LGPL-2.1-or-later + purls: [] + size: 4754097 + timestamp: 1778508800134 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglib-2.86.4-hf53f6bf_1.conda sha256: afc503dbd04a5bf2709aa9d8318a03a8c4edb389f661ff280c3494bfef4341ec md5: 4ac4372fc4d7f20630a91314cdac8afd @@ -8004,6 +10094,21 @@ packages: license: LGPL-2.1-or-later size: 4512186 timestamp: 1771863220969 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglib-2.88.1-h96a7f82_2.conda + sha256: 050285afdb7bd98b1b8fb052af9da31fafde586a49d3b56dd33d5338b2d0e411 + md5: 16d72f76bf6fead4a29efb2fede0a06b + depends: + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - libzlib >=1.3.2,<2.0a0 + - pcre2 >=10.47,<10.48.0a0 + - libffi >=3.5.2,<3.6.0a0 + constrains: + - glib >2.66 + license: LGPL-2.1-or-later + purls: [] + size: 4946648 + timestamp: 1778508920982 - conda: https://conda.anaconda.org/conda-forge/win-64/libglib-2.86.4-h0c9aed9_1.conda sha256: f035fb25f8858f201e0055c719ef91022e9465cd51fe803304b781863286fb10 md5: 0329a7e92c8c8b61fcaaf7ad44642a96 @@ -8021,6 +10126,24 @@ packages: license: LGPL-2.1-or-later size: 4095369 timestamp: 1771863229701 +- conda: https://conda.anaconda.org/conda-forge/win-64/libglib-2.88.1-h7ce1215_2.conda + sha256: f61277e224e9889c221bb2eac0f57d5aeeb82fc45d3dc326957d251c97444f7c + md5: 5fb838786a8317ebb38056bbe236d3ff + depends: + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + - ucrt >=10.0.20348.0 + - libiconv >=1.18,<2.0a0 + - libzlib >=1.3.2,<2.0a0 + - pcre2 >=10.47,<10.48.0a0 + - libintl >=0.22.5,<1.0a0 + - libffi >=3.5.2,<3.6.0a0 + constrains: + - glib >2.66 + license: LGPL-2.1-or-later + purls: [] + size: 4522891 + timestamp: 1778508851933 - conda: https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda sha256: 1175f8a7a0c68b7f81962699751bb6574e6f07db4c9f72825f978e3016f46850 md5: 434ca7e50e40f4918ab701e3facd59a0 @@ 
-8029,12 +10152,28 @@ packages: license: LicenseRef-libglvnd size: 132463 timestamp: 1731330968309 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_3.conda + sha256: e019ebe4e3f5cdf23e2f5e58ddf7ade27988c53820115b17b98f218ebcc87748 + md5: eb83f3f8cecc3e9bff9e250817fc69b6 + depends: + - __glibc >=2.17,<3.0.a0 + license: LicenseRef-libglvnd + purls: [] + size: 133586 + timestamp: 1779728183422 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglvnd-1.7.0-hd24410f_2.conda sha256: 57ec3898a923d4bcc064669e90e8abfc4d1d945a13639470ba5f3748bd3090da md5: 9e115653741810778c9a915a2f8439e7 license: LicenseRef-libglvnd size: 152135 timestamp: 1731330986070 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglvnd-1.7.0-hd24410f_3.conda + sha256: ca124e53765a2b123e0ca6ce809c7caf188bb26e5fe125b69099378276d5e66f + md5: a2ad848c0aab2e326c6af08ea20502f4 + license: LicenseRef-libglvnd + purls: [] + size: 146645 + timestamp: 1779728228274 - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda sha256: 2d35a679624a93ce5b3e9dd301fff92343db609b79f0363e6d0ceb3a6478bfa7 md5: c8013e438185f33b13814c5c488acd5c @@ -8045,6 +10184,17 @@ packages: license: LicenseRef-libglvnd size: 75504 timestamp: 1731330988898 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_3.conda + sha256: 2f74713c9ca408ea84e88a30a9028153e7b553e8bb42e06139eac9a753c27da9 + md5: ec3c4350aa0261bf7f87b8ca15c8e80e + depends: + - __glibc >=2.17,<3.0.a0 + - libglvnd 1.7.0 ha4b6fd6_3 + - xorg-libx11 >=1.8.13,<2.0a0 + license: LicenseRef-libglvnd + purls: [] + size: 76586 + timestamp: 1779728199059 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglx-1.7.0-hd24410f_2.conda sha256: 6591af640cb05a399fab47646025f8b1e1a06a0d4bbb4d2e320d6629b47a1c61 md5: 1d4269e233636148696a67e2d30dad2a @@ -8054,6 +10204,16 @@ packages: license: LicenseRef-libglvnd size: 77736 timestamp: 1731330998960 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libglx-1.7.0-hd24410f_3.conda + sha256: 2698b415b9f7b692cd64e34db623e1a6e54ed54e78b0b4e5d4ea6762791e9118 + md5: 338faf34b78d053841098c0528699e34 + depends: + - libglvnd 1.7.0 hd24410f_3 + - xorg-libx11 >=1.8.13,<2.0a0 + license: LicenseRef-libglvnd + purls: [] + size: 76704 + timestamp: 1779728242753 - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-devel-1.7.0-ha4b6fd6_2.conda sha256: 0a930e0148ab6e61089bbcdba25a2e17ee383e7de82e7af10cc5c12c82c580f3 md5: 27ac5ae872a21375d980bd4a6f99edf3 @@ -8065,6 +10225,18 @@ packages: license: LicenseRef-libglvnd size: 26388 timestamp: 1731331003255 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-devel-1.7.0-ha4b6fd6_3.conda + sha256: a17ae2d4cb2de04a20882ae14ec3cc1958e868a4dec81e3d7eca30115ee50e94 + md5: 16b6330783ce0d1ae8d22782173b32c9 + depends: + - __glibc >=2.17,<3.0.a0 + - libglx 1.7.0 ha4b6fd6_3 + - xorg-libx11 >=1.8.13,<2.0a0 + - xorg-xorgproto + license: LicenseRef-libglvnd + purls: [] + size: 27363 + timestamp: 1779728211402 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglx-devel-1.7.0-hd24410f_2.conda sha256: 4bc28ecc38f30ca1ac66a8fb6c5703f4d888381ec46d3938b7c3383210061ec5 md5: 1f9ddbb175a63401662d1c6222cef6ff @@ -8075,6 +10247,17 @@ packages: license: LicenseRef-libglvnd size: 26362 timestamp: 1731331008489 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libglx-devel-1.7.0-hd24410f_3.conda + sha256: b30433c4f56bec0a7d9d288e0a456ed280183e32f3f4880ada2189fc12804a52 + md5: 3da9719866b95bddcad86c8aec6a8ba2 + depends: + - libglx 1.7.0 hd24410f_3 + - xorg-libx11 >=1.8.13,<2.0a0 + - xorg-xorgproto + license: LicenseRef-libglvnd + purls: [] + size: 27651 + timestamp: 1779728252006 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 md5: 239c5e9546c38a1e884d69effcf4c882 @@ -8085,6 +10268,16 @@ 
packages: purls: [] size: 603262 timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_19.conda + sha256: 5abe4ab9d93f6c9757d654f1969ae2267d4505315c1f2f8fe705fd60af084f1b + md5: faac990cb7aedc7f3a2224f2c9b0c26c + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 603817 + timestamp: 1778268942614 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_18.conda sha256: fc716f11a6a8525e27a5d332ef6a689210b0d2a4dd1133edc0f530659aa9faa6 md5: 4faa39bf919939602e594253bd673958 @@ -8093,6 +10286,14 @@ packages: purls: [] size: 588060 timestamp: 1771378040807 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_19.conda + sha256: 2370ef0ffcbae5bede3c4bf136add4abc257245eb91f724c99bb4a43116c5a83 + md5: c5e8a379c4a2ec2aea4ba22758c001d9 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 587387 + timestamp: 1778268674393 - conda: https://conda.anaconda.org/conda-forge/win-64/libgomp-15.2.0-h8ee18e1_18.conda sha256: 94981bc2e42374c737750895c6fdcfc43b7126c4fc788cad0ecc7281745931da md5: 939fb173e2a4d4e980ef689e99b35223 @@ -8117,6 +10318,20 @@ packages: license_family: BSD size: 2449916 timestamp: 1765103845133 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.13.0-default_he001693_1000.conda + sha256: 5041d295813dfb84652557839825880aae296222ab725972285c5abe3b6e4288 + md5: c197985b58bc813d26b42881f0021c82 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - libxml2 + - libxml2-16 >=2.14.6 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 2436378 + timestamp: 1770953868164 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libhwloc-2.12.2-default_ha470c98_1000.conda sha256: e87cf64d87c7706403507df7329f5b597c3b487f4c72ef53ef899e38983ea70e md5: c8b05c85ae962a993d9b7d6c9d10571e @@ -8129,6 
+10344,19 @@ packages: license_family: BSD size: 2467105 timestamp: 1765103804193 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libhwloc-2.13.0-default_ha95e27d_1000.conda + sha256: 88888d99e81c93e7331f2eb0fec08b3c4a47a1bfa1c88b3e641f6568569b6261 + md5: 974183f6420938051e2f3208922d057f + depends: + - libgcc >=14 + - libstdcxx >=14 + - libxml2 + - libxml2-16 >=2.14.6 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 2453519 + timestamp: 1770953713701 - conda: https://conda.anaconda.org/conda-forge/win-64/libhwloc-2.12.2-default_h4379cf1_1000.conda sha256: 8cdf11333a81085468d9aa536ebb155abd74adc293576f6013fc0c85a7a90da3 md5: 3b576f6860f838f950c570f4433b086e @@ -8144,6 +10372,21 @@ packages: purls: [] size: 2411241 timestamp: 1765104337762 +- conda: https://conda.anaconda.org/conda-forge/win-64/libhwloc-2.13.0-default_h049141e_1000.conda + sha256: 2ee12e37223dfcd0acd050c80a91150c482b6e2899198521e1800dce66662467 + md5: 6a01c986e30292c715038d2788aa1385 + depends: + - libwinpthread >=12.0.0.r4.gg4f2fc60ca + - libxml2 + - libxml2-16 >=2.14.6 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 2396128 + timestamp: 1770954127918 - conda: https://conda.anaconda.org/conda-forge/linux-64/libhwy-1.3.0-h4c17acf_1.conda sha256: 2bdd1cdd677b119abc5e83069bec2e28fe6bfb21ebaea3cd07acee67f38ea274 md5: c2a0c1d0120520e979685034e0b79859 @@ -8154,6 +10397,17 @@ packages: license: Apache-2.0 OR BSD-3-Clause size: 1448617 timestamp: 1758894401402 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libhwy-1.4.0-h10be129_0.conda + sha256: 8b70955d5e9a49d08945d4f8e2eab855b2efa5fce9cb9bc5e75d86764e6f2f38 + md5: 3a9428b74c403c71048104d38437b48c + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: Apache-2.0 OR BSD-3-Clause + purls: [] + size: 1435782 + timestamp: 1776989559668 - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libhwy-1.3.0-h81d0cf9_1.conda sha256: a6a441692b27606f8ef64ee9e6a0c72c615c2e25b01c282ee080ee8f97861943 md5: d5b93534e24e7c15792b3f336c52af07 @@ -8163,6 +10417,16 @@ packages: license: Apache-2.0 OR BSD-3-Clause size: 1180000 timestamp: 1758894754411 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libhwy-1.4.0-h0626a34_0.conda + sha256: cff38f9a1df7bc3e5ac7856bc2e5c879151b54c07b55722ec0da1af49d869721 + md5: 61b4e7ef4624c692a3ebd07100795303 + depends: + - libgcc >=14 + - libstdcxx >=14 + license: Apache-2.0 OR BSD-3-Clause + purls: [] + size: 945401 + timestamp: 1776989517303 - conda: https://conda.anaconda.org/conda-forge/win-64/libhwy-1.3.0-ha71e874_1.conda sha256: c722a04f065656b988a46dee87303ff0bf037179c50e2e76704b693def7f9a96 md5: f4649d4b6bf40d616eda57d6255d2333 @@ -8173,6 +10437,17 @@ packages: license: Apache-2.0 OR BSD-3-Clause size: 536186 timestamp: 1758894243956 +- conda: https://conda.anaconda.org/conda-forge/win-64/libhwy-1.4.0-h172a326_0.conda + sha256: 4b45bf59ee46d3c746272c27651da9ce709fda4eee8536c7424acea60d0e2ad0 + md5: aeca1cb6665f19e560c1fbd20b5bcf34 + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: Apache-2.0 OR BSD-3-Clause + purls: [] + size: 562583 + timestamp: 1776989522919 - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda sha256: c467851a7312765447155e071752d7bf9bf44d610a5687e32706f480aad2833f md5: 915f5995e94f60e9a4826e0b0920ee88 @@ -8180,6 +10455,7 @@ packages: - __glibc >=2.17,<3.0.a0 - libgcc >=14 license: LGPL-2.1-only + purls: [] size: 790176 timestamp: 1754908768807 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libiconv-1.18-h90929bb_2.conda @@ -8188,6 +10464,7 @@ packages: depends: - libgcc >=14 license: LGPL-2.1-only + purls: [] size: 791226 timestamp: 1754910975665 - conda: https://conda.anaconda.org/conda-forge/win-64/libiconv-1.18-hc1393d2_2.conda @@ -8207,6 
+10484,7 @@ packages: depends: - libiconv >=1.17,<2.0a0 license: LGPL-2.1-or-later + purls: [] size: 95568 timestamp: 1723629479451 - conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.2-hb03c661_0.conda @@ -8220,6 +10498,18 @@ packages: license: IJG AND BSD-3-Clause AND Zlib size: 633710 timestamp: 1762094827865 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.4.1-hb03c661_0.conda + sha256: 10056646c28115b174de81a44e23e3a0a3b95b5347d2e6c45cc6d49d35294256 + md5: 6178c6f2fb254558238ef4e6c56fb782 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - jpeg <0.0.0a + license: IJG AND BSD-3-Clause AND Zlib + purls: [] + size: 633831 + timestamp: 1775962768273 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjpeg-turbo-3.1.2-he30d5cf_0.conda sha256: 84064c7c53a64291a585d7215fe95ec42df74203a5bf7615d33d49a3b0f08bb6 md5: 5109d7f837a3dfdf5c60f60e311b041f @@ -8230,6 +10520,17 @@ packages: license: IJG AND BSD-3-Clause AND Zlib size: 691818 timestamp: 1762094728337 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjpeg-turbo-3.1.4.1-he30d5cf_0.conda + sha256: e97ec2af5f09f8f6ea8ecd550055c95ae80fae22015fcfadaa94eafe025c9ccc + md5: a85ba48648f6868016f2741fd9170250 + depends: + - libgcc >=14 + constrains: + - jpeg <0.0.0a + license: IJG AND BSD-3-Clause AND Zlib + purls: [] + size: 693143 + timestamp: 1775962625956 - conda: https://conda.anaconda.org/conda-forge/win-64/libjpeg-turbo-3.1.2-hfd05255_0.conda sha256: 795e2d4feb2f7fc4a2c6e921871575feb32b8082b5760726791f080d1e2c2597 md5: 56a686f92ac0273c0f6af58858a3f013 @@ -8242,6 +10543,34 @@ packages: license: IJG AND BSD-3-Clause AND Zlib size: 841783 timestamp: 1762094814336 +- conda: https://conda.anaconda.org/conda-forge/win-64/libjpeg-turbo-3.1.4.1-hfd05255_0.conda + sha256: 698d57b5b90120270eaa401298319fcb25ea186ae95b340c2f4813ed9171083d + md5: 25a127bad5470852b30b239f030ec95b + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + 
- vc14_runtime >=14.44.35208 + constrains: + - jpeg <0.0.0a + license: IJG AND BSD-3-Clause AND Zlib + purls: [] + size: 842806 + timestamp: 1775962811457 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libjxl-0.11.2-h174a0a3_1.conda + sha256: 0c8a78c6a42a6e4c6de3a5e82d692f60400d43f4cc80591745f28b37daad9c70 + md5: 850f48943d6b4589800a303f0de6a816 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - libhwy >=1.4.0,<1.5.0a0 + - libbrotlienc >=1.2.0,<1.3.0a0 + - libbrotlidec >=1.2.0,<1.3.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 1846962 + timestamp: 1777065125966 - conda: https://conda.anaconda.org/conda-forge/linux-64/libjxl-0.11.2-ha09017c_0.conda sha256: 0c2399cef02953b719afe6591223fb11d287d5a108ef8bb9a02dd509a0f738d7 md5: 1df8c1b1d6665642107883685db6cf37 @@ -8269,6 +10598,35 @@ packages: license_family: BSD size: 1489440 timestamp: 1770801995062 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjxl-0.11.2-hbae46ee_1.conda + sha256: 237bbfa18c4f245a24000c12924d61a9e54f7e6f689f405c0dc8e188a40de890 + md5: 532faebf82c7d2c10539518347cff460 + depends: + - libgcc >=14 + - libstdcxx >=14 + - libbrotlienc >=1.2.0,<1.3.0a0 + - libbrotlidec >=1.2.0,<1.3.0a0 + - libhwy >=1.4.0,<1.5.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 1489188 + timestamp: 1777065125935 +- conda: https://conda.anaconda.org/conda-forge/win-64/libjxl-0.11.2-h932607e_1.conda + sha256: 4715e22c602526c85da09f73865676add67e0995a944b821fbff84547a9db533 + md5: 327bce3eb1ef1875c7145e915d25bcd3 + depends: + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + - ucrt >=10.0.20348.0 + - libhwy >=1.4.0,<1.5.0a0 + - libbrotlienc >=1.2.0,<1.3.0a0 + - libbrotlidec >=1.2.0,<1.3.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 1194926 + timestamp: 1777065171989 - conda: https://conda.anaconda.org/conda-forge/win-64/libjxl-0.11.2-hf3f85d1_0.conda sha256: 
525c5382eb32a43e7baf45b452079bf23daf8f8bf19fee7c8dafa8c731ada8bd md5: 869e71fcf2135212c51a96f7f7dbd00d @@ -8328,6 +10686,23 @@ packages: purls: [] size: 18624 timestamp: 1774503065378 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-8_h5e43f62_mkl.conda + build_number: 8 + sha256: 0cb26d433dfa15a392eaeeb8a96ac468f4d007d7e7e37ef7bf46856aaf9a9785 + md5: 370e81464714060008e60ee53825bb3e + depends: + - libblas 3.11.0 8_h5875eb1_mkl + constrains: + - blas 2.308 mkl + - libcblas 3.11.0 8*_mkl + - liblapacke 3.11.0 8*_mkl + track_features: + - blas_mkl + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 18921 + timestamp: 1779859092867 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.11.0-5_h88aeb00_openblas.conda build_number: 5 sha256: 692222d186d3ffbc99eaf04b5b20181fd26aee1edec1106435a0a755c57cce86 @@ -8357,6 +10732,21 @@ packages: purls: [] size: 18702 timestamp: 1774503068721 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.11.0-8_h88aeb00_openblas.conda + build_number: 8 + sha256: d269a684afa0b2fdb44d6b60167f854f30410cdb5ee49a7275c026f6b10c8d05 + md5: 3af3f2aa755abc5e91351114ae214f55 + depends: + - libblas 3.11.0 8_haddc8a3_openblas + constrains: + - libcblas 3.11.0 8*_openblas + - liblapacke 3.11.0 8*_openblas + - blas 2.308 openblas + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 18828 + timestamp: 1779859055749 - conda: https://conda.anaconda.org/conda-forge/win-64/liblapack-3.11.0-5_hf9ab0e9_mkl.conda build_number: 5 sha256: a2d33f5cc2b8a9042f2af6981c6733ab1a661463823eaa56595a9c58c0ab77e1 @@ -8386,6 +10776,21 @@ packages: purls: [] size: 80571 timestamp: 1774503757128 +- conda: https://conda.anaconda.org/conda-forge/win-64/liblapack-3.11.0-8_hf9ab0e9_mkl.conda + build_number: 8 + sha256: 44999ed04bc0a56de44ee0ac8bd5b3702efd411a8b29491c0e3d3deb8619c94e + md5: d584799b920ecae9b75a2b70743a3de7 + depends: + - libblas 3.11.0 8_h8455456_mkl + constrains: + - 
libcblas 3.11.0 8*_mkl + - liblapacke 3.11.0 8*_mkl + - blas 2.308 mkl + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 81027 + timestamp: 1779859714698 - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda sha256: 755c55ebab181d678c12e49cced893598f2bab22d582fbbf4d8b83c18be207eb md5: c7c83eecbb72d88b940c249af56c8b17 @@ -8398,6 +10803,18 @@ packages: purls: [] size: 113207 timestamp: 1768752626120 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + sha256: ec30e52a3c1bf7d0425380a189d209a52baa03f22fb66dd3eb587acaa765bd6d + md5: b88d90cad08e6bc8ad540cb310a761fb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - xz 5.8.3.* + license: 0BSD + purls: [] + size: 113478 + timestamp: 1775825492909 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.2-he30d5cf_0.conda sha256: 843c46e20519651a3e357a8928352b16c5b94f4cd3d5481acc48be2e93e8f6a3 md5: 96944e3c92386a12755b94619bae0b35 @@ -8409,6 +10826,17 @@ packages: purls: [] size: 125916 timestamp: 1768754941722 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.3-he30d5cf_0.conda + sha256: d61962b9cd54c3554361550203c64d5b65b71e3058a285b66e4b04b9769f0a5c + md5: 76298a9e6d71ee6e832a8d0d7373b261 + depends: + - libgcc >=14 + constrains: + - xz 5.8.3.* + license: 0BSD + purls: [] + size: 126102 + timestamp: 1775828008518 - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.2-hfd05255_0.conda sha256: f25bf293f550c8ed2e0c7145eb404324611cfccff37660869d97abf526eb957c md5: ba0bfd4c3cf73f299ffe46ff0eaeb8e3 @@ -8422,6 +10850,38 @@ packages: purls: [] size: 106169 timestamp: 1768752763559 +- conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.3-hfd05255_0.conda + sha256: d636d1a25234063642f9c531a7bb58d84c1c496411280a36ea000bd122f078f1 + md5: 8f83619ab1588b98dd99c90b0bfc5c6d + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + 
constrains: + - xz 5.8.3.* + license: 0BSD + purls: [] + size: 106486 + timestamp: 1775825663227 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmagma-2.10.0-hd93470c_0.conda + sha256: 07607cffe1f53a5e405e29c897bfcad800f3c71b2a57ed7502a0ef82a60edc78 + md5: d9e0e4dbf5aff16fb804e6a656fc73bc + depends: + - __glibc >=2.28,<3.0.a0 + - _openmp_mutex >=4.5 + - cuda-cudart + - cuda-version >=13,<14.0a0 + - libblas >=3.9.0,<4.0a0 + - libcublas + - libcusparse + - libgcc >=14 + - liblapack >=3.9.0,<4.0a0 + - libstdcxx >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 272468878 + timestamp: 1773078724253 - conda: https://conda.anaconda.org/conda-forge/linux-64/libmagma-2.9.0-hd93470c_6.conda sha256: 5ea4675cb4a900795a5eb33519307cf985fd3787eb0cf33142e52ecc8eb8a7d4 md5: 886e83a08e0ad01d7fe868972bc729f3 @@ -8440,6 +10900,26 @@ packages: license_family: BSD size: 387811432 timestamp: 1767135866822 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmagma-2.10.0-he3ecef4_0.conda + sha256: 72b40c75a7bc547b856954f12ba32cd815b26ace064564dc2569e44967cf87fc + md5: 1392ce38ffe86b9612c27cdfef98404d + depends: + - __glibc >=2.28,<3.0.a0 + - _openmp_mutex >=4.5 + - arm-variant * sbsa + - cuda-cudart + - cuda-version >=13,<14.0a0 + - libblas >=3.9.0,<4.0a0 + - libcublas + - libcusparse + - libgcc >=14 + - liblapack >=3.9.0,<4.0a0 + - libstdcxx >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 324720276 + timestamp: 1773081195270 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmagma-2.9.0-he3ecef4_6.conda sha256: 1511c96dcab0968a344d16a5bbb6791aeefc344e2ef4740a1137cfb62f95ebc6 md5: c6eec8ae18b32f1e444353dd526fb040 @@ -8683,6 +11163,7 @@ packages: - __glibc >=2.17,<3.0.a0 license: BSD-3-Clause license_family: BSD + purls: [] size: 218500 timestamp: 1745825989535 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libogg-1.3.5-h86ecc28_1.conda @@ -8692,6 +11173,7 @@ packages: - libgcc >=13 license: 
BSD-3-Clause license_family: BSD + purls: [] size: 220653 timestamp: 1745826021156 - conda: https://conda.anaconda.org/conda-forge/win-64/libogg-1.3.5-h2466b09_1.conda @@ -8706,6 +11188,7 @@ packages: - ucrt >=10.0.20348.0 license: BSD-3-Clause license_family: BSD + purls: [] size: 35040 timestamp: 1745826086628 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda @@ -8782,6 +11265,25 @@ packages: purls: [] size: 5122134 timestamp: 1774471612323 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.33-openmp_h1a8b088_0.conda + sha256: 723180af43679bcccd39ccea91a2c390834601a2ee338522c6b1d47b45d9db8d + md5: d413a7a3af9493de3be90a778e33a9f8 + depends: + - _openmp_mutex * *_llvm + - _openmp_mutex >=4.5 + - libgcc >=14 + - libgfortran + - libgfortran5 >=14.3.0 + - llvm-openmp >=22.1.4 + constrains: + - openblas >=0.3.33,<0.3.34.0a0 + track_features: + - openblas_threading_openmp + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 5136501 + timestamp: 1776993280434 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-2026.0.0-hb56ce9e_1.conda sha256: a396a2d1aa267f21c98717ac097138b32e41e4c40ae501729bded3801476eeb5 md5: 9f0596e995efe372c470ff45c93131cb @@ -8795,6 +11297,20 @@ packages: license_family: APACHE size: 6582302 timestamp: 1772727204779 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-2026.2.1-h1f0fae8_1.conda + sha256: 7941fb9ba8c3a5a0a2401dc4120e8fcb561b96d928c43374eb93f545019a2858 + md5: ea41753f926f73966629d81fdf20ec6f + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 6823841 + timestamp: 1782219077259 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-2026.0.0-h1915271_1.conda sha256: 6f8558cc4ee4d490db88640e71d3f79fa7552701d91c09ad6f1371dadb9bd3f1 md5: 
c8ff442d02723939711a726d9ff71eac @@ -8807,6 +11323,19 @@ packages: license_family: APACHE size: 5742222 timestamp: 1772721263739 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-2026.2.1-h9a8427e_1.conda + sha256: b1b0553cfa613bffa8bed04b3a880d651cc08ffe565e07362c3071597069c4d4 + md5: e18fa81e8db0a20a8d5385902c12ce7c + depends: + - libgcc >=14 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 5984918 + timestamp: 1782213524008 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-arm-cpu-plugin-2026.0.0-h1915271_1.conda sha256: 8fff4375f324bdf8a3fe20c489710b692340007b7af2da1d14f6832990c24891 md5: ef26404d824453138bf0a12a8bb033df @@ -8820,6 +11349,20 @@ packages: license_family: APACHE size: 10237615 timestamp: 1772721303162 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-arm-cpu-plugin-2026.2.1-h9a8427e_1.conda + sha256: 72d90363b68266a9dc704008032e1f9acb91e080212e89dc189ea8135b199781 + md5: 2641ed1135e1316f5dd6f9512bcd6845 + depends: + - libgcc >=14 + - libopenvino 2026.2.1 h9a8427e_1 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 10313623 + timestamp: 1782213543505 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-auto-batch-plugin-2026.0.0-hd85de46_1.conda sha256: 286de85805dc69ce0bd25367ae2a20c8096ddef35eb2483474eb246dacd5387e md5: ee41df976413676f794af2785b291b0c @@ -8833,6 +11376,20 @@ packages: license_family: APACHE size: 114431 timestamp: 1772727230331 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-auto-batch-plugin-2026.2.1-h7e124b3_1.conda + sha256: 3ac14d36fa890840ae8474b8a9f0a094b8542fd8fbc409faf3d465c68f20aff0 + md5: 5698a64698e14e8a2e9e16f8f0de0e2e + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libstdcxx >=14 + - tbb 
>=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 114628 + timestamp: 1782219097820 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-auto-batch-plugin-2026.0.0-h3d5001d_1.conda sha256: da7926f66318e539c9f20c2f5f3719a5ba663c6b9d5471e5223d290450219748 md5: 5e984d6405a8f8529d7429f28a7f285e @@ -8845,6 +11402,19 @@ packages: license_family: APACHE size: 111064 timestamp: 1772721336786 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-auto-batch-plugin-2026.2.1-he6b9e7b_1.conda + sha256: 523247ab847e3e71ae213311491d5cc0bbb26f2c39ad4c71df9179c1b469e39f + md5: 55060a39fb1c4f7b4c6f3c514879790b + depends: + - libgcc >=14 + - libopenvino 2026.2.1 h9a8427e_1 + - libstdcxx >=14 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 111073 + timestamp: 1782213572597 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-auto-plugin-2026.0.0-hd85de46_1.conda sha256: 9988ed6339a5eb044ae8d079e2b22f5a310c41e49a0cf716057f30b21ef9cec2 md5: ca025fa5c42ba94453636a2ae333de6b @@ -8858,6 +11428,20 @@ packages: license_family: APACHE size: 249056 timestamp: 1772727247597 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-auto-plugin-2026.2.1-h7e124b3_1.conda + sha256: 499a472fc7b598ad3753b8f2afe60eb5a277d48eca9362e8aca094b2862587a7 + md5: 2ce088ef09292930d4cb3262ce7e144d + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libstdcxx >=14 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 250912 + timestamp: 1782219111223 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-auto-plugin-2026.0.0-h3d5001d_1.conda sha256: 20f1958e160c64f3d207f1dbdb6960cc5642070a472bebffc0d587b2f6429033 md5: 573b3f5ec3963e0153501a2676660ee4 @@ -8870,19 +11454,46 @@ packages: license_family: APACHE size: 236010 timestamp: 1772721351244 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-auto-plugin-2026.2.1-he6b9e7b_1.conda + sha256: 803cf60b354ba780fffe509d013e2603bee30e2a3496cbb772105cdde60f99b5 + md5: ea0cea8d43d94d3118533f2f89e71cd4 + depends: + - libgcc >=14 + - libopenvino 2026.2.1 h9a8427e_1 + - libstdcxx >=14 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 238031 + timestamp: 1782213582645 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-hetero-plugin-2026.0.0-hd41364c_1.conda sha256: c7db498aeda5b0f36b347f4211b93b66ba108faaf54157a08bae8fa3c3af5f81 md5: 07a23e96db38f63d9763f666b2db66aa depends: - __glibc >=2.17,<3.0.a0 - libgcc >=14 - - libopenvino 2026.0.0 hb56ce9e_1 + - libopenvino 2026.0.0 hb56ce9e_1 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + license: Apache-2.0 + license_family: APACHE + size: 211582 + timestamp: 1772727264950 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-hetero-plugin-2026.2.1-hd41364c_1.conda + sha256: bec24379598a4405de171ad151945e79743c6bd049aceabf190b753c3f7a11da + md5: 02e71250f7ca786c4b183d0a39ef63ab + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 - libstdcxx >=14 - pugixml >=1.15,<1.16.0a0 license: Apache-2.0 license_family: APACHE - size: 211582 - timestamp: 1772727264950 + purls: [] + size: 215488 + timestamp: 1782219123433 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-hetero-plugin-2026.0.0-he07c6df_1.conda sha256: 3778ea3887c9a9300761e3f39ce86976746a35aa1392a4b76e4e4d3ce9e095b4 md5: 74bd299545a1fe23439bf6e071ed9710 @@ -8895,6 +11506,19 @@ packages: license_family: APACHE size: 202574 timestamp: 1772721365749 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-hetero-plugin-2026.2.1-he07c6df_1.conda + sha256: 5d216b218170b78e5513d6cef1b76aeebb4b517579a8d98af06a2cf4b2de7050 + md5: ba98371ee8a8122294c44381b4a128d9 + depends: + - libgcc >=14 + - libopenvino 2026.2.1 
h9a8427e_1 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 207050 + timestamp: 1782213592772 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-cpu-plugin-2026.0.0-hb56ce9e_1.conda sha256: 01a28c0bd1f205b3800e7759e30bc8e8a75836e0d5a73a745b4da42837bbb174 md5: b43b96578573ddbcc8d084ae6e44c964 @@ -8909,6 +11533,21 @@ packages: license_family: APACHE size: 13173323 timestamp: 1772727282718 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-cpu-plugin-2026.2.1-h1f0fae8_1.conda + sha256: eecc040a7838752a2dff9b4435a4c59bbc67b83e0c880457935b968206cb20b5 + md5: 7288f979a74cfe3fd4b32d8a0dc7baa4 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 13637410 + timestamp: 1782219135415 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-gpu-plugin-2026.0.0-hb56ce9e_1.conda sha256: 720b87e1d5f1a10c577e040d4bf425072a978e925c6dfab8b1551bc848007c94 md5: 26e8e92c90d1a22af6eac8e9507d9b8f @@ -8924,6 +11563,22 @@ packages: license_family: APACHE size: 11402462 timestamp: 1772727323957 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-gpu-plugin-2026.2.1-h1f0fae8_1.conda + sha256: a47442ce578b022e19a306f963536a108cc79385f4e09d57a14a849b6a864604 + md5: c0a258b12f0c18c476b8344dbd6db8d5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libstdcxx >=14 + - ocl-icd >=2.3.4,<3.0a0 + - pugixml >=1.15,<1.16.0a0 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 12367381 + timestamp: 1782219178219 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-npu-plugin-2026.0.0-hb56ce9e_1.conda sha256: df7eb2b23a1af38f2cd2281353309f2e2a04da1374ecedc7c6745c2a67ba617c md5: 
01ba8b179ac45b2b37fe2d4225dddcc7 @@ -8939,6 +11594,22 @@ packages: license_family: APACHE size: 1994640 timestamp: 1772727360780 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-intel-npu-plugin-2026.2.1-h1f0fae8_1.conda + sha256: 45a91feb68ccce90ad0fa86520572233ca20be56deae0c920f86133d020ad1e8 + md5: c214b149e108e92672e0ee097ebe16f7 + depends: + - __glibc >=2.17,<3.0.a0 + - level-zero >=1.29.0,<2.0a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + - tbb >=2023.0.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 2630818 + timestamp: 1782219217519 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-ir-frontend-2026.0.0-hd41364c_1.conda sha256: 8e7356b0b80b3f180615e264694d6811d388b210155d419553ff64e42f78ffa0 md5: aa002c4d343b01cdcc458c95cd071d1b @@ -8952,6 +11623,20 @@ packages: license_family: APACHE size: 192778 timestamp: 1772727380069 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-ir-frontend-2026.2.1-hd41364c_1.conda + sha256: ebeba9a3ac9505ee69b556865b7d1b9fbbad01ca1ebe6a4249ff62c3dc677b47 + md5: 2d946aebcf06e9ba438880987050e975 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 201061 + timestamp: 1782219232657 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-ir-frontend-2026.0.0-he07c6df_1.conda sha256: 5d191b9d29fb2bbaca95bcd7325fbc3329c1049eccda4b84cfd79c64d4b6dc83 md5: 0946447f9717222c95c24f958d73dba9 @@ -8964,6 +11649,19 @@ packages: license_family: APACHE size: 185648 timestamp: 1772721380070 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-ir-frontend-2026.2.1-he07c6df_1.conda + sha256: 20fe9837f7ef18faca2f6fa0c722fcfd3cf363444b2d0835c7e3a1c8a6a4bbff + md5: 6141a1a2540c990df35ee4c35d6ee4d1 + depends: + - libgcc >=14 + - 
libopenvino 2026.2.1 h9a8427e_1 + - libstdcxx >=14 + - pugixml >=1.15,<1.16.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 196276 + timestamp: 1782213602819 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-onnx-frontend-2026.0.0-h7a07914_1.conda sha256: 35a68214201e807bd9a31f94e618cb6a5385198e89eef46dde6c122cff77da58 md5: 218084544c2e7e78e4b8877ec37b8cdb @@ -8979,6 +11677,22 @@ packages: license_family: APACHE size: 1860687 timestamp: 1772727397981 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-onnx-frontend-2026.2.1-h607c73d_1.conda + sha256: 7b105c0102356352d6d9518a112ff6343dab6b8f32c837809117cd26cbf006df + md5: 3bd3599825189418ea14b2c9da3a6d87 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 1944558 + timestamp: 1782219246849 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-onnx-frontend-2026.0.0-h558496d_1.conda sha256: 9496ef9b24c3dcf3dda58a11360095fdd427d828d33705a1d9b90a4f1a5783c3 md5: 55e11d3e2f930299df66be96928e432d @@ -8993,6 +11707,21 @@ packages: license_family: APACHE size: 1665115 timestamp: 1772721394860 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-onnx-frontend-2026.2.1-h6fc7987_1.conda + sha256: f4f0ffdc53e266cd66e019525fdcabb5544bd89b043b12cf73d37e7128dd32ec + md5: 284180cfb8d20ce6218531fdff158315 + depends: + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libgcc >=14 + - libopenvino 2026.2.1 h9a8427e_1 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 1770837 + timestamp: 1782213614201 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-paddle-frontend-2026.0.0-h7a07914_1.conda sha256: 
cb37b717480207a66443a93d4342cf88210a74c0820fc0edd70e4fc791a64779 md5: 74915e5e271ef76a89f711eff5959a75 @@ -9008,6 +11737,22 @@ packages: license_family: APACHE size: 684224 timestamp: 1772727417276 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-paddle-frontend-2026.2.1-h607c73d_1.conda + sha256: af45c03d41ebe0b48c28b68be31ee919cb801ac5077164808a66db515ad6a316 + md5: 91e198085bff9d8fa02d4d947f026ba8 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 690240 + timestamp: 1782219261154 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-paddle-frontend-2026.0.0-h558496d_1.conda sha256: 9e04b6c6b370e46bee7306afc9bc76e725042e981102f4c7b6b697b061c7324a md5: d26f5d445e0545ce674b11f496dba1a0 @@ -9022,6 +11767,21 @@ packages: license_family: APACHE size: 631754 timestamp: 1772721411589 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-paddle-frontend-2026.2.1-h6fc7987_1.conda + sha256: 5ab86f5cad02ff69d87776ddf9bad2c45c84f38579a8cf16464c03436d4c2362 + md5: b45b927c4d1b1128badee6e44146fd62 + depends: + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libgcc >=14 + - libopenvino 2026.2.1 h9a8427e_1 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 645700 + timestamp: 1782213626352 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-pytorch-frontend-2026.0.0-hecca717_1.conda sha256: 086469e5cd8bfde48975fe8641a7d6924e3da00d75dd06c99e03a78df03a0568 md5: 559ef86008749861a53025f669004f18 @@ -9034,6 +11794,19 @@ packages: license_family: APACHE size: 1185558 timestamp: 1772727435039 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libopenvino-pytorch-frontend-2026.2.1-hecca717_1.conda + sha256: e6353874a36143ffb7db7ec2c3767fd5e3434a8eeff41a569bc46e68259f668f + md5: 152d6694f1d05b53319b8376cdd811e4 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 1226625 + timestamp: 1782219274006 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-pytorch-frontend-2026.0.0-hfae3067_1.conda sha256: e62d016274d9aeae8033a37cd742162637ca37cd10a5d436934c2709c58240f2 md5: 0fd361e9e722e741146d818284feca74 @@ -9045,6 +11818,18 @@ packages: license_family: APACHE size: 1091266 timestamp: 1772721428223 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-pytorch-frontend-2026.2.1-hfae3067_1.conda + sha256: 6a45d6ec341451dacbb7fa379de9a4d13e613fba6d15f7597c5e8dd529c73382 + md5: a82793a12b99b1d04ce77ebe0fd62e51 + depends: + - libgcc >=14 + - libopenvino 2026.2.1 h9a8427e_1 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 1129309 + timestamp: 1782213636874 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-tensorflow-frontend-2026.0.0-h78e8023_1.conda sha256: 3a9a404bc9fd39e7395d49f4bd8facb58a01a31aeceabe8723a9d4f8eb5cc381 md5: fb20f4234bc0e29af1baa13d35e36785 @@ -9061,6 +11846,23 @@ packages: license_family: APACHE size: 1257870 timestamp: 1772727453738 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-tensorflow-frontend-2026.2.1-h21c0c73_1.conda + sha256: cffe112815b8eb57528fdfdf8b39f6a0915884291147dab5bc2066d2bf123031 + md5: 89d2455ec2f065786856b0cd2ac1c0c6 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + - snappy >=1.2.2,<1.3.0a0 + license: Apache-2.0 + 
license_family: APACHE + purls: [] + size: 1284650 + timestamp: 1782219287644 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-tensorflow-frontend-2026.0.0-h2cb6e3c_1.conda sha256: f4ecfddd9583fa475e2e637ac9226b6ae20482abda53bf4339a29407e6c05cb3 md5: f2c28f19267bfcdf9ec9ed4406a89d0b @@ -9076,6 +11878,22 @@ packages: license_family: APACHE size: 1184078 timestamp: 1772721443833 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-tensorflow-frontend-2026.2.1-h9dfe790_1.conda + sha256: cef784d6f72292993c06acc368c157f2093a48acf8556cdbdb6ecedf268e5b42 + md5: 535057ced1e60c5038e9adc80a0b2103 + depends: + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libgcc >=14 + - libopenvino 2026.2.1 h9a8427e_1 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + - snappy >=1.2.2,<1.3.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 1215232 + timestamp: 1782213648342 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-tensorflow-lite-frontend-2026.0.0-hecca717_1.conda sha256: e7cee37c92ed0b62c0458c13937b6ad66319f1879f236a31c3a67391a999f429 md5: 0f0281435478b981f672a44d0029018c @@ -9088,6 +11906,19 @@ packages: license_family: APACHE size: 456585 timestamp: 1772727473378 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-tensorflow-lite-frontend-2026.2.1-hecca717_1.conda + sha256: 142e7b24173ca8c32dbdb29c60f33a56ffb21a4ed733c9d6ab160c3a213ff52e + md5: c1a50f20847df0a8cb462138153ab46f + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libopenvino 2026.2.1 h1f0fae8_1 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 501906 + timestamp: 1782219300706 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-tensorflow-lite-frontend-2026.0.0-hfae3067_1.conda sha256: b0f32488fd11cd8ed563ad01934360df383f720a2adecf6d36aa3ea2565baab7 md5: 0a160f00a4050e3bf4749129750d0303 @@ -9099,6 +11930,18 @@ packages: 
license_family: APACHE size: 428895 timestamp: 1772721459028 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenvino-tensorflow-lite-frontend-2026.2.1-hfae3067_1.conda + sha256: 656276bd4072859725a47516c5954a91b408dd65a0d2d3f923964634a29d0ded + md5: 04e867fd452233f43885090c1533af28 + depends: + - libgcc >=14 + - libopenvino 2026.2.1 h9a8427e_1 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 468695 + timestamp: 1782213660401 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopus-1.6.1-h280c20c_0.conda sha256: f1061a26213b9653bbb8372bfa3f291787ca091a9a3060a10df4d5297aad74fd md5: 2446ac1fe030c2aa6141386c1f5a6aed @@ -9107,6 +11950,7 @@ packages: - libgcc >=14 license: BSD-3-Clause license_family: BSD + purls: [] size: 324993 timestamp: 1768497114401 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopus-1.6.1-h80f16a2_0.conda @@ -9116,6 +11960,7 @@ packages: - libgcc >=14 license: BSD-3-Clause license_family: BSD + purls: [] size: 383586 timestamp: 1768497303687 - conda: https://conda.anaconda.org/conda-forge/win-64/libopus-1.6.1-h6a83c73_0.conda @@ -9127,6 +11972,7 @@ packages: - ucrt >=10.0.20348.0 license: BSD-3-Clause license_family: BSD + purls: [] size: 307373 timestamp: 1768497136248 - conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hb9d3cd8_0.conda @@ -9139,6 +11985,17 @@ packages: license_family: MIT size: 28424 timestamp: 1749901812541 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.19-hb03c661_0.conda + sha256: f41721636a7c2e51bc2c642e1127955ab9c81145470714fdaac44d4d09e4af41 + md5: 33082e13b4769b48cfeb648e15bfe3fc + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 29147 + timestamp: 1773533027610 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpciaccess-0.18-h86ecc28_0.conda sha256: 7641dfdfe9bda7069ae94379e9924892f0b6604c1a016a3f76b230433bb280f2 md5: 
5044e160c5306968d956c2a0a2a440d6 @@ -9148,6 +12005,45 @@ packages: license_family: MIT size: 29512 timestamp: 1749901899881 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpciaccess-0.19-he30d5cf_0.conda + sha256: 5d26d751b7cc4b66e28ed1ae75900956600aaa5c5d874d5a8cf106d3aff834d3 + md5: 462239e256bc180c9c45dd049ba797ee + depends: + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 30294 + timestamp: 1773533057559 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libplacebo-7.360.1-h9eeb4b2_0.conda + sha256: 26cbbd3d7b91801826c779c3f7e87d071856d5cbe3d55b22777ca0d984fb02ed + md5: e6324dfe6c02e0736bb9235f8ef3c8a6 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - libdovi >=3.3.2,<4.0a0 + - libvulkan-loader >=1.4.341.0,<2.0a0 + - lcms2 >=2.19,<3.0a0 + - shaderc >=2026.2,<2026.3.0a0 + license: LGPL-2.1-or-later + purls: [] + size: 549348 + timestamp: 1777835950707 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libplacebo-7.360.1-h07e46df_0.conda + sha256: 3af9437023ec7fa8f9bf5e390b7f6ad3df403aa736b0305121d1734af2d0620e + md5: 1909ad87fcdfa8397e3568d01500dc8d + depends: + - libstdcxx >=14 + - libgcc >=14 + - lcms2 >=2.19,<3.0a0 + - libvulkan-loader >=1.4.341.0,<2.0a0 + - libdovi >=3.3.2,<4.0a0 + - shaderc >=2026.2,<2026.3.0a0 + license: LGPL-2.1-or-later + purls: [] + size: 560813 + timestamp: 1777835957369 - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.55-h421ea60_0.conda sha256: 36ade759122cdf0f16e2a2562a19746d96cf9c863ffaa812f2f5071ebbe9c03c md5: 5f13ffc7d30ffec87864e678df9957b4 @@ -9158,6 +12054,17 @@ packages: license: zlib-acknowledgement size: 317669 timestamp: 1770691470744 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.58-h421ea60_0.conda + sha256: 377cfe037f3eeb3b1bf3ad333f724a64d32f315ee1958581fc671891d63d3f89 + md5: eba48a68a1a2b9d3c0d9511548db85db + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + 
license: zlib-acknowledgement + purls: [] + size: 317729 + timestamp: 1776315175087 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.55-h1abf092_0.conda sha256: c7378c6b79de4d571d00ad1caf0a4c19d43c9c94077a761abb6ead44d891f907 md5: be4088903b94ea297975689b3c3aeb27 @@ -9167,6 +12074,16 @@ packages: license: zlib-acknowledgement size: 340156 timestamp: 1770691477245 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.58-h1abf092_0.conda + sha256: 483eaa53da40a6a3e558709d9f7b1ca388735364ae21a1ba58cf942514649c92 + md5: f51503ac45a4888bce71af9027a2ecc9 + depends: + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + license: zlib-acknowledgement + purls: [] + size: 341202 + timestamp: 1776315188425 - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.55-h7351971_0.conda sha256: db23f281fa80597a0dc0445b18318346862602d7081ed76244df8cc4418d6d68 md5: 43f47a9151b9b8fc100aeefcf350d1a0 @@ -9178,6 +12095,18 @@ packages: license: zlib-acknowledgement size: 383155 timestamp: 1770691504832 +- conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.58-h7351971_0.conda + sha256: 218913aeee391460bd0e341b834dbd9c6fa6ae0a4276c0c300266cc99a816a28 + md5: 52f1280563f3b48b5f75414cd2d15dd1 + depends: + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + - ucrt >=10.0.20348.0 + - libzlib >=1.3.2,<2.0a0 + license: zlib-acknowledgement + purls: [] + size: 385227 + timestamp: 1776315248638 - conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-6.33.5-h2b00c02_0.conda sha256: afbf195443269ae10a940372c1d37cda749355d2bd96ef9587a962abd87f2429 md5: 11ac478fa72cf12c214199b8a96523f4 @@ -9192,6 +12121,21 @@ packages: license_family: BSD size: 3638698 timestamp: 1769749419271 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-7.35.1-h3a69515_1.conda + sha256: a14fc571ea573d733d2c18abb52123c09d56610dbf29d03cc85cf1470f5cc8ae + md5: c80393b49f041180405a587e5ac59b49 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil * 
cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.2,<2.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 3942143 + timestamp: 1781325648920 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libprotobuf-6.33.5-h1f88751_0.conda sha256: f68780642c215b93f4991c43d88ab0af8a08e66826e68affc65b8905cc21d86b md5: 7f4a589ae616399b7e375053e82a3b12 @@ -9205,6 +12149,20 @@ packages: license_family: BSD size: 3465308 timestamp: 1769748410724 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libprotobuf-7.35.1-h38371b1_1.conda + sha256: 19951d88c4e7ddd5b77036eb1343e396ce2a4160bb05db22f506a7fc1dae0e89 + md5: 76593c5f1a65a923de490a5100a10df7 + depends: + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.2,<2.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 3711720 + timestamp: 1781325304099 - conda: https://conda.anaconda.org/conda-forge/linux-64/librsvg-2.60.2-h61e6d4b_0.conda sha256: 38b3189cf246f7265e06917f32d046ac375117c88834d045efe73ec48ceacc59 md5: d62da3d560992bfa2feb611d7be813b8 @@ -9221,6 +12179,26 @@ packages: license: LGPL-2.1-or-later size: 4011590 timestamp: 1771399906142 +- conda: https://conda.anaconda.org/conda-forge/linux-64/librsvg-2.62.3-h4c96295_0.conda + sha256: 5571bd8239d71961d4e3ce972f865b3ea95a91ce0b53d5749fe2dd24254ddbda + md5: 492c8d9b1c564c2e948b6cb4ba0f8261 + depends: + - __glibc >=2.17,<3.0.a0 + - cairo >=1.18.4,<2.0a0 + - fontconfig >=2.18.0,<3.0a0 + - fonts-conda-ecosystem + - gdk-pixbuf >=2.44.6,<3.0a0 + - harfbuzz >=14.2.0 + - libgcc >=14 + - libglib >=2.88.1,<3.0a0 + - libxml2-16 >=2.14.6 + - pango >=1.56.4,<2.0a0 + constrains: + - __glibc >=2.17 + license: LGPL-2.1-or-later + purls: [] + size: 3476570 + timestamp: 1780450632624 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/librsvg-2.60.2-h8171147_0.conda sha256: 
d02d3b23aa58d7767b820289b5b50653e73d70ae32f6ee5b88f63c5c5d96c2de md5: 1d6f1aff501c8104f7292ab787d65f15 @@ -9236,6 +12214,25 @@ packages: license: LGPL-2.1-or-later size: 4016799 timestamp: 1771406266442 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/librsvg-2.62.3-hf685517_0.conda + sha256: c95ac70755863d8522c1115b54afca86148ea25366b616aa84c993c2ca54b9ce + md5: 38209cc04b3e3e5624c534bc703e6939 + depends: + - cairo >=1.18.4,<2.0a0 + - fontconfig >=2.18.0,<3.0a0 + - fonts-conda-ecosystem + - gdk-pixbuf >=2.44.6,<3.0a0 + - harfbuzz >=14.2.0 + - libgcc >=14 + - libglib >=2.88.1,<3.0a0 + - libxml2-16 >=2.14.6 + - pango >=1.56.4,<2.0a0 + constrains: + - __glibc >=2.17 + license: LGPL-2.1-or-later + purls: [] + size: 3052373 + timestamp: 1780456154830 - conda: https://conda.anaconda.org/conda-forge/win-64/librsvg-2.60.0-hd5e4115_1.conda sha256: 3d06becb70212a7ed609eea07728b6545ddcff4889844290fed14a5d2fc18cd9 md5: a105938a4fae24539c89de6e7671d279 @@ -9251,6 +12248,23 @@ packages: license: LGPL-2.1-or-later size: 2877820 timestamp: 1771301866036 +- conda: https://conda.anaconda.org/conda-forge/win-64/librsvg-2.62.3-h15cfe45_0.conda + sha256: 6f678be6074b79fe754660d16857a6edba73dd197ad92086250dc38c11b179ab + md5: 3fffc63af7b943cde57aa72f5ffe6048 + depends: + - cairo >=1.18.4,<2.0a0 + - gdk-pixbuf >=2.44.6,<3.0a0 + - harfbuzz >=14.2.0 + - libglib >=2.88.1,<3.0a0 + - libxml2-16 >=2.14.6 + - pango >=1.56.4,<2.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: LGPL-2.1-or-later + purls: [] + size: 3361405 + timestamp: 1780451179155 - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-14.3.0-h8f1669f_18.conda sha256: e03ed186eefb46d7800224ad34bad1268c9d19ecb8f621380a50601c6221a4a7 md5: ad3a0e2dc4cce549b2860e2ef0e6d75b @@ -9273,6 +12287,18 @@ packages: license_family: GPL size: 8095113 timestamp: 1771378289674 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-15.2.0-h90f66d4_19.conda + 
sha256: 7a58892a52739ce4c0f7109de9e91b4353104748eb04fc6441d88e8af444ba99 + md5: 67eef12ce33f7ff99900c212d7076fc2 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=15.2.0 + - libstdcxx >=15.2.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 7930689 + timestamp: 1778269054623 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsanitizer-14.3.0-hedb4206_18.conda sha256: 48641a458e3da681038af7ebdab143f9b6861ad9d1dcc2b4997ff2b744709423 md5: 03feac8b6e64b72ae536fdb264e2618d @@ -9293,6 +12319,17 @@ packages: license_family: GPL size: 7164557 timestamp: 1771378185265 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsanitizer-15.2.0-he19c465_19.conda + sha256: 8115604f113fe2b7be95b2d22183a4dda5779c1cc6db4b826af800581498b4b3 + md5: 95210a1edbd7fc6e12afc9f8276f450a + depends: + - libgcc >=15.2.0 + - libstdcxx >=15.2.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 7067965 + timestamp: 1778268796086 - conda: https://conda.anaconda.org/conda-forge/linux-64/libsndfile-1.2.2-hc7d488a_2.conda sha256: 57cb5f92110324c04498b96563211a1bca6a74b2918b1e8df578bfed03cc32e4 md5: 067590f061c9f6ea7e61e3b2112ed6b3 @@ -9308,6 +12345,7 @@ packages: - mpg123 >=1.32.9,<1.33.0a0 license: LGPL-2.1-or-later license_family: LGPL + purls: [] size: 355619 timestamp: 1765181778282 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsndfile-1.2.2-h30591a0_2.conda @@ -9324,6 +12362,7 @@ packages: - mpg123 >=1.32.9,<1.33.0a0 license: LGPL-2.1-or-later license_family: LGPL + purls: [] size: 406978 timestamp: 1765181892661 - conda: https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.21-h280c20c_3.conda @@ -9368,6 +12407,18 @@ packages: purls: [] size: 951405 timestamp: 1772818874251 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.2-hf4e2dac_1.conda + sha256: f2ad4d3abd4ed7a6a0d28c0ff153e27cb45c406814faa2570bc2581804283674 + md5: 
f283e98005089bf29ac5774c2e209fbf + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + license: blessing + purls: [] + size: 964141 + timestamp: 1782406552467 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.52.0-h10b116e_0.conda sha256: 1ddaf91b44fae83856276f4cb7ce544ffe41d4b55c1e346b504c6b45f19098d6 md5: 77891484f18eca74b8ad83694da9815e @@ -9379,6 +12430,17 @@ packages: purls: [] size: 952296 timestamp: 1772818881550 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.53.2-h10b116e_1.conda + sha256: 74abcfaa0f4c13024e07dbed75b378dd6e9ad4e6e2773ffe00993b0bf437ab58 + md5: 6d39fe0dc458643272905ecad03aaa77 + depends: + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + license: blessing + purls: [] + size: 968949 + timestamp: 1782406597817 - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.52.0-hf5d6505_0.conda sha256: 5fccf1e4e4062f8b9a554abf4f9735a98e70f82e2865d0bfdb47b9de94887583 md5: 8830689d537fda55f990620680934bb1 @@ -9390,6 +12452,17 @@ packages: purls: [] size: 1297302 timestamp: 1772818899033 +- conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.53.2-hf5d6505_1.conda + sha256: a643cbc7593b443967093afb14bb892b94094e9135e30772fd1a9dfda8bb08b6 + md5: b510cd61047daf53a68be8daeb56b426 + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: blessing + purls: [] + size: 1314192 + timestamp: 1782406655419 - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e md5: 1b08cd684f34175e4514474793d44bcb @@ -9403,18 +12476,43 @@ packages: purls: [] size: 5852330 timestamp: 1771378262446 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_19.conda + sha256: dff1058c76ec6b8759e41cefa2508162d00e4a5e6721aa68ec3fd10094e702dc + md5: 
5794b3bdc38177caf969dabd3af08549 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc 15.2.0 he0feb66_19 + constrains: + - libstdcxx-ng ==15.2.0=*_19 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 5852044 + timestamp: 1778269036376 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-15.2.0-hef695bb_18.conda sha256: 31fdb9ffafad106a213192d8319b9f810e05abca9c5436b60e507afb35a6bc40 md5: f56573d05e3b735cb03efeb64a15f388 depends: - libgcc 15.2.0 h8acb6b2_18 constrains: - - libstdcxx-ng ==15.2.0=*_18 + - libstdcxx-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 5541411 + timestamp: 1771378162499 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-15.2.0-hef695bb_19.conda + sha256: 1dadc45e599f510dd5f97141dddcdbb9844d9f1430c1f3a38075cf1c58f87b4e + md5: 543fbc8d71f2a0baf04cf88ce96cb8bb + depends: + - libgcc 15.2.0 h8acb6b2_19 + constrains: + - libstdcxx-ng ==15.2.0=*_19 license: GPL-3.0-only WITH GCC-exception-3.1 license_family: GPL purls: [] - size: 5541411 - timestamp: 1771378162499 + size: 5546559 + timestamp: 1778268777463 - conda: https://conda.anaconda.org/conda-forge/win-64/libstdcxx-15.2.0-hae5796f_18.conda sha256: 7134b90a850f0e14f15bd0f0218fd728f19cd5c58420a90c2f561f58272b8519 md5: 7c09facd8f5aced6b4c146e1c4053e50 @@ -9445,6 +12543,16 @@ packages: license_family: GPL size: 20669511 timestamp: 1771378139786 +- conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-15.2.0-hd446a21_119.conda + sha256: a2385f3611d5cd25378f9cf2367183320731709c067ddd08d43330d3170f15b8 + md5: bcfe7eae40158c3e355d2f9d3ed41230 + depends: + - __unix + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 20765069 + timestamp: 1778268963689 - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-aarch64-14.3.0-h57c8d61_118.conda sha256: 
609585a02b05a2b0f2cabb18849328455cbce576f2e3eb8108f3ef7f4cb165a6 md5: bcf29f2ed914259a258204b05346abb1 @@ -9463,6 +12571,16 @@ packages: license_family: GPL size: 17628403 timestamp: 1771378058765 +- conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-aarch64-15.2.0-ha7b1723_119.conda + sha256: 6f7ceee16070781b7d642a37a35ffdf09c66796d3df105c919526210ce220443 + md5: 61da34d67f58dd4cf16683f6cdcb06c8 + depends: + - __unix + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 17627362 + timestamp: 1778268687968 - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_win-64-15.2.0-h0a72980_118.conda sha256: 0b27331f127c6c10017442cc98c483aa868298102e98aae70ad86b9a5ae0029e md5: b7a331c07d140e476fee0c70c9696e87 @@ -9481,6 +12599,16 @@ packages: license_family: GPL size: 27575 timestamp: 1771378314494 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.2.0-hdf11a46_19.conda + sha256: 0672b6b6e1791c92e8eccad58081a99d614fcf82bca5841f9dfa3c3e658f83b9 + md5: e5ce228e579726c07255dbf90dc62101 + depends: + - libstdcxx 15.2.0 h934c35e_19 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 27776 + timestamp: 1778269074600 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-ng-15.2.0-hdbbeba8_18.conda sha256: 035a31cde134e706e30029a837a31f729ad32b7c5bca023271dfe91a8ba6c896 md5: 699d294376fe18d80b7ce7876c3a875d @@ -9490,6 +12618,16 @@ packages: license_family: GPL size: 27645 timestamp: 1771378204663 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-ng-15.2.0-hdbbeba8_19.conda + sha256: 56b5ec297a988961486694f1c598889c3a697d77a0b42b8cea3faaa12e9bd360 + md5: c82ed61c3ec470c5ec624580e6ba16e4 + depends: + - libstdcxx 15.2.0 hef695bb_19 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 27803 + timestamp: 1778268813278 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.10-hd0affe5_4.conda sha256: f0356bb344a684e7616fc84675cfca6401140320594e8686be30e8ac7547aed2 md5: 1d4c18d75c51ed9d00092a891a547a7d @@ -9500,6 +12638,17 @@ packages: license: LGPL-2.1-or-later size: 491953 timestamp: 1770738638119 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-h084b8d7_1.conda + sha256: 2293884d59cf0436c37fc0a4bad71011a8de2a6913610d1c701a7703377c1f75 + md5: ea0da9c20bbb221b530810c3c68bbe62 + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.78,<2.79.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + purls: [] + size: 493022 + timestamp: 1780084748140 - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-hd0affe5_0.conda sha256: c5008b602cb5c819f7b52d418b3ed17e1818cbbf6705b189e7ab36bb70cce3d8 md5: 8ee3cb7f64be0e8c4787f3a4dbe024e6 @@ -9530,6 +12679,16 @@ packages: purls: [] size: 516600 timestamp: 1773797150163 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsystemd0-257.13-hfcc8634_1.conda + sha256: 7938befc6a09d9f829663ea134b01bea78dabe08d928e9a7caa68e2d726e03c5 + md5: d8981d39a52ab992a033a68927da47e0 + depends: + - libcap >=2.78,<2.79.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + purls: [] + size: 515284 + timestamp: 1780084773602 - conda: https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.1-h9d88235_1.conda sha256: e5f8c38625aa6d567809733ae04bb71c161a42e44a9fa8227abe61fa5c60ebe0 md5: cd5a90476766d53e901500df9215e927 @@ -9545,6 +12704,7 @@ packages: - libzlib >=1.3.1,<2.0a0 - zstd >=1.5.7,<1.6.0a0 license: HPND + purls: [] size: 435273 timestamp: 1762022005702 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtiff-4.7.1-hdb009f0_1.conda @@ -9561,6 +12721,7 @@ packages: - libzlib >=1.3.1,<2.0a0 - zstd >=1.5.7,<1.6.0a0 license: HPND + purls: [] size: 488407 timestamp: 1762022048105 - conda: https://conda.anaconda.org/conda-forge/win-64/libtiff-4.7.1-h8f73337_1.conda @@ -9577,6 +12738,7 @@ 
packages: - vc14_runtime >=14.44.35208 - zstd >=1.5.7,<1.6.0a0 license: HPND + purls: [] size: 993166 timestamp: 1762022118895 - conda: https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.10.0-cuda130_mkl_hb2e6204_303.conda @@ -9623,6 +12785,52 @@ packages: license_family: BSD size: 487186668 timestamp: 1772223626192 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.12.1-cuda130_mkl_h5535f43_300.conda + sha256: 2dd082b5b2c631b62824f41a13cfa88e0e37c6f28d187a9987cbfd116f5a3fd5 + md5: 49b40832a1320f8eb0020d3e9bd18421 + depends: + - __glibc >=2.28,<3.0.a0 + - _openmp_mutex * *_llvm + - _openmp_mutex >=4.5 + - cuda-cudart >=13.0.96,<14.0a0 + - cuda-cupti >=13.0.85,<14.0a0 + - cuda-nvrtc >=13.0.88,<14.0a0 + - cuda-nvtx >=13.0.85,<14.0a0 + - cuda-version >=13.0,<14 + - fmt >=12.1.0,<12.2.0a0 + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libblas * *mkl + - libcblas >=3.11.0,<4.0a0 + - libcublas >=13.1.1.3,<14.0a0 + - libcudnn >=9.23.1.3,<10.0a0 + - libcudss >=0.8.0.10,<0.8.1.0a0 + - libcufft >=12.0.0.61,<13.0a0 + - libcufile >=1.15.1.6,<2.0a0 + - libcurand >=10.4.0.35,<11.0a0 + - libcusolver >=12.0.4.66,<13.0a0 + - libcusparse >=12.6.3.3,<13.0a0 + - libgcc >=14 + - libmagma >=2.10.0,<2.10.1.0a0 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + - libuv >=1.52.1,<2.0a0 + - libzlib >=1.3.2,<2.0a0 + - llvm-openmp >=22.1.8 + - mkl >=2026.0.0,<2027.0a0 + - nccl >=2.30.7.1,<3.0a0 + - onednn >=3.12,<4.0a0 + - pybind11-abi 11 + - sleef >=3.9.0,<4.0a0 + constrains: + - pytorch-gpu 2.12.1 + - pytorch 2.12.1 cuda130_mkl_*_300 + - pytorch-cpu <0.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 472545686 + timestamp: 1781841819133 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtorch-2.10.0-cuda130_generic_he6ac1af_203.conda sha256: 9ca0feffff3f5c7b5ce0a2ab66ba8b15dd33c8b812e149cf98933964e51a4dfd md5: f344404036b9bf7fe26e91e92f6c2b7c @@ -9670,6 +12878,55 @@ packages: license_family: BSD size: 
468842829 timestamp: 1772296520985 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtorch-2.12.1-cuda130_generic_h4328193_200.conda + sha256: 0742a662e62af3e15a326d9927b62b6398fde5c4971a3863aeca178c80af49ff + md5: 46a34bf52048dda3aacf071da129e979 + depends: + - __glibc >=2.28,<3.0.a0 + - _openmp_mutex * *_llvm + - _openmp_mutex >=4.5 + - arm-variant * sbsa + - cuda-cudart >=13.0.96,<14.0a0 + - cuda-cupti >=13.0.85,<14.0a0 + - cuda-nvrtc >=13.0.88,<14.0a0 + - cuda-nvtx >=13.0.85,<14.0a0 + - cuda-version >=13.0,<14 + - fmt >=12.1.0,<12.2.0a0 + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libblas >=3.9.0,<4.0a0 + - libcblas >=3.9.0,<4.0a0 + - libcublas >=13.1.1.3,<14.0a0 + - libcudnn >=9.23.1.3,<10.0a0 + - libcudss >=0.8.0.10,<0.8.1.0a0 + - libcufft >=12.0.0.61,<13.0a0 + - libcufile >=1.15.1.6,<2.0a0 + - libcurand >=10.4.0.35,<11.0a0 + - libcusolver >=12.0.4.66,<13.0a0 + - libcusparse >=12.6.3.3,<13.0a0 + - libgcc >=14 + - liblapack >=3.9.0,<4.0a0 + - libmagma >=2.10.0,<2.10.1.0a0 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + - libuv >=1.52.1,<2.0a0 + - libzlib >=1.3.2,<2.0a0 + - llvm-openmp >=22.1.8 + - nccl >=2.30.7.1,<3.0a0 + - onednn >=3.12,<4.0a0 + - pybind11-abi 11 + - sleef >=3.9.0,<4.0a0 + constrains: + - pytorch 2.12.1 cuda130_generic_*_200 + - pytorch-cpu <0.0a0 + - pytorch-gpu 2.12.1 + - openblas * openmp_* + - libopenblas * openmp_* + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 457116875 + timestamp: 1781841668799 - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.10-hd0affe5_4.conda sha256: ed4d2c01fbeb1330f112f7e399408634db277d3dfb2dec1d0395f56feaa24351 md5: 6c74fba677b61a0842cbf0f63eee683b @@ -9680,6 +12937,17 @@ packages: license: LGPL-2.1-or-later size: 144654 timestamp: 1770738650966 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-h084b8d7_1.conda + sha256: 287d05680e49eea51b8145fbf34bc213c0618b04f32e450e9da5d715e5134e38 + md5: 
89e5671a076d99516a6acd72a35b1640 + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.78,<2.79.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + purls: [] + size: 145969 + timestamp: 1780084753104 - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-hd0affe5_0.conda sha256: 1a1e367c04d66030aa93b4d33905f7f6fbb59cfc292e816fe3e9c1e8b3f4d1e2 md5: 2c2270f93d6f9073cbf72d821dfc7d72 @@ -9710,6 +12978,16 @@ packages: purls: [] size: 156357 timestamp: 1773797159424 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libudev1-257.13-hfcc8634_1.conda + sha256: 1963dbd5a5c08390db2321dd2fa5c9df45c0fe68701fce4f9c36141155b4de13 + md5: 67728797901490baae52b3ce8d738d34 + depends: + - libcap >=2.78,<2.79.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + purls: [] + size: 156922 + timestamp: 1780084778404 - conda: https://conda.anaconda.org/conda-forge/linux-64/libunwind-1.8.3-h65a8314_0.conda sha256: 71c8b9d5c72473752a0bb6e91b01dd209a03916cb71f36cc6a564e3a2a132d7a md5: e179a69edd30d75c0144d7a380b88f28 @@ -9719,6 +12997,7 @@ packages: - libstdcxx >=14 license: MIT license_family: MIT + purls: [] size: 75995 timestamp: 1757032240102 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libunwind-1.8.3-h6470e1d_0.conda @@ -9729,6 +13008,7 @@ packages: - libstdcxx >=14 license: MIT license_family: MIT + purls: [] size: 94555 timestamp: 1757032278900 - conda: https://conda.anaconda.org/conda-forge/linux-64/liburing-2.14-hb700be7_0.conda @@ -9740,6 +13020,7 @@ packages: - libstdcxx >=14 license: MIT license_family: MIT + purls: [] size: 154203 timestamp: 1770566529700 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liburing-2.14-hfefdfc9_0.conda @@ -9750,6 +13031,7 @@ packages: - libstdcxx >=14 license: MIT license_family: MIT + purls: [] size: 155011 timestamp: 1770567701524 - conda: https://conda.anaconda.org/conda-forge/linux-64/libusb-1.0.29-h73b1eb8_0.conda @@ -9760,6 +13042,7 @@ packages: - libgcc >=13 - libudev1 >=257.4 license: 
LGPL-2.1-or-later + purls: [] size: 89551 timestamp: 1748856210075 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libusb-1.0.29-h06eaf92_0.conda @@ -9769,6 +13052,7 @@ packages: - libgcc >=13 - libudev1 >=257.4 license: LGPL-2.1-or-later + purls: [] size: 93129 timestamp: 1748856228398 - conda: https://conda.anaconda.org/conda-forge/win-64/libusb-1.0.29-h1839187_0.conda @@ -9782,6 +13066,7 @@ packages: - vc14_runtime >=14.29.30139 - ucrt >=10.0.20348.0 license: LGPL-2.1-or-later + purls: [] size: 118204 timestamp: 1748856290542 - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda @@ -9805,6 +13090,17 @@ packages: purls: [] size: 40297 timestamp: 1775052476770 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42.2-h5347b49_0.conda + sha256: 9b1bdce27a7e31f7d241aeecff67a1f3101d52a2b1e33ccc2cdf2613072bf81f + md5: 01bb81d12c957de066ea7362007df642 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 40017 + timestamp: 1781625522462 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.41.3-h1022ec0_0.conda sha256: c37a8e89b700646f3252608f8368e7eb8e2a44886b92776e57ad7601fc402a11 md5: cf2861212053d05f27ec49c3784ff8bb @@ -9824,6 +13120,16 @@ packages: purls: [] size: 43567 timestamp: 1775052485727 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.42.2-h1022ec0_0.conda + sha256: 7663489f97c104ae3814db10f384932c74b439f3c1fd4247e4fe3599830c090a + md5: 58fa42bc4bc71fc329889497ec15effb + depends: + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 43248 + timestamp: 1781625528371 - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb03c661_1.conda sha256: c180f4124a889ac343fc59d15558e93667d894a966ec6fdb61da1604481be26b md5: 0f03292cc56bf91a077a134ea8747118 @@ -9834,6 +13140,17 @@ packages: license_family: MIT size: 895108 timestamp: 1753948278280 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libuv-1.52.1-h280c20c_0.conda + sha256: e28e4519223f78b3163599ca89c3f2d80bfb53e907e7fc74e806e60d1efa578b + md5: 4e33d49bf4fc853855a3b00643aa5484 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: MIT + license_family: MIT + purls: [] + size: 419935 + timestamp: 1779396012261 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuv-1.51.0-he30d5cf_1.conda sha256: 7a0fb5638582efc887a18b7d270b0c4a6f6e681bf401cab25ebafa2482569e90 md5: 8e62bf5af966325ee416f19c6f14ffa3 @@ -9843,6 +13160,16 @@ packages: license_family: MIT size: 629238 timestamp: 1753948296190 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuv-1.52.1-h80f16a2_0.conda + sha256: 3e2ead35f47d01364031f323f1be984018c8f19a3a264f952ddcd043685a1c86 + md5: ac7bcbd2c77691cd6d1ede8c029e8c8a + depends: + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 456627 + timestamp: 1779396031450 - conda: https://conda.anaconda.org/conda-forge/linux-64/libva-2.23.0-he1eb515_0.conda sha256: 255c7d00b54e26f19fad9340db080716bced1d8539606e2b8396c57abd40007c md5: 25813fe38b3e541fc40007592f12bae5 @@ -9861,6 +13188,7 @@ packages: - xorg-libxfixes >=6.0.2,<7.0a0 license: MIT license_family: MIT + purls: [] size: 221308 timestamp: 1765652453244 - conda: https://conda.anaconda.org/conda-forge/linux-64/libvorbis-1.3.7-h54a6638_2.conda @@ -9875,6 +13203,7 @@ packages: - libogg >=1.3.5,<1.4.0a0 license: BSD-3-Clause license_family: BSD + purls: [] size: 285894 timestamp: 1753879378005 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libvorbis-1.3.7-h7ac5ae9_2.conda @@ -9887,6 +13216,7 @@ packages: - libogg >=1.3.5,<1.4.0a0 license: BSD-3-Clause license_family: BSD + purls: [] size: 289391 timestamp: 1753879417231 - conda: https://conda.anaconda.org/conda-forge/win-64/libvorbis-1.3.7-h5112557_2.conda @@ -9903,6 +13233,7 @@ packages: - libogg >=1.3.5,<1.4.0a0 license: BSD-3-Clause license_family: BSD + purls: [] size: 
243401 timestamp: 1753879416570 - conda: https://conda.anaconda.org/conda-forge/linux-64/libvpl-2.16.0-h54a6638_0.conda @@ -9916,6 +13247,7 @@ packages: - libva >=2.23.0,<3.0a0 license: MIT license_family: MIT + purls: [] size: 287992 timestamp: 1772980546550 - conda: https://conda.anaconda.org/conda-forge/linux-64/libvpx-1.15.2-hecca717_0.conda @@ -9927,6 +13259,7 @@ packages: - libstdcxx >=14 license: BSD-3-Clause license_family: BSD + purls: [] size: 1070048 timestamp: 1762010217363 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libvpx-1.15.2-hfae3067_0.conda @@ -9937,6 +13270,7 @@ packages: - libstdcxx >=14 license: BSD-3-Clause license_family: BSD + purls: [] size: 1296382 timestamp: 1762012332100 - conda: https://conda.anaconda.org/conda-forge/linux-64/libvulkan-loader-1.4.341.0-h5279c79_0.conda @@ -9952,6 +13286,7 @@ packages: - libvulkan-headers 1.4.341.0.* license: Apache-2.0 license_family: APACHE + purls: [] size: 199795 timestamp: 1770077125520 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libvulkan-loader-1.4.341.0-h8b8848b_0.conda @@ -9966,6 +13301,7 @@ packages: - libvulkan-headers 1.4.341.0.* license: Apache-2.0 license_family: APACHE + purls: [] size: 217655 timestamp: 1770077141862 - conda: https://conda.anaconda.org/conda-forge/win-64/libvulkan-loader-1.4.341.0-h477610d_0.conda @@ -9979,6 +13315,7 @@ packages: - libvulkan-headers 1.4.341.0.* license: Apache-2.0 license_family: APACHE + purls: [] size: 282251 timestamp: 1770077165680 - conda: https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.6.0-hd42ef1d_0.conda @@ -9991,6 +13328,7 @@ packages: - libwebp 1.6.0 license: BSD-3-Clause license_family: BSD + purls: [] size: 429011 timestamp: 1752159441324 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libwebp-base-1.6.0-ha2e29f5_0.conda @@ -10002,6 +13340,7 @@ packages: - libwebp 1.6.0 license: BSD-3-Clause license_family: BSD + purls: [] size: 359496 timestamp: 1752160685488 - conda: 
https://conda.anaconda.org/conda-forge/win-64/libwebp-base-1.6.0-h4d5522a_0.conda @@ -10015,6 +13354,7 @@ packages: - libwebp 1.6.0 license: BSD-3-Clause license_family: BSD + purls: [] size: 279176 timestamp: 1752159543911 - conda: https://conda.anaconda.org/conda-forge/win-64/libwinpthread-12.0.0.r4.gg4f2fc60ca-h57928b3_10.conda @@ -10040,6 +13380,7 @@ packages: - xorg-libxdmcp license: MIT license_family: MIT + purls: [] size: 395888 timestamp: 1727278577118 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxcb-1.17.0-h262b8f6_0.conda @@ -10052,6 +13393,7 @@ packages: - xorg-libxdmcp license: MIT license_family: MIT + purls: [] size: 397493 timestamp: 1727280745441 - conda: https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.13.1-hca5e8e5_0.conda @@ -10070,6 +13412,23 @@ packages: license_family: MIT size: 837922 timestamp: 1764794163823 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.13.2-hca5e8e5_0.conda + sha256: 046f2ff4acebd8729fac03e99c8c307dfb48b6a32894ba8c11576e78f6e76e43 + md5: dc8b067e22b414172bedd8e3f03f3c95 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - libxcb >=1.17.0,<2.0a0 + - libxml2 + - libxml2-16 >=2.14.6 + - xkeyboard-config + - xorg-libxau >=1.0.12,<2.0a0 + license: MIT/X11 Derivative + license_family: MIT + purls: [] + size: 851166 + timestamp: 1780213397575 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxkbcommon-1.13.1-h3c6a4c8_0.conda sha256: 37e4aa45b71c35095a01835bd42fa37c08218fec44eb2c6bf4b9e2826b0351d4 md5: 22c1ce28d481e490f3635c1b6a2bb23f @@ -10085,6 +13444,22 @@ packages: license_family: MIT size: 863646 timestamp: 1764794352540 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxkbcommon-1.13.2-h3c6a4c8_0.conda + sha256: 8f44670a714a12589bc82ea179e46ba4a19c4458d5cee765ddd4d5224eccd912 + md5: d6fc9ac66ea61eb662747959d0a68c57 + depends: + - libgcc >=14 + - libstdcxx >=14 + - libxcb >=1.17.0,<2.0a0 + - libxml2 + - 
libxml2-16 >=2.14.6 + - xkeyboard-config + - xorg-libxau >=1.0.12,<2.0a0 + license: MIT/X11 Derivative + license_family: MIT + purls: [] + size: 875994 + timestamp: 1780213408784 - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.15.2-he237659_0.conda sha256: 275c324f87bda1a3b67d2f4fcc3555eeff9e228a37655aa001284a7ceb6b0392 md5: e49238a1609f9a4a844b09d9926f2c3d @@ -10100,6 +13475,22 @@ packages: license_family: MIT size: 45968 timestamp: 1772704614539 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.15.3-h49c6c72_0.conda + sha256: 3bc5551720c58591f6ea1146f7d1539c734ed1c40e7b9f5cb8cb7e900c509aba + md5: 995d8c8bad2a3cc8db14675a153dec2b + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - liblzma >=5.8.3,<6.0a0 + - libxml2-16 2.15.3 hca6bf5a_0 + - libzlib >=1.3.2,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 46810 + timestamp: 1776376751152 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxml2-2.15.2-h825857f_0.conda sha256: 3e51e1952cb60c8107094b6b78473d91ff49d428ad4bef6806124b383e8fe29c md5: 19de96909ee1198e2853acd8aba89f6c @@ -10114,6 +13505,21 @@ packages: license_family: MIT size: 47837 timestamp: 1772704681112 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxml2-2.15.3-h869d058_0.conda + sha256: e3af6af9df73bd3c7a8e4e6c8cc38df3699e7f588b0705c257a8601e40acfbdf + md5: 2cffef27cb2eb9ed1e315a1e269d4335 + depends: + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - liblzma >=5.8.3,<6.0a0 + - libxml2-16 2.15.3 h79dcc73_0 + - libzlib >=1.3.2,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 48101 + timestamp: 1776376766341 - conda: https://conda.anaconda.org/conda-forge/win-64/libxml2-2.15.2-h5d26750_0.conda sha256: f905eb7046987c336122121759e7f09144729f6898f48cd06df2a945b86998d8 md5: 1007e1bfe181a2aee214779ee7f13d30 @@ -10148,6 +13554,23 @@ packages: license_family: MIT size: 43866 timestamp: 
1772704745691 +- conda: https://conda.anaconda.org/conda-forge/win-64/libxml2-2.15.3-h8ef44ab_0.conda + sha256: a4599c6bbbbdd7db570896e520c557eec8e66d94e839a59d17dc1f24a3d5f82b + md5: 95591ca5671d2213f5b2d5aa7818420d + depends: + - icu >=78.3,<79.0a0 + - libiconv >=1.18,<2.0a0 + - liblzma >=5.8.3,<6.0a0 + - libxml2-16 2.15.3 h3cfd58e_0 + - libzlib >=1.3.2,<2.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: MIT + license_family: MIT + purls: [] + size: 43684 + timestamp: 1776376992865 - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-16-2.15.2-hca6bf5a_0.conda sha256: 08d2b34b49bec9613784f868209bb7c3bb8840d6cf835ff692e036b09745188c md5: f3bc152cb4f86babe30f3a4bf0dbef69 @@ -10164,6 +13587,23 @@ packages: license_family: MIT size: 557492 timestamp: 1772704601644 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-16-2.15.3-hca6bf5a_0.conda + sha256: 3d44f737c5ae52d5af32682cc1530df433f401f8e58a7533926536244127572a + md5: e79d2c2f24b027aa8d5ab1b1ba3061e7 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - liblzma >=5.8.3,<6.0a0 + - libzlib >=1.3.2,<2.0a0 + constrains: + - libxml2 2.15.3 + license: MIT + license_family: MIT + purls: [] + size: 559775 + timestamp: 1776376739004 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxml2-16-2.15.2-h79dcc73_0.conda sha256: da6b2ebbcecc158200d90be39514e4e902971628029b35b7f6ad57270659c5d9 md5: e3ec9079759d35b875097d6a9a69e744 @@ -10179,6 +13619,22 @@ packages: license_family: MIT size: 598438 timestamp: 1772704671710 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxml2-16-2.15.3-h79dcc73_0.conda + sha256: ad048a9ca1bf2cdfedb2b0c231050da416c44ee1436a3d1a83b51d2e2deaa842 + md5: 68866231cfe8789e780347f2482df96d + depends: + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - liblzma >=5.8.3,<6.0a0 + - libzlib >=1.3.2,<2.0a0 + constrains: + - libxml2 2.15.3 + 
license: MIT + license_family: MIT + purls: [] + size: 601948 + timestamp: 1776376758674 - conda: https://conda.anaconda.org/conda-forge/win-64/libxml2-16-2.15.2-h3cfd58e_0.conda sha256: d6d792f8f1d6786b9144adfa62c33a04aeec3d76682351b353ca1224fc1a74f3 md5: f6dd496a1f2b66951110a3a0817f699b @@ -10214,6 +13670,24 @@ packages: purls: [] size: 520078 timestamp: 1772704728534 +- conda: https://conda.anaconda.org/conda-forge/win-64/libxml2-16-2.15.3-h3cfd58e_0.conda + sha256: 3b61ee3caba702d2ff432fa3920835db963026e5c99c4e6fdca0c6114f59e7ce + md5: 9e8dd0d90ed830107b2c36801035b7db + depends: + - icu >=78.3,<79.0a0 + - libiconv >=1.18,<2.0a0 + - liblzma >=5.8.3,<6.0a0 + - libzlib >=1.3.2,<2.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + constrains: + - libxml2 2.15.3 + license: MIT + license_family: MIT + purls: [] + size: 519871 + timestamp: 1776376969852 - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 md5: edb0dca6bc32e4f4789199455a1dbeb8 @@ -10298,6 +13772,19 @@ packages: license_family: APACHE size: 6136884 timestamp: 1772024545 +- conda: https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-22.1.8-h4922eb0_0.conda + sha256: a37aba21b85800af1e7c5b04ba76abab96b6e591eedf99dc6e4df83b0fefd7a5 + md5: 7bbfdc5a6eca997d3b0873a575c3e155 + depends: + - __glibc >=2.17,<3.0.a0 + constrains: + - intel-openmp <0.0a0 + - openmp 22.1.8|22.1.8.* + license: Apache-2.0 WITH LLVM-exception + license_family: APACHE + purls: [] + size: 6123597 + timestamp: 1781736521736 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/llvm-openmp-22.1.0-he40846f_0.conda sha256: 08e50e981736118b6cc379096395bd725eeac1cb3852bcdfa1d2980acba39c29 md5: 757e953866f430da9de3fcebf44d1474 @@ -10308,6 +13795,17 @@ packages: license_family: APACHE size: 5902242 timestamp: 1772024546951 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/llvm-openmp-22.1.8-he40846f_0.conda + sha256: 30bbb0ce4ae7cebbeb9801af5bbd2a29f8627237a38d08fc5d8d800e79c817aa + md5: 2107bb80c5587c116702ce215daddd73 + constrains: + - openmp 22.1.8|22.1.8.* + - intel-openmp <0.0a0 + license: Apache-2.0 WITH LLVM-exception + license_family: APACHE + purls: [] + size: 5888931 + timestamp: 1781736538044 - conda: https://conda.anaconda.org/conda-forge/win-64/llvm-openmp-22.1.0-h4fa8253_0.conda sha256: bb55a3736380759d338f87aac68df4fd7d845ae090b94400525f5d21a55eea31 md5: e5505e0b7d6ef5c19d5c0c1884a2f494 @@ -10337,6 +13835,21 @@ packages: purls: [] size: 348400 timestamp: 1774733045609 +- conda: https://conda.anaconda.org/conda-forge/win-64/llvm-openmp-22.1.8-h4fa8253_0.conda + sha256: 50c02902bb516eeb56680358f052be38b5bf74b40e78ea4b2a675e84957e7307 + md5: de3551bf6508d45ca46b714639e52823 + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + constrains: + - openmp 22.1.8|22.1.8.* + - intel-openmp <0.0a0 + license: Apache-2.0 WITH LLVM-exception + license_family: APACHE + purls: [] + size: 348002 + timestamp: 1781737042070 - conda: https://conda.anaconda.org/conda-forge/win-64/m2-conda-epoch-20250515-0_x86_64.conda build_number: 0 sha256: 51e9214548f177db9c3fe70424e3774c95bf19cd69e0e56e83abe2e393228ba1 @@ -10533,6 +14046,23 @@ packages: license_family: Proprietary size: 125728406 timestamp: 1767634121080 +- conda: https://conda.anaconda.org/conda-forge/linux-64/mkl-2026.0.0-hecca717_915.conda + sha256: 740a02cf7b3c0d6dd47dbb4d2e222ed23d326971fe608d737614db1033bd107d + md5: 09feb8740f611ceb96f8b598bf08cdba + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex * *_llvm + - _openmp_mutex >=4.5 + - libgcc >=14 + - libstdcxx >=14 + - llvm-openmp >=22.1.7 + - onemkl-license 2026.0.0 ha770c72_915 + - tbb >=2023.0.0 + license: LicenseRef-IntelSimplifiedSoftwareOct2022 + license_family: Proprietary + purls: [] + size: 143201396 + timestamp: 1781016571972 - 
conda: https://conda.anaconda.org/conda-forge/win-64/mkl-2025.3.0-hac47afa_455.conda sha256: b2b4c84b95210760e4d12319416c60ab66e03674ccdcbd14aeb59f82ebb1318d md5: fd05d1e894497b012d05a804232254ed @@ -10560,6 +14090,21 @@ packages: purls: [] size: 99997309 timestamp: 1774449747739 +- conda: https://conda.anaconda.org/conda-forge/win-64/mkl-2026.0.0-hac47afa_908.conda + sha256: f997bfc9bc4d4e14261cdcd1ad195d64a72ee44dca3145d24c1349f8d1311aa5 + md5: 36ea6e1292e9d5e89374201da79646ef + depends: + - llvm-openmp >=22.1.5 + - onemkl-license 2026.0.0 h57928b3_908 + - tbb >=2023.0.0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: LicenseRef-IntelSimplifiedSoftwareOct2022 + license_family: Proprietary + purls: [] + size: 114354729 + timestamp: 1779293121860 - conda: https://conda.anaconda.org/conda-forge/linux-64/ml_dtypes-0.5.4-np2py314h6477eea_1.conda sha256: bf58f5b2d89958e8880cfde4e5e3d86f230485c5f5f1043fc47a56656f9655c6 md5: af93de29d470abbe21a6adc2ec58516e @@ -10623,6 +14168,19 @@ packages: license_family: LGPL size: 116777 timestamp: 1725629179524 +- conda: https://conda.anaconda.org/conda-forge/linux-64/mpc-1.4.0-he0a73b1_0.conda + sha256: c1fdeebc9f8e4f51df265efca4ea20c7a13911193cc255db73cccb6e422ae486 + md5: 770d00bf57b5599c4544d61b61d8c6c6 + depends: + - __glibc >=2.17,<3.0.a0 + - gmp >=6.3.0,<7.0a0 + - libgcc >=14 + - mpfr >=4.2.2,<5.0a0 + license: LGPL-3.0-or-later + license_family: LGPL + purls: [] + size: 100245 + timestamp: 1774472435333 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mpc-1.3.1-h783934e_1.conda sha256: b5b674f496ed28c0b2d08533c6f11eaf1840bf7d9c830655f51514f2f9d9a9c8 md5: d3758cd24507dc1bda3483ce051d48ac @@ -10634,6 +14192,18 @@ packages: license_family: LGPL size: 132799 timestamp: 1725629168783 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mpc-1.4.0-he6dc3fb_0.conda + sha256: 69f25e0c9ce2827097549a74a83f2c31c9c40fa4a668a4db96462e5a7eeb9634 + md5: 
b3aa59caa59a7b0288a21b097f8b6bc4 + depends: + - gmp >=6.3.0,<7.0a0 + - libgcc >=14 + - mpfr >=4.2.2,<5.0a0 + license: LGPL-3.0-or-later + license_family: LGPL + purls: [] + size: 121080 + timestamp: 1774472380541 - conda: https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda sha256: f25d2474dd557ca66c6231c8f5ace5af312efde1ba8290a6ea5e1732a4e669c0 md5: 2eeb50cab6652538eee8fc0bc3340c81 @@ -10645,6 +14215,18 @@ packages: license_family: LGPL size: 634751 timestamp: 1725746740014 +- conda: https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.2-he0a73b1_0.conda + sha256: 8690f550a780f75d9c47f7ffc15f5ff1c149d36ac17208e50eda101ca16611b9 + md5: 85ce2ffa51ab21da5efa4a9edc5946aa + depends: + - __glibc >=2.17,<3.0.a0 + - gmp >=6.3.0,<7.0a0 + - libgcc >=14 + license: LGPL-3.0-only + license_family: LGPL + purls: [] + size: 730422 + timestamp: 1773413915171 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mpfr-4.2.1-h2305555_3.conda sha256: abb35c37de2ec6c9ee89995142b1cfea9e6547202ba5578e5307834eca6d436f md5: 65b21e8d5f0ec6a2f7e87630caed3318 @@ -10655,6 +14237,17 @@ packages: license_family: LGPL size: 1841314 timestamp: 1725746723157 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mpfr-4.2.2-h3faef18_0.conda + sha256: ca2c993ad80a54f3f13b6c7857f17301acaf30b48bb1c455d890f596892417f7 + md5: 0fa4a1bcdb9e3224ab97b966d27e4949 + depends: + - gmp >=6.3.0,<7.0a0 + - libgcc >=14 + license: LGPL-3.0-only + license_family: LGPL + purls: [] + size: 1933306 + timestamp: 1773413839223 - conda: https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.9-hc50e24c_0.conda sha256: 39c4700fb3fbe403a77d8cc27352fa72ba744db487559d5d44bf8411bb4ea200 md5: c7f302fd11eeb0987a6a5e1f3aed6a21 @@ -10664,6 +14257,7 @@ packages: - libstdcxx >=13 license: LGPL-2.1-only license_family: LGPL + purls: [] size: 491140 timestamp: 1730581373280 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mpg123-1.32.9-h65af167_0.conda @@ -10674,6 +14268,7 @@ 
packages: - libstdcxx >=13 license: LGPL-2.1-only license_family: LGPL + purls: [] size: 558708 timestamp: 1730581372400 - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.3.0-pyhd8ed1ab_1.conda @@ -10685,6 +14280,17 @@ packages: license_family: BSD size: 439705 timestamp: 1733302781386 +- conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda + sha256: 5bbf2f8179ec43d34d67ca8e4989d216c1bdb4b749fe6cb40e86ebf88c1b5300 + md5: 2e81b32b805f406d23ba61938a184081 + depends: + - python >=3.10 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/mpmath?source=hash-mapping + size: 464918 + timestamp: 1773662068273 - conda: https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-1.1.2-py314h9891dd4_1.conda sha256: d41c2734d314303e329680aeef282766fe399a0ce63297a68a2f8f9b43b1b68a md5: c6752022dcdbf4b9ef94163de1ab7f03 @@ -10834,6 +14440,19 @@ packages: license_family: BSD size: 221809897 timestamp: 1770778626119 +- conda: https://conda.anaconda.org/conda-forge/linux-64/nccl-2.30.7.1-h1aa9b5a_0.conda + sha256: f92f617b266c24ce25766750ee45e326cbf572a97af41ca6ccb2d140f6c3859c + md5: d09e75d1fb0481ad255c74a23506696c + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13,<14.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 239421791 + timestamp: 1781141768144 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nccl-2.29.3.1-h7d52dd6_0.conda sha256: 46facf5f8442e407d4953ad993a5e16c4929d3a8f1d25eb5b433f3777761b2cf md5: 2c5a62a7e72792a3af760f7016c3871c @@ -10847,6 +14466,20 @@ packages: license_family: BSD size: 271172034 timestamp: 1770779652233 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nccl-2.30.7.1-h2b99535_0.conda + sha256: 9a3b9fc08e42350eb1228bd17db136d063f9fedd388316eabd2af3e99ab732b7 + md5: f963884eca3abe4251b3d9dd40c78109 + depends: + - __glibc >=2.28,<3.0.a0 + - arm-variant * sbsa + - cuda-version >=13,<14.0a0 + - libgcc 
>=14 + - libstdcxx >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 291132360 + timestamp: 1781142421045 - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 md5: 47e340acb35de30501a76c7c799c41d7 @@ -10857,6 +14490,16 @@ packages: purls: [] size: 891641 timestamp: 1738195959188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + sha256: fc89f74bbe362fb29fa3c037697a89bec140b346a2469a90f7936d1d7ea4d8a3 + md5: fc21868a1a5aacc937e7a18747acb8a5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: X11 AND BSD-3-Clause + purls: [] + size: 918956 + timestamp: 1777422145199 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.5-ha32ae93_3.conda sha256: 91cfb655a68b0353b2833521dc919188db3d8a7f4c64bea2c6a7557b24747468 md5: 182afabe009dc78d8b73100255ee6868 @@ -10866,6 +14509,15 @@ packages: purls: [] size: 926034 timestamp: 1738196018799 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.6-hf8d1292_0.conda + sha256: 369db85c5cd8d99dde364ce70725d76511d9c8199e5b820c740414091bf5bcca + md5: b2a43456aa56fe80c2477a5094899eff + depends: + - libgcc >=14 + license: X11 AND BSD-3-Clause + purls: [] + size: 960036 + timestamp: 1777422174534 - conda: https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.6.0-pyhd8ed1ab_1.conda sha256: bb7b21d7fd0445ddc0631f64e66d91a179de4ba920b8381f29b9d006a42788c0 md5: 598fd7d4d0de2455fb74f56063969a97 @@ -10890,6 +14542,8 @@ packages: - pandas >=2.0 license: BSD-3-Clause license_family: BSD + purls: + - pkg:pypi/networkx?source=hash-mapping size: 1587439 timestamp: 1765215107045 - conda: https://conda.anaconda.org/conda-forge/noarch/nomkl-1.0-h5ca1d4c_0.tar.bz2 @@ -10899,6 +14553,7 @@ packages: - mkl <0.a0 license: BSD-3-Clause license_family: BSD + purls: [] size: 3843 timestamp: 1582593857545 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py314h2b28147_1.conda @@ -10939,6 +14594,26 @@ packages: - pkg:pypi/numpy?source=hash-mapping size: 8927860 timestamp: 1773839233468 +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.5.0-py314h2b28147_0.conda + sha256: bbc665584886c90daf3f33cfbf665f279cf91d4bd5323f0432c16d2bf4d525e7 + md5: bdb21d2b990f9d3aee10fd43aca851fe + depends: + - python + - libgcc >=14 + - libstdcxx >=14 + - __glibc >=2.17,<3.0.a0 + - libcblas >=3.9.0,<4.0a0 + - libblas >=3.9.0,<4.0a0 + - python_abi 3.14.* *_cp314 + - liblapack >=3.9.0,<4.0a0 + constrains: + - numpy-base <0a0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/numpy?source=compressed-mapping + size: 9075918 + timestamp: 1782112541752 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/numpy-2.4.2-py314haac167e_1.conda sha256: 1e1366e700156cbddc4daae0fec34a72b74105ba45f9c144f777120552924747 md5: 98ef547c85356475adb2197965c716b6 @@ -10977,6 +14652,25 @@ packages: - pkg:pypi/numpy?source=hash-mapping size: 8008045 timestamp: 1773839355275 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/numpy-2.5.0-py314he1698a1_0.conda + sha256: 8677e6bd3a1a95f8ecf2b0f1ca39f30f55b1aa0865b217bb3cb55b29d3e092fa + md5: 10165160938f6498096bda3e0ff051ac + depends: + - python + - libstdcxx >=14 + - libgcc >=14 + - libcblas >=3.9.0,<4.0a0 + - libblas >=3.9.0,<4.0a0 + - python_abi 3.14.* *_cp314 + - liblapack >=3.9.0,<4.0a0 + constrains: + - numpy-base <0a0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/numpy?source=hash-mapping + size: 8158865 + timestamp: 1782112546539 - conda: https://conda.anaconda.org/conda-forge/win-64/numpy-2.4.2-py314h06c3c77_1.conda sha256: 34fc25b81cfa987e1825586ddb1a4ac76a246fdef343c9171109017674ad6503 md5: 2fccd2c4e9feb4e4c2a90043015525d6 @@ -11015,6 +14709,26 @@ packages: - pkg:pypi/numpy?source=hash-mapping size: 7311362 timestamp: 1773839141373 +- conda: 
https://conda.anaconda.org/conda-forge/win-64/numpy-2.5.0-py314h02f10f6_0.conda + sha256: 86c3e926fa1d6f27ebe6b9db11ff12e9a3b6e4b0343bf4a9b489dafd9614da3f + md5: f92585b1624ecdd117b6d13fd4d691ed + depends: + - python + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + - ucrt >=10.0.20348.0 + - liblapack >=3.9.0,<4.0a0 + - libcblas >=3.9.0,<4.0a0 + - libblas >=3.9.0,<4.0a0 + - python_abi 3.14.* *_cp314 + constrains: + - numpy-base <0a0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/numpy?source=compressed-mapping + size: 7436159 + timestamp: 1782112573833 - conda: https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.10.0-pyhcf101f3_0.conda sha256: 482d94fce136c4352b18c6397b9faf0a3149bfb12499ab1ffebad8db0cb6678f md5: 3aa4b625f20f55cf68e92df5e5bf3c39 @@ -11037,6 +14751,33 @@ packages: - sphinx>=7.1 - pydata-sphinx-theme>=0.15 requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/20/77/a2b64335bab7c75fe1c054cc4ebe2d3b3234cbdb04d2e1d6ca73551c54f5/nvtx-0.2.15-cp314-cp314-win_amd64.whl + name: nvtx + version: 0.2.15 + sha256: 9934fad0b441cfa6e896a848b092498ba23e2ff205c2b9a7b60520ff8367ffef + requires_dist: + - pytest ; extra == 'test' + - setuptools ; extra == 'test' + - sphinx ; extra == 'docs' + - nvidia-sphinx-theme ; extra == 'docs' +- pypi: https://files.pythonhosted.org/packages/e0/5b/ca0ba6fa769d08174b7a5b4775c279e2e26611cdd5e7833aa699187871c7/nvtx-0.2.15-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl + name: nvtx + version: 0.2.15 + sha256: b5171b8283dd3ea9ae688a86d16901b4c2c142c4eb0a4bdbf6c222f5f67f9524 + requires_dist: + - pytest ; extra == 'test' + - setuptools ; extra == 'test' + - sphinx ; extra == 'docs' + - nvidia-sphinx-theme ; extra == 'docs' +- pypi: https://files.pythonhosted.org/packages/f7/e1/e02fafc01c18f1868a2d2c030953f49e38d65f2d95884789a6c46ff308f1/nvtx-0.2.15-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + name: nvtx + version: 
0.2.15 + sha256: 3c6d0f27d4f8a2f479eb64a6b842c13aee32120348a1715d995b9bb9f75b35cf + requires_dist: + - pytest ; extra == 'test' + - setuptools ; extra == 'test' + - sphinx ; extra == 'docs' + - nvidia-sphinx-theme ; extra == 'docs' - conda: https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.3-hb9d3cd8_0.conda sha256: 2254dae821b286fb57c61895f2b40e3571a070910fdab79a948ff703e1ea807b md5: 56f8947aa9d5cf37b0b3d43b83f34192 @@ -11048,6 +14789,57 @@ packages: license_family: BSD size: 106742 timestamp: 1743700382939 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.4-hb03c661_1.conda + sha256: 75f3bf733523a338f73d6c276c4a26634877cd970edb558f2769d9fa52b100a9 + md5: c2871ba95727fd1382c05db66048b64c + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - opencl-headers >=2025.6.13 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 109598 + timestamp: 1780362789611 +- conda: https://conda.anaconda.org/conda-forge/linux-64/onednn-3.12-omp_h83de36e_0.conda + sha256: 0555c7f54e7192b30412cdb462adcf2151153c03fc9f20c0d6846a9381efea56 + md5: 1edfb47e2c1cce4978bbebc467999977 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 13069211 + timestamp: 1779565995400 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/onednn-3.12-omp_h605b386_0.conda + sha256: ff6c1d53eaa1221a46bb77ac871dc8eea8ef070fb975ce9810329a28d65b523e + md5: 365b9ebd06388b4c7647b4b477cde089 + depends: + - libgcc >=14 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 7480320 + timestamp: 1779566014380 +- conda: https://conda.anaconda.org/conda-forge/linux-64/onemkl-license-2026.0.0-ha770c72_915.conda + sha256: 80008386bb19f8dffc8873d6c1c16f22bb63f19c960d774b647b9a01e99ad624 + md5: 0f40953c960dc51ed18611a48f4b22a0 + license: LicenseRef-IntelSimplifiedSoftwareOct2022 + license_family: Proprietary + purls: [] + size: 39966 + timestamp: 
1781016460562 +- conda: https://conda.anaconda.org/conda-forge/win-64/onemkl-license-2026.0.0-h57928b3_908.conda + sha256: 42ad15cbb3bf31830efa04d4b86dd2d5c0dd590c86f98adcd3c8c1f75acf5dd5 + md5: 9c9303e08b50e09f5c23e1dac99d0936 + license: LicenseRef-IntelSimplifiedSoftwareOct2022 + license_family: Proprietary + purls: [] + size: 41580 + timestamp: 1779292867015 - conda: https://conda.anaconda.org/conda-forge/linux-64/opencl-headers-2025.06.13-h5888daf_0.conda sha256: 2b6ce54174ec19110e1b3c37455f7cd138d0e228a75727a9bba443427da30a36 md5: 45c3d2c224002d6d0d7769142b29f986 @@ -11059,6 +14851,18 @@ packages: license_family: APACHE size: 55357 timestamp: 1749853464518 +- conda: https://conda.anaconda.org/conda-forge/linux-64/opencl-headers-2025.06.13-hecca717_0.conda + sha256: 8de2f0cd8a659b01abf86e7fbb8cea4f28ada62fd288429a2bbc040db1b98dd0 + md5: c930c8052d780caa41216af7de472226 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 55754 + timestamp: 1773844383536 - conda: https://conda.anaconda.org/conda-forge/linux-64/openh264-2.6.0-hc22cd8d_0.conda sha256: 3f231f2747a37a58471c82a9a8a80d92b7fece9f3fce10901a5ac888ce00b747 md5: b28cf020fd2dead0ca6d113608683842 @@ -11068,6 +14872,7 @@ packages: - libstdcxx >=13 license: BSD-2-Clause license_family: BSD + purls: [] size: 731471 timestamp: 1739400677213 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openh264-2.6.0-h0564a2a_0.conda @@ -11078,6 +14883,7 @@ packages: - libstdcxx >=13 license: BSD-2-Clause license_family: BSD + purls: [] size: 774512 timestamp: 1739400731652 - conda: https://conda.anaconda.org/conda-forge/win-64/openh264-2.6.0-hb17fa0b_0.conda @@ -11089,8 +14895,38 @@ packages: - vc14_runtime >=14.29.30139 license: BSD-2-Clause license_family: BSD + purls: [] size: 411269 timestamp: 1739401120354 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.4-h55fea9a_0.conda + sha256: 
3900f9f2dbbf4129cf3ad6acf4e4b6f7101390b53843591c53b00f034343bc4d + md5: 11b3379b191f63139e29c0d19dee24cd + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libpng >=1.6.50,<1.7.0a0 + - libstdcxx >=14 + - libtiff >=4.7.1,<4.8.0a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 355400 + timestamp: 1758489294972 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openjpeg-2.5.4-h5da879a_0.conda + sha256: bd1bc8bdde5e6c5cbac42d462b939694e40b59be6d0698f668515908640c77b8 + md5: cea962410e327262346d48d01f05936c + depends: + - libgcc >=14 + - libpng >=1.6.50,<1.7.0a0 + - libstdcxx >=14 + - libtiff >=4.7.1,<4.8.0a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 392636 + timestamp: 1758489353577 - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda sha256: 44c877f8af015332a5d12f5ff0fb20ca32f896526a7d0cdb30c769df1144fb5c md5: f61eb8cd60ff9057122a3d338b99c00f @@ -11103,6 +14939,18 @@ packages: purls: [] size: 3164551 timestamp: 1769555830639 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.3-h35e630c_0.conda + sha256: d48f5c22b9897c01e4dff3680f1f57ceb02711ab9c62f74339b080419dfad34b + md5: 79dd2074b5cd5c5c6b2930514a11e22d + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 3159683 + timestamp: 1781069855778 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.1-h546c87b_1.conda sha256: 7f8048c0e75b2620254218d72b4ae7f14136f1981c5eb555ef61645a9344505f md5: 25f5885f11e8b1f075bccf4a2da91c60 @@ -11114,6 +14962,17 @@ packages: purls: [] size: 3692030 timestamp: 1769557678657 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.3-h546c87b_0.conda + sha256: da4a5df42614166b69c2f6d8602fc1425f7aaa699f77c3bafb5c7fe69b3d9fb7 + md5: fa6260b3e6eababf6ca85a7eb3336383 + depends: + - ca-certificates + - 
libgcc >=14 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 3704664 + timestamp: 1781069675555 - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.6.1-hf411b9b_1.conda sha256: 53a5ad2e5553b8157a91bb8aa375f78c5958f77cb80e9d2ce59471ea8e5c0bd6 md5: eb585509b815415bc964b2c7e11c7eb3 @@ -11127,6 +14986,19 @@ packages: purls: [] size: 9343023 timestamp: 1769557547888 +- conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.6.3-hf411b9b_0.conda + sha256: cb6e7ba0d010ee0d3249ce9886de3d7613d26d9965d4c95666fa66b9c4c31001 + md5: e99f95734a326c0fd4d02bbd995150d4 + depends: + - ca-certificates + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 9414790 + timestamp: 1781071745579 - conda: https://conda.anaconda.org/conda-forge/linux-64/optree-0.19.0-py314h9891dd4_0.conda sha256: 620379ebc27e1c43b9a8defdb167442a3413de949a464305443833db32ba7a83 md5: e13172f02effa3c9f07571ed0ddef44d @@ -11141,6 +15013,22 @@ packages: license_family: Apache size: 504345 timestamp: 1771868359859 +- conda: https://conda.anaconda.org/conda-forge/linux-64/optree-0.19.1-py314h9891dd4_0.conda + sha256: 0bc01fdc2dccad1a38f680249414c0f6a006ce3bd3c3043bde89711ec7b3d074 + md5: 44ffc8b345a7844a847d4fdf469d64ea + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + - typing-extensions >=4.12 + license: Apache-2.0 + license_family: Apache + purls: + - pkg:pypi/optree?source=hash-mapping + size: 513161 + timestamp: 1778047690925 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/optree-0.19.0-py314hd7d8586_0.conda sha256: 78deba0984ab747179c1aa87f024d6597ecfba75378c9b4601046d9c8ab59956 md5: 214ab44a77f6135a6b0178c1c9cb5149 @@ -11155,6 +15043,22 @@ packages: license_family: Apache size: 467867 timestamp: 1771868413266 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/optree-0.19.1-py314hd7d8586_0.conda + sha256: 4c255c7f435badb89df01c9eba8d9e9df0b6f8d36fe85abe1da54b81aaf04ff3 + md5: 1b70477fda9714b24e56bf5691dd8cdc + depends: + - libgcc >=14 + - libstdcxx >=14 + - python >=3.14,<3.15.0a0 + - python >=3.14,<3.15.0a0 *_cp314 + - python_abi 3.14.* *_cp314 + - typing-extensions >=4.12 + license: Apache-2.0 + license_family: Apache + purls: + - pkg:pypi/optree?source=hash-mapping + size: 477463 + timestamp: 1778047736212 - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 md5: b76541e68fea4d511b1ac46a28dcd2c6 @@ -11167,6 +15071,18 @@ packages: - pkg:pypi/packaging?source=compressed-mapping size: 72010 timestamp: 1769093650580 +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda + sha256: 3906abfb6511a3bb309e39b9b1b7bc38f50a723971de2395489fd1f379255890 + md5: 4c06a92e74452cfa53623a81592e8934 + depends: + - python >=3.8 + - python + license: Apache-2.0 + license_family: APACHE + purls: + - pkg:pypi/packaging?source=hash-mapping + size: 91574 + timestamp: 1777103621679 - conda: https://conda.anaconda.org/conda-forge/linux-64/pango-1.56.4-hadf4263_0.conda sha256: 3613774ad27e48503a3a6a9d72017087ea70f1426f6e5541dbdb59a3b626eaaf md5: 79f71230c069a287efe3a8614069ddf1 @@ -11187,6 +15103,47 @@ packages: license: LGPL-2.1-or-later size: 455420 timestamp: 1751292466873 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pango-1.56.4-hda50119_1.conda + sha256: 315b52bfa6d1a820f4806f6490d472581438a28e21df175290477caec18972b0 + md5: d53ffc0edc8eabf4253508008493c5bc + depends: + - __glibc >=2.17,<3.0.a0 + - cairo >=1.18.4,<2.0a0 + - fontconfig >=2.17.1,<3.0a0 + - fonts-conda-ecosystem + - fribidi >=1.0.16,<2.0a0 + - harfbuzz >=13.2.1 + - libexpat >=2.7.4,<3.0a0 + - libfreetype >=2.14.2 + - libfreetype6 >=2.14.2 + - libgcc >=14 + - libglib 
>=2.86.4,<3.0a0 + - libpng >=1.6.55,<1.7.0a0 + - libzlib >=1.3.2,<2.0a0 + license: LGPL-2.1-or-later + purls: [] + size: 458036 + timestamp: 1774281947855 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pango-1.56.4-h8547ced_1.conda + sha256: d209c8b0d53c441ee0bc0d8fce0fcae8e7e05755e51b13b6b9da02c7aa032f98 + md5: 3fc7cc25bba3381e77b753578058e3b0 + depends: + - cairo >=1.18.4,<2.0a0 + - fontconfig >=2.17.1,<3.0a0 + - fonts-conda-ecosystem + - fribidi >=1.0.16,<2.0a0 + - harfbuzz >=13.2.0 + - libexpat >=2.7.4,<3.0a0 + - libfreetype >=2.14.2 + - libfreetype6 >=2.14.2 + - libgcc >=14 + - libglib >=2.86.4,<3.0a0 + - libpng >=1.6.55,<1.7.0a0 + - libzlib >=1.3.2,<2.0a0 + license: LGPL-2.1-or-later + purls: [] + size: 470441 + timestamp: 1774284032397 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pango-1.56.4-he55ef5b_0.conda sha256: dd36cd5b6bc1c2988291a6db9fa4eb8acade9b487f6f1da4eaa65a1eebb0a12d md5: a22cc88bf6059c9bcc158c94c9aab5b8 @@ -11227,6 +15184,28 @@ packages: license: LGPL-2.1-or-later size: 454854 timestamp: 1751292618315 +- conda: https://conda.anaconda.org/conda-forge/win-64/pango-1.56.4-h13911b6_1.conda + sha256: 3d4e6e541e633f6fd22fc2c1d79ad5ec39503dea3ba04fc3e01d5be904ec7cea + md5: 1f1cf3772ba7d4eef989e4679ddf97f7 + depends: + - cairo >=1.18.4,<2.0a0 + - fontconfig >=2.17.1,<3.0a0 + - fonts-conda-ecosystem + - fribidi >=1.0.16,<2.0a0 + - harfbuzz >=13.2.1 + - libexpat >=2.7.4,<3.0a0 + - libfreetype >=2.14.2 + - libfreetype6 >=2.14.2 + - libglib >=2.86.4,<3.0a0 + - libpng >=1.6.55,<1.7.0a0 + - libzlib >=1.3.2,<2.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + license: LGPL-2.1-or-later + purls: [] + size: 454919 + timestamp: 1774282149607 - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.6-pyhcf101f3_0.conda sha256: 42b2d77ccea60752f3aa929a6413a7835aaacdbbde679f2f5870a744fa836b94 md5: 97c1ce2fffa1209e7afb432810ec6e12 @@ -11249,6 +15228,7 @@ packages: - libzlib >=1.3.1,<2.0a0 
license: BSD-3-Clause license_family: BSD + purls: [] size: 1222481 timestamp: 1763655398280 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pcre2-10.47-hf841c20_0.conda @@ -11260,6 +15240,7 @@ packages: - libzlib >=1.3.1,<2.0a0 license: BSD-3-Clause license_family: BSD + purls: [] size: 1166552 timestamp: 1763655534263 - conda: https://conda.anaconda.org/conda-forge/win-64/pcre2-10.47-hd2b5f0e_0.conda @@ -11273,6 +15254,7 @@ packages: - vc14_runtime >=14.44.35208 license: BSD-3-Clause license_family: BSD + purls: [] size: 995992 timestamp: 1763655708300 - conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda @@ -11283,9 +15265,55 @@ packages: - python >=3.9 license: ISC purls: - - pkg:pypi/pexpect?source=hash-mapping - size: 53561 - timestamp: 1733302019362 + - pkg:pypi/pexpect?source=hash-mapping + size: 53561 + timestamp: 1733302019362 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-12.2.0-py314h8ec4b1a_0.conda + sha256: 123d8a7c16c88658b4f29e9f115a047598c941708dade74fbaff373a32dbec5e + md5: 76c4757c0ec9d11f969e8eb44899307b + depends: + - python + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - libtiff >=4.7.1,<4.8.0a0 + - openjpeg >=2.5.4,<3.0a0 + - libxcb >=1.17.0,<2.0a0 + - libwebp-base >=1.6.0,<2.0a0 + - zlib-ng >=2.3.3,<2.4.0a0 + - libjpeg-turbo >=3.1.2,<4.0a0 + - python_abi 3.14.* *_cp314 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + - lcms2 >=2.18,<3.0a0 + - tk >=8.6.13,<8.7.0a0 + license: HPND + purls: + - pkg:pypi/pillow?source=hash-mapping + size: 1082797 + timestamp: 1775060059882 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-12.2.0-py314hac3e5ec_0.conda + sha256: 96b26c2657275ffe84ab510edf0865e21999d791485d12794edd4a71b837beb6 + md5: 87d58d103b47c4a8567b3d7666647684 + depends: + - python + - libgcc >=14 + - python 3.14.* *_cp314 + - openjpeg >=2.5.4,<3.0a0 + - libxcb >=1.17.0,<2.0a0 + - libwebp-base >=1.6.0,<2.0a0 + - zlib-ng >=2.3.3,<2.4.0a0 + - python_abi 
3.14.* *_cp314 + - lcms2 >=2.18,<3.0a0 + - tk >=8.6.13,<8.7.0a0 + - libtiff >=4.7.1,<4.8.0a0 + - libjpeg-turbo >=3.1.2,<4.0a0 + - libfreetype >=2.14.3 + - libfreetype6 >=2.14.3 + license: HPND + purls: + - pkg:pypi/pillow?source=hash-mapping + size: 1062080 + timestamp: 1775060067775 - conda: https://conda.anaconda.org/conda-forge/noarch/pip-26.0.1-pyh145f28c_0.conda sha256: 5f66ea31d62188c266c5a8752119b0cc90a5bf05963f665cf48a33e0ec58d39c md5: 09a970fbf75e8ed1aa633827ded6aa4f @@ -11307,6 +15335,7 @@ packages: - __glibc >=2.17,<3.0.a0 license: MIT license_family: MIT + purls: [] size: 450960 timestamp: 1754665235234 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pixman-0.46.4-h7ac5ae9_1.conda @@ -11318,6 +15347,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 357913 timestamp: 1754665583353 - conda: https://conda.anaconda.org/conda-forge/win-64/pixman-0.46.4-h5112557_1.conda @@ -11332,6 +15362,7 @@ packages: - ucrt >=10.0.20348.0 license: MIT license_family: MIT + purls: [] size: 542795 timestamp: 1754665193489 - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.4-pyhcf101f3_0.conda @@ -11423,6 +15454,7 @@ packages: - libgcc >=13 license: MIT license_family: MIT + purls: [] size: 8252 timestamp: 1726802366959 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pthread-stubs-0.4-h86ecc28_1002.conda @@ -11432,6 +15464,7 @@ packages: - libgcc >=13 license: MIT license_family: MIT + purls: [] size: 8342 timestamp: 1726803319942 - conda: https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd8ed1ab_1.conda @@ -11453,6 +15486,7 @@ packages: - libstdcxx >=13 license: MIT license_family: MIT + purls: [] size: 118488 timestamp: 1736601364156 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pugixml-1.15-h6ef32b0_0.conda @@ -11463,6 +15497,7 @@ packages: - libstdcxx >=13 license: MIT license_family: MIT + purls: [] size: 113424 timestamp: 1737355438448 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/pulseaudio-client-17.0-h9a6aba3_3.conda @@ -11481,6 +15516,7 @@ packages: - pulseaudio 17.0 *_3 license: LGPL-2.1-or-later license_family: LGPL + purls: [] size: 750785 timestamp: 1763148198088 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pulseaudio-client-17.0-hcf98165_3.conda @@ -11498,6 +15534,7 @@ packages: - pulseaudio 17.0 *_3 license: LGPL-2.1-or-later license_family: LGPL + purls: [] size: 760306 timestamp: 1763148231117 - conda: https://conda.anaconda.org/conda-forge/noarch/pure_eval-0.2.3-pyhd8ed1ab_1.conda @@ -11533,11 +15570,27 @@ packages: license_family: BSD size: 232875 timestamp: 1755953378112 +- conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-3.0.3-pyhfe8187e_0.conda + sha256: 71a9524f44d6ac6304feae71e2bbe8d8ce0816f0be7a0271c15681ad1040965d + md5: e0f4549ccb507d4af8ed5c5345210673 + depends: + - python >=3.8 + - pybind11-global ==3.0.3 *_0 + - python + constrains: + - pybind11-abi ==11 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/pybind11?source=hash-mapping + size: 247963 + timestamp: 1775004608640 - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-abi-11-hc364b38_1.conda sha256: 9e7fe12f727acd2787fb5816b2049cef4604b7a00ad3e408c5e709c298ce8bf1 md5: f0599959a2447c1e544e216bddf393fa license: BSD-3-Clause license_family: BSD + purls: [] size: 14671 timestamp: 1752769938071 - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-global-3.0.1-pyhc7ab6ef_0.conda @@ -11553,6 +15606,21 @@ packages: license_family: BSD size: 228871 timestamp: 1755953338243 +- conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-global-3.0.3-pyh648e204_0.conda + sha256: 97a0fbd2a81d95e90d714e5c628fe860b29a3caad53abcfb90add1965ad85bef + md5: 7fdc3e18c14b862ae5f064c1ea8e2636 + depends: + - python >=3.8 + - __unix + - python + constrains: + - pybind11-abi ==11 + license: BSD-3-Clause + license_family: BSD + purls: + - 
pkg:pypi/pybind11-global?source=hash-mapping + size: 243898 + timestamp: 1775004520432 - conda: https://conda.anaconda.org/conda-forge/noarch/pyclibrary-0.2.2-pyhd8ed1ab_1.conda sha256: 210a7beee6dce5e57d4d4166b6fd93693ede3e213510efa7373103f10c18d057 md5: 0cda5dbfd261b08292fcf16429662b0a @@ -11575,6 +15643,18 @@ packages: license_family: BSD size: 110100 timestamp: 1733195786147 +- conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-3.0-pyhcf101f3_0.conda + sha256: e27e0473fc6723311a0bd48b89b616fa1b996a2f7a2b555338cbbcfb9c640568 + md5: 9c5491066224083c41b6d5635ed7107b + depends: + - python >=3.10 + - python + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/pycparser?source=compressed-mapping + size: 55886 + timestamp: 1779293633166 - conda: https://conda.anaconda.org/conda-forge/noarch/pydata-sphinx-theme-0.17.0-pyhcf101f3_0.conda sha256: 03ae7063dd18f070cf28a441dd86ea476c20ff7fc174d8365a476a650a6ae20f md5: c09bb5f9960ff1cd334c5573b5ad79c2 @@ -11605,6 +15685,19 @@ packages: license_family: BSD size: 725938 timestamp: 1770169149613 +- conda: https://conda.anaconda.org/conda-forge/noarch/pyglet-2.1.14-pyhd8ed1ab_0.conda + sha256: 9436a5bdcf63c215a9b4c0bc7be4ba16dae43a714cd15c2133ca851c588279ac + md5: 85911d1b0c7ffe28189b6d461915ad9c + depends: + - ffmpeg >=4.0.0 + - freetype + - python >=3.10 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/pyglet?source=hash-mapping + size: 728286 + timestamp: 1775384075243 - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a md5: 6b6ece66ebcae2d5f326c77ef2c5a066 @@ -11622,7 +15715,7 @@ packages: license: BSD-2-Clause license_family: BSD purls: - - pkg:pypi/pygments?source=compressed-mapping + - pkg:pypi/pygments?source=hash-mapping size: 893031 timestamp: 1774796815820 - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda @@ -11683,6 
+15776,27 @@ packages: - pkg:pypi/pytest?source=hash-mapping size: 299581 timestamp: 1765062031645 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda + sha256: 430051d80765207a7d782b2b188230ba1489d35c6e75fd9903f76cb9fda4af16 + md5: 64c98a12c4e23eb238bf66bbecafdf3c + depends: + - colorama + - pygments >=2.7.2 + - python >=3.10 + - iniconfig >=1.0.1 + - packaging >=22 + - pluggy >=1.5,<2 + - tomli >=1 + - exceptiongroup >=1 + - python + constrains: + - pytest-faulthandler >=2 + license: MIT + license_family: MIT + purls: + - pkg:pypi/pytest?source=compressed-mapping + size: 306724 + timestamp: 1782127176429 - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-benchmark-5.2.3-pyhd8ed1ab_0.conda sha256: 2f2229415a6e5387c1faaedf442ea8c07471cb2bf5ad1007b9cfb83ea85ca29a md5: 0e7294ed4af8b833fcd2c101d647c3da @@ -11754,6 +15868,34 @@ packages: size: 36702440 timestamp: 1770675584356 python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.6-habeac84_100_cp314.conda + build_number: 100 + sha256: 6d28ac2b061179deb434d3d57afa98ffd20ec3c5d44ab8048a1ca33424b22d38 + md5: 0b9b2f83b5b600e1ac38becde8d0dd44 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.8.1,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.3,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.53.2,<4.0a0 + - libuuid >=2.42.1,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses >=6.6,<7.0a0 + - openssl >=3.5.7,<4.0a0 + - python_abi 3.14.* *_cp314 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - zstd >=1.5.7,<1.6.0a0 + license: Python-2.0 + purls: [] + size: 36717183 + timestamp: 1781255094700 + python_site_packages_path: lib/python3.14/site-packages - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.14.3-hb06a95a_101_cp314.conda build_number: 101 sha256: 
87e9dff5646aba87cecfbc08789634c855871a7325169299d749040b0923a356 @@ -11781,6 +15923,33 @@ packages: size: 37305578 timestamp: 1770674395875 python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.14.6-hc679e19_100_cp314.conda + build_number: 100 + sha256: dd56fd95db3cb49a69fbe41df80afc8bd5214daa829bcd3930de80f0408ba5eb + md5: 416c74941d13d9f2b9e68b1a900f7f50 + depends: + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-aarch64 >=2.36.1 + - libexpat >=2.8.1,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.3,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.53.2,<4.0a0 + - libuuid >=2.42.1,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses >=6.6,<7.0a0 + - openssl >=3.5.7,<4.0a0 + - python_abi 3.14.* *_cp314 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - zstd >=1.5.7,<1.6.0a0 + license: Python-2.0 + purls: [] + size: 34900936 + timestamp: 1781254861576 + python_site_packages_path: lib/python3.14/site-packages - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.14.3-h4b44e0e_101_cp314.conda build_number: 101 sha256: 3f99d83bfd95b9bdae64a42a1e4bf5131dc20b724be5ac8a9a7e1ac2c0f006d7 @@ -11806,6 +15975,31 @@ packages: size: 18273230 timestamp: 1770675442998 python_site_packages_path: Lib/site-packages +- conda: https://conda.anaconda.org/conda-forge/win-64/python-3.14.6-h4b44e0e_100_cp314.conda + build_number: 100 + sha256: f1acb89cb1a6bec9a94ae9f8e7411839de009cd64d3ac6a6aec4f3d8a481099a + md5: 8333e3ca6f8d1ebcd30b678dd53f0a25 + depends: + - bzip2 >=1.0.8,<2.0a0 + - libexpat >=2.8.1,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - liblzma >=5.8.3,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.53.2,<4.0a0 + - libzlib >=1.3.2,<2.0a0 + - openssl >=3.5.7,<4.0a0 + - python_abi 3.14.* *_cp314 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + - zstd >=1.5.7,<1.6.0a0 + license: Python-2.0 + purls: [] + size: 
18481352 + timestamp: 1781256034828 + python_site_packages_path: Lib/site-packages - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda sha256: d6a17ece93bbd5139e02d2bd7dbfa80bee1a4261dced63f65f679121686bf664 md5: 5b8d21249ff20967101ffa321cab24e8 @@ -11911,6 +16105,67 @@ packages: license_family: BSD size: 25484211 timestamp: 1772227839293 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.12.1-cuda130_mkl_py314_h5d99997_300.conda + sha256: dabbecc8dcf3af4718a4c5a80a5d44f280811873f27daa4b041f3ec20d8f533c + md5: b25f00231badfc906d431c515ce7ecc6 + depends: + - __cuda + - __glibc >=2.28,<3.0.a0 + - _openmp_mutex * *_llvm + - _openmp_mutex >=4.5 + - cuda-cudart >=13.0.96,<14.0a0 + - cuda-cupti >=13.0.85,<14.0a0 + - cuda-nvrtc >=13.0.88,<14.0a0 + - cuda-nvtx >=13.0.85,<14.0a0 + - cuda-version >=13.0,<14 + - filelock + - fmt >=12.1.0,<12.2.0a0 + - fsspec + - jinja2 + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libblas * *mkl + - libcblas >=3.11.0,<4.0a0 + - libcublas >=13.1.1.3,<14.0a0 + - libcudnn >=9.23.1.3,<10.0a0 + - libcudss >=0.8.0.10,<0.8.1.0a0 + - libcufft >=12.0.0.61,<13.0a0 + - libcufile >=1.15.1.6,<2.0a0 + - libcurand >=10.4.0.35,<11.0a0 + - libcusolver >=12.0.4.66,<13.0a0 + - libcusparse >=12.6.3.3,<13.0a0 + - libgcc >=14 + - libmagma >=2.10.0,<2.10.1.0a0 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + - libtorch 2.12.1 cuda130_mkl_h5535f43_300 + - libuv >=1.52.1,<2.0a0 + - libzlib >=1.3.2,<2.0a0 + - llvm-openmp >=22.1.8 + - mkl >=2026.0.0,<2027.0a0 + - nccl >=2.30.7.1,<3.0a0 + - networkx + - numpy >=1.23,<3 + - onednn >=3.12,<4.0a0 + - optree >=0.13.0 + - pybind11 + - pybind11-abi 11 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + - setuptools <82 + - sleef >=3.9.0,<4.0a0 + - sympy >=1.13.3 + - triton 3.7.1 + - typing_extensions >=4.10.0 + constrains: + - pytorch-gpu 2.12.1 + - pytorch-cpu <0.0a0 + license: BSD-3-Clause + license_family: BSD + purls: 
+ - pkg:pypi/torch?source=hash-mapping + size: 26990927 + timestamp: 1781843342171 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pytorch-2.10.0-cuda130_generic_py314_h7cb4a1c_203.conda sha256: 70b45b24d9591f943ff3a5ffff9419af85293e318a5f001be41cd9538d4e21c9 md5: eec5f372504eec64c324446bbfc8442a @@ -11972,6 +16227,69 @@ packages: license_family: BSD size: 24872224 timestamp: 1772302448327 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pytorch-2.12.1-cuda130_generic_py314_h1da07bd_200.conda + sha256: 46688459ca322948021c689eff157a2f87d0235c08d49f086fbf3dda0b8870d0 + md5: 1736ee7eb6b9e5a82dc025b3a013a6e2 + depends: + - __cuda + - __glibc >=2.28,<3.0.a0 + - _openmp_mutex * *_llvm + - _openmp_mutex >=4.5 + - arm-variant * sbsa + - cuda-cudart >=13.0.96,<14.0a0 + - cuda-cupti >=13.0.85,<14.0a0 + - cuda-nvrtc >=13.0.88,<14.0a0 + - cuda-nvtx >=13.0.85,<14.0a0 + - cuda-version >=13.0,<14 + - filelock + - fmt >=12.1.0,<12.2.0a0 + - fsspec + - jinja2 + - libabseil * cxx17* + - libabseil >=20260526.0,<20260527.0a0 + - libcblas >=3.9.0,<4.0a0 + - libcublas >=13.1.1.3,<14.0a0 + - libcudnn >=9.23.1.3,<10.0a0 + - libcudss >=0.8.0.10,<0.8.1.0a0 + - libcufft >=12.0.0.61,<13.0a0 + - libcufile >=1.15.1.6,<2.0a0 + - libcurand >=10.4.0.35,<11.0a0 + - libcusolver >=12.0.4.66,<13.0a0 + - libcusparse >=12.6.3.3,<13.0a0 + - libgcc >=14 + - liblapack >=3.9.0,<4.0a0 + - libmagma >=2.10.0,<2.10.1.0a0 + - libprotobuf >=7.35.1,<7.35.2.0a0 + - libstdcxx >=14 + - libtorch 2.12.1 cuda130_generic_h4328193_200 + - libuv >=1.52.1,<2.0a0 + - libzlib >=1.3.2,<2.0a0 + - llvm-openmp >=22.1.8 + - nccl >=2.30.7.1,<3.0a0 + - networkx + - nomkl + - numpy >=1.23,<3 + - onednn >=3.12,<4.0a0 + - optree >=0.13.0 + - pybind11 + - pybind11-abi 11 + - python >=3.14,<3.15.0a0 + - python >=3.14,<3.15.0a0 *_cp314 + - python_abi 3.14.* *_cp314 + - setuptools <82 + - sleef >=3.9.0,<4.0a0 + - sympy >=1.13.3 + - triton 3.7.1 + - typing_extensions >=4.10.0 + constrains: + - pytorch-gpu 2.12.1 
+ - pytorch-cpu <0.0a0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/torch?source=hash-mapping + size: 26781142 + timestamp: 1781843840773 - conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-gpu-2.10.0-cuda129_mkl_h0d04637_303.conda sha256: 9807474ce5bbbf81e7a92dd724f38f0dffcfefe5494619c2e94f2df469553bf5 md5: 1050dc8cf80cd0a9e63f361c12ee0e82 @@ -11981,6 +16299,16 @@ packages: license_family: BSD size: 53576 timestamp: 1772317256452 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-gpu-2.12.1-cuda129_mkl_h0d04637_300.conda + sha256: 6962557638794695e9368fec8358913138de6ac42263f4106b0b0d289aa8c90f + md5: e0e4cda52c574ebc52a1433de2f64b42 + depends: + - pytorch 2.12.1 cuda*_mkl*300 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 55687 + timestamp: 1781849857048 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pytorch-gpu-2.10.0-cuda130_generic_h63a1e35_203.conda sha256: 98c965bf49b087129af7ac9df6c218dbd5c9b9cd1d63d9f19af7b5cb8031a41c md5: 0f82b7f7e338cd8d5b04b9b29c9db41f @@ -11991,6 +16319,17 @@ packages: license_family: BSD size: 53542 timestamp: 1772302593139 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pytorch-gpu-2.12.1-cuda129_generic_hda344be_200.conda + sha256: 0c149b792c00a17accbc4f3b944090a95e95e02f9a7a93dc81f5775f4ed20b9d + md5: be94b3604dab4bba2c92d71aaf10d5c3 + depends: + - arm-variant * sbsa + - pytorch 2.12.1 cuda*_generic*200 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 55745 + timestamp: 1781849755104 - conda: https://conda.anaconda.org/conda-forge/win-64/pywin32-311-py314h8f8f202_1.conda sha256: 6918a8067f296f3c65d43e84558170c9e6c3f4dd735cfe041af41a7fdba7b171 md5: 2d7b7ba21e8a8ced0eca553d4d53f773 @@ -12123,6 +16462,21 @@ packages: purls: [] size: 1268666 timestamp: 1769154883613 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-63.0-h192683f_1.conda + sha256: 
f0931894c751b22be09d7c976343a2957a14a59cfe0db04d916d1b93bd66ffcf + md5: da47d3251c0f0d16b2801afe5a77b532 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libnl >=3.11.0,<4.0a0 + - libstdcxx >=14 + - libsystemd0 >=257.13 + - libudev1 >=257.13 + license: Linux-OpenIB + license_family: BSD + purls: [] + size: 1281605 + timestamp: 1778528449130 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rdma-core-61.0-h1f0f388_0.conda sha256: 1c69fab2e833080d48f24d5ac06ea6745c470a8ef779d526bd1edd846184da7e md5: 58f1eb9b507e3e098091840c6f1f9c11 @@ -12137,6 +16491,20 @@ packages: purls: [] size: 1341616 timestamp: 1769154919140 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rdma-core-63.0-h1f0f388_1.conda + sha256: 89dc4066bf0a2ee8e0cdeb6b6e8884c2c36c9a82855a438a0720ee59297fae3e + md5: 94e99208cc8828d5953fac098814a0e9 + depends: + - libgcc >=14 + - libnl >=3.11.0,<4.0a0 + - libstdcxx >=14 + - libsystemd0 >=257.13 + - libudev1 >=257.13 + license: Linux-OpenIB + license_family: BSD + purls: [] + size: 1351719 + timestamp: 1778528506759 - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 md5: d7d95fc8287ea7bf33e0e7116d2b95ec @@ -12373,6 +16741,7 @@ packages: - sdl3 >=3.2.22,<4.0a0 - libegl >=1.7.0,<2.0a0 license: Zlib + purls: [] size: 589145 timestamp: 1757842881 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sdl2-2.32.56-h7ac5ae9_0.conda @@ -12385,6 +16754,7 @@ packages: - libgl >=1.7.0,<2.0a0 - libegl >=1.7.0,<2.0a0 license: Zlib + purls: [] size: 597756 timestamp: 1757842928996 - conda: https://conda.anaconda.org/conda-forge/win-64/sdl2-2.32.56-h5112557_0.conda @@ -12399,8 +16769,39 @@ packages: - ucrt >=10.0.20348.0 - sdl3 >=3.2.22,<4.0a0 license: Zlib + purls: [] size: 572101 timestamp: 1757842925694 +- conda: https://conda.anaconda.org/conda-forge/linux-64/sdl3-3.4.10-hdeec2a5_0.conda + sha256: 
04fa7dab2b8f688e3fc4b7ae4522fd3935fb0601e3329cda8b40d63c60d6cc05 + md5: 845c0b154836c034f361668bec2a4f20 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - xorg-libxcursor >=1.2.3,<2.0a0 + - libusb >=1.0.29,<2.0a0 + - libxkbcommon >=1.13.2,<2.0a0 + - xorg-libx11 >=1.8.13,<2.0a0 + - xorg-libxi >=1.8.3,<2.0a0 + - liburing >=2.14,<2.15.0a0 + - libunwind >=1.8.3,<1.9.0a0 + - xorg-libxtst >=1.2.5,<2.0a0 + - wayland >=1.25.0,<2.0a0 + - dbus >=1.16.2,<2.0a0 + - libgl >=1.7.0,<2.0a0 + - xorg-libxscrnsaver >=1.2.4,<2.0a0 + - xorg-libxext >=1.3.7,<2.0a0 + - libdrm >=2.4.127,<2.5.0a0 + - pulseaudio-client >=17.0,<17.1.0a0 + - libvulkan-loader >=1.4.341.0,<2.0a0 + - libegl >=1.7.0,<2.0a0 + - xorg-libxfixes >=6.0.2,<7.0a0 + - libudev1 >=257.13 + license: Zlib + purls: [] + size: 2148830 + timestamp: 1780262823658 - conda: https://conda.anaconda.org/conda-forge/linux-64/sdl3-3.4.2-hdeec2a5_0.conda sha256: 64b982664550e01c25f8f09333c0ee54d4764a80fe8636b8aaf881fe6e8a0dbe md5: 88a69db027a8ff59dab972a09d69a1ab @@ -12428,8 +16829,37 @@ packages: - xorg-libxi >=1.8.2,<2.0a0 - wayland >=1.24.0,<2.0a0 license: Zlib - size: 2138749 - timestamp: 1771668185803 + size: 2138749 + timestamp: 1771668185803 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sdl3-3.4.10-had2c13b_0.conda + sha256: 8fe249e71c077a09f94f49256a8738a2ef22bb018f119194ece94361721dff65 + md5: 90e79fd7ec05af2fb5424cd84fd70d20 + depends: + - libstdcxx >=14 + - libgcc >=14 + - xorg-libxcursor >=1.2.3,<2.0a0 + - xorg-libxscrnsaver >=1.2.4,<2.0a0 + - pulseaudio-client >=17.0,<17.1.0a0 + - libusb >=1.0.29,<2.0a0 + - libvulkan-loader >=1.4.341.0,<2.0a0 + - libgl >=1.7.0,<2.0a0 + - libunwind >=1.8.3,<1.9.0a0 + - xorg-libxext >=1.3.7,<2.0a0 + - wayland >=1.25.0,<2.0a0 + - libdrm >=2.4.127,<2.5.0a0 + - libudev1 >=257.13 + - xorg-libxfixes >=6.0.2,<7.0a0 + - xorg-libxtst >=1.2.5,<2.0a0 + - libxkbcommon >=1.13.2,<2.0a0 + - libegl >=1.7.0,<2.0a0 + - liburing >=2.14,<2.15.0a0 + - xorg-libxi 
>=1.8.3,<2.0a0 + - xorg-libx11 >=1.8.13,<2.0a0 + - dbus >=1.16.2,<2.0a0 + license: Zlib + purls: [] + size: 2144308 + timestamp: 1780262838628 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sdl3-3.4.2-had2c13b_0.conda sha256: 17aad2e3439d6d778bf995134f37e442a8420adc740457f43d647d4dbf0b10fe md5: c667298eebd2296ace8cb07dbbba95c0 @@ -12458,6 +16888,19 @@ packages: license: Zlib size: 2136476 timestamp: 1771668207211 +- conda: https://conda.anaconda.org/conda-forge/win-64/sdl3-3.4.10-h5112557_0.conda + sha256: 0331417611907f1891c1c8b1c52fed48e337157a6e2d6a893ed352792f8f7ea0 + md5: 82adb3bed17cc9189d81ca90b41c77b9 + depends: + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + - ucrt >=10.0.20348.0 + - libusb >=1.0.29,<2.0a0 + - libvulkan-loader >=1.4.341.0,<2.0a0 + license: Zlib + purls: [] + size: 1677765 + timestamp: 1780262836463 - conda: https://conda.anaconda.org/conda-forge/win-64/sdl3-3.4.2-h5112557_0.conda sha256: a4677774a9d542c6f4bac8779a2d7105748d38d8b7d56c8d02f36d14fba471b9 md5: a0256884d35489e520360267e67ce3fc @@ -12470,6 +16913,17 @@ packages: license: Zlib size: 1669623 timestamp: 1771668231217 +- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-81.0.0-pyh332efcf_0.conda + sha256: 6ecf738d5590bf228f09c4ecd1ea91d811f8e0bd9acdef341bc4d6c36beb13a3 + md5: d629a398d7bf872f9ed7b27ab959de15 + depends: + - python >=3.10 + license: MIT + license_family: MIT + purls: + - pkg:pypi/setuptools?source=hash-mapping + size: 676888 + timestamp: 1770456470072 - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda sha256: 82088a6e4daa33329a30bc26dc19a98c7c1d3f05c0f73ce9845d4eab4924e9e1 md5: 8e194e7b992f99a5015edbd4ebd38efd @@ -12492,6 +16946,20 @@ packages: license_family: Apache size: 113513 timestamp: 1770208767759 +- conda: https://conda.anaconda.org/conda-forge/linux-64/shaderc-2026.2-h718be3e_0.conda + sha256: c6e3280867e54c97996a4fedda0ab72c92d48d1d69258bddf910130df72c169d + md5: 
6438976979721e2f60ec47327d8d38df + depends: + - __glibc >=2.17,<3.0.a0 + - glslang >=16,<17.0a0 + - libgcc >=14 + - libstdcxx >=14 + - spirv-tools >=2026,<2027.0a0 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 113684 + timestamp: 1777360595361 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/shaderc-2025.5-hfeb5c2c_1.conda sha256: bf3f47847832e33acbcb7a1aba948f3b574979ad2a91f2ebdc9fc685c09433db md5: 8268bdcd82d8f9abcb7f0fd6a9568ba4 @@ -12504,6 +16972,19 @@ packages: license_family: Apache size: 115498 timestamp: 1770208786806 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/shaderc-2026.2-hfeb5c2c_0.conda + sha256: 487c021f4f10ae963e9192c9bbc0d3bba8f11cb3a2bb91fd351e4ea3e1ebc109 + md5: 9a389f225e6d2a8cc1e425c128caffe8 + depends: + - glslang >=16,<17.0a0 + - libgcc >=14 + - libstdcxx >=14 + - spirv-tools >=2026,<2027.0a0 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 115991 + timestamp: 1777360628740 - conda: https://conda.anaconda.org/conda-forge/win-64/shaderc-2025.5-h8fa7867_1.conda sha256: b2f6e199df47ca314294ad393818d6b499fd544703abcede0f19007b8f8f10e4 md5: 04d62bc008ee442843e2f24f603ea1a6 @@ -12517,6 +16998,20 @@ packages: license_family: Apache size: 1558909 timestamp: 1770208850155 +- conda: https://conda.anaconda.org/conda-forge/win-64/shaderc-2026.2-h8fa7867_0.conda + sha256: 3a4edc274c947d34258af01886d9ca301098fe037dacd91ccd5f5291dda5ca0b + md5: dd6d0d119b1ca747af3ba964eaa3c565 + depends: + - glslang >=16,<17.0a0 + - spirv-tools >=2026,<2027.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 1559352 + timestamp: 1777360694042 - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d md5: 3339e3b65d58accf4ca4fb8748ab16b3 @@ -12538,6 +17033,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: 
BSL-1.0 + purls: [] size: 1951720 timestamp: 1756274576844 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sleef-3.9.0-h5bb93e2_0.conda @@ -12548,6 +17044,7 @@ packages: - libgcc >=14 - libstdcxx >=14 license: BSL-1.0 + purls: [] size: 1190849 timestamp: 1756276271706 - conda: https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.2-h03e3b7b_1.conda @@ -12560,6 +17057,7 @@ packages: - libgcc >=14 license: BSD-3-Clause license_family: BSD + purls: [] size: 45829 timestamp: 1762948049098 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/snappy-1.2.2-he774c54_1.conda @@ -12571,6 +17069,7 @@ packages: - libgcc >=14 license: BSD-3-Clause license_family: BSD + purls: [] size: 47096 timestamp: 1762948094646 - conda: https://conda.anaconda.org/conda-forge/noarch/snowballstemmer-3.0.1-pyhd8ed1ab_0.conda @@ -12800,6 +17299,20 @@ packages: license_family: APACHE size: 2296977 timestamp: 1770089626195 +- conda: https://conda.anaconda.org/conda-forge/linux-64/spirv-tools-2026.2-hb700be7_0.conda + sha256: 309d1a3317e91a03611bc960fc807cf2c0c5baacbfddea0f5636438a76c52256 + md5: 0c2b1d811632f1f4aa923450a002ff4f + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - spirv-headers >=1.4.350.0,<1.4.350.1.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 2392190 + timestamp: 1780139567779 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/spirv-tools-2026.1-hfefdfc9_0.conda sha256: 841a7df4b73a13a148410e677b1bf07ed81bd181cc686278d64d65e033f4a06a md5: ad8208c6618a543687d754dc57876091 @@ -12812,6 +17325,19 @@ packages: license_family: APACHE size: 2255599 timestamp: 1770089690097 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/spirv-tools-2026.2-hfefdfc9_0.conda + sha256: fcd1bb3c246ffc0beee0c66d1f240610288c81286041190b42402435408b5cc5 + md5: c82bb7d70fffe04afe74d55542af1d41 + depends: + - libgcc >=14 + - libstdcxx >=14 + constrains: + - spirv-headers 
>=1.4.350.0,<1.4.350.1.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 2290233 + timestamp: 1780139661664 - conda: https://conda.anaconda.org/conda-forge/win-64/spirv-tools-2026.1-h49e36cd_0.conda sha256: 9976eeaf650d43833c110447ba264a72f470928d8a8fa5d1cfbadcd2a276184c md5: bf5a4eb05c8b38dbc4e32ce17ab36389 @@ -12825,6 +17351,20 @@ packages: license_family: APACHE size: 13881533 timestamp: 1770089875437 +- conda: https://conda.anaconda.org/conda-forge/win-64/spirv-tools-2026.2-h49e36cd_0.conda + sha256: 256818df74531014639af4cd0791a2a2d238bef74b593db92baeb3c881d89ef2 + md5: 64190192873306d90833d53a820311b8 + depends: + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + constrains: + - spirv-headers >=1.4.350.0,<1.4.350.1.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 14305012 + timestamp: 1780140089597 - conda: https://conda.anaconda.org/conda-forge/linux-64/sqlalchemy-2.0.49-py314h0f05182_0.conda sha256: 85b8d29abab6896abc18956a6e6cff3cba939b63440039be8471f5ca51096686 md5: 40330dd2ec87f319b1c4dffe0db4f4e7 @@ -12896,6 +17436,7 @@ packages: - libstdcxx >=14 license: BSD-2-Clause license_family: BSD + purls: [] size: 2619743 timestamp: 1769664536467 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/svt-av1-4.0.1-hfae3067_0.conda @@ -12906,6 +17447,7 @@ packages: - libstdcxx >=14 license: BSD-2-Clause license_family: BSD + purls: [] size: 2042800 timestamp: 1769668627820 - conda: https://conda.anaconda.org/conda-forge/win-64/svt-av1-4.0.1-hac47afa_0.conda @@ -12917,6 +17459,7 @@ packages: - vc14_runtime >=14.44.35208 license: BSD-2-Clause license_family: BSD + purls: [] size: 1808810 timestamp: 1769664619287 - conda: https://conda.anaconda.org/conda-forge/noarch/sympy-1.14.0-pyh2585a3b_106.conda @@ -12930,6 +17473,8 @@ packages: - python >=3.10 license: BSD-3-Clause license_family: BSD + purls: + - pkg:pypi/sympy?source=hash-mapping size: 4661767 timestamp: 1771952371059 - conda: 
https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.28-h4ee821c_9.conda @@ -12941,6 +17486,7 @@ packages: - tzdata license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later license_family: GPL + purls: [] size: 24008591 timestamp: 1765578833462 - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-aarch64-2.28-h585391f_9.conda @@ -12952,6 +17498,7 @@ packages: - tzdata license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later license_family: GPL + purls: [] size: 23644746 timestamp: 1765578629426 - conda: https://conda.anaconda.org/conda-forge/noarch/tabulate-0.10.0-pyhcf101f3_0.conda @@ -12978,6 +17525,19 @@ packages: license_family: APACHE size: 181329 timestamp: 1767886632911 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tbb-2023.0.0-hab88423_2.conda + sha256: 30cb9355c2fefc20ff1a3d6566b9714d5614086a2524c07721fc344eb20515ae + md5: 7073b15f9364ebc118998601ac6ca6a6 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libhwloc >=2.13.0,<2.13.1.0a0 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 182331 + timestamp: 1778673758649 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tbb-2022.3.0-hfefdfc9_2.conda sha256: 2e875ba342c2cde6301b088cd6471f67e44d961bd292abcdfa6ba3fc32506935 md5: 4d424acd246a5ba42512c097139ed0a0 @@ -12989,6 +17549,18 @@ packages: license_family: APACHE size: 144746 timestamp: 1767888618836 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tbb-2023.0.0-h57272ed_2.conda + sha256: 7ed4e93fad3707aa1686c5be286604c63aad33c9765a0d53fab7adbd179510b3 + md5: 0bc302bd45e5f744a672eb4f4a930398 + depends: + - libgcc >=14 + - libhwloc >=2.13.0,<2.13.1.0a0 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 145425 + timestamp: 1778675412470 - conda: https://conda.anaconda.org/conda-forge/win-64/tbb-2022.3.0-h3155e25_2.conda sha256: 
abd9a489f059fba85c8ffa1abdaa4d515d6de6a3325238b8e81203b913cf65a9 md5: 0f9817ffbe25f9e69ceba5ea70c52606 @@ -13002,6 +17574,19 @@ packages: purls: [] size: 155869 timestamp: 1767886839029 +- conda: https://conda.anaconda.org/conda-forge/win-64/tbb-2023.0.0-hd3d4ead_2.conda + sha256: 8a4053839b8e997a5965e2dff7d6cf3c77be62d82c0e48c8a04a5ed2d2e73035 + md5: 8ee01a693aecff5432069eaaf1183c45 + depends: + - libhwloc >=2.13.0,<2.13.1.0a0 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 156515 + timestamp: 1778673901757 - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac md5: cffd3bdd58090148f4cfcd831f4b26ab @@ -13060,7 +17645,7 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/tomli?source=compressed-mapping + - pkg:pypi/tomli?source=hash-mapping size: 21561 timestamp: 1774492402955 - conda: https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.5-py314h5bd0f2a_0.conda @@ -13138,6 +17723,30 @@ packages: license_family: MIT size: 236152104 timestamp: 1771627549811 +- conda: https://conda.anaconda.org/conda-forge/linux-64/triton-3.7.1-cuda130py314h1cdc6f0_1.conda + sha256: c526abff27dd48435fe15019c4fec3c8a31c0435ea1d1f9fb5a8145cdc4319f0 + md5: 13491b40def8e902433945de06b88b85 + depends: + - python + - setuptools + - cuda-nvcc-tools + - cuda-cuobjdump + - cuda-cudart + - cuda-cupti + - libstdcxx >=14 + - libgcc >=14 + - cuda-version >=13.0,<14 + - __glibc >=2.28,<3.0.a0 + - libzlib >=1.3.2,<2.0a0 + - zstd >=1.5.7,<1.6.0a0 + - cuda-cupti >=13.0.85,<14.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + purls: + - pkg:pypi/triton?source=hash-mapping + size: 40376689 + timestamp: 1781881965775 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/triton-3.6.0-cuda130py314h75a4554_1.conda sha256: 
7a5e51bea6dd90c2d59fcbf6b06f722289409ff34eeaece2362b0a1429b6c84f md5: c1e928be7f75d193cb83923f60ebc07e @@ -13162,6 +17771,31 @@ packages: license_family: MIT size: 244688202 timestamp: 1771627574163 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/triton-3.7.1-cuda130py314ha788bc0_1.conda + sha256: 29a6a5028fd2d99a35886b268045da54d27b9b15b30b96cf4f351d4b5b43c2b0 + md5: aaee2741e58071208ce075fd1f89e4e9 + depends: + - python + - setuptools + - cuda-nvcc-tools + - cuda-cuobjdump + - cuda-cudart + - cuda-cupti + - cuda-version >=13.0,<14 + - arm-variant * sbsa + - __glibc >=2.28,<3.0.a0 + - libstdcxx >=14 + - libgcc >=14 + - cuda-cupti >=13.0.85,<14.0a0 + - zstd >=1.5.7,<1.6.0a0 + - libzlib >=1.3.2,<2.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + purls: + - pkg:pypi/triton?source=hash-mapping + size: 47202278 + timestamp: 1781881972923 - conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda sha256: 7c2df5721c742c2a47b2c8f960e718c930031663ac1174da67c1ed5999f7938c md5: edd329d7d3a4ab45dcf905899a7a6115 @@ -13241,6 +17875,18 @@ packages: purls: [] size: 19356 timestamp: 1767320221521 +- conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.5-h1b7c187_39.conda + sha256: 17693b60cb54f80c60275f003f3bfc1b128af56dbfd65c4fae37c64eeb755ce1 + md5: 2eacea63f545b97342da520df6854276 + depends: + - vc14_runtime >=14.51.36231 + track_features: + - vc14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 20362 + timestamp: 1781320968457 - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.44.35208-h818238b_34.conda sha256: 02732f953292cce179de9b633e74928037fa3741eb5ef91c3f8bae4f761d32a5 md5: 37eb311485d2d8b2c419449582046a42 @@ -13254,6 +17900,19 @@ packages: purls: [] size: 683233 timestamp: 1767320219644 +- conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.51.36231-h1b9f54f_39.conda + sha256: 
8153ed849c92e891eacac0f2f8d7ecb79f9b5fd7f7917fbb896f252a60a40390 + md5: 06a5bf5a1ca16cce0df6eaa91fc42bc2 + depends: + - ucrt >=10.0.20348.0 + - vcomp14 14.51.36231 h1b9f54f_39 + constrains: + - vs2015_runtime 14.51.36231.* *_39 + license: LicenseRef-MicrosoftVisualCpp2015-2022Runtime + license_family: Proprietary + purls: [] + size: 737434 + timestamp: 1781320964561 - conda: https://conda.anaconda.org/conda-forge/win-64/vcomp14-14.44.35208-h818238b_34.conda sha256: 878d5d10318b119bd98ed3ed874bd467acbe21996e1d81597a1dbf8030ea0ce6 md5: 242d9f25d2ae60c76b38a5e42858e51d @@ -13266,6 +17925,18 @@ packages: purls: [] size: 115235 timestamp: 1767320173250 +- conda: https://conda.anaconda.org/conda-forge/win-64/vcomp14-14.51.36231-h1b9f54f_39.conda + sha256: 07fb14713c4bc62e2533a2e23a363abfb0e65650681fba0ae4c840e2219350f3 + md5: 8b53a83fda40ec679e4d63fa32fae989 + depends: + - ucrt >=10.0.20348.0 + constrains: + - vs2015_runtime 14.51.36231.* *_39 + license: LicenseRef-MicrosoftVisualCpp2015-2022Runtime + license_family: Proprietary + purls: [] + size: 120684 + timestamp: 1781320948530 - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.44.35208-h38c0c73_34.conda sha256: 63ff4ec6e5833f768d402f5e95e03497ce211ded5b6f492e660e2bfc726ad24d md5: f276d1de4553e8fca1dfb6988551ebb4 @@ -13275,6 +17946,16 @@ packages: license_family: BSD size: 19347 timestamp: 1767320221943 +- conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.51.36231-h84cd919_39.conda + sha256: 6de6c2cf008fc2dce61060b583f2d8494c83883106952b201381b6b0505f03d7 + md5: 2ccc63d7b7d066a814ed9f99072832d7 + depends: + - vc14_runtime >=14.51.36231 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 20355 + timestamp: 1781320968804 - conda: https://conda.anaconda.org/conda-forge/linux-64/wayland-1.24.0-hd6090a7_1.conda sha256: 3aa04ae8e9521d9b56b562376d944c3e52b69f9d2a0667f77b8953464822e125 md5: 035da2e4f5770f036ff704fa17aace24 @@ -13288,6 +17969,20 @@ packages: 
license_family: MIT size: 329779 timestamp: 1761174273487 +- conda: https://conda.anaconda.org/conda-forge/linux-64/wayland-1.25.0-hd6090a7_0.conda + sha256: ea374d57a8fcda281a0a89af0ee49a2c2e99cc4ac97cf2e2db7064e74e764bdb + md5: 996583ea9c796e5b915f7d7580b51ea6 + depends: + - __glibc >=2.17,<3.0.a0 + - libexpat >=2.7.4,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + purls: [] + size: 334139 + timestamp: 1773959575393 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/wayland-1.24.0-h4f8a99f_1.conda sha256: d94af8f287db764327ac7b48f6c0cd5c40da6ea2606afd34ac30671b7c85d8ee md5: f6966cb1f000c230359ae98c29e37d87 @@ -13300,6 +17995,19 @@ packages: license_family: MIT size: 331480 timestamp: 1761174368396 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/wayland-1.25.0-h4f8a99f_0.conda + sha256: 3cc479df517b0ce110835a1256f91ca568581cb6dfe1c53a0786f0a226039a45 + md5: 0a7a9548726f98d5869fd4c43e110f0f + depends: + - libexpat >=2.7.4,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + purls: [] + size: 335260 + timestamp: 1773959583826 - conda: https://conda.anaconda.org/conda-forge/noarch/wayland-protocols-1.47-hd8ed1ab_0.conda sha256: 9ab2c12053ea8984228dd573114ffc6d63df42c501d59fda3bf3aeb1eaa1d23e md5: 7da1571f560d4ba3343f7f4c48a79c76 @@ -13307,6 +18015,14 @@ packages: license_family: MIT size: 140476 timestamp: 1765821981856 +- conda: https://conda.anaconda.org/conda-forge/noarch/wayland-protocols-1.49-hd8ed1ab_0.conda + sha256: 04ce686cd187d379344f9b2be7b4da5f431b265dc0944a6b764fab9da9171948 + md5: 0839a3421140d4a9ba93fb988698fc00 + license: MIT + license_family: MIT + purls: [] + size: 147954 + timestamp: 1780946721169 - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.6.0-pyhd8ed1ab_0.conda sha256: e298b508b2473c4227206800dfb14c39e4b14fd79d4636132e9e1e4244cdf4aa md5: c3197f8c0d5b955c904616b716aca093 @@ -13347,6 
+18063,7 @@ packages: - libgcc-ng >=12 license: GPL-2.0-or-later license_family: GPL + purls: [] size: 897548 timestamp: 1660323080555 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/x264-1!164.3095-h4e544f5_2.tar.bz2 @@ -13356,6 +18073,7 @@ packages: - libgcc-ng >=12 license: GPL-2.0-or-later license_family: GPL + purls: [] size: 1000661 timestamp: 1660324722559 - conda: https://conda.anaconda.org/conda-forge/win-64/x264-1!164.3095-h8ffe710_2.tar.bz2 @@ -13366,6 +18084,7 @@ packages: - vs2015_runtime >=14.16.27033 license: GPL-2.0-or-later license_family: GPL + purls: [] size: 1041889 timestamp: 1660323726084 - conda: https://conda.anaconda.org/conda-forge/linux-64/x265-3.5-h924138e_3.tar.bz2 @@ -13376,6 +18095,7 @@ packages: - libstdcxx-ng >=10.3.0 license: GPL-2.0-or-later license_family: GPL + purls: [] size: 3357188 timestamp: 1646609687141 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/x265-3.5-hdd96247_3.tar.bz2 @@ -13386,6 +18106,7 @@ packages: - libstdcxx-ng >=10.3.0 license: GPL-2.0-or-later license_family: GPL + purls: [] size: 1018181 timestamp: 1646610147365 - conda: https://conda.anaconda.org/conda-forge/win-64/x265-3.5-h2d74725_3.tar.bz2 @@ -13396,6 +18117,7 @@ packages: - vs2015_runtime >=14.16.27033 license: GPL-2.0-or-later license_family: GPL + purls: [] size: 5517425 timestamp: 1646611941216 - conda: https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.47-hb03c661_0.conda @@ -13409,6 +18131,18 @@ packages: license_family: MIT size: 399291 timestamp: 1772021302485 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.48-h280c20c_0.conda + sha256: 3b04afd5d1a65d2d27ac2d49a63b01ab8bcd875776779ec63e337370ed38afdc + md5: b233b41be0bf210989d57160ed39b394 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - xorg-libx11 >=1.8.13,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 441670 + timestamp: 1782027360439 - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/xkeyboard-config-2.47-he30d5cf_0.conda sha256: ec7ff9dffbd41faa31a30fa0724699f05bca000d57c745a195ecdb56888a8605 md5: 4ac707a4279972357712af099cd1ae50 @@ -13419,6 +18153,17 @@ packages: license_family: MIT size: 399629 timestamp: 1772021320967 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xkeyboard-config-2.48-h80f16a2_0.conda + sha256: 96078068df25ddccc60958be740e6fa99efb1e0fa2dae2f84e775201bf84d70c + md5: 3dbc6d9e1f8a8768e7ef9f57585a43ca + depends: + - libgcc >=14 + - xorg-libx11 >=1.8.13,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 442725 + timestamp: 1782027381059 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda sha256: c12396aabb21244c212e488bbdc4abcdef0b7404b15761d9329f5a4a39113c4b md5: fb901ff28063514abb6046c9ec2c4a45 @@ -13427,6 +18172,7 @@ packages: - libgcc >=13 license: MIT license_family: MIT + purls: [] size: 58628 timestamp: 1734227592886 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libice-1.1.2-h86ecc28_0.conda @@ -13436,6 +18182,7 @@ packages: - libgcc >=13 license: MIT license_family: MIT + purls: [] size: 60433 timestamp: 1734229908988 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda @@ -13448,6 +18195,7 @@ packages: - xorg-libice >=1.1.2,<2.0a0 license: MIT license_family: MIT + purls: [] size: 27590 timestamp: 1741896361728 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libsm-1.2.6-h0808dbd_0.conda @@ -13459,6 +18207,7 @@ packages: - xorg-libice >=1.1.2,<2.0a0 license: MIT license_family: MIT + purls: [] size: 28701 timestamp: 1741897678254 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.13-he1eb515_0.conda @@ -13470,6 +18219,7 @@ packages: - libxcb >=1.17.0,<2.0a0 license: MIT license_family: MIT + purls: [] size: 839652 timestamp: 1770819209719 - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libx11-1.8.13-h63a1b12_0.conda @@ -13480,6 +18230,7 @@ packages: - libxcb >=1.17.0,<2.0a0 license: MIT license_family: MIT + purls: [] size: 869058 timestamp: 1770819244991 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb03c661_1.conda @@ -13490,6 +18241,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 15321 timestamp: 1762976464266 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxau-1.0.12-he30d5cf_1.conda @@ -13499,6 +18251,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 16317 timestamp: 1762977521691 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda @@ -13512,6 +18265,7 @@ packages: - xorg-libxrender >=0.9.11,<0.10.0a0 license: MIT license_family: MIT + purls: [] size: 32533 timestamp: 1730908305254 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxcursor-1.2.3-h86ecc28_0.conda @@ -13524,6 +18278,7 @@ packages: - xorg-libxrender >=0.9.11,<0.10.0a0 license: MIT license_family: MIT + purls: [] size: 34596 timestamp: 1730908388714 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb03c661_1.conda @@ -13534,6 +18289,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 20591 timestamp: 1762976546182 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxdmcp-1.1.5-he30d5cf_1.conda @@ -13543,6 +18299,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 21039 timestamp: 1762979038025 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.7-hb03c661_0.conda @@ -13554,6 +18311,7 @@ packages: - xorg-libx11 >=1.8.12,<2.0a0 license: MIT license_family: MIT + purls: [] size: 50326 timestamp: 1769445253162 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxext-1.3.7-he30d5cf_0.conda @@ -13564,6 +18322,7 @@ 
packages: - xorg-libx11 >=1.8.12,<2.0a0 license: MIT license_family: MIT + purls: [] size: 52409 timestamp: 1769446753771 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.2-hb03c661_0.conda @@ -13575,6 +18334,7 @@ packages: - xorg-libx11 >=1.8.12,<2.0a0 license: MIT license_family: MIT + purls: [] size: 20071 timestamp: 1759282564045 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxfixes-6.0.2-he30d5cf_0.conda @@ -13585,6 +18345,7 @@ packages: - xorg-libx11 >=1.8.12,<2.0a0 license: MIT license_family: MIT + purls: [] size: 20704 timestamp: 1759284028146 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.2-hb9d3cd8_0.conda @@ -13600,6 +18361,20 @@ packages: license_family: MIT size: 47179 timestamp: 1727799254088 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.3-hb03c661_0.conda + sha256: 495f99c8eacfa4ae2d8fed2a7f2105777af89acdc204df145d2bbbc380ac631b + md5: adba2e334082bb218db806d4c12277c9 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - xorg-libx11 >=1.8.13,<2.0a0 + - xorg-libxext >=1.3.7,<2.0a0 + - xorg-libxfixes >=6.0.2,<7.0a0 + license: MIT + license_family: MIT + purls: [] + size: 47717 + timestamp: 1779111857071 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxi-1.8.2-h57736b2_0.conda sha256: 7b587407ecb9ccd2bbaf0fb94c5dbdde4d015346df063e9502dc0ce2b682fb5e md5: eeee3bdb31c6acde2b81ad1b8c287087 @@ -13612,6 +18387,19 @@ packages: license_family: MIT size: 48197 timestamp: 1727801059062 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxi-1.8.3-he30d5cf_0.conda + sha256: 0c1c7b39763469cfe0e9c6d0f9a39415321f477710719f4c5d63c61ea270271c + md5: f8ad5777ecc217d383a722598dbeb1ac + depends: + - libgcc >=14 + - xorg-libx11 >=1.8.13,<2.0a0 + - xorg-libxext >=1.3.7,<2.0a0 + - xorg-libxfixes >=6.0.2,<7.0a0 + license: MIT + license_family: MIT + purls: [] + size: 49292 + timestamp: 1779113229775 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.5-hb03c661_0.conda sha256: 80ed047a5cb30632c3dc5804c7716131d767089f65877813d4ae855ee5c9d343 md5: e192019153591938acf7322b6459d36e @@ -13623,6 +18411,7 @@ packages: - xorg-libxrender >=0.9.12,<0.10.0a0 license: MIT license_family: MIT + purls: [] size: 30456 timestamp: 1769445263457 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxrandr-1.5.5-he30d5cf_0.conda @@ -13635,6 +18424,7 @@ packages: - xorg-libxrender >=0.9.12,<0.10.0a0 license: MIT license_family: MIT + purls: [] size: 31122 timestamp: 1769445286951 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda @@ -13646,6 +18436,7 @@ packages: - xorg-libx11 >=1.8.10,<2.0a0 license: MIT license_family: MIT + purls: [] size: 33005 timestamp: 1734229037766 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxrender-0.9.12-h86ecc28_0.conda @@ -13656,6 +18447,7 @@ packages: - xorg-libx11 >=1.8.10,<2.0a0 license: MIT license_family: MIT + purls: [] size: 33649 timestamp: 1734229123157 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxscrnsaver-1.2.4-hb9d3cd8_0.conda @@ -13668,6 +18460,7 @@ packages: - xorg-libxext >=1.3.6,<2.0a0 license: MIT license_family: MIT + purls: [] size: 14412 timestamp: 1727899730073 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxscrnsaver-1.2.4-h86ecc28_0.conda @@ -13679,6 +18472,7 @@ packages: - xorg-libxext >=1.3.6,<2.0a0 license: MIT license_family: MIT + purls: [] size: 15720 timestamp: 1750007336692 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda @@ -13692,6 +18486,7 @@ packages: - xorg-libxi >=1.7.10,<2.0a0 license: MIT license_family: MIT + purls: [] size: 32808 timestamp: 1727964811275 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxtst-1.2.5-h57736b2_3.conda @@ -13704,6 +18499,7 @@ packages: - xorg-libxi >=1.7.10,<2.0a0 license: 
MIT license_family: MIT + purls: [] size: 33786 timestamp: 1727964907993 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-xorgproto-2025.1-hb03c661_0.conda @@ -13714,6 +18510,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 570010 timestamp: 1766154256151 - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-xorgproto-2025.1-he30d5cf_0.conda @@ -13723,6 +18520,7 @@ packages: - libgcc >=14 license: MIT license_family: MIT + purls: [] size: 569539 timestamp: 1766155414260 - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda @@ -13814,6 +18612,42 @@ packages: - pkg:pypi/zipp?source=hash-mapping size: 24194 timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/win-64/zlib-1.3.2-hfd05255_2.conda + sha256: ef408f85f664a4b9c9dac3cb2e36154d9baa15a88984ea800e11060e0f2394a1 + md5: 5187ecf958be3c39110fe691cbd6873e + depends: + - libzlib 1.3.2 hfd05255_2 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + license: Zlib + license_family: Other + purls: [] + size: 850351 + timestamp: 1774072891049 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-ng-2.3.3-hceb46e0_1.conda + sha256: ea4e50c465d70236408cb0bfe0115609fd14db1adcd8bd30d8918e0291f8a75f + md5: 2aadb0d17215603a82a2a6b0afd9a4cb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: Zlib + license_family: Other + purls: [] + size: 122618 + timestamp: 1770167931827 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zlib-ng-2.3.3-ha7cb516_1.conda + sha256: 638a3a41a4fbfed52d3c60c8ef5a3693b3f12a5b1a3f58fa29f5698d0a0702e2 + md5: f731af71c723065d91b4c01bb822641b + depends: + - libgcc >=14 + - libstdcxx >=14 + license: Zlib + license_family: Other + purls: [] + size: 121046 + timestamp: 1770167944449 - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda sha256: 
68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index d2e72807af7..d4e97b5aede 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -24,31 +24,37 @@ cloudpickle = "*" psutil = "*" pyglet = "*" -[feature.examples.dependencies] +[feature.local-deps.dependencies] +cuda-bindings = { path = "../cuda_bindings" } +cuda-pathfinder = { path = "../cuda_pathfinder" } + +[feature.samples.dependencies] cuda-core = { path = "." } cffi = "*" pyglet = "*" +pytest = "*" +numpy = "*" -[feature.examples.system-requirements] +[feature.samples.system-requirements] cuda = "13" -[feature.local-deps.dependencies] -cuda-bindings = { path = "../cuda_bindings" } -cuda-pathfinder = { path = "../cuda_pathfinder" } +[feature.samples.pypi-dependencies] +nvtx = "*" -[feature.examples.target.linux.dependencies] +[feature.samples.target.linux.dependencies] cupy = "*" pytorch-gpu = "*" libgl-devel = "*" gxx = "*" +pillow = "*" -[feature.examples.target.linux-64.activation.env] +[feature.samples.target.linux-64.activation.env] CUDA_HOME = "$CONDA_PREFIX/targets/x86_64-linux" -[feature.examples.target.linux-aarch64.activation.env] +[feature.samples.target.linux-aarch64.activation.env] CUDA_HOME = "$CONDA_PREFIX/targets/sbsa-linux" -[feature.examples.target.win-64.activation.env] +[feature.samples.target.win-64.activation.env] CUDA_HOME = "$CONDA_PREFIX/Library" [feature.cython-tests.dependencies] @@ -115,7 +121,7 @@ nvidia-sphinx-theme = "*" make = "*" # We keep both cu12 and cu13 because cuda.core works with either major version. -# The local sibling checkouts are wired into the default/cu13/examples workflows; +# The local sibling checkouts are wired into the default/cu13/samples workflows; # cu12 intentionally solves against published packages instead. 
[environments] default = { features = [ @@ -125,7 +131,7 @@ default = { features = [ ], solve-group = "default" } cu13 = { features = ["cu13", "test", "cython-tests", "local-deps"], solve-group = "cu13" } cu12 = { features = ["cu12", "test", "cython-tests"], solve-group = "cu12" } -examples = { features = ["cu13", "examples", "local-deps"], solve-group = "examples" } +samples = { features = ["cu13", "samples", "local-deps"], solve-group = "samples" } docs = { features = ["cu13", "docs", "local-deps"], solve-group = "docs" } # TODO: check if these can be extracted from pyproject.toml @@ -227,3 +233,7 @@ cmd = [ "norecursedirs=\"\"", # include cython tests (ignore by default config) ] depends-on = [{ task = "build-cython-tests" }] + +[tasks.test-samples] +cmd = ["pytest", "-rxXs", "-v", "$PIXI_PROJECT_ROOT/../tests/samples"] +default-environment = "samples" diff --git a/pixi.toml b/pixi.toml index f73d299e012..ed78119b47f 100644 --- a/pixi.toml +++ b/pixi.toml @@ -53,6 +53,13 @@ cmd = [ 'pixi run --manifest-path "$PIXI_PROJECT_ROOT/cuda_core" -e "$PIXI_ENVIRONMENT_NAME" test', ] +[target.linux.tasks.test-samples] +cmd = [ + "bash", + "-c", + 'pixi run --manifest-path "$PIXI_PROJECT_ROOT/cuda_core" test-samples', +] + [target.linux.tasks.test] depends-on = [ { task = "test-pathfinder" }, diff --git a/pytest.ini b/pytest.ini index 9c11c2b5f56..2e025401e29 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,18 +6,21 @@ addopts = --showlocals norecursedirs = cuda_bindings/examples cuda_core/examples + samples testpaths = cuda_pathfinder/tests cuda_bindings/tests cuda_core/tests tests/integration + tests/samples ci/tools/tests markers = pathfinder: tests for cuda_pathfinder bindings: tests for cuda_bindings core: tests for cuda_core + samples: tests for the standalone samples under ./samples cython: cython tests smoke: meta-level smoke tests flaky: mark test as flaky (provided by pytest-rerunfailures) diff --git a/ruff.toml b/ruff.toml index 210f852cd3e..77331428859 100644 --- 
a/ruff.toml +++ b/ruff.toml @@ -114,6 +114,19 @@ inline-quotes = "double" "RUF059", # unused unpacked variable ] +"samples/**" = [ + "T201", # print + "E402", # module-level import not at top of file (sys.path mutation before import) + "N801", # CUDA naming conventions (CamelCase classes) + "N802", # CUDA naming conventions (camelCase methods) + "N803", # CUDA naming conventions (argument names) + "N806", # non-lowercase variable in function (e.g. d_A, h_blockSums) + "N816", # mixed-case variable in global scope + "N999", # invalid module name (samples/Utilities is a directory, not a module) + "RUF001", # ambiguous unicode in strings (math symbols like ``×`` in shape annotations) + "RUF003", # ambiguous unicode in comments (same as RUF001) +] + "**/benchmarks/**" = [ "T201", # print "RUF012", # mutable class default (ctypes _fields_ is standard) diff --git a/samples/Utilities/README.md b/samples/Utilities/README.md new file mode 100644 index 00000000000..91816a232f4 --- /dev/null +++ b/samples/Utilities/README.md @@ -0,0 +1,134 @@ +# CUDA Python Utilities + +Common utilities for CUDA Python samples using the `cuda.core` API. + +## Overview + +This module provides reusable utility functions for CUDA samples to reduce code duplication. Samples import from `cuda_samples_utils.py` using simple path-based imports (no package structure needed). 
+ +## Installation Requirements + +Install from the Python samples directory: + +```bash +cd /path/to/cuda-samples/python +pip install -r requirements.txt +``` + +This installs a common CUDA 13 stack (see `python/requirements.txt`): + +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) +- `numpy` (>=2.3.2) + +## How to Use in Samples + +Import utilities using path-based import: + +```python +import sys +from pathlib import Path + +# Add Utilities directory to path +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import verify_array_result + +# Use the utility +if verify_array_result(result, expected): + print("Success!") +``` + +## Available Functions + +### Result Verification + +#### `verify_array_result(result, expected, rtol=1e-5, atol=1e-8, verbose=True)` + +Verify computed results match expected values. The helper detects whether both +arguments are NumPy arrays or both are CuPy arrays and uses the matching +library's `allclose` (no unnecessary cross-device transfers). + +**Parameters:** +- `result`: NumPy or CuPy array with computed results +- `expected`: NumPy or CuPy array with expected values (same kind as `result`) +- `rtol`: Relative tolerance (default: 1e-5) +- `atol`: Absolute tolerance (default: 1e-8) +- `verbose`: Print test result (default: True) + +**Returns:** +- `True` if results match within tolerance, `False` otherwise + +**Example:** +```python +expected = a + b +if verify_array_result(c, expected): + print("Computation correct!") +``` + +### Package Check + +#### `check_cuda_requirements()` + +Check if required CUDA packages are available. 
+ +**Returns:** +- `True` if requirements are met, `False` otherwise + +**Example:** +```python +if not check_cuda_requirements(): + sys.exit(1) +``` + +## Design Philosophy + +These utilities focus on common operations that are **not** part of `cuda.core` API: +- Result verification for NumPy or CuPy arrays +- Package requirements checking + +For CUDA operations like device initialization, kernel compilation, and grid size calculations, samples should use `cuda.core` API directly to demonstrate the proper usage patterns. + +## Complete Example + +See `../vectorAdd/vectorAdd.py` for a complete example: + +```python +import sys +from pathlib import Path + +# Import utility +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import verify_array_result + +import cupy as cp +from cuda.core import Device, Program, ProgramOptions, LaunchConfig, launch + +# Use cuda.core directly for device and kernel operations +device = Device(0) +device.set_current() + +program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") +program = Program(kernel_source, code_type="c++", options=program_options) +module = program.compile("cubin", name_expressions=("kernel_name",)) +kernel = module.get_kernel("kernel_name") + +# Calculate grid size inline +threads_per_block = 256 +blocks_per_grid = (num_elements + threads_per_block - 1) // threads_per_block + +# Launch kernel - pass cupy arrays directly +config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block) +launch(stream, config, kernel, a, b, c, cp.int32(num_elements)) + +# Verify results using utility +verify_array_result(c, expected) +``` + +## Benefits + +- **Code Reuse**: Write common functionality once +- **Consistency**: All samples use the same patterns +- **Maintainability**: Bug fixes benefit all samples +- **Transparency**: Samples show cuda.core API usage directly +- **Simplicity**: No complex package structure needed diff --git a/samples/Utilities/__init__.py 
b/samples/Utilities/__init__.py new file mode 100644 index 00000000000..a84c3bf5b70 --- /dev/null +++ b/samples/Utilities/__init__.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +CUDA Python Samples - Utilities + +Common utilities for CUDA Python samples. 
+ +Provides: +- Package requirements checking +- Result verification +""" + +from .cuda_samples_utils import ( + check_cuda_requirements, + verify_array_result, +) + +__version__ = "1.0.0" + +__all__ = [ + "check_cuda_requirements", + "verify_array_result", +] diff --git a/samples/Utilities/cuda_samples_utils.py b/samples/Utilities/cuda_samples_utils.py new file mode 100644 index 00000000000..50b21713fb1 --- /dev/null +++ b/samples/Utilities/cuda_samples_utils.py @@ -0,0 +1,140 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Common CUDA utilities for Python samples. + +This module provides common utility functions for CUDA samples including: +- Package requirements checking +- Result verification +- GPU device information + +Requirements: +- Python 3.10+ +- CUDA Toolkit 13.0+ (recommended; matches cuda-python 13.x) +- cuda-python >= 13.0.0 +- cuda-core >= 1.0.0 +- cupy-cuda13x >= 14.0.0 +- numpy >= 2.3.2 (when used with samples that install it) +""" + + +def check_cuda_requirements() -> bool: + """ + Check if required CUDA packages are available. + + Returns + ------- + bool + True if requirements are met, False otherwise + """ + try: + import cupy as cp # noqa: F401 + + from cuda.core import Device # noqa: F401 + + return True + except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + return False + + +def verify_array_result(result, expected, rtol: float = 1e-5, atol: float = 1e-8, verbose: bool = True) -> bool: + """ + Verify that computed result matches expected result. + + Automatically detects whether arrays are NumPy or CuPy and uses the + appropriate library without unnecessary data transfers. + + Parameters + ---------- + result : numpy.ndarray or cupy.ndarray + Computed result array. + expected : numpy.ndarray or cupy.ndarray + Expected result array. 
+ rtol : float + Relative tolerance (default: 1e-5) + atol : float + Absolute tolerance (default: 1e-8) + verbose : bool + Whether to print verification result (default: True). + + Returns + ------- + bool + True if results match, False otherwise. + + Raises + ------ + TypeError + If arrays are not both NumPy or both CuPy. (If CuPy is needed but + not installed, the ImportError from importing it propagates.) + """ + import numpy as np + + is_np = isinstance(result, np.ndarray) and isinstance(expected, np.ndarray) + + if is_np: + allclose = np.allclose + abs_ = np.abs + max_ = np.max + else: + import cupy as cp + + is_cp = isinstance(result, cp.ndarray) and isinstance(expected, cp.ndarray) + + if not is_cp: + raise TypeError("verify_array_result expects both arrays to be either numpy.ndarray or cupy.ndarray") + + allclose = cp.allclose + abs_ = cp.abs + max_ = cp.max + + if allclose(result, expected, rtol=rtol, atol=atol): + if verbose: + print("Test PASSED") + return True + else: + max_error = max_(abs_(result - expected)) + if verbose: + print(f"Test FAILED - Max error: {max_error}") + return False + + +def print_gpu_info(device) -> None: + """ + Print GPU device information. + + Parameters + ---------- + device : cuda.core.Device + CUDA device object + """ + print(f"Device: {device.name}") + cc = device.compute_capability + print(f"Compute Capability: {cc.major}.{cc.minor}") diff --git a/samples/binarySearch/README.md b/samples/binarySearch/README.md new file mode 100644 index 00000000000..4bbddcecd63 --- /dev/null +++ b/samples/binarySearch/README.md @@ -0,0 +1,129 @@ +# binarySearch (Python) + +## Description + +This sample demonstrates the parallel binary-search algorithms +exposed by **cuda.compute** (from the `cuda-cccl` package). 
Given +a sorted `d_data` array and a batch of `d_values` to locate, one +device-wide call returns the insertion index for every value: + +- `cuda.compute.lower_bound` writes, for each value, the lowest index + where it could be inserted into `d_data` without breaking the sort + order. Equivalent to `numpy.searchsorted(..., side="left")`. +- `cuda.compute.upper_bound` is the analogous upper form, equivalent + to `numpy.searchsorted(..., side="right")`. + +The sample runs both algorithms on two curated inputs: one with +distinct elements (where `lower_bound` and `upper_bound` agree on +any value not in the data) and one with duplicates (where they +diverge on present values). Results are verified against +`numpy.searchsorted`. + +## What You'll Learn + +- How to call `cuda.compute.lower_bound` / `upper_bound` with CuPy + arrays +- The semantic difference between `lower_bound` and `upper_bound`, + especially for inputs containing duplicates +- How the output dtype (`np.uintp`) is used for indices + +## Key Libraries + +- [`cuda.compute`](https://nvidia.github.io/cccl/python.html) (from the `cuda-cccl` package) - device algorithms +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - device setup +- `cupy` - device buffers +- `numpy` - host-side reference via `numpy.searchsorted` + +## Key APIs + +### From `cuda.compute` + +- `cuda.compute.lower_bound(d_data, num_items, d_values, num_values, d_out)` +- `cuda.compute.upper_bound(d_data, num_items, d_values, num_values, d_out)` + +### From `cuda_samples_utils` + +- `print_gpu_info()` - print device name and compute capability + +## Requirements + +### Hardware + +- NVIDIA GPU with Compute Capability 7.0 or higher +- Minimum GPU memory: 512 MB + +### Software + +- CUDA Toolkit 13.0 or newer +- Python 3.10 or newer +- `cuda-cccl` (>=1.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +If the CUDA toolkit is not on your `PATH`, set `CUDA_HOME` so that +cuda.compute's JIT path can locate its 
dependencies: + +```bash +export CUDA_HOME=/usr/local/cuda +``` + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/binarySearch +pip install -r requirements.txt +``` + +The `requirements.txt` installs: + +- `cuda-cccl` (>=1.0.0) - ships the `cuda.compute` module +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) +- `numpy` (>=1.24.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/2_CoreConcepts/binarySearch +python binarySearch.py +``` + +### With custom parameters + +```bash +python binarySearch.py --device 1 +``` + +## Expected Output + +``` +Device: +Compute Capability: + +Case 1: distinct data, mixed queries + data = [1, 3, 5, 7, 9] + values = [0, 3, 4, 10] + lower_bound: got [0, 1, 2, 5] expected [0, 1, 2, 5] OK + upper_bound: got [0, 2, 2, 5] expected [0, 2, 2, 5] OK + +Case 2: duplicates in data + data = [1, 3, 3, 5, 7, 9] + values = [3, 3, 5, 8] + lower_bound: got [1, 1, 3, 5] expected [1, 1, 3, 5] OK + upper_bound: got [3, 3, 4, 5] expected [3, 3, 4, 5] OK + +Done +``` + +**Note:** Device name and compute capability will vary based on your GPU. + +## Files + +- `binarySearch.py` - Python implementation +- `README.md` - This file +- `requirements.txt` - Sample dependencies +- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample) diff --git a/samples/binarySearch/binarySearch.py b/samples/binarySearch/binarySearch.py new file mode 100644 index 00000000000..ccd6f488c0c --- /dev/null +++ b/samples/binarySearch/binarySearch.py @@ -0,0 +1,144 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-cccl[cu13]>=1.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0", "numpy>=1.24.0"] +# /// + +""" +This sample demonstrates the parallel binary-search algorithms exposed +by cuda.compute (from the cuda-cccl package). Given a sorted +``d_data`` array and a batch of ``d_values`` to locate, cuda.compute: + + - ``cuda.compute.lower_bound(d_data, num_items, d_values, num_values, d_out)`` + writes, for each value, the lowest index where it could be inserted + into d_data without breaking the sort order. Matches + ``numpy.searchsorted(..., side="left")``. + + - ``cuda.compute.upper_bound(d_data, num_items, d_values, num_values, d_out)`` + is the analogous upper form, matching ``side="right"``. 
+ +The sample runs both algorithms on two curated sorted inputs (one with +duplicates, so the lower/upper distinction is visible), verifies the +results against ``numpy.searchsorted``, and prints both sets of +indices side-by-side. +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import cupy as cp + import numpy as np + from cuda_samples_utils import print_gpu_info + + import cuda.compute + from cuda.core import Device +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +def run_binary_search(h_data: np.ndarray, h_values: np.ndarray) -> bool: + d_data = cp.asarray(h_data) + d_values = cp.asarray(h_values) + + d_lb = cp.empty(len(h_values), dtype=np.uintp) + d_ub = cp.empty(len(h_values), dtype=np.uintp) + + cuda.compute.lower_bound( + d_data=d_data, + num_items=len(d_data), + d_values=d_values, + num_values=len(d_values), + d_out=d_lb, + ) + cuda.compute.upper_bound( + d_data=d_data, + num_items=len(d_data), + d_values=d_values, + num_values=len(d_values), + d_out=d_ub, + ) + + got_lb = cp.asnumpy(d_lb) + got_ub = cp.asnumpy(d_ub) + expected_lb = np.searchsorted(h_data, h_values, side="left").astype(np.uintp) + expected_ub = np.searchsorted(h_data, h_values, side="right").astype(np.uintp) + + ok_lb = np.array_equal(got_lb, expected_lb) + ok_ub = np.array_equal(got_ub, expected_ub) + + print(f" data = {h_data.tolist()}") + print(f" values = {h_values.tolist()}") + print(f" lower_bound: got {got_lb.tolist()} expected {expected_lb.tolist()} {'OK' if ok_lb else 'FAIL'}") + print(f" upper_bound: got {got_ub.tolist()} expected {expected_ub.tolist()} {'OK' if ok_ub else 'FAIL'}") + return ok_lb and ok_ub + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Parallel upper_bound / lower_bound via cuda.compute") + 
def main():
    """
    Parse the command line, select the CUDA device and run both demo cases.

    Returns:
        0 if every case matched the NumPy reference, 1 otherwise.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Parallel upper_bound / lower_bound via cuda.compute")
    cli.add_argument("--device", type=int, default=0, help="CUDA device id")
    opts = cli.parse_args()

    gpu = Device(opts.device)
    gpu.set_current()
    print_gpu_info(gpu)
    print()

    cases = (
        # Values both inside and outside the data range; no duplicates in
        # the data, so lower_bound and upper_bound agree on absent values.
        ("Case 1: distinct data, mixed queries", [1, 3, 5, 7, 9], [0, 3, 4, 10]),
        # Duplicates in the data, so the two bounds diverge on present values.
        ("Case 2: duplicates in data", [1, 3, 3, 5, 7, 9], [3, 3, 5, 8]),
    )

    all_passed = True
    for title, data, queries in cases:
        print(title)
        all_passed &= run_binary_search(
            np.array(data, dtype=np.int32),
            np.array(queries, dtype=np.int32),
        )
        print()

    if not all_passed:
        print("FAILED")
        return 1
    print("Done")
    return 0
**Block partial sum** - Shared memory reduction within each block + +## What You'll Learn + +- How to calculate global thread ID from block and thread indices +- Strided loop pattern for processing arrays larger than grid size +- Block-level cooperation using shared memory and `__syncthreads()` + +## Key Concepts + +### Thread and Block Indexing + +``` +Global Thread ID = blockIdx.x * blockDim.x + threadIdx.x +Stride = blockDim.x * gridDim.x +``` + +### Strided Loop Pattern + +Each thread processes multiple elements, enabling fixed grid size for arbitrary array lengths: + +```c +for (size_t i = tid; i < N; i += stride) { + output[i] = input[i] * 2.0f; +} +``` + +## Key APIs + +### From `cuda.core`: + +- `Device` - Device management and context +- `Program` - Compile CUDA C++ kernels +- `ProgramOptions` - Kernel compilation options (architecture target) +- `LaunchConfig` - Configure grid/block dimensions and shared memory +- `launch()` - Execute kernel +- `EventOptions` - GPU timing configuration + +### From CuPy: + +- `cp.asarray()` - Transfer data to GPU +- `cp.zeros_like()` - Allocate GPU arrays + +## Requirements + +### Hardware: + +- NVIDIA GPU with CUDA support + +### Software: + +- CUDA Toolkit 13.0 or newer +- Python 3.10 or newer +- See `requirements.txt` for Python packages + +## Installation + +```bash +pip install -r requirements.txt +``` + +## How to Run + +```bash +python blockwiseSum.py +``` + +## Expected Output + +``` +Device: +Compute Capability: sm_XX +Array size: 1,048,576 elements + +Simple indexing: Test PASSED +Strided loop: Test PASSED +Block-wise sum: Test PASSED + +Kernel time: X.XXX ms, Bandwidth: XXX.X GB/s + +Done +``` + +## Files + +- `blockwiseSum.py` - Python implementation with CUDA kernels +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## See Also + +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [CUDA Shared 
Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) +- [CuPy Documentation](https://docs.cupy.dev/) diff --git a/samples/blockwiseSum/blockwiseSum.py b/samples/blockwiseSum/blockwiseSum.py new file mode 100644 index 00000000000..b61bfa00d1b --- /dev/null +++ b/samples/blockwiseSum/blockwiseSum.py @@ -0,0 +1,256 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0", "numpy>=2.3.2"] +# /// + +""" +Block-wise Array Sum with Threaded Access + +Demonstrates thread/block indexing, strided loops, and block-wise reduction. + +Key Concepts: + Global Thread ID = blockIdx.x * blockDim.x + threadIdx.x + Stride = blockDim.x * gridDim.x +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import verify_array_result + +try: + import cupy as cp + import numpy as np + + from cuda.core import ( + Device, + EventOptions, + LaunchConfig, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Install with: pip install -r requirements.txt") + sys.exit(1) + + +KERNELS_CODE: str = r""" +// Each thread processes one element +extern "C" __global__ +void simple_indexing(const float* input, float* output, size_t N) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < N) { + output[tid] = input[tid] * 2.0f; + } +} + +// Each thread processes multiple elements via strided access +extern "C" __global__ +void strided_loop(const float* input, float* output, size_t N) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)blockDim.x * gridDim.x; + for (size_t i = tid; i < N; i += stride) { + output[i] = input[i] * 2.0f; + } +} + +// Block-wise partial sum with shared memory reduction +extern "C" __global__ +void block_partial_sum(const float* input, float* partial_sums, size_t N) { + extern __shared__ float sdata[]; + + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int local_tid = threadIdx.x; + size_t stride = (size_t)blockDim.x * gridDim.x; + + // Each thread accumulates multiple elements (strided) + float sum = 0.0f; + for (size_t i = tid; i < N; i += stride) { + sum += input[i]; + } + sdata[local_tid] = sum; + __syncthreads(); + + // 
def _expected_block_sums(num_elements: int, num_blocks: int, threads_per_block: int) -> np.ndarray:
    """
    Return the partial sums ``block_partial_sum`` produces for an all-ones input.

    Thread ``t`` of the grid visits elements ``t, t + stride, t + 2*stride, ...``
    below ``num_elements`` (``stride = num_blocks * threads_per_block``), and
    each block adds up the counts of its own threads. Blocks therefore only
    receive equal shares when ``num_elements`` is a multiple of ``stride``.

    Returns:
        float32 array of length ``num_blocks``.
    """
    stride = num_blocks * threads_per_block
    thread_ids = np.arange(stride, dtype=np.int64)
    # ceil((num_elements - t) / stride); evaluates to 0 once t >= num_elements.
    visits_per_thread = (num_elements - thread_ids + stride - 1) // stride
    per_block = visits_per_thread.reshape(num_blocks, threads_per_block).sum(axis=1)
    return per_block.astype(np.float32)


def run_sample(num_elements: int = 1024 * 1024, device_id: int = 0) -> bool:
    """
    Run block-wise sum demonstration.

    Compiles the three kernels in ``KERNELS_CODE``, runs each once, checks
    the results with ``verify_array_result`` and finally times the
    block-wise sum kernel.

    Parameters
    ----------
    num_elements : int
        Number of array elements. Must be positive; it does not have to be
        a multiple of the grid size.
    device_id : int
        CUDA device ID

    Returns
    -------
    bool
        True if all tests passed

    Raises
    ------
    ValueError
        If ``num_elements`` is not positive.
    """
    # Validate before touching the device; a zero-sized grid cannot be launched.
    if num_elements <= 0:
        raise ValueError(f"num_elements must be positive, got {num_elements}")

    threads_per_block = 256
    num_blocks = 64

    device = Device(device_id)
    device.set_current()
    stream = device.create_stream()

    arch = f"sm_{device.arch}"
    print(f"Device: {device.name}")
    print(f"Compute Capability: {arch}")
    print(f"Array size: {num_elements:,} elements\n")

    try:
        # Make CuPy use our stream
        cp.cuda.Stream.from_external(stream).use()

        # Compile kernels
        program = Program(KERNELS_CODE, code_type="c++", options=ProgramOptions(arch=arch))
        module = program.compile(target_type="cubin")
        kernel_simple = module.get_kernel("simple_indexing")
        kernel_strided = module.get_kernel("strided_loop")
        kernel_sum = module.get_kernel("block_partial_sum")

        # Test data
        h_input = np.arange(num_elements, dtype=np.float32)
        d_input = cp.asarray(h_input)
        d_output = cp.zeros_like(d_input)
        expected = cp.asarray(h_input * 2.0)

        # Demo 1: Simple indexing (1 thread = 1 element)
        full_blocks = (num_elements + threads_per_block - 1) // threads_per_block
        config = LaunchConfig(grid=full_blocks, block=threads_per_block)
        launch(
            stream,
            config,
            kernel_simple,
            d_input.data.ptr,
            d_output.data.ptr,
            cp.uint64(num_elements),
        )
        stream.sync()
        print("Simple indexing: ", end="")
        test1 = verify_array_result(d_output, expected)

        # Demo 2: Strided loop (threads process multiple elements)
        d_output.fill(0)
        config = LaunchConfig(grid=num_blocks, block=threads_per_block)
        launch(
            stream,
            config,
            kernel_strided,
            d_input.data.ptr,
            d_output.data.ptr,
            cp.uint64(num_elements),
        )
        stream.sync()
        print("Strided loop: ", end="")
        test2 = verify_array_result(d_output, expected)

        # Demo 3: Block-wise sum with shared memory
        d_ones = cp.ones(num_elements, dtype=cp.float32)
        d_partial = cp.zeros(num_blocks, dtype=cp.float32)
        # One float of shared memory per thread in the block.
        shared_mem = threads_per_block * 4

        config = LaunchConfig(grid=num_blocks, block=threads_per_block, shmem_size=shared_mem)
        launch(
            stream,
            config,
            kernel_sum,
            d_ones.data.ptr,
            d_partial.data.ptr,
            cp.uint64(num_elements),
        )
        stream.sync()

        # The strided loop hands elements out per thread across the whole
        # grid, so the reference is derived from the access pattern rather
        # than assumed to be num_elements / num_blocks.
        expected_partial = cp.asarray(_expected_block_sums(num_elements, num_blocks, threads_per_block))
        print("Block-wise sum: ", end="")
        test3 = verify_array_result(d_partial, expected_partial)

        # Performance timing
        event_opts = EventOptions(timing_enabled=True)
        iterations = 100

        stream.sync()
        start = stream.record(options=event_opts)
        for _ in range(iterations):
            launch(
                stream,
                config,
                kernel_sum,
                d_ones.data.ptr,
                d_partial.data.ptr,
                cp.uint64(num_elements),
            )
        end = stream.record(options=event_opts)
        end.sync()

        # bytes / (ms * 1e6) == GB/s; the kernel reads num_elements float32 values.
        time_ms = (end - start) / iterations
        bandwidth = (num_elements * 4) / (time_ms * 1e6)
        print(f"\nKernel time: {time_ms:.3f} ms, Bandwidth: {bandwidth:.1f} GB/s")

        return test1 and test2 and test3

    finally:
        # Explicit resource cleanup
        cp.cuda.Stream.null.use()
        stream.close()
def main() -> None:
    """Run the sample with default settings; exit with status 1 if a check failed."""
    if not run_sample():
        print("\nSome tests failed")
        sys.exit(1)
    print("\nDone")
100644 index 00000000000..e70c86353b6 --- /dev/null +++ b/samples/blockwiseSum/requirements.txt @@ -0,0 +1,6 @@ +# Block-wise Array Sum Requirements + +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 +numpy>=2.3.2 diff --git a/samples/blurImageUnifiedMemory/README.md b/samples/blurImageUnifiedMemory/README.md new file mode 100644 index 00000000000..93bdd26d233 --- /dev/null +++ b/samples/blurImageUnifiedMemory/README.md @@ -0,0 +1,177 @@ +# Sample: Image Blur with Unified Memory (Python) + +## Description + +Blur images on GPU using modern `cuda.core` APIs for kernel compilation, execution, and memory management. This sample demonstrates **zero-copy data sharing** between CPU and GPU using unified (managed) memory. + +## What You'll Learn + +- Compiling CUDA kernels at runtime with `cuda.core.Program` +- Launching kernels with `cuda.core.launch` and `LaunchConfig` +- Using unified memory with `cuda.core.ManagedMemoryResource` +- **Zero-copy CPU access** to unified memory via `np.from_dlpack()` +- Seamless CPU/GPU memory access without explicit transfers + +## Key Concepts + +### Kernel Compilation with cuda.core.Program + +```python +# Compile CUDA C++ kernel at runtime +program = Program(KERNEL_CODE, code_type="c++", options=options) +compiled = program.compile(target_type="cubin") +kernel = compiled.get_kernel("box_blur_3x3") +``` + +### Kernel Launch with cuda.core.launch + +```python +# Configure and launch kernel +config = LaunchConfig(grid=grid_size, block=block_size) + +# Buffers can be passed directly as kernel arguments +launch(stream, config, kernel, src_buf, dst_buf, H, W) +``` + +### Unified Memory (Managed Memory) + +This sample uses `ManagedMemoryResource` for simplicity: a single allocation is accessible from both CPU and GPU without explicit transfers. For performance-critical workloads, consider `LegacyPinnedMemoryResource` + `DeviceMemoryResource` instead, which gives explicit control over host/device placement and transfer costs. 
+ +Unified memory is accessible from both CPU and GPU without explicit data transfers: + +```python +# Allocate unified memory +options = ManagedMemoryResourceOptions(preferred_location=device.device_id) +mr = ManagedMemoryResource(options) +src_buf = mr.allocate(n_bytes, stream) +dst_buf = mr.allocate(n_bytes, stream) +try: + # Synchronize to ensure allocations are complete before CPU access + stream.sync() + + # Create numpy views of unified memory using DLPack protocol (zero-copy) + src_np = np.from_dlpack(src_buf).view(np.float32).reshape(H, W) + dst_np = np.from_dlpack(dst_buf).view(np.float32).reshape(H, W) + + # CPU writes directly to unified memory + src_np[:] = input_data + + # Launch kernel - buffers can be passed directly as arguments + launch(stream, config, kernel, src_buf, dst_buf, H, W) + stream.sync() + + # Return zero-copy view; caller must close buffers when done + return dst_np, src_buf, dst_buf +except Exception: + src_buf.close() + dst_buf.close() + raise +``` + +When returning a zero-copy view, the caller must close the buffers after use (e.g., in a `try/finally` block) to avoid leaking managed memory. 
+ +## Key APIs + +### From `cuda.core`: + +- `Device` - CUDA device management +- `Program` - Runtime kernel compilation (NVRTC) +- `ProgramOptions` - Compilation options (architecture target) +- `LaunchConfig` - Kernel launch configuration (grid/block dimensions) +- `launch` - Execute compiled kernel +- `ManagedMemoryResource` - Unified memory allocation + +### Zero-Copy Techniques: + +- `np.from_dlpack(buffer)` - Create numpy view of unified memory using DLPack protocol +- Pass `buffer` directly to `launch()` as kernel arguments +- When returning a zero-copy view, return `(view, src_buf, dst_buf)` and have the caller close buffers in `try/finally` after use + +## Kernel Techniques + +- **2D Thread Mapping** - Each thread computes one output pixel +- **Stencil Pattern** - Read neighboring pixels (3x3 neighborhood) +- **Boundary Handling** - Out-of-bounds neighbors are skipped; border pixels average only their in-bounds neighbors +- **Box Filter** - 3x3 averaging for blur effect + +## Requirements + +### Hardware: + +- NVIDIA GPU with CUDA support +- Minimum GPU memory: 256 MB + +### Software: + +- CUDA Toolkit 13.0 or newer +- Python 3.10 or newer +- `cuda-python` package (13.0.0+) +- `cuda-core` package (>=1.0.0) +- `numpy` package (>=2.3.2) +- `pillow` package (10.0.0+) + +### Platform Support: + +This sample relies on `ManagedMemoryResource` with **concurrent host access** +to managed allocations while GPU kernels are in flight. That behavior +requires the device property `concurrent_managed_access=True`, which is only +supported on Linux with HMM (Pascal and newer). On Windows (WDDM/MCDM/TCC) +the property is `False`, so the sample exits early with a waive message and +exit code `2` instead of attempting a run that would crash the process.
+ +## Installation + +```bash +cd /path/to/cuda-samples/python/1_GettingStarted/blurImageUnifiedMemory +pip install -r requirements.txt +``` + +## How to Run + +```bash +python blurImageUnifiedMemory.py +``` + +## Expected Output + +``` +============================================================ +Image Blur with Unified Memory (cuda.core) +============================================================ + +Device: +Compute Capability: sm_ + +Compiling CUDA kernel with cuda.core.Program... + Compiled for architecture: sm_ + +Image size: 256x256 grayscale +Creating sample image... +Blurring image on GPU... + +Saving results... + Saved: original_image.png + Saved: blurred_image.png + +Verifying result... + Test PASSED + Max difference from original: +``` + +## Output Files + +- `original_image.png` - Test pattern image before blur +- `blurred_image.png` - Image after 3x3 box blur + +## Files + +- `blurImageUnifiedMemory.py` - Python implementation using cuda.core +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## See Also + +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [cuda.core.Program](https://nvidia.github.io/cuda-python/cuda-core/latest/generated/cuda.core.Program.html) +- [cuda.core.ManagedMemoryResource](https://nvidia.github.io/cuda-python/cuda-core/latest/generated/cuda.core.ManagedMemoryResource.html) +- [CUDA Managed Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd) diff --git a/samples/blurImageUnifiedMemory/blurImageUnifiedMemory.py b/samples/blurImageUnifiedMemory/blurImageUnifiedMemory.py new file mode 100644 index 00000000000..062a966552c --- /dev/null +++ b/samples/blurImageUnifiedMemory/blurImageUnifiedMemory.py @@ -0,0 +1,278 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "numpy>=2.3.2", "pillow>=10.0.0"] +# /// + +""" +Image Blur with Unified Memory using cuda.core + +Demonstrates GPU image blurring using cuda.core APIs for kernel compilation, +launch, and unified memory allocation. 
def make_test_image(h: int, w: int, dtype=np.uint8) -> np.ndarray:
    """
    Build a synthetic grayscale test pattern.

    The pattern is painted in three passes, each overwriting the previous:
    horizontal bands (value 255), vertical bands (value 128), and a filled
    disc (value 200) centred in the image.

    Args:
        h: Image height in pixels.
        w: Image width in pixels.
        dtype: NumPy dtype of the returned array.

    Returns:
        A C-contiguous array of shape ``(h, w)``.
    """
    row_idx = np.arange(h)
    col_idx = np.arange(w)
    canvas = np.zeros((h, w), dtype=dtype)

    # Bands repeat every 50 pixels and are 25 pixels thick.
    canvas[row_idx % 50 < 25, :] = 255
    canvas[:, col_idx % 50 < 25] = 128

    # Filled disc whose radius is one sixth of the shorter side.
    mid_y = h // 2
    mid_x = w // 2
    radius_sq = (min(h, w) // 6) ** 2
    dist_sq = (col_idx[np.newaxis, :] - mid_x) ** 2 + (row_idx[:, np.newaxis] - mid_y) ** 2
    canvas[dist_sq <= radius_sq] = 200

    return np.ascontiguousarray(canvas)
def blur_image_unified_memory(host_np: np.ndarray, device: Device, stream, kernel) -> tuple[np.ndarray, object, object]:
    """
    Blur image on GPU using unified memory with cuda.core.

    This function demonstrates:
    1. Allocate managed memory using ManagedMemoryResource
    2. Create zero-copy numpy views using np.from_dlpack()
    3. Launch kernel via cuda.core.launch

    Args:
        host_np: 2-D NumPy array containing image data on CPU. Values are
            converted to float32 and divided by 255 before upload.
        device: CUDA device to use
        stream: cuda.core Stream for async operations
        kernel: Compiled cuda.core Kernel object

    Returns:
        Tuple of (dst_np, src_buf, dst_buf). dst_np is a zero-copy view into
        unified memory. Caller must close src_buf and dst_buf when done with
        dst_np to avoid leaking managed memory.

    Raises:
        Any exception raised during allocation, upload or launch is
        re-raised after every buffer allocated so far has been closed.
    """
    H, W = host_np.shape
    n_bytes = H * W * np.dtype(np.float32).itemsize

    # Create managed memory resource for unified memory allocation
    options = ManagedMemoryResourceOptions(preferred_location=device.device_id)
    mr = ManagedMemoryResource(options)

    # Track buffers as they are allocated so that a failure at any point,
    # including the second allocation itself, releases what already exists.
    allocated = []
    try:
        # Allocate unified memory buffers for source and destination images
        src_buf = mr.allocate(n_bytes, stream=stream)
        allocated.append(src_buf)
        dst_buf = mr.allocate(n_bytes, stream=stream)
        allocated.append(dst_buf)

        # Synchronize to ensure allocations are complete before CPU access
        stream.sync()

        # Create numpy views of unified memory using DLPack protocol (zero-copy)
        src_np = np.from_dlpack(src_buf).view(np.float32).reshape(H, W)
        dst_np = np.from_dlpack(dst_buf).view(np.float32).reshape(H, W)

        # Write input data to unified memory (CPU can access directly)
        src_np[:] = host_np.astype(np.float32) / 255.0

        # Configure kernel launch parameters: one thread per output pixel,
        # grid rounded up so partial tiles at the right/bottom are covered.
        block_size = (16, 16)
        grid_size = (
            (W + block_size[0] - 1) // block_size[0],
            (H + block_size[1] - 1) // block_size[1],
        )

        # Create LaunchConfig for kernel execution
        config = LaunchConfig(grid=grid_size, block=block_size)

        # Launch kernel - buffers can be passed directly as kernel arguments
        launch(
            stream,
            config,
            kernel,
            src_buf,
            dst_buf,
            np.int32(H),
            np.int32(W),
        )

        # Synchronize to ensure kernel completion before reading results
        stream.sync()

        # Return zero-copy view; caller closes buffers when done
        return (dst_np, src_buf, dst_buf)
    except BaseException:
        # BaseException so that Ctrl-C during the run does not leak either.
        for buf in allocated:
            buf.close()
        raise
def main():
    """
    Complete demonstration of GPU image blurring with cuda.core.

    This example shows:
    1. Device initialization with cuda.core.Device
    2. Kernel compilation with cuda.core.Program
    3. Unified memory with cuda.core.ManagedMemoryResource
    4. Kernel launch with cuda.core.launch and LaunchConfig

    Exit codes:
        1 if the blurred image does not differ from the original,
        2 if the sample is waived because the platform or device does not
        support concurrent managed access.
    """
    if sys.platform == "win32":
        print(
            "This sample relies on ManagedMemoryResource with concurrent host "
            "access, which is not supported on Windows "
            "(concurrent_managed_access=False). Waiving this sample."
        )
        sys.exit(2)

    print("=" * 60)
    print("Image Blur with Unified Memory (cuda.core)")
    print("=" * 60)

    # Initialize CUDA device
    device = Device(0)
    device.set_current()

    print(f"\nDevice: {device.name}")
    print(f"Compute Capability: sm_{device.arch}")

    # The OS check above is only a proxy: the actual requirement is the
    # device property itself, which can also be False on non-Windows
    # configurations. Read it defensively and waive in the same way.
    # NOTE(review): assumes cuda.core exposes this as
    # device.properties.concurrent_managed_access - confirm.
    if not getattr(device.properties, "concurrent_managed_access", True):
        print(
            "This sample relies on ManagedMemoryResource with concurrent host "
            "access, which this device does not support "
            "(concurrent_managed_access=False). Waiving this sample."
        )
        sys.exit(2)

    # Create stream for async operations
    stream = device.create_stream()
    try:
        # Compile kernel using cuda.core.Program
        print("\nCompiling CUDA kernel with cuda.core.Program...")
        arch = f"sm_{device.arch}"
        options = ProgramOptions(arch=arch)
        program = Program(BOX_BLUR_KERNEL_CODE, code_type="c++", options=options)
        compiled = program.compile(target_type="cubin")
        kernel = compiled.get_kernel("box_blur_3x3")
        print(f" Compiled for architecture: {arch}")

        # Image parameters
        H, W = 256, 256
        print(f"\nImage size: {H}x{W} grayscale")

        # Create test image
        print("Creating sample image...")
        host_np = make_test_image(H, W, dtype=np.uint8)

        # Blur image on GPU using cuda.core (returns zero-copy view + buffers)
        print("Blurring image on GPU...")
        blurred_result, src_buf, dst_buf = blur_image_unified_memory(host_np, device, stream, kernel)
        try:
            # Save images (use zero-copy view before releasing buffers)
            print("\nSaving results...")
            original_pil = Image.fromarray(host_np, mode="L")
            original_pil.save("original_image.png")
            print(" Saved: original_image.png")

            # The kernel works on floats in [0, 1]; scale back to 8-bit.
            blurred_uint8 = (np.clip(blurred_result, 0, 1) * 255).astype(np.uint8)
            blurred_pil = Image.fromarray(blurred_uint8, mode="L")
            blurred_pil.save("blurred_image.png")
            print(" Saved: blurred_image.png")

            # Verify blur was applied: the output must differ from the input.
            print("\nVerifying result...")
            original_float = host_np.astype(np.float32) / 255.0
            max_diff = np.max(np.abs(blurred_result - original_float))
            blur_applied = max_diff > 0.01

            if blur_applied:
                print(" Test PASSED")
            else:
                print(" Test FAILED - blur not applied")
                # The enclosing finally blocks still release buffers and stream.
                sys.exit(1)

            print(f" Max difference from original: {max_diff:.4f}")
        finally:
            src_buf.close()
            dst_buf.close()
    finally:
        stream.close()
(Python) + +## Description + +Copy image arrays between CPU and GPU memory using the modern `cuda.core` API with optimal performance through pinned memory and asynchronous transfers. + +## What You'll Learn + +- How to use pinned memory for faster CPU↔GPU transfers +- Using the `cuda.core` API for memory management +- Working with DLPack for zero-copy array views +- Performing asynchronous memory transfers with CUDA streams +- Interoperability between CUDA Core API and CuPy +- Proper CUDA resource management and cleanup + +## Key Libraries + +- `cuda.core` - Modern CUDA Python API +- `numpy` - Array operations and DLPack support +- `cupy` - GPU array operations and CUDA interoperability + +## Key APIs + +### From `cuda.core`: + +- `Device()` - Initialize and access CUDA device +- `Device.set_current()` - Set the current device for API calls +- `Device.create_stream()` - Create CUDA stream for async operations +- `Device.memory_resource` - Access device memory allocator +- `PinnedMemoryResource()` - Allocate pinned host memory +- `buffer.copy_to()` - Copy data between memory spaces +- `buffer.close()` - Release allocated memory + +### From `numpy`: + +- `np.from_dlpack()` - Create array view from DLPack capsule +- `np.copyto()` - Copy data between arrays + +### From `cupy`: + +- `cp.from_dlpack()` - Create GPU array view from DLPack capsule +- `cp.cuda.Stream.from_external()` - Use external CUDA stream + +### From `cuda_samples_utils`: + +- `verify_array_result()` - Verify computation results + +## Requirements + +### Hardware: + +- NVIDIA GPU with CUDA support +- Sufficient GPU memory for image data (sample uses ~200KB for 256×256×3 image) + +### Software: + +- CUDA Toolkit 13.0 or newer +- Python 3.10 or newer +- NumPy 2.3.2 or newer (required for DLPack support) +- `cuda-python` package (>=13.0.0) +- `cuda-core` package (>=1.0.0) +- `cupy-cuda13x` package (14.0.0+) + +## Installation + +Install the required packages from requirements.txt: + +```bash +cd
/path/to/cuda-samples/python/1_GettingStarted/copyImageArraytoGPU +pip install -r requirements.txt +``` + +The requirements.txt installs: +- `numpy` (2.3.2+, required for DLPack) +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (14.0.0+) + +## How to Run + +### Basic usage: + +```bash +cd samples/python/1_GettingStarted/copyImageArraytoGPU +python copyImageArraytoGPU.py +``` + +## Expected Output + +``` +[Image Array Copy to GPU using CUDA Core API] +Device: NVIDIA GeForce RTX 4090 +[Image array copy of 256x256x3 image] +Creating sample image... +Copying image to GPU... +Creating CuPy view of GPU data... +Mean pixel value (computed on GPU): 127.50 +Copying image back from GPU... +Verifying result... +Test PASSED + +Done +``` + +**Note:** Device name will vary based on your GPU. + +## Files + +- `copyImageArraytoGPU.py` - Python implementation using cuda.core API +- `README.md` - This file +- `requirements.txt` - Sample dependencies +- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample) + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [cuda.core API Guide](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [DLPack Specification](https://dmlc.github.io/dlpack/latest/) +- [CuPy Documentation](https://docs.cupy.dev/) diff --git a/samples/copyImageArraytoGPU/copyImageArraytoGPU.py b/samples/copyImageArraytoGPU/copyImageArraytoGPU.py new file mode 100644 index 00000000000..dd8d3da15fc --- /dev/null +++ b/samples/copyImageArraytoGPU/copyImageArraytoGPU.py @@ -0,0 +1,242 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["numpy>=2.3.2", "cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0"] +# /// + +""" +Image Array Copy to GPU using CUDA Core API + +This sample demonstrates how to copy image arrays between CPU and GPU memory +using NVIDIA's CUDA Core Python API with optimal performance. 
+""" + +import sys +from pathlib import Path + +# Add parent directory to path to import utilities +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import verify_array_result + +try: + import cupy as cp + import numpy as np + + from cuda.core import Buffer, Device, PinnedMemoryResource, Stream +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +# ----------------------------- Helper Functions ------------------------------ + + +def make_random_image(h: int, w: int, c: int, dtype=np.uint8) -> np.ndarray: + """ + Create a random test image for demonstration. + + Args: + h: Image height in pixels + w: Image width in pixels + c: Number of channels (e.g., 3 for RGB) + dtype: NumPy data type (e.g., np.uint8 for 0-255 pixel values) + + Returns: + A contiguous NumPy array representing the image + """ + img = np.random.randint(0, 256, size=(h, w, c), dtype=dtype) + return np.ascontiguousarray(img) # Ensure memory is contiguous for GPU transfer + + +# ----------------------------- Core GPU Functions --------------------------- + + +def copy_image_to_gpu_cuda_core(host_np: np.ndarray, dev: Device, stream: Stream) -> tuple[Buffer, Buffer]: + """ + Copy image from CPU memory to GPU memory using optimal transfer method. + + This function demonstrates the recommended approach: + 1. Use pinned memory for faster transfers + 2. Use DLPack for zero-copy array views + 3. 
Perform async transfers on a CUDA stream + + Args: + host_np: NumPy array containing image data on CPU + dev: CUDA device object + stream: CUDA stream for async operations + + Returns: + Tuple of (device_buffer, pinned_buffer) - both need to be cleaned up later + """ + nbytes = host_np.nbytes # Calculate total bytes needed + + # Step 1: Set up memory resources + # Device memory resource - allocates on GPU + device_mr = dev.memory_resource + # Pinned memory resource - allocates CPU memory that GPU can access faster + pinned_mr = PinnedMemoryResource() + + # Step 2: Allocate memory buffers + pinned_buffer = pinned_mr.allocate(nbytes, stream=stream) # Fast CPU memory + device_buffer = device_mr.allocate(nbytes, stream=stream) # GPU memory + + # Step 3: Create a NumPy view of pinned memory using DLPack + # This allows us to work with pinned memory as if it's a regular NumPy array + pinned_view = np.from_dlpack(pinned_buffer).view(dtype=host_np.dtype).reshape(host_np.shape) + + # Step 4: Copy image data from regular CPU memory to pinned CPU memory + # This is a CPU-to-CPU copy, so it's very fast + np.copyto(pinned_view, host_np) + + # Step 5: Copy from pinned CPU memory to GPU memory + # This is the actual CPU-to-GPU transfer, done asynchronously + pinned_buffer.copy_to(device_buffer, stream=stream) + + return device_buffer, pinned_buffer + + +def copy_image_from_gpu_cuda_core( + device_buffer: Buffer, + shape: tuple, + dtype: type, + _dev: Device, # unused: kept for symmetry with copy_image_to_gpu_cuda_core + stream: Stream, +) -> np.ndarray: + """ + Copy image from GPU memory back to CPU memory. + + This function reverses the CPU-to-GPU transfer process: + 1. Allocate pinned CPU memory for fast transfer + 2. Copy from GPU to pinned CPU memory + 3. 
Create NumPy view and copy to regular CPU memory + + Args: + device_buffer: GPU buffer containing image data + shape: Original image shape tuple (height, width, channels) + dtype: Original image data type + _dev: CUDA device object (unused; kept for symmetry with copy_image_to_gpu_cuda_core) + stream: CUDA stream for async operations + + Returns: + NumPy array with image data copied from GPU + """ + nbytes = np.prod(shape) * np.dtype(dtype).itemsize # Calculate total bytes + + # Step 1: Create pinned memory for fast GPU-to-CPU transfer + pinned_mr = PinnedMemoryResource() + pinned_buffer = pinned_mr.allocate(nbytes, stream=stream) + + # Step 2: Copy from GPU memory to pinned CPU memory + device_buffer.copy_to(pinned_buffer, stream=stream) + stream.sync() # Wait for the GPU transfer to complete + + # Step 3: Create NumPy view of pinned memory using DLPack + pinned_view = np.from_dlpack(pinned_buffer).view(dtype=dtype).reshape(shape) + + # Step 4: Copy from pinned CPU memory to regular CPU memory + # This creates the final result that can be used normally + host_result = pinned_view.copy() + + # Step 5: Clean up the temporary pinned buffer + pinned_buffer.close(stream) + + return host_result + + +# ------------------------------ Main Demo ------------------------------------ + + +def main(): + """ + Complete demonstration of GPU image copying workflow. + + This example shows: + 1. Setting up CUDA device and stream + 2. Creating a sample image + 3. Copying image to GPU + 4. Accessing GPU data with CuPy (optional) + 5. Copying image back from GPU + 6. Verifying data integrity + 7. 
Proper cleanup of resources + """ + print("[Image Array Copy to GPU using CUDA Core API]") + + # Image parameters - modify these to test different sizes + H, W, C = 256, 256, 3 # Height=256, Width=256, Channels=3 (RGB) + dtype = np.uint8 # Standard image pixel type (0-255 values) + + # Step 1: Set up CUDA device and stream + dev = Device() # Get default CUDA device (GPU 0) + dev.set_current() # Make this device the active one + stream = dev.create_stream() # Create stream for async operations + + print(f"Device: {dev.name}") + print(f"[Image array copy of {H}x{W}x{C} image]") + + # Step 2: Configure CuPy to use our CUDA stream (for interoperability) + cp.cuda.Stream.from_external(stream).use() + + # Step 3: Create a test image on CPU + print("Creating sample image...") + host_np = make_random_image(H, W, C, dtype=dtype) + + # Step 4: Copy image from CPU to GPU + print("Copying image to GPU...") + device_buffer, pinned_buffer = copy_image_to_gpu_cuda_core(host_np, dev, stream) + + # Step 5: (Optional) Get a CuPy view of GPU data for processing + # This shows how you can work with the GPU data without copying it back + print("Creating CuPy view of GPU data...") + device_cp = cp.from_dlpack(device_buffer).view(dtype=dtype).reshape(H, W, C) + + # Example: compute mean pixel value on GPU + mean_value = float(cp.mean(device_cp)) + print(f"Mean pixel value (computed on GPU): {mean_value:.2f}") + + # Step 6: Copy image back from GPU to CPU + print("Copying image back from GPU...") + host_back = copy_image_from_gpu_cuda_core(device_buffer, host_np.shape, host_np.dtype, dev, stream) + + # Step 7: Verify that the data survived the round trip + print("Verifying result...") + host_back_cp = cp.asarray(host_back) + host_np_cp = cp.asarray(host_np) + verify_array_result(host_back_cp, host_np_cp, rtol=0, atol=0) + + # Step 8: Clean up all allocated resources + device_buffer.close(stream) # Free GPU memory + pinned_buffer.close(stream) # Free pinned CPU memory + stream.close() # 
Close CUDA stream + cp.cuda.Stream.null.use() # Reset CuPy's stream to default + + print("\nDone") + + +if __name__ == "__main__": + main() diff --git a/samples/copyImageArraytoGPU/requirements.txt b/samples/copyImageArraytoGPU/requirements.txt new file mode 100644 index 00000000000..31aed2541ce --- /dev/null +++ b/samples/copyImageArraytoGPU/requirements.txt @@ -0,0 +1,6 @@ +# Image Array Copy to GPU Sample Requirements + +numpy>=2.3.2 +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 diff --git a/samples/cudaComputeLambdas/README.md b/samples/cudaComputeLambdas/README.md new file mode 100644 index 00000000000..db0586a3a8f --- /dev/null +++ b/samples/cudaComputeLambdas/README.md @@ -0,0 +1,130 @@ +# cudaComputeLambdas (Python) + +## Description + +This sample demonstrates how **cuda.compute** (from the +`cuda-cccl` package) accepts plain Python callables, including +lambdas, as the operators that drive device-wide reductions, +transforms, and scans. Internally `cuda.compute` JIT-compiles the +callable through Numba for the GPU, so you can iterate on the +operator in pure Python and still get a fused device-wide kernel. + +The sample exercises three algorithm families: + +1. `cuda.compute.reduce_into` - sum via `lambda a, b: a + b`. +2. `cuda.compute.unary_transform` - elementwise `y = x*x + 1` via a + lambda. +3. `cuda.compute.inclusive_scan` - prefix sum over only the even + values, driven by a regular Python function as the binary + operator. 
+ +## What You'll Learn + +- Passing a Python `lambda` directly as the operator to a cuda.compute + device algorithm +- Using a regular Python `def` function for the same purpose when the + op is non-trivial +- The three core algorithm families in cuda.compute: reductions, + transforms, and scans +- How cuda.compute auto-compiles the op to LTO-IR via Numba + +## Key Libraries + +- [`cuda.compute`](https://nvidia.github.io/cccl/python.html) (from the `cuda-cccl` package) - device algorithms and JIT-compiled Python ops +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - device setup +- `cupy` - device buffers +- `numpy` - scalar init values and host-side verification + +## Key APIs + +### From `cuda.compute` + +- `cuda.compute.reduce_into(d_in, d_out, num_items, op, h_init)` - device-wide reduction +- `cuda.compute.unary_transform(d_in, d_out, num_items, op)` - elementwise unary transform +- `cuda.compute.inclusive_scan(d_in, d_out, op, init_value, num_items)` - inclusive prefix scan + +### From `cuda_samples_utils` + +- `print_gpu_info()` - print device name and compute capability + +## Requirements + +### Hardware + +- NVIDIA GPU with Compute Capability 7.0 or higher + +### Software + +- CUDA Toolkit 13.0 or newer (cuda.compute compiles ops to LTO-IR via + Numba, which needs the toolkit's `nvvm` and `libdevice`). 
+- Python 3.10 or newer +- `cuda-cccl` (>=1.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) +- `numba-cuda` (pulled in transitively by `cuda-cccl`) + +If the CUDA toolkit is not on your `PATH`, set `CUDA_HOME` so Numba +can locate `libdevice`: + +```bash +export CUDA_HOME=/usr/local/cuda +``` + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/cudaComputeLambdas +pip install -r requirements.txt +``` + +The `requirements.txt` installs: + +- `cuda-cccl` (>=1.0.0) - ships the `cuda.compute` module +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) +- `numpy` (>=1.24.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/2_CoreConcepts/cudaComputeLambdas +python cudaComputeLambdas.py +``` + +### With custom parameters + +```bash +python cudaComputeLambdas.py --device 1 +``` + +## Expected Output + +``` +Device: +Compute Capability: + +reduce_into(lambda a,b: a+b) over 1..10 -> 55 (expected 55) OK + +unary_transform(lambda x: x*x + 1): + got = [1, 2, 5, 10, 17, 26, 37, 50] + expected = [1, 2, 5, 10, 17, 26, 37, 50] OK + +inclusive_scan(add-evens-only) over [1,2,3,4,5,6]: + got = [0, 2, 2, 6, 6, 12] + expected = [0, 2, 2, 6, 6, 12] OK + +Done +``` + +**Note:** Device name and compute capability will vary based on your GPU. + +## Files + +- `cudaComputeLambdas.py` - Python implementation +- `README.md` - This file +- `requirements.txt` - Sample dependencies +- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample) diff --git a/samples/cudaComputeLambdas/cudaComputeLambdas.py b/samples/cudaComputeLambdas/cudaComputeLambdas.py new file mode 100644 index 00000000000..5137adbd2c5 --- /dev/null +++ b/samples/cudaComputeLambdas/cudaComputeLambdas.py @@ -0,0 +1,179 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-cccl[cu13]>=1.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0", "numpy>=1.24.0"] +# /// + +""" +cuda.compute: Python lambdas as device-wide operators + +This sample demonstrates how cuda.compute 1.0 (from the cuda-cccl +package) accepts plain Python callables, including lambdas, as the +operators that drive device-wide reductions, transforms, and scans. 
+Internally cuda.compute JIT-compiles the callable with Numba for the +device, so you can iterate on the operator in pure Python and still +get a fused GPU kernel. + +The sample exercises three algorithm families with Python lambdas / +regular functions: + + 1. cuda.compute.reduce_into - sum via a lambda. + 2. cuda.compute.unary_transform - elementwise y = x*x + 1 via a lambda. + 3. cuda.compute.inclusive_scan - prefix sum over only the even values, + using a regular Python function as the binary operator. +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import cupy as cp + import numpy as np + from cuda_samples_utils import print_gpu_info + + import cuda.compute + from cuda.core import Device +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +def demo_reduce_lambda() -> bool: + """reduce_into driven by a lambda.""" + dtype = np.int32 + h_init = np.array([0], dtype=dtype) + d_in = cp.arange(1, 11, dtype=dtype) # 1..10 + d_out = cp.empty(1, dtype=dtype) + + cuda.compute.reduce_into( + d_in=d_in, + d_out=d_out, + num_items=int(d_in.size), + op=lambda a, b: a + b, + h_init=h_init, + ) + + got = int(d_out.get()[0]) + expected = int(d_in.get().sum()) + ok = got == expected + print(f"reduce_into(lambda a,b: a+b) over 1..10 -> {got} (expected {expected}) {'OK' if ok else 'FAIL'}") + return ok + + +def demo_unary_transform_lambda() -> bool: + """unary_transform driven by a lambda: y = x*x + 1.""" + d_in = cp.arange(8, dtype=cp.int32) + d_out = cp.empty_like(d_in) + + cuda.compute.unary_transform( + d_in=d_in, + d_out=d_out, + num_items=int(d_in.size), + op=lambda x: x * x + 1, + ) + + got = d_out.get() + expected = (d_in.get().astype(np.int64) ** 2 + 1).astype(np.int32) + ok = np.array_equal(got, expected) + print( + f"unary_transform(lambda x: x*x + 1):\n" + f" 
got = {got.tolist()}\n" + f" expected = {expected.tolist()} {'OK' if ok else 'FAIL'}" + ) + return ok + + +def demo_scan_custom_op() -> bool: + """inclusive_scan with a Python function that sums only even values. + + This shows the same pattern that also works for reduce/transform: + the Python callable is JIT-compiled for the device by cuda.compute. + """ + dtype = np.int32 + d_in = cp.array([1, 2, 3, 4, 5, 6], dtype=dtype) + d_out = cp.empty_like(d_in) + h_init = np.array([0], dtype=dtype) + + def add_evens(a, b): + # Treat odd operands as zero; scan accumulates only even values. + return (a if a % 2 == 0 else 0) + (b if b % 2 == 0 else 0) + + cuda.compute.inclusive_scan( + d_in=d_in, + d_out=d_out, + op=add_evens, + init_value=h_init, + num_items=int(d_in.size), + ) + + got = d_out.get() + # Host reference: running sum of even-only projection of the input. + h_in = d_in.get() + proj = np.where(h_in % 2 == 0, h_in, 0) + expected = np.cumsum(proj).astype(dtype) + ok = np.array_equal(got, expected) + print( + f"inclusive_scan(add-evens-only) over [1,2,3,4,5,6]:\n" + f" got = {got.tolist()}\n" + f" expected = {expected.tolist()} {'OK' if ok else 'FAIL'}" + ) + return ok + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Drive cuda.compute device algorithms with Python lambdas / callables") + parser.add_argument("--device", type=int, default=0, help="CUDA device id") + args = parser.parse_args() + + device = Device(args.device) + device.set_current() + print_gpu_info(device) + print() + + ok = True + ok &= demo_reduce_lambda() + print() + ok &= demo_unary_transform_lambda() + print() + ok &= demo_scan_custom_op() + + print() + if ok: + print("Done") + return 0 + print("FAILED") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/cudaComputeLambdas/requirements.txt b/samples/cudaComputeLambdas/requirements.txt new file mode 100644 index 00000000000..3110a76e934 --- /dev/null +++ 
b/samples/cudaComputeLambdas/requirements.txt @@ -0,0 +1,4 @@ +cuda-cccl[cu13]>=1.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 +numpy>=1.24.0 diff --git a/samples/cudaGraphs/README.md b/samples/cudaGraphs/README.md new file mode 100644 index 00000000000..60b8c5bdc78 --- /dev/null +++ b/samples/cudaGraphs/README.md @@ -0,0 +1,140 @@ +# cudaGraphs (Python) + +## Description + +This sample demonstrates how to capture a multi-stage kernel pipeline as a +CUDA graph with `cuda.core` and replay it with a single driver call. + +The sample runs a three-stage elementwise pipeline +`r3 = (a + b) * c - a` in two modes: + +1. **Individual launches** - one `launch(stream, ...)` per stage, repeated + for every iteration of the pipeline. +2. **CUDA graph replay** - the same three launches are recorded into a + `Graph` once and replayed with `graph.launch(stream)` on each + iteration. + +Both paths are timed over N iterations and their results are verified +against a reference computation. The sample also re-launches the graph +after mutating the input buffers to show that the graph captures +pointers (not data), so the same graph can process new inputs without +rebuilding. 
+ +## What You'll Learn + +- Creating a `GraphBuilder` from a stream with `stream.create_graph_builder()` +- Capturing launches with `begin_building()` and `end_building()` +- Completing a graph with `builder.complete()` and uploading it to a stream +- Replaying the graph with `graph.launch(stream)` +- Measuring the launch-overhead savings for small kernels +- Re-running the same graph against updated input data + +## Key Libraries + +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - Pythonic access to CUDA runtime, programs, and graphs +- `cupy` - input buffers and result verification +- `numpy` - scalar kernel arguments + +## Key APIs + +### From `cuda.core` + +- `Stream.create_graph_builder()` - obtain a `GraphBuilder` +- `GraphBuilder.begin_building()` / `end_building()` - begin and finish recording launches issued against the builder +- `GraphBuilder.complete()` - produce an executable `Graph` +- `Graph.upload(stream)` - upload the graph structure to the device +- `Graph.launch(stream)` - replay the entire graph +- `launch(graph_builder, config, kernel, ...)` - record a kernel launch into the graph being built + +### From `cuda_samples_utils` + +- `print_gpu_info()` - print device name and compute capability + +## Requirements + +### Hardware + +- NVIDIA GPU with Compute Capability 7.0 or higher +- Minimum GPU memory: 512 MB + +### Software + +- CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x) +- Python 3.10 or newer +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/cudaGraphs +pip install -r requirements.txt +``` + +The `requirements.txt` installs: + +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/2_CoreConcepts/cudaGraphs +python cudaGraphs.py +``` + +### With 
custom parameters + +```bash +# Larger vectors and more iterations +python cudaGraphs.py --elements 1048576 --iters 2000 + +# Use a specific GPU +python cudaGraphs.py --device 1 +``` + +Short vectors exaggerate the launch-overhead savings; larger vectors +will show the two approaches converging because per-launch overhead +becomes negligible next to kernel runtime. + +## Expected Output + +Speedup numbers vary with GPU and host CPU. + +``` +Device: +Compute Capability: + +Individual launches: 1000 iters in 0.0085s (8.49 us/iter) + +Building CUDA graph... +Graph replay: 1000 iters in 0.0034s (3.41 us/iter) +Graph speedup: 2.49x + +Graph replay on updated data verified (same graph, new buffer contents) + +Done +``` + +**Note:** Device name, compute capability, and speedup will vary based on +your GPU and host CPU. + +## Files + +- `cudaGraphs.py` - Python implementation using `cuda.core` CUDA graphs +- `README.md` - This file +- `requirements.txt` - Sample dependencies +- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample) + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [`cuda.core` graphs API](https://nvidia.github.io/cuda-python/cuda-core/latest/api.html#cuda-graphs) +- Upstream `cuda.core` example: [`cuda_graphs.py`](https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/cuda_graphs.py) +- [CUDA Graphs programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs) diff --git a/samples/cudaGraphs/cudaGraphs.py b/samples/cudaGraphs/cudaGraphs.py new file mode 100644 index 00000000000..5b6c7f58464 --- /dev/null +++ b/samples/cudaGraphs/cudaGraphs.py @@ -0,0 +1,254 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0"] +# /// + +""" +CUDA Graphs with cuda.core + +CUDA graphs let you record a DAG of operations once, then replay the entire +graph with a single driver call. For workflows that issue many small kernels +this can significantly reduce CPU-side launch overhead. + +This sample runs a three-stage elementwise pipeline (add -> multiply -> +subtract) in two modes: + + 1. Individually launched kernels on a stream. + 2. 
A single CUDA graph that captures the same three launches and is + replayed with ``graph.launch(stream)``. + +We then measure the wall-clock time of each mode across many iterations to +illustrate the graph replay advantage for short kernels, and demonstrate that +a graph can be relaunched against new data (the pointers are baked in, but +the contents of those buffers are not). +""" + +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import cupy as cp + import numpy as np + from cuda_samples_utils import print_gpu_info + + from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +PIPELINE_KERNELS = r""" +extern "C" __global__ +void vec_add(const float* A, const float* B, float* C, size_t N) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)gridDim.x * blockDim.x; + for (size_t i = tid; i < N; i += stride) C[i] = A[i] + B[i]; +} + +extern "C" __global__ +void vec_mul(const float* A, const float* B, float* C, size_t N) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)gridDim.x * blockDim.x; + for (size_t i = tid; i < N; i += stride) C[i] = A[i] * B[i]; +} + +extern "C" __global__ +void vec_sub(const float* A, const float* B, float* C, size_t N) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)gridDim.x * blockDim.x; + for (size_t i = tid; i < N; i += stride) C[i] = A[i] - B[i]; +} +""" + + +def run_pipeline_individual(stream, kernels, config, buffers, size, n_iters): + """Run the 3-stage pipeline `n_iters` times with one launch per stage.""" + add_k, mul_k, sub_k = kernels + a, b, c, r1, r2, r3 = buffers + stream.sync() + t0 = time.perf_counter() + for _ in range(n_iters): + 
launch(stream, config, add_k, a.data.ptr, b.data.ptr, r1.data.ptr, np.uint64(size)) + launch(stream, config, mul_k, r1.data.ptr, c.data.ptr, r2.data.ptr, np.uint64(size)) + launch(stream, config, sub_k, r2.data.ptr, a.data.ptr, r3.data.ptr, np.uint64(size)) + stream.sync() + return time.perf_counter() - t0 + + +def build_graph(stream, kernels, config, buffers, size): + """Capture the 3-stage pipeline into a CUDA graph and return it.""" + add_k, mul_k, sub_k = kernels + a, b, c, r1, r2, r3 = buffers + + graph_builder = stream.create_graph_builder() + graph_builder.begin_building() + launch( + graph_builder, + config, + add_k, + a.data.ptr, + b.data.ptr, + r1.data.ptr, + np.uint64(size), + ) + launch( + graph_builder, + config, + mul_k, + r1.data.ptr, + c.data.ptr, + r2.data.ptr, + np.uint64(size), + ) + launch( + graph_builder, + config, + sub_k, + r2.data.ptr, + a.data.ptr, + r3.data.ptr, + np.uint64(size), + ) + graph_builder.end_building() + graph = graph_builder.complete() + graph.upload(stream) + return graph_builder, graph + + +def run_pipeline_graph(stream, graph, n_iters): + """Launch the compiled graph `n_iters` times.""" + stream.sync() + t0 = time.perf_counter() + for _ in range(n_iters): + graph.launch(stream) + stream.sync() + return time.perf_counter() - t0 + + +def main() -> int: + import argparse + + parser = argparse.ArgumentParser(description="CUDA Graphs demo with cuda.core") + parser.add_argument( + "--elements", + type=int, + default=1 << 12, + help="Elements per vector (default: 4096 - small to emphasize launch overhead)", + ) + parser.add_argument( + "--iters", + type=int, + default=1000, + help="Number of pipeline iterations to time (default: 1000)", + ) + parser.add_argument("--device", type=int, default=0, help="CUDA device id") + args = parser.parse_args() + + device = Device(args.device) + device.set_current() + print_gpu_info(device) + + stream = device.create_stream() + # Tell CuPy to order its allocations on our stream so buffer 
initialization + # below is serialized with the kernels we launch. + cp.cuda.Stream.from_external(stream).use() + + graph_builder = graph = None + try: + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") + program = Program(PIPELINE_KERNELS, code_type="c++", options=program_options) + module = program.compile("cubin") + add_k = module.get_kernel("vec_add") + mul_k = module.get_kernel("vec_mul") + sub_k = module.get_kernel("vec_sub") + kernels = (add_k, mul_k, sub_k) + + N = args.elements + rng = cp.random.default_rng(seed=0) + a = rng.random(N, dtype=cp.float32) + b = rng.random(N, dtype=cp.float32) + c = rng.random(N, dtype=cp.float32) + r1 = cp.empty_like(a) + r2 = cp.empty_like(a) + r3 = cp.empty_like(a) + buffers = (a, b, c, r1, r2, r3) + + expected = (a + b) * c - a + + config = LaunchConfig(grid=(N + 255) // 256, block=256) + device.sync() + + # Warm up compilation/caches, then measure individual launches. + run_pipeline_individual(stream, kernels, config, buffers, N, n_iters=5) + t_individual = run_pipeline_individual(stream, kernels, config, buffers, N, n_iters=args.iters) + assert cp.allclose(r3, expected, rtol=1e-5, atol=1e-5), "Individual pipeline produced incorrect results" + print( + f"\nIndividual launches: {args.iters} iters in {t_individual:.4f}s" + f" ({t_individual * 1e6 / args.iters:.2f} us/iter)" + ) + + # Capture the same pipeline as a graph and measure the replay. 
+ print("\nBuilding CUDA graph...") + graph_builder, graph = build_graph(stream, kernels, config, buffers, N) + + run_pipeline_graph(stream, graph, n_iters=5) # warm up + t_graph = run_pipeline_graph(stream, graph, n_iters=args.iters) + assert cp.allclose(r3, expected, rtol=1e-5, atol=1e-5), "Graph pipeline produced incorrect results" + print(f"Graph replay: {args.iters} iters in {t_graph:.4f}s ({t_graph * 1e6 / args.iters:.2f} us/iter)") + if t_graph > 0: + print(f"Graph speedup: {t_individual / t_graph:.2f}x") + + # Demonstrate that the graph replays against current buffer contents. + a[:] = cp.ones(N, dtype=cp.float32) + b[:] = cp.full(N, 2.0, dtype=cp.float32) + c[:] = cp.full(N, 3.0, dtype=cp.float32) + device.sync() + # r3 = (a + b) * c - a = (1 + 2) * 3 - 1 = 8 + graph.launch(stream) + stream.sync() + assert cp.allclose(r3, 8.0), "Graph replay with new data produced wrong result" + print("\nGraph replay on updated data verified (same graph, new buffer contents)") + + print("\nDone") + return 0 + finally: + if graph is not None: + graph.close() + if graph_builder is not None: + graph_builder.close() + stream.close() + cp.cuda.Stream.null.use() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/cudaGraphs/requirements.txt b/samples/cudaGraphs/requirements.txt new file mode 100644 index 00000000000..c650cd51f17 --- /dev/null +++ b/samples/cudaGraphs/requirements.txt @@ -0,0 +1,5 @@ +# CUDA Graphs Sample Requirements + +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 diff --git a/samples/customPyTorchKernel/README.md b/samples/customPyTorchKernel/README.md new file mode 100644 index 00000000000..502b3b74071 --- /dev/null +++ b/samples/customPyTorchKernel/README.md @@ -0,0 +1,62 @@ +# Sample: PyTorch Custom GPU Operator + +## Description + +This sample demonstrates how to add a custom GPU operation to PyTorch using the `cuda.core` API. 
It implements a simple square operation (y = x²) to show the complete workflow from CUDA kernel to PyTorch integration with autograd support. + +## Requirements + +- NVIDIA GPU with Compute Capability 7.0+ +- CUDA Toolkit 13.0+ +- Python 3.10+ +- PyTorch 2.0+ +- cuda-python >= 13.0.0 +- cuda-core >= 1.0.0 + +## Installation + +```bash +cd python/3_FrameworkInterop/customPyTorchKernel +pip install -r requirements.txt +``` + +**Windows users:** The default `torch` wheel on PyPI for Windows is CPU-only and will cause `torch.cuda.is_available()` to return `False`. Install a CUDA-enabled build from PyTorch's wheel index *before* running the command above (if a CPU-only build is already installed, add `--force-reinstall` to replace it): + +```bash +pip install torch --index-url https://download.pytorch.org/whl/cu128 +``` + +Replace `cu128` with the wheel suffix matching your installed CUDA driver (e.g. `cu121`, `cu124`, `cu126`, `cu128`). The driver's CUDA version must be >= the wheel's bundled runtime. + +## How to Run + +```bash +# Basic usage +python customPyTorchKernel.py + +# Test with more elements +python customPyTorchKernel.py --size 1000000 + +# Use specific GPU +CUDA_VISIBLE_DEVICES=1 python customPyTorchKernel.py +``` + +## Expected Output + +The sample runs three tests: +1. Forward pass correctness (y = x²) +2. Backward pass correctness (gradient computation) +3. Multi-dimensional tensor support + +All tests should pass, confirming the custom operator works correctly with PyTorch's autograd system. + +## Key Concepts + +The sample demonstrates: +- Writing CUDA kernels with grid-stride loops +- Runtime kernel compilation with cuda.core +- PyTorch autograd integration via `torch.autograd.Function` +- Stream management using PyTorch's current stream +- Kernel caching for performance + +The code is self-documenting with inline comments explaining each step.
diff --git a/samples/customPyTorchKernel/customPyTorchKernel.py b/samples/customPyTorchKernel/customPyTorchKernel.py new file mode 100644 index 00000000000..b092bc94706 --- /dev/null +++ b/samples/customPyTorchKernel/customPyTorchKernel.py @@ -0,0 +1,389 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# distribution and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class PyTorchStreamWrapper:
    """
    Adapter that exposes a PyTorch CUDA stream through ``__cuda_stream__``.

    cuda.core accepts any object implementing ``__cuda_stream__``; this class
    forwards the raw handle stored on the wrapped stream's ``cuda_stream``
    attribute.

    Parameters
    ----------
    pt_stream : object
        Stream object exposing a ``cuda_stream`` attribute
        (e.g. the result of ``torch.cuda.current_stream()``).
    """

    def __init__(self, pt_stream):
        # Only a reference is kept; the wrapped stream is neither copied nor owned.
        self.pt_stream = pt_stream

    def __cuda_stream__(self):
        """
        Return the ``(0, handle)`` pair in the format required by CUDA Python.

        Returns
        -------
        tuple
            ``(0, pt_stream.cuda_stream)``
        """
        # NOTE(review): the leading 0 is understood to be the protocol version
        # expected by cuda.core -- confirm against the cuda.core interop docs.
        leading_field = 0
        raw_handle = self.pt_stream.cuda_stream
        return leading_field, raw_handle
class SquareOp(torch.autograd.Function):
    """
    Custom square operation using cuda.core.

    Forward: y = x² (computed with custom CUDA kernel)
    Backward: grad_x = 2 * x * grad_y (computed with PyTorch)
    """

    # The kernel uses signed 32-bit loop indices. With one thread per element,
    # ``index + stride`` stays below 2**31 only for inputs of up to 2**30
    # elements; beyond that the index would wrap negative and write out of bounds.
    _MAX_ELEMENTS = 2**30

    @staticmethod
    def forward(ctx, x):
        """
        Forward pass: compute y = x² using custom CUDA kernel.

        Parameters
        ----------
        ctx : Context
            PyTorch context for saving tensors
        x : torch.Tensor
            Input tensor (must be CUDA, float32; made contiguous internally)

        Returns
        -------
        torch.Tensor
            Output tensor with y = x²

        Raises
        ------
        RuntimeError
            If ``x`` is not a CUDA tensor, is not float32, or has more
            elements than the kernel's 32-bit indexing supports.
        """
        # Validate input requirements
        if not x.is_cuda:
            raise RuntimeError("SquareOp only supports CUDA tensors")
        if x.dtype != torch.float32:
            raise RuntimeError("SquareOp only supports float32 tensors")

        # Ensure contiguous memory layout for efficient kernel access
        x = x.contiguous()

        n = int(x.numel())
        if n > SquareOp._MAX_ELEMENTS:
            raise RuntimeError(f"SquareOp supports at most {SquareOp._MAX_ELEMENTS} elements, got {n}")

        # Allocate output tensor and save the input for the backward pass
        y = torch.empty_like(x)
        ctx.save_for_backward(x)

        # Nothing to compute for an empty tensor; a grid of 0 blocks is not a
        # valid launch configuration.
        if n == 0:
            return y

        # Run on the device that owns ``x`` (not whichever device happens to be
        # current). The torch context manager restores the caller's current
        # device on exit.
        with torch.cuda.device(x.device):
            device = Device(x.device.index)
            # cuda.core needs the device to be made current before streams can
            # be created on it.
            device.set_current()

            # Use PyTorch's current stream to ensure proper ordering with other
            # PyTorch ops, wrapped so cuda.core can consume it.
            torch_stream = torch.cuda.current_stream(device=x.device)
            stream = device.create_stream(PyTorchStreamWrapper(torch_stream))

            try:
                # Get compiled kernel (cached)
                kernel = get_square_kernel(device)

                # Configure kernel launch: one thread per element
                threads_per_block = 256
                blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
                config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block)

                # Launch the kernel
                launch(stream, config, kernel, x.data_ptr(), y.data_ptr(), n)
            finally:
                # Ensure the cuda.core stream wrapper is released even if the
                # compile or launch step raises.
                stream.close()

        return y

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass: compute gradient.

        For y = x², the derivative is dy/dx = 2x
        Therefore: grad_x = grad_output * 2x

        Parameters
        ----------
        ctx : Context
            PyTorch context with saved tensors
        grad_output : torch.Tensor
            Gradient from upstream

        Returns
        -------
        torch.Tensor
            Gradient with respect to input
        """
        # Retrieve saved input
        (x,) = ctx.saved_tensors

        # Note: grad_output is assumed to share x's dtype and device, as is
        # normally the case for gradients delivered by autograd.

        # Compute gradient: d(x²)/dx = 2x
        return 2.0 * x * grad_output
+ # Check correctness + max_error = torch.max(torch.abs(y_custom - y_reference)).item() + + print(f"Max absolute error: {max_error:.2e}") + + if torch.allclose(y_custom, y_reference, rtol=1e-5, atol=1e-6): + print("[PASS] Forward pass PASSED") + else: + print("[FAIL] Forward pass FAILED") + return 1 + + # ======================================================================== + # Test 2: Backward Pass (Gradient) Correctness + # ======================================================================== + print("\n" + "-" * 70) + print("Test 2: Backward Pass") + print("-" * 70) + + # Test with requires_grad + x_custom = torch.randn(args.size, dtype=torch.float32, device="cuda", requires_grad=True) + x_reference = x_custom.clone().detach().requires_grad_(True) + + # Forward pass + y_custom = square(x_custom) + y_reference = x_reference**2 + + # Create upstream gradient + grad_output = torch.randn_like(y_custom) + + # Backward pass + y_custom.backward(grad_output) + y_reference.backward(grad_output) + + # Check gradients + max_grad_error = torch.max(torch.abs(x_custom.grad - x_reference.grad)).item() + + print(f"Max gradient error: {max_grad_error:.2e}") + + if torch.allclose(x_custom.grad, x_reference.grad, rtol=1e-5, atol=1e-6): + print("[PASS] Backward pass PASSED") + else: + print("[FAIL] Backward pass FAILED") + return 1 + + # ======================================================================== + # Test 3: Multi-dimensional Tensors + # ======================================================================== + print("\n" + "-" * 70) + print("Test 3: Multi-dimensional Tensors") + print("-" * 70) + + # Test with 2D tensor + x_2d = torch.randn(100, 100, dtype=torch.float32, device="cuda") + y_2d_custom = square(x_2d) + y_2d_reference = x_2d**2 + + if torch.allclose(y_2d_custom, y_2d_reference, rtol=1e-5, atol=1e-6): + print("[PASS] 2D tensor test PASSED") + else: + print("[FAIL] 2D tensor test FAILED") + return 1 + + # Test with 3D tensor + x_3d = torch.randn(10, 
20, 30, dtype=torch.float32, device="cuda") + y_3d_custom = square(x_3d) + y_3d_reference = x_3d**2 + + if torch.allclose(y_3d_custom, y_3d_reference, rtol=1e-5, atol=1e-6): + print("[PASS] 3D tensor test PASSED") + else: + print("[FAIL] 3D tensor test FAILED") + return 1 + + # ======================================================================== + # Summary + # ======================================================================== + print("\n" + "=" * 70) + print("All tests PASSED!") + print("=" * 70) + print("\nYour custom GPU operator is working correctly!") + print("You can now use it in your PyTorch models like any built-in op.") + print("\nExample usage:") + print(" x = torch.randn(100, device='cuda')") + print(" y = square(x) # Uses your custom CUDA kernel") + print(" loss = y.sum()") + print(" loss.backward() # Gradients computed automatically") + print("=" * 70 + "\n") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/customPyTorchKernel/requirements.txt b/samples/customPyTorchKernel/requirements.txt new file mode 100644 index 00000000000..001018fab75 --- /dev/null +++ b/samples/customPyTorchKernel/requirements.txt @@ -0,0 +1,10 @@ +# Custom PyTorch Kernel Sample Requirements +# +# NOTE: On Windows, the default `torch` wheel from PyPI is CPU-only and the +# sample will fail with "Torch not compiled with CUDA enabled". Install a +# CUDA-enabled torch from PyTorch's wheel index first (see README.md): +# pip install torch --index-url https://download.pytorch.org/whl/cu128 + +cuda-python>=13.0.0 +cuda-core>=1.0.0 +torch>=2.0.0 diff --git a/samples/customTensorFlowKernel/README.md b/samples/customTensorFlowKernel/README.md new file mode 100644 index 00000000000..77192b487ef --- /dev/null +++ b/samples/customTensorFlowKernel/README.md @@ -0,0 +1,81 @@ +# Sample: TensorFlow Custom GPU Operator + +## Description + +Learn how to add a custom GPU operation to TensorFlow using `cuda.core` with `tf.py_function`. 
This sample implements a custom **ReLU operation** (y = max(0, x)) for rapid prototyping of GPU operations. + +## Key Question Answered + +**Q: How do I add a custom GPU op to TensorFlow?** + +**A:** Use `tf.py_function` to wrap cuda.core kernels: +1. Write CUDA kernels (forward + backward) with grid-stride loops +2. Compile them with cuda.core +3. Wrap in Python functions +4. Use `tf.py_function` to call from TensorFlow +5. Register gradients with `@tf.custom_gradient` + +## Requirements + +- NVIDIA GPU with Compute Capability 7.0+ +- CUDA Toolkit 13.0+ +- Python 3.11+ (required by numpy >= 2.3.2) +- TensorFlow 2.10+ +- cuda-python >= 13.0.0 +- cuda-core >= 1.0.0 (required for LEGACY_DEFAULT_STREAM) +- numpy >= 2.3.2 +- CuPy >= 14.0.0 (`cupy-cuda13x`; used for device arrays and pointer access) + +## Installation + +```bash +cd python/3_FrameworkInterop/customTensorFlowKernel +pip install -r requirements.txt +``` + +## How to Run + +```bash +python customTensorFlowKernel.py +python customTensorFlowKernel.py --size 1000000 +``` + +## Usage Example + +```python +import tensorflow as tf +from customTensorFlowKernel import custom_relu + +# Simple usage +x = tf.random.normal([100], dtype=tf.float32) +y = custom_relu(x) + +# In a Keras model +model = tf.keras.Sequential([ + tf.keras.layers.Dense(128), + tf.keras.layers.Lambda(custom_relu), + tf.keras.layers.Dense(10) +]) +``` + +## Key Concepts + +- **tf.py_function**: Bridges TensorFlow and Python code using cuda.core (has overhead, not XLA-compatible) +- **@tf.custom_gradient**: Registers custom backward pass +- **cuda.core**: Primary GPU manager (device, stream, kernel compilation) +- **CuPy**: Internal helper for device array allocation, host/device copies, and device pointer access + +## Production Alternatives + +This sample is for rapid prototyping.
For production: +- **TensorFlow C++ Custom Op**: Full performance, XLA compatible +- **XLA Custom Calls**: For XLA-compiled models +- See TensorFlow documentation for details + +## See Also + +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [TensorFlow tf.py_function](https://www.tensorflow.org/api_docs/python/tf/py_function) +- [TensorFlow @custom_gradient](https://www.tensorflow.org/api_docs/python/tf/custom_gradient) +- [TensorFlow C++ Custom Op Guide](https://www.tensorflow.org/guide/create_op) +- [CuPy Documentation](https://docs.cupy.dev/) diff --git a/samples/customTensorFlowKernel/customTensorFlowKernel.py b/samples/customTensorFlowKernel/customTensorFlowKernel.py new file mode 100644 index 00000000000..8b4c1c05c3e --- /dev/null +++ b/samples/customTensorFlowKernel/customTensorFlowKernel.py @@ -0,0 +1,429 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# distribution and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["numpy>=2.3.2", "tensorflow>=2.10.0", "cupy-cuda13x>=14.0.0", "cuda-python>=13.0.0", "cuda-core>=1.0.0"] +# /// + +""" +TensorFlow Custom GPU Operator using cuda.core + +Question: How do I add a custom GPU op to TensorFlow? +Answer: This sample shows rapid prototyping with cuda.core + tf.py_function. + +This sample implements a custom ReLU operation (y = max(0, x)) to demonstrate: +- Writing CUDA kernels (forward + backward) with grid-stride loops +- Compiling with cuda.core +- Integrating with TensorFlow via tf.py_function +- Proper gradient registration + +Dependencies: +- tensorflow: Deep learning framework +- cuda-core: GPU kernel compilation and launch + (requires >=1.0.0 for LEGACY_DEFAULT_STREAM) +- cuda-python: CUDA driver API bindings +- cupy: Array operations and device pointer access + +Note: This approach uses tf.py_function for rapid prototyping. For production +TensorFlow applications, use TensorFlow's C++ Custom Op API.
def _get_relu_kernels(device):
    """
    Return the compiled ReLU kernels for ``device``, compiling on first use.

    The pair is memoized in the module-level ``_kernel_cache`` keyed by the
    device's ``pci_bus_id``, so each GPU pays the compilation cost only once.

    Parameters
    ----------
    device : Device
        CUDA device object

    Returns
    -------
    tuple
        (forward_kernel, backward_kernel) compiled CUDA kernels
    """
    bus_id = device.pci_bus_id

    # Fast path: this GPU has already been compiled for.
    try:
        return _kernel_cache[bus_id]
    except KeyError:
        pass

    # Slow path: build a cubin targeting this device's architecture.
    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    module = Program(RELU_KERNEL, code_type="c++", options=options).compile("cubin")

    # Order matters: callers unpack the result as (forward, backward).
    kernels = tuple(module.get_kernel(entry_point) for entry_point in ("relu_forward_kernel", "relu_backward_kernel"))
    _kernel_cache[bus_id] = kernels
    return kernels
def _launch_relu_backward(x_np, grad_y_np):
    """
    Internal function: Launch backward CUDA kernel.

    Copies both inputs to the GPU with CuPy, runs ``relu_backward_kernel`` on
    the legacy default stream, and copies the result back to host memory.

    Parameters
    ----------
    x_np : array-like
        Forward-pass input; anything accepted by ``cp.asarray``.
    grad_y_np : array-like
        Upstream gradient with the same number of elements as ``x_np``.

    Returns
    -------
    numpy.ndarray
        Gradient with respect to the input, same shape as ``x_np``.

    Raises
    ------
    ValueError
        If ``grad_y_np`` does not have the same number of elements as ``x_np``.

    Notes
    -----
    The kernel reads and writes ``float`` values, so inputs are assumed to be
    float32 (``custom_relu`` validates this for ``x``).
    """
    device = Device()

    # Ensure this device is current (TensorFlow usually does this already)
    device.set_current()

    # Get compiled kernel (cached)
    _, backward_kernel = _get_relu_kernels(device)

    # Convert to CuPy (host-to-device copy); CuPy also supplies the raw
    # device pointers handed to the kernel.
    x_cp = cp.asarray(x_np)
    grad_y_cp = cp.asarray(grad_y_np)

    # The kernel indexes both inputs with the same index range; a size
    # mismatch would make it read out of bounds.
    if grad_y_cp.size != x_cp.size:
        raise ValueError(
            f"grad_y has {int(grad_y_cp.size)} elements but x has {int(x_cp.size)}; sizes must match"
        )

    grad_x_cp = cp.empty_like(x_cp)

    n = int(x_cp.size)
    # Skip the launch for empty input: a grid of 0 blocks is not a valid
    # launch configuration, and there is nothing to compute.
    if n > 0:
        threads_per_block = 256
        blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
        config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block)

        # Launch on the legacy default stream (stream 0) for TensorFlow interop
        launch(
            LEGACY_DEFAULT_STREAM,
            config,
            backward_kernel,
            x_cp.data.ptr,
            grad_y_cp.data.ptr,
            grad_x_cp.data.ptr,
            n,
        )

    # Return as numpy array (GPU-to-CPU copy via cp.asnumpy)
    return cp.asnumpy(grad_x_cp)
def _report_check(label, passed):
    """Print the ``[PASS]``/``[FAIL]`` line for *label* and return *passed* as a bool."""
    ok = bool(passed)
    print(f"[PASS] {label} PASSED" if ok else f"[FAIL] {label} FAILED")
    return ok


def main():
    """
    Test the custom ReLU operation.

    Runs three checks against ``tf.nn.relu`` (forward pass, gradient, and
    2D/3D inputs) and stops at the first failure.

    Returns
    -------
    int
        0 when every check passes, 1 as soon as one fails.
    """
    import argparse

    # Absolute tolerance shared by every element-wise comparison below.
    tolerance = 1e-5

    def within_tolerance(actual, expected):
        return tf.reduce_all(tf.abs(actual - expected) < tolerance)

    parser = argparse.ArgumentParser(description="Custom TensorFlow ReLU Operator using cuda.core")
    parser.add_argument("--size", type=int, default=10000, help="Number of elements (default: 10000)")

    args = parser.parse_args()
    # An empty or negative size cannot be turned into a valid kernel launch.
    if args.size < 1:
        parser.error("--size must be a positive integer")

    # Device info
    device = Device()
    device.set_current()
    major, minor = device.compute_capability

    print("\nDevice Information:")
    print(f" Name: {device.name}")
    # Printed as sm_<major><minor>, matching the arch string used for compilation.
    print(f" Compute Capability: sm_{major}{minor}")

    print("\n" + "=" * 70)
    print("Custom TensorFlow ReLU Operator Test")
    print("=" * 70)

    # ========================================================================
    # Test 1: Forward Pass Correctness
    # ========================================================================
    print("\n" + "-" * 70)
    print("Test 1: Forward Pass")
    print("-" * 70)

    # Run on the first visible GPU (respects CUDA_VISIBLE_DEVICES),
    # aligning with cuda.core Device().
    with tf.device("/GPU:0"):
        x = tf.random.normal([args.size], dtype=tf.float32)
        y_custom = custom_relu(x)
        y_reference = tf.nn.relu(x)

    max_error = tf.reduce_max(tf.abs(y_custom - y_reference)).numpy()
    print(f"Max absolute error: {max_error:.2e}")

    if not _report_check("Forward pass", within_tolerance(y_custom, y_reference)):
        return 1

    # ========================================================================
    # Test 2: Backward Pass (Gradient) Correctness
    # ========================================================================
    print("\n" + "-" * 70)
    print("Test 2: Backward Pass")
    print("-" * 70)

    with tf.device("/GPU:0"):
        x_custom = tf.random.normal([args.size], dtype=tf.float32)
        x_reference = tf.identity(x_custom)

        # Plain tensors are not tracked automatically, so each tape watches
        # its input explicitly.
        with tf.GradientTape() as tape_custom:
            tape_custom.watch(x_custom)
            y_custom = custom_relu(x_custom)
        grad_custom = tape_custom.gradient(y_custom, x_custom)

        with tf.GradientTape() as tape_reference:
            tape_reference.watch(x_reference)
            y_reference = tf.nn.relu(x_reference)
        grad_reference = tape_reference.gradient(y_reference, x_reference)

    max_grad_error = tf.reduce_max(tf.abs(grad_custom - grad_reference)).numpy()
    print(f"Max gradient error: {max_grad_error:.2e}")

    if not _report_check("Backward pass", within_tolerance(grad_custom, grad_reference)):
        return 1

    # ========================================================================
    # Test 3: Multi-dimensional Tensors
    # ========================================================================
    print("\n" + "-" * 70)
    print("Test 3: Multi-dimensional Tensors")
    print("-" * 70)

    with tf.device("/GPU:0"):
        for label, shape in (("2D tensor test", [100, 100]), ("3D tensor test", [10, 20, 30])):
            x_nd = tf.random.normal(shape, dtype=tf.float32)
            y_nd_custom = custom_relu(x_nd)
            y_nd_reference = tf.nn.relu(x_nd)

            if not _report_check(label, within_tolerance(y_nd_custom, y_nd_reference)):
                return 1

    # ========================================================================
    # Summary
    # ========================================================================
    print("\n" + "=" * 70)
    print("All tests PASSED!")
    print("=" * 70)
    print("\nYour custom GPU operator is working correctly!")
    print("You can now use it in your TensorFlow models.")
    print("\nExample usage:")
    print(" x = tf.random.normal([100], dtype=tf.float32)")
    print(" y = custom_relu(x) # Uses your custom CUDA kernel")
    print(" ")
    print(" # In a model:")
    print(" model = tf.keras.Sequential([")
    print(" tf.keras.layers.Dense(128),")
    print(" tf.keras.layers.Lambda(custom_relu),")
    print(" tf.keras.layers.Dense(10)")
    print(" ])")
    print("=" * 70 + "\n")

    return 0
is required for LEGACY_DEFAULT_STREAM constant + +numpy>=2.3.2 +tensorflow>=2.10.0 +cupy-cuda13x>=14.0.0 +cuda-python>=13.0.0 +cuda-core>=1.0.0 diff --git a/samples/deviceQuery/README.md b/samples/deviceQuery/README.md new file mode 100644 index 00000000000..17e639c31ff --- /dev/null +++ b/samples/deviceQuery/README.md @@ -0,0 +1,189 @@ +# Sample: Device Query (Python) + +## Description + +Query and display detailed properties of all CUDA-capable devices in your system using the modern `cuda.core` API. + +## What You'll Learn + +- How to enumerate CUDA devices in the system +- Using the `cuda.core` API for device management +- Querying comprehensive device properties (compute capability, memory, limits) +- Accessing low-level device attributes via `cuda.bindings` +- Checking peer-to-peer (P2P) access capabilities between GPUs + +## Key Libraries + +- `cuda.core` - Modern CUDA Python API +- `cuda.bindings` - Low-level CUDA bindings for runtime and driver APIs + +## Key APIs + +### From `cuda.core`: + +- `Device.get_all_devices()` - Get tuple of all available Device instances +- `Device(device_id)` - Get Device object for specific device ID +- `system.get_user_mode_driver_version()` - Query the CUDA user-mode driver version as a `(major, minor)` pair +- `Device.set_current()` - Set the current device for API calls +- `Device.properties` - Access comprehensive device properties +- `Device.name` - Get device name string +- `Device.can_access_peer()` - Check P2P access to peer device + +### From `cuda.bindings.runtime`: + +- `cudart.cudaRuntimeGetVersion()` - Get CUDA runtime version +- `cudart.cudaDeviceGetAttribute()` - Query specific device attributes + +### From `cuda.bindings.driver`: + +- `cuda.cuMemGetInfo()` - Get memory information for current device + +## Device Properties Queried + +### Compute Capabilities: +- Compute capability version (major.minor) +- Driver and runtime versions +- Number of multiprocessors and CUDA cores + +### Memory Information: +- Total global memory +- Memory clock rate and bus width 
+- L2 cache size +- Constant and shared memory sizes +- Maximum memory pitch + +### Execution Configuration Limits: +- Maximum threads per block and per multiprocessor +- Maximum block dimensions (x, y, z) +- Maximum grid dimensions (x, y, z) +- Warp size +- Registers per block + +### Texture Capabilities: +- Maximum texture dimensions (1D, 2D, 3D) +- Maximum layered texture sizes + +### Feature Support: +- Unified Addressing (UVA) +- Managed Memory +- Compute Preemption +- Cooperative Kernel Launch +- ECC support +- Host page-locked memory mapping +- Concurrent copy and kernel execution + +### System Information: +- PCI bus information +- Compute mode +- Driver mode (Windows only) +- P2P access matrix (multi-GPU systems) + +## Requirements + +### Hardware: + +- NVIDIA GPU with CUDA support (any compute capability) +- No specific GPU memory requirement (query only) + +### Software: + +- CUDA Toolkit 13.0 or newer (recommended; matches `cuda-python` 13.x) +- Python 3.10 or newer +- `cuda-python` package (>=13.0.0) +- `cuda-core` package (>=1.0.0) + +## Installation + +Install the required packages from requirements.txt: + +```bash +cd cuda-samples/python/1_GettingStarted/deviceQuery +pip install -r requirements.txt +``` + +The requirements.txt installs: +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) + +## How to Run + +### Basic usage: + +```bash +cd cuda-samples/python/1_GettingStarted/deviceQuery +python deviceQuery.py +``` + +### Skip P2P information: + +```bash +python deviceQuery.py --no-p2p +``` + +## Expected Output + +``` +[CUDA Device Query using CUDA Core API] +Detected 1 CUDA Capable device(s) + +Device 0: + CUDA Driver Version / Runtime Version 12.4 / 12.6 + CUDA Capability Major/Minor version number: 8.9 + Total amount of global memory: 24217 MBytes (25393954816 bytes) + (132) Multiprocessors, (128) CUDA Cores/MP: 16896 CUDA Cores + GPU Max Clock rate: 1980 MHz (1.98 GHz) + Memory Clock rate: 10501 Mhz + Memory Bus Width: 384-bit + L2 Cache Size: 
67108864 bytes + Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384) + Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers + Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers + Total amount of constant memory: 65536 bytes + Total amount of shared memory per block: 49152 bytes + Total shared memory per multiprocessor: 102400 bytes + Total number of registers available per block: 65536 + Warp size: 32 + Maximum number of threads per multiprocessor: 1536 + Maximum number of threads per block: 1024 + Max dimension size of a thread block (x,y,z): (1024, 1024, 64) + Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) + Maximum memory pitch: 2147483647 bytes + Texture alignment: 512 bytes + Concurrent copy and kernel execution: Yes with 2 copy engine(s) + Run time limit on kernels: Yes + Integrated GPU sharing Host Memory: No + Support host page-locked memory mapping: Yes + Device has ECC support: Enabled + Device supports Unified Addressing (UVA): Yes + Device supports Managed Memory: Yes + Device supports Compute Preemption: Yes + Supports Cooperative Kernel Launch: Yes + Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0 + Compute Mode: + < Default (multiple host threads can use cudaSetDevice() with device simultaneously) > + +Done +``` + +**Note:** Output will vary based on your specific GPU model and system configuration. + +For multi-GPU systems, the output will include information for all detected devices and a P2P access matrix showing which GPUs can directly access each other's memory. 
+ +## Files + +- `deviceQuery.py` - Python implementation using cuda.core API +- `requirements.txt` - Sample dependencies + +## Use Cases + +- **System Diagnostics** - Verify CUDA installation and GPU detection +- **Hardware Profiling** - Understand GPU capabilities before optimization +- **Multi-GPU Systems** - Identify P2P topology for optimal data placement +- **Kernel Development** - Determine execution configuration limits +- **Compatibility Checks** - Verify compute capability requirements + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [cuda.core API Guide](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [CUDA Programming Guide - Device Information](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-enumeration) diff --git a/samples/deviceQuery/deviceQuery.py b/samples/deviceQuery/deviceQuery.py new file mode 100755 index 00000000000..0618dab8310 --- /dev/null +++ b/samples/deviceQuery/deviceQuery.py @@ -0,0 +1,360 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0"] +# /// + +""" +Device Query using CUDA Core API + +This sample enumerates the properties of the CUDA devices present in the system. +""" + +import platform +import sys + +# cuda.bindings used for properties not yet exposed in cuda.core (see comments below) +try: + from cuda.bindings import driver as cuda + from cuda.bindings import runtime as cudart + from cuda.core import Device, system +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +def print_property(label, value, indent=2): + """ + Helper function to print device properties with aligned formatting. 
def print_property(label, value, indent=2):
    """
    Print one device property as an aligned ``label value`` row.

    Parameters
    ----------
    label : str
        Text for the left column; left-justified and padded to 47 characters
    value : any
        Property value, rendered with its default string formatting
    indent : int
        Number of leading spaces before the label (default: 2)
    """
    label_column = f"{label:<47}"
    margin = indent * " "
    print(f"{margin}{label_column}{value}")


def fmt_bytes(size_in_bytes):
    """Render a byte count as whole MBytes followed by the raw byte count."""
    bytes_per_mbyte = 1024 * 1024
    mbytes = size_in_bytes / bytes_per_mbyte
    return f"{mbytes:.0f} MBytes ({size_in_bytes} bytes)"


def fmt_hz(rate_in_khz):
    """Render a clock rate given in kHz as ``<MHz> MHz (<GHz> GHz)``."""
    mhz = rate_in_khz * 1e-3
    ghz = rate_in_khz * 1e-6
    return f"{mhz:.0f} MHz ({ghz:.2f} GHz)"


def fmt_yes_no(val):
    """Map a truthy value to ``"Yes"`` and a falsy value to ``"No"``."""
    if val:
        return "Yes"
    return "No"


def convert_sm_ver_to_cores(major, minor):
    """
    Look up the number of CUDA cores per SM for a compute capability.

    Table taken from:
    https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h

    Parameters
    ----------
    major : int
        Major compute capability version
    minor : int
        Minor compute capability version

    Returns
    -------
    int
        CUDA cores per SM, or 0 when the (major, minor) pair is not in the table
    """
    # Architectures grouped by their cores-per-SM value.
    cores_to_archs = (
        (192, ((3, 0), (3, 2), (3, 5), (3, 7))),
        (
            128,
            (
                (5, 0), (5, 2), (5, 3),
                (6, 1), (6, 2),
                (8, 6), (8, 7), (8, 9),
                (9, 0),
                (10, 0), (10, 1), (10, 3),
                (11, 0),
                (12, 0), (12, 1),
            ),
        ),
        (64, ((6, 0), (7, 0), (7, 2), (7, 5), (8, 0))),
    )
    lookup = {arch: cores for cores, archs in cores_to_archs for arch in archs}
    try:
        return lookup[(major, minor)]
    except KeyError:
        return 0
def print_device_info(dev_id, device):
    """
    Print the full property report for one CUDA device.

    Makes ``device`` current, then prints one aligned row per property. Most
    values are read from ``device.properties`` (cuda.core); the runtime version
    and the total global memory are queried through cuda.bindings.

    Parameters
    ----------
    dev_id : int
        Index shown in the ``Device N:`` heading
    device : Device
        cuda.core device to describe

    Raises
    ------
    RuntimeError
        If the runtime-version or memory-info query reports an error
    """
    device.set_current()
    p = device.properties
    show = print_property

    print()
    print(f"Device {dev_id}: {device.name}")

    # Driver version comes from cuda.core; the runtime version needs cuda.bindings.
    driver_major, driver_minor = system.get_user_mode_driver_version()
    status, runtime_version = cudart.cudaRuntimeGetVersion()
    if status != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"Failed to get CUDA runtime version: {status}")
    # Unpack the integer version: thousands digit(s) -> major, tens digit -> minor.
    runtime_major, remainder = divmod(runtime_version, 1000)
    runtime_minor = remainder // 10

    versions = f"{driver_major}.{driver_minor} / {runtime_major}.{runtime_minor}"
    show("CUDA Driver Version / Runtime Version", versions)
    capability = f"{p.compute_capability_major}.{p.compute_capability_minor}"
    show("CUDA Capability Major/Minor version number:", capability)

    # Total global memory is not part of device.properties; ask the driver API.
    status, _free_mem, total_mem_bytes = cuda.cuMemGetInfo()
    if status != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"Failed to get memory info: {status}")
    show("Total amount of global memory:", fmt_bytes(total_mem_bytes))

    sm_cores = convert_sm_ver_to_cores(p.compute_capability_major, p.compute_capability_minor)
    total_cores = sm_cores * p.multiprocessor_count
    cores_label = f"({p.multiprocessor_count:3d}) Multiprocessors, ({sm_cores:3d}) CUDA Cores/MP:"
    show(cores_label, f"{total_cores} CUDA Cores")

    show("GPU Max Clock rate:", fmt_hz(p.clock_rate))
    show("Memory Clock rate:", f"{p.memory_clock_rate * 1e-3:.0f} Mhz")
    show("Memory Bus Width:", f"{p.global_memory_bus_width}-bit")
    if p.l2_cache_size > 0:
        show("L2 Cache Size:", f"{p.l2_cache_size} bytes")

    texture_dims = (
        f"1D=({p.maximum_texture1d_width}), "
        f"2D=({p.maximum_texture2d_width}, {p.maximum_texture2d_height}), "
        f"3D=({p.maximum_texture3d_width}, {p.maximum_texture3d_height}, "
        f"{p.maximum_texture3d_depth})"
    )
    show("Maximum Texture Dimension Size (x,y,z)", texture_dims)
    layered_1d = f"1D=({p.maximum_texture1d_layered_width}), {p.maximum_texture1d_layered_layers} layers"
    show("Maximum Layered 1D Texture Size, (num) layers", layered_1d)
    layered_2d = (
        f"2D=({p.maximum_texture2d_layered_width}, "
        f"{p.maximum_texture2d_layered_height}), "
        f"{p.maximum_texture2d_layered_layers} layers"
    )
    show("Maximum Layered 2D Texture Size, (num) layers", layered_2d)

    show("Total amount of constant memory:", f"{p.total_constant_memory} bytes")
    show("Total amount of shared memory per block:", f"{p.max_shared_memory_per_block} bytes")
    show("Total shared memory per multiprocessor:", f"{p.max_shared_memory_per_multiprocessor} bytes")
    show("Total number of registers available per block:", p.max_registers_per_block)

    show("Warp size:", p.warp_size)
    show("Maximum number of threads per multiprocessor:", p.max_threads_per_multiprocessor)
    show("Maximum number of threads per block:", p.max_threads_per_block)
    block_dims = f"({p.max_block_dim_x}, {p.max_block_dim_y}, {p.max_block_dim_z})"
    show("Max dimension size of a thread block (x,y,z):", block_dims)
    grid_dims = f"({p.max_grid_dim_x}, {p.max_grid_dim_y}, {p.max_grid_dim_z})"
    show("Max dimension size of a grid size (x,y,z):", grid_dims)
    show("Maximum memory pitch:", f"{p.max_pitch} bytes")
    show("Texture alignment:", f"{p.texture_alignment} bytes")

    overlap = f"{fmt_yes_no(p.gpu_overlap)} with {p.async_engine_count} copy engine(s)"
    show("Concurrent copy and kernel execution:", overlap)
    show("Run time limit on kernels:", fmt_yes_no(p.kernel_exec_timeout))

    show("Integrated GPU sharing Host Memory:", fmt_yes_no(p.integrated))
    show("Support host page-locked memory mapping:", fmt_yes_no(p.can_map_host_memory))
    if p.ecc_enabled:
        ecc_state = "Enabled"
    else:
        ecc_state = "Disabled"
    show("Device has ECC support:", ecc_state)
    # The driver-mode row is only printed on Windows.
    if platform.system() == "Windows":
        if p.tcc_driver:
            driver_mode = "TCC (Tesla Compute Cluster Driver)"
        else:
            driver_mode = "WDDM (Windows Display Driver Model)"
        show("CUDA Device Driver Mode (TCC or WDDM):", driver_mode)

    show("Device supports Unified Addressing (UVA):", fmt_yes_no(p.unified_addressing))
    show("Device supports Managed Memory:", fmt_yes_no(p.managed_memory))
    show("Device supports Compute Preemption:", fmt_yes_no(p.compute_preemption_supported))
    show("Supports Cooperative Kernel Launch:", fmt_yes_no(p.cooperative_launch))

    pci_location = f"{p.pci_domain_id} / {p.pci_bus_id} / {p.pci_device_id}"
    show("Device PCI Domain ID / Bus ID / location ID:", pci_location)

    # Descriptions keyed by the integer compute-mode value.
    mode_descriptions = {
        0: "Default (multiple host threads can use cudaSetDevice() with device simultaneously)",
        1: "Exclusive (only one host thread in one process is able to use cudaSetDevice() with this device)",
        2: "Prohibited (no host thread can use cudaSetDevice() with this device)",
        3: "Exclusive Process (many threads in one process is able to use cudaSetDevice() with this device)",
    }
    show("Compute Mode:", "")
    mode_text = mode_descriptions.get(p.compute_mode, "Unknown")
    print(f" < {mode_text} >")
def print_p2p_access_info(devices):
    """
    Print, for every ordered pair of distinct devices, whether peer access is possible.

    A failure while checking or printing one pair is reported as a warning
    line and does not stop the remaining pairs from being checked.

    Parameters
    ----------
    devices : tuple of Device
        Tuple of CUDA device objects
    """
    print()
    print("Peer-to-Peer (P2P) access support:")
    for src_idx, src in enumerate(devices):
        for dst_idx, dst in enumerate(devices):
            if src_idx != dst_idx:
                try:
                    answer = fmt_yes_no(src.can_access_peer(dst))
                    print(f"> Peer access from {src.name} (GPU{src_idx}) -> {dst.name} (GPU{dst_idx}) : {answer}")
                except Exception as exc:
                    print(f"Warning: Could not check peer access between device {src_idx} and {dst_idx}: {exc}")


def query_devices(show_p2p=True):
    """
    Enumerate all CUDA devices and print a report for each one.

    Parameters
    ----------
    show_p2p : bool
        Whether to append the peer-to-peer access section when at least two
        devices are present (default: True)

    Returns
    -------
    bool
        False if enumerating devices or describing any device raised;
        True otherwise, including when no device is present
    """
    import traceback

    try:
        print("[CUDA Device Query using CUDA Core API]")
        devices = Device.get_all_devices()
    except Exception as exc:
        print(f"Error: Failed to get devices: {exc}")
        traceback.print_exc()
        return False

    device_count = len(devices)
    if device_count == 0:
        print("There are no available device(s) that support CUDA")
        return True

    print(f"Detected {device_count} CUDA Capable device(s)")

    for index, dev in enumerate(devices):
        try:
            print_device_info(index, dev)
        except Exception as exc:
            # Stop at the first device that cannot be described.
            print(f"Error: Failed to get information for device {index}: {exc}")
            traceback.print_exc()
            return False

    if show_p2p and device_count >= 2:
        print_p2p_access_info(devices)

    return True
def main():
    """
    Parse the command line and run the device query.

    Returns
    -------
    int
        0 when the query succeeded, 1 otherwise
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Query CUDA Device Properties using CUDA Core API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument("--no-p2p", action="store_true", help="Skip peer-to-peer access information")
    options = cli.parse_args()

    if not query_devices(show_p2p=not options.no_p2p):
        return 1

    print("\nDone")
    return 0
+ +## What You'll Learn + +- Using CuPy's `cp.fft.rfft()` for real-to-complex FFT on GPU +- Computing magnitude spectrum from FFT results +- Peak detection to identify dominant frequencies +- Comparing GPU (cuFFT) vs CPU (NumPy) FFT performance +- Uses cuda.core APIs for device management and CUDA event timing + +## Key Concepts + +- **FFT (Fast Fourier Transform)**: Efficiently computes the Discrete Fourier Transform +- **Magnitude Spectrum**: `|FFT(signal)| * 2 / N` gives amplitude at each frequency +- **rfft**: Real FFT - optimized for real-valued input signals +- **Peak Detection**: Finding local maxima to identify dominant frequencies + +### Stream Interop + +This sample demonstrates CuPy integration with cuda.core streams: + +```python +# Create stream with cuda.core +stream = device.create_stream() + +# Use with CuPy operations +cp.cuda.Stream.from_external(stream).use() +``` + +## Key APIs + +### From `cuda.core`: + +- `Device` - Device management and context +- `EventOptions` - Configure events for GPU timing +- `stream.record()` - Record events for timing + +### From CuPy: + +- `cp.fft.rfft()` - Real-to-complex FFT (GPU-accelerated via cuFFT) +- `cp.fft.rfftfreq()` - Generate frequency bins for rfft +- `cp.cuda.Stream.from_external()` - Interop with cuda.core streams + +### From NumPy: + +- `np.fft.rfft()` - CPU FFT for comparison + +## Requirements + +### Hardware: + +- NVIDIA GPU with CUDA support + +### Software: + +- CUDA Toolkit 13.0 or newer +- Python 3.10 or newer +- See `requirements.txt` for Python packages + +## Installation + +```bash +pip install -r requirements.txt +``` + +## How to Run + +```bash +python fftSignalAnalysis.py +``` + +## Expected Output + +``` +============================================================ +FFT Signal Analysis +============================================================ + +Device: +Compute Capability: sm_XX + +Signal Parameters: + Samples: 1,048,576 + Sample Rate: 44,100 Hz + ... 
+ +------------------------------------------------------------ +GPU FFT (cuFFT) +------------------------------------------------------------ +Time: X.XXX ms + +Detected Frequencies: + 440.0 Hz (magnitude: X.XXXX) + ... + +------------------------------------------------------------ +CPU FFT (NumPy) +------------------------------------------------------------ +Time: XX.XXX ms + +------------------------------------------------------------ +PERFORMANCE SUMMARY +------------------------------------------------------------ +GPU (cuFFT): X.XXX ms +CPU (NumPy): XX.XXX ms +Speedup: XXx + +------------------------------------------------------------ +VERIFICATION +------------------------------------------------------------ +GPU vs CPU FFT magnitude: Test PASSED + +Frequency Detection Accuracy: + 440 Hz: [OK] + ... + +Done +``` + +**Note:** Times and speedup vary by hardware. + +## Files + +- `fftSignalAnalysis.py` - Main sample using cuda.core and CuPy +- `README.md` - This file +- `requirements.txt` - Dependencies + +## See Also + +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [CuPy FFT Documentation](https://docs.cupy.dev/en/stable/reference/fft.html) +- [cuFFT Documentation](https://docs.nvidia.com/cuda/cufft/) diff --git a/samples/fftSignalAnalysis/fftSignalAnalysis.py b/samples/fftSignalAnalysis/fftSignalAnalysis.py new file mode 100644 index 00000000000..f62db19c4bd --- /dev/null +++ b/samples/fftSignalAnalysis/fftSignalAnalysis.py @@ -0,0 +1,319 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0", "numpy>=2.3.2"] +# /// + +""" +FFT Signal Analysis + +Demonstrates how to analyze signal frequencies using Fast Fourier Transform (FFT): +- Generate composite signals with multiple frequency components +- Use CuPy's cuFFT for GPU-accelerated frequency analysis +- Detect dominant frequencies (peak detection) +- Compare GPU vs CPU FFT performance + +Uses cuda.core APIs for device management and timing. 
def generate_composite_signal(
    num_samples: int,
    sample_rate: float,
    frequencies: list[float],
    amplitudes: list[float],
) -> np.ndarray:
    """
    Generate a composite signal with multiple frequency components.

    The signal is the sum of ``amp * sin(2 * pi * freq * t)`` over all
    (frequency, amplitude) pairs, sampled at ``t = k / sample_rate`` for
    ``k = 0 .. num_samples - 1``.

    Parameters
    ----------
    num_samples : int
        Number of samples in the signal
    sample_rate : float
        Sampling rate in Hz
    frequencies : list[float]
        List of frequency components in Hz
    amplitudes : list[float]
        List of amplitudes for each frequency component

    Returns
    -------
    np.ndarray
        float32 signal array of length ``num_samples``

    Raises
    ------
    ValueError
        If ``frequencies`` and ``amplitudes`` differ in length
    """
    # zip() would silently drop the unmatched tail, so reject mismatches up front.
    if len(frequencies) != len(amplitudes):
        raise ValueError(
            "frequencies and amplitudes must have the same length "
            f"(got {len(frequencies)} and {len(amplitudes)})"
        )

    # Build the time axis and accumulate in float64. For long signals the
    # phase 2*pi*f*t grows to ~1e5-1e6 rad, where float32 spacing is a few
    # hundredths of a radian. Only the final result is narrowed to float32.
    t = np.arange(num_samples, dtype=np.float64) / sample_rate
    signal = np.zeros(num_samples, dtype=np.float64)

    for freq, amp in zip(frequencies, amplitudes):
        signal += amp * np.sin(2 * np.pi * freq * t)

    return signal.astype(np.float32)
def find_dominant_frequencies(
    fft_magnitude: cp.ndarray,
    frequencies: cp.ndarray,
    num_peaks: int = 5,
    threshold_ratio: float = 0.1,
) -> list[tuple[float, float]]:
    """
    Find dominant frequencies from FFT magnitude spectrum.

    A peak is an interior bin whose magnitude is strictly greater than both
    neighbours and strictly greater than ``threshold_ratio`` times the largest
    magnitude. The first and last bins are never reported. Detection runs on
    the host: both arrays are copied with ``cp.asnumpy`` and compared with
    vectorised NumPy operations.

    Parameters
    ----------
    fft_magnitude : cp.ndarray
        Magnitude of FFT (positive frequencies only), 1-D
    frequencies : cp.ndarray
        Frequency bins, same length as ``fft_magnitude``
    num_peaks : int
        Maximum number of peaks to return
    threshold_ratio : float
        Minimum peak height as ratio of max peak

    Returns
    -------
    list[tuple[float, float]]
        (frequency, magnitude) pairs as Python floats, strongest first;
        peaks of equal magnitude keep ascending bin order
    """
    max_magnitude = float(cp.max(fft_magnitude))
    threshold = max_magnitude * threshold_ratio

    magnitude_cpu = cp.asnumpy(fft_magnitude)
    freq_cpu = cp.asnumpy(frequencies)

    # Compare every interior bin with its left and right neighbour in one
    # vectorised pass instead of a Python-level loop over the spectrum.
    interior = magnitude_cpu[1:-1]
    is_peak = (interior > threshold) & (interior > magnitude_cpu[:-2]) & (interior > magnitude_cpu[2:])
    peak_indices = np.flatnonzero(is_peak) + 1  # +1: the interior slice starts at bin 1

    # Convert to plain floats so the result matches the annotated return type.
    peaks = [(float(freq_cpu[i]), float(magnitude_cpu[i])) for i in peak_indices]

    # Stable sort: equal magnitudes stay in ascending bin order.
    peaks.sort(key=lambda peak: peak[1], reverse=True)
    return peaks[:num_peaks]
def run_fft_analysis(
    num_samples: int = 2**20,
    sample_rate: float = 44100.0,
    device_id: int = 0,
    num_iterations: int = 10,
) -> bool:
    """
    Benchmark and verify a real FFT of a synthetic signal on GPU and CPU.

    Generates a five-tone test signal, times ``cp.fft.rfft`` with CUDA events
    and ``np.fft.rfft`` with ``time.perf_counter``, prints both timings and the
    speedup, then checks that the two magnitude spectra agree and that every
    input tone was detected as a peak.

    device_id and num_iterations are not exposed via CLI; modify defaults
    or call this function directly for customization.

    Parameters
    ----------
    num_samples : int
        Number of samples (power of 2 recommended for FFT)
    sample_rate : float
        Sampling rate in Hz
    device_id : int
        CUDA device ID
    num_iterations : int
        Number of timed iterations averaged for each backend

    Returns
    -------
    bool
        True if the spectra matched and all input frequencies were detected
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    def _section(title: str) -> None:
        # Section heading: blank line, dashed rule, title, dashed rule.
        print("\n" + light_rule)
        print(title)
        print(light_rule)

    print(heavy_rule)
    print("FFT Signal Analysis")
    print(heavy_rule)

    # Select the device and create the stream all GPU work will run on.
    gpu = Device(device_id)
    gpu.set_current()
    gpu_stream = gpu.create_stream()

    try:
        print(f"\nDevice: {gpu.name}")
        print(f"Compute Capability: sm_{gpu.arch}")

        # Route CuPy work onto the cuda.core stream.
        cp.cuda.Stream.from_external(gpu_stream).use()

        # Test signal: five tones with individual amplitudes.
        tone_freqs = [440.0, 880.0, 1320.0, 2000.0, 5000.0]  # Hz
        tone_amps = [1.0, 0.5, 0.3, 0.7, 0.4]

        duration_s = num_samples / sample_rate
        print("\nSignal Parameters:")
        print(f" Samples: {num_samples:,}")
        print(f" Sample Rate: {sample_rate:,.0f} Hz")
        print(f" Duration: {duration_s:.3f} seconds")
        print(f" Input Frequencies: {tone_freqs} Hz")
        print(f" Input Amplitudes: {tone_amps}")

        # Build the signal on the host, then copy it to the device.
        host_signal = generate_composite_signal(num_samples, sample_rate, tone_freqs, tone_amps)
        dev_signal = cp.asarray(host_signal)

        # ----- GPU FFT (cuFFT via CuPy) -----
        _section("GPU FFT (cuFFT)")

        timing = EventOptions(timing_enabled=True)

        # Untimed warmup run, then wait for it to finish.
        dev_spectrum = cp.fft.rfft(dev_signal)
        gpu_stream.sync()

        # Bracket the timed iterations with two events on the stream.
        ev_begin = gpu_stream.record(options=timing)
        for _ in range(num_iterations):
            dev_spectrum = cp.fft.rfft(dev_signal)
        ev_finish = gpu_stream.record(options=timing)
        ev_finish.sync()

        gpu_time_ms = (ev_finish - ev_begin) / num_iterations
        print(f"Time: {gpu_time_ms:.3f} ms")

        # Magnitude spectrum and the matching frequency axis.
        dev_magnitude = cp.abs(dev_spectrum) * 2 / num_samples
        dev_freq_bins = cp.fft.rfftfreq(num_samples, 1 / sample_rate)

        peaks = find_dominant_frequencies(dev_magnitude, dev_freq_bins)

        print("\nDetected Frequencies:")
        for peak_freq, peak_mag in peaks:
            print(f" {peak_freq:8.1f} Hz (magnitude: {peak_mag:.4f})")

        # ----- CPU FFT (NumPy) for comparison -----
        _section("CPU FFT (NumPy)")

        # Untimed warmup run.
        host_spectrum = np.fft.rfft(host_signal)

        t_begin = time.perf_counter()
        for _ in range(num_iterations):
            host_spectrum = np.fft.rfft(host_signal)
        t_finish = time.perf_counter()

        cpu_time_ms = (t_finish - t_begin) * 1000 / num_iterations
        print(f"Time: {cpu_time_ms:.3f} ms")

        # ----- Performance summary -----
        _section("PERFORMANCE SUMMARY")
        speedup = cpu_time_ms / gpu_time_ms
        print(f"GPU (cuFFT): {gpu_time_ms:.3f} ms")
        print(f"CPU (NumPy): {cpu_time_ms:.3f} ms")
        print(f"Speedup: {speedup:.1f}x")

        # ----- Verification -----
        _section("VERIFICATION")

        # Reference magnitude from the CPU spectrum, moved to the device.
        ref_magnitude = cp.asarray(np.abs(host_spectrum).astype(np.float32)) * 2 / num_samples

        print("GPU vs CPU FFT magnitude: ", end="")
        spectra_match = verify_array_result(
            dev_magnitude,
            ref_magnitude,
            rtol=1e-4,
            atol=1e-6,
        )

        # Each input tone must have a detected peak within 10 Hz of it.
        print("\nFrequency Detection Accuracy:")
        peak_freqs = [peak_freq for peak_freq, _ in peaks]
        every_tone_found = True
        for tone in tone_freqs:
            hit = any(abs(candidate - tone) < 10 for candidate in peak_freqs)
            if hit:
                marker = "[OK]"
            else:
                marker = "[FAIL]"
            print(f" {tone:6.0f} Hz: {marker}")
            every_tone_found = every_tone_found and hit

        return spectra_match and every_tone_found

    finally:
        # Always hand CuPy back its null stream before closing ours.
        cp.cuda.Stream.null.use()
        gpu_stream.close()


def main() -> None:
    """Run the analysis; exit with status 1 if it reports a failure."""
    if run_fft_analysis():
        print("\nDone")
        return
    print("\nAnalysis completed with errors")
    sys.exit(1)
**Baseline**: both kernels run on the device's primary context, + on two non-blocking streams that contend for all SMs. +3. **Green contexts**: the SMs are split into two disjoint groups + (e.g. 112 + 16). Each kernel runs on a stream that belongs to its + own green context, so the critical kernel never waits for SMs + held by the long-running kernel. + +The headline metric is the total wall time of the critical kernel +from launch to completion. In the baseline it is dominated by time +spent waiting behind the long-running kernel. With green contexts it +reflects the kernel's own compute time on its (smaller) SM +partition. The reference row lets you separate those two effects: + +- `baseline - reference` is roughly the time the critical kernel + spent waiting for SMs in the baseline run (the cost that green + contexts eliminate). +- `green / reference` is the compute slowdown caused by running on + a smaller SM partition (the cost that green contexts introduce). + +## What You'll Learn + +- Querying a device's SM resources via `Device.resources.sm` and + reading `sm_count`, `min_partition_size`, `coscheduled_alignment` +- Splitting an `SMResource` into disjoint partitions with + `sm.split(SMResourceOptions(count=(A, B)))` +- Creating a green context from an SM partition via + `Device.create_context(ContextOptions(resources=[group]))` +- Creating a non-blocking stream on a green context with + `ctx.create_stream()` +- Using CUDA events with timing enabled to measure kernel wall time + across streams +- Cleaning up green contexts safely with `ctx.close()` + +## Key Libraries + +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - device management, SM partitioning, green contexts, compilation, and launching +- `numpy` - scalar kernel arguments + +## Key APIs + +### From `cuda.core` + +- `Device.resources.sm` - the device's SM-type device resource +- `SMResource.split(SMResourceOptions(count=(A, B)))` - partition SMs + into disjoint groups (plus 
an optional remainder) +- `Device.create_context(ContextOptions(resources=[sm_group]))` - + create a green context provisioned with a specific SM partition +- `Context.is_green` / `Context.resources` - introspect a green + context +- `Context.create_stream()` - create a non-blocking stream that is + tied to the green context's SM partition +- `Context.close()` - destroy a green context (must not be the + thread's current context when closed) +- `Device.create_event(EventOptions(timing_enabled=True))` / + `Stream.record(event)` / `event2 - event1` - measure elapsed time + in milliseconds between two events on the device +- `Program(..., ProgramOptions(std="c++17", arch=f"sm_{device.arch}"))` + / `program.compile("cubin", name_expressions=(...))` - compile the + delay and critical kernels in one TU +- `launch(stream, LaunchConfig(grid=..., block=...), kernel, ...)` - + submit a kernel on a specific stream + +## Requirements + +### Hardware + +- Any NVIDIA GPU supported by green contexts. +- Green-context SM partitioning is designed for larger server GPUs + (H100, H200, B200, ...) but works on any supported GPU as long as + the SM count is large enough to split meaningfully. + +### Software + +- NVIDIA driver >= 12.4 +- CUDA Toolkit 13.0 or newer. +- Python 3.10 or newer. +- `cuda-core` (`>=1.0.0`) + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/greenContext +pip install -r requirements.txt +``` + +## How to Run + +### Basic usage + +The auto-default split reserves a small partition (~16 SMs) for the +critical kernel and gives the rest to the long-running kernel. The +exact sizes are chosen by probing the driver with a dry-run `sm.split`, +escalating the alignment granularity in powers of two until the driver +accepts the pair. This handles architectures where the driver enforces +stricter alignment (e.g. TPC/GPC-pair alignment on Blackwell) than the +reported `min_partition_size`. 
When that happens the sample prints a +`Note:` line with the granularity it landed on. + +```bash +cd cuda-samples/python/2_CoreConcepts/greenContext +python greenContext.py +``` + +### Match the CUDA programming guide example (112 + 16) + +```bash +python greenContext.py --split 112,16 +``` + +### Tune the workload + +```bash +# Longer long-running kernel, larger host launch gap +python greenContext.py --delay-us 3000 --launch-gap-ms 2.0 + +# Smaller/lighter critical kernel so its own compute time is negligible +python greenContext.py --critical-n 65536 --critical-iters 128 + +# Symmetric split: maximum SMs for the critical kernel, long kernel is +# roughly 2x slower but the critical kernel runs close to its reference time. +python greenContext.py --split 64,64 + +# Use a specific GPU +python greenContext.py --device 1 +``` + +### All options + +``` +--device CUDA device ID (default: 0) +--split SM split as 'LONG,CRITICAL', e.g. '112,16'. + Each side must be a multiple of the device's + min_partition_size, and the driver may enforce additional + architecture-specific alignment (e.g. TPC/partition-grid + alignment on Blackwell). Omit --split to auto-select a + driver-accepted split. +--delay-us Per-block busy-wait of the delay kernel, in us (default: 2000) +--delay-waves Number of waves of the delay kernel on the long + partition. Drives the default --delay-blocks (default: 16) +--delay-blocks Number of blocks for the delay kernel. Overrides + --delay-waves if set. + (default: --delay-waves * device SM count) +--critical-n Work size of the critical kernel (default: 4194304) +--critical-iters Inner math-loop iterations inside the critical kernel. + Higher values make the critical kernel's own compute + time more substantial relative to its wait time + (default: 1024) +--launch-gap-ms Host delay between launching the long and critical + kernels (default: 1.0 ms) +``` + +## Expected Output + +The output depends on the GPU and the number of SMs. 
+On an RTX 4090 (128 SMs) with the default auto split: + +``` +[Green Context Sample using CUDA Core API] +Device: NVIDIA GeForce RTX 4090 +Compute Capability: sm_89 +Total SMs: 128 +Min. SM partition size: 2 +SM co-scheduled alignment: 2 +SM split (long/critical): 112 / 16 +Workload parameters: + delay kernel: 2048 blocks, 2000 us/block (~32.0 ms on 128 SMs) + critical kernel: 4194304 elements, 1024 inner iterations + host launch gap: 1.0 ms + +Compiling kernels ... +Running reference scenario (critical kernel alone) ... +Running baseline scenario (primary context) ... +Running green context scenario ... + +scenario SMs (long/crit) long (ms) crit total (ms) crit offset (ms) +------------------------------------------------------------------------------------------------------- +crit alone (primary ctx) -/128 - 0.425 - +baseline (primary ctx) 128/128 32.034 30.024 1.090 +green ctx (112+16 SMs) 112/16 38.017 2.696 1.075 + +long (ms) : wall time of the delay kernel +crit total (ms) : launch-to-complete wall time of the critical kernel +crit offset (ms) : when the critical stream started, relative to the long stream start + +Critical-kernel latency speedup (baseline vs green ctx): 11.14x +Green-ctx compute cost vs unconstrained (crit alone): 6.34x +Baseline time spent waiting for SMs (not computing): ~29.60 ms + +Done +``` + +**What to look for:** + +- The critical kernel alone (reference row) takes only a fraction of + a millisecond; almost all of the baseline's `crit total` is time + spent queued waiting for SMs, not compute. +- The **critical kernel's wall time drops sharply** in the + green-context scenario (from ~30 ms to a few ms in the example + above) because it no longer waits for SMs held by the long-running + kernel. +- The **long-running kernel's duration may increase** proportional + to the reduction in SMs available to it (128 -> 112 SMs ~= 14% + slower; 128 -> 64 SMs ~= 2x slower). 
This is an expected tradeoff: + you reserve SMs for a critical kernel by taking them away from the + background workload. +- The **compute cost** ratio (`green / reference`) shows how close + the critical kernel is to ideal linear scaling with its SM count. + A 112/16 split gives the critical kernel only 12.5% of the SMs and + costs it roughly 6-7x its reference time; a 64/64 split gives it + half the SMs and costs roughly 1.5-2x. +- The `crit offset` column is approximately `--launch-gap-ms` in + both full scenarios; it confirms the host launched the critical + kernel the same amount of time after the long kernel in both runs. + +Exact timings vary with GPU model, driver version, clock state, and +other concurrent GPU work. + +## Files + +- `greenContext.py` - Python implementation using `cuda.core` green-context APIs +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [Green Contexts in the CUDA C++ Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#green-contexts) +- [`cuda.core` green-context test suite](https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/tests/test_green_context.py) - the authoritative API reference diff --git a/samples/greenContext/greenContext.py b/samples/greenContext/greenContext.py new file mode 100644 index 00000000000..e8464c0d6fe --- /dev/null +++ b/samples/greenContext/greenContext.py @@ -0,0 +1,727 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "numpy>=2.3.2"] +# /// + +""" +Green Context Sample using CUDA Core API. + +Three scenarios are timed with CUDA events and compared: + + 1. Reference: the critical kernel alone on the primary context, + with no competing work. Establishes the pure compute time of + the critical kernel with access to every SM on the device. + 2. Baseline: both kernels run on the device's primary context, + on two non-blocking streams. They contend for all SMs. + 3. Green contexts: SMs are split into two disjoint groups; each + kernel runs on a stream belonging to its own green context. + +The headline metric is the total wall time of the critical kernel +from launch to completion on its stream. 
In the baseline it is +dominated by waiting behind the long-running kernel; with green +contexts it reflects only the kernel's own compute time on a +smaller SM partition. The reference row separates those effects. + +Note: Parallel execution on the GPU is never guaranteed. Green +contexts remove one common source of contention (shared SMs) but +they are not a hard scheduling promise. +""" + +import argparse +import sys +import time +from dataclasses import dataclass +from typing import List, Tuple + +import numpy as np + +from cuda.core import ( + ContextOptions, + Device, + EventOptions, + LaunchConfig, + Program, + ProgramOptions, + SMResourceOptions, + launch, +) + +# Two CUDA kernels: +# 1. The delay kernel spins until `cycles` SM clock ticks have elapsed. +# 2. The critical kernel does a small amount of useful work. + +KERNEL_SRC = r""" +extern "C" __global__ void delay_kernel(unsigned long long cycles) +{ + unsigned long long start = clock64(); + while ((unsigned long long)(clock64() - start) < cycles) { } +} + +extern "C" __global__ void critical_kernel(float *out, int n, int iters) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + // Two dependent accumulators so the compiler cannot collapse the + // loop into a closed-form expression. `iters` is a runtime argument + // for the same reason. 
+ float v = (float)i * 1e-6f + 1.0f; + float u = (float)i * 1e-7f + 0.5f; + for (int k = 0; k < iters; ++k) { + v = v * 1.000001f + u; + u = u * 0.999999f + v * 1e-7f; + } + out[i] = v + u; + } +} +""" + + +@dataclass +class ScenarioResult: + name: str + critical_total_ms: float + critical_sm_count: int + long_ms: float | None = None + critical_offset_ms: float | None = None + long_sm_count: int | None = None + + +def print_sm_topology(device: Device) -> None: + sm = device.resources.sm + print("[Green Context Sample using CUDA Core API]") + print(f"Device: {device.name}") + print(f"Compute Capability: sm_{device.arch}") + print(f"Total SMs: {sm.sm_count}") + print(f"Min. SM partition size: {sm.min_partition_size}") + print(f"SM co-scheduled alignment: {sm.coscheduled_alignment}") + + +def _align_down(n: int, k: int) -> int: + if k <= 0: + return n + return (n // k) * k + + +def _driver_accepts_split(sm, long_count: int, critical_count: int) -> bool: + if long_count <= 0 or critical_count <= 0: + return False + try: + groups, _ = sm.split( + SMResourceOptions(count=(long_count, critical_count)), + dry_run=True, + ) + except Exception: + return False + actual = tuple(g.sm_count for g in groups) + return actual == (long_count, critical_count) + + +def _find_working_split(sm, prefer_critical: int | None = None) -> Tuple[int, int, int] | None: + """ + Probe the driver for a (long, critical) split it actually accepts. + + Escalates the alignment granularity from `min_partition_size` upward in + powers of two, requiring BOTH sides to be multiples of the current + granularity. This handles architectures where the driver's true + allocation granularity is larger than the reported + `min_partition_size` (e.g. TPC/GPC-pair alignment on Blackwell: on a + 188-SM part `min_partition_size` is 8 but the driver actually requires + each side to be a multiple of 16). + + Returns (long_count, critical_count, granularity) or None. 
The + granularity is the smallest power-of-two multiple of + `min_partition_size` at which both sides are aligned and the driver + accepts the pair. + """ + total = sm.sm_count + min_part = sm.min_partition_size + if min_part <= 0: + return None + + if prefer_critical is None or prefer_critical <= 0: + prefer_critical = max(min_part, min(16, total // 8)) + + # Escalate granularity in powers of two. The upper bound is half of + # `total` because below that we cannot fit two partitions of size + # >= granularity. + granularity = min_part + while granularity * 2 <= total: + base = max(granularity, _align_down(prefer_critical, granularity)) + + candidates: List[int] = [] + seen = set() + + # Default-arg binding captures the current loop iteration's values + # (granularity, seen, candidates). push is never stored — it's only + # called in this same iteration — but the explicit binding silences + # ruff B023 and documents the intent. + def push(c: int, granularity: int = granularity, seen: set = seen, candidates: List[int] = candidates) -> None: + if c >= granularity and c <= total - granularity and c not in seen: + seen.add(c) + candidates.append(c) + + # Walk outward from `base` (the preferred critical size, aligned + # down to the current granularity) in steps of granularity. + push(base) + max_steps = max(total // granularity, 1) + for step in range(1, max_steps + 1): + push(base + step * granularity) + push(base - step * granularity) + + for critical in candidates: + long_count = _align_down(total - critical, granularity) + if long_count < granularity: + continue + if _driver_accepts_split(sm, long_count, critical): + return long_count, critical, granularity + + granularity *= 2 + + return None + + +def _format_suggestion(sm, prefer_critical: int | None) -> str | None: + """ + Return a '--split A,B' string the driver is known to accept, or None + if we couldn't find one. 
def _print_split_tip(sm, prefer_critical: int | None, fallback: str | None = None) -> None:
    """Print a driver-accepted '--split A,B' tip, or `fallback` when none is found."""
    suggestion = _format_suggestion(sm, prefer_critical=prefer_critical)
    if suggestion is not None:
        print(f"Tip: a driver-accepted split on this device is {suggestion}")
    elif fallback is not None:
        print(fallback)


def parse_split(arg: str | None, device: Device) -> Tuple[int, int]:
    """
    Parse the --split "A,B" CLI argument and validate it against the device.

    With `arg` set to None a split is chosen automatically by probing the
    driver through `_find_working_split`. Otherwise `arg` must hold exactly
    two comma-separated positive integers whose sum does not exceed the
    device's SM count; each must be a multiple of `min_partition_size`, and
    the pair must survive a dry-run `sm.split` unchanged.

    Returns (long_count, critical_count).

    Every failure prints a diagnostic (plus a suggested split when one can be
    found) and terminates the process through `sys.exit(1)`.
    """
    sm = device.resources.sm
    total = sm.sm_count
    min_part = sm.min_partition_size

    if arg is None:
        # Auto: reserve a small aligned slice for the critical kernel and
        # hand the rest (also aligned) to the long-running kernel.
        # `min_partition_size` alone is not trusted: the search escalates
        # the granularity until the driver accepts a pair.
        prefer_critical = max(min_part, min(16, total // 8))
        found = _find_working_split(sm, prefer_critical=prefer_critical)
        if found is None:
            print(
                "Error: could not find an SM split that the driver accepts "
                f"on this device (total SMs={total}, "
                f"min_partition_size={min_part})."
            )
            print(
                " The driver enforces architecture-specific alignment "
                "rules beyond min_partition_size; try passing an explicit "
                "--split."
            )
            sys.exit(1)
        long_count, critical_count, granularity = found
        if granularity > min_part:
            print(
                f"Note: driver required stricter alignment than "
                f"min_partition_size={min_part}; selected split uses "
                f"granularity={granularity} SMs."
            )
        return long_count, critical_count

    # User-provided split.
    try:
        parts = [int(x.strip()) for x in arg.split(",")]
    except ValueError:
        print(f"Error: --split must look like 'A,B', got: {arg!r}")
        sys.exit(1)
    if len(parts) != 2:
        print(f"Error: --split must contain exactly two comma-separated integers, got: {arg!r}")
        sys.exit(1)
    long_count, critical_count = parts

    errors = []
    if long_count <= 0 or critical_count <= 0:
        errors.append("both partition sizes must be positive")
    # A non-positive min_partition_size gives nothing to check against (and
    # would divide by zero); the other helpers guard the same case. The
    # driver dry-run below remains the final arbiter.
    if min_part > 0 and (long_count % min_part != 0 or critical_count % min_part != 0):
        errors.append(f"each size must be a multiple of min_partition_size={min_part}")
    if long_count + critical_count > total:
        errors.append(f"sum {long_count + critical_count} exceeds device total of {total} SMs")

    if errors:
        print("Error: invalid --split value:")
        for e in errors:
            print(f" - {e}")
        _print_split_tip(sm, critical_count if critical_count > 0 else None)
        sys.exit(1)

    # Confirm the driver itself accepts the split. The well-known alignment
    # checks above are necessary but not sufficient on every architecture.
    try:
        groups, _ = sm.split(
            SMResourceOptions(count=(long_count, critical_count)),
            dry_run=True,
        )
    except Exception as e:
        print(f"Error: driver rejected the requested split: {e}")
        print(
            " The sample's own alignment checks are not exhaustive on "
            "every architecture; the driver enforces additional hardware "
            "constraints (for example TPC/partition-grid alignment)."
        )
        _print_split_tip(sm, critical_count)
        sys.exit(1)

    # The dry run may succeed yet hand back different sizes than requested.
    actual = tuple(g.sm_count for g in groups)
    if actual != (long_count, critical_count):
        print(f"Error: driver adjusted the requested split to {actual}.")
        _print_split_tip(
            sm,
            critical_count,
            fallback=" Pick a different --split, or omit it for the auto default.",
        )
        sys.exit(1)

    return long_count, critical_count
+ opts = EventOptions(timing_enabled=True) + e_long_start = device.create_event(opts) + e_long_end = device.create_event(opts) + e_crit_start = device.create_event(opts) + e_crit_end = device.create_event(opts) + + # 1024 threads/block ensures at most one delay block is resident per SM + # on current architectures, so grid size directly controls the number of + # waves: delay_blocks / sm_count_visible_to_stream. + delay_block = 1024 + delay_cfg = LaunchConfig(grid=delay_blocks, block=delay_block) + critical_block = 256 + critical_grid = (critical_n + critical_block - 1) // critical_block + critical_cfg = LaunchConfig(grid=critical_grid, block=critical_block) + + # Start of timed region + long_stream.record(e_long_start) + launch(long_stream, delay_cfg, delay_kernel, np.uint64(delay_cycles)) + long_stream.record(e_long_end) + + time.sleep(launch_gap_s) + + critical_stream.record(e_crit_start) + launch( + critical_stream, + critical_cfg, + critical_kernel, + critical_out_ptr, + np.int32(critical_n), + np.int32(critical_iters), + ) + critical_stream.record(e_crit_end) + + # Sync both streams so every event has completed and is measurable. + long_stream.sync() + critical_stream.sync() + # End of timed region + + return ScenarioResult( + name=name, + long_ms=e_long_end - e_long_start, + critical_total_ms=e_crit_end - e_crit_start, + critical_offset_ms=e_crit_start - e_long_start, + long_sm_count=long_sm_count, + critical_sm_count=critical_sm_count, + ) + + +def run_critical_alone( + device: Device, + critical_kernel, + critical_n: int, + critical_iters: int, +) -> ScenarioResult: + """ + Critical kernel alone on the primary context, no competing work. + Establishes the pure compute time with every SM on the device available. 
def run_green_context(
    device: Device,
    split: Tuple[int, int],
    delay_kernel,
    critical_kernel,
    delay_cycles: int,
    delay_blocks: int,
    critical_n: int,
    critical_iters: int,
    launch_gap_s: float,
) -> ScenarioResult:
    """
    Each kernel on its own green context, with disjoint SM partitions.

    Splits the device's SM resource into `split = (long_count,
    critical_count)`, creates one green context per partition and one stream
    per context, allocates the critical kernel's output buffer, and delegates
    the timed launches to `_run_one`.

    Everything acquired here is released before returning, in reverse order
    of acquisition: output buffer, critical stream, long stream, critical
    context, long context. The release also happens when a later step or the
    run itself raises.

    Raises RuntimeError if the split does not yield exactly two SM groups.
    """
    # Function-scope import: keeps this block self-contained, in the same way
    # `main` imports `traceback` locally.
    from contextlib import ExitStack

    long_count, critical_count = split
    sm = device.resources.sm
    groups, _remainder = sm.split(SMResourceOptions(count=(long_count, critical_count)))
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(groups) != 2:
        raise RuntimeError(f"expected 2 SM groups from sm.split, got {len(groups)}")
    long_group, critical_group = groups

    # ExitStack runs its callbacks last-in-first-out, and still runs the
    # remaining ones if a callback raises, so the long context is closed even
    # if closing the critical context fails.
    with ExitStack() as cleanup:
        ctx_long = device.create_context(ContextOptions(resources=[long_group]))
        cleanup.callback(ctx_long.close)
        ctx_crit = device.create_context(ContextOptions(resources=[critical_group]))
        cleanup.callback(ctx_crit.close)
        # NOTE(review): closing a context that is the thread's current one is
        # described as unsafe elsewhere in this sample. Nothing visible here
        # makes either green context current; confirm that `create_context`
        # does not do so implicitly.

        # Streams must be released before their owning context. The local
        # names stay bound until this function returns, so scope exit alone
        # would come too late; close them explicitly instead.
        long_stream = ctx_long.create_stream()
        cleanup.callback(long_stream.close)
        critical_stream = ctx_crit.create_stream()
        cleanup.callback(critical_stream.close)

        # 4 bytes per element: critical_kernel writes a `float *out` array.
        out = device.allocate(critical_n * 4, stream=critical_stream)
        cleanup.callback(out.close)

        return _run_one(
            device,
            name=f"green ctx ({long_count}+{critical_count} SMs)",
            long_stream=long_stream,
            critical_stream=critical_stream,
            long_sm_count=ctx_long.resources.sm.sm_count,
            critical_sm_count=ctx_crit.resources.sm.sm_count,
            delay_kernel=delay_kernel,
            critical_kernel=critical_kernel,
            delay_cycles=delay_cycles,
            delay_blocks=delay_blocks,
            critical_out_ptr=int(out.handle),
            critical_n=critical_n,
            critical_iters=critical_iters,
            launch_gap_s=launch_gap_s,
        )
parser.add_argument( + "--split", + type=str, + default=None, + help="SM split as 'LONG,CRITICAL', e.g. '112,16'. Default: auto.", + ) + parser.add_argument( + "--delay-us", + type=int, + default=2000, + help=("Per-block busy-wait duration of the delay kernel, in microseconds (default: 2000)"), + ) + parser.add_argument( + "--delay-waves", + type=int, + default=16, + help=( + "Number of waves of the delay kernel on the long partition. " + "Drives the default --delay-blocks (default: 16)." + ), + ) + parser.add_argument( + "--delay-blocks", + type=int, + default=None, + help=( + "Number of blocks launched for the delay kernel. " + "Overrides --delay-waves if set. " + "Default: --delay-waves * device SM count." + ), + ) + parser.add_argument( + "--critical-n", + type=int, + default=1 << 22, + help="Work size of the critical kernel (default: 4194304)", + ) + parser.add_argument( + "--critical-iters", + type=int, + default=1024, + help=( + "Iterations of the inner math loop inside the critical kernel. " + "Higher values make the critical kernel's compute time more " + "substantial (default: 1024)." + ), + ) + parser.add_argument( + "--launch-gap-ms", + type=float, + default=1.0, + help=("Host delay between launching the long and critical kernels, in ms (default: 1.0)"), + ) + args = parser.parse_args() + + try: + device = Device(args.device) + device.set_current() + except Exception as e: + print(f"Error: failed to initialize CUDA device {args.device}: {e}") + return 1 + + print_sm_topology(device) + + long_count, critical_count = parse_split(args.split, device) + print(f"SM split (long/critical): {long_count} / {critical_count}") + + sm_count = device.resources.sm.sm_count + delay_blocks = args.delay_blocks or args.delay_waves * sm_count + delay_cycles = microseconds_to_cycles(device, args.delay_us) + launch_gap_s = max(0.0, args.launch_gap_ms / 1000.0) + + # Rough estimate of the long kernel's duration on the full device. 
Mostly + # informational; the real value is reported after the run. + est_long_ms = (delay_blocks / sm_count) * (args.delay_us / 1000.0) + + print("Workload parameters:") + print(f" delay kernel: {delay_blocks} blocks, {args.delay_us} us/block (~{est_long_ms:.1f} ms on {sm_count} SMs)") + print(f" critical kernel: {args.critical_n} elements, {args.critical_iters} inner iterations") + print(f" host launch gap: {args.launch_gap_ms} ms") + + print() + print("Compiling kernels ...") + delay_k, crit_k = compile_kernels(device) + + try: + print("Running reference scenario (critical kernel alone) ...") + alone = run_critical_alone( + device, + crit_k, + args.critical_n, + args.critical_iters, + ) + + print("Running baseline scenario (primary context) ...") + baseline = run_baseline( + device, + delay_k, + crit_k, + delay_cycles, + delay_blocks, + args.critical_n, + args.critical_iters, + launch_gap_s, + ) + + print("Running green context scenario ...") + green = run_green_context( + device, + (long_count, critical_count), + delay_k, + crit_k, + delay_cycles, + delay_blocks, + args.critical_n, + args.critical_iters, + launch_gap_s, + ) + except Exception as e: + print(f"Error: scenario failed: {e}") + import traceback + + traceback.print_exc() + return 1 + + print_results([alone, baseline, green]) + report_speedup(alone, baseline, green) + + print("\nDone") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/greenContext/requirements.txt b/samples/greenContext/requirements.txt new file mode 100644 index 00000000000..e25bb158237 --- /dev/null +++ b/samples/greenContext/requirements.txt @@ -0,0 +1,3 @@ +cuda-python>=13.0.0 +cuda-core>=1.0.0 +numpy>=2.3.2 diff --git a/samples/ipcMemoryPool/README.md b/samples/ipcMemoryPool/README.md new file mode 100644 index 00000000000..cfecb531681 --- /dev/null +++ b/samples/ipcMemoryPool/README.md @@ -0,0 +1,140 @@ +# ipcMemoryPool (Python) + +## Description + +This sample demonstrates how to share GPU 
memory between Python +processes using CUDA Inter-Process Communication (IPC) and +`cuda.core`'s IPC-enabled memory pools. + +By default each process has its own CUDA virtual address space and +cannot see allocations made by another process. With an IPC-enabled +`DeviceMemoryResource` the parent allocates once, and the child +process maps that same physical GPU memory into its own address space +so both read and write the same bytes. The sample performs a +round-trip test: + +1. Parent creates an IPC-enabled `DeviceMemoryResource` and allocates + a `Buffer`. +2. Parent fills the buffer with a known pattern. +3. Parent sends the `Buffer` to a child process through a + `multiprocessing.Queue`. cuda.core's pickle reducers re-create the + memory resource and map the buffer in the child. +4. Child verifies the parent's pattern, writes a new pattern, and + signals completion. +5. Parent verifies the child's writes. + +## What You'll Learn + +- Enabling IPC on a `DeviceMemoryResource` with `ipc_enabled=True` +- Sending `Buffer` objects across process boundaries via `mp.Queue` +- How cuda.core's pickle reducers rebuild the memory resource (MR) and map the buffer + in the receiving process +- Why `multiprocessing` must use the `"spawn"` start method with CUDA +- Detecting IPC support at runtime (POSIX file-descriptor handle + type, memory-pool support, Linux-only) + +## Key Libraries + +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - IPC-enabled memory resources and buffer reducers +- `cupy` - zero-copy views over the shared device memory via DLPack +- `multiprocessing` - standard library process management + +## Key APIs + +### From `cuda.core` + +- `DeviceMemoryResource(device, options=DeviceMemoryResourceOptions(ipc_enabled=True))` - create an IPC-enabled memory pool +- `DeviceMemoryResourceOptions(max_size=..., ipc_enabled=True)` - configure the underlying pool +- `mr.allocate(nbytes)` - allocate a `Buffer` from the IPC pool +- `Buffer.is_mapped` - True when 
the buffer is usable in the current process +- `Device.properties.memory_pools_supported` - runtime feature check +- `Device.properties.handle_type_posix_file_descriptor_supported` - runtime feature check + +### From `cuda_samples_utils` + +- `print_gpu_info()` - print device name and compute capability + +## Requirements + +### Hardware + +- NVIDIA GPU with Compute Capability 7.0 or higher +- Device that supports CUDA memory pools and POSIX file-descriptor IPC handles (the sample detects and reports this at startup) +- Minimum GPU memory: 512 MB + +### Software + +- Linux x86_64 (POSIX file-descriptor IPC handles are not available on Windows or macOS) +- CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x) +- Python 3.10 or newer +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/4_DistributedComputing/ipcMemoryPool +pip install -r requirements.txt +``` + +The `requirements.txt` installs: + +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/4_DistributedComputing/ipcMemoryPool +python ipcMemoryPool.py +``` + +### With custom parameters + +```bash +# Larger shared buffer +python ipcMemoryPool.py --elements 65536 + +# Use a specific GPU +python ipcMemoryPool.py --device 1 +``` + +On platforms or devices that do not support CUDA IPC, the sample +prints a diagnostic and exits cleanly with status 0. + +## Expected Output + +``` +Device: <GPU name> +Compute Capability: <major.minor> + +Created IPC-enabled DeviceMemoryResource (is_ipc_enabled=True) +Parent wrote pattern (first 5 values): [100. 101. 102. 103. 104.] +Parent sent buffer to child pid=<pid>; waiting... +[child pid=<pid>] received buffer: is_mapped=True, size=4096 +Parent sees child's pattern (first 5 values): [-0. -1. -2. -3. -4.] 
+IPC round-trip: OK +``` + +**Note:** Device name, compute capability, and child PID will vary +based on your system. + +## Files + +- `ipcMemoryPool.py` - Python implementation using `cuda.core` IPC memory pools +- `README.md` - This file +- `requirements.txt` - Sample dependencies +- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample) + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [`cuda.core` memory API](https://nvidia.github.io/cuda-python/cuda-core/latest/api.html#memory) +- Upstream `cuda.core` IPC tests: [`test_memory_ipc.py`](https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/tests/memory_ipc/test_memory_ipc.py) +- [CUDA IPC programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#interprocess-communication) diff --git a/samples/ipcMemoryPool/ipcMemoryPool.py b/samples/ipcMemoryPool/ipcMemoryPool.py new file mode 100644 index 00000000000..f0df564bcc2 --- /dev/null +++ b/samples/ipcMemoryPool/ipcMemoryPool.py @@ -0,0 +1,214 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0", "numpy>=1.24.0"] +# /// + +""" +IPC Memory Pool with cuda.core + +Share GPU memory between Python processes using CUDA Inter-Process +Communication (IPC) and cuda.core's IPC-enabled memory pools. By default +each worker process has its own CUDA virtual address space and cannot see +allocations made by another process. With an IPC-enabled +``DeviceMemoryResource`` the parent can allocate once, and the child +process can map that same physical GPU memory into its own address space +so both read and write the same bytes. + +The sample does a round-trip test: + + 1. Parent creates an IPC-enabled ``DeviceMemoryResource`` and allocates + a ``Buffer``. + 2. Parent fills the buffer with a known pattern. + 3. Parent sends the ``Buffer`` to a child process through an + ``mp.Queue`` - cuda.core's pickle reducers take care of re-creating + the memory resource and mapping the buffer in the child. + 4. Child verifies the parent's pattern, writes a new pattern, and + signals completion. + 5. Parent verifies the child's writes. + +IPC requires Linux (POSIX file-descriptor handles) and device support for +memory pools. 
# Upper bound, in seconds, for every blocking queue/join wait on either side.
CHILD_TIMEOUT_SEC = 30


def check_ipc_support(device) -> bool:
    """Return True if this device/platform supports CUDA IPC memory pools.

    Three conditions are checked in order, and the first failing one is
    reported on stdout: the host OS must be Linux, the device must report
    ``memory_pools_supported`` and it must report
    ``handle_type_posix_file_descriptor_supported``.

    Args:
        device: Object exposing a ``properties`` attribute with the two
            boolean fields named above (a ``cuda.core.Device`` in the sample).

    Returns:
        True when all three checks pass, False otherwise.
    """
    system = platform.system()
    if system != "Linux":
        print(f"IPC via POSIX file descriptors is only supported on Linux (detected {system}).")
        return False
    props = device.properties
    if not props.memory_pools_supported:
        print("Device does not support CUDA memory pools.")
        return False
    if not props.handle_type_posix_file_descriptor_supported:
        print("Device/platform does not support POSIX-fd IPC handles.")
        return False
    return True


def child_worker(q_in, q_out, n_elements, parent_seed, child_seed, device_id=0):
    """Run in a separate process: verify and then overwrite the shared buffer.

    Args:
        q_in: Queue on which the parent sends the IPC-enabled ``Buffer``.
        q_out: Queue on which this worker reports ``"done"`` or ``"fail"``.
        n_elements: Number of float32 elements in the shared buffer.
        parent_seed: Offset the parent added to ``arange(n_elements)``.
        child_seed: Factor this worker multiplies ``arange(n_elements)`` by.
        device_id: CUDA device ordinal to make current in this process. It
            must match the device the parent allocated on; defaults to 0 so
            existing five-argument callers keep working.
    """
    device = Device(device_id)
    device.set_current()
    pid = mp.current_process().pid

    # The Buffer (and its memory resource) are reconstructed and mapped in
    # this process when the queued object is unpickled.
    buffer = q_in.get(timeout=CHILD_TIMEOUT_SEC)
    try:
        print(f"[child pid={pid}] received buffer: is_mapped={buffer.is_mapped}, size={buffer.size}")

        # Build a zero-copy CuPy view of the shared device memory.
        arr = cp.from_dlpack(buffer).view(dtype=cp.float32)

        expected_parent = cp.arange(n_elements, dtype=cp.float32) + float(parent_seed)
        if cp.allclose(arr, expected_parent):
            # Write a new pattern for the parent to verify.
            arr[:] = cp.arange(n_elements, dtype=cp.float32) * float(child_seed)
            device.sync()
            status = "done"
        else:
            print("[child] ERROR: parent's pattern did not match expectation")
            status = "fail"
    finally:
        # Release the mapping even if verification raised.
        buffer.close()
    q_out.put(status)


def _round_trip(device, device_id, buffer, n_elements, parent_seed, child_seed) -> int:
    """Fill ``buffer``, hand it to a child process and verify the child's writes.

    Returns 0 when the child reports ``"done"``, exits with code 0 and the
    whole buffer holds the child's pattern; returns 1 otherwise.
    """
    import queue

    # Fill the buffer with a known pattern from the parent side.
    arr = cp.from_dlpack(buffer).view(dtype=cp.float32)
    arr[:] = cp.arange(n_elements, dtype=cp.float32) + float(parent_seed)
    device.sync()
    print(f"Parent wrote pattern (first 5 values): {arr[:5].get()}")

    q_to_child = mp.Queue()
    q_from_child = mp.Queue()
    child = mp.Process(
        target=child_worker,
        args=(q_to_child, q_from_child, n_elements, parent_seed, child_seed, device_id),
    )
    child.start()
    try:
        q_to_child.put(buffer)
        print(f"Parent sent buffer to child pid={child.pid}; waiting...")
        try:
            msg = q_from_child.get(timeout=CHILD_TIMEOUT_SEC)
        except queue.Empty:
            # The child crashed or hung; report it below instead of raising.
            msg = None
        child.join(timeout=CHILD_TIMEOUT_SEC)
    finally:
        # A non-daemon child that is still alive would block interpreter
        # exit, so never leave one behind.
        if child.is_alive():
            child.terminate()
            child.join()

    if msg != "done" or child.exitcode != 0:
        print(f"Child failed: msg={msg!r}, exitcode={child.exitcode}")
        return 1

    # Verify the child's writes are visible from the parent (whole buffer,
    # not just the values that get printed).
    device.sync()
    got = arr.get()
    expected = np.arange(n_elements, dtype=np.float32) * child_seed
    print(f"Parent sees child's pattern (first 5 values): {got[:5]}")
    if np.allclose(got, expected):
        print("IPC round-trip: OK")
        return 0
    print(f"IPC round-trip: FAILED (expected {expected[:5]})")
    return 1


def main() -> int:
    """Parse the command line and run the IPC round-trip sample.

    Returns:
        0 on success or when IPC is unsupported on this system, 1 when the
        round-trip fails.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Share a GPU buffer between two processes via CUDA IPC")
    parser.add_argument(
        "--elements",
        type=int,
        default=1024,
        help="Number of float32 elements in the shared buffer (default: 1024)",
    )
    parser.add_argument("--device", type=int, default=0, help="CUDA device id")
    args = parser.parse_args()
    if args.elements <= 0:
        parser.error("--elements must be a positive integer")

    # CUDA is incompatible with the ``fork`` start method because forked
    # children inherit a corrupt CUDA state. Always use ``spawn``.
    mp.set_start_method("spawn", force=True)

    device = Device(args.device)
    device.set_current()
    print_gpu_info(device)

    if not check_ipc_support(device):
        print("\nCUDA IPC is not available on this system; exiting cleanly.")
        return 0

    n_elements = args.elements
    nbytes = n_elements * np.dtype(np.float32).itemsize
    parent_seed = 100
    child_seed = -1.0

    # Create an IPC-enabled memory pool. Buffers allocated from this MR
    # are picklable and can be shared across processes.
    mr = DeviceMemoryResource(
        device,
        options=DeviceMemoryResourceOptions(
            max_size=max(nbytes * 4, 1 << 20),
            ipc_enabled=True,
        ),
    )
    try:
        print(f"Created IPC-enabled DeviceMemoryResource (is_ipc_enabled={mr.is_ipc_enabled})")
        buffer = mr.allocate(nbytes, stream=device.default_stream)
        try:
            return _round_trip(device, args.device, buffer, n_elements, parent_seed, child_seed)
        finally:
            buffer.close()
    finally:
        # Runs even when allocate() itself fails, so the pool never leaks.
        mr.close()


if __name__ == "__main__":
    sys.exit(main())
This is the pattern a library would use to accept +user-supplied device code as a plug-in without recompiling its own +kernels from scratch. + +The sample runs the same program in two linking modes: + +1. **PTX linking** - each module is compiled with + `ProgramOptions(relocatable_device_code=True)` down to PTX, and the + `Linker` emits a final cubin. The two modules stay independently + compiled (no cross-module inlining). +2. **Link-Time Optimization (LTO)** - each module is compiled with + `ProgramOptions(link_time_optimization=True)` down to LTO IR, and the + `Linker` is configured with `LinkerOptions(link_time_optimization=True)` + so the optimizer runs again across both modules, typically matching + the code generation of a single-source build. + +The "main" kernel `apply_transform` calls a `user_transform` device +function that lives in a separate source string, and the results of both +linking modes are verified against a NumPy reference. + +## What You'll Learn + +- Compiling multiple `Program` objects into PTX or LTO IR +- Linking independent object codes into a single cubin with `Linker` +- Choosing between `relocatable_device_code` and `link_time_optimization` +- How a library's main kernel can call into user-supplied device code +- When to prefer LTO over plain PTX linking + +## Key Libraries + +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - Pythonic access to CUDA runtime, programs, and the JIT linker +- `cupy` - input and output buffers on the GPU +- `numpy` - reference computation on the host + +## Key APIs + +### From `cuda.core` + +- `ProgramOptions(relocatable_device_code=True)` + `Program.compile("ptx")` - produce relocatable PTX +- `ProgramOptions(link_time_optimization=True)` + `Program.compile("ltoir")` - produce LTO IR +- `Linker(*object_codes, options=LinkerOptions(...))` - create a JIT linker over multiple object codes +- `LinkerOptions(link_time_optimization=True)` - opt into LTO during linking +- 
`Linker.link("cubin")` - produce a loadable module +- `ObjectCode.get_kernel(name)` - fetch a kernel from the linked module + +### From `cuda_samples_utils` + +- `print_gpu_info()` - print device name and compute capability + +## Requirements + +### Hardware + +- NVIDIA GPU with Compute Capability 7.0 or higher + +### Software + +- CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x) +- Python 3.10 or newer +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/jitLtoLinking +pip install -r requirements.txt +``` + +The `requirements.txt` installs: + +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/2_CoreConcepts/jitLtoLinking +python jitLtoLinking.py +``` + +### With custom parameters + +```bash +# Larger element count +python jitLtoLinking.py --elements 1048576 + +# Use a specific GPU +python jitLtoLinking.py --device 1 +``` + +## Expected Output + +``` +Device: +Compute Capability: + +[1] PTX linking (no LTO) + [ptx] result verified against NumPy reference + +[2] LTO linking (link-time optimization) + [lto] result verified against NumPy reference + +Both PTX and LTO linked kernels produced matching results. Done +``` + +**Note:** Device name and compute capability will vary based on your GPU. 
+ +## Files + +- `jitLtoLinking.py` - Python implementation using `cuda.core.Linker` +- `README.md` - This file +- `requirements.txt` - Sample dependencies +- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample) + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [`cuda.core` compilation API](https://nvidia.github.io/cuda-python/cuda-core/latest/api.html#cuda-compilation-toolchain) +- Upstream `cuda.core` example: [`jit_lto_fractal.py`](https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/jit_lto_fractal.py) +- [NVIDIA nvJitLink documentation](https://docs.nvidia.com/cuda/nvjitlink/index.html) diff --git a/samples/jitLtoLinking/jitLtoLinking.py b/samples/jitLtoLinking/jitLtoLinking.py new file mode 100644 index 00000000000..b5455107579 --- /dev/null +++ b/samples/jitLtoLinking/jitLtoLinking.py @@ -0,0 +1,222 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0"] +# /// + +""" +JIT Compilation and Link-Time Optimization with cuda.core + +Real-world GPU code is rarely a single source string. Libraries ship a +"main" kernel that is compiled once, then link in user-supplied device +functions at runtime to customize behavior without recompiling the whole +program. + +cuda.core exposes this pattern through ``Program`` (NVRTC compilation) +and ``Linker`` (JIT linking of multiple object codes). Two modes are +shown here: + + * **PTX linking**: compile each translation unit with + ``relocatable_device_code=True`` to PTX and link to a CUBIN. + The two modules remain independently compiled: no cross-module + inlining. + + * **LTO (Link-Time Optimization)**: compile each translation unit + with ``link_time_optimization=True`` to LTO IR, then link with + ``LinkerOptions(link_time_optimization=True)``. The linker reruns + the optimizer across both modules and can inline the device function + into the main kernel, typically matching a single-source build. + +The same kernel math runs in both modes and is verified against a +NumPy reference. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import cupy as cp + import numpy as np + from cuda_samples_utils import print_gpu_info + + from cuda.core import ( + Device, + LaunchConfig, + Linker, + LinkerOptions, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +# -------------------------------------------------------------------------- +# Module A: the "library" main kernel. It forwards each element through a +# user-supplied device function (resolved at link time) and writes the result. +# -------------------------------------------------------------------------- +MAIN_SRC = r""" +// Forward declare the user-supplied hook. Its definition lives in a separate +// translation unit and is resolved by the Linker at runtime. +extern "C" __device__ float user_transform(float x); + +extern "C" __global__ +void apply_transform(const float* __restrict__ in, + float* __restrict__ out, + size_t N) +{ + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)gridDim.x * blockDim.x; + for (size_t i = tid; i < N; i += stride) { + out[i] = user_transform(in[i]); + } +} +""" + +# -------------------------------------------------------------------------- +# Module B: the user-supplied "plug-in" device function. A different +# implementation of ``user_transform`` here produces different results without +# rebuilding MAIN_SRC. +# -------------------------------------------------------------------------- +USER_SRC = r""" +extern "C" __device__ +float user_transform(float x) +{ + // A deliberately non-trivial expression so LTO has something to inline / + // optimize across the module boundary. + float y = x * x + 3.0f * x - 1.0f; + return y > 0.0f ? 
def host_reference(x: np.ndarray) -> np.ndarray:
    """Compute the NumPy reference for ``user_transform``: max(x*x + 3x - 1, 0).

    Args:
        x: Input values.

    Returns:
        A float32 array with the polynomial applied element-wise and
        non-positive results replaced by 0.
    """
    y = x * x + 3.0 * x - 1.0
    return np.where(y > 0.0, y, 0.0).astype(np.float32)


def _compile_and_link(device, *, lto):
    """Compile MAIN_SRC and USER_SRC separately and link them into a cubin.

    With ``lto=False`` both sources are compiled to relocatable PTX; with
    ``lto=True`` both are compiled to LTO IR and the linker is created with
    ``link_time_optimization=True`` as well.
    """
    arch = f"sm_{device.arch}"
    if lto:
        prog_opts = ProgramOptions(std="c++17", arch=arch, link_time_optimization=True)
        linker_opts = LinkerOptions(arch=arch, link_time_optimization=True)
        target = "ltoir"
    else:
        prog_opts = ProgramOptions(std="c++17", arch=arch, relocatable_device_code=True)
        linker_opts = LinkerOptions(arch=arch)
        target = "ptx"

    objects = [Program(src, "c++", options=prog_opts).compile(target) for src in (MAIN_SRC, USER_SRC)]
    return Linker(*objects, options=linker_opts).link("cubin")


def link_ptx(device):
    """Compile both modules to PTX and link them into a cubin (no LTO)."""
    return _compile_and_link(device, lto=False)


def link_lto(device):
    """Compile both modules to LTO IR and link with LTO enabled."""
    return _compile_and_link(device, lto=True)


def run_one_mode(mode, module, stream, d_in, d_out, size, expected):
    """Launch ``apply_transform`` from ``module`` and check it against ``expected``.

    Args:
        mode: Short label used in the printed status line.
        module: Linked object code that contains ``apply_transform``.
        stream: Stream the kernel is launched on and synchronized afterwards.
        d_in: Device input array (its ``data.ptr`` is passed to the kernel).
        d_out: Device output array (its ``data.ptr`` is passed to the kernel).
        size: Number of elements to process; must be positive.
        expected: Host array holding the reference result.

    Returns:
        True when the device result matches ``expected`` within tolerance.
    """
    kernel = module.get_kernel("apply_transform")
    block = 256
    config = LaunchConfig(grid=(size + block - 1) // block, block=block)
    launch(
        stream,
        config,
        kernel,
        d_in.data.ptr,
        d_out.data.ptr,
        np.uint64(size),  # matches the kernel's size_t parameter
    )
    stream.sync()

    actual = cp.asnumpy(d_out)
    if np.allclose(actual, expected, rtol=1e-5, atol=1e-5):
        print(f"  [{mode}] result verified against NumPy reference")
        return True
    max_err = np.max(np.abs(actual - expected))
    print(f"  [{mode}] verification FAILED (max_err={max_err})")
    return False


def main() -> int:
    """Run the sample in PTX-link and LTO-link mode and compare both to NumPy.

    Returns:
        0 when both modes verify, 1 otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description="JIT + LTO linking of two device modules with cuda.core")
    parser.add_argument(
        "--elements",
        type=int,
        default=1 << 16,
        help="Number of float32 elements (default: 65536)",
    )
    parser.add_argument("--device", type=int, default=0, help="CUDA device id")
    args = parser.parse_args()
    # A non-positive count would produce an empty launch grid (or a NumPy
    # error for negatives); reject it with a usage message instead.
    if args.elements <= 0:
        parser.error("--elements must be a positive integer")

    device = Device(args.device)
    device.set_current()
    print_gpu_info(device)

    stream = device.create_stream()
    try:
        # Make CuPy allocate/copy on the same stream the kernels run on.
        cp.cuda.Stream.from_external(stream).use()

        n_elements = args.elements
        rng = np.random.default_rng(seed=0)
        host_in = rng.standard_normal(n_elements).astype(np.float32)
        expected = host_reference(host_in)

        d_in = cp.asarray(host_in)
        d_out = cp.empty(n_elements, dtype=cp.float32)
        device.sync()

        print("\n[1] PTX linking (no LTO)")
        ptx_module = link_ptx(device)
        ok_ptx = run_one_mode("ptx", ptx_module, stream, d_in, d_out, n_elements, expected)

        # Clear the output so the second mode cannot pass on stale data.
        d_out.fill(0)
        device.sync()

        print("\n[2] LTO linking (link-time optimization)")
        lto_module = link_lto(device)
        ok_lto = run_one_mode("lto", lto_module, stream, d_in, d_out, n_elements, expected)

        print()
        if ok_ptx and ok_lto:
            print("Both PTX and LTO linked kernels produced matching results. Done")
            return 0
        return 1
    finally:
        # Detach CuPy from the stream before destroying it.
        cp.cuda.Stream.null.use()
        stream.close()


if __name__ == "__main__":
    sys.exit(main())
It implements three GPU operations (vector addition, SAXPY, vector transform) as custom kernels and shows how to instrument code with NVTX markers for profiling analysis. + +## What you will learn + +- How to write and compile CUDA C++ kernels with `cuda.core.Program` +- How to launch kernels with `LaunchConfig` and manage CUDA streams +- How to use NVTX markers (`nvtx.annotate()`) to annotate code sections +- How to profile kernels with Nsight Systems and analyze performance +- Modern CUDA Python workflow with `cuda.core.Device` and proper resource cleanup + +## Requirements + +- NVIDIA GPU with Compute Capability 7.0+ +- CUDA Toolkit 13.0+ +- Python 3.10+ +- Packages: `numpy`, `cuda-python`, `cuda-core`, `cupy-cuda13x`, `nvtx` (see `requirements.txt`; NumPy >=2.3.2) + +**Install:** +```bash +pip install -r requirements.txt +``` + +## How to run + +```bash +python kernelNsysProfile.py +python kernelNsysProfile.py --array-size 10000000 # Custom size +``` + +## Nsys Profiling + +**Basic profile:** +```bash +nsys profile -o gpu_profile python kernelNsysProfile.py +nsys-ui gpu_profile.nsys-rep # View results +``` + +The program uses color-coded NVTX markers: +- **Purple**: Phase 2 (cuda.core Custom Kernels - main focus) +- **Yellow/Blue/Green**: Other phases +- **Cyan**: Nested operations + +Focus on Phase 2 to analyze kernel execution times, launch overhead, and GPU utilization. 
+ +**For detailed Nsys usage and analysis techniques, see the [NVIDIA Nsight Systems documentation](https://docs.nvidia.com/nsight-systems/).** + +## Troubleshooting + +**Missing packages:** +```bash +pip install -r requirements.txt +``` + +**Out of memory:** +```bash +python kernelNsysProfile.py -n 10000000 # Reduce array size +``` + +**Nsys not found:** +```bash +export PATH=/usr/local/cuda/bin:$PATH +``` + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [NVIDIA Nsight Systems Documentation](https://docs.nvidia.com/nsight-systems/) +- [CuPy Documentation](https://docs.cupy.dev/) diff --git a/samples/kernelNsysProfile/kernelNsysProfile.py b/samples/kernelNsysProfile/kernelNsysProfile.py new file mode 100644 index 00000000000..99a68d19fa3 --- /dev/null +++ b/samples/kernelNsysProfile/kernelNsysProfile.py @@ -0,0 +1,323 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["numpy>=2.3.2", "cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0", "nvtx"] +# /// + +""" +Kernel Nsys Profiling Sample - CUDA C++ Kernel Profiling with cuda.core + +This sample demonstrates how to profile custom CUDA C++ kernels compiled and +launched with cuda.core using NVIDIA Nsight Systems. + +The sample implements three common GPU operations as custom CUDA C++ kernels: +- Vector addition: c = a + b +- SAXPY: y = alpha * x + y +- Vector transform: sqrt(x*x + 1) + sin(x) + +Use Nsight Systems to analyze: +- Custom kernel execution times +- Kernel launch patterns and overhead +- GPU utilization and memory access patterns +- NVTX markers for structured profiling + +Workflow: +- Phase 1: Create GPU arrays +- Phase 2: Compile and execute cuda.core custom kernels (profiling focus) +- Phase 3: Verify correctness with CuPy reference implementation +- Phase 4: Validate results +""" + +import argparse +import sys +from pathlib import Path + +try: + import cupy as cp + import numpy as np + import nvtx + + from cuda.core import Device, LaunchConfig, launch +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + +# Add parent directory to path to import utilities +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import 
# CUDA C++ kernel definitions
# For larger projects, separating kernels into a separate file is also valid.
# Each kernel is a template over the element type T, so a concrete
# instantiation has to be requested when the program is compiled.
KERNELS_CODE = """
template <typename T>
__global__ void vector_add(const T* a, const T* b, T* c, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        c[i] = a[i] + b[i];
    }
}

template <typename T>
__global__ void saxpy(const T alpha, const T* x, T* y, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        y[i] = alpha * x[i] + y[i];
    }
}

template <typename T>
__global__ void vector_transform(const T* a, T* b, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        T val = a[i];
        b[i] = sqrt(val * val + T(1.0)) + sin(val);
    }
}
"""

# Short kernel name -> template instantiation requested from the compiler.
# The sample operates on float32 arrays, hence the ``float`` instantiations.
KERNEL_NAME_EXPRESSIONS = {
    "vector_add": "vector_add<float>",
    "saxpy": "saxpy<float>",
    "vector_transform": "vector_transform<float>",
}


def get_cuda_core_kernels(device):
    """
    Compile cuda.core kernels and return them.

    All templates in ``KERNELS_CODE`` are compiled in one program. The
    instantiations listed in ``KERNEL_NAME_EXPRESSIONS`` are passed as
    ``name_expressions`` and the same strings are used to look the kernels
    up in the compiled module.

    Args:
        device: cuda.core.Device object (its ``arch`` selects the ``sm_XX`` target)

    Returns:
        dict: Compiled kernels keyed by short name
            (``"vector_add"``, ``"saxpy"``, ``"vector_transform"``)
    """
    from cuda.core import Program, ProgramOptions

    # Compile all kernels at once
    program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    prog = Program(KERNELS_CODE, code_type="c++", options=program_options)
    mod = prog.compile(
        "cubin",
        name_expressions=tuple(KERNEL_NAME_EXPRESSIONS.values()),
    )

    # Extract individual kernels under their short names
    return {short: mod.get_kernel(expr) for short, expr in KERNEL_NAME_EXPRESSIONS.items()}
Device() + dev.set_current() + + print() + print(f"Device: {dev.name}") + print(f"Compute Capability: sm_{dev.arch}") + print() + + # Synchronize device + dev.sync() + + except Exception as e: + print("ERROR: CUDA initialization failed!") + print(f"Error: {e}") + sys.exit(1) + + print("Profiling cuda.core Custom Kernels") + print(f"Array size: {size:,}\n") + + # Constant for SAXPY operation + alpha = 2.5 + + # Initialize random seed + rng = cp.random.default_rng(42) + + # ================================================================= + # Phase 1: Create GPU Arrays with CuPy + # ================================================================= + with nvtx.annotate("Create GPU Arrays", color="yellow"): + a_gpu = rng.standard_normal(size, dtype=cp.float32) + b_gpu = rng.standard_normal(size, dtype=cp.float32) + dev.sync() + + print("Phase 1: Created arrays on GPU with CuPy") + print(f" Array shape: {a_gpu.shape}") + print(f" Array dtype: {a_gpu.dtype}") + print(f" Array a - Mean: {float(cp.mean(a_gpu)):.4f}, Std: {float(cp.std(a_gpu)):.4f}") + print(f" Array b - Mean: {float(cp.mean(b_gpu)):.4f}, Std: {float(cp.std(b_gpu)):.4f}\n") + + # ================================================================= + # Phase 2: cuda.core Custom Kernels on GPU + # ================================================================= + with nvtx.annotate("cuda.core Custom Kernels", color="purple"): + print("Phase 2: cuda.core custom CUDA C++ kernels on GPU") + + # Create a stream for cuda.core operations + stream = dev.create_stream() + try: + with nvtx.annotate("Compile Kernels", color="cyan"): + kernels_dict = get_cuda_core_kernels(dev) + stream.sync() + print("Compiled custom CUDA C++ kernels") + + # Prepare launch configuration + # Grid-stride loops in kernels handle any grid size robustly + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) + + # Execute cuda.core vector_add kernel + with nvtx.annotate("Vector Add (cuda.core)", 
color="cyan"): + c_cuda = cp.empty_like(a_gpu) + launch( + stream, + config, + kernels_dict["vector_add"], + a_gpu.data.ptr, + b_gpu.data.ptr, + c_cuda.data.ptr, + cp.uint64(size), + ) + stream.sync() + + # Execute cuda.core SAXPY kernel + with nvtx.annotate("SAXPY (cuda.core)", color="cyan"): + y_cuda = b_gpu.copy() + launch( + stream, + config, + kernels_dict["saxpy"], + np.float32(alpha), + a_gpu.data.ptr, + y_cuda.data.ptr, + cp.uint64(size), + ) + stream.sync() + + # Execute cuda.core vector_transform kernel + with nvtx.annotate("Vector Transform (cuda.core)", color="cyan"): + transform_cuda = cp.empty_like(a_gpu) + launch( + stream, + config, + kernels_dict["vector_transform"], + a_gpu.data.ptr, + transform_cuda.data.ptr, + cp.uint64(size), + ) + stream.sync() + + print("Vector Addition (custom kernel)") + print("SAXPY (custom kernel)") + print("Vector Transform (custom kernel)\n") + finally: + stream.close() + + # ================================================================= + # Phase 3: Generate Reference Results with CuPy (for verification) + # ================================================================= + with nvtx.annotate("Generate Reference Results", color="blue"): + print("Phase 3: Generate reference results for verification") + + with nvtx.annotate("Vector Add (Reference)", color="cyan"): + c_cupy = a_gpu + b_gpu + dev.sync() + + with nvtx.annotate("SAXPY (Reference)", color="cyan"): + y_cupy = alpha * a_gpu + b_gpu + dev.sync() + + with nvtx.annotate("Vector Transform (Reference)", color="cyan"): + transform_cupy = cp.sqrt(a_gpu * a_gpu + 1.0) + cp.sin(a_gpu) + dev.sync() + + print("Reference results generated\n") + + # ================================================================= + # Phase 4: Verify Kernel Correctness + # ================================================================= + with nvtx.annotate("Verification", color="green"): + print("Phase 4: Verify kernel correctness") + + # Verify custom kernels against reference 
results + # Use relaxed tolerances for single-precision float comparisons + # Small differences can occur due to instruction ordering and + # compiler optimizations + print(" Validating cuda.core kernels:") + + print(" Vector Add: ", end="") + vec_add_match = verify_array_result(c_cuda, c_cupy, rtol=1e-5, atol=1e-6) + + print(" SAXPY: ", end="") + saxpy_match = verify_array_result(y_cuda, y_cupy, rtol=1e-5, atol=1e-6) + + print(" Transform: ", end="") + transform_match = verify_array_result(transform_cuda, transform_cupy, rtol=1e-5, atol=1e-6) + + all_pass = vec_add_match and saxpy_match and transform_match + + if not all_pass: + print("\n ERROR: Kernel verification failed!") + return 1 + print() + + # Final synchronization + dev.sync() + print("The sample is complete PASSED!") + + +def main(): + parser = argparse.ArgumentParser( + description="Kernel Nsys Profiling - Profile custom CUDA C++ kernels with cuda.core" + ) + parser.add_argument( + "-n", + "--array-size", + type=int, + default=50000, + metavar="N", + help="Array size (default: 50,000)", + ) + + args = parser.parse_args() + run(size=args.array_size) + + +if __name__ == "__main__": + main() diff --git a/samples/kernelNsysProfile/requirements.txt b/samples/kernelNsysProfile/requirements.txt new file mode 100644 index 00000000000..e86d7bbf31f --- /dev/null +++ b/samples/kernelNsysProfile/requirements.txt @@ -0,0 +1,7 @@ +# Nsight System Kernels Profiling Sample - Requirements + +numpy>=2.3.2 +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 +nvtx diff --git a/samples/launchConfigTuning/README.md b/samples/launchConfigTuning/README.md new file mode 100644 index 00000000000..6870f1d7e9c --- /dev/null +++ b/samples/launchConfigTuning/README.md @@ -0,0 +1,193 @@ +# Sample: Launch Configuration Tuning (Python) + +## Description + +Benchmark different CUDA kernel launch configurations to find the optimal block-size setting using `cuda.core` APIs. 
This sample demonstrates **performance tuning** by measuring execution time across various thread block sizes. + +## What You'll Learn + +- Compiling CUDA kernels at runtime with `cuda.core.Program` +- Launching kernels with different `LaunchConfig` settings +- Benchmarking kernel performance with precise timing +- Understanding how thread block size affects performance +- Tuning for memory-bound vs compute-bound kernels + +## Key Concepts + +### Launch Configuration with cuda.core + +```python +# Configure kernel launch with specific thread block size +config = LaunchConfig( + grid=(grid_size,), + block=(block_size,), + shmem_size=shared_memory_bytes +) + +# Launch kernel +launch(stream, config, kernel, *args) +stream.sync() +``` + +### Thread Block Sizing + +Thread block size significantly impacts performance due to: + +| Factor | Impact | +|--------|--------| +| **Occupancy** | More active warps can hide memory latency | +| **Registers** | More threads/block = fewer registers/thread | +| **Shared Memory** | Divided among blocks on each SM | +| **Warp Efficiency** | Block size should be multiple of 32 | + +### Benchmarking Approach + +```python +# Use CUDA events for accurate GPU timing (not CPU wall-clock) +start_event = device.create_event(options=EventOptions(timing_enabled=True)) +end_event = device.create_event(options=EventOptions(timing_enabled=True)) + +stream.record(start_event) +for _ in range(n_iterations): + launch(stream, config, kernel, *args) +stream.record(end_event) +end_event.sync() +elapsed_ms = (end_event - start_event) / n_iterations +``` + +## Key APIs + +### From `cuda.core`: + +- `Device` - CUDA device management +- `Program` - Runtime kernel compilation (NVRTC) +- `ProgramOptions` - Compilation options (architecture target) +- `LaunchConfig` - Kernel launch configuration (grid/block dimensions) +- `launch` - Execute compiled kernel (accepts Buffer objects directly) +- `EventOptions` - GPU timing with CUDA events +- `ManagedMemoryResource` - 
Device-preferred unified memory +- `ManagedMemoryResourceOptions` - Set preferred_location for representative benchmarks + +### From `numpy`: + +- `np.from_dlpack()` - Zero-copy view of GPU buffers via DLPack + +### Benchmarked Kernels: + +- **vector_add** - Simple memory-bound kernel (C = A + B) - low sensitivity to block size +- **reduce_sum** - Shared memory reduction - high sensitivity to block size + +## Requirements + +### Hardware: + +- NVIDIA GPU with CUDA support +- Minimum GPU memory: 512 MB + +### Software: + +- CUDA Toolkit 13.0 or newer +- Python 3.10 or newer +- See `requirements.txt` for Python packages + +### Platform Support: + +The benchmark loops in this sample read kernel results back from +`ManagedMemoryResource` allocations between launches, which requires the +device property `concurrent_managed_access=True`. This is only supported on +Linux with HMM (Pascal and newer). On Windows (WDDM/MCDM/TCC) the property +is `False`, so the sample exits early with a waive message and exit code +`2`. + +## Installation + +```bash +pip install -r requirements.txt +``` + +## How to Run + +```bash +python launchConfigTuning.py +``` + +## Expected Output + +``` +============================================================ +Launch Configuration Tuning (cuda.core) +Finding the Best Block Size for Your Kernel +============================================================ + +Device: +Compute Capability: X.X + +Compiling CUDA kernels with cuda.core.Program... 
+ Target architecture: sm_XX + [OK] vector_add kernel compiled + [OK] reduce_sum kernel compiled + +============================================================ +VECTOR ADDITION - Launch Configuration Tuning +============================================================ + +Problem size: 10,000,000 elements +Kernel: vector_add (C = A + B) + +Testing thread configurations: [32, 64, 128, 256, 512, 1024] +------------------------------------------------------------ +Block Size: 32 | Blocks: 312500 | Time: X.XXXX ms +Block Size: 64 | Blocks: 156250 | Time: X.XXXX ms +... +------------------------------------------------------------ + +[OK] OPTIMAL: block_size=XXX (X.XXXX ms) +[FAIL] WORST: block_size=XXX (X.XXXX ms) + Speedup: X.XXx + +[OK] Results verified correct! + +... + +============================================================ +SAMPLE COMPLETE +============================================================ + +Key Takeaway: The optimal thread configuration depends on your +specific kernel characteristics. Always benchmark to find the best! 
+``` + +## Tuning Guidelines + +### Start Here +- **128-256 threads/block** is a good starting point for most kernels +- Always use **multiples of 32** (warp size) + +### Memory-Bound Kernels +- Less sensitive to thread configuration +- Focus on memory access patterns +- Higher thread counts help hide latency + +### Compute-Bound Kernels +- More sensitive to thread configuration +- Watch for register pressure at high thread counts +- Profile with Nsight Compute + +### Reduction Kernels +- Block size affects shared memory usage +- Power-of-2 sizes simplify reduction logic +- Often 256-512 threads works well + +## Files + +- `launchConfigTuning.py` - Python implementation using cuda.core +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## See Also + +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [cuda.core.LaunchConfig](https://nvidia.github.io/cuda-python/cuda-core/latest/generated/cuda.core.LaunchConfig.html) +- [CUDA Occupancy Calculator](https://docs.nvidia.com/cuda/cuda-occupancy-calculator/) +- [CUDA Best Practices Guide - Execution Configuration](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#execution-configuration-optimizations) +- [Nsight Compute Profiler](https://developer.nvidia.com/nsight-compute) diff --git a/samples/launchConfigTuning/launchConfigTuning.py b/samples/launchConfigTuning/launchConfigTuning.py new file mode 100644 index 00000000000..bbd6d30dca9 --- /dev/null +++ b/samples/launchConfigTuning/launchConfigTuning.py @@ -0,0 +1,387 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "numpy>=2.3.2"] +# /// + +""" +Launch Configuration Tuning + +Demonstrates how to find the optimal threads-per-block configuration for CUDA +kernels using cuda.core APIs. Benchmarks different thread layouts to answer: +"What is the best threads-per-block for my kernel?" 
+""" + +import sys + +try: + import numpy as np + + from cuda.core import ( + Device, + EventOptions, + LaunchConfig, + ManagedMemoryResource, + ManagedMemoryResourceOptions, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +# ============================================================================= +# CUDA Kernel Source Code +# ============================================================================= + +# Vector Addition Kernel - Simple memory-bound kernel (grid-stride loop) +VECTOR_ADD_KERNEL = r""" +extern "C" __global__ +void vector_add(const float* __restrict__ a, + const float* __restrict__ b, + float* __restrict__ c, + int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = idx; i < n; i += stride) { + c[i] = a[i] + b[i]; + } +} +""" + +# Reduction Kernel - Sensitive to block size due to shared memory (grid-stride load) +REDUCTION_KERNEL = r""" +extern "C" __global__ +void reduce_sum(const float* __restrict__ input, + float* __restrict__ partial_sums, + int n) { + extern __shared__ float sdata[]; + + unsigned int tid = threadIdx.x; + unsigned int stride = blockDim.x * gridDim.x; + + // Load data into shared memory (grid-stride loop) + float sum = 0.0f; + for (unsigned int i = blockIdx.x * blockDim.x + tid; i < n; i += stride) { + sum += input[i]; + } + sdata[tid] = sum; + __syncthreads(); + + // Perform reduction in shared memory + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + // Write result for this block + if (tid == 0) { + partial_sums[blockIdx.x] = sdata[0]; + } +} +""" + + +# ============================================================================= +# Utility Functions +# 
# =============================================================================


def compile_kernel(device, kernel_code, kernel_name):
    """
    Compile a CUDA kernel using cuda.core.Program.

    Args:
        device: cuda.core.Device whose ``arch`` selects the ``sm_XX`` target.
        kernel_code: CUDA C++ source containing the kernel.
        kernel_name: Name of the kernel to look up in the compiled module.

    Returns:
        The kernel object returned by ``get_kernel(kernel_name)``.
    """
    arch = f"sm_{device.arch}"
    options = ProgramOptions(arch=arch)
    program = Program(kernel_code, code_type="c++", options=options)
    compiled = program.compile(target_type="cubin")
    return compiled.get_kernel(kernel_name)


def benchmark_kernel_1d(
    device,
    stream,
    kernel,
    args,
    n_elements,
    block_size,
    n_iterations=100,
    shared_mem_bytes=0,
):
    """
    Benchmark a 1D kernel with given threads-per-block configuration.
    Uses CUDA events for accurate GPU timing.

    Args:
        device: cuda.core.Device used to create the timing events.
        stream: Stream the kernel is launched on.
        kernel: Compiled kernel to launch.
        args: Tuple of kernel arguments, forwarded unchanged to ``launch``.
        n_elements: Problem size; the grid is sized as ceil(n_elements / block_size).
        block_size: Threads per block.
        n_iterations: Number of timed launches averaged into the result.
        shared_mem_bytes: Dynamic shared memory requested per block.

    Returns:
        dict: ``block_size``, ``grid_size``, ``mean_time_ms`` and ``std_time_ms``.

    Raises:
        ValueError: If ``n_elements``, ``block_size`` or ``n_iterations`` is not
            positive (each is used as a divisor or would give an empty grid).
    """
    if n_elements <= 0:
        raise ValueError(f"n_elements must be positive, got {n_elements}")
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")
    if n_iterations <= 0:
        raise ValueError(f"n_iterations must be positive, got {n_iterations}")

    grid_size = (n_elements + block_size - 1) // block_size

    config = LaunchConfig(grid=(grid_size,), block=(block_size,), shmem_size=shared_mem_bytes)

    # Warm-up run
    launch(stream, config, kernel, *args)
    stream.sync()

    # Timed runs with CUDA events
    event_opts = EventOptions(timing_enabled=True)
    start_event = device.create_event(options=event_opts)
    end_event = device.create_event(options=event_opts)
    try:
        stream.record(start_event)
        for _ in range(n_iterations):
            launch(stream, config, kernel, *args)
        stream.record(end_event)
        end_event.sync()

        elapsed_ms = (end_event - start_event) / n_iterations
    finally:
        # Release the events now instead of waiting for garbage collection;
        # this function runs once per tested block size.
        start_event.close()
        end_event.close()

    return {
        "block_size": block_size,
        "grid_size": grid_size,
        "mean_time_ms": elapsed_ms,
        "std_time_ms": 0.0,  # Single measurement with events
    }


def print_gpu_info(device):
    """Print GPU information relevant to launch configuration."""
    print(f"\nDevice: {device.name}")
    cc = device.compute_capability
    print(f"Compute Capability: {cc.major}.{cc.minor}")


def allocate_managed_array(mr, stream, n_elements, dtype=np.float32):
    """
    Allocate device-preferred unified memory and return buffer with numpy view.

    Args:
        mr: Memory resource whose ``allocate`` provides the buffer.
        stream: Stream the allocation is ordered on; synchronized before returning.
        n_elements: Number of ``dtype`` elements to allocate.
        dtype: Element type of the returned numpy view.

    Returns:
        tuple: ``(buffer, np_view)`` where ``np_view`` is a 1-D view of ``buffer``.
    """
    n_bytes = n_elements * np.dtype(dtype).itemsize
    buffer = mr.allocate(n_bytes, stream=stream)
    stream.sync()

    # Zero-copy numpy view via DLPack (holds reference to buffer)
    np_view = np.from_dlpack(buffer).view(dtype).reshape(n_elements)
    return buffer, np_view


# =============================================================================
# Benchmark Demonstrations
# =============================================================================


def demo_vector_add_tuning(device, stream, mr, kernel):
    """
    Demonstrate launch configuration tuning for vector addition.

    Returns:
        list: One result dict from benchmark_kernel_1d per tested block size.

    Raises:
        RuntimeError: If the kernel output does not match the host reference.
    """
    print("\n" + "=" * 60)
    print("VECTOR ADDITION - Launch Configuration Tuning")
    print("=" * 60)

    N = 10_000_000  # 10 million elements
    print(f"\nProblem size: {N:,} elements")
    print("Kernel: vector_add (C = A + B)")

    # Allocate device-preferred unified memory via cuda.core
    d_a, np_a = allocate_managed_array(mr, stream, N)
    d_b, np_b = allocate_managed_array(mr, stream, N)
    d_c, np_c = allocate_managed_array(mr, stream, N)
    try:
        # Initialize data via numpy views
        np_a[:] = np.random.rand(N).astype(np.float32)
        np_b[:] = np.random.rand(N).astype(np.float32)
        stream.sync()

        # Thread configurations to test (multiples of warp size = 32)
        thread_configs = [32, 64, 128, 256, 512, 1024]

        print(f"\nTesting thread configurations: {thread_configs}")
        print("-" * 60)

        results = []
        for tpb in thread_configs:
            result = benchmark_kernel_1d(
                device,
                stream,
                kernel,
                (d_a, d_b, d_c, np.int32(N)),
                N,
                tpb,
                n_iterations=100,
            )
            results.append(result)
            print(f"Block Size: {tpb:4d} | Blocks: {result['grid_size']:6d} | Time: {result['mean_time_ms']:.4f} ms")

        # Find optimal and worst configurations
        best = min(results, key=lambda x: x["mean_time_ms"])
        worst = max(results, key=lambda x: x["mean_time_ms"])

        print("-" * 60)
        print(f"\n[OK] OPTIMAL: block_size={best['block_size']} ({best['mean_time_ms']:.4f} ms)")
        print(f"[FAIL] WORST: block_size={worst['block_size']} ({worst['mean_time_ms']:.4f} ms)")
        print(f" Speedup: {worst['mean_time_ms'] / best['mean_time_ms']:.2f}x")

        # Verify result
        stream.sync()
        expected = np_a + np_b
        if not np.allclose(np_c, expected):
            # Timings of a kernel that computes the wrong answer are meaningless,
            # so fail loudly instead of finishing as if the sample succeeded.
            print("\n[FAIL] Results do not match the host reference!")
            raise RuntimeError("vector_add produced incorrect results")
        print("\n[OK] Results verified correct!")

        return results
    finally:
        d_a.close()
        d_b.close()
        d_c.close()


def demo_reduction_tuning(device, stream, mr, kernel):
    """
    Demonstrate launch config tuning for reduction (shared memory).

    Returns:
        list: One result dict from benchmark_kernel_1d per tested block size.

    Raises:
        RuntimeError: If the per-block partial sums do not add up to the host sum.
    """
    print("\n" + "=" * 60)
    print("REDUCTION - Launch Configuration Tuning")
    print("=" * 60)

    N = 16_777_216  # 16M elements (power of 2)

    print(f"\nProblem size: {N:,} elements")
    print("Kernel: reduce_sum (parallel reduction)")
    print("Note: Reduction uses shared memory - more sensitive to block size!")

    # Allocate device-preferred unified memory via cuda.core
    d_input, np_input = allocate_managed_array(mr, stream, N)
    try:
        np_input[:] = np.random.rand(N).astype(np.float32)
        stream.sync()

        # Host reference, accumulated in float64 so it is accurate enough to
        # compare against the float32 partial sums.
        expected_total = float(np_input.sum(dtype=np.float64))

        thread_configs = [32, 64, 128, 256, 512, 1024]

        print(f"\nTesting thread configurations: {thread_configs}")
        print("-" * 60)

        results = []
        for tpb in thread_configs:
            # Allocate partial sums array
            n_blocks = (N + tpb - 1) // tpb
            d_partial, np_partial = allocate_managed_array(mr, stream, n_blocks)
            try:
                # Shared memory size = block_size * sizeof(float)
                shared_mem_bytes = tpb * 4

                result = benchmark_kernel_1d(
                    device,
                    stream,
                    kernel,
                    (d_input, d_partial, np.int32(N)),
                    N,
                    tpb,
                    n_iterations=50,
                    shared_mem_bytes=shared_mem_bytes,
                )

                # benchmark_kernel_1d synchronizes on its end event before
                # returning, so every block's partial sum is final here.
                gpu_total = float(np_partial.sum(dtype=np.float64))
                if not np.isclose(gpu_total, expected_total, rtol=1e-4):
                    print(f"\n[FAIL] block_size={tpb}: got {gpu_total}, expected {expected_total}")
                    raise RuntimeError("reduce_sum produced incorrect results")

                results.append(result)
                print(
                    f"Block Size: {tpb:4d} | Blocks: {result['grid_size']:6d} | Time: {result['mean_time_ms']:.4f} ms"
                )
            finally:
                d_partial.close()

        best = min(results, key=lambda x: x["mean_time_ms"])
        worst = max(results, key=lambda x: x["mean_time_ms"])

        print("-" * 60)
        print(f"\n[OK] OPTIMAL: block_size={best['block_size']}")
        print(f" Speedup over worst: {worst['mean_time_ms'] / best['mean_time_ms']:.2f}x")

        return results
    finally:
        d_input.close()
# =============================================================================
# Main
# =============================================================================


def main():
    """
    Complete demonstration of CUDA launch configuration tuning.

    This sample shows:
    1. Device initialization with cuda.core.Device
    2. Kernel compilation with cuda.core.Program
    3. Benchmarking different thread block configurations
    4. Finding optimal threads-per-block for various kernel types

    The sample is waived with exit code 2 when the host cannot read managed
    memory concurrently with the GPU: always on Windows, and on any other
    platform whose device reports ``concurrent_managed_access`` as false.
    """
    if sys.platform == "win32":
        print(
            "This sample relies on ManagedMemoryResource with concurrent host "
            "access, which is not supported on Windows "
            "(concurrent_managed_access=False). Waiving this sample."
        )
        sys.exit(2)

    print("=" * 60)
    print("Launch Configuration Tuning (cuda.core)")
    print("Finding the Best Block Size for Your Kernel")
    print("=" * 60)

    # Initialize CUDA device
    device = Device(0)
    device.set_current()

    # Print GPU information
    print_gpu_info(device)

    # The platform name alone is not enough: a non-Windows system can still
    # report the property as false, so ask the device itself.
    if not device.properties.concurrent_managed_access:
        print(
            "This sample relies on ManagedMemoryResource with concurrent host "
            "access, but this device reports concurrent_managed_access=False. "
            "Waiving this sample."
        )
        sys.exit(2)

    stream = device.create_stream()
    try:
        # Device-preferred memory resource; created inside the try so the
        # stream is still closed if construction fails.
        mr_options = ManagedMemoryResourceOptions(preferred_location=device.device_id)
        mr = ManagedMemoryResource(mr_options)

        # Compile kernels
        print("\nCompiling CUDA kernels with cuda.core.Program...")
        arch = f"sm_{device.arch}"
        print(f" Target architecture: {arch}")

        vec_add_kernel = compile_kernel(device, VECTOR_ADD_KERNEL, "vector_add")
        print(" [OK] vector_add kernel compiled")

        reduction_kernel = compile_kernel(device, REDUCTION_KERNEL, "reduce_sum")
        print(" [OK] reduce_sum kernel compiled")

        # Run demonstrations
        demo_vector_add_tuning(device, stream, mr, vec_add_kernel)
        demo_reduction_tuning(device, stream, mr, reduction_kernel)

        print("\n" + "=" * 60)
        print("SAMPLE COMPLETE")
        print("=" * 60)
        print("\nKey Takeaway: The optimal thread configuration depends on your")
        print("specific kernel characteristics. Always benchmark to find the best!")
        print()
    finally:
        stream.close()


if __name__ == "__main__":
    main()
+ +## Overview + +- Uses nvmath.linalg.advanced.Matmul for high-performance GEMM via cuBLASLt +- Compares with custom CUDA kernel using tiling and shared memory +- Shows how tiling reduces global memory bandwidth requirements +- Demonstrates shared memory for data reuse within thread blocks +- Uses loop unrolling to improve instruction-level parallelism + +## What You'll Learn + +- How to use nvmath stateful API for optimized matrix multiplication +- How to tile matrix operations for better cache locality +- Using shared memory to reduce redundant global memory accesses +- Loop unrolling techniques for GPU kernels +- Benchmarking and comparing kernel performance + +## Key Libraries + +- `nvmath-python` - NVIDIA math library with cuBLASLt access +- `cuda.core` - Modern CUDA Python API for custom kernel compilation +- `cupy` - GPU array library for Python + +## Key APIs + +### From `nvmath.linalg.advanced`: + +- `Matmul()` - Stateful matrix multiplication with planning and execution phases +- `MatmulComputeType` - Compute type options for mixed-precision + +### From `cuda.core`: + +- `Device()` - CUDA device management and properties +- `Program()` - Runtime kernel compilation (NVRTC) +- `LaunchConfig()` - Kernel launch configuration (grid/block dimensions) +- `launch()` - Kernel execution on a stream +- `Stream.record_event()` / `Event.elapsed_time()` - GPU timing + +## Requirements + +### Hardware: + +- NVIDIA GPU with Compute Capability 7.0 or higher +- Minimum GPU memory: 256 MB (for 1024×1024 matrices) + +### Software: + +- CUDA Toolkit 13.0 or newer +- Python 3.10 or newer +- See requirements.txt for package dependencies + +## Installation + +```bash +cd cuda-samples/python/2_CoreConcepts/matrixMulSharedMem +python -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +## How to Run + +```bash +python matrixMulSharedMem.py +``` + +## Expected Output + +``` +====================================================================== +Matrix 
Multiplication with Shared Memory (GEMM) +Using nvmath and cuda.core APIs +====================================================================== + +Device: NVIDIA GeForce RTX 4090 +Compute Capability: sm_89 + +Custom kernel compiled [OK] + +Matrix dimensions: A(1024x1024) × B(1024x1024) = C(1024x1024) +Custom kernel tile size: 16x16 + +---------------------------------------------------------------------- +NVMATH MATMUL (cuBLASLt) +---------------------------------------------------------------------- +Using nvmath.linalg.advanced.Matmul stateful API +Average time: X.XXX ms +Performance: XXXX.XX GFLOPS + +---------------------------------------------------------------------- +CUSTOM KERNEL (Tiled + Shared Memory + Loop Unrolling) +---------------------------------------------------------------------- +Grid: (64, 64), Block: (16, 16) +Average time: X.XXX ms +Performance: XXX.XX GFLOPS + +---------------------------------------------------------------------- +VERIFICATION +---------------------------------------------------------------------- +nvmath : PASSED (max error: X.XXe-XX) +Custom kernel : PASSED (max error: X.XXe-XX) + +====================================================================== +PERFORMANCE SUMMARY +====================================================================== +Implementation Time (ms) GFLOPS +---------------------------------------------------------------------- +nvmath (cuBLASLt) X.XXX XXXX.XX +Custom (shared mem + unroll) X.XXX XXX.XX +``` + +## Tiling Concept + +``` + Matrix A (M×K) Matrix B (K×N) Matrix C (M×N) + ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ + │ T00 │ T01 │...│ │ T00 │ T01 │...│ │ │ │ │ + ├─────┼─────┼───┤ ├─────┼─────┼───┤ ├─────┼─────┼───┤ + │ T10 │ T11 │...│ × │ T10 │ T11 │...│ = │ │ Cij │ │ + ├─────┼─────┼───┤ ├─────┼─────┼───┤ ├─────┼─────┼───┤ + │ ... │ ... │...│ │ ... │ ... 
│...│ │ │ │ │ + └───────────────┘ └───────────────┘ └───────────────┘ + + Cij = Σ (A_tile_row × B_tile_col) for all tiles along K +``` + +## nvmath Stateful API + +```python +import nvmath.linalg.advanced as nvmath_advanced + +# Create matrices (CuPy arrays) +A = cp.random.rand(m, k).astype(cp.float32) +B = cp.random.rand(k, n).astype(cp.float32) + +# Use stateful API for fine-grained control +with nvmath_advanced.Matmul(A, B) as mm: + mm.plan() # Find optimal algorithm + C = mm.execute() # Execute computation +``` + +## Memory Access Optimization (Custom Kernel) + +| Implementation | Global Reads per C element | Reduction | +|---------------|---------------------------|-----------| +| Naive | 2 × K | (baseline)| +| Tiled (16×16) | 2 × K / 16 | 16× | + +## Files + +- `matrixMulSharedMem.py` - Python implementation comparing nvmath vs custom kernel +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## See Also + +- [nvmath-python Documentation](https://docs.nvidia.com/cuda/nvmath-python/) +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [cuda.core API Guide](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [CuPy Documentation](https://docs.cupy.dev/) diff --git a/samples/matrixMulSharedMem/matrixMulSharedMem.py b/samples/matrixMulSharedMem/matrixMulSharedMem.py new file mode 100644 index 00000000000..6086a7aaf12 --- /dev/null +++ b/samples/matrixMulSharedMem/matrixMulSharedMem.py @@ -0,0 +1,253 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core==0.7.0", "cupy-cuda13x>=14.0.0", "numpy>=2.3.2", "nvmath-python[cu13]==0.9.0"] +# /// + +""" +Matrix Multiplication with Shared Memory (GEMM) + +Demonstrates efficient matrix multiplication using: +- nvmath.linalg.advanced.Matmul for high-performance GEMM via cuBLASLt +- Custom CUDA kernel with tiling, shared memory, and loop unrolling + +Uses cuda.core APIs with CuPy arrays via Stream.from_external. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import cupy as cp + import numpy as np + import nvmath.linalg.advanced as nvmath_advanced + + from cuda.core import ( + Device, + EventOptions, + LaunchConfig, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Install with: pip install -r requirements.txt") + sys.exit(1) + + +TILE_SIZE: int = 16 + +MATMUL_KERNEL: str = r""" +#define TILE_SIZE 16 + +extern "C" __global__ +void matmul_shared(const float* A, const float* B, float* C, + int M, int N, int K) { + __shared__ float As[TILE_SIZE][TILE_SIZE]; + __shared__ float Bs[TILE_SIZE][TILE_SIZE]; + + int bx = blockIdx.x, by = blockIdx.y; + int tx = threadIdx.x, ty = threadIdx.y; + int row = by * TILE_SIZE + ty; + int col = bx * TILE_SIZE + tx; + + float sum = 0.0f; + int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE; + + for (int t = 0; t < numTiles; t++) { + int aCol = t * TILE_SIZE + tx; + int bRow = t * TILE_SIZE + ty; + + As[ty][tx] = (row < M && aCol < K) ? A[row * K + aCol] : 0.0f; + Bs[ty][tx] = (bRow < K && col < N) ? 
B[bRow * N + col] : 0.0f; + __syncthreads(); + + #pragma unroll + for (int k = 0; k < TILE_SIZE; k += 4) { + sum += As[ty][k] * Bs[k][tx]; + sum += As[ty][k + 1] * Bs[k + 1][tx]; + sum += As[ty][k + 2] * Bs[k + 2][tx]; + sum += As[ty][k + 3] * Bs[k + 3][tx]; + } + __syncthreads(); + } + + if (row < M && col < N) { + C[row * N + col] = sum; + } +} +""" + + +def run_matmul_benchmark( + m: int = 1024, + n: int = 1024, + k: int = 1024, + device_id: int = 0, + num_iterations: int = 10, +) -> bool: + """Run matrix multiplication benchmark comparing nvmath vs custom kernel.""" + print("=" * 60) + print("Matrix Multiplication with Shared Memory (GEMM)") + print("=" * 60) + + # Initialize device and stream + device = Device(device_id) + device.set_current() + stream = device.create_stream() + print(f"\nDevice: {device.name}") + print(f"Compute Capability: sm_{device.arch}") + + # Make CuPy use our cuda.core stream + cp.cuda.Stream.from_external(stream).use() + + # Compile custom kernel + arch = f"sm_{device.arch}" + program = Program(MATMUL_KERNEL, code_type="c++", options=ProgramOptions(arch=arch)) + kernel = program.compile(target_type="cubin").get_kernel("matmul_shared") + print("Custom kernel compiled [OK]") + + # Setup + print(f"\nMatrix: A({m}x{k}) × B({k}x{n}) = C({m}x{n})") + total_ops = 2 * m * n * k + # NOTE: this sample is pinned to cuda-core==0.7.0 (see requirements.txt) + # because nvmath-python 0.9.0 still uses cuda-core's pre-1.0 API name + # `enable_timing`. Once nvmath ships a release compatible with cuda-core + # 1.0, bump the pins in requirements.txt and rename this kwarg to + # `timing_enabled` to match the rest of the samples. 
+ event_opts = EventOptions(enable_timing=True) + + # Allocate matrices + rng = cp.random.default_rng(42) + d_A = rng.random((m, k), dtype=cp.float32) + d_B = rng.random((k, n), dtype=cp.float32) + d_C_custom = cp.zeros((m, n), dtype=cp.float32) + + success = True + try: + # ------------------------------------------------------------------------- + # nvmath GEMM (cuBLASLt) + # ------------------------------------------------------------------------- + print("\n" + "-" * 60) + print("NVMATH (cuBLASLt) - plan once, execute many") + print("-" * 60) + + with nvmath_advanced.Matmul(d_A, d_B, stream=int(stream.handle)) as mm: + mm.plan() + d_C_nvmath = mm.execute() + stream.sync() + + start = stream.record(options=event_opts) + for _ in range(num_iterations): + d_C_nvmath = mm.execute() + end = stream.record(options=event_opts) + end.sync() + + nvmath_ms = (end - start) / num_iterations + nvmath_gflops = (total_ops / 1e9) / (nvmath_ms / 1e3) + print(f"Time: {nvmath_ms:.3f} ms | {nvmath_gflops:.2f} GFLOPS") + + # ------------------------------------------------------------------------- + # Custom kernel (tiled + shared memory + unroll) + # ------------------------------------------------------------------------- + print("\n" + "-" * 60) + print("CUSTOM KERNEL (tiled + shared memory + unroll)") + print("-" * 60) + + block = (TILE_SIZE, TILE_SIZE) + grid = ((n + TILE_SIZE - 1) // TILE_SIZE, (m + TILE_SIZE - 1) // TILE_SIZE) + config = LaunchConfig(grid=grid, block=block) + + launch( + stream, + config, + kernel, + d_A.data.ptr, + d_B.data.ptr, + d_C_custom.data.ptr, + np.int32(m), + np.int32(n), + np.int32(k), + ) + stream.sync() + + start = stream.record(options=event_opts) + for _ in range(num_iterations): + launch( + stream, + config, + kernel, + d_A.data.ptr, + d_B.data.ptr, + d_C_custom.data.ptr, + np.int32(m), + np.int32(n), + np.int32(k), + ) + end = stream.record(options=event_opts) + end.sync() + + custom_ms = (end - start) / num_iterations + custom_gflops = 
(total_ops / 1e9) / (custom_ms / 1e3) + print(f"Time: {custom_ms:.3f} ms | {custom_gflops:.2f} GFLOPS") + + # ------------------------------------------------------------------------- + # Verification + # ------------------------------------------------------------------------- + print("\n" + "-" * 60) + print("VERIFICATION") + print("-" * 60) + + d_C_ref = d_A @ d_B + + # Host-side verification: cp.allclose triggers NVRTC failure on sm_120 + # (ldexp_cexp undefined). Use asnumpy + np.allclose instead. + ref_host = cp.asnumpy(d_C_ref) + for name, d_C in [("nvmath", d_C_nvmath), ("custom", d_C_custom)]: + print(f"{name}: ", end="") + passed = np.allclose(cp.asnumpy(d_C), ref_host, rtol=1e-4, atol=1e-4) + print("Test PASSED" if passed else "Test FAILED") + success = success and passed + + return success + finally: + cp.cuda.Stream.null.use() + stream.close() + + +def main() -> bool: + """Entry point. Returns True if benchmark passed.""" + return run_matmul_benchmark() + + +if __name__ == "__main__": + success = main() + if not success: + sys.exit(1) diff --git a/samples/matrixMulSharedMem/requirements.txt b/samples/matrixMulSharedMem/requirements.txt new file mode 100644 index 00000000000..0b397174cd2 --- /dev/null +++ b/samples/matrixMulSharedMem/requirements.txt @@ -0,0 +1,20 @@ +# Matrix Multiplication with Shared Memory (GEMM) Requirements +# +# IMPORTANT: this sample pins older versions of cuda-core and nvmath-python +# on purpose. nvmath-python 0.9.0 (the current CUDA-13 release at the time +# of CTK 13.3) calls cuda-core's pre-1.0 API name `EventOptions(enable_timing=...)` +# in its own internals. With cuda-core 1.0+ that kwarg was renamed to +# `timing_enabled` and the old name is rejected, so any cuda-core>=1.0 + +# nvmath-python 0.9.0 combination raises a TypeError at runtime. +# +# Until nvmath-python ships a release that targets the cuda-core 1.0 naming +# audit, this sample requires the older cuda-core 0.7 line. 
Installing this +# requirements.txt into the same environment as the other samples will +# downgrade cuda-core; use a dedicated venv for this sample, or reinstall +# the other samples' requirements afterwards to upgrade cuda-core back. + +cuda-python>=13.0.0 +cuda-core==0.7.0 +cupy-cuda13x>=14.0.0 +numpy>=2.3.2 +nvmath-python[cu13]==0.9.0 diff --git a/samples/memoryResources/README.md b/samples/memoryResources/README.md new file mode 100644 index 00000000000..6d6d6a94564 --- /dev/null +++ b/samples/memoryResources/README.md @@ -0,0 +1,151 @@ +# Sample: Memory Resources and Buffers (Python) + +## Description + +This sample demonstrates the `cuda.core` memory management model: a +`MemoryResource` owns a pool of memory and hands out `Buffer` objects that +can be passed to kernels, copied between resources with +`Buffer.copy_to()`, and viewed as NumPy or CuPy arrays through DLPack. The +script exercises three common resources side-by-side: + +1. **`DeviceMemoryResource`** - device-local GPU memory. Every `Device` + exposes a default pool via `Device.memory_resource`, and applications + can create additional pools explicitly. +2. **`PinnedMemoryResource`** - page-locked host memory, used here as the + input and output staging buffers around a GPU kernel (the canonical + pinned-H2D / compute / pinned-D2H pattern). +3. **`ManagedMemoryResource`** - unified memory that the driver migrates + between host and device on demand; host views see the GPU's writes + without an explicit copy. + +The same `scale_and_bias` kernel runs on each resource and every result is +verified on the host. 
+ +## What You'll Learn + +- Creating and using `DeviceMemoryResource`, `PinnedMemoryResource`, and + `ManagedMemoryResource` +- Allocating `Buffer` objects from a resource with a bound stream +- Copying between buffers across resources with `Buffer.copy_to()` +- Taking zero-copy NumPy or CuPy views of a `Buffer` via DLPack +- Releasing buffers with stream-ordered `close(stream)` semantics + +## Key Libraries + +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - Pythonic access to CUDA runtime, programs, and memory resources +- `cupy` - GPU array views of device buffers +- `numpy` - host array views of pinned and managed buffers + +## Key APIs + +### From `cuda.core` + +- `Device.memory_resource` - default memory pool attached to a device +- `DeviceMemoryResource`, `PinnedMemoryResource`, `ManagedMemoryResource` - allocate buffers of the corresponding memory kind +- `MemoryResource.allocate(nbytes, stream=...)` - returns a `Buffer` +- `Buffer.copy_to(dst_buffer, stream=...)` - async, stream-ordered copy +- `Buffer.close(stream)` - stream-ordered deallocation +- `Buffer` supports `__dlpack__` for zero-copy views + +### From CuPy and NumPy + +- `cp.from_dlpack()` / `np.from_dlpack()` - zero-copy array view of a `Buffer` + +### From `cuda_samples_utils` + +- `print_gpu_info()` - print device name and compute capability + +## Requirements + +### Hardware + +- NVIDIA GPU with Compute Capability 7.0 or higher +- Managed memory support (most discrete GPUs) + +### Software + +- CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x) +- Python 3.10 or newer +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +### Platform Support + +The `ManagedMemoryResource` demo in this sample exercises **concurrent host +access** to managed allocations while the GPU is active, which requires the +device property `concurrent_managed_access=True`. This is only supported on +Linux with HMM (Pascal and newer). 
On Windows (WDDM/MCDM/TCC) the property +is `False`, so the sample exits early with a waive message and exit code +`2`. The `DeviceMemoryResource` + `PinnedMemoryResource` demos in this +sample would still work on Windows on their own, but to keep the sample +self-contained the entire script waives when concurrent managed access is +unavailable. + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/memoryResources +pip install -r requirements.txt +``` + +The `requirements.txt` installs: + +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/2_CoreConcepts/memoryResources +python memoryResources.py +``` + +### With custom parameters + +```bash +# Larger buffer size +python memoryResources.py --elements 1048576 + +# Use a specific GPU +python memoryResources.py --device 1 +``` + +## Expected Output + +``` +Device: +Compute Capability: + +[1] DeviceMemoryResource + PinnedMemoryResource (staging) + Pinned staging, device kernel, and copy_to verified + +[2] ManagedMemoryResource (unified memory) + GPU writes observed directly through the host-visible mapping + +[3] Explicit DeviceMemoryResource + Explicit DeviceMemoryResource allocation verified + +Done +``` + +**Note:** Device name and compute capability will vary based on your GPU. 
+ +## Files + +- `memoryResources.py` - Python implementation using `cuda.core` memory resources +- `README.md` - This file +- `requirements.txt` - Sample dependencies +- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample) + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [`cuda.core` memory API](https://nvidia.github.io/cuda-python/cuda-core/latest/api.html#memory-management) +- Upstream `cuda.core` example: [`memory_ops.py`](https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/memory_ops.py) +- Upstream `cuda.core` example: [`memory_pool_resources.py`](https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/memory_pool_resources.py) diff --git a/samples/memoryResources/memoryResources.py b/samples/memoryResources/memoryResources.py new file mode 100644 index 00000000000..3680f7c9e02 --- /dev/null +++ b/samples/memoryResources/memoryResources.py @@ -0,0 +1,257 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0"] +# /// + +""" +Memory management with cuda.core: Buffers and Memory Resources + +Demonstrates the Memory Resource / Buffer abstraction in cuda.core: + + * ``DeviceMemoryResource`` - GPU-only memory (device pool) + * ``PinnedMemoryResource`` - page-locked host memory accessible by the GPU + * ``ManagedMemoryResource`` - unified memory that migrates between + host and device on demand + +Each resource hands out ``Buffer`` objects that can be: + * passed to kernels as pointers + * copied between each other with ``buffer.copy_to(...)`` + * viewed as NumPy or CuPy arrays via DLPack (``__dlpack__``) + +The kernel below performs a fused scale + bias on a device buffer (staged +through pinned host buffers), a managed buffer, and a buffer from an explicit +device pool; each result is checked to confirm the pathway works end-to-end. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import cupy as cp + import numpy as np + from cuda_samples_utils import print_gpu_info + + from cuda.core import ( + Device, + DeviceMemoryResource, + LaunchConfig, + ManagedMemoryResource, + PinnedMemoryResource, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +SCALE_BIAS_KERNEL = r""" +extern "C" __global__ +void scale_and_bias(float* data, size_t N, float scale, float bias) { + const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + const unsigned int stride = blockDim.x * gridDim.x; + for (size_t i = tid; i < N; i += stride) { + data[i] = data[i] * scale + bias; + } +} +""" + + +def demo_device_and_pinned(device, stream, kernel, size): + """Use pinned host memory as a staging area for a device-side kernel. + + Canonical H2D / compute / D2H pattern: + host (pinned) -> device -> launch -> device -> host (pinned) + """ + print("\n[1] DeviceMemoryResource + PinnedMemoryResource (staging)") + dtype = np.float32 + nbytes = size * dtype().itemsize + + # The device's built-in memory resource is a good default for GPU memory. + device_mr = device.memory_resource + pinned_mr = PinnedMemoryResource() + + pinned_in = pinned_mr.allocate(nbytes, stream=stream) + pinned_out = pinned_mr.allocate(nbytes, stream=stream) + device_buffer = device_mr.allocate(nbytes, stream=stream) + try: + # Wrap each Buffer as a typed array via DLPack (no copies). + pinned_in_view = np.from_dlpack(pinned_in).view(dtype=dtype) + pinned_out_view = np.from_dlpack(pinned_out).view(dtype=dtype) + + # Initialize host-side input. + pinned_in_view[:] = np.arange(size, dtype=dtype) + original = pinned_in_view.copy() + + # Stage H2D: pinned -> device. 
+ pinned_in.copy_to(device_buffer, stream=stream) + + # Launch kernel on the device buffer. + config = LaunchConfig(grid=(size + 255) // 256, block=256) + launch( + stream, + config, + kernel, + device_buffer, + np.uint64(size), + np.float32(3.0), + np.float32(-0.5), + ) + + # Stage D2H: device -> pinned. + device_buffer.copy_to(pinned_out, stream=stream) + stream.sync() + + expected = original * 3.0 - 0.5 + assert np.allclose(pinned_out_view, expected), "H2D -> kernel -> D2H mismatch" + print(" Pinned staging, device kernel, and copy_to verified") + finally: + device_buffer.close(stream) + pinned_out.close(stream) + pinned_in.close(stream) + + +def demo_managed(device, stream, kernel, size): + """Allocate a managed (unified) buffer; kernel writes are visible on host.""" + print("\n[2] ManagedMemoryResource (unified memory)") + dtype = np.float32 + nbytes = size * dtype().itemsize + + managed_mr = ManagedMemoryResource() + managed_buffer = managed_mr.allocate(nbytes, stream=stream) + try: + managed_view = np.from_dlpack(managed_buffer).view(dtype=dtype) + + managed_view[:] = np.arange(size, dtype=dtype) + original = managed_view.copy() + # Before launching, make sure host writes have reached the GPU. + device.sync() + + config = LaunchConfig(grid=(size + 255) // 256, block=256) + launch( + stream, + config, + kernel, + managed_buffer, + np.uint64(size), + np.float32(0.5), + np.float32(10.0), + ) + stream.sync() + + # No explicit copy: the same numpy view observes the GPU's writes. 
+ assert np.allclose(managed_view, original * 0.5 + 10.0), "Managed memory result mismatch" + print(" GPU writes observed directly through the host-visible mapping") + finally: + managed_buffer.close(stream) + + +def demo_explicit_device_pool(device, stream, kernel, size): + """Allocate from a user-created DeviceMemoryResource with default options.""" + print("\n[3] Explicit DeviceMemoryResource") + dtype = np.float32 + nbytes = size * dtype().itemsize + + # Explicitly create a pool tied to this device. Use .close() to tear it down. + explicit_mr = DeviceMemoryResource(device) + buffer = explicit_mr.allocate(nbytes, stream=stream) + try: + view = cp.from_dlpack(buffer).view(dtype=cp.float32) + view[:] = cp.arange(size, dtype=cp.float32) + device.sync() + + config = LaunchConfig(grid=(size + 255) // 256, block=256) + launch( + stream, + config, + kernel, + buffer, + np.uint64(size), + np.float32(1.0), + np.float32(100.0), + ) + stream.sync() + + expected = cp.arange(size, dtype=cp.float32) + 100.0 + assert cp.allclose(view, expected), "Explicit device pool result mismatch" + print(" Explicit DeviceMemoryResource allocation verified") + finally: + buffer.close(stream) + explicit_mr.close() + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Demonstrate cuda.core memory resources (Buffer + MR)") + parser.add_argument( + "--elements", + type=int, + default=1 << 16, + help="Number of float32 elements per buffer (default: 65536)", + ) + parser.add_argument("--device", type=int, default=0, help="CUDA device id") + args = parser.parse_args() + + if sys.platform == "win32": + print( + "This sample relies on ManagedMemoryResource with concurrent host " + "access, which is not supported on Windows " + "(concurrent_managed_access=False). Waiving this sample." 
+ ) + sys.exit(2) + + device = Device(args.device) + device.set_current() + print_gpu_info(device) + + stream = device.create_stream() + + try: + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") + program = Program(SCALE_BIAS_KERNEL, code_type="c++", options=program_options) + module = program.compile("cubin") + kernel = module.get_kernel("scale_and_bias") + + demo_device_and_pinned(device, stream, kernel, args.elements) + demo_managed(device, stream, kernel, args.elements) + demo_explicit_device_pool(device, stream, kernel, args.elements) + + print("\nDone") + return 0 + finally: + stream.close() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/memoryResources/requirements.txt b/samples/memoryResources/requirements.txt new file mode 100644 index 00000000000..641e4e20c71 --- /dev/null +++ b/samples/memoryResources/requirements.txt @@ -0,0 +1,5 @@ +# Memory Resources Sample Requirements + +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 diff --git a/samples/multiGPUGradientAverage/README.md b/samples/multiGPUGradientAverage/README.md new file mode 100644 index 00000000000..c6cfa8594c0 --- /dev/null +++ b/samples/multiGPUGradientAverage/README.md @@ -0,0 +1,128 @@ +# Sample: multiGPUGradientAverage (Python) + +## Description + +This sample demonstrates gradient averaging across multiple GPUs using MPI and cuda.core. Each GPU computes local gradients, which are synchronized (averaged) across all GPUs using MPI Allreduce with host-staging (GPU → CPU → MPI → CPU → GPU) for maximum compatibility. 
+ +## What you will learn + +- How to initialize MPI for multi-process GPU communication +- How to map MPI ranks to CUDA devices consistently +- How to integrate cuda.core streams with CuPy using `Stream.from_external` +- How to compile and launch custom CUDA kernels using cuda.core +- How to use cuda.core Event for GPU timing measurements +- How to use MPI Allreduce with host-staging for universal compatibility + +## Prerequisites + +- Python 3.10+ +- CUDA Toolkit 13.0+ +- Standard MPI implementation (OpenMPI, MPICH, or Intel MPI) +- Multiple NVIDIA GPUs (tested with 2+ GPUs) + +## Installation + +```bash +pip install -r requirements.txt +``` + +## Running + +**IMPORTANT:** This sample **MUST** be launched by an MPI runtime with at +least 2 processes. On Linux/macOS this is typically `mpirun`; on Windows with +Microsoft MPI the launcher is `mpiexec` (and the flag for process count is +`-n`). Either form is accepted by most MPI stacks. + +Linux / macOS (OpenMPI, MPICH, Intel MPI): + +```bash +# Single node (2 GPUs) +mpirun -np 2 python multiGPUGradientAverage.py --size 10000 + +# Single node (4 GPUs) +mpirun -np 4 python multiGPUGradientAverage.py --size 10000 + +# With specific GPUs +CUDA_VISIBLE_DEVICES=0,2 mpirun -np 2 python multiGPUGradientAverage.py +``` + +Windows (Microsoft MPI — `mpiexec` is installed under +`C:\Program Files\Microsoft MPI\Bin\` and is not on PATH by default): + +```powershell +& "C:\Program Files\Microsoft MPI\Bin\mpiexec.exe" -n 2 ` + python multiGPUGradientAverage.py --size 10000 +``` + +## Sample Output + +``` +[Rank 0] World size = 4 + +====================================================================== +Multi-GPU Gradient Average Demo +====================================================================== +Number of MPI ranks (GPUs): 4 +Gradient vector length per GPU: 10000 +Device: NVIDIA GeForce RTX 4090 +Computation: gradients computed on GPU via cuda.core. 
+Communication: gradients averaged via MPI_Allreduce on host (CPU) buffers. +====================================================================== + +Sample averaged gradient values (rank 0): + avg_grad[0] = 1.500000 + avg_grad[5000] = 6.500000 + avg_grad[9999] = 11.499000 + +Expected values: + expected[0] = 1.500000 + expected[5000] = 6.500000 + expected[9999] = 11.499000 + +Verifying gradient averaging correctness... +[PASS] Gradient averaging is correct. +[PASS] Gradient averaging is correct on all ranks. + +Performance: + Kernel time (GPU only): 0.123 ms + MPI communication time (host-staging, end-to-end): 0.456 ms + Total time: 0.579 ms + +====================================================================== +Demo complete. +====================================================================== +``` + +## Key Technical Details + +The sample uses cuda.core streams and makes CuPy use them via `Stream.from_external`: + +```python +stream = device.create_stream() +cp.cuda.Stream.from_external(stream).use() +``` + +GPU timing is measured using cuda.core Event: + +```python +from cuda.core import EventOptions +timing_options = EventOptions(timing_enabled=True) +start_event = stream.record(options=timing_options) +# ... GPU work ... +end_event = stream.record(options=timing_options) +end_event.sync() +kernel_time = end_event - start_event # Returns milliseconds +``` + +The host-staging pattern transfers data GPU → CPU → MPI → CPU → GPU for universal MPI compatibility without requiring CUDA-aware MPI. + +## Troubleshooting + +**Error: "This sample requires at least 2 MPI processes!"** + +Solution: +- Linux / macOS: `mpirun -np 2 python multiGPUGradientAverage.py` +- Windows (Microsoft MPI): `& "C:\Program Files\Microsoft MPI\Bin\mpiexec.exe" -n 2 python multiGPUGradientAverage.py` + (or `mpiexec -n 2 ...` after adding `C:\Program Files\Microsoft MPI\Bin\` to `PATH`). + +See the **Running** section above for fully-formed examples. 
diff --git a/samples/multiGPUGradientAverage/multiGPUGradientAverage.py b/samples/multiGPUGradientAverage/multiGPUGradientAverage.py new file mode 100644 index 00000000000..612ca80a2bf --- /dev/null +++ b/samples/multiGPUGradientAverage/multiGPUGradientAverage.py @@ -0,0 +1,402 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# /// script +# dependencies = ["mpi4py>=3.1.4", "cupy-cuda13x>=14.0.0", "cuda-python>=13.0.0", "cuda-core>=1.0.0"] +# /// + +""" +Multi-GPU Gradient Average using MPI and cuda.core (Host-staging Allreduce) + +Question: How do I synchronize gradients across GPUs? + +Answer: +Each GPU (each MPI rank) computes local gradients on device via CUDA. +Gradients are then averaged across ranks via an MPI Allreduce over host +(CPU) buffers, following the classic data-parallel training pattern. + +This sample shows how to: +- Initialize MPI for multi-process GPU workloads +- Map MPI ranks to GPUs +- Use cuda.core for kernel compilation and execution +- Integrate cuda.core with CuPy using the stream protocol +- Perform gradient averaging with MPI Allreduce (using host staging) +- Use cuda.core Event for GPU timing measurements +- Verify correctness of distributed gradient synchronization + +Key concepts: Allreduce, NCCL collectives (conceptually), distributed training + +Note: +- All gradient computation and validation happen on GPUs. +- MPI Allreduce is executed on CPU (host) buffers via a simple + GPU -> CPU -> MPI -> CPU -> GPU staging pattern so that the sample + works on any MPI stack, without requiring CUDA-aware MPI. +- In production deep learning frameworks (e.g., PyTorch DDP), NCCL + usually implements the GPU Allreduce directly; the communication + pattern and semantics are the same as demonstrated here. 
+""" + +import sys +from pathlib import Path + +# Add parent directory to path to import utilities +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import verify_array_result + +try: + import cupy as cp + from mpi4py import MPI + + from cuda.core import ( + Device, + EventOptions, + LaunchConfig, + Program, + ProgramOptions, + launch, + system, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install: pip install mpi4py cupy-cuda13x cuda-python cuda-core") + sys.exit(1) + + +# ============================================================================ +# CUDA device selection and stream management +# ============================================================================ + + +def init_device(rank: int): + """ + Initialize CUDA device and stream for this MPI rank. + + For a simple single-node run, we map rank % num_gpus to a device id. + This covers both the common case (world_size == num_gpus) and the case + where multiple ranks share a GPU. + + Returns + ------- + tuple[Device, Stream] + CUDA device object and stream object. + """ + num_gpus = system.get_num_devices() + if num_gpus == 0: + raise RuntimeError("No CUDA devices available") + + dev_id = rank % num_gpus # simple mapping: rank -> GPU in round-robin + + try: + device = Device(dev_id) + except (RuntimeError, ValueError) as e: + if rank == 0: + print(f"Warning: Cannot assign GPU {dev_id}, using GPU 0. 
Error: {e}") + device = Device(0) + + device.set_current() + # Align CuPy with cuda.core's chosen device ID + cp.cuda.Device(device.device_id).use() + + # Create cuda.core stream and make CuPy use it + stream = device.create_stream() + cp.cuda.Stream.from_external(stream).use() + + return device, stream + + +# ============================================================================ +# CUDA kernel definition and compilation +# ============================================================================ + +# Tiny CUDA kernel to initialize local "gradients" +# Uses grid-stride loop to handle arrays larger than grid size +INIT_KERNEL = r""" +extern "C" __global__ +void init_grad_kernel(float* grad, int n, int rank) +{ + // Grid-stride loop: each thread processes multiple elements + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + for (size_t i = tid; i < n; i += stride) { + // Gradient value depends on MPI rank so we can verify reduction: + // grad_i = rank + 0.001 * i + grad[i] = rank + 0.001f * i; + } +} +""" + +_kernel_cache = {} + + +def get_init_kernel(device: Device): + """Compile (or retrieve cached) init_grad_kernel for this device.""" + key = device.pci_bus_id + if key not in _kernel_cache: + opts = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") + prog = Program(INIT_KERNEL, code_type="c++", options=opts) + mod = prog.compile("cubin") + _kernel_cache[key] = mod.get_kernel("init_grad_kernel") + return _kernel_cache[key] + + +# ============================================================================ +# Local gradient computation on each GPU +# ============================================================================ + + +def compute_local_gradients(num_elements: int, device: Device, stream: object, rank: int) -> cp.ndarray: + """ + Compute a local "gradient" vector on the current GPU. 
+ + For demo purposes, we initialize: + grad[i] = rank + 0.001 * i + + Parameters + ---------- + num_elements : int + Length of gradient vector. + device : Device + CUDA device object. + stream : Stream + CUDA stream object (created at device initialization). + rank : int + MPI rank ID. + + Returns + ------- + cupy.ndarray + Gradient vector on GPU. + """ + # Create gradient array (CuPy uses the stream set at device initialization) + grad = cp.empty(num_elements, dtype=cp.float32) + + # Use a CUDA kernel compiled with cuda.core to fill the array + kernel = get_init_kernel(device) + + threads_per_block = 256 + blocks_per_grid = (num_elements + threads_per_block - 1) // threads_per_block + config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block) + + # Launch kernel using cuda.core stream + launch(stream, config, kernel, grad.data.ptr, num_elements, rank) + + return grad + + +# ============================================================================ +# MPI Allreduce to average gradients (host-staging) +# ============================================================================ + + +def average_gradients(local_grad: cp.ndarray, comm: object, world_size: int) -> cp.ndarray: + """ + Average gradients across all MPI ranks using host-staging Allreduce. + + Steps: + 1. Copy local gradients from GPU to CPU (NumPy). + 2. Perform MPI_Allreduce on host buffers. + 3. Divide by world_size to obtain the average. + 4. Copy the averaged gradients back to GPU. + + This pattern is environment-agnostic and works on any MPI stack. 
+ """ + assert local_grad.dtype == cp.float32 + + # GPU -> CPU + local_host = local_grad.get() # NumPy array on host + avg_host = local_host.copy() + + # Allreduce on host buffers + comm.Allreduce(local_host, avg_host, op=MPI.SUM) + + # Average + avg_host /= world_size + + # CPU -> GPU + avg_grad = cp.asarray(avg_host) + + return avg_grad + + +# ============================================================================ +# Testing and verification +# ============================================================================ + + +def main(): + """Demo: Multi-GPU gradient averaging with MPI (host-staging Allreduce).""" + import argparse + + # Initialize MPI + comm = MPI.COMM_WORLD + world_size = comm.Get_size() + rank = comm.Get_rank() + + parser = argparse.ArgumentParser( + description=("Multi-GPU Gradient Average with mpi4py + cuda.core (host-staging Allreduce)") + ) + parser.add_argument( + "--size", + type=int, + default=1024, + help="Number of gradient elements per GPU (default: 1024)", + ) + args = parser.parse_args() + + num_elements = args.size + + # Initialize device and stream + device = None + stream = None + try: + device, stream = init_device(rank) + + if rank == 0: + print(f"[Rank 0] World size = {world_size}") + comm.Barrier() + + # Validate world size + if world_size < 2: + if rank == 0: + print("=" * 70) + print("ERROR: This sample requires at least 2 MPI processes!") + print("=" * 70) + print("\nPlease run with mpirun:") + print(" mpirun -np 2 python multiGPUGradientAverage.py") + print(" mpirun -np 4 python multiGPUGradientAverage.py --size 10000") + print("\nFor multi-GPU systems:") + print(" mpirun -np N python multiGPUGradientAverage.py") + print(" (where N = number of GPUs)") + print("=" * 70) + sys.exit(1) + + # Validate input + if num_elements <= 0: + if rank == 0: + print("Error: --size must be positive") + sys.exit(1) + + if rank == 0: + print("\n" + "=" * 70) + print("Multi-GPU Gradient Average Demo") + print("=" * 70) + print(f"Number 
of MPI ranks (GPUs): {world_size}") + print(f"Gradient vector length per GPU: {num_elements}") + print(f"Device: {device.name}") + print("Computation: gradients computed on GPU via cuda.core.") + print("Communication: gradients averaged via MPI_Allreduce on host (CPU) buffers.") + print("=" * 70) + + # Step 1: Compute local gradients on each GPU + # Use cuda.core Event for GPU timing measurements + timing_options = EventOptions(timing_enabled=True) + start_event = stream.record(options=timing_options) + + local_grad = compute_local_gradients(num_elements, device, stream, rank) + + # Record end event and synchronize to ensure timing is complete + end_event = stream.record(options=timing_options) + end_event.sync() + + # Calculate elapsed time: Event subtraction returns milliseconds + kernel_time = end_event - start_event + + # Step 2: Average gradients across all ranks (host-staging Allreduce) + # Use CPU timing for MPI communication (host-staging includes GPU↔CPU transfers) + import time + + comm_start = time.time() + avg_grad = average_gradients(local_grad, comm, world_size) + comm_time = (time.time() - comm_start) * 1000 # Convert to ms + + # Step 3: Sanity check on rank 0 + # For each element i: + # local_grad_r[i] = r + 0.001 * i, r = 0..world_size-1 + # Sum over ranks: + # sum[i] = sum_r r + 0.001 * i * world_size + # Average: + # avg[i] = (0 + ... + (world_size-1))/world_size + 0.001 * i + # = (world_size - 1)/2 + 0.001 * i + # + # We verify this formula. 
+ + expected_base = (world_size - 1) / 2.0 + i0 = 0 + i1 = num_elements // 2 + i2 = num_elements - 1 + + # Copy a few sample elements back to host for printing on rank 0 + if rank == 0: + avg_host_samples = avg_grad[[i0, i1, i2]].get() + print("\nSample averaged gradient values (rank 0):") + print(f" avg_grad[{i0}] = {avg_host_samples[0]:.6f}") + print(f" avg_grad[{i1}] = {avg_host_samples[1]:.6f}") + print(f" avg_grad[{i2}] = {avg_host_samples[2]:.6f}") + + expected0 = expected_base + 0.001 * i0 + expected1 = expected_base + 0.001 * i1 + expected2 = expected_base + 0.001 * i2 + print("\nExpected values:") + print(f" expected[{i0}] = {expected0:.6f}") + print(f" expected[{i1}] = {expected1:.6f}") + print(f" expected[{i2}] = {expected2:.6f}") + + # All ranks perform a full-array correctness check on GPU + expected_full = expected_base + 0.001 * cp.arange(num_elements, dtype=cp.float32) + + # Use utility function to verify results + if rank == 0: + print("\nVerifying gradient averaging correctness...") + ok = verify_array_result(avg_grad, expected_full, rtol=1e-5, atol=1e-8, verbose=(rank == 0)) + + # Ensure all ranks agree on correctness + ok_all = comm.allreduce(ok, op=MPI.LAND) + + if rank == 0: + if ok_all: + print("[PASS] Gradient averaging is correct on all ranks.") + else: + print("[FAIL] Gradient averaging mismatch detected on one or more ranks.") + + print("\nPerformance:") + print(f" Kernel time (GPU only): {kernel_time:.3f} ms") + print(f" MPI communication time (host-staging, end-to-end): {comm_time:.3f} ms") + print(f" Total time: {kernel_time + comm_time:.3f} ms") + + print("\n" + "=" * 70) + print("Demo complete.") + print("=" * 70) + + return 0 if ok_all else 1 + finally: + # Clean up stream resources + if stream is not None: + stream.close() + cp.cuda.Stream.null.use() # Reset CuPy's current stream to the null stream + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/multiGPUGradientAverage/requirements.txt 
b/samples/multiGPUGradientAverage/requirements.txt new file mode 100644 index 00000000000..7ff01293945 --- /dev/null +++ b/samples/multiGPUGradientAverage/requirements.txt @@ -0,0 +1,21 @@ +# Multi-GPU Gradient Average Sample Requirements + +# MPI Python bindings for distributed communication +mpi4py>=3.1.4 + +# GPU array library (NumPy-compatible arrays on CUDA) +# Use cupy-cuda11x, cupy-cuda12x, or cupy-cuda13x depending on your CUDA version +cupy-cuda13x>=14.0.0 + +# CUDA Python bindings (low-level CUDA driver API) +cuda-python>=13.0.0 + +# cuda.core - Modern Python interface for CUDA +# Provides Program, LaunchConfig, Device, and launch APIs +cuda-core>=1.0.0 + +# Note: This sample uses host-staging for MPI communication +# Standard MPI installation is sufficient (no CUDA-aware MPI required) +# Install MPI using system package manager: +# Ubuntu/Debian: sudo apt-get install openmpi-bin libopenmpi-dev +# Or build from source: https://www.open-mpi.org/software/ompi/ diff --git a/samples/numpyVsCupy/README.md b/samples/numpyVsCupy/README.md new file mode 100644 index 00000000000..28a42848b72 --- /dev/null +++ b/samples/numpyVsCupy/README.md @@ -0,0 +1,73 @@ +# Sample: NumPy vs. CuPy (Python) + +## Description + +This sample demonstrates a performance comparison between NumPy (CPU) and CuPy (GPU) for matrix multiplication operations. It benchmarks the execution time of matrix dot products on both CPU and GPU, showing the performance benefits of GPU acceleration for numerical computations. + +## What you will learn + +- How to set up and use CuPy for GPU-accelerated numerical computations. +- How to benchmark NumPy vs CuPy performance for matrix operations. +- How to transfer data between CPU (NumPy) and GPU (CuPy) memory using `cp.asarray()`. +- How to use CUDA device management with the cuda-core library. +- How to validate computational results between CPU and GPU implementations using the shared `verify_array_result()` helper. 
+- How to handle GPU warmup to avoid first-run overhead in benchmarking. +- How to create and manage explicit CUDA streams with `device.create_stream()`. +- How to properly cleanup streams with `stream.close()` in try/finally blocks. +- How to access GPU device information (name, compute capability). +- How to create timing context managers for performance measurement using CUDA events. + +## Key libraries + +- `numpy` +- `cupy` +- `cuda-core` + +## Key APIs + +**From cuda.core:** +- `Device()` – Get CUDA device object for specific GPU +- `device.create_stream()` – Create explicit CUDA stream +- `stream.close()` – Close and cleanup stream resources + +## Requirements +1. **NVIDIA Graphics Card** with CUDA support +2. **CUDA Drivers** installed on your system +3. **CUDA Toolkit** installed on your system +4. **Python 3.12 or newer** + +**Install packages:** +```bash +pip install -r requirements.txt +``` + +## How to run + +Basic usage: +```bash +# Pre-steps: +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +# Run from the Python directory: +cd /path/to/numpyVsCupy/Python +python -m 1_GettingStarted.numpyVsCupy.numpyVsCupy +``` + +With custom parameters: +```bash +python -m 1_GettingStarted.numpyVsCupy.numpyVsCupy --n_size 5000 +``` + +### Command line arguments + +- `--n_size`, `-n`: Size of the matrix (n * n) for benchmarking (default: 4096) + +## Expected Output +``` +Validation PASSED: NumPy and CuPy results match within tolerance +Demo completed successfully! +``` + +## Files +- `numpyVsCupy.py` – Python implementation +- `README.md` – This file +- `requirements.txt` – Required packages diff --git a/samples/numpyVsCupy/numpyVsCupy.py b/samples/numpyVsCupy/numpyVsCupy.py new file mode 100644 index 00000000000..06e4204503a --- /dev/null +++ b/samples/numpyVsCupy/numpyVsCupy.py @@ -0,0 +1,141 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+# /// script
+# dependencies = ["numpy>=2.3.2", "cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0"]
+# ///
+
+import argparse
+import contextlib
+import sys
+import time
+from pathlib import Path
+
+try:
+    import cupy as cp
+    import numpy as np
+
+    from cuda.core import Device, EventOptions
+except ImportError as e:
+    print(f"Error: Required package not found: {e}")
+    print("Please install from requirements.txt:")
+    print(" pip install -r requirements.txt")
+    sys.exit(1)
+
+# Add parent directory to path to import utilities
+sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities"))
+from cuda_samples_utils import verify_array_result
+
+
+@contextlib.contextmanager
+def timer(message):
+    """CPU timing context manager: print the wall-clock time of the ``with`` body."""
+    # perf_counter is monotonic and high-resolution, unlike time.time().
+    start = time.perf_counter()
+    yield
+    end = time.perf_counter()
+    print(f"{message}: {(end - start):.6f} seconds")
+
+
+@contextlib.contextmanager
+def gpu_timer(message, stream):
+    """GPU timing context manager using cuda.core CUDA events.
+
+    Records an event on ``stream`` before and after the ``with`` body and
+    prints the elapsed time between them. Only work enqueued on ``stream`` is
+    bracketed by the events, so the body must submit its GPU work to that
+    stream for the measurement to be meaningful.
+    """
+    event_options = EventOptions(timing_enabled=True)
+    start_event = stream.record(options=event_options)
+    yield
+    end_event = stream.record(options=event_options)
+    end_event.sync()
+
+    elapsed_time_ms = end_event - start_event  # Returns milliseconds
+    elapsed_time_s = elapsed_time_ms / 1000.0  # Convert to seconds
+    print(f"{message}: {elapsed_time_s:.6f} seconds")
+
+
+def warmup():
+    """Pre-run a small GPU matmul to avoid first-run overhead in benchmarking."""
+    print("Warmup...")
+    a_cp = cp.ones((16, 16))
+    b_cp = cp.ones((16, 16))
+    result_cp = cp.dot(a_cp, b_cp)
+    return result_cp
+
+
+def run(n):
+    """Benchmark NumPy vs. CuPy matrix multiplication for n x n random arrays.
+
+    Prints timing results and exits with status 1 if the CPU and GPU results
+    do not match.
+    """
+    # Device() with no argument selects the current device
+    # (presumably device 0 when nothing else has been made current).
+    device = Device()
+    device.set_current()
+    major, minor = device.compute_capability
+    print()
+    print(f"Device Name: {device.name}, SM: {major}.{minor}")
+    print()
+
+    # Create explicit stream
+    stream = device.create_stream()
+
+    try:
+        # Route CuPy work onto our stream. Without this CuPy keeps using its
+        # own default stream, so the events recorded on `stream` (and the
+        # sync after warmup) would not cover any of the CuPy operations.
+        cp.cuda.Stream.from_external(stream).use()
+
+        # Warm up GPU before measuring
+        warmup()
+        stream.sync()
+
+        # Generate random matrices on CPU
+        a_np = np.random.rand(n, n)
+        b_np = np.random.rand(n, n)
+
+        # NumPy dot product (CPU)
+        with timer(f"NumPy dot of {n}*{n} arrays"):
+            result_np = np.dot(a_np, b_np)
+
+        # Transfer NumPy arrays to GPU (using events for timing)
+        with gpu_timer("Transfer arrays to GPU", stream):
+            a_cp = cp.asarray(a_np)
+            b_cp = cp.asarray(b_np)
+
+        # CuPy dot product (GPU) - using events for accurate GPU timing
+        with gpu_timer(f"CuPy dot of {n}*{n} arrays", stream):
+            result_cp = cp.dot(a_cp, b_cp)
+
+        print()
+        # Result validation
+        if not verify_array_result(result_np, result_cp.get()):
+            print("Validation FAILED: NumPy and CuPy results do not match within tolerance")
+            sys.exit(1)
+
+        print("Validation PASSED: NumPy and CuPy results match within tolerance")
+    finally:
+        # Detach CuPy from the stream before destroying it.
+        cp.cuda.Stream.null.use()
+        stream.close()
+
+
+def main():
+    """Parse command-line arguments and run the benchmark."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--n_size", "-n", default=4096, type=int, help="Size of the matrix(n * n).")
+    args = parser.parse_args()
+    if args.n_size <= 0:
+        # Fail with a usage message instead of a NumPy traceback.
+        parser.error("--n_size must be a positive integer")
+    run(args.n_size)
+    print("Demo completed successfully!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/samples/numpyVsCupy/requirements.txt b/samples/numpyVsCupy/requirements.txt
new file mode 100644
index 00000000000..c895afaa1b7
--- /dev/null
+++ b/samples/numpyVsCupy/requirements.txt
@@ -0,0 +1,7 @@
+# Numpy vs.
Cupy - Requirements +# Install with: pip install -r requirements.txt + +numpy>=2.3.2 +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 diff --git a/samples/pageRank/README.md b/samples/pageRank/README.md new file mode 100644 index 00000000000..3be6964c688 --- /dev/null +++ b/samples/pageRank/README.md @@ -0,0 +1,184 @@ +# Sample: PageRank Algorithm (Python) + +> **Known issue — version-pinned sample.** Unlike the other samples in this +> repository, this sample is pinned to `cuda-core<1.0.0`. The reason is that +> `cudf-cu13` transitively requires `numba-cuda<0.29.0`, and every +> `numba-cuda` release in that range pins `cuda-core<1.0.0`. Installing this +> sample's `requirements.txt` into a shared environment will downgrade +> `cuda-core` and break the other samples (which use the 1.0 API). +> +> The recommended workflow is one of: +> +> - Install this sample's requirements in a **dedicated virtual +> environment**, or +> - Re-run the other samples' `pip install -r requirements.txt` afterwards +> to upgrade `cuda-core` back to 1.0. +> +> This sample will be re-aligned with the rest of the repository +> (`cuda-core>=1.0.0`) once `cudf-cu13` ships a release that lifts its +> `numba-cuda` upper bound. + +## Description + +Demonstrates GPU-accelerated PageRank computation for graph analysis using RAPIDS cuGraph, with cuda.core for device, stream, and GPU timing. This sample focuses on cuda.core integration with high-level libraries (cuGraph/cuDF); for custom kernel programming (Program, LaunchConfig, launch), see the blockwiseSum sample. 
+ +## What You'll Learn + +- Graph representation using cuDF DataFrames for edge lists +- GPU-optimized PageRank via RAPIDS cuGraph library +- Performance comparison between cuGraph GPU and CPU reference implementation +- cuda.core device/stream management and GPU timing + +## Key Libraries + +- `cugraph` - RAPIDS GPU-accelerated graph analytics +- `cudf` - RAPIDS GPU DataFrame library +- `cuda.core` - Device, stream, and event APIs for GPU timing +- `cupy` - GPU array library (Stream.from_external for cuDF/cuGraph) +- `numpy` - CPU reference implementation + +## Key APIs + +### From cuda.core: + +- `Device(0)` - Create device, `device.set_current()`, `device.create_stream()` +- `EventOptions(enable_timing=True)` - GPU timing via `stream.record()` +- `cp.cuda.Stream.from_external(stream).use()` (CuPy) - Make CuPy/cuDF use the cuda.core stream + +### From cuGraph: + +- `cugraph.Graph(directed=True)` - Create directed graph structure +- `Graph.from_cudf_edgelist()` - Build graph from edge list DataFrame +- `cugraph.pagerank()` - GPU-accelerated PageRank algorithm + +### From cuDF: + +- `cudf.DataFrame()` - GPU DataFrame for edge lists + +## Requirements + +### Hardware: + +- NVIDIA GPU with Compute Capability 7.0 or higher +- Minimum GPU memory: 512 MB (for 10K node graph) + +### Software: + +- CUDA Toolkit 13.0 or newer +- Python 3.11 or newer (required by `numpy>=2.3.2`) +- See requirements.txt for package dependencies + +### Platform Support: + +This sample depends on RAPIDS (`cugraph-cu13`, `cudf-cu13`, `dask-cuda`), +which is currently published only as **Linux (manylinux) wheels** on +`pypi.nvidia.com` — no Windows wheels exist. On Windows the sample exits +early with a waive message and exit code `2` instead of attempting an +install that cannot succeed. 
+ +## Installation + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/pageRank +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +## How to Run + +```bash +python pageRank.py +``` + +## Algorithm + +The PageRank formula iteratively computes node importance: + +``` +PR(v) = (1-d)/N + d * Σ PR(u)/out_degree(u) +``` + +Where: +- `d` = damping factor (typically 0.85) +- `N` = total number of nodes +- Sum is over all nodes `u` that link to `v` + +## Expected Output + +``` +============================================================ +PageRank Algorithm (using RAPIDS cuGraph) +============================================================ + +Device: NVIDIA GeForce RTX ... +Compute Capability: sm_XX + +Graph Parameters: + Nodes: 10,000 + Avg edges/node: 15 + Total edges: ~150,000 + Avg in-degree: 14.9 + +------------------------------------------------------------ +GPU PageRank (RAPIDS cuGraph) +------------------------------------------------------------ +Time: X.XXX ms + +Top 5 nodes by PageRank: + 1. Node XXXXX: 0.XXXXXX + ... + +------------------------------------------------------------ +CPU PageRank (Reference) +------------------------------------------------------------ +Time: XXXX.XXX ms +Iterations: XX + +------------------------------------------------------------ +PERFORMANCE SUMMARY +------------------------------------------------------------ +GPU (cuGraph): X.XXX ms +CPU (Reference): XXXX.XXX ms +Speedup: XXXX.Xx + +------------------------------------------------------------ +VERIFICATION +------------------------------------------------------------ +GPU vs CPU PageRank scores: Test PASSED + +PageRank Properties: + Sum of scores: 1.000000 (should be ~1.0) + Sum check: ✓ + +Done +``` + +## Files + +- `pageRank.py` - Python implementation using RAPIDS cuGraph +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## Why cuGraph? 
+ +RAPIDS cuGraph provides production-grade, GPU-accelerated graph analytics: + +- **Highly optimized** - Uses advanced GPU parallelization techniques +- **Scalable** - Handles graphs with billions of edges +- **Easy to use** - Simple Python API similar to NetworkX +- **Integrated** - Works seamlessly with cuDF, cuML, and other RAPIDS libraries + +## Applications + +- Web page ranking (original Google PageRank) +- Social network influence analysis +- Citation network analysis +- Recommendation systems +- Fraud detection in financial networks + +## See Also + +- [RAPIDS cuGraph Documentation](https://docs.rapids.ai/api/cugraph/stable/) +- [cuGraph GitHub Repository](https://github.com/rapidsai/cugraph) +- [RAPIDS Installation Guide](https://rapids.ai/start.html) diff --git a/samples/pageRank/pageRank.py b/samples/pageRank/pageRank.py new file mode 100644 index 00000000000..b587ec2710b --- /dev/null +++ b/samples/pageRank/pageRank.py @@ -0,0 +1,364 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core<1.0.0", "cugraph-cu13>=26.0.0", "cudf-cu13>=26.0.0", "dask-cuda>=26.4.0", "cupy-cuda13x>=14.0.0", "numpy>=2.3.2"] +# /// + +""" +PageRank Algorithm + +Demonstrates GPU-accelerated PageRank computation for graph analysis: +- Graph representation using edge lists and cuDF DataFrames +- GPU-optimized PageRank via RAPIDS cuGraph library +- Performance comparison: cuGraph GPU vs CPU reference + +Uses RAPIDS cuGraph for production-grade graph analytics on GPU. + +PageRank Algorithm: + PR(v) = (1-d)/N + d * sum(PR(u)/out_degree(u)) for all u linking to v + where d = damping factor (typically 0.85), N = number of nodes +""" + +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import print_gpu_info, verify_array_result + +if sys.platform == "win32": + print( + "This sample depends on RAPIDS (cugraph-cu13 / cudf-cu13), which is " + "currently published only as Linux (manylinux) wheels on " + "pypi.nvidia.com. Waiving this sample on Windows." 
+ ) + sys.exit(2) + +try: + import cudf + import cugraph + import cupy as cp + import numpy as np + + from cuda.core import Device, EventOptions, Stream +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Install with: pip install -r requirements.txt") + sys.exit(1) + + +def generate_random_graph( + num_nodes: int, + avg_edges_per_node: int = 10, + seed: int = 42, +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Generate a random directed graph as edge list. + + Parameters + ---------- + num_nodes : int + Number of nodes in the graph + avg_edges_per_node : int + Average number of outgoing edges per node + seed : int + Random seed for reproducibility + + Returns + ------- + tuple[np.ndarray, np.ndarray, np.ndarray] + (sources, destinations, out_degree) arrays + """ + rng = np.random.default_rng(seed) + + sources_list: list[int] = [] + destinations_list: list[int] = [] + out_degree = np.zeros(num_nodes, dtype=np.int32) + + for src in range(num_nodes): + # Random number of outgoing edges (Poisson distribution) + n_edges = max(1, rng.poisson(avg_edges_per_node)) + n_edges = min(n_edges, num_nodes - 1) + # Random destinations (no self-loops); rejection sampling avoids O(N²) memory + dests: set[int] = set() + while len(dests) < n_edges: + d = int(rng.integers(0, num_nodes)) + if d != src: + dests.add(d) + dests = np.array(list(dests), dtype=np.int32) + for dst in dests: + sources_list.append(src) + destinations_list.append(dst) + out_degree[src] = len(dests) + + sources = np.array(sources_list, dtype=np.int32) + destinations = np.array(destinations_list, dtype=np.int32) + + return sources, destinations, out_degree + + +def pagerank_cpu( + sources: np.ndarray, + destinations: np.ndarray, + out_degree: np.ndarray, + num_nodes: int, + damping: float = 0.85, + max_iterations: int = 100, + tolerance: float = 1e-6, +) -> tuple[np.ndarray, int]: + """ + Compute PageRank on CPU using iterative method. 
+ + Parameters + ---------- + sources : np.ndarray + Source nodes of edges + destinations : np.ndarray + Destination nodes of edges + out_degree : np.ndarray + Outgoing degree for each node + num_nodes : int + Number of nodes + damping : float + Damping factor (default: 0.85) + max_iterations : int + Maximum iterations + tolerance : float + Convergence tolerance + + Returns + ------- + tuple[np.ndarray, int] + (PageRank scores, iterations until convergence) + """ + # Build incoming edges list for each node + incoming: list[list[int]] = [[] for _ in range(num_nodes)] + for src, dst in zip(sources, destinations): + incoming[dst].append(src) + + # Initialize PageRank uniformly + pr = np.ones(num_nodes, dtype=np.float32) / num_nodes + pr_new = np.zeros(num_nodes, dtype=np.float32) + + base_score = (1.0 - damping) / num_nodes + + for iteration in range(max_iterations): + # Handle dangling nodes (nodes with no outgoing edges) + dangling_sum = np.sum(pr[out_degree == 0]) + dangling_contrib = damping * dangling_sum / num_nodes + + for v in range(num_nodes): + # Sum contributions from incoming neighbors + incoming_sum = 0.0 + for u in incoming[v]: + if out_degree[u] > 0: + incoming_sum += pr[u] / out_degree[u] + + pr_new[v] = base_score + damping * incoming_sum + dangling_contrib + + # Check convergence + diff = np.sum(np.abs(pr_new - pr)) + pr, pr_new = pr_new, pr + + if diff < tolerance: + return pr, iteration + 1 + + return pr, max_iterations + + +def run_pagerank_benchmark( + num_nodes: int = 10000, + avg_edges: int = 15, + max_iterations: int = 100, +) -> bool: + """ + Run PageRank benchmark comparing cuGraph GPU and CPU performance. 
+ + Parameters + ---------- + num_nodes : int + Number of nodes in the graph + avg_edges : int + Average edges per node + max_iterations : int + Maximum PageRank iterations + + Returns + ------- + bool + True if benchmark succeeded + """ + print("=" * 60) + print("PageRank Algorithm (using RAPIDS cuGraph)") + print("=" * 60) + + # Initialize cuda.core device and stream + device = Device(0) + device.set_current() + stream: Stream = device.create_stream() + print() + print_gpu_info(device) + + # RAPIDS cuGraph wheels currently don't ship kernel binaries for + # every CUDA architecture. Skip cleanly on architectures known to + # be unsupported instead of failing deep inside cuGraph with a + # cryptic cudaErrorNoKernelImageForDevice. Remove an arch from this + # set once the matching cuGraph release ships kernels for it. + _CUGRAPH_UNSUPPORTED_ARCHES = {"110"} # sm_110 = Thor / Tegra + if device.arch in _CUGRAPH_UNSUPPORTED_ARCHES: + print(f"RAPIDS cuGraph does not yet ship kernels for sm_{device.arch}, waiving this sample.") + stream.close() + sys.exit(2) + + # Make CuPy/cuDF use our cuda.core stream + cp.cuda.Stream.from_external(stream).use() + + # Generate random graph + print("\nGraph Parameters:") + print(f" Nodes: {num_nodes:,}") + print(f" Avg edges/node: {avg_edges}") + + sources, destinations, out_degree = generate_random_graph(num_nodes, avg_edges, seed=42) + + total_edges = len(sources) + print(f" Total edges: {total_edges:,}") + print(f" Avg in-degree: {total_edges / num_nodes:.1f}") + + # ------------------------------------------------------------------------- + # GPU PageRank (cuGraph) + # ------------------------------------------------------------------------- + print("\n" + "-" * 60) + print("GPU PageRank (RAPIDS cuGraph)") + print("-" * 60) + + # Create cuGraph graph from edge list with store_transposed for optimal perf + gdf = cudf.DataFrame( + { + "src": sources, + "dst": destinations, + } + ) + G = cugraph.Graph(directed=True) + 
G.from_cudf_edgelist(gdf, source="src", destination="dst", store_transposed=True) + + event_opts = EventOptions(enable_timing=True) + + try: + # Warmup + _ = cugraph.pagerank(G, alpha=0.85, max_iter=100, tol=1e-5) + stream.sync() + + # Timed run using cuda.core events + start = stream.record(options=event_opts) + pr_result = cugraph.pagerank(G, alpha=0.85, max_iter=max_iterations, tol=1e-6) + end = stream.record(options=event_opts) + end.sync() + + gpu_time_ms = end - start + print(f"Time: {gpu_time_ms:.3f} ms") + + # Extract results sorted by vertex ID (to numpy for verification) + pr_df = pr_result.sort_values("vertex").reset_index(drop=True) + pr_gpu = pr_df["pagerank"].to_numpy() + + # Show top PageRank nodes + top_k = 5 + top_df = pr_result.nlargest(top_k, "pagerank") + print(f"\nTop {top_k} nodes by PageRank:") + for i, row in enumerate(top_df.to_pandas().itertuples()): + print(f" {i + 1}. Node {row.vertex:5d}: {row.pagerank:.6f}") + + # ------------------------------------------------------------------------- + # CPU PageRank + # ------------------------------------------------------------------------- + print("\n" + "-" * 60) + print("CPU PageRank (Reference)") + print("-" * 60) + + cpu_start = time.perf_counter() + pr_cpu, cpu_iters = pagerank_cpu(sources, destinations, out_degree, num_nodes, max_iterations=max_iterations) + cpu_end = time.perf_counter() + + cpu_time_ms = (cpu_end - cpu_start) * 1000 + print(f"Time: {cpu_time_ms:.3f} ms") + print(f"Iterations: {cpu_iters}") + + # ------------------------------------------------------------------------- + # Performance Summary + # ------------------------------------------------------------------------- + print("\n" + "-" * 60) + print("PERFORMANCE SUMMARY") + print("-" * 60) + speedup = cpu_time_ms / gpu_time_ms + print(f"GPU (cuGraph): {gpu_time_ms:.3f} ms") + print(f"CPU (Reference): {cpu_time_ms:.3f} ms") + print(f"Speedup: {speedup:.1f}x") + + # 
def main() -> None:
    """Run the PageRank benchmark and report the outcome.

    Prints a completion message when ``run_pagerank_benchmark()`` returns a
    truthy value; otherwise prints an error notice and terminates the process
    with exit status 1.
    """
    # Guard clause: the happy path returns immediately.
    if run_pagerank_benchmark():
        print("\nDone")
        return

    print("\nBenchmark completed with errors")
    sys.exit(1)
Atomics (Python) + +## Description + +Compute histograms on the GPU using atomic operations to handle concurrent updates from multiple threads. This sample demonstrates the modern **cuda.core API** for kernel compilation and launch, comparing two approaches: + +1. **Global Atomics** - All threads atomically update a single global histogram +2. **Privatized Histograms** - Each block uses shared memory, then merges to global + +## What You'll Learn + +- Compiling CUDA C kernels with `cuda.core.Program` +- Configuring kernel launches with `cuda.core.LaunchConfig` +- Launching kernels with `cuda.core.launch()` +- Using **atomic operations** (`atomicAdd`) for thread-safe updates +- Optimizing with **shared memory privatization** +- GPU timing with `cuda.core` Events + +## Key Concepts + +### Atomic Operations + +When multiple threads update the same histogram bin, a race condition occurs. Atomic operations ensure thread-safe updates: + +```cuda +atomicAdd(&histogram[data[i]], 1); // Thread-safe increment +``` + +### Global vs Privatized Atomics + +| Approach | Pros | Cons | +|----------|------|------| +| Global | Simple | High contention on popular bins | +| Privatized | Significantly faster | Extra shared memory, synchronization | + +## Key APIs + +### From `cuda.core`: + +- `Device` - Device management and context +- `Program` - Compile CUDA C source code +- `ProgramOptions` - Set architecture, optimization flags +- `LaunchConfig` - Configure grid and block dimensions +- `launch()` - Launch compiled kernel +- `Stream` - Async stream management +- `EventOptions` - Configure events for GPU timing +- `stream.record()` - Record events for timing + +### From `cupy`: + +- `cp.random.randint()` - Generate random data directly on GPU +- `cp.zeros()` - Allocate zeroed GPU arrays + +### CUDA Atomic Functions (in kernel): + +- `atomicAdd()` - Thread-safe addition + +## Requirements + +### Hardware: +- NVIDIA GPU with CUDA support + +### Software: +- CUDA Toolkit 13.0 or newer +- 
Python 3.10 or newer +- See `requirements.txt` for Python packages + +## Installation + +```bash +pip install -r requirements.txt +``` + +## How to Run + +```bash +python parallelHistogram.py +``` + +## Expected Output + +``` +============================================================ +Parallel Histogram with Atomics (cuda.core) +============================================================ + +Device: +Compute Capability: ComputeCapability(major=X, minor=Y) + +Compiling CUDA kernels with cuda.core.Program... + Compiled for architecture: sm_XY + +Generating 10,000,000 random values on GPU... + +Verifying correctness... + Global atomics: PASSED + Privatized atomics: PASSED + +Benchmarking (100 iterations)... + Global atomics: X.XXX ms + Privatized atomics: X.XXX ms + Speedup: XXx + +Test PASSED +``` + +## Files + +- `parallelHistogram.py` - Main sample using cuda.core +- `README.md` - This file +- `requirements.txt` - Dependencies + +## See Also + +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [CUDA Atomic Functions](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions) +- [CUDA Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) diff --git a/samples/parallelHistogram/parallelHistogram.py b/samples/parallelHistogram/parallelHistogram.py new file mode 100644 index 00000000000..7931ce20b16 --- /dev/null +++ b/samples/parallelHistogram/parallelHistogram.py @@ -0,0 +1,240 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "numpy>=2.3.2", "cupy-cuda13x>=14.0.0"] +# /// + +""" +Parallel Histogram with Atomics using cuda.core + +This sample demonstrates GPU histogram computation using atomic operations, +showcasing the modern cuda.core API for: +- Kernel compilation (Program, ProgramOptions) +- Kernel launch configuration (LaunchConfig) +- Stream management (Stream) +- Event timing (EventOptions) + +Two histogram approaches are compared: +1. Global Atomics - All threads atomically update global memory +2. 
Privatized Histograms - Shared memory reduces global atomic contention +""" + +import sys + +try: + import cupy as cp + import numpy as np + + from cuda.core import ( + Device, + EventOptions, + LaunchConfig, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install: pip install -r requirements.txt") + sys.exit(1) + + +NUM_BINS = 256 + +# CUDA C source code for both histogram kernels +HISTOGRAM_KERNELS = r""" +// Global Atomics - simple but high contention on popular bins +extern "C" __global__ +void histogram_global(const unsigned char* data, unsigned int* histogram, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += stride) { + atomicAdd(&histogram[data[i]], 1); + } +} + +// Privatized - uses shared memory to reduce global atomic contention +extern "C" __global__ +void histogram_privatized(const unsigned char* data, unsigned int* histogram, int n) { + __shared__ unsigned int local_hist[256]; + + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + // Initialize shared memory + for (int i = tid; i < 256; i += blockDim.x) + local_hist[i] = 0; + __syncthreads(); + + // Accumulate into shared memory (fast) + for (int i = idx; i < n; i += stride) + atomicAdd(&local_hist[data[i]], 1); + __syncthreads(); + + // Merge to global (fewer atomics) + for (int i = tid; i < 256; i += blockDim.x) + if (local_hist[i] > 0) + atomicAdd(&histogram[i], local_hist[i]); +} +""" + + +def main(): + print("=" * 60) + print("Parallel Histogram with Atomics (cuda.core)") + print("=" * 60) + + # Initialize device using cuda.core + device = Device(0) + device.set_current() + print(f"\nDevice: {device.name}") + print(f"Compute Capability: {device.compute_capability}") + + # Create stream using cuda.core + stream = device.create_stream() + + # Make CuPy use the same 
def _time_histogram_kernel(stream, config, kernel, data_gpu, hist_gpu, n, num_iterations, start_event, end_event):
    """Return the average time of one "zero the histogram + launch kernel" iteration.

    ``start_event`` is recorded on ``stream`` before the loop and ``end_event``
    after it; the elapsed time between them is divided by ``num_iterations``.
    The callers print the result in milliseconds.
    """
    stream.record(start_event)
    for _ in range(num_iterations):
        # Each iteration starts from an empty histogram so every launch does
        # the same amount of work.
        hist_gpu.fill(0)
        launch(
            stream,
            config,
            kernel,
            data_gpu.data.ptr,
            hist_gpu.data.ptr,
            np.int32(n),
        )
    stream.record(end_event)
    # Block until the end event has completed before reading the timing.
    end_event.sync()
    return (end_event - start_event) / num_iterations


def _run_histogram(device, stream):
    """Compile both histogram kernels, verify them against NumPy, then benchmark them.

    Parameters
    ----------
    device
        The cuda.core device used to pick the compile architecture
        (``sm_{device.arch}``) and to create timing events.
    stream
        The cuda.core stream all kernel launches and event records go to.
        ``main()`` makes this stream CuPy's current stream before calling here.

    Exits the process with status 1 if either kernel's histogram differs from
    the CPU reference.
    """
    # Compile CUDA kernels using cuda.core.Program
    print("\nCompiling CUDA kernels with cuda.core.Program...")
    arch = f"sm_{device.arch}"
    options = ProgramOptions(arch=arch)
    program = Program(HISTOGRAM_KERNELS, code_type="c++", options=options)
    object_code = program.compile("cubin")

    kernel_global = object_code.get_kernel("histogram_global")
    kernel_privatized = object_code.get_kernel("histogram_privatized")
    print(f" Compiled for architecture: {arch}")

    # Generate test data directly on GPU (more efficient than CPU->GPU copy)
    n = 10_000_000
    print(f"\nGenerating {n:,} random values on GPU...")
    data_gpu = cp.random.randint(0, 256, size=n, dtype=cp.uint8)
    hist_gpu = cp.zeros(NUM_BINS, dtype=cp.uint32)

    # Compute reference histogram on CPU for verification
    data_cpu = cp.asnumpy(data_gpu)
    hist_cpu, _ = np.histogram(data_cpu, bins=NUM_BINS, range=(0, 256))
    hist_cpu = hist_cpu.astype(np.uint32)

    # Configure kernel launch using cuda.core.LaunchConfig; the grid is capped
    # at 1024 blocks and the kernels loop over the remaining elements.
    block_size = 256
    grid_size = min((n + block_size - 1) // block_size, 1024)
    config = LaunchConfig(grid=(grid_size,), block=(block_size,))

    print("\nVerifying correctness...")

    # Ensure CuPy allocations complete before kernel launch on our stream
    stream.sync()

    # Launch global atomics kernel (hist_gpu is already zeros from cp.zeros)
    launch(stream, config, kernel_global, data_gpu.data.ptr, hist_gpu.data.ptr, np.int32(n))
    stream.sync()

    hist_global = cp.asnumpy(hist_gpu)
    global_ok = np.array_equal(hist_cpu, hist_global)
    print(f" Global atomics: {'PASSED' if global_ok else 'FAILED'}")

    # Reset histogram and launch privatized kernel
    hist_gpu.fill(0)
    launch(
        stream,
        config,
        kernel_privatized,
        data_gpu.data.ptr,
        hist_gpu.data.ptr,
        np.int32(n),
    )
    stream.sync()

    hist_privatized = cp.asnumpy(hist_gpu)
    privatized_ok = np.array_equal(hist_cpu, hist_privatized)
    print(f" Privatized atomics: {'PASSED' if privatized_ok else 'FAILED'}")

    if not (global_ok and privatized_ok):
        sys.exit(1)

    # Benchmark using cuda.core Events (explicit Event objects recorded on stream).
    # The iteration count is defined once so the message can never drift from
    # the number of iterations actually timed.
    num_iterations = 100
    print(f"\nBenchmarking ({num_iterations} iterations)...")
    event_opts = EventOptions(timing_enabled=True)
    start_event = device.create_event(options=event_opts)
    end_event = device.create_event(options=event_opts)

    # Both kernels are timed by the same helper, reusing the same event pair.
    time_global = _time_histogram_kernel(
        stream, config, kernel_global, data_gpu, hist_gpu, n, num_iterations, start_event, end_event
    )
    time_privatized = _time_histogram_kernel(
        stream, config, kernel_privatized, data_gpu, hist_gpu, n, num_iterations, start_event, end_event
    )

    print(f" Global atomics: {time_global:.3f} ms")
    print(f" Privatized atomics: {time_privatized:.3f} ms")
    print(f" Speedup: {time_global / time_privatized:.1f}x")

    print("\nTest PASSED")
+cupy-cuda13x>=14.0.0 diff --git a/samples/parallelReduction/README.md b/samples/parallelReduction/README.md new file mode 100644 index 00000000000..7ca010f245f --- /dev/null +++ b/samples/parallelReduction/README.md @@ -0,0 +1,119 @@ +# Sample: Parallel Reduction (Python) + +## Description + +Efficiently sum a large array on GPU using parallel reduction. This sample demonstrates: +1. **Custom CUDA kernel** showing reduction tree pattern and synchronization +2. **cuda.compute.reduce_into()** for production-ready reduction + +## What You'll Learn + +- **Reduction tree pattern**: Divide-and-conquer parallel algorithm +- **Thread synchronization**: Using `__syncthreads()` for coordination +- **Avoiding warp divergence**: Sequential thread IDs vs strided IDs + +## Key Concepts + +### Reduction Tree Pattern + +Parallel reduction uses a tree-based approach where each iteration halves active elements: + +``` +Initial: [a0, a1, a2, a3, a4, a5, a6, a7] +Step 1: [a0+a4, a1+a5, a2+a6, a3+a7] threads 0-3 active +Step 2: [a0+a2+a4+a6, a1+a3+a5+a7] threads 0-1 active +Step 3: [sum of all] thread 0 only +``` + +This requires only `log2(N)` steps to reduce N elements. + +### Avoiding Warp Divergence + +```c +// Good: Sequential thread IDs (warps stay coherent) +if (tid < s) { + sdata[tid] += sdata[tid + s]; +} + +// Bad: Strided IDs (causes warp divergence) +if (tid % (2 * s) == 0) { // Don't do this! 
+ sdata[tid] += sdata[tid + s]; +} +``` + +## Requirements + +### Hardware + +- NVIDIA GPU with CUDA support + +### Software + +- CUDA Toolkit 13.0+ +- Python 3.10+ +- `cuda-python` (13.0.0+) +- `cuda-core` (>=1.0.0) +- `cuda-cccl` (1.0.0+) +- `cupy-cuda13x` (>=14.0.0) +- `numpy` (>=2.3.2) + +## Installation + +```bash +pip install -r requirements.txt +``` + +## How to Run + +```bash +python parallelReduction.py +``` + +## Expected Output + +``` +====================================================================== +Parallel Reduction - Efficient GPU Array Summation +====================================================================== + +Device: +Compute Capability: + +Array size: 1,048,576 elements (4.2 MB) +Expected sum: + +Compiling custom CUDA kernel... + +====================================================================== +PART 1: Custom Kernel (Educational) +====================================================================== + +Reduction tree kernel: +Expected: +Error: +Time: ms + +====================================================================== +PART 2: cuda.compute.reduce_into() (Production) +====================================================================== + +cuda.compute result: +Expected: +Error: +Time: ms + +Test PASSED! +``` + +Note: Exact values vary due to random input data. `cuda.compute.reduce_into()` is typically faster than the custom kernel because it calls CUB's `DeviceReduce`, which uses highly tuned, architecture‑specific kernels and optimized memory access patterns. 
+ +## Files + +- `parallelReduction.py` - Custom kernel + cuda.compute comparison +- `README.md` - This documentation +- `requirements.txt` - Python dependencies + +## See Also + +- [Mark Harris - Optimizing Parallel Reduction in CUDA](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) diff --git a/samples/parallelReduction/parallelReduction.py b/samples/parallelReduction/parallelReduction.py new file mode 100644 index 00000000000..32b0f19f1d7 --- /dev/null +++ b/samples/parallelReduction/parallelReduction.py @@ -0,0 +1,375 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cuda-cccl>=1.0.0", "cupy-cuda13x>=14.0.0", "numpy>=2.3.2"] +# /// + +""" +Parallel Reduction using cuda.core and cuda.compute + +Demonstrates efficient parallel summation of large arrays on GPU: +1. Custom CUDA kernel showing reduction tree pattern and synchronization +2. cuda.compute.reduce_into() for production-ready reduction + +Key Concepts: +- Reduction tree pattern: Divide-and-conquer parallel algorithm +- Thread synchronization: Using __syncthreads() for coordination +- Sequential thread IDs: How to avoid warp divergence +- cuda.core Stream integration with CuPy via Stream.from_external +""" + +import math +import sys +from pathlib import Path + +# Add Utilities to path +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import cupy as cp + import numpy as np + from cuda_samples_utils import print_gpu_info, verify_array_result + + from cuda.compute import OpKind, reduce_into + from cuda.core import ( + Device, + Kernel, + LaunchConfig, + Program, + ProgramOptions, + Stream, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +# ============================================================================= +# CUDA Kernel: Parallel Reduction (optimized - no warp divergence) +# 
def reduction_stage_output_counts(n: int, block_size: int) -> list[int]:
    """Return the output length of each stage of a multi-launch reduction.

    Every stage shrinks ``n`` elements to ``ceil(n / block_size)`` partial
    results; stages repeat until a single element remains.

    Parameters
    ----------
    n : int
        Number of input elements.
    block_size : int
        Reduction factor per stage. Must be at least 2 whenever a reduction
        is needed (``n > 1``).

    Returns
    -------
    list[int]
        One entry per stage; empty when ``n <= 1`` (nothing to reduce).

    Raises
    ------
    ValueError
        If ``n > 1`` and ``block_size < 2``. Such a block size never shrinks
        the input, so the stage loop could not terminate sensibly.
    """
    if n <= 1:
        return []
    if block_size < 2:
        msg = f"block_size must be >= 2 to reduce {n} elements, got {block_size}"
        raise ValueError(msg)

    counts: list[int] = []
    while n > 1:
        # Integer ceiling division: exact for arbitrarily large n, unlike
        # math.ceil(n / block_size), which rounds through a float.
        n = -(-n // block_size)
        counts.append(n)
    return counts
def benchmark_custom(
    stream: Stream,
    kernel: Kernel,
    d_input: cp.ndarray,
    num_runs: int = 10,
    block_size: int = 256,
) -> tuple[float, float]:
    """Time the custom multi-launch reduction with cuda.core events.

    Stage buffers are allocated once up front and reused by every run, so the
    timed region contains only the kernel launches issued by
    ``reduce_custom(..., sync=False)``.

    Returns
    -------
    tuple[float, float]
        The reduction result read back after the last run, and the mean of
        the per-run elapsed times between the start and end events.
    """
    # One scratch array per reduction stage, sized from the stage plan.
    scratch = [
        cp.empty(count, dtype=cp.float32)
        for count in reduction_stage_output_counts(len(d_input), block_size)
    ]

    # Warmup run (syncs internally because sync defaults to True); the value
    # it returns is intentionally discarded.
    reduce_custom(stream, kernel, d_input, block_size=block_size, work_buffers=scratch)

    timing_opts = {"timing_enabled": True}
    begin = stream.device.create_event(options=timing_opts)
    finish = stream.device.create_event(options=timing_opts)

    elapsed: list[float] = []
    last_sum = 0.0

    for _ in range(num_runs):
        stream.record(begin)
        d_sum = reduce_custom(
            stream,
            kernel,
            d_input,
            block_size=block_size,
            sync=False,
            work_buffers=scratch,
        )
        stream.record(finish)
        # Wait for the end event before reading the result or the timing.
        finish.sync()
        last_sum = float(d_sum[0])
        elapsed.append(finish - begin)

    mean_elapsed = float(np.mean(elapsed))
    return last_sum, mean_elapsed
Array Summation") + print("=" * 70) + + device = Device(0) + device.set_current() + stream = device.create_stream() + cp_stream = cp.cuda.Stream.from_external(stream) + + print() + print_gpu_info(device) + + array_size = 1 << 20 # 1M elements + h_input = np.random.rand(array_size).astype(np.float32) + expected_sum = float(np.sum(h_input)) + + print(f"\nArray size: {array_size:,} elements ({array_size * 4 / 1e6:.1f} MB)") + print(f"Expected sum: {expected_sum:.6f}") + + print("\nCompiling custom CUDA kernel...") + kernel = compile_kernel(device) + + try: + with cp_stream: + d_input = cp.asarray(h_input) + + # ====================================================================== + # Part 1: Custom Kernel + # ====================================================================== + print("\n" + "=" * 70) + print("PART 1: Custom Kernel (Educational)") + print("=" * 70) + + result, time_ms = benchmark_custom(stream, kernel, d_input) + + print(f"\nReduction tree kernel: {result:>14.2f}") + print(f"Expected: {expected_sum:>14.2f}") + print(f"Time: {time_ms:>14.3f} ms") + + # ====================================================================== + # Part 2: cuda.compute (Production) + # ====================================================================== + print("\n" + "=" * 70) + print("PART 2: cuda.compute.reduce_into() (Production)") + print("=" * 70) + + result_cc, time_cc = benchmark_cuda_compute(stream, d_input) + + print(f"\ncuda.compute result: {result_cc:>14.2f}") + print(f"Expected: {expected_sum:>14.2f}") + print(f"Time: {time_cc:>14.3f} ms") + + # Verify both results using principled rtol/atol + with cp_stream: + d_expected = cp.array([expected_sum], dtype=cp.float32) + custom_ok = verify_array_result( + cp.array([result], dtype=cp.float32), + d_expected, + rtol=1e-5, + atol=1e-8, + verbose=False, + ) + compute_ok = verify_array_result( + cp.array([result_cc], dtype=cp.float32), + d_expected, + rtol=1e-5, + atol=1e-8, + verbose=False, + ) + if custom_ok and 
compute_ok: + print("\nTest PASSED!") + return True + else: + print("\nTest FAILED - Error too large!") + return False + finally: + stream.close() + + +if __name__ == "__main__": + sys.exit(0 if main() else 1) diff --git a/samples/parallelReduction/requirements.txt b/samples/parallelReduction/requirements.txt new file mode 100644 index 00000000000..a3acd886536 --- /dev/null +++ b/samples/parallelReduction/requirements.txt @@ -0,0 +1,7 @@ +# Parallel Reduction Sample Requirements + +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cuda-cccl>=1.0.0 +cupy-cuda13x>=14.0.0 +numpy>=2.3.2 diff --git a/samples/prefixSum/README.md b/samples/prefixSum/README.md new file mode 100644 index 00000000000..6542dbf36ec --- /dev/null +++ b/samples/prefixSum/README.md @@ -0,0 +1,83 @@ +# Prefix Sum (Scan) + +Demonstrates parallel prefix sum (scan) algorithms using cuda.compute with cuda.core stream management. + +## Overview + +- Inclusive scan: `output[i] = [init_value] + input[0] + input[1] + ... + input[i]` +- Exclusive scan: `output[i] = init_value + input[0] + input[1] + ... 
+ input[i-1]` +- Uses cuda.compute APIs for optimized CUB-based implementations +- Uses cuda.core APIs for device and stream management +- Demonstrates CuPy integration via `Stream.from_external` + +## Requirements + +### Hardware + +- NVIDIA GPU with CUDA support + +### Software + +- CUDA Toolkit 13.0+ +- Python 3.10+ +- `cuda-python` (13.0.0+) +- `cuda-core` (>=1.0.0) +- `cuda-cccl` (1.0.0+) +- `cupy-cuda13x` (14.0.0+) +- `numpy` (>=2.3.2) + +## Usage + +```bash +# Create and activate virtual environment +python -m venv venv +source venv/bin/activate # Linux/macOS +# venv\Scripts\activate # Windows + +# Install dependencies +pip install -r requirements.txt + +# Run sample +python prefixSum.py +``` + +## Key Concepts + +| Scan Type | Formula | First Element | +|-----------|---------|---------------| +| Inclusive | `output[i] = [init_value] + Σ input[0..i]` | `[init_value] + input[0]` | +| Exclusive | `output[i] = init_value + Σ input[0..i-1]` | `init_value` (typically `0`, the identity for sum) | + +### Stream Management + +This sample demonstrates proper stream usage across libraries: + +```python +# Create stream with cuda.core +stream = device.create_stream() + +# Wrap for CuPy compatibility (cuda.core Stream implements the __cuda_stream__ protocol) +cp_stream = cp.cuda.Stream.from_external(stream) + +# Use with CuPy operations +with cp_stream: + d_input = cp.asarray(data) + d_output = cp.empty_like(d_input) + +# Pass to cuda.compute +inclusive_scan( + d_in=d_input, + d_out=d_output, + op=OpKind.PLUS, + init_value=None, + num_items=len(d_input), + stream=stream, +) +``` + +## Applications + +- Stream compaction +- Radix sort +- Histogram computation +- Polynomial evaluation diff --git a/samples/prefixSum/prefixSum.py b/samples/prefixSum/prefixSum.py new file mode 100644 index 00000000000..a1ba73e3b97 --- /dev/null +++ b/samples/prefixSum/prefixSum.py @@ -0,0 +1,202 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cuda-cccl>=1.0.0", "cupy-cuda13x>=14.0.0", "numpy>=2.3.2"] +# /// + +""" +Prefix Sum (Scan) + +Demonstrates parallel prefix sum algorithms using cuda.compute: +- Inclusive scan: output[i] = [init_value] + input[0] + ... + input[i] +- Exclusive scan: output[i] = init_value + input[0] + ... + input[i-1] + +Uses cuda.compute APIs for optimized CUB-based scan operations. +Uses cuda.core APIs for device and stream management. 
def main() -> bool:
    """Walk through an inclusive scan, an exclusive scan, and a timed large scan.

    The two small scans are checked against NumPy references with
    ``verify_array_result``; the 10M-element scan is only timed, not verified.

    Returns:
        True when both verification checks passed, False otherwise.
    """
    banner = "=" * 60
    divider = "-" * 60

    print(banner)
    print("Prefix Sum (Scan) - Using cuda.compute")
    print(banner)

    device = Device(0)
    device.set_current()
    stream = device.create_stream()
    # CuPy-side wrapper around the cuda.core stream; used as a context manager
    # around every CuPy allocation/copy below.
    cp_stream = cp.cuda.Stream.from_external(stream)

    passed = True
    try:
        print()
        print_gpu_info(device)

        host_values = np.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)
        seed = np.array([0], dtype=np.int32)
        item_count = len(host_values)

        # ---- Inclusive scan -------------------------------------------------
        print("\n" + divider)
        print("INCLUSIVE SCAN")
        print(divider)
        print("Formula: output[i] = [init_value] + input[0] + input[1] + ... + input[i]")

        with cp_stream:
            dev_in = cp.asarray(host_values)
            dev_out = cp.empty_like(dev_in)

        print(f"\nInput: {host_values.tolist()}")

        inclusive_scan(
            d_in=dev_in,
            d_out=dev_out,
            op=OpKind.PLUS,
            init_value=None,
            num_items=item_count,
            stream=stream,
        )
        # Wait for the scan before reading the result back on the host.
        stream.sync()
        print(f"Output: {cp.asnumpy(dev_out).tolist()}")

        with cp_stream:
            reference = cp.asarray(np.cumsum(host_values))
        passed &= verify_array_result(dev_out, reference, rtol=0, atol=0)

        # ---- Exclusive scan -------------------------------------------------
        print("\n" + divider)
        print("EXCLUSIVE SCAN")
        print(divider)
        print("Formula: output[i] = init_value + input[0] + ... + input[i-1]")

        with cp_stream:
            dev_out = cp.empty_like(dev_in)

        print(f"\nInput: {host_values.tolist()}")

        exclusive_scan(
            d_in=dev_in,
            d_out=dev_out,
            op=OpKind.PLUS,
            init_value=seed,
            num_items=item_count,
            stream=stream,
        )
        stream.sync()
        print(f"Output: {cp.asnumpy(dev_out).tolist()}")

        # Exclusive reference: the seed followed by the inclusive sums shifted
        # right by one position.
        shifted_totals = np.concatenate([seed, np.cumsum(host_values)[:-1]])
        with cp_stream:
            reference = cp.asarray(shifted_totals)
        passed &= verify_array_result(dev_out, reference, rtol=0, atol=0)

        # ---- Large-array timing ---------------------------------------------
        print("\n" + divider)
        print("PERFORMANCE (10M elements)")
        print(divider)

        N = 10_000_000
        with cp_stream:
            big_in = cp.ones(N, dtype=np.int32)
            big_out = cp.empty_like(big_in)

        def scan_large() -> None:
            """Enqueue one inclusive scan over the large buffers."""
            inclusive_scan(
                d_in=big_in,
                d_out=big_out,
                op=OpKind.PLUS,
                init_value=None,
                num_items=N,
                stream=stream,
            )

        # One untimed run first, so the timed loop below starts after a
        # completed scan.
        scan_large()
        stream.sync()

        timing = EventOptions(timing_enabled=True)
        start_event = device.create_event(options=timing)
        end_event = device.create_event(options=timing)

        num_iterations = 10
        stream.record(start_event)
        for _ in range(num_iterations):
            scan_large()
        stream.record(end_event)
        end_event.sync()
        # Event difference covers all iterations; report the per-scan average.
        elapsed_ms = (end_event - start_event) / num_iterations

        print(f"Inclusive scan: {elapsed_ms:.3f} ms")
        print(f"Throughput: {N / elapsed_ms / 1e6:.1f} M elements/ms")

        # ---- Summary --------------------------------------------------------
        print("\n" + banner)
        print("KEY CONCEPTS")
        print(banner)
        for takeaway in (
            "• Inclusive: output[i] includes input[i]",
            "• Exclusive: output[i] excludes input[i], starts with init_value",
            "• cuda.compute provides CUB-based optimized implementations",
            "• cuda.core Stream integrates with CuPy via Stream.from_external",
            "• Applications: stream compaction, radix sort, histograms",
        ):
            print(takeaway)
        print(banner)
        return passed
    finally:
        # Point CuPy back at its null stream before the cuda.core stream goes away.
        cp.cuda.Stream.null.use()
        stream.close()
+ +CUDA process checkpointing is the driver-level primitive that powers +CRIU + `cuda-checkpoint` integration. + +The sample: + +1. Allocates a GPU buffer and fills it with a deterministic pattern + via a small kernel. +2. Reads the buffer back to host and computes a SHA-256 hash. +3. Runs the full checkpoint lifecycle on its own process: + `lock → checkpoint → restore → unlock`. +4. Reads the buffer back again and verifies that the hash is + unchanged, proving that GPU memory contents survived the round + trip. + +The sample prints the CUDA process state after each step so the +full state machine is visible: + +``` + lock() checkpoint() restore() unlock() +running ---------> locked ------------> checkpointed -----------> locked ---------> running +``` + +## What You'll Learn + +- Creating a `cuda.core.checkpoint.Process` for the current process + by PID and observing its `.state` transitions. +- Running the full `lock → checkpoint → restore → unlock` cycle with + a lock timeout. +- The fact that `restore()` leaves the process in the `locked` state; + you must still call `unlock()` to return to `running`. +- Verifying that GPU memory is preserved across the checkpoint + round-trip by comparing SHA-256 hashes of the buffer before and + after. +- The rough cost of each step (checkpoint and restore dominate and + scale with the device-memory footprint being captured). + +## Key Libraries + +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) + - device management, memory allocation, kernel compilation and + launch, and the `checkpoint.Process` wrapper. +- [`cuda.bindings`](https://nvidia.github.io/cuda-python/cuda-bindings/latest/) + - used directly for a pageable `cuMemcpyDtoH`. + +## Key APIs + +### From `cuda.core.checkpoint` + +- `checkpoint.Process(pid)` - create a handle to a CUDA process by + PID. Accepts `os.getpid()` for the self-checkpoint case shown + here. 
+- `Process.state` - one of `"running"`, `"locked"`, `"checkpointed"`, + or `"failed"`. +- `Process.lock(timeout_ms=…)` - block further CUDA API calls on the + process; completes already-submitted work. Always pass a non-zero + timeout to avoid deadlocks. +- `Process.checkpoint()` - copy device memory to host-side driver + allocations and release GPU resources. Process state becomes + `checkpointed`. +- `Process.restore(gpu_mapping=None)` - re-acquire GPU resources and + copy memory back to device. Leaves the process in the `locked` + state. +- `Process.unlock()` - return the process to `running`. +- `Process.restore_thread_id` - thread ID that `restore()` must be + called from in the target process (not used in the self-checkpoint + case here). + +### From `cuda.core` + +- `Device.set_current()` / `Device.memory_resource.allocate(...)` / + `Stream`, `LaunchConfig`, `Program`, `launch` - standard device, + compile, and launch primitives used to produce the buffer + contents. + +### From `cuda.bindings.driver` + +- `cuMemcpyDtoH(host_ptr, device_handle, nbytes)` - synchronous D2H + copy into a pageable host buffer. + +## Requirements + +### Hardware + +- Any NVIDIA GPU supported by CUDA process checkpointing. CUDA + checkpointing is currently limited to x86-64 Linux. + +### Software + +- Linux (the CUDA checkpoint API is Linux-only). +- NVIDIA driver with CUDA process checkpoint support. +- CUDA Toolkit 13.0 or newer. +- Python 3.10 or newer. +- `cuda-core >= 1.0.0`. 
+ +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/processCheckpoint +pip install -r requirements.txt +``` + +## How to Run + +### Basic usage + +```bash +python processCheckpoint.py +``` + +### Larger GPU footprint to see checkpoint time scale + +```bash +python processCheckpoint.py --buffer-mib 512 +``` + +### Use a specific GPU + +```bash +python processCheckpoint.py --device 1 +``` + +### All options + +``` +--device CUDA device ID (default: 0) +--buffer-mib GPU buffer size in MiB (default: 16) +--lock-timeout-ms Timeout passed to Process.lock in ms (default: 5000) +``` + +## Expected Output + +On an RTX 4090 with a 16 MiB buffer: + +``` +[Process Checkpoint Sample using CUDA Core API] +PID: 748330 +Device: NVIDIA GeForce RTX 4090 +Compute Capability: sm_89 +Buffer size: 16 MiB +Lock timeout: 5000 ms + +Compiling kernel ... +Writing deterministic pattern to GPU buffer ... +Buffer hash (before): b045f7975dc23352 + +Running checkpoint lifecycle on self ... + +step duration (ms) state after +-------------------------------------------------- +initial - running +lock 0.578 locked +checkpoint 268.369 checkpointed +restore 235.024 locked +unlock 1.648 running +-------------------------------------------------- +total 505.618 + +Buffer hash (before): b045f7975dc23352 +Buffer hash (after): b045f7975dc23352 + +PASS: GPU buffer contents survived checkpoint/restore. + +Done +``` + +**What to look for:** + +- The **four state transitions** are all observable: `running → +locked → checkpointed → locked → running`. Note that `restore()` + leaves the process in `locked`, not `running`. +- The **checkpoint and restore steps dominate** the wall-clock time + (hundreds of ms even for a small buffer) - they copy GPU memory to + and from driver-managed host allocations. Increasing + `--buffer-mib` visibly increases the checkpoint time. 
+- The `lock` and `unlock` steps are cheap by comparison (~2 ms or less above) - they + just flip the process state. +- The SHA-256 **hashes before and after match**, proving the GPU + memory contents survived the round trip. + +Exact timings vary with GPU model, driver version, system load, and +the size of the device memory footprint being captured. + +## Files + +- `processCheckpoint.py` - Python implementation using `cuda.core.checkpoint` +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [`NVIDIA/cuda-checkpoint`](https://github.com/NVIDIA/cuda-checkpoint) + - the CUDA checkpoint/restore utility, the CRIU plugin, and C + reference programs (`r570-features.c`, `r580-migration-api.c`). +- [Checkpointing CUDA Applications with CRIU](https://developer.nvidia.com/blog/checkpointing-cuda-applications-with-criu/) + - NVIDIA technical blog post on the broader CRIU workflow. diff --git a/samples/processCheckpoint/processCheckpoint.py b/samples/processCheckpoint/processCheckpoint.py new file mode 100644 index 00000000000..4bb19b10859 --- /dev/null +++ b/samples/processCheckpoint/processCheckpoint.py @@ -0,0 +1,263 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0,<13.3.0", "cuda-core>=1.0.0", "numpy>=2.3.2"] +# /// + +""" +Process Checkpointing Sample using CUDA Core API. + +The sample allocates a GPU buffer, fills it with a deterministic +pattern via a kernel, hashes the contents, runs the full +lock/checkpoint/restore/unlock cycle on its own PID, and re-hashes +the buffer afterwards to verify that the GPU memory contents +survived the round trip. +""" + +import argparse +import hashlib +import os +import sys +import time +from dataclasses import dataclass +from typing import List + +import numpy as np + +from cuda.bindings import driver as cudrv +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + checkpoint, + launch, +) + +# Small fill kernel: deterministic, non-trivial pattern so the before/after +# hashes would disagree on any bit flip.
@dataclass
class StepTiming:
    """One row of the lifecycle table printed by ``print_timings``."""

    # Step name shown in the first column ("initial", "lock", "checkpoint", ...).
    label: str
    # Wall-clock duration of the step in milliseconds (0.0 for "initial").
    duration_ms: float
    # ``Process.state`` as read immediately after the step returned.
    state_after: str


def _cu_check(result) -> None:
    """Raise ``RuntimeError`` if a cuda.bindings driver call reported an error.

    Args:
        result: Return value of a ``cuda.bindings.driver`` call; its first
            element is treated as the status code.

    Raises:
        RuntimeError: If the status code is non-zero.
    """
    err = result[0]
    if int(err) != 0:
        raise RuntimeError(f"CUDA driver call failed: {err}")


def compile_fill_kernel(device: Device):
    """Compile ``KERNEL_SRC`` for ``device`` and return the ``fill_pattern`` kernel."""
    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    program = Program(KERNEL_SRC, code_type="c++", options=options)
    module = program.compile("cubin", name_expressions=("fill_pattern",))
    return module.get_kernel("fill_pattern")


def hash_device_buffer(device_buffer, host: np.ndarray) -> str:
    """Copy ``device_buffer`` into ``host`` and return a short SHA-256 digest.

    Args:
        device_buffer: Device allocation whose ``handle`` is passed to
            ``cuMemcpyDtoH`` as the source.
        host: Host array used as the destination; ``host.nbytes`` bytes are
            copied, so it must not be larger than the device allocation.

    Returns:
        The first 16 hex characters of the SHA-256 of the copied bytes.

    Raises:
        RuntimeError: If the driver copy fails.
    """
    _cu_check(
        cudrv.cuMemcpyDtoH(
            host.ctypes.data,
            device_buffer.handle,
            host.nbytes,
        )
    )
    return hashlib.sha256(host.tobytes()).hexdigest()[:16]


def _time_call(fn, *args, **kwargs) -> float:
    """Call ``fn(*args, **kwargs)`` and return the elapsed wall time in ms.

    The return value of ``fn`` is discarded; exceptions propagate.
    """
    t0 = time.monotonic()
    fn(*args, **kwargs)
    return (time.monotonic() - t0) * 1000.0


def run_lifecycle(proc: checkpoint.Process, lock_timeout_ms: int) -> List[StepTiming]:
    """
    Drive the full `lock -> checkpoint -> restore -> unlock` cycle on
    `proc` and return per-step timings with the state observed after
    each step.

    The first entry is an "initial" row (duration 0.0) recording the
    state before anything was done.

    Note on state after `restore()`: the driver leaves the process in
    the `locked` state. You must still call `unlock()` to return to
    `running`.
    """
    timings: List[StepTiming] = [StepTiming("initial", 0.0, proc.state)]

    ms = _time_call(proc.lock, timeout_ms=lock_timeout_ms)
    timings.append(StepTiming("lock", ms, proc.state))

    ms = _time_call(proc.checkpoint)
    timings.append(StepTiming("checkpoint", ms, proc.state))

    ms = _time_call(proc.restore)
    timings.append(StepTiming("restore", ms, proc.state))

    ms = _time_call(proc.unlock)
    timings.append(StepTiming("unlock", ms, proc.state))

    return timings


def print_timings(timings: List[StepTiming]) -> None:
    """Print the per-step table followed by a total row.

    The "initial" row shows ``-`` instead of a duration and is excluded
    from the total.
    """
    print()
    header = f"{'step':<14}{'duration (ms)':>18}{'state after':>18}"
    print(header)
    print("-" * len(header))
    total = 0.0
    for t in timings:
        if t.label == "initial":
            dur = "-"
        else:
            dur = f"{t.duration_ms:.3f}"
            total += t.duration_ms
        print(f"{t.label:<14}{dur:>18}{t.state_after:>18}")
    print("-" * len(header))
    print(f"{'total':<14}{total:>18.3f}{'':>18}")


def main():
    """Run the checkpoint sample end to end.

    Returns:
        0 when the buffer hash is unchanged after the checkpoint round trip,
        1 on invalid arguments, a non-Linux platform, or a hash mismatch,
        2 when the sample is waived because the device is an integrated GPU.
    """
    parser = argparse.ArgumentParser(
        description="CUDA process checkpoint sample using cuda.core",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--device", type=int, default=0, help="CUDA device ID (default: 0)")
    parser.add_argument(
        "--buffer-mib",
        type=int,
        default=16,
        help="GPU buffer size in MiB (default: 16)",
    )
    parser.add_argument(
        "--lock-timeout-ms",
        type=int,
        default=5000,
        help="Timeout passed to Process.lock in ms (default: 5000)",
    )
    args = parser.parse_args()

    if sys.platform != "linux":
        print("Error: CUDA process checkpointing is Linux-only.")
        return 1

    if args.buffer_mib <= 0:
        print("Error: --buffer-mib must be positive")
        return 1

    # Process.lock must always be given a non-zero timeout to avoid
    # deadlocks, so reject zero/negative values up front instead of
    # forwarding them to the driver.
    if args.lock_timeout_ms <= 0:
        print("Error: --lock-timeout-ms must be positive")
        return 1

    print("[Process Checkpoint Sample using CUDA Core API]")
    print(f"PID: {os.getpid()}")

    device = Device(args.device)
    device.set_current()
    print(f"Device: {device.name}")
    print(f"Compute Capability: sm_{device.arch}")
    print(f"Buffer size: {args.buffer_mib} MiB")
    print(f"Lock timeout: {args.lock_timeout_ms} ms")

    # CUDA process checkpointing relies on kernel-mode driver features
    # that aren't shipped on integrated-GPU platforms (e.g. Tegra /
    # Jetson / Thor). On those, Process.lock() can hang indefinitely
    # instead of returning a clean "not supported" error. Skip cleanly
    # rather than hanging. Remove this guard once integrated platforms
    # gain checkpoint support.
    if device.properties.integrated:
        print(
            f"CUDA process checkpointing is not supported on integrated GPUs (sm_{device.arch}), waiving this sample."
        )
        return 2

    print()
    print("Compiling kernel ...")
    fill_kernel = compile_fill_kernel(device)

    buffer_bytes = args.buffer_mib * 1024 * 1024
    n_elements = buffer_bytes // 4  # float32

    stream = device.create_stream()
    try:
        device_buffer = device.memory_resource.allocate(buffer_bytes, stream=stream)
        try:
            print("Writing deterministic pattern to GPU buffer ...")
            block = 256
            grid = (n_elements + block - 1) // block  # ceil-divide so every element is covered
            cfg = LaunchConfig(grid=grid, block=block)
            launch(stream, cfg, fill_kernel, device_buffer, np.uint64(n_elements))
            stream.sync()

            # Reused as the destination for both the "before" and "after" readback.
            host = np.empty(n_elements, dtype=np.float32)

            hash_before = hash_device_buffer(device_buffer, host)
            print(f"Buffer hash (before): {hash_before}")

            print()
            print("Running checkpoint lifecycle on self ...")
            proc = checkpoint.Process(os.getpid())
            timings = run_lifecycle(proc, args.lock_timeout_ms)
            print_timings(timings)

            hash_after = hash_device_buffer(device_buffer, host)

            print()
            print(f"Buffer hash (before): {hash_before}")
            print(f"Buffer hash (after): {hash_after}")

            if hash_before != hash_after:
                print()
                print("FAIL: GPU buffer contents changed across checkpoint/restore.")
                return 1

            print()
            print("PASS: GPU buffer contents survived checkpoint/restore.")
        finally:
            device_buffer.close(stream)
    finally:
        # Release the stream on every exit path, including a failed allocation.
        stream.close()

    print()
    print("Done")
    return 0
b/samples/processCheckpoint/requirements.txt new file mode 100644 index 00000000000..a0605400b31 --- /dev/null +++ b/samples/processCheckpoint/requirements.txt @@ -0,0 +1,4 @@ +# cuda-bindings 13.3.0 drops the CUcheckpointRestoreArgs alias that cuda-core requires +cuda-python>=13.0.0,<13.3.0 +cuda-core>=1.0.0 +numpy>=2.3.2 diff --git a/samples/reduction/README.md b/samples/reduction/README.md new file mode 100644 index 00000000000..0f2a0056ea9 --- /dev/null +++ b/samples/reduction/README.md @@ -0,0 +1,137 @@ +# Sample: Fast Array Sum using Shared Memory (Python) + +## Description + +Two-stage parallel reduction: each GPU block sums its chunk in **shared memory** (tree reduction, two elements per thread), writes one partial sum per block; the host combines partial sums for the final result. + +**Stack:** `cuda-core` for `Device`, stream, events, `Program` / `launch()`. **CuPy** allocates device memory and copies; `launch()` takes device pointers as `ndarray.data.ptr` (Python `int`). Copies run on the same CUDA stream as the kernel via `cp.cuda.Stream.from_external(stream)` (cuda.core `Stream` implements the CUDA stream protocol) and `with cp_stream:`. 
+ +## What you will learn + +- Shared-memory block reduction and sequential-addressing tree reduction +- `LaunchConfig` with dynamic shared memory and `launch()` with pointer arguments +- Aligning CuPy transfers with a `cuda.core` stream (`Stream.from_external`) +- GPU timing with `EventOptions` / `device.create_event()` + +## Key libraries + +| Library | Role | +|------------|------| +| `cuda-core`| Device, stream, events, compile, launch | +| `cupy` | `cp.empty`, `cp.asarray`, `cp.asnumpy`, `Stream.from_external` | +| `numpy` | Host data and CPU reference sum | + +## Key APIs (quick reference) + +- **cuda.core:** `Device`, `create_stream`, `Program` / `ProgramOptions`, `LaunchConfig`, `launch`, `EventOptions`, `create_event` +- **CuPy:** `cp.empty`, `cp.asarray`, `cp.cuda.Stream.from_external(stream)`, `with cp_stream:`, `cp.asnumpy` + +## Requirements + +- NVIDIA GPU, CUDA-capable driver; **CUDA Toolkit 13+** (for toolchain alignment with `cuda-core`) +- **Python 3.10+** + +```bash +pip install -r requirements.txt +``` + +## How to run + +```bash +python reduction.py +``` + +Defaults: 2²⁴ elements, 256 threads/block, `float`, 100 benchmark iterations. + +**Change data type** (selects `blockReduceKernel_int` / `_float` / `_double`): + +```bash +python reduction.py --type float # default; 32-bit float +python reduction.py --type double # 64-bit float +python reduction.py --type int # 32-bit integer (exact equality check) +``` + +Combine with other flags as needed, e.g. `python reduction.py --type int --n 1048576`. + +Other main flags: `--n`, `--threads`, `--iterations`. Full list: `python reduction.py --help`. 
+ +## Output + +Example run (`python reduction.py`, defaults) on **Tesla T10**, compute capability **7.5**: + +``` +====================================================================== +Fast Array Sum using Shared Memory - Two-Stage Reduction +====================================================================== + +Demonstrates: Efficient parallel reduction using shared memory + +Device Information: + Name: Tesla T10 + Compute Capability: sm_7.5 + +Configuration: + Array size: 16,777,216 elements + Data type: float + Memory: 64.00 MB + Threads per block: 256 + +Two-Stage Reduction Strategy: + Stage 1: GPU block reduction + - Number of blocks: 32768 + - Elements per block: 512 + - Output: 32768 partial sums + Stage 2: CPU final reduction + - Combine 32768 partial sums -> 1 final result + +Compiling CUDA kernel... + Kernel 'blockReduceKernel_float' compiled successfully + +> Generating random input data... +> Computing reference result on CPU... + CPU time: 2.428208 seconds + +> Warming up GPU... + Warm-up completed + +> Benchmarking Stage 1 (GPU block reduction)... + Running 100 iterations... + +> Running Stage 2 (CPU final reduction)... + +====================================================================== +Performance Results +====================================================================== + +Stage 1 (GPU block reduction): + Average time: 0.338404 ms + Throughput: 198.31 GB/s + +Stage 2 (CPU final reduction): + Time: 0.078073 ms + (32768 partial sums) + +Total time: 0.416477 ms +Speedup vs CPU: 5830.35x + +> Validating results... 
+ GPU result: 2147639808.00000000 + CPU result: 2147639929.62027407 +Test PASSED + +====================================================================== +Summary +====================================================================== +Key optimizations: + - Load 2 elements per thread: 8,388,608 global reads (50% savings) + - Shared memory for reduction: ~10-20x faster than global memory + - Parallel block outputs: 32768 independent writes +Result: 198.31 GB/s throughput +====================================================================== +Two-Stage Reduction completed successfully! +====================================================================== +``` + +## Files + +`reduction.py` · `requirements.txt` · `README.md` diff --git a/samples/reduction/reduction.py b/samples/reduction/reduction.py new file mode 100644 index 00000000000..5903bb8a4c6 --- /dev/null +++ b/samples/reduction/reduction.py @@ -0,0 +1,480 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["numpy>=2.3.2", "cuda-core>=1.0.0", "cuda-python>=13.0.0", "cupy-cuda13x>=14.0.0"] +# /// + +""" +Fast Array Sum using Shared Memory - Two-Stage Reduction + +Demonstrates efficient parallel reduction using shared memory and +two-stage approach to avoid atomic operation bottlenecks. + +Key Features: +- Block-level reduction using shared memory +- Each thread loads 2 elements to reduce global memory traffic +- Sequential addressing tree reduction pattern +- No atomic operations - eliminates serialization bottleneck +- Device memory via CuPy; ``launch()`` takes pointers as ``ndarray.data.ptr`` +- CuPy uses ``cp.cuda.Stream.from_external(stream)``. 
+""" + +import argparse +import os +import sys +import time + +try: + import cupy as cp + import numpy as np + + from cuda.core import ( + Device, + EventOptions, + LaunchConfig, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install dependencies:") + print(" pip install -r requirements.txt") + sys.exit(1) + +# Import utilities +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "Utilities")) +from cuda_samples_utils import verify_array_result + +# Two-stage block reduction kernel +REDUCTION_KERNEL = """ +/* + * Block-level reduction kernel using shared memory + * + * Strategy: + * - Each block processes blockSize * 2 elements + * - Uses shared memory for fast intra-block reduction + * - Outputs one partial sum per block (no atomics) + * + * Key optimizations: + * - Load 2 elements per thread (reduces global memory traffic by 50%) + * - Tree reduction with sequential addressing (avoids divergence) + * - Shared memory instead of atomic operations (eliminates bottleneck) + * + * Note: This sample provides separate implementations for each data type + * for clarity. Production code typically uses templates with SharedMemory + * or reinterpret_cast to avoid duplication. See NVIDIA reduction guide for + * template-based approaches. 
+ */ + +extern "C" __global__ void blockReduceKernel_int( + const int *__restrict__ input, + int *__restrict__ blockSums, + unsigned int n) +{ + extern __shared__ int sdata_int[]; + + unsigned int tid = threadIdx.x; + unsigned int blockSize = blockDim.x; + unsigned int gid = blockIdx.x * (blockSize * 2) + tid; + + // Load 2 elements per thread + int sum = 0; + if (gid < n) sum += input[gid]; + if (gid + blockSize < n) sum += input[gid + blockSize]; + + sdata_int[tid] = sum; + __syncthreads(); + + // Tree reduction with sequential addressing + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata_int[tid] += sdata_int[tid + s]; + } + __syncthreads(); + } + + // Write block result + if (tid == 0) { + blockSums[blockIdx.x] = sdata_int[0]; + } +} + +extern "C" __global__ void blockReduceKernel_float( + const float *__restrict__ input, + float *__restrict__ blockSums, + unsigned int n) +{ + extern __shared__ float sdata_float[]; + + unsigned int tid = threadIdx.x; + unsigned int blockSize = blockDim.x; + unsigned int gid = blockIdx.x * (blockSize * 2) + tid; + + // Load 2 elements per thread + float sum = 0.0f; + if (gid < n) sum += input[gid]; + if (gid + blockSize < n) sum += input[gid + blockSize]; + + sdata_float[tid] = sum; + __syncthreads(); + + // Tree reduction with sequential addressing + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata_float[tid] += sdata_float[tid + s]; + } + __syncthreads(); + } + + // Write block result + if (tid == 0) { + blockSums[blockIdx.x] = sdata_float[0]; + } +} + +extern "C" __global__ void blockReduceKernel_double( + const double *__restrict__ input, + double *__restrict__ blockSums, + unsigned int n) +{ + extern __shared__ double sdata_double[]; + + unsigned int tid = threadIdx.x; + unsigned int blockSize = blockDim.x; + unsigned int gid = blockIdx.x * (blockSize * 2) + tid; + + // Load 2 elements per thread + double sum = 0.0; + if (gid < n) sum += input[gid]; + if (gid + 
def reduce_cpu(data):
    """Sum ``data`` on the CPU using Kahan (compensated) summation.

    Every element is converted with ``float()`` before being added, so
    integer inputs are summed in double precision as well.

    Args:
        data: Any iterable of numbers (list, NumPy array, generator, ...).

    Returns:
        The compensated sum as a Python ``float``; ``0.0`` for empty input.
    """
    values = iter(data)
    try:
        total = float(next(values))
    except StopIteration:
        # Empty input: return a float so the result type matches the
        # non-empty case.
        return 0.0

    # Running estimate of the low-order bits lost by earlier additions.
    compensation = 0.0
    for value in values:
        adjusted = float(value) - compensation
        candidate = total + adjusted
        # (candidate - total) is what was actually added; subtracting
        # ``adjusted`` leaves the rounding error to carry into the next step.
        compensation = (candidate - total) - adjusted
        total = candidate

    return total


def _validate_threads_per_block(threads_per_block):
    """Check a threads-per-block value for the block reduction kernels.

    Args:
        threads_per_block: Requested block size.

    Returns:
        ``None`` when the value is in ``1..1024`` and a power of two,
        otherwise a human-readable error message.
    """
    if threads_per_block <= 0 or threads_per_block > 1024:
        return "threads per block must be between 1 and 1024"
    # A power of two has exactly one bit set, so n & (n - 1) clears it to zero.
    if (threads_per_block & (threads_per_block - 1)) != 0:
        return "threads per block must be a power of 2 (required by the shared-memory tree reduction kernel)"
    return None
print(f" Memory: {num_elements * itemsize / (1024**2):.2f} MB") + print(f" Threads per block: {threads_per_block}") + + # Calculate number of blocks + # Each block processes threads_per_block * 2 elements + num_blocks = (num_elements + threads_per_block * 2 - 1) // (threads_per_block * 2) + + print("\nTwo-Stage Reduction Strategy:") + print(" Stage 1: GPU block reduction") + print(f" - Number of blocks: {num_blocks}") + print(f" - Elements per block: {threads_per_block * 2}") + print(f" - Output: {num_blocks} partial sums") + print(" Stage 2: CPU final reduction") + print(f" - Combine {num_blocks} partial sums -> 1 final result") + + # Compile kernel + print("\nCompiling CUDA kernel...") + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") + prog = Program(REDUCTION_KERNEL, code_type="c++", options=program_options) + mod = prog.compile("cubin") + kernel_name = f"blockReduceKernel_{datatype}" + kernel = mod.get_kernel(kernel_name) + print(f" Kernel '{kernel_name}' compiled successfully") + + # Generate input data + print("\n> Generating random input data...") + rng = np.random.default_rng(42) + if datatype == "int": + h_input = rng.integers(0, 256, size=num_elements, dtype=dtype) + else: + h_input = (rng.random(num_elements) * 256).astype(dtype) + + # cuda.core stream for launch/events; CuPy copies use the same stream via + # Stream.from_external. 
+ stream = device.create_stream() + cp_stream = cp.cuda.Stream.from_external(stream) + try: + d_blockSums = cp.empty(num_blocks, dtype=dtype) + with cp_stream: + d_input = cp.asarray(h_input, dtype=dtype) + stream.sync() + + # Compute CPU reference + print("> Computing reference result on CPU...") + cpu_start = time.perf_counter() + cpu_result = reduce_cpu(h_input) + cpu_time = time.perf_counter() - cpu_start + print(f" CPU time: {cpu_time:.6f} seconds") + + # Configure launch + shared_mem_bytes = threads_per_block * itemsize + config = LaunchConfig(grid=num_blocks, block=threads_per_block, shmem_size=shared_mem_bytes) + + # Warm-up + print("\n> Warming up GPU...") + launch( + stream, + config, + kernel, + d_input.data.ptr, + d_blockSums.data.ptr, + np.uint32(num_elements), + ) + stream.sync() + print(" Warm-up completed") + + # Benchmark Stage 1 (GPU) + print("\n> Benchmarking Stage 1 (GPU block reduction)...") + print(f" Running {test_iterations} iterations...") + + # cuda.core event elapsed time (end - start) is in milliseconds (CUDA API). + stage1_times_ms = [] + event_options = EventOptions(timing_enabled=True) + start_event = stream.device.create_event(options=event_options) + end_event = stream.device.create_event(options=event_options) + for _ in range(test_iterations): + stream.record(start_event) + launch( + stream, + config, + kernel, + d_input.data.ptr, + d_blockSums.data.ptr, + np.uint32(num_elements), + ) + stream.record(end_event) + end_event.sync() + stage1_times_ms.append(float(end_event - start_event)) + + avg_stage1_ms = np.mean(stage1_times_ms) + avg_stage1_s = avg_stage1_ms / 1000.0 + + # Stage 2 (CPU) + print("\n> Running Stage 2 (CPU final reduction)...") + # Device -> Host: after stream sync, partial sums are visible on host. 
+ stream.sync() + with cp_stream: + h_blockSums = cp.asnumpy(d_blockSums) + stage2_start = time.perf_counter() + gpu_result = float(np.sum(h_blockSums)) + stage2_time = time.perf_counter() - stage2_start + + total_time = avg_stage1_s + stage2_time + + # Performance metrics (use seconds for throughput; CPU times are in seconds) + bytes_processed = num_elements * itemsize + throughput = bytes_processed / avg_stage1_s / 1e9 + + print("\n" + "=" * 70) + print("Performance Results") + print("=" * 70) + print("\nStage 1 (GPU block reduction):") + print(f" Average time: {avg_stage1_ms:.6f} ms") + print(f" Throughput: {throughput:.2f} GB/s") + print("\nStage 2 (CPU final reduction):") + print(f" Time: {stage2_time * 1000:.6f} ms") + print(f" ({num_blocks} partial sums)") + print(f"\nTotal time: {total_time * 1000:.6f} ms") + print(f"Speedup vs CPU: {cpu_time / total_time:.2f}x") + + # Validation + print("\n> Validating results...") + if datatype == "int": + print(f" GPU result: {int(gpu_result):,}") + print(f" CPU result: {int(cpu_result):,}") + rtol, atol = 0.0, 0.0 + else: + precision = 8 if datatype == "float" else 12 + print(f" GPU result: {gpu_result:.{precision}f}") + print(f" CPU result: {cpu_result:.{precision}f}") + if datatype == "float": + rtol, atol = 1e-5, 1e-8 * num_elements + else: # double + rtol, atol = 1e-8, 1e-12 * num_elements + + success = verify_array_result( + np.array([gpu_result]), + np.array([cpu_result]), + rtol=rtol, + atol=atol, + verbose=True, + ) + + # Summary + print("\n" + "=" * 70) + print("Summary") + print("=" * 70) + print("Key optimizations:") + half_reads = num_elements // 2 + print(f" - Load 2 elements per thread: {half_reads:,} global reads (50% savings)") + print(" - Shared memory for reduction: ~10-20x faster than global memory") + print(f" - Parallel block outputs: {num_blocks} independent writes") + print(f"Result: {throughput:.2f} GB/s throughput") + + print("=" * 70) + if success: + print("Two-Stage Reduction completed 
successfully!") + else: + print("Two-Stage Reduction FAILED!") + print("=" * 70 + "\n") + + return 0 if success else 1 + finally: + stream.close() + + +def main(): + """Main entry point with argument parsing.""" + parser = argparse.ArgumentParser( + description="Two-Stage Reduction with Shared Memory", + epilog="See README.md for usage examples and detailed documentation.", + ) + + parser.add_argument( + "--n", + type=int, + default=1 << 24, + help="Number of elements to reduce (default: 16777216 = 2^24)", + ) + + parser.add_argument( + "--threads", + type=int, + default=256, + help="Threads per block, power of 2 in [1, 1024] (default: 256)", + ) + + parser.add_argument( + "--type", + type=str, + default="float", + choices=["int", "float", "double"], + help="Data type for reduction (default: float)", + ) + + parser.add_argument( + "--iterations", + type=int, + default=100, + help="Number of benchmark iterations (default: 100)", + ) + + args = parser.parse_args() + + # Validate arguments + if args.n <= 0: + print("Error: n must be positive") + return 1 + + err = _validate_threads_per_block(args.threads) + if err: + print(f"Error: {err}") + return 1 + + try: + exit_code = run( + num_elements=args.n, + threads_per_block=args.threads, + test_iterations=args.iterations, + datatype=args.type, + ) + sys.exit(exit_code) + except Exception as e: + print(f"\nError: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/samples/reduction/requirements.txt b/samples/reduction/requirements.txt new file mode 100644 index 00000000000..84a6b40aef6 --- /dev/null +++ b/samples/reduction/requirements.txt @@ -0,0 +1,8 @@ +# Python CUDA Reduction Sample Requirements +# Install with: pip install -r requirements.txt + +numpy>=2.3.2 +cuda-core>=1.0.0 +cuda-python>=13.0.0 +# Use cupy-cuda13x>=14.0.0 for cp.cuda.Stream.from_external(stream) +cupy-cuda13x>=14.0.0 diff --git a/samples/reductionMultiBlockCG/README.md 
b/samples/reductionMultiBlockCG/README.md new file mode 100644 index 00000000000..825e3506621 --- /dev/null +++ b/samples/reductionMultiBlockCG/README.md @@ -0,0 +1,140 @@ +# Sample: Single-Pass Multi-Block Reduction with Cooperative Groups (Python) + +## Description + +Single-kernel, two-stage reduction using **Cooperative Groups** and `grid.sync()` so all blocks synchronize inside one launch—no second kernel or CPU stage for the reduction tree. + +**Stack:** `cuda-core` (device, compile, cooperative `launch()`, stream, **CUDA events** for GPU timing). **CuPy** for H↔D copies on the same stream (`Stream.from_external(cuda.core_stream)`, `ndarray.data.ptr` to `launch()`). **`try`/`finally`** closes the stream even if the cooperative launch fails. Requires **compute capability 6.0+** (Pascal or newer). + +## What you will learn + +- `cooperative_groups::grid_group` and `grid.sync()` across the grid +- Cooperative `LaunchConfig(..., cooperative_launch=True)` and sizing blocks for residency +- Timing the GPU path with `EventOptions` / `stream.record()` / event elapsed time + +## Key libraries + +| Library | Role | +|---------|------| +| `cuda-core` | Device, stream, events, `Program` / `ProgramOptions`, cooperative `launch()` | +| `cupy` | `cp.empty`, `cp.asarray`, `cp.asnumpy`, `Stream.from_external` | +| `numpy` | Host data, reference sum, `default_rng` | + +## Requirements + +- NVIDIA GPU, **Pascal or newer**; **CUDA Toolkit 13+**; **Python 3.10+** +- NVRTC must see **`cooperative_groups.h`** and **CCCL** headers (`cuda/std/*`) + +```bash +pip install -r requirements.txt +``` + +Pick a CuPy wheel that matches your CUDA major version (e.g. `cupy-cuda13x` in `requirements.txt`). + +## How to run + +**`--cuda-include-dir` is required.** Multiple paths can be combined using the +OS path separator (`:` on Linux/macOS, `;` on Windows).
+ +Linux / macOS: + +```bash +python reductionMultiBlockCG.py \ + --cuda-include-dir /usr/local/cuda/include/cccl:/usr/local/cuda/include +``` + +Windows (PowerShell shown; in cmd use `%CUDA_PATH%` and `^` line continuation instead. Note the `;` separator and quotes around the +combined value): + +```powershell +python reductionMultiBlockCG.py ` + --cuda-include-dir "$env:CUDA_PATH\include;$env:CUDA_PATH\include\cccl" +``` + +**Jetson / split include trees:** pass every directory NVRTC needs in one `--cuda-include-dir` argument, e.g. +`/usr/local/cuda/include/cccl:/usr/local/cuda/targets/sbsa-linux/include` (adjust paths to your image). If headers are scattered, you can instead merge them into one tree with symlinks and point `--cuda-include-dir` at that folder. + +Defaults: **2²⁵** elements, threads = device max (capped at 1024), auto `--maxblocks`, **100** iterations. Other flags: `--n`, `--threads`, `--maxblocks`, `--iterations`. See **`python reductionMultiBlockCG.py --help`**. + +## Output + +``` +====================================================================== +Single-Pass Multi-Block Reduction with Cooperative Groups +====================================================================== + +Demonstrates: Multi-stage reduction in a single kernel using grid.sync() + +Device Information: + Name: NVIDIA Thor + Compute Capability: sm_11.0 + +Reduction Configuration: + Number of elements: 33,554,432 + Data size: 128.00 MB + +Compiling CUDA kernel... + Kernel compiled successfully + +Launch Configuration: + Threads per block: 1024 + Number of blocks: 20 + Total threads: 20,480 + Shared memory per block: 4096 bytes + Launch mode: Cooperative (grid-wide sync enabled) + +> Generating random input data... +> Computing reference result on CPU... + CPU time: 0.008903 seconds + +> Warming up GPU... + Warm-up successful + +> Running benchmark (100 iterations)... + +> Performance Results: + Average GPU time: 0.977166 ms + Throughput: 137.35 GB/s + Speedup vs CPU: 9.11x + +> Validating results...
+Test PASSED + +====================================================================== +Summary +====================================================================== + +Single-kernel two-stage reduction: + Stage 1: 20 blocks -> 20 partial sums + grid.sync() <- All blocks synchronize (KEY innovation) + Stage 2: Block 0 -> 1 final result + Total: 1 kernel launch, 137.35 GB/s + +Comparison: + • Traditional: 2 kernel launches or kernel + CPU + • This sample: 1 kernel with grid.sync() between stages + • Benefit: Eliminates ~5-20us launch overhead per stage + +====================================================================== +Single-Pass Multi-Block Reduction completed successfully! +====================================================================== +``` + +## Troubleshooting (short) + +- **Cooperative launch not supported / fails:** need sm_60+; reduce `--maxblocks` or `--threads` so all blocks can be resident. +- **Compile errors missing headers:** extend `--cuda-include-dir` with the path that contains CCCL / cooperative groups (see Jetson note above). +- **Low throughput:** often block count vs occupancy; try defaults first, then tune `--threads` / `--maxblocks`. + +## Related samples + +**blockArraySum** (atomics + grid-stride) → **reduction** (two-stage shared memory) → **this sample** (single kernel + `grid.sync()`). + +## Further reading + +- [CUDA Cooperative Groups](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cooperative-groups) +- [Reduction whitepaper (PDF)](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) + +## Files + +`reductionMultiBlockCG.py` · `requirements.txt` · `README.md` diff --git a/samples/reductionMultiBlockCG/reductionMultiBlockCG.py b/samples/reductionMultiBlockCG/reductionMultiBlockCG.py new file mode 100644 index 00000000000..03e79f951d7 --- /dev/null +++ b/samples/reductionMultiBlockCG/reductionMultiBlockCG.py @@ -0,0 +1,465 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# distribution and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["numpy>=2.3.2", "cuda-core>=1.0.0", "cuda-python>=13.0.0", "cuda-cccl>=1.0.0", "cupy-cuda13x>=14.0.0"] +# /// + +""" +Single-Pass Multi-Block Reduction with Cooperative Groups + +Demonstrates single-kernel multi-stage reduction using grid-wide +synchronization. Traditional reduction needs multiple kernel launches, +but with grid.sync() from Cooperative Groups, we can complete all +stages in ONE kernel. 
+ +Key Features: +- Grid-wide synchronization with grid.sync() +- Two-stage reduction in a single kernel (no atomic operations) +- Requires compute capability 6.0+ and cooperative launch +- Achieves 400-700 GB/s on modern GPUs + +How it differs from other samples: +- blockArraySum.py: Basic thread/block indexing + atomicAdd +- reduction.py: High-performance shared memory, two-kernel approach +- This sample: Single-kernel multi-stage with grid.sync() + +Transfers use CuPy on the same CUDA stream as ``launch()`` (``Stream.from_external``), +not ``cuda.bindings.driver`` memcpy. GPU timing uses CUDA events. +""" + +import argparse +import os +import sys +import time + +try: + import cupy as cp + import numpy as np + + from cuda.core import ( + Device, + EventOptions, + LaunchConfig, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install dependencies:") + print(" pip install -r requirements.txt") + sys.exit(1) + +# Import utilities +utilities_path = os.path.join(os.path.dirname(__file__), "..", "Utilities") +sys.path.insert(0, utilities_path) +from cuda_samples_utils import verify_array_result + + +def _validate_threads_arg(threads): + if threads is None: + return None + if threads <= 0 or threads > 1024: + return "threads must be between 1 and 1024" + if (threads & (threads - 1)) != 0: + return "threads must be a power of 2 (required by the shared-memory tree reduction kernel)" + return None + + +# Single-pass multi-block reduction kernel with grid-wide sync +REDUCTION_KERNEL = """ +/* + * Single-Kernel Multi-Stage Reduction using grid.sync() + * + * Strategy: + * Stage 1: Each block reduces its portion → partial sum + * grid.sync() ← KEY: All blocks synchronize + * Stage 2: Block 0 reduces all partial sums → final result + * + * Key feature: grid.sync() enables multi-stage within ONE kernel + */ + +#include + +namespace cg = cooperative_groups; +extern "C" __global__ void 
reduceSinglePassMultiBlockCG( + const float *__restrict__ g_idata, + float *__restrict__ g_odata, + unsigned int n) +{ + cg::thread_block cta = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); + extern __shared__ float sdata[]; + + unsigned int tid = threadIdx.x; + unsigned int blockSize = blockDim.x; + + // Stage 1: Grid-stride loop + block reduction + float sum = 0.0f; + for (unsigned int i = grid.thread_rank(); i < n; i += grid.size()) { + sum += g_idata[i]; + } + + sdata[tid] = sum; + cg::sync(cta); + + // Block reduction (sequential addressing) + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + cg::sync(cta); + } + + if (tid == 0) { + g_odata[blockIdx.x] = sdata[0]; + } + + // KEY: Grid-wide synchronization (all blocks wait here) + grid.sync(); + + // Stage 2: Block 0 reduces all partial sums → final result + // Use a stride loop so all gridDim.x partial sums are covered even + // when gridDim.x > blockDim.x. + if (blockIdx.x == 0) { + // mySum stays 0.0f when tid >= gridDim.x (loop never executes), + // implicitly zero-filling sdata for threads beyond the partial-sum count. + float mySum = 0.0f; + for (unsigned int i = tid; i < gridDim.x; i += blockSize) { + mySum += g_odata[i]; + } + sdata[tid] = mySum; + cg::sync(cta); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + cg::sync(cta); + } + + if (tid == 0) { + g_odata[0] = sdata[0]; + } + } +} +""" + + +def get_max_cooperative_blocks(device, _kernel, threads_per_block, _shared_mem_bytes): + """ + Calculate max blocks for cooperative launch (all must be resident). + + This is a conservative estimate that ignores kernel/shared-memory limits; + for precise tuning, use cudaOccupancyMaxActiveBlocksPerMultiprocessor. + The kernel and shared_mem_bytes parameters are accepted (and underscore- + prefixed) so the call signature mirrors that helper for readability. 
+ """ + # Get device properties + prop = device.properties + + # Calculate maximum blocks per SM + # Note: We use cudaOccupancyMaxActiveBlocksPerMultiprocessor functionality + # For simplicity in Python, we'll use a conservative estimate + num_sms = prop.multiprocessor_count + max_threads_per_sm = prop.max_threads_per_multiprocessor + max_blocks_per_sm = max_threads_per_sm // threads_per_block + + # Total blocks = blocks per SM × number of SMs + max_blocks = max_blocks_per_sm * num_sms + + # Also respect max_grid_dim_x + max_blocks = min(max_blocks, prop.max_grid_dim_x) + + return max_blocks + + +def run( + num_elements=1 << 25, + max_threads=None, + max_blocks=None, + test_iterations=100, + cuda_include_dir=None, +): + """Run single-pass multi-block reduction benchmark.""" + + if cuda_include_dir is None: + raise ValueError("cuda_include_dir is required") + + print("\n" + "=" * 70) + print("Single-Pass Multi-Block Reduction with Cooperative Groups") + print("=" * 70) + msg = "Multi-stage reduction in a single kernel using grid.sync()" + print(f"\nDemonstrates: {msg}") + + # Initialize device + device = Device() + device.set_current() + major, minor = device.compute_capability + + print("\nDevice Information:") + print(f" Name: {device.name}") + print(f" Compute Capability: sm_{major}.{minor}") + + # Get device properties for configuration + prop = device.properties + + # Determine threads per block + if max_threads is None: + max_threads = prop.max_threads_per_block + threads_per_block = min(max_threads, 1024) + + # Define data type and itemsize + itemsize = np.dtype(np.float32).itemsize + + print("\nReduction Configuration:") + print(f" Number of elements: {num_elements:,}") + print(f" Data size: {num_elements * itemsize / (1024**2):.2f} MB") + + # Compile kernel + print("\nCompiling CUDA kernel...") + # Support multiple include paths separated by the OS path separator + # (':' on POSIX, ';' on Windows). 
os.pathsep avoids splitting Windows + # drive prefixes like "C:\..." by accident. + include_paths = cuda_include_dir.split(os.pathsep) + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}", include_path=include_paths) + prog = Program(REDUCTION_KERNEL, code_type="c++", options=program_options) + mod = prog.compile("cubin") + kernel = mod.get_kernel("reduceSinglePassMultiBlockCG") + print(" Kernel compiled successfully") + + # Calculate blocks for cooperative launch + shared_mem_bytes = threads_per_block * itemsize + + if max_blocks is None: + max_blocks = get_max_cooperative_blocks(device, kernel, threads_per_block, shared_mem_bytes) + + # Calculate optimal blocks (all must be resident) + num_blocks = min(max_blocks, (num_elements + threads_per_block - 1) // threads_per_block) + + print("\nLaunch Configuration:") + print(f" Threads per block: {threads_per_block}") + print(f" Number of blocks: {num_blocks}") + print(f" Total threads: {num_blocks * threads_per_block:,}") + print(f" Shared memory per block: {shared_mem_bytes} bytes") + print(" Launch mode: Cooperative (grid-wide sync enabled)") + + # Generate random input data + print("\n> Generating random input data...") + rng = np.random.default_rng(42) + h_idata = (rng.random(num_elements) * 256).astype(np.float32) + + stream = device.create_stream() + cp_stream = cp.cuda.Stream.from_external(stream) + try: + d_odata = cp.empty(num_blocks, dtype=np.float32) + with cp_stream: + d_idata = cp.asarray(h_idata, dtype=np.float32) + stream.sync() + + # Compute CPU reference + print("> Computing reference result on CPU...") + cpu_start = time.perf_counter() + cpu_result = float(np.sum(h_idata)) + cpu_time = time.perf_counter() - cpu_start + print(f" CPU time: {cpu_time:.6f} seconds") + + # Warm-up + print("\n> Warming up GPU...") + + launch_config = LaunchConfig( + grid=(num_blocks, 1, 1), + block=(threads_per_block, 1, 1), + shmem_size=shared_mem_bytes, + is_cooperative=True, + ) + + n_u32 = 
np.uint32(num_elements) + ptr_in = d_idata.data.ptr + ptr_out = d_odata.data.ptr + + try: + launch(stream, launch_config, kernel, ptr_in, ptr_out, n_u32) + except Exception as e: + print(f" Cooperative launch failed: {e}") + return 1 + + stream.sync() + print(" Warm-up successful") + + # Benchmark (CUDA events — not host wall clock around the whole loop) + print(f"\n> Running benchmark ({test_iterations} iterations)...") + event_options = EventOptions(timing_enabled=True) + start_event = stream.device.create_event(options=event_options) + end_event = stream.device.create_event(options=event_options) + # cuda.core event elapsed time (end - start) is in milliseconds (CUDA API). + gpu_times_ms = [] + for _ in range(test_iterations): + try: + stream.record(start_event) + launch(stream, launch_config, kernel, ptr_in, ptr_out, n_u32) + stream.record(end_event) + end_event.sync() + gpu_times_ms.append(float(end_event - start_event)) + except Exception as e: + print(f"Benchmark iteration failed: {e}") + return 1 + + avg_gpu_ms = float(np.mean(gpu_times_ms)) + avg_gpu_s = avg_gpu_ms / 1000.0 + + stream.sync() + with cp_stream: + h_result = cp.asnumpy(d_odata[:1]) + gpu_result = float(h_result[0]) + + # Performance metrics use seconds for throughput and speedup. + # CPU time is already in seconds. 
+ bytes_processed = num_elements * 4 + throughput_gb_s = bytes_processed / avg_gpu_s / 1e9 + + print("\n> Performance Results:") + print(f" Average GPU time: {avg_gpu_ms:.6f} ms") + print(f" Throughput: {throughput_gb_s:.2f} GB/s") + print(f" Speedup vs CPU: {cpu_time / avg_gpu_s:.2f}x") + + # Validate results + print("\n> Validating results...") + success = verify_array_result( + np.array([gpu_result]), + np.array([cpu_result]), + rtol=1e-5, + atol=1e-5, + ) + + # Summary + print("\n" + "=" * 70) + print("Summary") + print("=" * 70) + print(f""" +Single-kernel two-stage reduction: + Stage 1: {num_blocks} blocks -> {num_blocks} partial sums + grid.sync() <- All blocks synchronize (KEY innovation) + Stage 2: Block 0 -> 1 final result + Total: 1 kernel launch, {throughput_gb_s:.2f} GB/s + +Comparison: + • Traditional: 2 kernel launches or kernel + CPU + • This sample: 1 kernel with grid.sync() between stages + • Benefit: Eliminates ~5-20us launch overhead per stage + """) + + print("=" * 70) + if success: + print("Single-Pass Multi-Block Reduction completed successfully!") + else: + print("Single-Pass Multi-Block Reduction FAILED!") + print("=" * 70 + "\n") + + return 0 if success else 1 + finally: + stream.close() + + +def main(): + """Main entry point with argument parsing.""" + parser = argparse.ArgumentParser(description="Single-Pass Multi-Block Reduction with Cooperative Groups") + + parser.add_argument( + "--n", + type=int, + default=1 << 25, + help="Number of elements to reduce (default: 33554432 = 2^25)", + ) + + parser.add_argument( + "--threads", + type=int, + default=None, + help=("Threads per block, power of 2 in [1, 1024]; default: device maximum (typically 1024)"), + ) + + parser.add_argument( + "--maxblocks", + type=int, + default=None, + help=("Maximum number of blocks (default: auto-calculated for cooperative launch)"), + ) + + parser.add_argument( + "--iterations", + type=int, + default=100, + help="Number of benchmark iterations (default: 100)", + 
) + + parser.add_argument( + "--cuda-include-dir", + type=str, + required=True, + help=( + "CUDA include directory for NVRTC. " + "Use os.pathsep to separate multiple paths " + "(':' on POSIX, ';' on Windows)." + ), + ) + + args = parser.parse_args() + + # Validate arguments + if args.n <= 0: + print("Error: n must be positive") + return 1 + + err_threads = _validate_threads_arg(args.threads) + if err_threads: + print(f"Error: {err_threads}") + return 1 + + if args.maxblocks is not None and args.maxblocks <= 0: + print("Error: maxblocks must be positive") + return 1 + + try: + exit_code = run( + num_elements=args.n, + max_threads=args.threads, + max_blocks=args.maxblocks, + test_iterations=args.iterations, + cuda_include_dir=args.cuda_include_dir, + ) + sys.exit(exit_code) + except Exception as e: + print(f"\nError: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/samples/reductionMultiBlockCG/requirements.txt b/samples/reductionMultiBlockCG/requirements.txt new file mode 100644 index 00000000000..6d8c89fed04 --- /dev/null +++ b/samples/reductionMultiBlockCG/requirements.txt @@ -0,0 +1,10 @@ +# Python Multi-Block Cooperative Reduction Sample Requirements +# Install with: pip install -r requirements.txt + +numpy>=2.3.2 +cuda-core>=1.0.0 +cuda-python>=13.0.0 +# Headers for NVRTC: cooperative_groups.h includes cuda/std/* (CCCL) +cuda-cccl>=1.0.0 +# Use cupy-cuda13x>=14.0.0 for cp.cuda.Stream.from_external(stream) +cupy-cuda13x>=14.0.0 diff --git a/samples/simpleP2P/README.md b/samples/simpleP2P/README.md new file mode 100644 index 00000000000..774109073c4 --- /dev/null +++ b/samples/simpleP2P/README.md @@ -0,0 +1,190 @@ +# Sample: simpleP2P (Python) + +## Description + +This sample demonstrates peer-to-peer (P2P) memory access between multiple GPUs in CUDA using the cuda.core Python library. 
P2P allows GPUs to directly access each other's memory without routing data through the host (CPU), enabling efficient multi-GPU applications. This sample detects P2P-capable GPUs, enables peer access, measures bandwidth using CUDA events for accurate GPU-side timing, and launches kernels (using grid-stride loops) that read from one GPU's memory and write to another GPU's memory. + +## What you will learn + +- How to detect multiple CUDA-capable GPUs using `system.get_num_devices()` and `Device(id)` +- How to check P2P capability between GPU pairs using `device.can_access_peer()` +- How to enable and disable peer access using `DeviceMemoryResource.peer_accessible_by` +- How to allocate device memory on specific GPUs using `DeviceMemoryResource` +- How to perform direct GPU-to-GPU memory transfers with explicit event-based synchronization +- How to measure P2P bandwidth using CUDA events for accurate GPU-side timing +- How to use event-based synchronization between streams for sequential bandwidth measurement +- How to launch kernels on one GPU that access memory from another GPU +- How to compile and launch CUDA kernels using cuda.core's `Program` and `launch` APIs with grid-stride loops +- How to validate multi-GPU computation results +- How to properly clean up resources using try/finally blocks + +## Key libraries + +- `numpy` - CPU array operations and data initialization +- `cuda-core` - Modern Python interface to CUDA runtime with full P2P support + +## Key APIs + +**From cuda.core:** +- `system` – Pre-instantiated singleton for system-level CUDA information +- `system.get_num_devices()` – Get number of CUDA-capable devices +- `Device(id)` – Get specific CUDA device handle +- `device.can_access_peer(peer)` – Check if this device can access peer device memory +- `device.set_current()` – Set active device for subsequent operations +- `device.create_stream()` – Create CUDA stream for kernel execution +- `DeviceMemoryResource(device)` – Create memory resource for 
specific GPU +- `memory_resource.peer_accessible_by` – Get/set which devices can access this memory pool's allocations + - Example: `mr.peer_accessible_by = [1]` grants device 1 access + - Example: `mr.peer_accessible_by = []` revokes all access +- `PinnedMemoryResource()` – Allocate pinned (page-locked) host memory +- `EventOptions(enable_timing=True)` – Create options for CUDA events with timing enabled +- `stream.record(options=event_options)` – Record a CUDA event on a stream +- `event.elapsed_time(start_event)` – Get elapsed time in milliseconds between two events +- `stream.wait_event(event)` – Make a stream wait for an event to complete +- `stream.close()` – Clean up stream resources +- `Program()` – Compile CUDA C++ kernel code +- `LaunchConfig()` – Configure kernel launch parameters (grid, block) +- `launch()` – Launch compiled kernel with arguments +- `buffer.copy_from(src, stream=stream)` – Copy data from source buffer asynchronously +- `buffer.copy_to(dst, stream=stream)` – Copy data to destination buffer asynchronously + +**From DLPack:** +- `numpy.from_dlpack()` – Create NumPy array view of memory buffer + +**Memory Management:** +- Resources (streams, buffers) should be cleaned up using try/finally blocks to ensure proper cleanup even if errors occur +- Streams should be explicitly closed with `stream.close()` in finally blocks + +## Peer-to-Peer (P2P): When to Use + +### Benefits +- **Direct GPU-to-GPU transfers**: Bypass host memory for faster communication +- **Higher bandwidth**: PCIe or NVLink bandwidth between GPUs (up to 600 GB/s with NVLink) +- **Lower latency**: No CPU involvement in data transfers +- **Efficient multi-GPU**: Essential for scaling deep learning, HPC, and simulation workloads +- **Simplified programming**: Kernels can directly access remote GPU memory + +### Requirements +- **Two or more GPUs**: System must have multiple CUDA-capable GPUs +- **P2P support**: GPUs must be P2P-capable (check with `can_access_peer()`) +- **PCIe 
topology**: Usually requires GPUs on same PCIe root complex +- **Platform support**: Not available on macOS, limited on ARM platforms + +### Best Use Cases +1. Multi-GPU deep learning training (model parallelism, data parallelism) +2. Large-scale scientific simulations across multiple GPUs +3. Real-time rendering with multiple GPUs +4. GPU clusters with direct GPU communication +5. Reducing CPU-GPU traffic in multi-GPU systems + +## Requirements + +1. **Two or more NVIDIA Graphics Cards** with CUDA support and P2P capability +2. **CUDA Drivers** installed on your system +3. **CUDA Toolkit 13.0+** installed on your system +4. **Python 3.10 or newer** +5. **Proper PCIe topology** (GPUs should be on same PCIe root complex for best performance) + +**Note**: This sample will gracefully exit if fewer than 2 GPUs are detected or if P2P is not supported between any GPU pair. + +**Install packages:** +```bash +pip install -r requirements.txt +``` + +Or manually: +```bash +pip install "numpy>=2.3.2" "cuda-core>=1.0.0" "cuda-python>=13.0.0" +``` + +## How to run + +Basic usage: +```bash + +# Run with default parameters (16M elements = 64MB) +python simpleP2P.py +``` + +With custom parameters: +```bash +# Use 32M elements (128MB) +python simpleP2P.py --num_elements 33554432 + +# Show help +python simpleP2P.py --help +``` + +### Command line arguments + +- `--num_elements`: Number of elements in arrays (default: 16777216) + - Each array uses `num_elements * 4 bytes` (float32) + - Default: 64 MB per array + - Sample allocates 2 device buffers + 1 host buffer + +## Expected Output + +``` +====================================================================== +simpleP2P - CUDA Python Sample +====================================================================== + +Starting... + +Checking for multiple GPUs... +CUDA-capable device count: 2 + +Checking GPU(s) for support of peer to peer memory access...
+> Peer access from Tesla T10 (GPU0) -> Tesla T10 (GPU1): Yes +> Peer access from Tesla T10 (GPU1) -> Tesla T10 (GPU0): Yes + +Using GPU0 (Tesla T10) and GPU1 (Tesla T10) + +Allocating buffers (64MB on GPU0, GPU1 and CPU Host)... + Peer access enabled: GPU0 <-> GPU1 + Peer access status: MR0 accessible by (1,), MR1 accessible by (0,) + Memory allocated successfully + +Measuring P2P bandwidth... + Performing 100 ping-pong copies between GPUs... + P2P bandwidth: 12.37 GB/s + +Preparing host buffer and memcpy to GPU0... + Data initialized and copied to GPU + +Compiling CUDA kernel... + Kernels compiled successfully + +Run kernel on GPU1, taking source data from GPU0 and writing to GPU1... + Kernel execution complete + +Run kernel on GPU0, taking source data from GPU1 and writing to GPU0... + Kernel execution complete + +Copy data back to host from GPU0 and verify results... + +Checking results... + Comparing 16,777,216 elements... +Test PASSED + [PASS] Validation PASSED + +Disabling peer access... + Peer access revoked: MR0 accessible by (), MR1 accessible by () + +====================================================================== +simpleP2P completed successfully! +====================================================================== + +Shutting down... 
+``` + +**Note**: P2P bandwidth varies based on: +- PCIe generation +- NVLink +- System topology and configuration + +## Files + +- `simpleP2P.py` – Main Python implementation +- `README.md` – This file +- `requirements.txt` – Python package dependencies diff --git a/samples/simpleP2P/requirements.txt b/samples/simpleP2P/requirements.txt new file mode 100644 index 00000000000..4626ed5dffa --- /dev/null +++ b/samples/simpleP2P/requirements.txt @@ -0,0 +1,6 @@ +# simpleP2P - Requirements +# Install with: pip install -r requirements.txt + +numpy>=2.3.2 +cuda-python>=13.0.0 +cuda-core>=1.0.0 diff --git a/samples/simpleP2P/simpleP2P.py b/samples/simpleP2P/simpleP2P.py new file mode 100644 index 00000000000..42ea028f04d --- /dev/null +++ b/samples/simpleP2P/simpleP2P.py @@ -0,0 +1,358 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# distribution and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["numpy>=2.3.2", "cuda-python>=13.0.0", "cuda-core>=1.0.0"] +# /// + +import argparse +import sys +from pathlib import Path + +try: + import numpy as np + + from cuda.core import ( + Device, + DeviceMemoryResource, + EventOptions, + LaunchConfig, + PinnedMemoryResource, + Program, + ProgramOptions, + launch, + system, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + +# Add parent directory to path to import utilities +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import verify_array_result + +# CUDA kernel for simple P2P operation +SIMPLE_P2P_KERNEL = """ +extern "C" __global__ +void SimpleKernel(float *src, float *dst, int N) { + // Grid-stride loop pattern for canonical CUDA kernel + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + + for (size_t i = tid; i < N; i += stride) { + dst[i] = src[i] * 2.0f; + } +} +""" + + +def run(num_elements=1024 * 1024 * 16): + """ + Demonstrates peer-to-peer (P2P) memory access between multiple GPUs using cuda.core. + + This function shows how to: + 1. Detect and verify multiple GPUs with P2P capability + 2. Enable peer access between GPUs + 3. Perform direct GPU-to-GPU memory transfers + 4. 
Launch kernels that access memory from other GPUs + 5. Measure P2P bandwidth + 6. Validate results + + Parameters + ---------- + num_elements : int + Number of elements in arrays (default: 16M elements = 64MB) + """ + + print("\n" + "=" * 70) + print("simpleP2P - CUDA Python Sample") + print("=" * 70) + print("\nStarting...") + + # Check for multiple GPUs + print("\nChecking for multiple GPUs...") + num_devices = system.get_num_devices() + print(f"CUDA-capable device count: {num_devices}") + + if num_devices < 2: + print("Two or more GPUs with Peer-to-Peer access capability are required, waiving this sample.") + return 2 + + # Get device properties + devices = [Device(i) for i in range(num_devices)] + + # Check for P2P capability + print("\nChecking GPU(s) for support of peer to peer memory access...") + + p2p_capable_gpus = [-1, -1] + + for i in range(num_devices): + p2p_capable_gpus[0] = i + for j in range(num_devices): + if i == j: + continue + + # Check peer access capability using cuda.core + i_access_j = devices[i].can_access_peer(devices[j]) + j_access_i = devices[j].can_access_peer(devices[i]) + + print( + f"> Peer access from {devices[i].name} (GPU{i}) -> " + f"{devices[j].name} (GPU{j}): {'Yes' if i_access_j else 'No'}" + ) + print( + f"> Peer access from {devices[j].name} (GPU{j}) -> " + f"{devices[i].name} (GPU{i}): {'Yes' if j_access_i else 'No'}" + ) + + if i_access_j and j_access_i: + p2p_capable_gpus[1] = j + break + + if p2p_capable_gpus[1] != -1: + break + + if p2p_capable_gpus[0] == -1 or p2p_capable_gpus[1] == -1: + print("\nTwo or more GPUs with Peer-to-Peer access capability are required.") + print("Peer to Peer access is not available amongst GPUs in the system, waiving test.") + return 2 + + # Use first pair of P2P capable GPUs detected + gpuid = [p2p_capable_gpus[0], p2p_capable_gpus[1]] + dev0 = devices[gpuid[0]] + dev1 = devices[gpuid[1]] + + print(f"\nUsing GPU{gpuid[0]} ({dev0.name}) and GPU{gpuid[1]} ({dev1.name})") + + # Allocate 
buffers with P2P access + buf_size = num_elements * np.dtype(np.float32).itemsize + print(f"\nAllocating buffers ({int(buf_size / 1024 / 1024)}MB on GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)...") + + # Allocate on GPU 0 and grant access to GPU 1 + dev0.set_current() + mr0 = DeviceMemoryResource(dev0) + mr0.peer_accessible_by = [gpuid[1]] # Grant GPU 1 access to GPU 0's memory + g0 = mr0.allocate(buf_size, stream=dev0.default_stream) + + # Allocate on GPU 1 and grant access to GPU 0 + dev1.set_current() + mr1 = DeviceMemoryResource(dev1) + mr1.peer_accessible_by = [gpuid[0]] # Grant GPU 0 access to GPU 1's memory + g1 = mr1.allocate(buf_size, stream=dev1.default_stream) + + print(f" Peer access enabled: GPU{gpuid[0]} <-> GPU{gpuid[1]}") + print( + f" Peer access status: MR0 accessible by {mr0.peer_accessible_by}, MR1 accessible by {mr1.peer_accessible_by}" + ) + + # Allocate pinned host memory + pinned_mr = PinnedMemoryResource() + h0 = pinned_mr.allocate(buf_size, stream=dev0.default_stream) + + print(" Memory allocated successfully") + + # Create streams + stream0 = dev0.create_stream() + stream1 = dev1.create_stream() + + try: + # P2P bandwidth test using CUDA events for accurate GPU-side timing + print("\nMeasuring P2P bandwidth...") + print(" Performing 100 ping-pong copies between GPUs...") + + event_options = EventOptions(timing_enabled=True) + sync_event0 = None + sync_event1 = None + + # Record start event on stream0 + start_event = stream0.record(options=event_options) + + for i in range(100): + # Ping-pong copy between GPUs with explicit event-based synchronization + if i % 2 == 0: + # Wait for previous stream1 copy to complete (if any) + if sync_event1 is not None: + stream0.wait(sync_event1) + # Copy g0 -> g1 on stream0 + g1.copy_from(g0, stream=stream0) + # Record event on stream0 to signal completion of this copy + sync_event0 = stream0.record(options=EventOptions(timing_enabled=False)) + else: + # Wait for previous stream0 copy to complete + if 
sync_event0 is not None: + stream1.wait(sync_event0) + # Copy g1 -> g0 on stream1 + g0.copy_from(g1, stream=stream1) + # Record event on stream1 to signal completion of this copy + sync_event1 = stream1.record(options=EventOptions(timing_enabled=False)) + + # Wait for last stream1 copy to complete + if sync_event1 is not None: + stream0.wait(sync_event1) + + # Record end event on stream0 after all copies have been enqueued + end_event = stream0.record(options=event_options) + end_event.sync() + + # Elapsed time in milliseconds (using subtraction operator) + time_memcpy = end_event - start_event + + bandwidth = (1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / (1024.0**3) + print(f" P2P bandwidth: {bandwidth:.2f} GB/s") + + # Prepare host buffer and initialize data + print(f"\nPreparing host buffer and memcpy to GPU{gpuid[0]}...") + + # Create numpy view and initialize + h0_array = np.from_dlpack(h0).view(dtype=np.float32) + h0_array[:] = (np.arange(num_elements, dtype=np.float32) % 4096).astype(np.float32) + + # Copy to GPU 0 + dev0.set_current() + g0.copy_from(h0, stream=stream0) + stream0.sync() + + print(" Data initialized and copied to GPU") + + # Compile kernel for both GPUs + print("\nCompiling CUDA kernel...") + dev0.set_current() + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}") + prog = Program(SIMPLE_P2P_KERNEL, code_type="c++", options=program_options) + mod0 = prog.compile("cubin") + kernel0 = mod0.get_kernel("SimpleKernel") + + dev1.set_current() + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}") + prog = Program(SIMPLE_P2P_KERNEL, code_type="c++", options=program_options) + mod1 = prog.compile("cubin") + kernel1 = mod1.get_kernel("SimpleKernel") + + print(" Kernels compiled successfully") + + # Launch configuration + threads = 512 + blocks = (num_elements + threads - 1) // threads + config = LaunchConfig(grid=blocks, block=threads) + + # Run kernel on GPU 1, reading from GPU 0, writing to GPU 1 + 
print(f"\nRun kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...") + dev1.set_current() + launch(stream1, config, kernel1, g0, g1, np.int32(num_elements)) + stream1.sync() + print(" Kernel execution complete") + + # Run kernel on GPU 0, reading from GPU 1, writing to GPU 0 + print(f"\nRun kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...") + dev0.set_current() + launch(stream0, config, kernel0, g1, g0, np.int32(num_elements)) + stream0.sync() + print(" Kernel execution complete") + + # Copy data back to host and verify + print(f"\nCopy data back to host from GPU{gpuid[0]} and verify results...") + g0.copy_to(h0, stream=stream0) + stream0.sync() + + # Verify results + print("\nChecking results...") + print(f" Comparing {num_elements:,} elements...") + + # Input data goes through two kernels, each multiplying by 2.0. + expected = (np.arange(num_elements, dtype=np.float32) % 4096) * 4.0 + + # Use utility function for verification (handles both numpy and cupy arrays) + if verify_array_result(h0_array, expected, rtol=1e-5, atol=1e-6, verbose=True): + print(" [PASS] Validation PASSED") + success = True + else: + print(" [FAIL] Validation FAILED") + # Show first few errors for debugging + errors = np.where(~np.isclose(h0_array, expected, rtol=1e-5, atol=1e-6))[0] + print(f" Number of mismatches: {len(errors)}") + for idx in errors[:10]: + print(f" Error @ element {idx}: got {h0_array[idx]}, expected {expected[idx]}") + success = False + + # Disable peer access + print("\nDisabling peer access...") + mr0.peer_accessible_by = [] # Revoke GPU 1's access to GPU 0's memory + mr1.peer_accessible_by = [] # Revoke GPU 0's access to GPU 1's memory + print( + f" Peer access revoked: MR0 accessible by {mr0.peer_accessible_by}, " + f"MR1 accessible by {mr1.peer_accessible_by}" + ) + + print("\n" + "=" * 70) + if success: + print("simpleP2P completed successfully!") + else: + print("simpleP2P 
FAILED!") + print("=" * 70 + "\n") + + return 0 if success else 1 + finally: + # Cleanup streams and buffers + print("Shutting down...") + stream0.close() + stream1.close() + + +def main(): + """Main entry point with argument parsing.""" + parser = argparse.ArgumentParser( + description=("Demonstrate peer-to-peer (P2P) memory access between multiple GPUs with CUDA") + ) + + parser.add_argument( + "--num_elements", + type=int, + default=1024 * 1024 * 16, # 16M elements = 64MB + help="Number of elements in arrays (default: 16777216 = 64MB)", + ) + + args = parser.parse_args() + + # Validate arguments + if args.num_elements <= 0: + print("Error: num_elements must be positive") + return 1 + + try: + exit_code = run(num_elements=args.num_elements) + sys.exit(exit_code) + except Exception as e: + print(f"\nError: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/simplePrint/README.md b/samples/simplePrint/README.md new file mode 100644 index 00000000000..350694b06da --- /dev/null +++ b/samples/simplePrint/README.md @@ -0,0 +1,263 @@ +# simplePrint - Printing from CUDA Kernels + +## Description + +This sample demonstrates how to use `printf()` inside CUDA kernels using **two different approaches**: + +1. **CUDA C++ kernels** compiled with `cuda.core.Program` - Full C++ features and control +2. **Numba CUDA kernels** - Pythonic kernel authoring using `numba.cuda.grid()` for modern indexing + +The sample shows basic device management, kernel compilation with inline CUDA C++ code, and multi-dimensional kernel launches (2D grid × 3D blocks) using modern CUDA Python. The Numba example demonstrates the recommended `numba.cuda.grid()` indexing style while also showing how it relates to classic CUDA C++ block/thread IDs. Both approaches use `cuda.core` APIs for stream management and synchronization, demonstrating interoperability.
+ +This is the Python equivalent of the C++ `simplePrintf` sample, enhanced with Numba CUDA examples. + +## Key Concepts + +CUDA Python (cuda.core), Numba CUDA, Kernel Compilation, Printf in Kernels, Multi-dimensional Launch, Pythonic GPU Programming, Modern Thread Indexing (grid()), Stream-based Execution, cuda.core/Numba Interoperability + +## CUDA APIs involved + +### [cuda.core (cuda-python)](https://nvidia.github.io/cuda-python/) + +- `Device()` - Device management +- `Device.create_stream()` - Create CUDA streams +- `Stream.sync()` - Synchronize stream execution +- `Program()` - Compile CUDA C++ kernels +- `LaunchConfig()` - Configure kernel launch +- `launch()` - Execute kernels on streams + +### [Numba CUDA](https://nvidia.github.io/numba-cuda/) + +- `@cuda.jit` - JIT compile Python functions to CUDA kernels +- `cuda.grid()` - Get global thread position (recommended modern approach) +- `cuda.blockIdx`, `cuda.threadIdx` - Thread/block indices (classic style) +- `cuda.gridDim`, `cuda.blockDim` - Grid/block dimensions +- **Note:** Uses `cuda.core` APIs for stream management (interoperability) + +### CUDA Kernel Functions + +- `printf()` - Print from device code (C++) +- `print()` - Print from device code (Numba, limited formatting) +- `blockIdx`, `threadIdx` - Thread/block indices +- `gridDim`, `blockDim` - Grid/block dimensions + +### What You Learn + +- Device initialization with `cuda.core.Device` +- Compiling CUDA C++ kernels with `Program` and `ProgramOptions` +- Writing Pythonic CUDA kernels with Numba's `@cuda.jit` decorator +- Using `numba.cuda.grid()` for modern thread indexing (recommended approach) +- Understanding the relationship between global coordinates and classic block/thread IDs +- **Interoperability**: Using `cuda.core` streams with Numba CUDA kernels +- Comparing CUDA C++ vs Pythonic kernel authoring approaches +- Multi-dimensional kernel launches (2D grid, 3D blocks) +- Using streams for kernel execution and synchronization +- Using 
`printf()` and `print()` in GPU kernels for debugging +- Understanding print limitations in Numba CUDA (no f-strings) +- Proper error handling and resource management + +## Requirements + +### Hardware: + +- NVIDIA GPU with Compute Capability 7.0 or higher +- Minimum GPU memory: 512 MB + +### Software: + +- CUDA Toolkit 13.0 or newer +- Python 3.10 or newer +- `cuda-python` package (13.0+) +- `cuda-core` package (>=1.0.0) +- `numba-cuda` package (0.29.0+, for Pythonic kernel authoring) + +Download and install: +- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) +- [cuda-python package](https://nvidia.github.io/cuda-python/): `pip install cuda-python` +- [numba-cuda](https://nvidia.github.io/numba-cuda/): `pip install numba-cuda` + +## Build and Run + +```bash +# Install dependencies +pip install -r requirements.txt + +# Run the sample +python simplePrint.py +``` + +## Expected Output + +``` +Simple Print - Printing from CUDA Kernels +Demonstrating both CUDA C++ and Numba CUDA approaches + +Device: <GPU name> +Compute Capability: sm_<XY> + +====================================================================== +METHOD 1: CUDA C++ Kernel (via cuda.core.Program) +====================================================================== +Advantage: Full C++ features, better for complex kernels + +Compiling CUDA C++ kernel... +Kernel compiled successfully. + +Kernel configuration: + Grid: (2, 2) + Block: (2, 2, 2) + Total threads: 32 + +Launching kernel with value=10. Output: + +[0, 0]: Value is: 10 +[0, 1]: Value is: 10 +[0, 2]: Value is: 10 +[0, 3]: Value is: 10 +[0, 4]: Value is: 10 +[0, 5]: Value is: 10 +[0, 6]: Value is: 10 +[0, 7]: Value is: 10 +[1, 0]: Value is: 10 +... +[3, 7]: Value is: 10 + +CUDA C++ kernel execution complete.
+ + +====================================================================== +METHOD 2: Numba CUDA Kernel (Pythonic / modern indexing) +====================================================================== +Advantage: Uses numba.cuda.grid(3) for global indexing, + while still showing classic CUDA C++ IDs for reference. + Uses cuda.core for stream management (interoperability). + +Kernel configuration: + Grid: (2, 2) + Block: (2, 2, 2) + Total threads: 32 + +Launching Numba CUDA kernel (grid(3) + classic IDs) with value=10: +Uses numba.cuda.grid(3) to get global (x, y, z), +and prints the corresponding blockId/threadId like the C++ sample. +Stream managed by cuda.core for consistency with C++ example. + +global[ 0 , 0 , 0 ] -> [ 0 , 0 ]: Value is: 10 +global[ 1 , 0 , 0 ] -> [ 0 , 1 ]: Value is: 10 +global[ 0 , 1 , 0 ] -> [ 0 , 2 ]: Value is: 10 +... +global[ 3 , 3 , 1 ] -> [ 3 , 7 ]: Value is: 10 + +Numba CUDA kernel execution complete. + +====================================================================== +Done! Both kernel approaches demonstrated successfully. 
+====================================================================== +``` + +## Understanding the Output + +- **Grid**: 2×2 = 4 blocks (labeled 0-3) +- **Block**: 2×2×2 = 8 threads per block (labeled 0-7) +- **Total**: 32 threads, each printing its position and value + +### CUDA C++ Kernel: +Each thread calculates: +- Block ID (linear): `blockIdx.y * gridDim.x + blockIdx.x` +- Thread ID (linear): `threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x` + +### Numba CUDA Kernel: +Each thread shows: +- **Global position** using `numba.cuda.grid(3)` → `(x, y, z)` coordinates across entire grid +- **Classic IDs** (block ID, thread ID) calculated the same way as C++ for comparison +- This demonstrates how modern indexing relates to traditional CUDA C++ style + +### Comparing the Two Approaches + +**CUDA C++ Kernel (Method 1):** +- Uses C++ syntax and `printf()` with full formatting control +- Requires compilation via `cuda.core.Program` +- Best for complex kernels needing C++ features (templates, libraries, etc.) 
+- Uses classic block/thread ID indexing +- Output: `[0, 0]: Value is: 10` (clean formatting) + +**Numba CUDA Kernel (Method 2):** +- Uses Python syntax with `@cuda.jit` decorator +- JIT compiled automatically when called +- Best for prototyping and simpler kernels +- **Modern indexing**: Uses `numba.cuda.grid(3)` to get global thread coordinates (recommended) +- Also shows classic block/thread IDs to help relate the two indexing models +- **Interoperability**: Uses `cuda.core` streams via `stream` for consistency +- Demonstrates that numba-cuda kernels can work seamlessly with cuda.core infrastructure +- Limited print formatting (no f-strings, basic `print()` only; adds spaces between arguments) +- Output: `global[ 0 , 0 , 0 ] -> [ 0 , 0 ]: Value is: 10` (shows both indexing styles; note extra spaces due to `print()` behavior) + +## Experiments + +Try modifying: + +### For Both Approaches: +- **Grid size**: Change `grid=(4, 4)` for 16 blocks +- **Block size**: Change `block=(4, 4, 4)` for 64 threads per block +- **Conditional printing**: Print only from specific threads (e.g., `if threadId == 0:`) + +### CUDA C++ Specific: +- **Format strings**: Experiment with different `printf()` formats +- **Kernel code**: Add complex C++ computations before printing +- **External libraries**: Include CUDA math libraries or device functions (e.g., ``, ``) + +### Numba CUDA Specific: +- **Grid indexing**: Try `numba.cuda.grid(1)` or `numba.cuda.grid(2)` for different dimensions +- **Conditional printing**: Print only from threads where `x == 0` or `y == z` +- **Python operations**: Use NumPy-like operations in the kernel +- **Device math libraries**: Use [nvmath-python device APIs](https://docs.nvidia.com/cuda/nvmath-python/latest/device-apis/index.html) for optimized math operations (similar to CUDA math libraries in C++) +- **Shared memory**: Add `numba.cuda.shared.array()` for fast inter-thread communication +- **Atomic operations**: Try `numba.cuda.atomic.add()` for 
thread-safe updates +- **Print variations**: Experiment with what numba-cuda's `print()` can and cannot handle +- **Streams**: Create multiple `cuda.core` streams and launch numba-cuda kernels on them concurrently +- **Interoperability**: Mix numba-cuda kernels and CUDA C++ kernels on the same stream + +## Notes + +### General: +- Printing from GPU is relatively slow - use sparingly in production code +- Printf output is buffered and limited (~1MB buffer on most GPUs) + +### CUDA C++ Kernels: +- Always call `stream.sync()` after kernel launch to flush printf output +- Full `printf()` format string support (%, flags, width, precision) + +### Numba CUDA Kernels: +- **Recommended**: Use `numba.cuda.grid(ndim)` for thread indexing (modern, Pythonic) + - `grid(1)` for 1D indexing, `grid(2)` for 2D, `grid(3)` for 3D + - Returns global thread position across the entire grid +- **Interoperability**: Use `cuda.core` streams with Numba kernels via `stream` + - Create streams: `stream = device.create_stream()` + - Launch kernels: `kernel[grid, block, stream](args)` + - Synchronize: `stream.sync()` +- Numba's `print()` has limited capabilities compared to Python's `print()` +- F-strings are NOT supported in Numba CUDA kernels +- Use comma-separated arguments: `print("Value:", x)` instead of f-strings +- **Note**: `print()` automatically adds spaces between comma-separated arguments (e.g., `print("[", x, "]")` outputs `[ 0 ]` not `[0]`) +- Always synchronize the stream to flush output + +## Files + +- `simplePrint.py` - Python implementation using cuda.core API +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## See Also + +### CUDA Python (cuda.core): +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/) +- [CUDA Python Examples](https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples) + +### Numba CUDA: +- [Numba CUDA Documentation](https://nvidia.github.io/numba-cuda/) +- [numba.cuda.grid() 
Reference](https://nvidia.github.io/numba-cuda/reference/kernel.html#numba.cuda.grid) +- [nvmath-python Device APIs](https://docs.nvidia.com/cuda/nvmath-python/latest/device-apis/index.html) - Optimized math operations for Numba CUDA kernels + +### CUDA References: +- [CUDA C Programming Guide - Printf](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#formatted-output) +- [C++ simplePrintf Sample](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/0_Introduction/simplePrintf) diff --git a/samples/simplePrint/requirements.txt b/samples/simplePrint/requirements.txt new file mode 100644 index 00000000000..a39c5e7e518 --- /dev/null +++ b/samples/simplePrint/requirements.txt @@ -0,0 +1,7 @@ +# Simple Printf Sample - Requirements + +cuda-python>=13.0.0 +cuda-core>=1.0.0 +# Numba JIT uses nvJitLink from pip; keep in step with cuda-bindings (e.g. 13.2.x). +nvidia-nvjitlink>=13.2.0 +numba-cuda>=0.29.0 diff --git a/samples/simplePrint/simplePrint.py b/samples/simplePrint/simplePrint.py new file mode 100644 index 00000000000..38f8e33ba4c --- /dev/null +++ b/samples/simplePrint/simplePrint.py @@ -0,0 +1,289 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "nvidia-nvjitlink>=13.2.0", "numba-cuda>=0.29.0"] +# /// + +""" +Simple Print - Printing from CUDA Kernels + +This sample demonstrates how to print output from CUDA kernels using printf(). +It shows: +1. Device management with cuda.core.Device +2. Compiling CUDA C++ code that uses printf() +3. Launching kernels with 2D grids and 3D blocks +4. Seeing kernel output printed to stdout +5. Using Numba CUDA for Pythonic kernel authoring + +This sample demonstrates both approaches: +- CUDA C++ kernels compiled via cuda.core.Program (more control, C++ features) +- Numba CUDA kernels (more Pythonic, easier to write) + +This is the Python equivalent of the C++ simplePrintf sample. +""" + +import sys +import traceback + +try: + from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + +try: + from numba import cuda as numba_cuda + + NUMBA_AVAILABLE = True +except ImportError: + NUMBA_AVAILABLE = False + print("Warning: numba not found. 
Numba CUDA example will be skipped.") + print("To install: pip install numba-cuda") + + +# CUDA C++ kernel with printf +# This kernel prints the block index, thread index, and a value from each thread +PRINTF_KERNEL = """ +extern "C" +__global__ void printKernel(int val) { + // Calculate linear block index from 2D grid + int blockId = blockIdx.y * gridDim.x + blockIdx.x; + + // Calculate linear thread index from 3D block + int threadId = threadIdx.z * blockDim.x * blockDim.y + + threadIdx.y * blockDim.x + + threadIdx.x; + + // Print from each thread + printf("[%d, %d]:\\t\\tValue is: %d\\n", blockId, threadId, val); +} +""" + + +# Numba CUDA kernel - Pythonic equivalent using numba.cuda.grid() +# This demonstrates the same functionality using Numba's Python-based kernel syntax +if NUMBA_AVAILABLE: + + @numba_cuda.jit + def numba_print_kernel(val): + """ + Numba CUDA kernel showing the *recommended* grid() indexing style, + while also relating it to the classic CUDA C++ blockId/threadId. + + - Primary view: global 3D coordinates from numba.cuda.grid(3) + (modern, Pythonic way to index work for a 3D thread layout). + - Secondary view: linear blockId / threadId matching the CUDA C++ + printf sample, to help CUDA C++ users connect the two models.
+ """ + # Modern / recommended view: global 3D thread coordinates + x, y, z = numba_cuda.grid(3) + + # Classic CUDA-style indices, same formulas as the C++ sample + block_id = numba_cuda.blockIdx.y * numba_cuda.gridDim.x + numba_cuda.blockIdx.x + + thread_id = ( + numba_cuda.threadIdx.z * numba_cuda.blockDim.x * numba_cuda.blockDim.y + + numba_cuda.threadIdx.y * numba_cuda.blockDim.x + + numba_cuda.threadIdx.x + ) + + # Print both views side-by-side + # Note: Numba print() adds spaces between comma-separated args + print( + "global[", + x, + ",", + y, + ",", + z, + "] -> [", + block_id, + ",", + thread_id, + "]:\t\tValue is:", + val, + ) + + +def run_cuda_cpp_kernel(device, test_value=10): + """ + Demonstrate printing from CUDA C++ kernel compiled with cuda.core. + + This approach gives you full access to CUDA C++ features and allows + for more complex kernel implementations. + """ + print("=" * 70) + print("METHOD 1: CUDA C++ Kernel (via cuda.core.Program)") + print("=" * 70) + print("Advantage: Full C++ features, better for complex kernels") + print() + + # Compile the kernel + print("Compiling CUDA C++ kernel...") + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") + prog = Program(PRINTF_KERNEL, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("printKernel",)) + kernel = mod.get_kernel("printKernel") + print("Kernel compiled successfully.\n") + + # Create stream for kernel execution + stream = device.create_stream() + + # Configure kernel launch + # Using 2D grid (2x2) and 3D blocks (2x2x2) + grid_x, grid_y = 2, 2 + block_x, block_y, block_z = 2, 2, 2 + + print("Kernel configuration:") + print(f" Grid: ({grid_x}, {grid_y})") + print(f" Block: ({block_x}, {block_y}, {block_z})") + print(f" Total threads: {grid_x * grid_y * block_x * block_y * block_z}") + print() + + # Launch configuration with 2D grid and 3D block + config = LaunchConfig(grid=(grid_x, grid_y), block=(block_x, block_y, block_z)) + + 
print(f"Launching kernel with value={test_value}. Output:\n") + try: + # Launch kernel + launch(stream, config, kernel, test_value) + + # Synchronize to ensure printf output is flushed + stream.sync() + + print("\nCUDA C++ kernel execution complete.") + except Exception as e: + print(f"\nError during kernel execution: {e}") + traceback.print_exc() + return 1 + finally: + # Cleanup + stream.close() + + return 0 + + +def run_numba_kernel(device, test_value=10): + """ + Demonstrate printing from a Numba CUDA kernel. + + This example uses numba.cuda.grid(3) as the primary indexing mechanism + (recommended modern style), and also prints the equivalent blockId / + threadId used in the CUDA C++ printf sample for side-by-side comparison. + + Uses cuda.core APIs for stream management, demonstrating interoperability + between Numba CUDA kernels and cuda.core infrastructure. + """ + print("\n") + print("=" * 70) + print("METHOD 2: Numba CUDA Kernel (Pythonic / modern indexing)") + print("=" * 70) + print("Advantage: Uses numba.cuda.grid(3) for global indexing,") + print(" while still showing classic CUDA C++ IDs for reference.") + print(" Uses cuda.core for stream management (interoperability).") + print() + + # Same launch configuration as the C++ version + grid_x, grid_y = 2, 2 + block_x, block_y, block_z = 2, 2, 2 + + print("Kernel configuration:") + print(f" Grid: ({grid_x}, {grid_y})") + print(f" Block: ({block_x}, {block_y}, {block_z})") + print(f" Total threads: {grid_x * grid_y * block_x * block_y * block_z}") + print() + + # Use cuda.core stream (same as C++ example) instead of numba.cuda.stream() + stream = device.create_stream() + + print(f"Launching Numba kernel (grid(3) + classic IDs) with value={test_value}:") + print("Uses numba.cuda.grid(3) to get global (x, y, z),") + print("and prints the corresponding blockId/threadId like the C++ sample.") + print("Stream managed by cuda.core for consistency with C++ example.\n") + + try: + # Launch Numba kernel on 
cuda.core stream + numba_print_kernel[(grid_x, grid_y), (block_x, block_y, block_z), stream](test_value) + + # Synchronize cuda.core stream (same as C++ example) + stream.sync() + print("\nNumba CUDA kernel execution complete.") + except Exception as e: + print(f"\nError during Numba kernel execution: {e}") + traceback.print_exc() + return 1 + finally: + # Cleanup + stream.close() + + return 0 + + +def main(): + """Main function demonstrating printing from CUDA kernels using both approaches""" + + print("Simple Print - Printing from CUDA Kernels") + print("Demonstrating both CUDA C++ and Numba CUDA approaches") + print() + # Initialize device + device = Device() + device.set_current() + + # Get device properties + print(f"Device: {device.name}") + print(f"Compute Capability: sm_{device.arch}") + print() + + # Value to pass to both kernels + test_value = 10 + + # Run CUDA C++ kernel + result = run_cuda_cpp_kernel(device, test_value) + if result != 0: + return result + + # Run Numba kernel if available + if NUMBA_AVAILABLE: + result = run_numba_kernel(device, test_value) + if result != 0: + return result + else: + print("\n" + "=" * 70) + print("Numba CUDA example skipped (numba not installed)") + print("To run the Numba example: pip install numba") + print("=" * 70) + + print("\n" + "=" * 70) + print("Done! Both kernel approaches demonstrated successfully.") + print("=" * 70) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/simpleZeroCopy/README.md b/samples/simpleZeroCopy/README.md new file mode 100644 index 00000000000..cf8a38d3d28 --- /dev/null +++ b/samples/simpleZeroCopy/README.md @@ -0,0 +1,141 @@ +# Sample: simpleZeroCopy (Python) + +## Description + +This sample demonstrates zero-copy access using **`cuda.core`** to compile and launch a kernel, and **`cuda.bindings.runtime`** for mapped pinned host memory (`cudaHostAlloc` with `cudaHostAllocMapped`, `cudaHostGetDevicePointer`, and `cudaFreeHost`). 
The GPU loads and stores through **device** addresses that refer to that host memory—no `cudaMemcpy` in or out. The example is vector add with inputs and output as NumPy views of the host side of those buffers. + +## What you will learn + +- How to allocate **mapped** pinned host memory with `cudaHostAlloc` (via `cuda.bindings.runtime`) so the GPU can use `cudaHostGetDevicePointer` addresses in a kernel +- How `cuda.core.PinnedMemoryResource` differs (staging/copies; not guaranteed to be `cudaHostAllocMapped` for direct kernel access) +- How to build NumPy views of host addresses with `ctypes` and `numpy.frombuffer` +- How to launch CUDA kernels with `cuda.core`’s `Program` and `launch`, passing **device** pointers for mapped buffers +- When zero-copy is beneficial vs. device memory with explicit transfers +- How to validate results on the host without a D2H memcpy + +## Key libraries + +- `numpy` – CPU arrays and reference computation +- `cuda-core` – `Device`, stream, `Program`, `LaunchConfig`, `launch` +- `cuda-python` (`cuda.bindings.runtime`) – `cudaHostAlloc` / `cudaHostGetDevicePointer` / `cudaFreeHost` for mapped host memory + +## Key APIs + +**From cuda.core:** `Device`, `device.create_stream()`, `Program`, `ProgramOptions`, `LaunchConfig`, `launch` + +**From cuda.bindings.runtime:** `cudaHostAlloc` (with `cudaHostAllocMapped` | `cudaHostAllocPortable`), `cudaHostGetDevicePointer`, `cudaFreeHost` + +**From the standard library:** `ctypes` – wrap host pointers for `numpy.frombuffer` float32 views + +**Memory management:** Free host memory with `cudaFreeHost` in a `finally` block; call `stream.close()` when done. 
+ +## Zero-Copy Memory: When to Use + +### Benefits +- **No explicit transfers**: Simplifies code by eliminating `cudaMemcpy` calls +- **No copy-back step**: Host can read results directly from the mapped buffers once the kernel has completed (after `stream.sync()`) +- **Good for small data**: Overhead of explicit transfers can exceed benefits for small arrays +- **Excellent for integrated GPUs**: On systems like Jetson (Tegra), CPU and GPU share physical memory + +### Limitations +- **Slower access**: Limited by PCIe bandwidth vs. device memory bandwidth +- **Not for compute-intensive kernels**: Device memory is much faster for frequently accessed data +- **Discrete GPU overhead**: Each access crosses the PCIe bus + +### Best Use Cases +1. Small data sets where transfer overhead dominates +2. Data accessed infrequently by GPU +3. Integrated GPU platforms (shared memory) +4. Streaming data from host to device +5. Prototyping and debugging (simplifies memory management) + +## Requirements + +1. **NVIDIA GPU** and a **driver** compatible with your installed `cuda-python` / `cuda-core` wheels. +2. **Python 3.10 or newer** +3. Install the dependencies with **`pip install -r requirements.txt`** (NumPy, `cuda-python`, `cuda-core`). A **system** CUDA Toolkit is not strictly required if the process can load the driver/runtime; use `LD_LIBRARY_PATH` in *How to run* if you hit missing-library errors. 
+ +**Install packages:** +```bash +pip install -r requirements.txt +``` + +Or manually: +```bash +pip install "numpy>=2.3.2" "cuda-core>=1.0.0" "cuda-python>=13.0.0" +``` + +## How to run + +Basic usage: +```bash +# Pre-steps: Set library path +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Run with default parameters (1M elements) +python simpleZeroCopy.py +``` + +With custom parameters: +```bash +# Use 2M elements +python simpleZeroCopy.py --num_elements 2097152 + +# Show help +python simpleZeroCopy.py --help +``` + +### Command line arguments + +- `--num_elements`: Number of elements in vectors (default: 1048576) + - Each vector uses `num_elements * 4 bytes` (float32) + - Default: ~4 MB per vector, ~12 MB total + +## Expected Output + +Device name and compute capability **depend on your system**; the rest of the log should match this shape when validation passes. + +``` +====================================================================== +simpleZeroCopy - CUDA Python Sample +====================================================================== + +Device Information: + Name: <device name> + Compute Capability: <major>.<minor> + +> Memory: mapped pinned host (cudaHostAlloc + cudaHostGetDevicePointer) + +Compiling CUDA kernel... + Kernel compiled successfully + +Allocating memory: + Vector size: 1,048,576 elements + Memory per vector: 4.00 MB + Total memory: 12.00 MB + +> Allocating mapped pinned host memory... + Mapped host memory allocated successfully + +> Initializing vectors on host... +> Computing reference result on CPU... + +> Launching vectorAddGPU kernel... + Note: GPU accesses host memory directly (zero-copy) + Kernel execution complete + +> Checking results from vectorAddGPU()... + Comparing 1,048,576 elements... + Relative error: 0.000000e+00 + Validation PASSED + +====================================================================== +simpleZeroCopy completed successfully! 
+====================================================================== +``` + +## Files + +- `simpleZeroCopy.py` – Main Python implementation +- `README.md` – This file +- `requirements.txt` – Python package dependencies diff --git a/samples/simpleZeroCopy/requirements.txt b/samples/simpleZeroCopy/requirements.txt new file mode 100644 index 00000000000..cfd9d89ea13 --- /dev/null +++ b/samples/simpleZeroCopy/requirements.txt @@ -0,0 +1,6 @@ +# simpleZeroCopy - Requirements +# Install with: pip install -r requirements.txt + +numpy>=2.3.2 +cuda-python>=13.0.0 +cuda-core>=1.0.0 diff --git a/samples/simpleZeroCopy/simpleZeroCopy.py b/samples/simpleZeroCopy/simpleZeroCopy.py new file mode 100644 index 00000000000..a000ad0f8ca --- /dev/null +++ b/samples/simpleZeroCopy/simpleZeroCopy.py @@ -0,0 +1,275 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["numpy>=2.3.2", "cuda-python>=13.0.0", "cuda-core>=1.0.0"] +# /// + +import argparse +import ctypes +import sys +from pathlib import Path + +try: + import numpy as np + + from cuda.bindings import runtime as cuda_rt + from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + +# Add parent directory to path to import utilities +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + + +def _mapped_host_alloc(num_floats, stream): + """ + Allocate page-locked host memory mapped for device access; return + (host_ptr, device_ptr) for CPU views and for ``launch()``. + """ + nbytes = int(num_floats) * np.dtype(np.float32).itemsize + if nbytes <= 0: + return 0, 0 + err, h_ptr = cuda_rt.cudaHostAlloc(nbytes, cuda_rt.cudaHostAllocMapped | cuda_rt.cudaHostAllocPortable) + if err != cuda_rt.cudaError_t.cudaSuccess: + raise RuntimeError(f"cudaHostAlloc failed: {err}") + err, d_ptr = cuda_rt.cudaHostGetDevicePointer(h_ptr, 0) + if err != cuda_rt.cudaError_t.cudaSuccess: + cuda_rt.cudaFreeHost(h_ptr) + raise RuntimeError(f"cudaHostGetDevicePointer failed: {err}") + # Ensure prior work on this stream is visible before host fills buffers. 
+ if stream is not None: + stream.sync() + return h_ptr, d_ptr + + +def _float_view(host_ptr, num_floats): + return np.frombuffer( + (ctypes.c_float * num_floats).from_address(host_ptr), + dtype=np.float32, + count=num_floats, + ) + + +# CUDA C++: vector add with grid-stride loop +VECTOR_ADD_KERNEL = """ +extern "C" __global__ +void vectorAddGPU(float* c, const float* a, const float* b, int N) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + + for (size_t i = tid; i < N; i += stride) { + c[i] = a[i] + b[i]; + } +} +""" + + +def run(num_elements=1048576): + """ + Zero-copy vector add: map host memory, launch kernel with device + pointers, validate on CPU. + + This function shows how to: + 1. Allocate pinned (page-locked) host memory + 2. Map host memory into GPU address space (zero-copy) + 3. Access host memory directly from GPU kernel + 4. Validate results + + Parameters + ---------- + num_elements : int + Number of elements in vectors (default: 1048576) + """ + print("\n" + "=" * 70) + print("simpleZeroCopy - CUDA Python Sample") + print("=" * 70) + + # Initialize device + device = Device() + device.set_current() + major, minor = device.compute_capability + + print("\nDevice Information:") + print(f" Name: {device.name}") + print(f" Compute Capability: {major}.{minor}") + + # Create stream + stream = device.create_stream() + mapped_host_ptrs = [] + + try: + print("\n> Memory: mapped pinned host (cudaHostAlloc + cudaHostGetDevicePointer)") + + print("\nCompiling CUDA kernel...") + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") + prog = Program(VECTOR_ADD_KERNEL, code_type="c++", options=program_options) + mod = prog.compile("cubin") + kernel = mod.get_kernel("vectorAddGPU") + print(" Kernel compiled successfully") + + bytes_total = num_elements * np.dtype(np.float32).itemsize + print("\nAllocating memory:") + print(f" Vector size: {num_elements:,} elements") + print(f" Memory per vector: 
{bytes_total / (1024**2):.2f} MB") + print(f" Total memory: {3 * bytes_total / (1024**2):.2f} MB") + + print("\n> Allocating mapped pinned host memory...") + h_a, d_a = _mapped_host_alloc(num_elements, stream) + mapped_host_ptrs.append(h_a) + h_b, d_b = _mapped_host_alloc(num_elements, stream) + mapped_host_ptrs.append(h_b) + h_c, d_c = _mapped_host_alloc(num_elements, stream) + mapped_host_ptrs.append(h_c) + + a = _float_view(h_a, num_elements) + b = _float_view(h_b, num_elements) + c = _float_view(h_c, num_elements) + + print(" Mapped host memory allocated successfully") + + print("\n> Initializing vectors on host...") + rng = np.random.default_rng(42) + a[:] = rng.random(num_elements).astype(np.float32) + b[:] = rng.random(num_elements).astype(np.float32) + c[:] = 0 + + print("> Computing reference result on CPU...") + reference = a + b + + print("\n> Launching vectorAddGPU kernel...") + print(" Note: GPU accesses host memory directly (zero-copy)") + + block_size = 256 + grid_size = (num_elements + block_size - 1) // block_size + config = LaunchConfig(grid=grid_size, block=block_size) + + # Pass device pointers from cudaHostGetDevicePointer, not raw host VAs. + launch( + stream, + config, + kernel, + int(d_c), + int(d_a), + int(d_b), + np.int32(num_elements), + ) + stream.sync() + + print(" Kernel execution complete") + + print("\n> Checking results from vectorAddGPU()...") + print(f" Comparing {num_elements:,} elements...") + + # ``c`` is a host view of the same buffer; no cudaMemcpy D2H needed. 
+ if np.allclose(c, reference, rtol=1e-5, atol=1e-6): + error_norm = np.linalg.norm(c - reference) + ref_norm = np.linalg.norm(reference) + relative_error = error_norm / ref_norm + print(f" Relative error: {relative_error:.6e}") + print(" Validation PASSED") + success = True + else: + max_error = np.max(np.abs(c - reference)) + print(f" Max error: {max_error}") + print(" Validation FAILED") + success = False + + print("\n" + "=" * 70) + if success: + print("simpleZeroCopy completed successfully!") + else: + print("simpleZeroCopy FAILED!") + print("=" * 70 + "\n") + + return 0 if success else 1 + finally: + for h in reversed(mapped_host_ptrs): + if h: + cuda_rt.cudaFreeHost(h) + stream.close() + + +def main(): + """Parse CLI, call ``run()``, and exit with validation status.""" + parser = argparse.ArgumentParser( + description="Demonstrate zero-copy memory access with CUDA", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python simpleZeroCopy.py + python simpleZeroCopy.py --num_elements 2097152 +What is Zero-Copy Memory? + Zero-copy allows the GPU to directly access host (CPU) memory without + explicit memory transfers. 
This is useful for: + - Small data that doesn't benefit from explicit transfers + - Data that is accessed infrequently + - Integrated GPUs that share memory with CPU + + Trade-offs: + - Slower than device memory (PCIe bandwidth limited) + - No explicit transfers needed (simpler code) + - Good for discrete GPUs with small data + - Excellent for integrated GPUs (e.g., Tegra) + """, + ) + + parser.add_argument( + "--num_elements", + type=int, + default=1048576, + help="Number of elements in vectors (default: 1048576)", + ) + + args = parser.parse_args() + + if args.num_elements <= 0: + print("Error: num_elements must be positive") + sys.exit(1) + + try: + exit_code = run(num_elements=args.num_elements) + except Exception as e: + print(f"\nError: {e}") + import traceback + + traceback.print_exc() + exit_code = 1 + + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/samples/streamingCopyComputeOverlap/README.md b/samples/streamingCopyComputeOverlap/README.md new file mode 100644 index 00000000000..7ecdc142dd7 --- /dev/null +++ b/samples/streamingCopyComputeOverlap/README.md @@ -0,0 +1,98 @@ +# Sample: Streaming Copy + Compute Overlap (Python) + +## Description + +Demonstrate how to overlap memory transfers (H2D/D2H) with kernel computation using CUDA streams. This technique hides transfer latency and improves GPU utilization. 
+ +## What You'll Learn + +- Using `PinnedMemoryResource` for async-capable host memory +- Using `DeviceMemoryResource` for GPU memory allocation +- Creating multiple streams with `Device.create_stream()` +- Async memory copies with `Buffer.copy_to()` +- Overlapping H2D transfers, kernel execution, and D2H transfers + +## Key Concept + +**Without overlap (sequential):** +``` +[====H2D====][====Compute====][====D2H====] +``` + +**With overlap (multiple streams):** +``` +Stream 0: [H2D][Compute][D2H] +Stream 1: [H2D][Compute][D2H] +Stream 2: [H2D][Compute][D2H] +``` + +## Key APIs (all from `cuda.core`) + +- `Device` - Device management +- `Device.create_stream()` - Create CUDA streams +- `Stream.sync()` - Synchronize stream +- `PinnedMemoryResource` - Pinned host memory (required for async transfers) +- `DeviceMemoryResource` - GPU device memory +- `Buffer.copy_to(dst, stream=stream)` - Async memory copy +- `Program`, `LaunchConfig`, `launch` - Kernel compilation and execution + +### From `numpy`: + +- `np.from_dlpack()` - Zero-copy view of pinned memory buffers + +## Requirements + +- CUDA Toolkit 13.0+ +- Python 3.10+ +- `cuda-python`, `cuda-core`, `numpy` + +## Installation + +```bash +pip install -r requirements.txt +``` + +## How to Run + +```bash +python streamingCopyComputeOverlap.py +``` + +## Expected Output + +``` +============================================================ +Streaming Copy + Compute Overlap +Using pure cuda.core APIs +============================================================ + +Device: NVIDIA GeForce RTX XXXX +Kernel compiled [OK] + +Problem size: 16,000,000 elements (61 MB) + +--- Sequential (no overlap) --- +Timeline: [H2D][Compute][D2H] +Time: X.XX ms (±X.XX) + +--- Streamed (with overlap) --- +Stream 0: [H2D][Compute][D2H] +Stream 1: [H2D][Compute][D2H] +Stream 2: [H2D][Compute][D2H] +... 
+2 streams: X.XX ms (±X.XX) - speedup: X.XXx +4 streams: X.XX ms (±X.XX) - speedup: X.XXx +8 streams: X.XX ms (±X.XX) - speedup: X.XXx + +============================================================ +Key: Pinned memory + multiple streams = overlap transfers with compute + +Note: Speedup depends on hardware characteristics. This technique +benefits most when transfer time is significant relative to compute. +============================================================ +``` + +## See Also + +- [cuda.core Documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [CUDA Streams Best Practices](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#overlapping-data-transfers) diff --git a/samples/streamingCopyComputeOverlap/requirements.txt b/samples/streamingCopyComputeOverlap/requirements.txt new file mode 100644 index 00000000000..0b19f485db2 --- /dev/null +++ b/samples/streamingCopyComputeOverlap/requirements.txt @@ -0,0 +1,6 @@ +# Streaming Copy Compute Overlap Sample Requirements +# Requires Python 3.10 or newer + +cuda-python>=13.0.0 +cuda-core>=1.0.0 +numpy>=2.3.2 diff --git a/samples/streamingCopyComputeOverlap/streamingCopyComputeOverlap.py b/samples/streamingCopyComputeOverlap/streamingCopyComputeOverlap.py new file mode 100644 index 00000000000..efa7802a324 --- /dev/null +++ b/samples/streamingCopyComputeOverlap/streamingCopyComputeOverlap.py @@ -0,0 +1,308 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "numpy>=2.3.2"] +# /// + +""" +Streaming Copy + Compute Overlap + +Demonstrates how to overlap memory transfers with kernel computation using +CUDA streams to maximize GPU utilization. 
+ +Uses pure cuda.core APIs: + - Device, Stream for device and stream management + - PinnedMemoryResource, DeviceMemoryResource for memory allocation + - Buffer.copy_to() for async memory copies + - Program, LaunchConfig, launch for kernel compilation and execution +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import numpy as np + from cuda_samples_utils import print_gpu_info + + from cuda.core import ( + Device, + DeviceMemoryResource, + EventOptions, + LaunchConfig, + PinnedMemoryResource, + Program, + ProgramOptions, + launch, + ) +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Install with: pip install -r requirements.txt") + sys.exit(1) + + +# CUDA Kernel - compute-intensive vector operation (grid-stride loop) +VECTOR_SCALE_KERNEL = r""" +extern "C" __global__ +void vector_scale(const float* input, float* output, float scale, size_t N) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)gridDim.x * blockDim.x; + for (size_t i = tid; i < N; i += stride) { + float val = input[i] * scale; + // Add compute work to make kernel non-trivial + for (int j = 0; j < 50; j++) { + val = sqrtf(val * val + 1.0f); + } + output[i] = val; + } +} +""" + + +def buffer_to_numpy(buffer, n_elements): + """Create numpy view of cuda.core Buffer via DLPack.""" + return np.from_dlpack(buffer).view(np.float32).reshape(n_elements) + + +def main(): + print("=" * 60) + print("Streaming Copy + Compute Overlap") + print("Using pure cuda.core APIs") + print("=" * 60) + + # Initialize device + device = Device(0) + device.set_current() + print() + print_gpu_info(device) + + # Compile kernel + arch = f"sm_{device.arch}" + program = Program(VECTOR_SCALE_KERNEL, code_type="c++", options=ProgramOptions(arch=arch)) + kernel = program.compile(target_type="cubin").get_kernel("vector_scale") + print("Kernel compiled [OK]") + + # Parameters + N = 16_000_000 # 
16M elements + n_bytes = N * 4 + scale = 2.5 + n_runs = 10 + + print(f"\nProblem size: {N:,} elements ({n_bytes / 1024 / 1024:.0f} MB)") + + # Create memory resources + pinned_mr = PinnedMemoryResource() + device_mr = DeviceMemoryResource(device.device_id) + default_stream = device.create_stream() + + # ========================================================================= + # Sequential Execution + # ========================================================================= + print("\n--- Sequential (no overlap) ---") + print("Timeline: [H2D][Compute][D2H]") + + h_in = h_out = d_in = d_out = None + try: + # Pre-allocate buffers + h_in = pinned_mr.allocate(n_bytes, stream=default_stream) + h_out = pinned_mr.allocate(n_bytes, stream=default_stream) + d_in = device_mr.allocate(n_bytes, stream=default_stream) + d_out = device_mr.allocate(n_bytes, stream=default_stream) + # Sync before numpy access (numpy operations aren't stream ordered) + default_stream.sync() + + # Initialize input + np_in = buffer_to_numpy(h_in, N) + np_in[:] = np.random.rand(N).astype(np.float32) * 100 + + config = LaunchConfig(grid=((N + 255) // 256,), block=(256,)) + event_opts = EventOptions(timing_enabled=True) + + # Warm up + h_in.copy_to(d_in, stream=default_stream) + launch( + default_stream, + config, + kernel, + d_in, + d_out, + np.float32(scale), + np.uint64(N), + ) + d_out.copy_to(h_out, stream=default_stream) + default_stream.sync() + + # Benchmark with CUDA events + times = [] + for _ in range(n_runs): + start_ev = device.create_event(options=event_opts) + end_ev = device.create_event(options=event_opts) + default_stream.record(start_ev) + h_in.copy_to(d_in, stream=default_stream) # Async H2D + launch( + default_stream, + config, + kernel, + d_in, + d_out, + np.float32(scale), + np.uint64(N), + ) + d_out.copy_to(h_out, stream=default_stream) # Async D2H + default_stream.record(end_ev) + default_stream.sync() + times.append(end_ev - start_ev) + + seq_time = np.mean(times) + 
print(f"Time: {seq_time:.2f} ms (±{np.std(times):.2f})") + + # Verification: compute expected on CPU and compare + default_stream.sync() + np_out = buffer_to_numpy(h_out, N) + expected = np_in.astype(np.float32) * scale + for _ in range(50): + expected = np.sqrt(expected * expected + 1.0).astype(np.float32) + if np.allclose(np_out, expected, rtol=1e-4, atol=1e-4): + print("Verification: PASSED") + else: + print("Verification: FAILED") + finally: + for buf in (h_in, h_out, d_in, d_out): + if buf is not None: + buf.close() + default_stream.close() + + # ========================================================================= + # Streamed Execution + # ========================================================================= + print("\n--- Streamed (with overlap) ---") + print("Stream 0: [H2D][Compute][D2H]") + print("Stream 1: [H2D][Compute][D2H]") + print("Stream 2: [H2D][Compute][D2H]") + print("...") + + for n_streams in [2, 4, 8]: + chunk_size = N // n_streams + chunk_bytes = chunk_size * 4 + + # Create streams + streams = [device.create_stream() for _ in range(n_streams)] + + # Pre-allocate per-stream buffers + h_ins, h_outs, d_ins, d_outs = [], [], [], [] + try: + for i in range(n_streams): + h_ins.append(pinned_mr.allocate(chunk_bytes, stream=streams[i])) + h_outs.append(pinned_mr.allocate(chunk_bytes, stream=streams[i])) + d_ins.append(device_mr.allocate(chunk_bytes, stream=streams[i])) + d_outs.append(device_mr.allocate(chunk_bytes, stream=streams[i])) + + # Initialize input data + for i in range(n_streams): + streams[i].sync() + np_view = buffer_to_numpy(h_ins[i], chunk_size) + np_view[:] = np.random.rand(chunk_size).astype(np.float32) * 100 + + chunk_config = LaunchConfig(grid=((chunk_size + 255) // 256,), block=(256,)) + + # Warm up + for i in range(n_streams): + h_ins[i].copy_to(d_ins[i], stream=streams[i]) + launch( + streams[i], + chunk_config, + kernel, + d_ins[i], + d_outs[i], + np.float32(scale), + np.uint64(chunk_size), + ) + 
d_outs[i].copy_to(h_outs[i], stream=streams[i]) + for stream in streams: + stream.sync() + + # Benchmark with CUDA events (use stream 0 for timing) + times = [] + event_opts = EventOptions(timing_enabled=True) + for _ in range(n_runs): + start_ev = device.create_event(options=event_opts) + end_ev = device.create_event(options=event_opts) + streams[0].record(start_ev) + + # Issue all operations - they overlap across streams + for i in range(n_streams): + h_ins[i].copy_to(d_ins[i], stream=streams[i]) # Async H2D + launch( + streams[i], + chunk_config, + kernel, + d_ins[i], + d_outs[i], + np.float32(scale), + np.uint64(chunk_size), + ) + d_outs[i].copy_to(h_outs[i], stream=streams[i]) # Async D2H + + # Wait for all streams, record end on stream 0 + for stream in streams: + stream.sync() + streams[0].record(end_ev) + streams[0].sync() + times.append(end_ev - start_ev) + + avg = np.mean(times) + speedup = seq_time / avg + print(f"{n_streams} streams: {avg:.2f} ms (±{np.std(times):.2f}) - speedup: {speedup:.2f}x") + + # Verification (streamed): concatenate chunks and compare to expected + for s in streams: + s.sync() + out_chunks = [buffer_to_numpy(h_outs[i], chunk_size) for i in range(n_streams)] + in_chunks = [buffer_to_numpy(h_ins[i], chunk_size) for i in range(n_streams)] + np_out = np.concatenate(out_chunks) + np_in = np.concatenate(in_chunks) + expected = np_in.astype(np.float32) * scale + for _ in range(50): + expected = np.sqrt(expected * expected + 1.0).astype(np.float32) + if not np.allclose(np_out, expected, rtol=1e-4, atol=1e-4): + print(f" Verification: FAILED for {n_streams} streams") + finally: + for buf in h_ins + h_outs + d_ins + d_outs: + buf.close() + for s in streams: + s.close() + + print("\n" + "=" * 60) + print("Key: Pinned memory + multiple streams = overlap transfers with compute") + print("\nNote: Speedup depends on hardware characteristics. 
This technique") + print("benefits most when transfer time is significant relative to compute.") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/samples/systemInfo/README.md b/samples/systemInfo/README.md new file mode 100644 index 00000000000..2e93196418d --- /dev/null +++ b/samples/systemInfo/README.md @@ -0,0 +1,152 @@ +# Sample: System Information Query (Python) + +## Description + +This sample demonstrates how to inspect the CUDA driver, NVML, and every +installed GPU through the +[`cuda.core.system`](https://nvidia.github.io/cuda-python/cuda-core/latest/) +module. + +`cuda.core.system` wraps the NVIDIA Management Library (NVML) and can be +imported without CUDA being installed or initialized, so it is useful as a +lightweight pre-flight check before any CUDA context is created. The script +prints driver and NVML versions, the current process name, per-device +metadata (name, compute capability, architecture, memory, PCI info, +temperature, performance state), and, on multi-GPU systems, the topology +and peer-to-peer capabilities between each pair of devices. 
+ +## What You'll Learn + +- Querying CUDA driver and NVML versions with `cuda.core.system` +- Enumerating GPUs without creating a CUDA context +- Reading per-device metadata exposed by NVML (name, UUID, memory usage, + temperature, performance state) +- Inspecting GPU-to-GPU topology and peer-to-peer (P2P) capabilities + +## Key Libraries + +- [`cuda.core.system`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - Python wrapper around NVML + +## Key APIs + +From `cuda.core.system`: + +- `get_driver_version()`, `get_driver_version_full()`, `get_driver_branch()` - CUDA driver version tuple and branch string +- `get_nvml_version()` - NVML library version +- `get_num_devices()` - number of GPUs visible to NVML +- `get_process_name(pid)` - process name for a given PID +- `Device(index=...)` - NVML-backed device handle (no CUDA context required) + - `name`, `uuid`, `cuda_compute_capability`, `arch`, `brand` + - `memory_info` (`total`, `used`, `free`) + - `pci_info` (`domain`, `bus`, `device`, `bus_id`) + - `temperature.sensor(TemperatureSensors.TEMPERATURE_GPU)` + - `performance_state` +- `get_topology_common_ancestor(dev0, dev1)` - `GpuTopologyLevel` between two devices +- `get_p2p_status(dev0, dev1, GpuP2PCapsIndex.P2P_CAPS_INDEX_READ)` - peer-access capability between two devices + +Import stable symbols from the top-level `cuda.core` package (not `cuda.core.experimental`). + +## Requirements + +1. **NVIDIA Graphics Card** with CUDA support +2. **CUDA Drivers** installed on your system +3. **CUDA Toolkit** installed on your system +4. 
**Python 3.10 or newer** + +### Hardware + +- One or more NVIDIA GPUs +- Driver compatible with `cuda-python` 13.x + +### Software + +- CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x) +- Python 3.10 or newer +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/1_GettingStarted/systemInfo +pip install -r requirements.txt +``` + +The `requirements.txt` installs: + +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/1_GettingStarted/systemInfo +python systemInfo.py +``` + +### Skip topology queries + +Useful on machines with only one GPU or to shorten the output: + +```bash +python systemInfo.py --no-topology +``` + +## Expected Output + +Output varies with your hardware. On a machine with two GPUs you should see +something like: + +``` +====================================================================== +Driver / NVML +====================================================================== +CUDA driver version (user-mode): 13.2 +CUDA driver version (kernel-mode): ... +NVML version: (13, 595, 58, 3) +Driver branch: r595_88 +Current process: /usr/bin/python + +====================================================================== +Devices detected: 2 +====================================================================== + +-- Device 0 -- +Name: ... +UUID: ... +Compute capability: 8.9 +Architecture: ADA +Brand: BRAND_GEFORCE +Memory: total=23.99 GiB, used=960.00 KiB, free=23.52 GiB +PCI: domain=0000 bus=41 device=00 id=00000000:41:00.0 +Temperature (GPU sensor): 47 C +Performance state: ... + +... + +====================================================================== +GPU topology and peer-to-peer +====================================================================== +Device 0 <-> Device 1: topology=TOPOLOGY_HOSTBRIDGE, p2p_read=..., p2p_write=... 
+ +Done +``` + +**Note:** Device names, compute capability, temperatures, and topology +details will vary based on your GPUs and system. + +## Files + +- `systemInfo.py` - Python implementation using `cuda.core.system` +- `README.md` - This file +- `requirements.txt` - Sample dependencies + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [`cuda.core.system` API reference](https://nvidia.github.io/cuda-python/cuda-core/latest/api.html) +- [NVML reference](https://docs.nvidia.com/deploy/nvml-api/) diff --git a/samples/systemInfo/requirements.txt b/samples/systemInfo/requirements.txt new file mode 100644 index 00000000000..13628c0b308 --- /dev/null +++ b/samples/systemInfo/requirements.txt @@ -0,0 +1,4 @@ +# System Information Sample Requirements + +cuda-python>=13.0.0 +cuda-core>=1.0.0 diff --git a/samples/systemInfo/systemInfo.py b/samples/systemInfo/systemInfo.py new file mode 100644 index 00000000000..edfdde5ba70 --- /dev/null +++ b/samples/systemInfo/systemInfo.py @@ -0,0 +1,199 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0"] +# /// + +""" +System Information via cuda.core.system (NVML) + +Demonstrates the ``cuda.core.system`` module, which wraps NVIDIA Management +Library (NVML) functionality. 
+ +This sample prints: + * Driver and NVML versions + * Current process name + * Per-device: name, UUID, compute capability / arch, PCI info, memory usage, + temperature, performance state + * GPU-to-GPU topology and peer-to-peer status (when more than one GPU) +""" + +import os +import sys + +try: + from cuda.core import system + from cuda.core.system import CUDA_BINDINGS_NVML_IS_COMPATIBLE + from cuda.core.system.typing import GpuP2PCapsIndex +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +def print_header(title: str) -> None: + print() + print("=" * 70) + print(title) + print("=" * 70) + + +def format_bytes(nbytes: int) -> str: + """Format a byte count as a human-readable string.""" + units = ["B", "KiB", "MiB", "GiB", "TiB"] + size = float(nbytes) + for unit in units: + if size < 1024.0: + return f"{size:.2f} {unit}" + size /= 1024.0 + return f"{size:.2f} PiB" + + +def print_driver_info() -> None: + print_header("Driver / NVML") + major, minor = system.get_user_mode_driver_version() + print(f"CUDA driver version (user-mode): {major}.{minor}") + if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + kmd = system.get_kernel_mode_driver_version() + print(f"CUDA driver version (kernel-mode): {'.'.join(str(x) for x in kmd)}") + print(f"NVML version: {system.get_nvml_version()}") + try: + print(f"Driver branch: {system.get_driver_branch()}") + except Exception as e: + print(f"Driver branch: unavailable ({e})") + else: + print("NVML bindings are not compatible with this driver; device info will be limited.") + print(f"Current process: {system.get_process_name(os.getpid())}") + + +def print_device_info(device: "system.Device") -> None: + print(f"\n-- Device {device.index} --") + print(f"Name: {device.name}") + print(f"UUID: {device.uuid}") + try: + cc_major, cc_minor = device.cuda_compute_capability + print(f"Compute capability: 
{cc_major}.{cc_minor}") + except Exception as e: + print(f"Compute capability: unavailable ({e})") + try: + print(f"Architecture: {device.arch.name}") + except Exception as e: + print(f"Architecture: unavailable ({e})") + try: + print(f"Brand: {device.brand}") + except Exception as e: + print(f"Brand: unavailable ({e})") + + # Memory + try: + mem = device.memory_info + print(f"Memory: total={format_bytes(mem.total)}, used={format_bytes(mem.used)}, free={format_bytes(mem.free)}") + except Exception as e: + print(f"Memory: unavailable ({e})") + + # PCI + try: + pci = device.pci_info + print(f"PCI: domain={pci.domain:04x} bus={pci.bus:02x} device={pci.device:02x} id={pci.bus_id}") + except Exception as e: + print(f"PCI: unavailable ({e})") + + # Temperature (GPU sensor) + try: + temp_c = device.temperature.get_sensor() + print(f"Temperature (GPU sensor): {temp_c} C") + except Exception as e: + print(f"Temperature: unavailable ({e})") + + # Performance state + try: + pstate = device.performance_state + print(f"Performance state: {pstate}") + except Exception as e: + print(f"Performance state: unavailable ({e})") + + +def print_topology(devices: list) -> None: + if len(devices) < 2: + return + print_header("GPU topology and peer-to-peer") + for i, d0 in enumerate(devices): + for d1 in devices[i + 1 :]: + try: + level = system.get_topology_common_ancestor(d0, d1) + level_name = level.name + except Exception as e: + level_name = f"unavailable ({e})" + try: + read = system.get_p2p_status(d0, d1, GpuP2PCapsIndex.READ) + write = system.get_p2p_status(d0, d1, GpuP2PCapsIndex.WRITE) + read_name = read.name + write_name = write.name + except Exception as e: + read_name = write_name = f"unavailable ({e})" + print( + f"Device {d0.index} <-> Device {d1.index}: " + f"topology={level_name}, p2p_read={read_name}, p2p_write={write_name}" + ) + + +def main() -> int: + import argparse + + parser = argparse.ArgumentParser(description="Print CUDA system / NVML information via 
cuda.core.system") + parser.add_argument( + "--no-topology", + action="store_true", + help="Skip cross-device topology/P2P queries", + ) + args = parser.parse_args() + + print_driver_info() + + num_devices = system.get_num_devices() + print_header(f"Devices detected: {num_devices}") + if num_devices == 0: + print("No CUDA-capable devices found.") + return 0 + if not CUDA_BINDINGS_NVML_IS_COMPATIBLE: + print("NVML is not compatible with the installed driver; skipping device detail.") + return 0 + + devices = [system.Device(index=i) for i in range(num_devices)] + for device in devices: + print_device_info(device) + + if not args.no_topology: + print_topology(devices) + + print("\nDone") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/tmaTensorMap/README.md b/samples/tmaTensorMap/README.md new file mode 100644 index 00000000000..2260a0c47d0 --- /dev/null +++ b/samples/tmaTensorMap/README.md @@ -0,0 +1,138 @@ +# tmaTensorMap (Python) + +## Description + +This sample demonstrates how to use Tensor Memory Accelerator (TMA) +descriptors with `cuda.core` on Hopper and later GPUs (compute +capability >= 9.0). TMA enables efficient bulk data movement between +global and shared memory using hardware-managed tensor map +descriptors, which are a key building block for modern GEMM kernels +and large shared-memory tile loads. + +The sample: + +1. Creates a TMA tiled descriptor from a CuPy device array via + `StridedMemoryView.from_any_interface(...).as_tensor_map(...)`. +2. Passes the descriptor by value (as `__grid_constant__`) to a + kernel that uses libcudacxx TMA/barrier wrappers to bulk-load a + tile into shared memory, then copies it out to verify correctness. +3. Reuses the same descriptor against a new source tensor with + `replace_address()` to avoid rebuilding it. 
+ +## What You'll Learn + +- Creating a TMA descriptor from a strided device tensor via + `StridedMemoryView.as_tensor_map(box_dim=...)` +- Passing a tensor map to a kernel by value using + `__grid_constant__` +- Using libcudacxx (`cuda/barrier`) to coordinate TMA loads with a + block-scoped barrier +- Reusing a descriptor against a new source buffer via + `tensor_map.replace_address(new_tensor)` +- Compiling a kernel to CUBIN for a specific target arch so Hopper + features are available +- Using `cuda.pathfinder` to locate the CUDA toolkit include directory + CCCL headers and libcudacxx + +## Key Libraries + +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - compilation, launching, and tensor-map helpers +- `cuda.pathfinder` - locate the CUDA toolkit include directory +- `cupy` - allocate and fill device tensors +- `numpy` - scalar kernel arguments + +## Key APIs + +### From `cuda.core` + +- `StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)` - build a typed view from any DLPack/CUDA-array-interface tensor +- `StridedMemoryView.as_tensor_map(box_dim=(...))` - produce a TMA descriptor for the given tile shape +- `tensor_map.replace_address(new_tensor)` - retarget an existing descriptor at a new tensor +- `Program(code, code_type="c++", options=ProgramOptions(std="c++17", arch="sm_90", include_path=[...]))` - compile a C++ kernel against libcudacxx +- `program.compile("cubin")` - produce a CUBIN so `__grid_constant__` and TMA intrinsics are fully supported +- `launch(stream, config, kernel, tensor_map, ...)` - pass the TMA descriptor as a kernel argument + +### From `cuda.pathfinder` + +- `get_cuda_path_or_home()` - return the detected CUDA toolkit root for locating `include/cccl` + +### From `cuda_samples_utils` + +- `print_gpu_info()` - print device name and compute capability + +## Requirements + +### Hardware + +- NVIDIA Hopper or newer GPU with Compute Capability 9.0 or higher (H100, H200, B200, ...) 
+- On GPUs older than Hopper the sample exits cleanly without running the kernel +- Minimum GPU memory: 512 MB + +### Software + +- CUDA Toolkit 13.0 or newer with libcudacxx (cccl) headers +- Python 3.10 or newer +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## Installation + +Install the required packages from `requirements.txt`: + +```bash +cd /path/to/cuda-samples/python/2_CoreConcepts/tmaTensorMap +pip install -r requirements.txt +``` + +The `requirements.txt` installs: + +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/2_CoreConcepts/tmaTensorMap +python tmaTensorMap.py +``` + +### With custom parameters + +```bash +# Larger tensor (must be a multiple of the 128-element tile) +python tmaTensorMap.py --elements 8192 + +# Use a specific GPU +python tmaTensorMap.py --device 1 +``` + +## Expected Output + +On a Hopper (sm_90) GPU: + +``` +Device: NVIDIA H100 PCIe +Compute Capability: 9.0 + +TMA copy verified: 1024 elements across 8 tiles +replace_address verified: descriptor reused with new source tensor +``` + +**Note:** Device name and compute capability will vary based on your GPU. 
+ +## Files + +- `tmaTensorMap.py` - Python implementation using `cuda.core` TMA APIs +- `README.md` - This file +- `requirements.txt` - Sample dependencies +- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample) + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [TMA in the CUDA C++ Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#tensor-memory-accelerator) +- [`cuda::barrier` reference](https://nvidia.github.io/cccl/libcudacxx/extended_api/synchronization_primitives/barrier.html) diff --git a/samples/tmaTensorMap/requirements.txt b/samples/tmaTensorMap/requirements.txt new file mode 100644 index 00000000000..c33f5dd8d9f --- /dev/null +++ b/samples/tmaTensorMap/requirements.txt @@ -0,0 +1,4 @@ +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 +numpy>=1.24.0 diff --git a/samples/tmaTensorMap/tmaTensorMap.py b/samples/tmaTensorMap/tmaTensorMap.py new file mode 100644 index 00000000000..50dffa4e5e4 --- /dev/null +++ b/samples/tmaTensorMap/tmaTensorMap.py @@ -0,0 +1,278 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0", "numpy>=1.24.0"] +# /// + +""" +TMA Tensor Map with cuda.core + +This sample demonstrates how to use Tensor Memory Accelerator (TMA) +descriptors with cuda.core on Hopper and later GPUs (compute +capability >= 9.0). TMA enables efficient bulk data movement between +global and shared memory using hardware-managed tensor map +descriptors. + +The sample: + + 1. Creates a TMA tiled descriptor from a CuPy device array via + ``StridedMemoryView.from_any_interface(...).as_tensor_map(...)``. + 2. Passes the descriptor by value (as ``__grid_constant__``) to a + kernel that uses libcudacxx TMA/barrier wrappers to bulk-load a + tile into shared memory. + 3. Reuses the same descriptor against a new source tensor with + ``replace_address()`` to avoid rebuilding it. + +On GPUs older than Hopper (sm < 90), the sample prints a diagnostic +and exits cleanly. + +Ported from ``cuda_core/examples/tma_tensor_map.py`` in the +`cuda-python` repository. 
+""" + +import os +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) + +try: + import cupy as cp + import numpy as np + from cuda_samples_utils import print_gpu_info + + from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + launch, + ) + from cuda.core.utils import StridedMemoryView + from cuda.pathfinder import find_nvidia_header_directory, get_cuda_path_or_home +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +TILE_SIZE = 128 # elements per tile, must match the kernel constant + +KERNEL_SRC = r""" +#include <cuda/barrier> + +// Minimal definition of the 128-byte opaque tensor-map struct. +struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; +using TmaBarrier = cuda::barrier<cuda::thread_scope_block>; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ TmaBarrier bar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + if (tid == 0) + { + init(&bar, 1); + } + __syncthreads(); + + if (tid == 0) + { + cuda::device::experimental::cp_async_bulk_tensor_1d_global_to_shared( + smem, + reinterpret_cast<const CUtensorMap*>(&tensor_map), + tile_start, + bar); + bar.wait(cuda::device::barrier_arrive_tx(bar, 1, TILE_SIZE * sizeof(float))); + } + __syncthreads(); + + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + + +def _get_cccl_include_paths() -> list: + """Locate the CUDA toolkit and libcudacxx (cccl) include directories. 
+ + ``cuda.pathfinder.find_nvidia_header_directory`` searches pip-installed + CUDA packages, conda environments, and the standard system install + locations, so this works without requiring ``CUDA_PATH`` or + ``CUDA_HOME``. ``get_cuda_path_or_home`` is used as a final fallback. + """ + include_path: list = [] + + # libcudacxx (cccl) - preferred, provides used below. + try: + cccl_dir = find_nvidia_header_directory("cccl") + if cccl_dir and os.path.isdir(cccl_dir): + include_path.append(cccl_dir) + except Exception: # noqa: S110 - fallback probes continue below + pass + + # CUDA runtime headers - needed for the CUtensorMap driver type. + try: + cudart_dir = find_nvidia_header_directory("cudart") + if cudart_dir and os.path.isdir(cudart_dir) and cudart_dir not in include_path: + include_path.append(cudart_dir) + except Exception: # noqa: S110 - fallback probes continue below + pass + + # Fallback: use CUDA_PATH / CUDA_HOME when pathfinder comes up empty. + if not include_path: + cuda_path = get_cuda_path_or_home() + if cuda_path is not None: + cuda_include = os.path.join(cuda_path, "include") + if os.path.isdir(cuda_include): + include_path.append(cuda_include) + cccl_include = os.path.join(cuda_include, "cccl") + if os.path.isdir(cccl_include): + include_path.insert(0, cccl_include) + + if not include_path: + print( + "Could not locate CUDA toolkit headers.\n" + "Tried cuda.pathfinder (pip/conda/system installs) and " + "CUDA_PATH / CUDA_HOME; none succeeded.\n" + "Set CUDA_HOME to your toolkit root (containing include/cccl " + "and include/cuda_runtime.h) and retry.", + file=sys.stderr, + ) + sys.exit(1) + return include_path + + +def main() -> int: + import argparse + + parser = argparse.ArgumentParser(description="Use a TMA tensor map to bulk-copy data on Hopper+ GPUs") + parser.add_argument( + "--elements", + type=int, + default=1024, + help="Total number of float32 elements (must be a multiple of 128)", + ) + parser.add_argument("--device", type=int, default=0, 
help="CUDA device id") + args = parser.parse_args() + + if args.elements % TILE_SIZE != 0: + print(f"--elements must be a multiple of TILE_SIZE={TILE_SIZE}") + return 1 + + dev = Device(args.device) + print_gpu_info(dev) + + arch = dev.compute_capability + if arch < (9, 0): + print( + f"\nTMA requires compute capability >= 9.0 (Hopper or later); " + f"this device is {arch.major}.{arch.minor}. Exiting cleanly." + ) + return 0 + + dev.set_current() + include_path = _get_cccl_include_paths() + + # Compile with the CUBIN code type to target the exact device arch. + prog = Program( + KERNEL_SRC, + code_type="c++", + options=ProgramOptions( + std="c++17", + arch=f"sm_{dev.arch}", + include_path=include_path, + ), + ) + mod = prog.compile("cubin") + kernel = mod.get_kernel("tma_copy") + + # (1) Prepare input data and verify the initial TMA copy. + n = args.elements + src = cp.arange(n, dtype=cp.float32) + output = cp.zeros(n, dtype=cp.float32) + dev.sync() # CuPy uses its own stream + + tensor_map = StridedMemoryView.from_any_interface(src, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) + + n_tiles = n // TILE_SIZE + config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) + launch( + dev.default_stream, + config, + kernel, + tensor_map, + output.data.ptr, + np.int32(n), + ) + dev.sync() + + if not cp.array_equal(output, src): + print("TMA copy produced incorrect results") + return 1 + print(f"TMA copy verified: {n} elements across {n_tiles} tiles") + + # (2) Demonstrate replace_address() without rebuilding the descriptor. 
+ replacement = cp.full(n, fill_value=42.0, dtype=cp.float32) + dev.sync() + + tensor_map.replace_address(replacement) + + output2 = cp.zeros(n, dtype=cp.float32) + launch( + dev.default_stream, + config, + kernel, + tensor_map, + output2.data.ptr, + np.int32(n), + ) + dev.sync() + + if not cp.array_equal(output2, replacement): + print("replace_address produced incorrect results") + return 1 + print("replace_address verified: descriptor reused with new source tensor") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/samples/vectorAdd/README.md b/samples/vectorAdd/README.md new file mode 100644 index 00000000000..8abc4728e33 --- /dev/null +++ b/samples/vectorAdd/README.md @@ -0,0 +1,130 @@ +# Sample: Vector Addition (Python) + +## Description + +Run your first GPU kernel: add two vectors element-wise on the GPU using the [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) API with runtime compilation. + +## What You'll Learn + +- Writing CUDA kernels in C++ with template support +- Runtime compilation of CUDA kernels from Python +- Using `cuda.core` for device management, programs, and launches +- Configuring and launching kernels with grid and block dimensions +- Using CuPy for GPU memory management +- Verifying GPU results against CPU computation + +## Key Libraries + +- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) — Pythonic access to CUDA runtime and compilation +- `cupy` — GPU array library for Python + +## Key APIs + +### From `cuda.core` + +- `Device` — Initialize and manage CUDA device +- `Program` — Create program from kernel source code +- `ProgramOptions` — Set compilation options (C++ standard, architecture) +- `LaunchConfig` — Configure kernel launch parameters +- `launch` — Execute kernel on specified stream + +Import stable symbols from the top-level package (not `cuda.core.experimental`). See the [cuda.core documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/). 
+ +### From CuPy + +- `cp.random.rand()` — Generate random arrays on GPU +- `cp.empty()` — Allocate uninitialized GPU arrays +- `cp.allclose()` — Verify results with tolerance + +### From `cuda_samples_utils` + +- `verify_array_result()` — Verify computation results + +## Kernel Techniques + +- **1D Global Thread Indexing** — One thread per element, with the grid sized to cover the array +- **Template Programming** — Generic kernel for different data types +- **Bounds Checking** — Prevent out-of-bounds memory access + +## Requirements + +### Hardware + +- NVIDIA GPU with Compute Capability 7.0 or higher +- Minimum GPU memory: 512 MB + +### Software + +- CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x) +- Python 3.10 or newer +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## Installation + +Install the required packages from requirements.txt: + +```bash +cd /path/to/cuda-samples/python/1_GettingStarted/vectorAdd +pip install -r requirements.txt +``` + +The requirements.txt installs: + +- `cuda-python` (>=13.0.0) +- `cuda-core` (>=1.0.0) +- `cupy-cuda13x` (>=14.0.0) + +## How to Run + +### Basic usage + +```bash +cd cuda-samples/python/1_GettingStarted/vectorAdd +python vectorAdd.py +``` + +### With custom parameters + +```bash +# Custom vector size +python vectorAdd.py --elements 1000000 + +# Use specific GPU +python vectorAdd.py --device 1 + +# Skip verification for benchmarking +python vectorAdd.py --no-verify +``` + +## Expected Output + +``` +[Vector addition using CUDA Core API] +Device: <GPU name> +Compute Capability: sm_<XY> +Compiling kernel 'vectorAdd'... +Kernel compiled successfully +[Vector addition of 50000 elements] +CUDA kernel launch with 196 blocks of 256 threads +Verifying result... +Test PASSED + +Done +``` + +**Note:** Device name and compute capability will vary based on your GPU. 
+ +## Files + +- `vectorAdd.py` — Python implementation using cuda.core API +- `README.md` — This file +- `requirements.txt` — Sample dependencies +- `../../Utilities/cuda_samples_utils.py` — Common utilities (imported by this sample) + +## See Also + +- [CUDA Python Documentation](https://nvidia.github.io/cuda-python/) +- [cuda.core API](https://nvidia.github.io/cuda-python/cuda-core/latest/) +- [CuPy Documentation](https://docs.cupy.dev/) diff --git a/samples/vectorAdd/requirements.txt b/samples/vectorAdd/requirements.txt new file mode 100644 index 00000000000..46d588ce780 --- /dev/null +++ b/samples/vectorAdd/requirements.txt @@ -0,0 +1,5 @@ +# Vector Addition Sample Requirements + +cuda-python>=13.0.0 +cuda-core>=1.0.0 +cupy-cuda13x>=14.0.0 diff --git a/samples/vectorAdd/vectorAdd.py b/samples/vectorAdd/vectorAdd.py new file mode 100755 index 00000000000..ec1eca0cf96 --- /dev/null +++ b/samples/vectorAdd/vectorAdd.py @@ -0,0 +1,192 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# /// script +# dependencies = ["cuda-python>=13.0.0", "cuda-core>=1.0.0", "cupy-cuda13x>=14.0.0"] +# /// + +""" +Vector Addition using CUDA Core API + +This sample demonstrates element-wise vector addition: C = A + B +using cuda.core for runtime compilation and kernel launch. +""" + +import sys +from pathlib import Path + +# Add parent directory to path to import utilities +sys.path.insert(0, str(Path(__file__).parent.parent / "Utilities")) +from cuda_samples_utils import verify_array_result + +try: + import cupy as cp + + from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch +except ImportError as e: + print(f"Error: Required package not found: {e}") + print("Please install from requirements.txt:") + print(" pip install -r requirements.txt") + sys.exit(1) + + +# CUDA kernel source code +VECTOR_ADD_KERNEL = """ +/** + * CUDA Kernel for vector addition + * Computes the vector addition of A and B into C. + */ +template<typename T> +__global__ void vectorAdd(const T *A, const T *B, T *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + C[i] = A[i] + B[i]; + } +} +""" + + +def vector_add_cuda_core(num_elements=50000, device_id=0, verify=True): + """ + Perform vector addition using cuda.core API. 
+ + Parameters + ---------- + num_elements : int + Number of elements in each vector + device_id : int + CUDA device ID to use + verify : bool + Whether to verify the result + + Returns + ------- + bool + True if successful, False otherwise + """ + try: + # Initialize device + print("[Vector addition using CUDA Core API]") + device = Device(device_id) + device.set_current() + + print(f"Device: {device.name}") + print(f"Compute Capability: sm_{device.arch}") + + stream = device.create_stream() + + # Compile kernel + print("Compiling kernel 'vectorAdd'...") + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") + program = Program(VECTOR_ADD_KERNEL, code_type="c++", options=program_options) + module = program.compile("cubin", name_expressions=("vectorAdd<float>",)) + kernel = module.get_kernel("vectorAdd<float>") + print("Kernel compiled successfully") + + # Allocate and initialize vectors + print(f"[Vector addition of {num_elements} elements]") + dtype = cp.float32 + + a = cp.random.rand(num_elements).astype(dtype) + b = cp.random.rand(num_elements).astype(dtype) + c = cp.empty(num_elements, dtype=dtype) + + # Synchronize before kernel launch + device.sync() + + # Configure and launch kernel + threads_per_block = 256 + blocks_per_grid = (num_elements + threads_per_block - 1) // threads_per_block + + print(f"CUDA kernel launch with {blocks_per_grid} blocks of {threads_per_block} threads") + + config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block) + + # Launch kernel + launch( + stream, + config, + kernel, + a.data.ptr, + b.data.ptr, + c.data.ptr, + cp.int32(num_elements), + ) + stream.sync() + + # Verify result + if verify: + print("Verifying result...") + expected = a + b + if not verify_array_result(c, expected): + return False + + return True + + except Exception as e: + print(f"Error: {e}") + import traceback + + traceback.print_exc() + return False + + +def main(): + """ + Main entry point for the vector addition sample. 
+ """ + import argparse + + parser = argparse.ArgumentParser(description="Vector Addition using CUDA Core API") + parser.add_argument( + "--elements", + type=int, + default=50000, + help="Number of elements in vectors (default: 50000)", + ) + parser.add_argument("--device", type=int, default=0, help="CUDA device ID (default: 0)") + parser.add_argument("--no-verify", action="store_true", help="Skip result verification") + + args = parser.parse_args() + + if args.elements <= 0: + print("Error: Number of elements must be positive") + return 1 + + success = vector_add_cuda_core(num_elements=args.elements, device_id=args.device, verify=not args.no_verify) + + if success: + print("\nDone") + return 0 + else: + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 163ff70a997..19a698394f4 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -27,6 +27,7 @@ Targets: core Run cuda_core tests bindings Run cuda_bindings tests pathfinder Run cuda_pathfinder tests + samples Run samples tests (tests/samples) smoke Run meta-level smoke tests (tests/integration) Options: @@ -261,6 +262,14 @@ PY add_result "smoke" "${rc}" } +run_samples() { + echo "[tests] samples" + cd "${repo_root}" + run_pytest tests/samples + local rc=$? + add_result "samples" "${rc}" +} + case "${target}" in all) run_pathfinder @@ -273,6 +282,8 @@ case "${target}" in run_bindings ;; pathfinder) run_pathfinder ;; + samples) + run_samples ;; smoke) run_smoke ;; *) diff --git a/tests/samples/__init__.py b/tests/samples/__init__.py new file mode 100644 index 00000000000..e5725ea5a48 --- /dev/null +++ b/tests/samples/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
def discover_samples(samples_dir: Path) -> list[Path]:
    """Collect the entry script of every sample directory under *samples_dir*.

    A sample is a direct subdirectory ``<name>`` containing a file named
    ``<name>.py``; that file is the sample's single recognised entrypoint.
    The ``Utilities`` directory is never treated as a sample. Results follow
    the sorted order of the directory listing.
    """
    candidate_scripts = (
        child / f"{child.name}.py"
        for child in sorted(samples_dir.iterdir())
        if child.is_dir() and child.name != "Utilities"
    )
    return [script for script in candidate_scripts if script.is_file()]
# ---------------------------------------------------------------------------
# PEP 723 dep gating (reuse the helper that ships with cuda-bindings test
# infrastructure when available; otherwise fall back to a local parser so the
# runner stays usable without cuda-bindings installed).
# ---------------------------------------------------------------------------

# Matches the leading name of a requirement string such as
# "cuda-core>=1.0.0"; the match stops at the first character outside the set.
_DEP_NAME_RE = re.compile(r"[a-zA-Z0-9_-]+")
# Inline-metadata block pattern: an opening "# /// <type>" line, one or more
# "#"-prefixed lines, and a closing "# ///" line. The named groups are
# required -- the parser reads group("content") -- and without the
# "<type>" / "<content>" names the pattern does not even compile.
_PEP723_RE = re.compile(r"(?m)^# /// (?P<type>[a-zA-Z0-9-]+)$\s(?P<content>(^#(| .*)$\s)+)^# ///$")
def _extract_pep723_dependencies(example: Path) -> list[str] | None:
    """Return the dependency list declared via PEP 723, or ``None`` if absent.

    The first inline-metadata block found in *example* is scanned line by
    line for ``key = value`` pairs; only a single-line ``dependencies = [...]``
    entry is understood. ``None`` is returned when there is no metadata block,
    no ``dependencies`` key, or the value is not a plain list literal.
    """
    # Imported locally so the module's top-level import block stays untouched.
    import ast

    content = example.read_text(encoding="utf-8")
    match = _PEP723_RE.search(content)
    if not match:
        return None
    metadata: dict[str, str] = {}
    for raw in match.group("content").splitlines():
        # Drop the leading comment marker and padding of each metadata line.
        line = raw.lstrip("# ").rstrip()
        if not line:
            continue
        key, sep, value = line.partition("=")
        if not sep:
            # Not a "key = value" line (e.g. free-form text); ignore it.
            continue
        metadata[key.strip()] = value.strip()
    deps_literal = metadata.get("dependencies")
    if not deps_literal:
        return None
    try:
        # The PEP 723 spec uses TOML semantics, but in practice the values
        # are simple list-of-strings literals. literal_eval accepts exactly
        # such literals and, unlike eval, never executes expressions taken
        # from the sample file, while still avoiding a TOML dependency.
        result = ast.literal_eval(deps_literal)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    if not isinstance(result, list):
        return None
    return [str(item) for item in result]
def build_run_plan(
    sample: Path,
    config: dict[str, Any],
    gpu_count: int,
    timeout: int = DEFAULT_TIMEOUT,
) -> RunPlan:
    """Combine config overrides + GPU availability into a concrete run plan.

    Parameters
    ----------
    sample:
        Path to the sample's entry script; the name of its parent directory
        is the key looked up in *config*.
    config:
        Mapping of sample name to its override object (``skip``, ``min_gpus``
        and a ``python`` sub-object with ``args`` / ``launcher`` lists).
    gpu_count:
        Number of GPUs available to the run.
    timeout:
        Per-sample timeout in seconds, stored on the plan unchanged.

    Returns
    -------
    RunPlan
        Either a plan carrying a ``skip_reason`` (sample must be waived) or
        one carrying the command components to invoke. A malformed config
        entry always yields a waived plan with an ``invalid config: ...``
        reason; it never raises.
    """

    def waived(reason: str) -> RunPlan:
        # Every waived plan has the same shape: nothing to run, only a reason.
        return RunPlan(sample, [], [], timeout, skip_reason=reason)

    sample_cfg = config.get(sample.parent.name, {})
    if not isinstance(sample_cfg, dict):
        return waived("invalid config: sample entry must be an object")

    if sample_cfg.get("skip"):
        return waived("skipped in test_args.json")

    # A sample with no explicit requirement still needs one GPU.
    try:
        required_gpus = int(sample_cfg.get("min_gpus", 1))
    except (TypeError, ValueError):
        return waived("invalid config: 'min_gpus' must be an integer")
    if required_gpus > gpu_count:
        return waived(f"requires {required_gpus} GPU(s), only {gpu_count} available")

    # ``"python": null`` is treated like a missing sub-object.
    python_cfg = sample_cfg.get("python") or {}
    if not isinstance(python_cfg, dict):
        return waived("invalid config: 'python' must be an object")

    raw_args = python_cfg.get("args", []) or []
    raw_launcher = python_cfg.get("launcher", []) or []
    if not isinstance(raw_args, list) or not isinstance(raw_launcher, list):
        return waived("invalid config: 'args' and 'launcher' must be lists")

    # Environment variables are expanded when the plan is built, so the plan
    # holds the final strings.
    return RunPlan(
        sample=sample,
        args=[_expand_env(str(a)) for a in raw_args],
        launcher=[_expand_env(str(a)) for a in raw_launcher],
        timeout=timeout,
    )
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: discover, run and summarise the samples.

    *argv* is handed to ``argparse`` (``None`` means ``sys.argv``). Returns 0
    when nothing failed. Otherwise returns the first positive return code
    among the failed samples, or 1 if none of them has a positive code. A
    missing samples directory or an empty selection also returns 1.
    """
    cli = argparse.ArgumentParser(description="Run cuda-python samples")
    cli.add_argument(
        "--samples-dir",
        type=Path,
        default=DEFAULT_SAMPLES_DIR,
        help="Directory containing one subdir per sample (default: ./samples)",
    )
    cli.add_argument(
        "--config",
        type=Path,
        default=DEFAULT_CONFIG,
        help="Path to test_args.json (default: tests/samples/test_args.json)",
    )
    cli.add_argument(
        "--parallel",
        type=int,
        default=1,
        help="Maximum number of samples to run concurrently (default: 1)",
    )
    cli.add_argument(
        "--filter",
        action="append",
        default=[],
        help=("Run only samples whose directory name contains the given substring (may be repeated)"),
    )
    cli.add_argument(
        "--timeout",
        type=int,
        default=DEFAULT_TIMEOUT,
        help=f"Per-sample timeout in seconds (default: {DEFAULT_TIMEOUT})",
    )
    opts = cli.parse_args(argv)

    samples_dir: Path = opts.samples_dir.resolve()
    if not samples_dir.is_dir():
        _safe_print(f"Error: samples directory not found: {samples_dir}")
        return 1

    samples = discover_samples(samples_dir)
    if opts.filter:
        # Keep a sample when any of the requested substrings occurs in its
        # directory name.
        samples = [s for s in samples if any(token in s.parent.name for token in opts.filter)]
    if not samples:
        _safe_print("No samples found.")
        return 1

    config = load_config(opts.config.resolve())
    gpu_count = get_gpu_count()
    _safe_print(f"Detected {gpu_count} GPU(s).")
    _safe_print(f"Running {len(samples)} sample(s) with parallelism={opts.parallel}\n")

    plans = [build_run_plan(s, config, gpu_count, opts.timeout) for s in samples]

    if opts.parallel > 1:
        with concurrent.futures.ThreadPoolExecutor(max_workers=opts.parallel) as pool:
            results = list(pool.map(run_sample, plans))
    else:
        results = [run_sample(plan) for plan in plans]

    passed = [r for r in results if r.status == "PASS"]
    waived = [r for r in results if r.status == "WAIVED"]
    failed = [r for r in results if r.status in {"FAIL", "TIMEOUT", "ERROR"}]

    _safe_print("\nSummary")
    _safe_print(f" passed : {len(passed)}")
    _safe_print(f" waived : {len(waived)}")
    _safe_print(f" failed : {len(failed)}")
    if not failed:
        return 0

    for r in failed:
        _safe_print(f" - {r.sample.parent.name}: {r.status} ({r.detail})")
    # Propagate the first real (positive) exit status; timeouts and launch
    # errors carry -1 and fall through to the generic failure code.
    for r in failed:
        if r.return_code > 0:
            return r.return_code
    return 1
@pytest.mark.parametrize("sample_name", _SAMPLES)
def test_sample(sample_name: str) -> None:
    """Run one sample through the runner and map its status onto pytest.

    PASS lets the test succeed and WAIVED becomes a skip; any other status
    fails the test. The whole test is skipped when no GPU was detected at
    collection time.
    """
    if not _GPU_COUNT:
        pytest.skip("No CUDA GPU detected on the test runner")

    script = DEFAULT_SAMPLES_DIR / sample_name / f"{sample_name}.py"
    if not script.is_file():
        pytest.fail(f"Sample entrypoint missing: {script}")

    outcome = run_sample(build_run_plan(script, _CONFIG, _GPU_COUNT))

    if outcome.status == "PASS":
        return
    if outcome.status == "WAIVED":
        pytest.skip(outcome.detail or "sample waived")

    # Flush stdout before failing so that whatever the runner already printed
    # for this sample precedes the failure message in the report.
    sys.stdout.flush()
    pytest.fail(f"sample {sample_name} returned status={outcome.status} (rc={outcome.return_code}): {outcome.detail}")