From 9d977bed3f20f0c035bfd73820d65abbc1e0432b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 25 Jun 2026 14:34:39 -0700 Subject: [PATCH] cuda.core: make managed-prefetch test page-size aware TestPrefetchBatch.test_per_buffer_location hardcoded a 4096-byte allocation and assumed two pooled buffers landed on separate physical pages. Managed-memory prefetch and CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION operate at page granularity, so on nvidia-64k aarch64 kernels both 4 KB buffers shared one 64 KB page; prefetching buf[1] to the device migrated the shared page and buf[0]'s host prefetch reported device 0 (assert 0 == -1). Derive the allocation size from mmap.PAGESIZE so each buffer occupies a full page on every platform, and add a precondition asserting the two buffers sit on distinct pages so a pool-packing regression fails loudly. Co-Authored-By: Claude Opus 4.8 (1M context) --- cuda_core/tests/memory/test_managed_ops.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index c2e0c7195b..33def77935 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import mmap + import pytest from helpers.buffers import DummyDeviceMemoryResource, DummyUnifiedMemoryResource @@ -9,7 +11,14 @@ from cuda.core import Device, Host, ManagedBuffer from cuda.core._memory._managed_buffer import _get_int_attr -_MANAGED_TEST_ALLOCATION_SIZE = 4096 +# Managed-memory prefetch and CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION +# operate at physical-page granularity. Test buffers must each occupy a full +# page; otherwise the pool packs sub-page allocations into one page and +# per-buffer prefetch locations become indistinguishable. ``mmap.PAGESIZE`` +# tracks the OS page size (4 KiB on most x86, 64 KiB on nvidia-64k aarch64 +# kernels), so allocations stay one-page-per-buffer on every platform. +_PAGE_SIZE = mmap.PAGESIZE +_MANAGED_TEST_ALLOCATION_SIZE = _PAGE_SIZE _READ_MOSTLY_ENABLED = 1 _HOST_LOCATION_ID = -1 _INVALID_HOST_DEVICE_ORDINAL = 0 @@ -21,6 +30,12 @@ def _last_prefetch_location(buf): return _get_int_attr(buf, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION) +def _page_base(buf): + # Page-aligned base of the buffer's start address; two buffers sharing a + # page cannot be prefetched to different locations independently. + return int(buf.handle) & ~(_PAGE_SIZE - 1) + + def _skip_if_raw_managed_alloc_unsupported(device): # Raw `cuMemAllocManaged` capability — distinct from conftest's # `skip_if_managed_memory_unsupported`, which gates `ManagedMemoryResource` @@ -216,6 +231,10 @@ def test_per_buffer_location(self, location_ops_device, location_ops_mr): device = location_ops_device bufs = [location_ops_mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE, stream=device.default_stream) for _ in range(2)] + # Per-buffer prefetch locations are only observable when the buffers sit + # on distinct physical pages; assert that here so a pool-packing change + # fails loudly instead of silently migrating one shared page. + assert _page_base(bufs[0]) != _page_base(bufs[1]) stream = device.create_stream() prefetch_batch(stream, bufs, [Host(), device])