# Patch summary (free-form preamble; patch tools skip text before the first
# "diff --git" header).
#
# Target: cuda_core/tests/memory/test_managed_ops.py
#
# What the hunks below change:
#   * Add `import mmap`.
#   * Replace the hard-coded `_MANAGED_TEST_ALLOCATION_SIZE = 4096` with
#     `_PAGE_SIZE = mmap.PAGESIZE` and set the allocation size to `_PAGE_SIZE`.
#   * Add `_page_base(buf)`, which masks `int(buf.handle)` down to a
#     `_PAGE_SIZE` boundary (`& ~(_PAGE_SIZE - 1)`).
#   * In `test_per_buffer_location`, assert that the two allocated buffers have
#     different `_page_base` values before `prefetch_batch` is called.
#
# NOTE(review): the new assertion only compares the page that contains each
# buffer's START address. If the memory resource does not return page-aligned
# addresses, a `_PAGE_SIZE`-byte buffer straddles two pages and two neighbouring
# buffers could still share a page while passing the check - confirm alignment
# is guaranteed, or also assert `int(buf.handle) % _PAGE_SIZE == 0`.
#
# NOTE(review): the added comments equate the managed-memory prefetch
# granularity with the OS page size reported by `mmap.PAGESIZE`; nothing in
# this patch establishes that - confirm against the driver documentation.
#
# NOTE(review): the hunks below appear to have lost their original line breaks
# (several hunks share one physical line), so this patch will presumably not
# apply as-is - regenerate it from the source branch before use.
diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index c2e0c7195b..33def77935 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import mmap + import pytest from helpers.buffers import DummyDeviceMemoryResource, DummyUnifiedMemoryResource @@ -9,7 +11,14 @@ from cuda.core import Device, Host, ManagedBuffer from cuda.core._memory._managed_buffer import _get_int_attr -_MANAGED_TEST_ALLOCATION_SIZE = 4096 +# Managed-memory prefetch and CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION +# operate at physical-page granularity. Test buffers must each occupy a full +# page; otherwise the pool packs sub-page allocations into one page and +# per-buffer prefetch locations become indistinguishable. ``mmap.PAGESIZE`` +# tracks the OS page size (4 KiB on most x86, 64 KiB on nvidia-64k aarch64 +# kernels), so allocations stay one-page-per-buffer on every platform. +_PAGE_SIZE = mmap.PAGESIZE +_MANAGED_TEST_ALLOCATION_SIZE = _PAGE_SIZE _READ_MOSTLY_ENABLED = 1 _HOST_LOCATION_ID = -1 _INVALID_HOST_DEVICE_ORDINAL = 0 @@ -21,6 +30,12 @@ def _last_prefetch_location(buf): return _get_int_attr(buf, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION) +def _page_base(buf): + # Page-aligned base of the buffer's start address; two buffers sharing a + # page cannot be prefetched to different locations independently. 
+    return int(buf.handle) & ~(_PAGE_SIZE - 1) + + def _skip_if_raw_managed_alloc_unsupported(device): # Raw `cuMemAllocManaged` capability — distinct from conftest's # `skip_if_managed_memory_unsupported`, which gates `ManagedMemoryResource` @@ -216,6 +231,10 @@ def test_per_buffer_location(self, location_ops_device, location_ops_mr): device = location_ops_device bufs = [location_ops_mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE, stream=device.default_stream) for _ in range(2)] + # Per-buffer prefetch locations are only observable when the buffers sit + # on distinct physical pages; assert that here so a pool-packing change + # fails loudly instead of silently migrating one shared page. + assert _page_base(bufs[0]) != _page_base(bufs[1]) stream = device.create_stream() prefetch_batch(stream, bufs, [Host(), device])