NVIDIA · kkraus14 · Jun 26, 2026 · Jun 25, 2026
diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py
@@ -1,6 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import mmap
+
 import pytest
 from helpers.buffers import DummyDeviceMemoryResource, DummyUnifiedMemoryResource
 
@@ -9,7 +11,14 @@
 from cuda.core import Device, Host, ManagedBuffer
 from cuda.core._memory._managed_buffer import _get_int_attr
 
-_MANAGED_TEST_ALLOCATION_SIZE = 4096
+# Managed-memory prefetch and CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
+# operate at physical-page granularity. Test buffers must each occupy a full
+# page; otherwise the pool packs sub-page allocations into one page and
+# per-buffer prefetch locations become indistinguishable. ``mmap.PAGESIZE``
+# tracks the OS page size (4 KiB on most x86, 64 KiB on nvidia-64k aarch64
+# kernels), so allocations stay one-page-per-buffer on every platform.
+_PAGE_SIZE = mmap.PAGESIZE
+_MANAGED_TEST_ALLOCATION_SIZE = _PAGE_SIZE
 _READ_MOSTLY_ENABLED = 1
 _HOST_LOCATION_ID = -1
 _INVALID_HOST_DEVICE_ORDINAL = 0
@@ -21,6 +30,12 @@ def _last_prefetch_location(buf):
     return _get_int_attr(buf, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION)
 
 
+def _page_base(buf):
+    # Round the start address down to its page boundary (-N == ~(N - 1)).
+    # Buffers that share a page cannot be prefetched to separate locations.
+    return int(buf.handle) & -_PAGE_SIZE
+
+
 def _skip_if_raw_managed_alloc_unsupported(device):
     # Raw `cuMemAllocManaged` capability — distinct from conftest's
     # `skip_if_managed_memory_unsupported`, which gates `ManagedMemoryResource`
@@ -216,6 +231,10 @@ def test_per_buffer_location(self, location_ops_device, location_ops_mr):
 
         device = location_ops_device
         bufs = [location_ops_mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE, stream=device.default_stream) for _ in range(2)]
+        # Per-buffer prefetch locations are only observable when the buffers do
+        # not share a page; assert that their page-aligned start addresses differ
+        # so a pool-packing change fails loudly instead of silently migrating one page.
+        assert _page_base(bufs[0]) != _page_base(bufs[1])
         stream = device.create_stream()
 
         prefetch_batch(stream, bufs, [Host(), device])