From 9d977bed3f20f0c035bfd73820d65abbc1e0432b Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Thu, 25 Jun 2026 14:34:39 -0700
Subject: [PATCH] cuda.core: make managed-prefetch test page-size aware

TestPrefetchBatch.test_per_buffer_location hardcoded a 4096-byte
allocation and assumed two pooled buffers landed on separate physical
pages. Managed-memory prefetch and CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
operate at page granularity, so on nvidia-64k aarch64 kernels both
4 KiB buffers shared one 64 KiB page; prefetching bufs[1] to the device
migrated the shared page, so bufs[0]'s last-prefetch location reported
device 0 instead of the host location -1 (assert 0 == -1).

Derive the allocation size from mmap.PAGESIZE so each buffer occupies a
full page on every platform, and add a precondition asserting the two
buffers sit on distinct pages so a pool-packing regression fails loudly.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cuda_core/tests/memory/test_managed_ops.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py
index c2e0c7195b..33def77935 100644
--- a/cuda_core/tests/memory/test_managed_ops.py
+++ b/cuda_core/tests/memory/test_managed_ops.py
@@ -1,6 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import mmap
+
 import pytest
 from helpers.buffers import DummyDeviceMemoryResource, DummyUnifiedMemoryResource
 
@@ -9,7 +11,14 @@
 from cuda.core import Device, Host, ManagedBuffer
 from cuda.core._memory._managed_buffer import _get_int_attr
 
-_MANAGED_TEST_ALLOCATION_SIZE = 4096
+# Managed-memory prefetch and CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
+# operate at page granularity. Test buffers must each occupy a full page;
+# otherwise the pool can pack sub-page allocations into one page and the
+# per-buffer prefetch locations become indistinguishable. ``mmap.PAGESIZE``
+# tracks the OS page size (4 KiB on most x86, 64 KiB on nvidia-64k aarch64
+# kernels), so allocations stay one-page-per-buffer on every platform.
+_PAGE_SIZE = mmap.PAGESIZE
+_MANAGED_TEST_ALLOCATION_SIZE = _PAGE_SIZE
 _READ_MOSTLY_ENABLED = 1
 _HOST_LOCATION_ID = -1
 _INVALID_HOST_DEVICE_ORDINAL = 0
@@ -21,6 +30,12 @@ def _last_prefetch_location(buf):
     return _get_int_attr(buf, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION)
 
 
+def _page_base(buf):
+    # Start address rounded down to a _PAGE_SIZE boundary (power-of-two mask). Two
+    # buffers on one page cannot be prefetched to different locations independently.
+    return int(buf.handle) & ~(_PAGE_SIZE - 1)
+
+
 def _skip_if_raw_managed_alloc_unsupported(device):
     # Raw `cuMemAllocManaged` capability — distinct from conftest's
     # `skip_if_managed_memory_unsupported`, which gates `ManagedMemoryResource`
@@ -216,6 +231,10 @@ def test_per_buffer_location(self, location_ops_device, location_ops_mr):
 
         device = location_ops_device
         bufs = [location_ops_mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE, stream=device.default_stream) for _ in range(2)]
+        # Per-buffer prefetch locations are only observable when the buffers sit on
+        # distinct pages; assert that their page-aligned start addresses differ so a
+        # pool-packing change fails loudly instead of silently migrating one shared page.
+        assert _page_base(bufs[0]) != _page_base(bufs[1])
         stream = device.create_stream()
 
         prefetch_batch(stream, bufs, [Host(), device])