From b478199ef0888a60ac0a2296e190807578d456fe Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 23 Jun 2026 12:17:30 +0200 Subject: [PATCH] Fully clean up buffers/memory resources in tests Add explicit sync points and notes around `mr.close()`, making sure that there is no MR shutdown with pending async buffer freeing. These can currently cause occasional deadlocks when we have many MRs in parallel. Signed-off-by: Sebastian Berg --- cuda_core/tests/conftest.py | 3 +++ cuda_core/tests/memory_ipc/test_peer_access.py | 4 ++++ .../tests/memory_ipc/test_send_buffers.py | 5 +++++ cuda_core/tests/memory_ipc/test_workerpool.py | 18 +++++++++++++++--- cuda_core/tests/test_memory.py | 10 ++++++++++ cuda_core/tests/test_object_protocols.py | 6 +++++- 6 files changed, 42 insertions(+), 4 deletions(-) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index d7a81d88904..611f83c3a0e 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -290,6 +290,9 @@ def ipc_memory_resource(request, ipc_device): assert mr.is_ipc_enabled yield mr mr.close() + # TODO(seberg): Make sure the `mr` and its buffers are fully torn down. + # May be unnecessary as `mr.close()` is not parallel with other work. 
+ ipc_device.sync() @pytest.fixture diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index efb67b4cdb8..ac7f71a88e9 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -92,6 +92,8 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent): assert process.exitcode == 0 buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + dev1.sync() mr.close() def child_main(self, mr, buffer): @@ -129,4 +131,6 @@ def child_main(self, mr, buffer): PatternGen(dev0, NBYTES).verify_buffer(buffer, seed=False) buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + dev1.sync() mr.close() diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 01c9496e773..59216cd9cce 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -29,6 +29,7 @@ def test_main(self, ipc_device, nmrs): device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] + buffers = [] try: # Allocate and fill memory. 
@@ -54,6 +55,10 @@ def test_main(self, ipc_device, nmrs): pgen.verify_buffer(buffer, seed=True) buffer.close() finally: + for buffer in buffers: + buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() for mr in mrs: mr.close() diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 08d9bd79d92..358c16fd7bf 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -35,6 +35,7 @@ def test_main(self, ipc_device, nmrs): device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] + buffers = [] try: buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))] @@ -45,8 +46,11 @@ def test_main(self, ipc_device, nmrs): pgen = PatternGen(device, NBYTES) for buffer in buffers: pgen.verify_buffer(buffer, seed=True) - buffer.close() finally: + for buffer in buffers: + buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() for mr in mrs: mr.close() @@ -77,6 +81,7 @@ def test_main(self, ipc_device, nmrs): device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] + buffers = [] try: buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))] @@ -90,8 +95,11 @@ def test_main(self, ipc_device, nmrs): pgen = PatternGen(device, NBYTES) for buffer in buffers: pgen.verify_buffer(buffer, seed=True) - buffer.close() finally: + for buffer in buffers: + buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() for mr in mrs: mr.close() @@ -127,6 +135,7 @@ def test_main(self, ipc_device, nmrs): device = 
ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] + buffers = [] try: buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))] @@ -137,8 +146,11 @@ def test_main(self, ipc_device, nmrs): pgen = PatternGen(device, NBYTES) for buffer in buffers: pgen.verify_buffer(buffer, seed=True) - buffer.close() finally: + for buffer in buffers: + buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() for mr in mrs: mr.close() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 35592485c94..f53e8244328 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1103,6 +1103,8 @@ def test_device_memory_resource_with_options(init_cuda): device.sync() dst_buffer.close() src_buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() def test_pinned_memory_resource_with_options(init_cuda): @@ -1149,6 +1151,8 @@ def test_pinned_memory_resource_with_options(init_cuda): device.sync() dst_buffer.close() src_buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() def test_managed_memory_resource_with_options(init_cuda): @@ -1365,6 +1369,8 @@ def test_mempool_ipc_errors(mempool_device): Buffer.from_ipc_descriptor(mr, handle, stream=device.default_stream) buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() def test_pinned_mempool_ipc_basic(): @@ -1405,6 +1411,8 @@ def test_pinned_mempool_ipc_basic(): assert ipc_desc.size == 1024 buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() mr.close() @@ -1436,6 +1444,8 @@ def test_pinned_mempool_ipc_errors(): Buffer.from_ipc_descriptor(mr, handle, 
stream=device.default_stream) buffer.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + device.sync() mr.close() diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index d1085a952bb..e8391c75678 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -233,7 +233,11 @@ def sample_ipc_buffer_descriptor(ipc_device): options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mr = DeviceMemoryResource(ipc_device, options=options) buf = mr.allocate(64, stream=ipc_device.default_stream) - return buf.ipc_descriptor + descriptor = buf.ipc_descriptor + buf.close() + # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()` + ipc_device.sync() + return descriptor @pytest.fixture