From 0604b18472a8e6c2cf66b34b23025cb603d49b41 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 5 May 2026 12:12:24 -0700
Subject: [PATCH 1/8] cuda.core: add GraphBuilder.graph_definition property

Completes step 3 of #1330 by exposing the captured graph as an explicit
`GraphDefinition` view that shares ownership of the underlying `CUgraph`.
The handle-layer plumbing landed in PR #2008; this commit wires up the
user-facing surface and locks in the state-guard rules.

State semantics:

- PRIMARY builder: only valid after `end_building()`. Before
  `begin_building()` no graph exists; during capture the driver is the
  sole writer, so explicit access is unsafe.
- CONDITIONAL_BODY builder: valid both before `begin_building()` (the
  body graph is allocated at conditional-node creation time) and after
  `end_building()`. This enables a hybrid flow where a conditional body
  is populated entirely via the explicit API, with no capture at all.
- FORKED builder: never valid. Forked builders share the primary's
  graph; access through the primary instead.

Tests cover the happy path, both hybrid flows on conditional bodies
(populate-via-explicit-API and capture-then-augment), the three error
states (forked, capturing, primary pre-capture), and the shared-ownership
guarantee (the `GraphDefinition` survives the builder's `close()`).

Co-authored-by: Cursor --- cuda_core/cuda/core/graph/_graph_builder.pyi | 42 ++++ cuda_core/cuda/core/graph/_graph_builder.pyx | 60 +++++- cuda_core/tests/graph/test_graph_builder.py | 192 ++++++++++++++++++- 3 files changed, 291 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyi b/cuda_core/cuda/core/graph/_graph_builder.pyi index af1748ad86..00af261423 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyi +++ b/cuda_core/cuda/core/graph/_graph_builder.pyi @@ -129,6 +129,48 @@ class GraphBuilder: def is_join_required(self) -> bool: """Returns True if this graph builder must be joined before building is ended.""" + @property + def graph_definition(self) -> GraphDefinition: + """The captured graph as an explicit :class:`~graph.GraphDefinition`. + + The returned :class:`~graph.GraphDefinition` is a view of the same + graph this builder is producing: nodes added through it appear in + subsequent :meth:`complete` and :meth:`debug_dot_print` calls, and + the view stays valid even after the builder is closed. + + This lets you mix the capture and explicit APIs on a single graph, + for example to inspect what was captured, augment it with extra + nodes, or build a conditional body entirely with the explicit API. + + Availability: + + - **Primary builders** (created by :meth:`Device.create_graph_builder` + or :meth:`Stream.create_graph_builder`): only after + :meth:`end_building`. + + - **Conditional-body builders** (returned by :meth:`if_then`, + :meth:`if_else`, :meth:`while_loop`, :meth:`switch`): both before + :meth:`begin_building` and after :meth:`end_building`. The body + graph already exists when the conditional is created, so you may + populate it through this view without ever calling + :meth:`begin_building` on the body builder. + + - **Forked builders** (returned by :meth:`split`): never. Forked + builders share the primary builder's graph; access it through the + primary instead. 
+ + Returns + ------- + GraphDefinition + A view of the graph being built. + + Raises + ------ + RuntimeError + If the builder is forked, currently building, or (for primary + builders) has not started building yet. + """ + def begin_building(self, mode: str | None='relaxed') -> GraphBuilder: """Begins the building process. diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index c7b2ba5f74..dfd106fb0d 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -9,7 +9,7 @@ from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core.graph._graph_definition cimport GraphCondition +from cuda.core.graph._graph_definition cimport GraphCondition, GraphDefinition from cuda.core.graph._utils cimport _attach_host_callback_to_graph from cuda.core._resource_handles cimport ( GraphHandle, @@ -282,6 +282,64 @@ cdef class GraphBuilder: """Returns True if this graph builder must be joined before building is ended.""" return self._kind == FORKED + @property + def graph_definition(self) -> GraphDefinition: + """The captured graph as an explicit :class:`~graph.GraphDefinition`. + + The returned :class:`~graph.GraphDefinition` is a view of the same + graph this builder is producing: nodes added through it appear in + subsequent :meth:`complete` and :meth:`debug_dot_print` calls, and + the view stays valid even after the builder is closed. + + This lets you mix the capture and explicit APIs on a single graph, + for example to inspect what was captured, augment it with extra + nodes, or build a conditional body entirely with the explicit API. + + Availability: + + - **Primary builders** (created by :meth:`Device.create_graph_builder` + or :meth:`Stream.create_graph_builder`): only after + :meth:`end_building`. 
+ + - **Conditional-body builders** (returned by :meth:`if_then`, + :meth:`if_else`, :meth:`while_loop`, :meth:`switch`): both before + :meth:`begin_building` and after :meth:`end_building`. The body + graph already exists when the conditional is created, so you may + populate it through this view without ever calling + :meth:`begin_building` on the body builder. + + - **Forked builders** (returned by :meth:`split`): never. Forked + builders share the primary builder's graph; access it through the + primary instead. + + Returns + ------- + GraphDefinition + A view of the graph being built. + + Raises + ------ + RuntimeError + If the builder is forked, currently building, or (for primary + builders) has not started building yet. + """ + if self._kind == FORKED: + raise RuntimeError( + "graph_definition is unavailable on forked graph builders; " + "access it through the primary builder instead." + ) + if self._state == CAPTURING: + raise RuntimeError( + "graph_definition is unavailable while capture is in " + "progress; call end_building() first." + ) + if self._kind == PRIMARY and self._state == CAPTURE_NOT_STARTED: + raise RuntimeError( + "graph_definition is unavailable before begin_building() on " + "a primary builder; no graph has been created yet." + ) + return GraphDefinition._from_handle(self._h_graph) + def begin_building(self, mode: str | None = "relaxed") -> GraphBuilder: """Begins the building process. 
diff --git a/cuda_core/tests/graph/test_graph_builder.py b/cuda_core/tests/graph/test_graph_builder.py index 18dfe21cc1..efb70fe75d 100644 --- a/cuda_core/tests/graph/test_graph_builder.py +++ b/cuda_core/tests/graph/test_graph_builder.py @@ -5,11 +5,12 @@ import numpy as np import pytest -from helpers.graph_kernels import compile_common_kernels +from helpers.graph_kernels import compile_common_kernels, compile_conditional_kernels from helpers.marks import requires_module +from helpers.misc import try_create_condition from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch -from cuda.core.graph import GraphBuilder +from cuda.core.graph import GraphBuilder, GraphDefinition def test_graph_is_building(init_cuda): @@ -384,3 +385,190 @@ def test_graph_stream_lifetime(init_cuda): # Destroy the stream stream.close() + + +# --------------------------------------------------------------------------- +# GraphBuilder.graph_definition +# --------------------------------------------------------------------------- + + +def test_graph_definition_returns_graph_definition_after_end_building(init_cuda): + """Primary builder exposes its captured graph as a GraphDefinition after end_building().""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + gb.end_building() + + gd = gb.graph_definition + assert isinstance(gd, GraphDefinition) + # The captured graph must contain the launched kernels. 
+ assert len(gd.nodes()) == 2 + + +def test_graph_definition_raises_before_begin_building(init_cuda): + """Primary builder has no graph allocated before begin_building().""" + gb = Device().create_graph_builder() + with pytest.raises(RuntimeError, match="before begin_building"): + _ = gb.graph_definition + + +def test_graph_definition_raises_during_capture(init_cuda): + """graph_definition is unsafe while the driver is actively capturing.""" + gb = Device().create_graph_builder().begin_building() + try: + with pytest.raises(RuntimeError, match="capture is in"): + _ = gb.graph_definition + finally: + gb.end_building() + + +def test_graph_definition_raises_for_forked(init_cuda): + """Forked builders share the primary's graph; their property must raise.""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + primary, sibling = gb.split(2) + try: + with pytest.raises(RuntimeError, match="forked"): + _ = sibling.graph_definition + finally: + sibling = GraphBuilder.join(primary, sibling) + sibling.end_building() + + +def test_graph_definition_shares_ownership(init_cuda): + """Closing the builder must not invalidate a held GraphDefinition.""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + gb.end_building() + + gd = gb.graph_definition + gb.close() + # The shared CUgraph keeps the graph alive. 
+ assert len(gd.nodes()) == 1 + + +def test_graph_definition_round_trips_through_explicit_api(init_cuda): + """Mutating via the explicit API survives complete() and runs correctly.""" + mod = compile_common_kernels() + add_one = mod.get_kernel("add_one") + + launch_stream = Device().create_stream() + mr = LegacyPinnedMemoryResource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + gb = launch_stream.create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + gb.end_building() + + # Add a second add_one through the explicit GraphDefinition view. + gd = gb.graph_definition + captured_node = next(iter(gd.nodes())) + captured_node.launch(LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + assert len(gd.nodes()) == 2 + + graph = gb.complete() + graph.launch(launch_stream) + launch_stream.sync() + assert arr[0] == 2 + + b.close() + + +@requires_module(np, "2.1") +def test_graph_definition_hybrid_conditional_body(init_cuda): + """Populate a conditional body entirely through the explicit API. + + This is the headline hybrid flow enabled by the new property: + ``if_then`` returns a ``GraphBuilder`` for the body, but instead of + calling ``begin_building`` and capturing into it, we reach for + ``graph_definition`` and add nodes through the explicit API. + """ + mod = compile_conditional_kernels(int) + add_one = mod.get_kernel("add_one") + set_handle = mod.get_kernel("set_handle") + + launch_stream = Device().create_stream() + mr = LegacyPinnedMemoryResource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + gb = Device().create_graph_builder().begin_building() + condition = try_create_condition(gb) + launch(gb, LaunchConfig(grid=1, block=1), set_handle, condition, 1) + body_gb = gb.if_then(condition) + + # Skip body_gb.begin_building() entirely -- the body graph already + # exists at conditional-node creation time and is exposed here. 
+ body_def = body_gb.graph_definition + assert isinstance(body_def, GraphDefinition) + assert len(body_def.nodes()) == 0 + body_def.launch(LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + + graph = gb.end_building().complete() + graph.launch(launch_stream) + launch_stream.sync() + assert arr[0] == 1 + + b.close() + + +@requires_module(np, "2.1") +def test_graph_definition_conditional_body_after_capture(init_cuda): + """Capture into a conditional body, then augment it via the explicit API.""" + mod = compile_conditional_kernels(int) + add_one = mod.get_kernel("add_one") + set_handle = mod.get_kernel("set_handle") + + launch_stream = Device().create_stream() + mr = LegacyPinnedMemoryResource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + gb = Device().create_graph_builder().begin_building() + condition = try_create_condition(gb) + launch(gb, LaunchConfig(grid=1, block=1), set_handle, condition, 1) + body_gb = gb.if_then(condition).begin_building() + + # Capture one increment into the body. + launch(body_gb, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + body_gb.end_building() + + # Add a second increment via the explicit API on the same body graph. 
+ body_def = body_gb.graph_definition + captured_node = next(iter(body_def.nodes())) + captured_node.launch(LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + assert len(body_def.nodes()) == 2 + + graph = gb.end_building().complete() + graph.launch(launch_stream) + launch_stream.sync() + assert arr[0] == 2 + + b.close() + + +@requires_module(np, "2.1") +def test_graph_definition_conditional_body_during_capture_raises(init_cuda): + """The CAPTURING-state guard fires for conditional bodies too.""" + gb = Device().create_graph_builder().begin_building() + condition = try_create_condition(gb) + body_gb = gb.if_then(condition).begin_building() + try: + with pytest.raises(RuntimeError, match="capture is in"): + _ = body_gb.graph_definition + finally: + body_gb.end_building() + gb.end_building() From c3740e609532315e5bf4cc3aff2da56ec8880c7c Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 29 May 2026 12:28:13 -0700 Subject: [PATCH 2/8] cuda.core: add graph slot table infrastructure (phase 1) Introduce OpaqueHandle and a per-graph slot table retained on the CUgraph as a user object, preparing to replace ad-hoc per-resource user objects when wiring graph node attachments in a follow-up change. 
--- cuda_core/cuda/core/_cpp/resource_handles.cpp | 102 +++++++++++++++++- cuda_core/cuda/core/_cpp/resource_handles.hpp | 32 ++++++ cuda_core/cuda/core/_resource_handles.pyx | 7 ++ 3 files changed, 138 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 3bbe0fafe0..097b2ec4d5 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -6,8 +6,11 @@ #include "resource_handles.hpp" #include +#include #include +#include #include +#include #include #include #include @@ -70,6 +73,9 @@ decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr; // Graph decltype(&cuGraphDestroy) p_cuGraphDestroy = nullptr; decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy = nullptr; +decltype(&cuUserObjectCreate) p_cuUserObjectCreate = nullptr; +decltype(&cuUserObjectRelease) p_cuUserObjectRelease = nullptr; +decltype(&cuGraphRetainUserObject) p_cuGraphRetainUserObject = nullptr; // Linker decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr; @@ -1114,15 +1120,91 @@ LibraryHandle get_kernel_library(const KernelHandle& h) noexcept { // ============================================================================ namespace { + +// Slot table layout (internal). Each owning graph maps CUgraphNode -> a +// fixed-size array of type-erased owners. The width is the most any single +// node needs: a kernel node holds its kernel and its packed arguments; a host +// node holds its callback and the userData. The table is heap-allocated and +// retained on the graph as a user object, so the driver frees it -- and every +// owner in it -- when the graph is destroyed. +constexpr std::size_t SLOTS_PER_NODE = 2; +using NodeSlots = std::array; +using GraphSlotTable = std::map; + +// shared_ptr deleters for the payloads that need one. Typed handles convert to +// OpaqueHandle by assignment and reuse their own control block, so they need no +// deleter here. 
The Python deleter follows the owner-release pattern used by +// the stream/deviceptr handles above. +void py_deleter(const void* p) noexcept { + GILAcquireGuard gil; + if (gil.acquired()) { + Py_DECREF(const_cast(static_cast(p))); + } +} + +void free_deleter(const void* p) noexcept { + std::free(const_cast(p)); +} + +void destroy_graph_slot_table(void* table) noexcept { + delete static_cast(table); +} + +// Allocate a slot table and hand it to graph via a user object. On success the +// user object owns the table and MOVE transfers our reference into the graph, +// so the table lives exactly as long as the graph. This is best-effort: if the +// driver calls fail it returns nullptr without disturbing the error channel, so +// graph creation keeps its existing behavior. A missing table surfaces later as +// CUDA_ERROR_NOT_SUPPORTED from graph_set_slot, where callers check for it. +GraphSlotTable* register_graph_slot_table(CUgraph graph) { + if (!p_cuUserObjectCreate || !p_cuGraphRetainUserObject || !p_cuUserObjectRelease) { + return nullptr; + } + auto* table = new GraphSlotTable(); + CUuserObject user_obj = nullptr; + GILReleaseGuard gil; + if (p_cuUserObjectCreate(&user_obj, table, + reinterpret_cast(destroy_graph_slot_table), + 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC) != CUDA_SUCCESS) { + delete table; // no user object created; nothing else owns the table + return nullptr; + } + // The user object now owns the table; releasing its last reference frees it. 
+ if (p_cuGraphRetainUserObject(graph, user_obj, 1, CU_GRAPH_USER_OBJECT_MOVE) != CUDA_SUCCESS) { + p_cuUserObjectRelease(user_obj, 1); // drops refcount to 0 -> frees table + return nullptr; + } + return table; +} + struct GraphBox { CUgraph resource; - GraphHandle h_parent; // Keeps parent alive for child/branch graphs + GraphHandle h_parent; // Keeps parent alive for child/branch graphs + GraphSlotTable* slot_table; // Non-owning; owned by the graph's user object }; + +const GraphBox* get_box(const GraphHandle& h) { + const CUgraph* p = h.get(); + return reinterpret_cast( + reinterpret_cast(p) - offsetof(GraphBox, resource) + ); +} + } // namespace +OpaqueHandle make_opaque_py(PyObject* obj) { + Py_INCREF(obj); + return OpaqueHandle(static_cast(obj), py_deleter); +} + +OpaqueHandle make_opaque_malloc(void* buf) { + return OpaqueHandle(static_cast(buf), free_deleter); +} + GraphHandle create_graph_handle(CUgraph graph) { + GraphSlotTable* slot_table = register_graph_slot_table(graph); auto box = std::shared_ptr( - new GraphBox{graph, {}}, + new GraphBox{graph, {}, slot_table}, [](const GraphBox* b) { GILReleaseGuard gil; p_cuGraphDestroy(b->resource); @@ -1133,10 +1215,24 @@ GraphHandle create_graph_handle(CUgraph graph) { } GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent) { - auto box = std::make_shared(GraphBox{graph, h_parent}); + auto box = std::make_shared(GraphBox{graph, h_parent, nullptr}); return GraphHandle(box, &box->resource); } +void graph_set_slot(const GraphHandle& h_graph, CUgraphNode node, + unsigned int slot, OpaqueHandle owner) { + if (slot >= SLOTS_PER_NODE) { + err = CUDA_ERROR_INVALID_VALUE; + return; + } + GraphSlotTable* table = h_graph ? 
get_box(h_graph)->slot_table : nullptr; + if (!table) { + err = CUDA_ERROR_NOT_SUPPORTED; + return; + } + (*table)[node][slot] = std::move(owner); +} + // ============================================================================ // Graph Exec Handles // ============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 520e7f4763..d10d1fc494 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -109,6 +109,9 @@ extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel; // Graph extern decltype(&cuGraphDestroy) p_cuGraphDestroy; extern decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy; +extern decltype(&cuUserObjectCreate) p_cuUserObjectCreate; +extern decltype(&cuUserObjectRelease) p_cuUserObjectRelease; +extern decltype(&cuGraphRetainUserObject) p_cuGraphRetainUserObject; // Linker extern decltype(&cuLinkDestroy) p_cuLinkDestroy; @@ -466,6 +469,35 @@ GraphHandle create_graph_handle(CUgraph graph); // but h_parent will be prevented from destruction while this handle exists. GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent); +// ============================================================================ +// Graph slot attachments +// +// An owning graph carries a side table that keeps resources used by its nodes +// (kernel arguments, host callbacks, events, ...) alive for as long as the +// graph can execute. The table is retained on the CUgraph as a user object, so +// the driver releases it -- and everything attached through it -- when the +// graph is destroyed. The table layout is an internal detail; callers use the +// abstract API below. +// ============================================================================ + +// Type-erased shared owner of an attached resource. 
Typed handles such as +// EventHandle and KernelHandle convert to OpaqueHandle by assignment, reusing +// their existing control block; the helpers below build OpaqueHandles for the +// two cases that need a custom deleter. +using OpaqueHandle = std::shared_ptr; + +// Build an OpaqueHandle from a Python object: increments its refcount now and +// decrements it (under the GIL) on release. The caller must hold the GIL. +OpaqueHandle make_opaque_py(PyObject* obj); + +// Build an OpaqueHandle from a malloc'd buffer: std::free on release. +OpaqueHandle make_opaque_malloc(void* buf); + +// Attach owner to one of node's fixed slots on h_graph, replacing whatever was +// there. The owner lives until it is replaced or the graph is destroyed. +void graph_set_slot(const GraphHandle& h_graph, CUgraphNode node, + unsigned int slot, OpaqueHandle owner); + // ============================================================================ // Graph exec handle functions // ============================================================================ diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 4bb7156109..e8931fdde0 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -304,6 +304,9 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Graph void* p_cuGraphDestroy "reinterpret_cast(cuda_core::p_cuGraphDestroy)" void* p_cuGraphExecDestroy "reinterpret_cast(cuda_core::p_cuGraphExecDestroy)" + void* p_cuUserObjectCreate "reinterpret_cast(cuda_core::p_cuUserObjectCreate)" + void* p_cuUserObjectRelease "reinterpret_cast(cuda_core::p_cuUserObjectRelease)" + void* p_cuGraphRetainUserObject "reinterpret_cast(cuda_core::p_cuGraphRetainUserObject)" # Linker void* p_cuLinkDestroy "reinterpret_cast(cuda_core::p_cuLinkDestroy)" @@ -364,6 +367,7 @@ cdef void _init_driver_fn_pointers() noexcept: global p_cuMemPoolImportPointer global p_cuLibraryLoadFromFile, 
p_cuLibraryLoadData, p_cuLibraryUnload, p_cuLibraryGetKernel global p_cuGraphDestroy, p_cuGraphExecDestroy + global p_cuUserObjectCreate, p_cuUserObjectRelease, p_cuGraphRetainUserObject global p_cuLinkDestroy global p_cuGraphicsUnmapResources, p_cuGraphicsUnregisterResource global p_cuDevSmResourceSplit @@ -424,6 +428,9 @@ cdef void _init_driver_fn_pointers() noexcept: # Graph p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy") p_cuGraphExecDestroy = _get_driver_fn("cuGraphExecDestroy") + p_cuUserObjectCreate = _get_driver_fn("cuUserObjectCreate") + p_cuUserObjectRelease = _get_driver_fn("cuUserObjectRelease") + p_cuGraphRetainUserObject = _get_driver_fn("cuGraphRetainUserObject") # Linker p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") From effedd89ee329bae797c952d457e3415f7019fd5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 29 May 2026 14:52:48 -0700 Subject: [PATCH 3/8] cuda.core: wire graph node attachments to the slot table (phase 2) Replace the per-resource CUDA user objects attached at each graph node with the per-graph slot table from phase 1. Kernel, event-record, event-wait, and host-callback nodes now store their owning handles in node slots via graph_set_slot. Stream-captured callbacks map the just-captured host node from cuStreamGetCaptureInfo and use the same path; forked builders share the primary's graph handle so their attachments reach the same table. Refine the phase 1 surface to support this: the slot table is created lazily on first attachment, so conditional-branch bodies (ref handles) get one too, and graph_set_slot returns CUresult for HANDLE_RETURN-style error checking. Removes _attach_user_object and the per-type heap-copy deleters. 
--- cuda_core/cuda/core/_cpp/resource_handles.cpp | 95 ++++++++-------- cuda_core/cuda/core/_cpp/resource_handles.hpp | 20 ++-- cuda_core/cuda/core/_resource_handles.pxd | 13 +++ cuda_core/cuda/core/_resource_handles.pyx | 7 ++ cuda_core/cuda/core/graph/_graph_builder.pyx | 40 ++++++- cuda_core/cuda/core/graph/_graph_node.pyx | 52 ++++----- cuda_core/cuda/core/graph/_utils.pxd | 13 ++- cuda_core/cuda/core/graph/_utils.pyx | 102 ++++++------------ 8 files changed, 174 insertions(+), 168 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 097b2ec4d5..df3edd908c 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -1121,12 +1121,12 @@ LibraryHandle get_kernel_library(const KernelHandle& h) noexcept { namespace { -// Slot table layout (internal). Each owning graph maps CUgraphNode -> a -// fixed-size array of type-erased owners. The width is the most any single -// node needs: a kernel node holds its kernel and its packed arguments; a host -// node holds its callback and the userData. The table is heap-allocated and -// retained on the graph as a user object, so the driver frees it -- and every -// owner in it -- when the graph is destroyed. +// Slot table layout (internal). Each graph maps CUgraphNode -> a fixed-size +// array of type-erased owners. The width is the most any single node needs: a +// kernel node holds its kernel and its packed arguments; a host node holds its +// callback and the userData. The table is heap-allocated and retained on the +// graph as a user object, so the driver frees it -- and every owner in it -- +// when the graph is destroyed. constexpr std::size_t SLOTS_PER_NODE = 2; using NodeSlots = std::array; using GraphSlotTable = std::map; @@ -1150,37 +1150,10 @@ void destroy_graph_slot_table(void* table) noexcept { delete static_cast(table); } -// Allocate a slot table and hand it to graph via a user object. 
On success the -// user object owns the table and MOVE transfers our reference into the graph, -// so the table lives exactly as long as the graph. This is best-effort: if the -// driver calls fail it returns nullptr without disturbing the error channel, so -// graph creation keeps its existing behavior. A missing table surfaces later as -// CUDA_ERROR_NOT_SUPPORTED from graph_set_slot, where callers check for it. -GraphSlotTable* register_graph_slot_table(CUgraph graph) { - if (!p_cuUserObjectCreate || !p_cuGraphRetainUserObject || !p_cuUserObjectRelease) { - return nullptr; - } - auto* table = new GraphSlotTable(); - CUuserObject user_obj = nullptr; - GILReleaseGuard gil; - if (p_cuUserObjectCreate(&user_obj, table, - reinterpret_cast(destroy_graph_slot_table), - 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC) != CUDA_SUCCESS) { - delete table; // no user object created; nothing else owns the table - return nullptr; - } - // The user object now owns the table; releasing its last reference frees it. - if (p_cuGraphRetainUserObject(graph, user_obj, 1, CU_GRAPH_USER_OBJECT_MOVE) != CUDA_SUCCESS) { - p_cuUserObjectRelease(user_obj, 1); // drops refcount to 0 -> frees table - return nullptr; - } - return table; -} - struct GraphBox { CUgraph resource; - GraphHandle h_parent; // Keeps parent alive for child/branch graphs - GraphSlotTable* slot_table; // Non-owning; owned by the graph's user object + GraphHandle h_parent; // Keeps parent alive for child/branch graphs + mutable GraphSlotTable* slot_table = nullptr; // Lazily created; owned by the graph's user object }; const GraphBox* get_box(const GraphHandle& h) { @@ -1190,6 +1163,38 @@ const GraphBox* get_box(const GraphHandle& h) { ); } +// Return box's slot table, creating it on first use. The table is retained on +// the graph as a user object (MOVE transfers our only reference into the +// graph), so it -- and every owner in it -- is freed when the graph is +// destroyed. 
Returns nullptr if the driver lacks user-object support or a +// driver call fails; the cached pointer is non-owning. +GraphSlotTable* ensure_slot_table(const GraphBox* box) { + if (box->slot_table) { + return box->slot_table; + } + if (!p_cuUserObjectCreate || !p_cuGraphRetainUserObject || !p_cuUserObjectRelease) { + return nullptr; + } + auto* table = new GraphSlotTable(); + CUuserObject user_obj = nullptr; + { + GILReleaseGuard gil; + if (p_cuUserObjectCreate(&user_obj, table, + reinterpret_cast(destroy_graph_slot_table), + 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC) != CUDA_SUCCESS) { + delete table; // no user object created; nothing else owns the table + return nullptr; + } + if (p_cuGraphRetainUserObject(box->resource, user_obj, 1, + CU_GRAPH_USER_OBJECT_MOVE) != CUDA_SUCCESS) { + p_cuUserObjectRelease(user_obj, 1); // drops refcount to 0 -> frees table + return nullptr; + } + } + box->slot_table = table; // non-owning cache; the user object owns it + return table; +} + } // namespace OpaqueHandle make_opaque_py(PyObject* obj) { @@ -1202,9 +1207,8 @@ OpaqueHandle make_opaque_malloc(void* buf) { } GraphHandle create_graph_handle(CUgraph graph) { - GraphSlotTable* slot_table = register_graph_slot_table(graph); auto box = std::shared_ptr( - new GraphBox{graph, {}, slot_table}, + new GraphBox{graph, {}}, [](const GraphBox* b) { GILReleaseGuard gil; p_cuGraphDestroy(b->resource); @@ -1215,22 +1219,21 @@ GraphHandle create_graph_handle(CUgraph graph) { } GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent) { - auto box = std::make_shared(GraphBox{graph, h_parent, nullptr}); + auto box = std::make_shared(GraphBox{graph, h_parent}); return GraphHandle(box, &box->resource); } -void graph_set_slot(const GraphHandle& h_graph, CUgraphNode node, - unsigned int slot, OpaqueHandle owner) { - if (slot >= SLOTS_PER_NODE) { - err = CUDA_ERROR_INVALID_VALUE; - return; +CUresult graph_set_slot(const GraphHandle& h_graph, CUgraphNode node, + unsigned int 
slot, OpaqueHandle owner) { + if (!h_graph || slot >= SLOTS_PER_NODE) { + return CUDA_ERROR_INVALID_VALUE; } - GraphSlotTable* table = h_graph ? get_box(h_graph)->slot_table : nullptr; + GraphSlotTable* table = ensure_slot_table(get_box(h_graph)); if (!table) { - err = CUDA_ERROR_NOT_SUPPORTED; - return; + return CUDA_ERROR_NOT_SUPPORTED; } (*table)[node][slot] = std::move(owner); + return CUDA_SUCCESS; } // ============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index d10d1fc494..686d590b6e 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -472,12 +472,12 @@ GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent); // ============================================================================ // Graph slot attachments // -// An owning graph carries a side table that keeps resources used by its nodes -// (kernel arguments, host callbacks, events, ...) alive for as long as the -// graph can execute. The table is retained on the CUgraph as a user object, so -// the driver releases it -- and everything attached through it -- when the -// graph is destroyed. The table layout is an internal detail; callers use the -// abstract API below. +// A graph carries a side table that keeps resources used by its nodes (kernel +// arguments, host callbacks, events, ...) alive for as long as the graph can +// execute. The table is created on first use and retained on the CUgraph as a +// user object, so the driver releases it -- and everything attached through it +// -- when the graph is destroyed. The table layout is an internal detail; +// callers use the abstract API below. // ============================================================================ // Type-erased shared owner of an attached resource. 
Typed handles such as @@ -494,9 +494,11 @@ OpaqueHandle make_opaque_py(PyObject* obj); OpaqueHandle make_opaque_malloc(void* buf); // Attach owner to one of node's fixed slots on h_graph, replacing whatever was -// there. The owner lives until it is replaced or the graph is destroyed. -void graph_set_slot(const GraphHandle& h_graph, CUgraphNode node, - unsigned int slot, OpaqueHandle owner); +// there. The graph's slot table is created on first use. Returns CUDA_SUCCESS, +// or an error if slot is out of range or the graph cannot hold a table (e.g. +// the driver lacks user-object support). +CUresult graph_set_slot(const GraphHandle& h_graph, CUgraphNode node, + unsigned int slot, OpaqueHandle owner); // ============================================================================ // Graph exec handle functions diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 54b22ac602..aaf3e75222 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -57,6 +57,12 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const TexObjectValue] TexObjectHandle ctypedef shared_ptr[const SurfObjectValue] SurfObjectHandle + # Type-erased shared owner for resources attached to graph node slots. + # Typed handles above assign directly to an OpaqueHandle (shared control + # block); make_opaque_py / make_opaque_malloc cover the two cases needing a + # custom deleter. 
+ ctypedef shared_ptr[const void] OpaqueHandle + # as_cu() - extract the raw CUDA handle (inline C++) cydriver.CUcontext as_cu(ContextHandle h) noexcept nogil cydriver.CUgreenCtx as_cu(GreenCtxHandle h) noexcept nogil @@ -223,6 +229,13 @@ cdef LibraryHandle get_kernel_library(const KernelHandle& h) noexcept nogil cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil +# Graph slot attachments +cdef OpaqueHandle make_opaque_py(object obj) except+ +cdef OpaqueHandle make_opaque_malloc(void* buf) except+ +cdef cydriver.CUresult graph_set_slot( + const GraphHandle& h_graph, cydriver.CUgraphNode node, + unsigned int slot, OpaqueHandle owner) except+ + # Graph exec handles cdef GraphExecHandle create_graph_exec_handle(cydriver.CUgraphExec graph_exec) except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index e8931fdde0..9867f5cdcb 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -151,6 +151,13 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": GraphHandle create_graph_handle_ref "cuda_core::create_graph_handle_ref" ( cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil + # Graph slot attachments + OpaqueHandle make_opaque_py "cuda_core::make_opaque_py" (object obj) except+ + OpaqueHandle make_opaque_malloc "cuda_core::make_opaque_malloc" (void* buf) except+ + cydriver.CUresult graph_set_slot "cuda_core::graph_set_slot" ( + const GraphHandle& h_graph, cydriver.CUgraphNode node, + unsigned int slot, OpaqueHandle owner) except+ + # Graph exec handles GraphExecHandle create_graph_exec_handle "cuda_core::create_graph_exec_handle" ( cydriver.CUgraphExec graph_exec) except+ nogil diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index 
dfd106fb0d..010b038249 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -10,11 +10,13 @@ from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver from cuda.core.graph._graph_definition cimport GraphCondition, GraphDefinition -from cuda.core.graph._utils cimport _attach_host_callback_to_graph +from cuda.core.graph._utils cimport _resolve_host_callback from cuda.core._resource_handles cimport ( GraphHandle, + OpaqueHandle, as_cu, as_py, create_graph_exec_handle, create_graph_handle, create_graph_handle_ref, + graph_set_slot, ) from cuda.core._stream cimport Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -813,21 +815,29 @@ cdef class GraphBuilder: cdef Stream stream = self._stream cdef cydriver.CUstream c_stream = as_cu(stream._h_stream) cdef cydriver.CUstreamCaptureStatus capture_status - cdef cydriver.CUgraph c_graph = NULL with nogil: - _get_capture_info(c_stream, &capture_status, &c_graph) + _get_capture_info(c_stream, &capture_status, NULL) if capture_status != cydriver.CU_STREAM_CAPTURE_STATUS_ACTIVE: raise RuntimeError("Cannot add callback when graph is not being built") cdef cydriver.CUhostFn c_fn cdef void* c_user_data = NULL - _attach_host_callback_to_graph(c_graph, fn, user_data, &c_fn, &c_user_data) + cdef OpaqueHandle fn_owner, data_owner + _resolve_host_callback(fn, user_data, &c_fn, &c_user_data, &fn_owner, &data_owner) with nogil: HANDLE_RETURN(cydriver.cuLaunchHostFunc(c_stream, c_fn, c_user_data)) + # Capturing the host function added a node to the graph; it is now the + # stream's sole capture dependency. Key the callback's owners to it so + # they live in the graph's slot table like any explicitly-added node. 
+ cdef cydriver.CUgraphNode host_node = _capture_tail_node(c_stream) + HANDLE_RETURN(graph_set_slot(self._h_graph, host_node, 0, fn_owner)) + if data_owner: + HANDLE_RETURN(graph_set_slot(self._h_graph, host_node, 1, data_owner)) + cdef inline int GB_check_open(GraphBuilder gb) except -1: """Reject operations on a builder that has been closed. @@ -907,6 +917,28 @@ cdef inline int _get_capture_info( stream, status, NULL, graph, NULL, NULL)) +cdef inline cydriver.CUgraphNode _capture_tail_node(cydriver.CUstream stream) except *: + """Return the node a freshly-captured single-node operation left as the + stream's sole capture dependency (e.g. the host node added by + ``cuLaunchHostFunc``). The driver advances the stream's dependency set to + the new node, so the next captured op would depend on it. + """ + cdef cydriver.CUstreamCaptureStatus status + cdef const cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + with nogil: + IF CUDA_CORE_BUILD_MAJOR >= 13: + HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( + stream, &status, NULL, NULL, &deps, NULL, &num_deps)) + ELSE: + HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( + stream, &status, NULL, NULL, &deps, &num_deps)) + if num_deps != 1: + raise RuntimeError( + f"expected exactly one capture dependency after a host callback, got {num_deps}") + return deps[0] + + cdef inline tuple GB_cond_with_params(GraphBuilder gb, node_params): status, _, graph, *deps_info, num_dependencies = handle_return( driver.cuStreamGetCaptureInfo(gb._stream.handle) diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index 53145dd5e2..728bbdab51 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -9,8 +9,6 @@ from __future__ import annotations from collections.abc import Iterable from typing import TYPE_CHECKING -from cpython.ref cimport Py_INCREF - from libc.stddef cimport size_t from libc.stdint cimport uintptr_t from libc.string 
cimport memset as c_memset @@ -46,21 +44,19 @@ from cuda.core._resource_handles cimport ( GraphHandle, GraphNodeHandle, KernelHandle, + OpaqueHandle, as_cu, as_intptr, as_py, create_graph_handle_ref, create_graph_node_handle, graph_node_get_graph, + graph_set_slot, invalidate_graph_node, - py_object_user_object_destroy, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value -from cuda.core.graph._utils cimport ( - _attach_host_callback_to_graph, - _attach_user_object, -) +from cuda.core.graph._utils cimport _resolve_host_callback import weakref @@ -500,16 +496,6 @@ cdef class GraphNode: cydriver.CU_GRAPH_COND_TYPE_SWITCH, count, SwitchNode) -cdef void _destroy_event_handle_copy(void* ptr) noexcept nogil: - cdef EventHandle* p = ptr - del p - - -cdef void _destroy_kernel_handle_copy(void* ptr) noexcept nogil: - cdef KernelHandle* p = ptr - del p - - cdef inline ConditionalNode _make_conditional_node( GraphNode pred, GraphCondition condition, @@ -626,6 +612,7 @@ cdef inline KernelNode GN_launch(GraphNode self, LaunchConfig conf, Kernel ker, cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 + cdef OpaqueHandle owner if pred_node != NULL: deps = &pred_node @@ -648,14 +635,8 @@ cdef inline KernelNode GN_launch(GraphNode self, LaunchConfig conf, Kernel ker, HANDLE_RETURN(cydriver.cuGraphAddKernelNode( &new_node, as_cu(h_graph), deps, num_deps, &node_params)) - _attach_user_object(as_cu(h_graph), new KernelHandle(ker._h_kernel), - _destroy_kernel_handle_copy) - - cdef object kernel_args = ker_args.kernel_args - if kernel_args is not None: - Py_INCREF(kernel_args) - _attach_user_object(as_cu(h_graph), kernel_args, - py_object_user_object_destroy) + owner = ker._h_kernel + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner)) return _registered(KernelNode._create_with_params( create_graph_node_handle(new_node, h_graph), @@ -914,6 +895,7 @@ cdef inline EventRecordNode 
GN_record_event(GraphNode self, Event ev): cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 + cdef OpaqueHandle owner if pred_node != NULL: deps = &pred_node @@ -923,8 +905,8 @@ cdef inline EventRecordNode GN_record_event(GraphNode self, Event ev): HANDLE_RETURN(cydriver.cuGraphAddEventRecordNode( &new_node, as_cu(h_graph), deps, num_deps, as_cu(ev._h_event))) - _attach_user_object(as_cu(h_graph), new EventHandle(ev._h_event), - _destroy_event_handle_copy) + owner = ev._h_event + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner)) return _registered(EventRecordNode._create_with_params( create_graph_node_handle(new_node, h_graph), ev._h_event)) @@ -936,6 +918,7 @@ cdef inline EventWaitNode GN_wait_event(GraphNode self, Event ev): cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 + cdef OpaqueHandle owner if pred_node != NULL: deps = &pred_node @@ -945,8 +928,8 @@ cdef inline EventWaitNode GN_wait_event(GraphNode self, Event ev): HANDLE_RETURN(cydriver.cuGraphAddEventWaitNode( &new_node, as_cu(h_graph), deps, num_deps, as_cu(ev._h_event))) - _attach_user_object(as_cu(h_graph), new EventHandle(ev._h_event), - _destroy_event_handle_copy) + owner = ev._h_event + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner)) return _registered(EventWaitNode._create_with_params( create_graph_node_handle(new_node, h_graph), ev._h_event)) @@ -961,19 +944,24 @@ cdef inline HostCallbackNode GN_callback(GraphNode self, object fn, object user_ cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 + cdef OpaqueHandle fn_owner, data_owner if pred_node != NULL: deps = &pred_node num_deps = 1 - _attach_host_callback_to_graph( - as_cu(h_graph), fn, user_data, - &node_params.fn, &node_params.userData) + _resolve_host_callback( + fn, user_data, &node_params.fn, 
&node_params.userData, + &fn_owner, &data_owner) with nogil: HANDLE_RETURN(cydriver.cuGraphAddHostNode( &new_node, as_cu(h_graph), deps, num_deps, &node_params)) + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, fn_owner)) + if data_owner: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, data_owner)) + cdef object callable_obj = fn if not isinstance(fn, ct._CFuncPtr) else None return _registered(HostCallbackNode._create_with_params( create_graph_node_handle(new_node, h_graph), callable_obj, diff --git a/cuda_core/cuda/core/graph/_utils.pxd b/cuda_core/cuda/core/graph/_utils.pxd index 63fdb00ac4..fc77809c84 100644 --- a/cuda_core/cuda/core/graph/_utils.pxd +++ b/cuda_core/cuda/core/graph/_utils.pxd @@ -4,13 +4,12 @@ from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport OpaqueHandle -cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil -cdef void _attach_user_object( - cydriver.CUgraph graph, void* ptr, - cydriver.CUhostFn destroy) except * +cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil -cdef void _attach_host_callback_to_graph( - cydriver.CUgraph graph, object fn, object user_data, - cydriver.CUhostFn* out_fn, void** out_user_data) except * +cdef void _resolve_host_callback( + object fn, object user_data, + cydriver.CUhostFn* out_fn, void** out_user_data, + OpaqueHandle* out_fn_owner, OpaqueHandle* out_data_owner) except * diff --git a/cuda_core/cuda/core/graph/_utils.pyx b/cuda_core/cuda/core/graph/_utils.pyx index dfc2f4f3fe..1bfc8c6d04 100644 --- a/cuda_core/cuda/core/graph/_utils.pyx +++ b/cuda_core/cuda/core/graph/_utils.pyx @@ -2,16 +2,17 @@ # # SPDX-License-Identifier: Apache-2.0 -from cpython.ref cimport Py_INCREF - from libc.stdint cimport uintptr_t -from libc.stdlib cimport malloc, free +from libc.stdlib cimport malloc from libc.string cimport memcpy as c_memcpy from cuda.bindings cimport cydriver -from cuda.core._resource_handles cimport py_object_user_object_destroy -from 
cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._resource_handles cimport ( + OpaqueHandle, + make_opaque_malloc, + make_opaque_py, +) cdef void _py_host_trampoline(void* data) noexcept with gil: @@ -22,78 +23,39 @@ cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil: return fn == _py_host_trampoline -cdef void _attach_user_object( - cydriver.CUgraph graph, void* ptr, - cydriver.CUhostFn destroy) except *: - """Create a CUDA user object and transfer ownership to the graph. - - On success the graph owns the resource (via MOVE semantics). - On failure the destroy callback is invoked to clean up ptr, - then a CUDAError is raised — callers need no try/except. - """ - cdef cydriver.CUuserObject user_obj = NULL - cdef cydriver.CUresult ret - with nogil: - ret = cydriver.cuUserObjectCreate( - &user_obj, ptr, destroy, 1, - cydriver.CU_USER_OBJECT_NO_DESTRUCTOR_SYNC) - if ret == cydriver.CUDA_SUCCESS: - ret = cydriver.cuGraphRetainUserObject( - graph, user_obj, 1, cydriver.CU_GRAPH_USER_OBJECT_MOVE) - if ret != cydriver.CUDA_SUCCESS: - cydriver.cuUserObjectRelease(user_obj, 1) - if ret != cydriver.CUDA_SUCCESS: - if user_obj == NULL: - destroy(ptr) - HANDLE_RETURN(ret) - - -cdef void _attach_host_callback_to_graph( - cydriver.CUgraph graph, object fn, object user_data, - cydriver.CUhostFn* out_fn, void** out_user_data) except *: - """Resolve a Python callable or ctypes CFuncPtr into a C callback pair. +cdef void _resolve_host_callback( + object fn, object user_data, + cydriver.CUhostFn* out_fn, void** out_user_data, + OpaqueHandle* out_fn_owner, OpaqueHandle* out_data_owner) except *: + """Resolve a Python callable or ctypes CFuncPtr into a C callback pair and + the owners that keep it alive. - Handles Py_INCREF, user-object attachment for lifetime management, - and user_data copying. On return, *out_fn and *out_user_data are - ready to pass to cuGraphAddHostNode or cuLaunchHostFunc. 
+ On return ``*out_fn`` / ``*out_user_data`` are ready to pass to + ``cuGraphAddHostNode`` or ``cuLaunchHostFunc``. ``*out_fn_owner`` owns the + callback object; ``*out_data_owner`` owns a copied ``user_data`` buffer and + is left null otherwise. The caller attaches the owners to the node's graph + slots. """ import ctypes as ct - cdef void* fn_pyobj = NULL - if isinstance(fn, ct._CFuncPtr): - Py_INCREF(fn) - fn_pyobj = fn - _attach_user_object( - graph, fn_pyobj, - py_object_user_object_destroy) - out_fn[0] = ct.cast( - fn, ct.c_void_p).value - - if user_data is not None: - if isinstance(user_data, int): - out_user_data[0] = user_data - else: - buf = bytes(user_data) - out_user_data[0] = malloc(len(buf)) - if out_user_data[0] == NULL: - raise MemoryError( - "failed to allocate user_data buffer") - c_memcpy(out_user_data[0], buf, len(buf)) - _attach_user_object( - graph, out_user_data[0], - free) - else: + out_fn[0] = ct.cast(fn, ct.c_void_p).value + if user_data is None: out_user_data[0] = NULL + elif isinstance(user_data, int): + out_user_data[0] = user_data + else: + buf = bytes(user_data) + out_user_data[0] = malloc(len(buf)) + if out_user_data[0] == NULL: + raise MemoryError("failed to allocate user_data buffer") + c_memcpy(out_user_data[0], buf, len(buf)) + out_data_owner[0] = make_opaque_malloc(out_user_data[0]) else: if user_data is not None: raise ValueError( - "user_data is only supported with ctypes " - "function pointers") - Py_INCREF(fn) - fn_pyobj = fn + "user_data is only supported with ctypes function pointers") out_fn[0] = _py_host_trampoline - out_user_data[0] = fn_pyobj - _attach_user_object( - graph, fn_pyobj, - py_object_user_object_destroy) + out_user_data[0] = fn + + out_fn_owner[0] = make_opaque_py(fn) From 91cf159b7b9ca935662fbd034d892475f0c4bcf4 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 29 May 2026 16:17:46 -0700 Subject: [PATCH 4/8] cuda.core: rename graph host-callback module and retain kernel args Rename graph/_utils to 
graph/_host_callback now that it holds only host-callback machinery (the trampoline, _is_py_host_trampoline, and _resolve_host_callback), matching the concept-named files around it, and update the three cimport sites. Add _attach_host_callback_owners to share the "callback -> slot 0, user_data -> slot 1" attachment between the eager (GN_callback) and capture (add_callback) paths. Guard a zero-length user_data copy against malloc(0) and hoist the per-call ctypes import. Attach the kernel-argument tuple to the kernel node's slot 1 so the Python objects backing the arguments -- notably device Buffers -- outlive the graph. The driver copies argument values into the node at add time but does not keep the referenced device memory alive, so without this a kernel node could be left with a stale device pointer. This is the slot-table port of the user-object fix from #2041 (currently only on main). --- cuda_core/cuda/core/graph/_graph_builder.pyx | 7 ++-- cuda_core/cuda/core/graph/_graph_node.pyx | 21 ++++++++---- .../graph/{_utils.pxd => _host_callback.pxd} | 6 +++- .../graph/{_utils.pyx => _host_callback.pyx} | 32 +++++++++++++++---- cuda_core/cuda/core/graph/_subclasses.pyx | 2 +- 5 files changed, 48 insertions(+), 20 deletions(-) rename cuda_core/cuda/core/graph/{_utils.pxd => _host_callback.pxd} (65%) rename cuda_core/cuda/core/graph/{_utils.pyx => _host_callback.pyx} (65%) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index 010b038249..b449017d3f 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -10,13 +10,12 @@ from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver from cuda.core.graph._graph_definition cimport GraphCondition, GraphDefinition -from cuda.core.graph._utils cimport _resolve_host_callback +from cuda.core.graph._host_callback cimport _attach_host_callback_owners, _resolve_host_callback from cuda.core._resource_handles 
cimport ( GraphHandle, OpaqueHandle, as_cu, as_py, create_graph_exec_handle, create_graph_handle, create_graph_handle_ref, - graph_set_slot, ) from cuda.core._stream cimport Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -834,9 +833,7 @@ cdef class GraphBuilder: # stream's sole capture dependency. Key the callback's owners to it so # they live in the graph's slot table like any explicitly-added node. cdef cydriver.CUgraphNode host_node = _capture_tail_node(c_stream) - HANDLE_RETURN(graph_set_slot(self._h_graph, host_node, 0, fn_owner)) - if data_owner: - HANDLE_RETURN(graph_set_slot(self._h_graph, host_node, 1, data_owner)) + _attach_host_callback_owners(self._h_graph, host_node, fn_owner, data_owner) cdef inline int GB_check_open(GraphBuilder gb) except -1: diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index 728bbdab51..ae08898ff9 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -53,11 +53,16 @@ from cuda.core._resource_handles cimport ( graph_node_get_graph, graph_set_slot, invalidate_graph_node, + make_opaque_py, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value -from cuda.core.graph._utils cimport _resolve_host_callback +from cuda.core.graph._host_callback cimport ( + _attach_host_callback_owners, + _resolve_host_callback, +) +import ctypes as ct import weakref from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy @@ -635,8 +640,16 @@ cdef inline KernelNode GN_launch(GraphNode self, LaunchConfig conf, Kernel ker, HANDLE_RETURN(cydriver.cuGraphAddKernelNode( &new_node, as_cu(h_graph), deps, num_deps, &node_params)) + # Slot 0 keeps the kernel loaded; slot 1 keeps the Python kernel-argument + # objects (notably device Buffers) alive for the graph's lifetime. The + # driver copies argument values into the node at add time but does not own + # the device memory they reference. 
owner = ker._h_kernel HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, owner)) + kernel_args = ker_args.kernel_args + if kernel_args is not None: + owner = make_opaque_py(kernel_args) + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, owner)) return _registered(KernelNode._create_with_params( create_graph_node_handle(new_node, h_graph), @@ -936,8 +949,6 @@ cdef inline EventWaitNode GN_wait_event(GraphNode self, Event ev): cdef inline HostCallbackNode GN_callback(GraphNode self, object fn, object user_data): - import ctypes as ct - cdef cydriver.CUDA_HOST_NODE_PARAMS node_params cdef cydriver.CUgraphNode new_node = NULL cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) @@ -958,9 +969,7 @@ cdef inline HostCallbackNode GN_callback(GraphNode self, object fn, object user_ HANDLE_RETURN(cydriver.cuGraphAddHostNode( &new_node, as_cu(h_graph), deps, num_deps, &node_params)) - HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, fn_owner)) - if data_owner: - HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, data_owner)) + _attach_host_callback_owners(h_graph, new_node, fn_owner, data_owner) cdef object callable_obj = fn if not isinstance(fn, ct._CFuncPtr) else None return _registered(HostCallbackNode._create_with_params( diff --git a/cuda_core/cuda/core/graph/_utils.pxd b/cuda_core/cuda/core/graph/_host_callback.pxd similarity index 65% rename from cuda_core/cuda/core/graph/_utils.pxd rename to cuda_core/cuda/core/graph/_host_callback.pxd index fc77809c84..dac249c74e 100644 --- a/cuda_core/cuda/core/graph/_utils.pxd +++ b/cuda_core/cuda/core/graph/_host_callback.pxd @@ -4,7 +4,7 @@ from cuda.bindings cimport cydriver -from cuda.core._resource_handles cimport OpaqueHandle +from cuda.core._resource_handles cimport GraphHandle, OpaqueHandle cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil @@ -13,3 +13,7 @@ cdef void _resolve_host_callback( object fn, object user_data, cydriver.CUhostFn* out_fn, void** out_user_data, OpaqueHandle* 
out_fn_owner, OpaqueHandle* out_data_owner) except * + +cdef int _attach_host_callback_owners( + const GraphHandle& h_graph, cydriver.CUgraphNode node, + OpaqueHandle fn_owner, OpaqueHandle data_owner) except -1 diff --git a/cuda_core/cuda/core/graph/_utils.pyx b/cuda_core/cuda/core/graph/_host_callback.pyx similarity index 65% rename from cuda_core/cuda/core/graph/_utils.pyx rename to cuda_core/cuda/core/graph/_host_callback.pyx index 1bfc8c6d04..bed2d8152f 100644 --- a/cuda_core/cuda/core/graph/_utils.pyx +++ b/cuda_core/cuda/core/graph/_host_callback.pyx @@ -9,10 +9,15 @@ from libc.string cimport memcpy as c_memcpy from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport ( + GraphHandle, OpaqueHandle, + graph_set_slot, make_opaque_malloc, make_opaque_py, ) +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +import ctypes as ct cdef void _py_host_trampoline(void* data) noexcept with gil: @@ -36,8 +41,6 @@ cdef void _resolve_host_callback( is left null otherwise. The caller attaches the owners to the node's graph slots. 
""" - import ctypes as ct - if isinstance(fn, ct._CFuncPtr): out_fn[0] = ct.cast(fn, ct.c_void_p).value if user_data is None: @@ -46,11 +49,14 @@ cdef void _resolve_host_callback( out_user_data[0] = user_data else: buf = bytes(user_data) - out_user_data[0] = malloc(len(buf)) - if out_user_data[0] == NULL: - raise MemoryError("failed to allocate user_data buffer") - c_memcpy(out_user_data[0], buf, len(buf)) - out_data_owner[0] = make_opaque_malloc(out_user_data[0]) + if len(buf): + out_user_data[0] = malloc(len(buf)) + if out_user_data[0] == NULL: + raise MemoryError("failed to allocate user_data buffer") + c_memcpy(out_user_data[0], buf, len(buf)) + out_data_owner[0] = make_opaque_malloc(out_user_data[0]) + else: + out_user_data[0] = NULL else: if user_data is not None: raise ValueError( @@ -59,3 +65,15 @@ cdef void _resolve_host_callback( out_user_data[0] = fn out_fn_owner[0] = make_opaque_py(fn) + + +cdef int _attach_host_callback_owners( + const GraphHandle& h_graph, cydriver.CUgraphNode node, + OpaqueHandle fn_owner, OpaqueHandle data_owner) except -1: + """Attach a resolved host callback's owners to its node's graph slots: the + callback in slot 0 and any copied ``user_data`` buffer in slot 1. 
+ """ + HANDLE_RETURN(graph_set_slot(h_graph, node, 0, fn_owner)) + if data_owner: + HANDLE_RETURN(graph_set_slot(h_graph, node, 1, data_owner)) + return 0 diff --git a/cuda_core/cuda/core/graph/_subclasses.pyx b/cuda_core/cuda/core/graph/_subclasses.pyx index 85a382197f..a393dfff69 100644 --- a/cuda_core/cuda/core/graph/_subclasses.pyx +++ b/cuda_core/cuda/core/graph/_subclasses.pyx @@ -30,7 +30,7 @@ from cuda.core._resource_handles cimport ( ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core.graph._utils cimport _is_py_host_trampoline +from cuda.core.graph._host_callback cimport _is_py_host_trampoline from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core.typing import GraphConditionalType From cb3adc295574f4e02a75169162bbb49c241593a2 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 29 May 2026 17:01:35 -0700 Subject: [PATCH 5/8] cuda.core: accept Buffer in graph memcpy/memset and retain operands GraphNode.memcpy/memset (and the GraphDefinition pass-throughs) now accept a Buffer or a raw int for each address. A new _resolve_ptr helper reads the device pointer from a Buffer and returns it as an owner; a raw int casts through with no owner. GN_memcpy attaches a Buffer dst to slot 0 and src to slot 1, and GN_memset attaches dst to slot 0, so buffers passed by value outlive the graph. Raw ints behave exactly as before (caller owns the lifetime), so this is backward compatible. Document the stream-capture lifetime contract on GraphBuilder: operations recorded during capture reference caller-owned memory and are not retained, unlike explicit GraphDefinition construction. Host callbacks are the one exception, retained on both the capture and explicit paths. 
--- cuda_core/cuda/core/graph/_graph_builder.pyx | 12 ++++ cuda_core/cuda/core/graph/_graph_node.pyx | 63 ++++++++++++++++---- 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index b449017d3f..3feecc60f1 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -244,6 +244,18 @@ cdef class GraphBuilder: to ambiguity. New graph builders should instead be created through a :obj:`~_device.Device`, or a :obj:`~_stream.stream` object. + .. note:: + + Operations recorded during capture reference your memory but do not + take ownership of it. As with ordinary stream work, you must keep the + operands alive for as long as the completed graph may execute -- for + example, the :obj:`~_memory.Buffer` objects passed to :func:`~launch` + or :meth:`~_memory.Buffer.copy_to`. Host callbacks added with + :meth:`callback` are the exception: the callable (and any copied + ``user_data``) are retained for the graph's lifetime. This differs from + building a graph explicitly with :class:`~graph.GraphDefinition`, which + retains the operands it is given. 
+ """ def __init__(self): diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index ae08898ff9..5759edbfb2 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -20,6 +20,7 @@ from cuda.bindings cimport cydriver from cuda.core._event cimport Event from cuda.core._kernel_arg_handler cimport ParamHolder from cuda.core._launch_config cimport LaunchConfig +from cuda.core._memory._buffer cimport Buffer from cuda.core._module cimport Kernel from cuda.core.graph._graph_definition cimport GraphCondition, GraphDefinition from cuda.core.graph._subclasses cimport ( @@ -283,13 +284,15 @@ cdef class GraphNode: """ return GN_free(self, dptr) - def memset(self, dst: int, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: + def memset(self, dst: Buffer | int, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: """Add a memset node depending on this node. Parameters ---------- - dst : int - Destination device pointer. + dst : Buffer or int + Destination. If a :class:`Buffer` is given it is retained for the + graph's lifetime; if a raw device pointer (int) is given it is used + as-is and the caller must keep the underlying memory alive. value : int or buffer-protocol object Fill value. int for 1-byte fill (range [0, 256)), or buffer-protocol object of 1, 2, or 4 bytes. @@ -305,12 +308,14 @@ cdef class GraphNode: MemsetNode A new MemsetNode representing the memset operation. 
""" + cdef cydriver.CUdeviceptr c_dst cdef unsigned int val cdef unsigned int elem_size + dst_owner = _resolve_ptr(dst, &c_dst) val, elem_size = _parse_fill_value(value) - return GN_memset(self, dst, val, elem_size, width, height, pitch) + return GN_memset(self, c_dst, dst_owner, val, elem_size, width, height, pitch) - def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode: + def memcpy(self, dst: Buffer | int, src: Buffer | int, size_t size) -> MemcpyNode: """Add a memcpy node depending on this node. Copies ``size`` bytes from ``src`` to ``dst``. Memory types are @@ -319,10 +324,12 @@ cdef class GraphNode: Parameters ---------- - dst : int - Destination pointer (device or pinned host). - src : int - Source pointer (device or pinned host). + dst : Buffer or int + Destination (device or pinned host). If a :class:`Buffer` is given + it is retained for the graph's lifetime; a raw pointer (int) is used + as-is and the caller must keep the underlying memory alive. + src : Buffer or int + Source (device or pinned host). Retained like ``dst`` when a Buffer. size : int Number of bytes to copy. @@ -331,7 +338,11 @@ cdef class GraphNode: MemcpyNode A new MemcpyNode representing the copy operation. """ - return GN_memcpy(self, dst, src, size) + cdef cydriver.CUdeviceptr c_dst + cdef cydriver.CUdeviceptr c_src + dst_owner = _resolve_ptr(dst, &c_dst) + src_owner = _resolve_ptr(src, &c_src) + return GN_memcpy(self, c_dst, dst_owner, c_src, src_owner, size) def embed(self, child: GraphDefinition) -> ChildGraphNode: """Add a child graph node depending on this node. @@ -778,8 +789,24 @@ cdef inline FreeNode GN_free(GraphNode self, cydriver.CUdeviceptr c_dptr): return _registered(FreeNode._create_with_params(create_graph_node_handle(new_node, h_graph), c_dptr)) +cdef inline object _resolve_ptr(object value, cydriver.CUdeviceptr* out_ptr): + """Resolve a memcpy/memset operand into a device pointer. + + ``value`` is a :class:`Buffer` or a raw integer address. 
For a Buffer the + device pointer is read from the buffer and the buffer is returned so the + caller can retain it on the node's slot table for the graph's lifetime. For + a raw integer no owner is returned and the caller is responsible for keeping + the underlying memory alive. + """ + if isinstance(value, Buffer): + out_ptr[0] = as_cu((value)._h_ptr) + return value + out_ptr[0] = value + return None + + cdef inline MemsetNode GN_memset( - GraphNode self, cydriver.CUdeviceptr c_dst, + GraphNode self, cydriver.CUdeviceptr c_dst, object dst_owner, unsigned int val, unsigned int elem_size, size_t width, size_t height, size_t pitch): cdef cydriver.CUDA_MEMSET_NODE_PARAMS memset_params @@ -810,14 +837,18 @@ cdef inline MemsetNode GN_memset( &new_node, as_cu(h_graph), deps, num_deps, &memset_params, ctx)) + # Retain a Buffer destination for the graph's lifetime (slot 0). + if dst_owner is not None: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, make_opaque_py(dst_owner))) + return _registered(MemsetNode._create_with_params( create_graph_node_handle(new_node, h_graph), c_dst, val, elem_size, width, height, pitch)) cdef inline MemcpyNode GN_memcpy( - GraphNode self, cydriver.CUdeviceptr c_dst, - cydriver.CUdeviceptr c_src, size_t size): + GraphNode self, cydriver.CUdeviceptr c_dst, object dst_owner, + cydriver.CUdeviceptr c_src, object src_owner, size_t size): cdef unsigned int dst_mem_type = cydriver.CU_MEMORYTYPE_DEVICE cdef unsigned int src_mem_type = cydriver.CU_MEMORYTYPE_DEVICE cdef cydriver.CUresult ret @@ -871,6 +902,12 @@ cdef inline MemcpyNode GN_memcpy( HANDLE_RETURN(cydriver.cuGraphAddMemcpyNode( &new_node, as_cu(h_graph), deps, num_deps, ¶ms, ctx)) + # Retain Buffer operands for the graph's lifetime (dst -> slot 0, src -> slot 1). 
+ if dst_owner is not None: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, make_opaque_py(dst_owner))) + if src_owner is not None: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, make_opaque_py(src_owner))) + return _registered(MemcpyNode._create_with_params( create_graph_node_handle(new_node, h_graph), c_dst, c_src, size, c_dst_type, c_src_type)) From 813058d0153a278a326e13b9ea112083870956d0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 3 Jun 2026 11:51:32 -0700 Subject: [PATCH 6/8] cuda.core: add slot-table lifetime tests for Buffer memcpy/memset and capture callbacks Cover GraphDefinition memset/memcpy with Buffer operands (including clone), and GraphBuilder capture host callbacks retained after dropping Python refs. --- cuda_core/tests/graph/test_graph_builder.py | 49 +++++++++ .../graph/test_graph_definition_lifetime.py | 101 ++++++++++++++++++ 2 files changed, 150 insertions(+) diff --git a/cuda_core/tests/graph/test_graph_builder.py b/cuda_core/tests/graph/test_graph_builder.py index efb70fe75d..db094495ac 100644 --- a/cuda_core/tests/graph/test_graph_builder.py +++ b/cuda_core/tests/graph/test_graph_builder.py @@ -3,6 +3,8 @@ """GraphBuilder stream capture tests.""" +import gc + import numpy as np import pytest from helpers.graph_kernels import compile_common_kernels, compile_conditional_kernels @@ -290,6 +292,53 @@ def read_byte(data): assert result[0] == 0xAB +def test_graph_capture_callback_python_survives_del(init_cuda): + """Captured host callback is retained in the graph slot table after del.""" + called = [False] + + def my_callback(): + called[0] = True + + launch_stream = Device().create_stream() + gb = launch_stream.create_graph_builder().begin_building() + gb.callback(my_callback) + graph = gb.end_building().complete() + + del my_callback + gc.collect() + + graph.launch(launch_stream) + launch_stream.sync() + + assert called[0] + + +def test_graph_capture_callback_ctypes_user_data_survives_del(init_cuda): + """Captured ctypes 
callback and copied user_data survive after del.""" + import ctypes + + CALLBACK = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + result = [0] + + @CALLBACK + def read_byte(data): + result[0] = ctypes.cast(data, ctypes.POINTER(ctypes.c_uint8))[0] + + payload = bytes([0xAB]) + launch_stream = Device().create_stream() + gb = launch_stream.create_graph_builder().begin_building() + gb.callback(read_byte, user_data=payload) + graph = gb.end_building().complete() + + del read_byte, payload + gc.collect() + + graph.launch(launch_stream) + launch_stream.sync() + + assert result[0] == 0xAB + + @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+") def test_graph_child_graph(init_cuda): mod = compile_common_kernels() diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py index 40bc6f3c44..9be46f96f3 100644 --- a/cuda_core/tests/graph/test_graph_definition_lifetime.py +++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py @@ -594,3 +594,104 @@ def test_kernel_args_survive_graph_clone(init_cuda): out = (ctypes.c_int * 1)(0) handle_return(driver.cuMemcpyDtoH(out, dptr, ctypes.sizeof(ctypes.c_int))) assert out[0] == 1 + + +# ============================================================================= +# Memcpy/memset Buffer lifetime — operands passed as Buffer objects +# ============================================================================= + + +def test_memset_buffer_lifetime(init_cuda): + """Buffer passed as memset destination is kept alive after the Python ref drops.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + buf_weak = weakref.ref(buf) + dptr = int(buf.handle) + + g = GraphDefinition() + g.memset(buf, 0xAB, 4) + + del buf + gc.collect() + assert buf_weak() is not 
None + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dptr, 4)) + assert list(out) == [0xAB] * 4 + + del g + _wait_until(lambda: buf_weak() is None) + + +def test_memcpy_buffer_lifetime(init_cuda): + """Source and destination Buffers are kept alive for a memcpy node.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + src_weak = weakref.ref(src) + dst_weak = weakref.ref(dst) + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst, src, 4) + + del src, dst + gc.collect() + assert src_weak() is not None + assert dst_weak() is not None + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + del g + _wait_until(lambda: src_weak() is None and dst_weak() is None) + + +def test_memcpy_buffers_survive_graph_clone(init_cuda): + """Cloned graph keeps memcpy operand Buffers alive via CUDA user objects.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst, src, 4) + cloned_cu_graph = handle_return(driver.cuGraphClone(driver.CUgraph(g.handle))) + + del src, dst, g + gc.collect() + + graph_exec = handle_return(driver.cuGraphInstantiate(cloned_cu_graph, 0)) + stream = dev.create_stream() + 
handle_return(driver.cuGraphLaunch(graph_exec, driver.CUstream(int(stream.handle)))) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 From 10268ff9f1927f1376d694bb7e2604b35a9673c2 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 3 Jun 2026 12:21:31 -0700 Subject: [PATCH 7/8] cuda.core: add explicit dst/src_owner for graph memcpy/memset Keyword-only *_owner args retain arbitrary objects for raw pointer operands; Buffer+owner combinations are rejected. Strengthen owner tests with weakref retention checks and add src_owner rejection test. --- .../cuda/core/graph/_graph_definition.pyx | 15 +- cuda_core/cuda/core/graph/_graph_node.pyx | 109 +++++++++++---- .../graph/test_graph_definition_lifetime.py | 132 ++++++++++++++++++ 3 files changed, 222 insertions(+), 34 deletions(-) diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyx b/cuda_core/cuda/core/graph/_graph_definition.pyx index 1ec5697832..85e19a9666 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pyx +++ b/cuda_core/cuda/core/graph/_graph_definition.pyx @@ -33,6 +33,7 @@ if TYPE_CHECKING: from cuda.core._device import Device from cuda.core._event import Event from cuda.core._launch_config import LaunchConfig + from cuda.core._memory._buffer import Buffer from cuda.core._module import Kernel from cuda.core.graph._graph_builder import ( Graph, @@ -156,17 +157,21 @@ cdef class GraphDefinition: def memset( self, - dst: int, + dst: Buffer | int, value, size_t width, + *, size_t height=1, - size_t pitch=0 + size_t pitch=0, + dst_owner=None, ) -> MemsetNode: """Add an entry-point memset node (no dependencies). See :meth:`GraphNode.memset` for full documentation. 
""" - return self._entry.memset(dst, value, width, height, pitch) + return self._entry.memset( + dst, value, width, height=height, pitch=pitch, dst_owner=dst_owner + ) def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode: """Add an entry-point kernel launch node (no dependencies). @@ -200,12 +205,12 @@ cdef class GraphDefinition: """ return self._entry.join(*nodes) - def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode: + def memcpy(self, dst: Buffer | int, src: Buffer | int, size_t size, *, dst_owner=None, src_owner=None) -> MemcpyNode: """Add an entry-point memcpy node (no dependencies). See :meth:`GraphNode.memcpy` for full documentation. """ - return self._entry.memcpy(dst, src, size) + return self._entry.memcpy(dst, src, size, dst_owner=dst_owner, src_owner=src_owner) def embed(self, child: GraphDefinition) -> ChildGraphNode: """Add an entry-point child graph node (no dependencies). diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index 5759edbfb2..17fb5685a1 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -284,15 +284,25 @@ cdef class GraphNode: """ return GN_free(self, dptr) - def memset(self, dst: Buffer | int, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: + def memset( + self, + dst: Buffer | int, + value, + size_t width, + *, + size_t height=1, + size_t pitch=0, + dst_owner=None, + ) -> MemsetNode: """Add a memset node depending on this node. Parameters ---------- dst : Buffer or int - Destination. If a :class:`Buffer` is given it is retained for the - graph's lifetime; if a raw device pointer (int) is given it is used - as-is and the caller must keep the underlying memory alive. + Destination. A :class:`Buffer` is retained for the graph's lifetime. 
+ A raw pointer (``int``) is used as-is; the caller must keep the + underlying memory alive, or supply ``dst_owner`` to have the graph + retain it. value : int or buffer-protocol object Fill value. int for 1-byte fill (range [0, 256)), or buffer-protocol object of 1, 2, or 4 bytes. @@ -302,20 +312,37 @@ cdef class GraphNode: Number of rows (default 1). pitch : int, optional Pitch of destination in bytes (default 0, unused if height is 1). + dst_owner : object, optional + Object retained for the graph's lifetime when ``dst`` is a raw + pointer. Must not be passed when ``dst`` is a :class:`Buffer`, which + is retained automatically. Returns ------- MemsetNode A new MemsetNode representing the memset operation. + + Raises + ------ + ValueError + If ``dst_owner`` is given together with a :class:`Buffer` ``dst``. """ cdef cydriver.CUdeviceptr c_dst cdef unsigned int val cdef unsigned int elem_size - dst_owner = _resolve_ptr(dst, &c_dst) + dst_keepalive = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst) val, elem_size = _parse_fill_value(value) - return GN_memset(self, c_dst, dst_owner, val, elem_size, width, height, pitch) - - def memcpy(self, dst: Buffer | int, src: Buffer | int, size_t size) -> MemcpyNode: + return GN_memset(self, c_dst, dst_keepalive, val, elem_size, width, height, pitch) + + def memcpy( + self, + dst: Buffer | int, + src: Buffer | int, + size_t size, + *, + dst_owner=None, + src_owner=None, + ) -> MemcpyNode: """Add a memcpy node depending on this node. Copies ``size`` bytes from ``src`` to ``dst``. Memory types are @@ -325,24 +352,40 @@ cdef class GraphNode: Parameters ---------- dst : Buffer or int - Destination (device or pinned host). If a :class:`Buffer` is given - it is retained for the graph's lifetime; a raw pointer (int) is used - as-is and the caller must keep the underlying memory alive. + Destination (device or pinned host). A :class:`Buffer` is retained + for the graph's lifetime. 
A raw pointer (``int``) is used as-is; the + caller must keep the underlying memory alive, or supply ``dst_owner`` + to have the graph retain it. src : Buffer or int - Source (device or pinned host). Retained like ``dst`` when a Buffer. + Source (device or pinned host). Same retention rules as ``dst``; + use ``src_owner`` for a raw pointer. size : int Number of bytes to copy. + dst_owner : object, optional + Object retained for the graph's lifetime when ``dst`` is a raw + pointer. Must not be passed when ``dst`` is a :class:`Buffer`, which + is retained automatically. + src_owner : object, optional + Object retained for the graph's lifetime when ``src`` is a raw + pointer. Must not be passed when ``src`` is a :class:`Buffer`, which + is retained automatically. Returns ------- MemcpyNode A new MemcpyNode representing the copy operation. + + Raises + ------ + ValueError + If ``dst_owner`` or ``src_owner`` is given together with a + :class:`Buffer` ``dst`` or ``src`` respectively. """ cdef cydriver.CUdeviceptr c_dst cdef cydriver.CUdeviceptr c_src - dst_owner = _resolve_ptr(dst, &c_dst) - src_owner = _resolve_ptr(src, &c_src) - return GN_memcpy(self, c_dst, dst_owner, c_src, src_owner, size) + dst_keepalive = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst) + src_keepalive = _resolve_memcpy_operand(src, src_owner, "src", &c_src) + return GN_memcpy(self, c_dst, dst_keepalive, c_src, src_keepalive, size) def embed(self, child: GraphDefinition) -> ChildGraphNode: """Add a child graph node depending on this node. @@ -789,20 +832,28 @@ cdef inline FreeNode GN_free(GraphNode self, cydriver.CUdeviceptr c_dptr): return _registered(FreeNode._create_with_params(create_graph_node_handle(new_node, h_graph), c_dptr)) -cdef inline object _resolve_ptr(object value, cydriver.CUdeviceptr* out_ptr): - """Resolve a memcpy/memset operand into a device pointer. 
+cdef inline object _resolve_memcpy_operand( + object operand, object owner, str side, cydriver.CUdeviceptr* out_ptr): + """Resolve a memcpy/memset operand to a device pointer and an owner. + + ``operand`` is a :class:`Buffer` or a raw integer address; its device + pointer is written to ``out_ptr``. Returns the object to retain on the + graph's slot table for the graph's lifetime: the Buffer itself, or + ``owner`` (possibly ``None``) for a raw pointer. ``side`` is ``"dst"`` or + ``"src"`` and is used only to compose the error message. - ``value`` is a :class:`Buffer` or a raw integer address. For a Buffer the - device pointer is read from the buffer and the buffer is returned so the - caller can retain it on the node's slot table for the graph's lifetime. For - a raw integer no owner is returned and the caller is responsible for keeping - the underlying memory alive. + Raises ValueError if ``operand`` is a Buffer and ``owner`` is not None, + since a Buffer is already retained automatically. """ - if isinstance(value, Buffer): - out_ptr[0] = as_cu((<Buffer>value)._h_ptr) - return value - out_ptr[0] = value - return None + if isinstance(operand, Buffer): + if owner is not None: + raise ValueError( + f"{side}_owner cannot be used when {side} is a Buffer" + ) + out_ptr[0] = as_cu((<Buffer>operand)._h_ptr) + return operand + out_ptr[0] = operand + return owner cdef inline MemsetNode GN_memset( @@ -837,7 +888,7 @@ cdef inline MemsetNode GN_memset( &new_node, as_cu(h_graph), deps, num_deps, &memset_params, ctx)) - # Retain a Buffer destination for the graph's lifetime (slot 0). + # Retain the destination owner for the graph's lifetime (slot 0). if dst_owner is not None: HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, make_opaque_py(dst_owner))) @@ -902,7 +953,7 @@ cdef inline MemcpyNode GN_memcpy( HANDLE_RETURN(cydriver.cuGraphAddMemcpyNode( &new_node, as_cu(h_graph), deps, num_deps, &params, ctx)) - # Retain Buffer operands for the graph's lifetime (dst -> slot 0, src -> slot 1).
+ # Retain operand owners for the graph's lifetime (dst -> slot 0, src -> slot 1). if dst_owner is not None: HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, make_opaque_py(dst_owner))) if src_owner is not None: diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py index 9be46f96f3..8fd7d1ca89 100644 --- a/cuda_core/tests/graph/test_graph_definition_lifetime.py +++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py @@ -695,3 +695,135 @@ def test_memcpy_buffers_survive_graph_clone(init_cuda): out = (ctypes.c_uint8 * 4)(0) handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) assert list(out) == [0xCD] * 4 + + +# ============================================================================= +# Explicit dst_owner / src_owner for raw pointer operands +# ============================================================================= + + +def test_memset_raw_ptr_with_dst_owner(init_cuda): + """Raw dst address plus dst_owner: the graph retains the owner until destroyed.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + buf_weak = weakref.ref(buf) + dptr = int(buf.handle) + + g = GraphDefinition() + g.memset(dptr, 0xAB, 4, dst_owner=buf) + + del buf + gc.collect() + assert buf_weak() is not None # graph retains the explicit owner + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dptr, 4)) + assert list(out) == [0xAB] * 4 + + del g + _wait_until(lambda: buf_weak() is None) + + +def test_memcpy_raw_ptrs_with_owners(init_cuda): + """Raw src/dst addresses: the graph retains both owners until destroyed.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = 
DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + src_weak = weakref.ref(src) + dst_weak = weakref.ref(dst) + src_dptr = int(src.handle) + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst_dptr, src_dptr, 4, dst_owner=dst, src_owner=src) + + del src, dst + gc.collect() + assert src_weak() is not None and dst_weak() is not None # both owners retained + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + del g + _wait_until(lambda: src_weak() is None and dst_weak() is None) + + +def test_memcpy_mixed_buffer_and_raw_owner(init_cuda): + """Buffer dst is auto-retained; raw src uses src_owner. Both survive until destroyed.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + src_weak = weakref.ref(src) + dst_weak = weakref.ref(dst) + src_dptr = int(src.handle) + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst, src_dptr, 4, src_owner=src) + + del src, dst + gc.collect() + assert src_weak() is not None and dst_weak() is not None # explicit + auto owner + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + del g + _wait_until(lambda: src_weak() is None and dst_weak() is None) + + +def test_memcpy_buffer_and_dst_owner_rejected(init_cuda): + """dst_owner cannot be combined with a Buffer dst operand.""" + _skip_if_no_mempool() + 
dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + + g = GraphDefinition() + with pytest.raises(ValueError, match="dst_owner cannot be used when dst is a Buffer"): + g.memcpy(buf, buf, 4, dst_owner=object()) + + +def test_memcpy_buffer_and_src_owner_rejected(init_cuda): + """src_owner cannot be combined with a Buffer src operand.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + + g = GraphDefinition() + with pytest.raises(ValueError, match="src_owner cannot be used when src is a Buffer"): + g.memcpy(buf, buf, 4, src_owner=object()) From 621ade84a176226f2400c5a4ffbfc3fba6a41f28 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 4 Jun 2026 13:33:53 -0700 Subject: [PATCH 8/8] cuda.core: retain device allocations in graph memcpy/memset slots Store DevicePtrHandle in slot table instead of Buffer wrappers so reset/close cannot release memory while a graph still references it. Add test-only weak_handle() for deterministic allocation lifetime checks and extend graph lifetime tests accordingly. 
--- cuda_core/cuda/core/_resource_handles.pyi | 3 +- cuda_core/cuda/core/_utils/_weak_handles.pyi | 55 ++++ cuda_core/cuda/core/_utils/_weak_handles.pyx | 106 ++++++++ cuda_core/cuda/core/graph/_graph_builder.pyi | 12 + .../cuda/core/graph/_graph_definition.pyi | 5 +- cuda_core/cuda/core/graph/_graph_node.pyi | 47 +++- cuda_core/cuda/core/graph/_graph_node.pyx | 108 +++++--- .../graph/{_utils.pyi => _host_callback.pyi} | 2 +- .../graph/test_graph_definition_lifetime.py | 250 ++++++++++++++---- 9 files changed, 487 insertions(+), 101 deletions(-) create mode 100644 cuda_core/cuda/core/_utils/_weak_handles.pyi create mode 100644 cuda_core/cuda/core/_utils/_weak_handles.pyx rename cuda_core/cuda/core/graph/{_utils.pyi => _host_callback.pyi} (74%) diff --git a/cuda_core/cuda/core/_resource_handles.pyi b/cuda_core/cuda/core/_resource_handles.pyi index d4511ae063..4236df5d6e 100644 --- a/cuda_core/cuda/core/_resource_handles.pyi +++ b/cuda_core/cuda/core/_resource_handles.pyi @@ -24,4 +24,5 @@ FileDescriptorHandle = shared_ptr OpaqueArrayHandle = shared_ptr MipmappedArrayHandle = shared_ptr TexObjectHandle = shared_ptr -SurfObjectHandle = shared_ptr \ No newline at end of file +SurfObjectHandle = shared_ptr +OpaqueHandle = shared_ptr \ No newline at end of file diff --git a/cuda_core/cuda/core/_utils/_weak_handles.pyi b/cuda_core/cuda/core/_utils/_weak_handles.pyi new file mode 100644 index 0000000000..3cf095d7b8 --- /dev/null +++ b/cuda_core/cuda/core/_utils/_weak_handles.pyi @@ -0,0 +1,55 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_utils/_weak_handles.pyx + +"""Test-only weak handles for resource-handle lifetime checks. + +This module is **not** part of the public ``cuda.core`` API. 
It is built into +the package (like other private ``_utils`` modules) purely so the test suite can +observe, deterministically, when the strong references that keep a CUDA resource +alive have all been released -- without relying on driver- or hardware-specific +side effects (for example, whether freed device memory happens to remain +readable). + +Every resource handle is owned by a C++ ``std::shared_ptr``. A **weak handle** +is a non-owning ``std::weak_ptr`` observer of that control block: truthy while +some strong owner remains, falsy once the last one is gone. Use :func:`weak_handle` +to obtain a weak handle from a supported front-end object. + +To support another type, add a ``cdef _weak_from_`` that reads its ``cdef`` +handle field (see ``*.pxd``), assigns to :ctype:`OpaqueHandle`, and extend the +``isinstance`` chain in :func:`weak_handle`. Types whose slots hold arbitrary +Python owners via ``make_opaque_py`` are not covered here -- use +:class:`weakref.ref` on a weak-referenceable owner object in tests instead. +""" +from __future__ import annotations + + +class WeakHandle: + """Non-owning weak handle for a resource's shared control block. + + Truthy while some strong owner of the underlying resource handle remains, + falsy once the last strong reference is released. Obtain instances via + :func:`weak_handle` rather than constructing directly. + """ + + def __bool__(self): + ... + + def expired(self): + """Return ``True`` once every strong owner of the handle is gone.""" + + def use_count(self): + """Number of strong owners currently sharing the handle.""" + +def weak_handle(obj): + """Return a :class:`WeakHandle` observing the resource behind ``obj``. + + Currently supports :class:`~cuda.core.Buffer` (device allocation handle). + See the module docstring for how to add more types. + + Raises + ------ + ValueError + If ``obj`` is a :class:`~cuda.core.Buffer` with no active allocation. + TypeError + If ``obj`` is not a supported type. 
+ """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_utils/_weak_handles.pyx b/cuda_core/cuda/core/_utils/_weak_handles.pyx new file mode 100644 index 0000000000..65737b958a --- /dev/null +++ b/cuda_core/cuda/core/_utils/_weak_handles.pyx @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Test-only weak handles for resource-handle lifetime checks. + +This module is **not** part of the public ``cuda.core`` API. It is built into +the package (like other private ``_utils`` modules) purely so the test suite can +observe, deterministically, when the strong references that keep a CUDA resource +alive have all been released -- without relying on driver- or hardware-specific +side effects (for example, whether freed device memory happens to remain +readable). + +Every resource handle is owned by a C++ ``std::shared_ptr``. A **weak handle** +is a non-owning ``std::weak_ptr`` observer of that control block: truthy while +some strong owner remains, falsy once the last one is gone. Use :func:`weak_handle` +to obtain a weak handle from a supported front-end object. + +To support another type, add a ``cdef _weak_from_`` that reads its ``cdef`` +handle field (see ``*.pxd``), assigns to :ctype:`OpaqueHandle`, and extend the +``isinstance`` chain in :func:`weak_handle`. Types whose slots hold arbitrary +Python owners via ``make_opaque_py`` are not covered here -- use +:class:`weakref.ref` on a weak-referenceable owner object in tests instead. +""" + +from cuda.core._memory._buffer cimport Buffer +from cuda.core._resource_handles cimport OpaqueHandle + + +# Cython cannot spell ``weak_ptr[const void]`` inline (the ``const void`` +# template argument fails to parse), so the weak type and its one constructor +# are provided by a small inline C++ shim local to this test-only module. This +# keeps the production resource_handles translation units untouched. 
+cdef extern from *: + """ + #include <memory> + namespace cuda_core_test { + using OpaqueWeakHandle = std::weak_ptr<const void>; + static inline OpaqueWeakHandle make_weak(const std::shared_ptr<const void>& h) { + return OpaqueWeakHandle(h); + } + } // namespace cuda_core_test + """ + cppclass OpaqueWeakHandle "cuda_core_test::OpaqueWeakHandle": + OpaqueWeakHandle() + bint expired() + long use_count() + OpaqueWeakHandle make_weak "cuda_core_test::make_weak" (const OpaqueHandle& h) + + +cdef class WeakHandle: + """Non-owning weak handle for a resource's shared control block. + + Truthy while some strong owner of the underlying resource handle remains, + falsy once the last strong reference is released. Obtain instances via + :func:`weak_handle` rather than constructing directly. + """ + + cdef OpaqueWeakHandle _w + + def __bool__(self): + return not self._w.expired() + + def expired(self): + """Return ``True`` once every strong owner of the handle is gone.""" + return self._w.expired() + + def use_count(self): + """Number of strong owners currently sharing the handle.""" + return self._w.use_count() + + +cdef WeakHandle _weak_from_opaque(OpaqueHandle h): + # Build the weak handle from a (temporary) strong handle. The strong copy + # lives only for the duration of this call, so it does not perturb the + # reference count the weak handle later reports. + cdef WeakHandle wh = WeakHandle.__new__(WeakHandle) + wh._w = make_weak(h) + return wh + + +cdef WeakHandle _weak_from_buffer(Buffer buf): + cdef OpaqueHandle h = buf._h_ptr + if not h: + raise ValueError("Buffer has no active allocation") + return _weak_from_opaque(h) + + +def weak_handle(obj): + """Return a :class:`WeakHandle` observing the resource behind ``obj``. + + Currently supports :class:`~cuda.core.Buffer` (device allocation handle). + See the module docstring for how to add more types. + + Raises + ------ + ValueError + If ``obj`` is a :class:`~cuda.core.Buffer` with no active allocation. + TypeError + If ``obj`` is not a supported type.
+ """ + if isinstance(obj, Buffer): + return _weak_from_buffer(obj) + raise TypeError( + f"weak_handle() does not support {type(obj).__name__!r}; " + "supported types: Buffer" + ) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyi b/cuda_core/cuda/core/graph/_graph_builder.pyi index 00af261423..8398102ddc 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyi +++ b/cuda_core/cuda/core/graph/_graph_builder.pyi @@ -106,6 +106,18 @@ class GraphBuilder: to ambiguity. New graph builders should instead be created through a :obj:`~_device.Device`, or a :obj:`~_stream.stream` object. + .. note:: + + Operations recorded during capture reference your memory but do not + take ownership of it. As with ordinary stream work, you must keep the + operands alive for as long as the completed graph may execute -- for + example, the :obj:`~_memory.Buffer` objects passed to :func:`~launch` + or :meth:`~_memory.Buffer.copy_to`. Host callbacks added with + :meth:`callback` are the exception: the callable (and any copied + ``user_data``) are retained for the graph's lifetime. This differs from + building a graph explicitly with :class:`~graph.GraphDefinition`, which + retains the operands it is given. + """ def __init__(self): diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyi b/cuda_core/cuda/core/graph/_graph_definition.pyi index 15f34cec9a..40cc616d35 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pyi +++ b/cuda_core/cuda/core/graph/_graph_definition.pyi @@ -6,6 +6,7 @@ from __future__ import annotations from cuda.core._device import Device from cuda.core._event import Event from cuda.core._launch_config import LaunchConfig +from cuda.core._memory._buffer import Buffer from cuda.core._module import Kernel from cuda.core._utils.cuda_utils import driver from cuda.core.graph._graph_builder import (Graph, GraphCompleteOptions, @@ -85,7 +86,7 @@ class GraphDefinition: See :meth:`GraphNode.deallocate` for full documentation. 
""" - def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0) -> MemsetNode: + def memset(self, dst: Buffer | int, value, width: int, *, height: int=1, pitch: int=0, dst_owner=None) -> MemsetNode: """Add an entry-point memset node (no dependencies). See :meth:`GraphNode.memset` for full documentation. @@ -120,7 +121,7 @@ class GraphDefinition: A new EmptyNode that depends on all input nodes. """ - def memcpy(self, dst: int, src: int, size: int) -> MemcpyNode: + def memcpy(self, dst: Buffer | int, src: Buffer | int, size: int, *, dst_owner=None, src_owner=None) -> MemcpyNode: """Add an entry-point memcpy node (no dependencies). See :meth:`GraphNode.memcpy` for full documentation. diff --git a/cuda_core/cuda/core/graph/_graph_node.pyi b/cuda_core/cuda/core/graph/_graph_node.pyi index 3e701fe389..40e6745ff6 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyi +++ b/cuda_core/cuda/core/graph/_graph_node.pyi @@ -9,6 +9,7 @@ from collections.abc import Iterable from cuda.core._device import Device from cuda.core._event import Event from cuda.core._launch_config import LaunchConfig +from cuda.core._memory._buffer import Buffer from cuda.core._module import Kernel from cuda.core._utils.cuda_utils import driver from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy @@ -178,13 +179,16 @@ class GraphNode: A new FreeNode representing the free operation. """ - def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0) -> MemsetNode: + def memset(self, dst: Buffer | int, value, width: int, *, height: int=1, pitch: int=0, dst_owner=None) -> MemsetNode: """Add a memset node depending on this node. Parameters ---------- - dst : int - Destination device pointer. + dst : Buffer or int + Destination. When ``dst`` is a :class:`Buffer`, the underlying + allocation is retained for the graph's lifetime. 
A raw pointer + (``int``) is used as-is; the caller must keep the underlying memory + alive, or supply ``dst_owner`` to have the graph retain it. value : int or buffer-protocol object Fill value. int for 1-byte fill (range [0, 256)), or buffer-protocol object of 1, 2, or 4 bytes. @@ -194,14 +198,23 @@ class GraphNode: Number of rows (default 1). pitch : int, optional Pitch of destination in bytes (default 0, unused if height is 1). + dst_owner : object, optional + Object retained for the graph's lifetime when ``dst`` is a raw + pointer. A :class:`Buffer` owner retains its underlying allocation, + not the wrapper. Must not be passed when ``dst`` is a :class:`Buffer`. Returns ------- MemsetNode A new MemsetNode representing the memset operation. + + Raises + ------ + ValueError + If ``dst_owner`` is given together with a :class:`Buffer` ``dst``. """ - def memcpy(self, dst: int, src: int, size: int) -> MemcpyNode: + def memcpy(self, dst: Buffer | int, src: Buffer | int, size: int, *, dst_owner=None, src_owner=None) -> MemcpyNode: """Add a memcpy node depending on this node. Copies ``size`` bytes from ``src`` to ``dst``. Memory types are @@ -210,17 +223,35 @@ class GraphNode: Parameters ---------- - dst : int - Destination pointer (device or pinned host). - src : int - Source pointer (device or pinned host). + dst : Buffer or int + Destination (device or pinned host). When a :class:`Buffer` is given, + the underlying allocation is retained for the graph's lifetime. A raw + pointer (``int``) is used as-is; the caller must keep the underlying + memory alive, or supply ``dst_owner`` to have the graph retain it. + src : Buffer or int + Source (device or pinned host). Same retention rules as ``dst``; + use ``src_owner`` for a raw pointer. size : int Number of bytes to copy. + dst_owner : object, optional + Object retained for the graph's lifetime when ``dst`` is a raw + pointer. A :class:`Buffer` owner retains its underlying allocation. 
+ Must not be passed when ``dst`` is a :class:`Buffer`. + src_owner : object, optional + Object retained for the graph's lifetime when ``src`` is a raw + pointer. A :class:`Buffer` owner retains its underlying allocation. + Must not be passed when ``src`` is a :class:`Buffer`. Returns ------- MemcpyNode A new MemcpyNode representing the copy operation. + + Raises + ------ + ValueError + If ``dst_owner`` or ``src_owner`` is given together with a + :class:`Buffer` ``dst`` or ``src`` respectively. """ def embed(self, child: GraphDefinition) -> ChildGraphNode: diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index 17fb5685a1..96644abe7c 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -41,10 +41,8 @@ from cuda.core.graph._subclasses cimport ( WhileNode, ) from cuda.core._resource_handles cimport ( - EventHandle, GraphHandle, GraphNodeHandle, - KernelHandle, OpaqueHandle, as_cu, as_intptr, @@ -299,10 +297,10 @@ cdef class GraphNode: Parameters ---------- dst : Buffer or int - Destination. A :class:`Buffer` is retained for the graph's lifetime. - A raw pointer (``int``) is used as-is; the caller must keep the - underlying memory alive, or supply ``dst_owner`` to have the graph - retain it. + Destination. When ``dst`` is a :class:`Buffer`, the underlying + allocation is retained for the graph's lifetime. A raw pointer + (``int``) is used as-is; the caller must keep the underlying memory + alive, or supply ``dst_owner`` to have the graph retain it. value : int or buffer-protocol object Fill value. int for 1-byte fill (range [0, 256)), or buffer-protocol object of 1, 2, or 4 bytes. @@ -314,8 +312,8 @@ cdef class GraphNode: Pitch of destination in bytes (default 0, unused if height is 1). dst_owner : object, optional Object retained for the graph's lifetime when ``dst`` is a raw - pointer. 
Must not be passed when ``dst`` is a :class:`Buffer`, which - is retained automatically. + pointer. A :class:`Buffer` owner retains its underlying allocation, + not the wrapper. Must not be passed when ``dst`` is a :class:`Buffer`. Returns ------- @@ -330,9 +328,10 @@ cdef class GraphNode: cdef cydriver.CUdeviceptr c_dst cdef unsigned int val cdef unsigned int elem_size - dst_keepalive = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst) + cdef OpaqueHandle dst_slot_owner + dst_slot_owner = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst) val, elem_size = _parse_fill_value(value) - return GN_memset(self, c_dst, dst_keepalive, val, elem_size, width, height, pitch) + return GN_memset(self, c_dst, dst_slot_owner, val, elem_size, width, height, pitch) def memcpy( self, @@ -352,10 +351,10 @@ cdef class GraphNode: Parameters ---------- dst : Buffer or int - Destination (device or pinned host). A :class:`Buffer` is retained - for the graph's lifetime. A raw pointer (``int``) is used as-is; the - caller must keep the underlying memory alive, or supply ``dst_owner`` - to have the graph retain it. + Destination (device or pinned host). When a :class:`Buffer` is given, + the underlying allocation is retained for the graph's lifetime. A raw + pointer (``int``) is used as-is; the caller must keep the underlying + memory alive, or supply ``dst_owner`` to have the graph retain it. src : Buffer or int Source (device or pinned host). Same retention rules as ``dst``; use ``src_owner`` for a raw pointer. @@ -363,12 +362,12 @@ cdef class GraphNode: Number of bytes to copy. dst_owner : object, optional Object retained for the graph's lifetime when ``dst`` is a raw - pointer. Must not be passed when ``dst`` is a :class:`Buffer`, which - is retained automatically. + pointer. A :class:`Buffer` owner retains its underlying allocation. + Must not be passed when ``dst`` is a :class:`Buffer`. 
src_owner : object, optional Object retained for the graph's lifetime when ``src`` is a raw - pointer. Must not be passed when ``src`` is a :class:`Buffer`, which - is retained automatically. + pointer. A :class:`Buffer` owner retains its underlying allocation. + Must not be passed when ``src`` is a :class:`Buffer`. Returns ------- @@ -383,9 +382,10 @@ cdef class GraphNode: """ cdef cydriver.CUdeviceptr c_dst cdef cydriver.CUdeviceptr c_src - dst_keepalive = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst) - src_keepalive = _resolve_memcpy_operand(src, src_owner, "src", &c_src) - return GN_memcpy(self, c_dst, dst_keepalive, c_src, src_keepalive, size) + cdef OpaqueHandle dst_slot_owner, src_slot_owner + dst_slot_owner = _resolve_memcpy_operand(dst, dst_owner, "dst", &c_dst) + src_slot_owner = _resolve_memcpy_operand(src, src_owner, "src", &c_src) + return GN_memcpy(self, c_dst, dst_slot_owner, c_src, src_slot_owner, size) def embed(self, child: GraphDefinition) -> ChildGraphNode: """Add a child graph node depending on this node. @@ -832,32 +832,52 @@ cdef inline FreeNode GN_free(GraphNode self, cydriver.CUdeviceptr c_dptr): return _registered(FreeNode._create_with_params(create_graph_node_handle(new_node, h_graph), c_dptr)) -cdef inline object _resolve_memcpy_operand( +cdef inline OpaqueHandle _buffer_slot_owner(Buffer buf, str label): + """Copy a Buffer's device-pointer handle into a graph slot owner.""" + cdef OpaqueHandle slot_owner + if not buf._h_ptr: + raise ValueError(f"{label} Buffer has no active allocation") + slot_owner = buf._h_ptr + return slot_owner + + +cdef inline OpaqueHandle _resolve_memcpy_operand( object operand, object owner, str side, cydriver.CUdeviceptr* out_ptr): - """Resolve a memcpy/memset operand to a device pointer and an owner. + """Resolve a memcpy/memset operand to a pointer and optional slot owner. ``operand`` is a :class:`Buffer` or a raw integer address; its device - pointer is written to ``out_ptr``. 
Returns the object to retain on the - graph's slot table for the graph's lifetime: the Buffer itself, or - ``owner`` (possibly ``None``) for a raw pointer. ``side`` is ``"dst"`` or - ``"src"`` and is used only to compose the error message. - - Raises ValueError if ``operand`` is a Buffer and ``owner`` is not None, - since a Buffer is already retained automatically. + pointer is written to ``out_ptr``. For a :class:`Buffer` operand, returns an + owner that retains the underlying allocation (not the wrapper). For a raw + pointer, returns an owner built from ``owner`` (or an empty handle when + ``owner`` is ``None``). + + Raises + ------ + ValueError + If ``operand`` is a :class:`Buffer` and ``owner`` is not ``None``. + If a :class:`Buffer` operand or ``*_owner`` has no active allocation. """ + cdef Buffer buf + if isinstance(operand, Buffer): if owner is not None: raise ValueError( f"{side}_owner cannot be used when {side} is a Buffer" ) - out_ptr[0] = as_cu((operand)._h_ptr) - return operand + buf = operand + slot_owner = _buffer_slot_owner(buf, side) + out_ptr[0] = as_cu(buf._h_ptr) + return slot_owner out_ptr[0] = operand - return owner + if owner is None: + return OpaqueHandle() + if isinstance(owner, Buffer): + return _buffer_slot_owner(owner, f"{side}_owner") + return make_opaque_py(owner) cdef inline MemsetNode GN_memset( - GraphNode self, cydriver.CUdeviceptr c_dst, object dst_owner, + GraphNode self, cydriver.CUdeviceptr c_dst, OpaqueHandle dst_owner, unsigned int val, unsigned int elem_size, size_t width, size_t height, size_t pitch): cdef cydriver.CUDA_MEMSET_NODE_PARAMS memset_params @@ -888,9 +908,9 @@ cdef inline MemsetNode GN_memset( &new_node, as_cu(h_graph), deps, num_deps, &memset_params, ctx)) - # Retain the destination owner for the graph's lifetime (slot 0). - if dst_owner is not None: - HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, make_opaque_py(dst_owner))) + # Retain the destination allocation for the graph's lifetime (slot 0). 
+ if dst_owner: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, dst_owner)) return _registered(MemsetNode._create_with_params( create_graph_node_handle(new_node, h_graph), c_dst, @@ -898,8 +918,8 @@ cdef inline MemsetNode GN_memset( cdef inline MemcpyNode GN_memcpy( - GraphNode self, cydriver.CUdeviceptr c_dst, object dst_owner, - cydriver.CUdeviceptr c_src, object src_owner, size_t size): + GraphNode self, cydriver.CUdeviceptr c_dst, OpaqueHandle dst_owner, + cydriver.CUdeviceptr c_src, OpaqueHandle src_owner, size_t size): cdef unsigned int dst_mem_type = cydriver.CU_MEMORYTYPE_DEVICE cdef unsigned int src_mem_type = cydriver.CU_MEMORYTYPE_DEVICE cdef cydriver.CUresult ret @@ -953,11 +973,11 @@ cdef inline MemcpyNode GN_memcpy( HANDLE_RETURN(cydriver.cuGraphAddMemcpyNode( &new_node, as_cu(h_graph), deps, num_deps, &params, ctx)) - # Retain operand owners for the graph's lifetime (dst -> slot 0, src -> slot 1). - if dst_owner is not None: - HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, make_opaque_py(dst_owner))) - if src_owner is not None: - HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, make_opaque_py(src_owner))) + # Retain operand allocations for the graph's lifetime (dst -> slot 0, src -> slot 1). 
+ if dst_owner: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 0, dst_owner)) + if src_owner: + HANDLE_RETURN(graph_set_slot(h_graph, new_node, 1, src_owner)) return _registered(MemcpyNode._create_with_params( create_graph_node_handle(new_node, h_graph), c_dst, c_src, size, diff --git a/cuda_core/cuda/core/graph/_utils.pyi b/cuda_core/cuda/core/graph/_host_callback.pyi similarity index 74% rename from cuda_core/cuda/core/graph/_utils.pyi rename to cuda_core/cuda/core/graph/_host_callback.pyi index 79072e66eb..6c9d0ead31 100644 --- a/cuda_core/cuda/core/graph/_utils.pyi +++ b/cuda_core/cuda/core/graph/_host_callback.pyi @@ -1,3 +1,3 @@ -# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_utils.pyx +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_host_callback.pyx from __future__ import annotations \ No newline at end of file diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py index 8fd7d1ca89..d196e35f47 100644 --- a/cuda_core/tests/graph/test_graph_definition_lifetime.py +++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py @@ -13,22 +13,52 @@ from helpers.misc import try_create_condition from conftest import xfail_on_graph_mempool_oom +from cuda_python_test_helpers import under_compute_sanitizer +# Resource finalization triggered by graph destruction is not strictly +# synchronous: the graph's slot table is freed through a CUDA user-object +# destructor that the driver may run on its own thread, after which each owner +# is released (a shared_ptr decrement, or Py_DECREF under the GIL). Release is +# deterministic at the reference-count level, so the predicate normally flips +# within milliseconds; this budget only bounds a slow/loaded runner. It stays a +# hard failure rather than a warning so a real leak still fails the suite. +# Compute-sanitizer slows everything down, hence the larger ceiling there. 
+_FINALIZE_TIMEOUT = 30.0 if under_compute_sanitizer() else 5.0 -def _wait_until(predicate, timeout=2.0, interval=0.01): - """Poll predicate() until True or timeout, driving gc each iteration. - Used for assertions about resource cleanup that may be delayed by CUDA's - asynchronous user-object destructor pump (DPC) or, on free-threaded - Python, by deferred reference-count processing. A bounded poll keeps the - test correct without depending on undocumented driver timing guarantees. +class _Sentinel: + """Weak-referenceable stand-in for an owner attached to a graph slot. + + Bare ``object()`` instances do not support weak references, so tests that + observe owner release through a :class:`weakref.ref` use this trivial + subclass instead. + """ + + +def _wait_until(predicate, timeout=None, interval=0.02): + """Poll ``predicate()`` until true, or raise AssertionError on timeout. + + Each iteration drives ``gc.collect()`` and yields the main thread (which + releases the GIL) so the driver's asynchronous user-object destructor -- + and the ``Py_DECREF`` it triggers -- can make progress. Used for resource + cleanup that lags graph destruction; see ``_FINALIZE_TIMEOUT``. """ + if timeout is None: + timeout = _FINALIZE_TIMEOUT deadline = time.monotonic() + timeout - while time.monotonic() < deadline: + while True: gc.collect() if predicate(): return + if time.monotonic() >= deadline: + break + time.sleep(0) # yield the GIL to the driver's finalizer thread time.sleep(interval) + # Final attempt after one more yield and collection. 
+ time.sleep(0) + gc.collect() + if predicate(): + return raise AssertionError(f"condition not satisfied within {timeout}s") @@ -602,7 +632,7 @@ def test_kernel_args_survive_graph_clone(init_cuda): def test_memset_buffer_lifetime(init_cuda): - """Buffer passed as memset destination is kept alive after the Python ref drops.""" + """Memset retains the Buffer allocation after the wrapper is collected.""" from cuda.core._utils.cuda_utils import driver, handle_return _skip_if_no_mempool() @@ -610,7 +640,6 @@ def test_memset_buffer_lifetime(init_cuda): mr = DeviceMemoryResource(dev) buf = mr.allocate(4, stream=dev.default_stream) dev.default_stream.sync() - buf_weak = weakref.ref(buf) dptr = int(buf.handle) g = GraphDefinition() @@ -618,7 +647,6 @@ def test_memset_buffer_lifetime(init_cuda): del buf gc.collect() - assert buf_weak() is not None stream = dev.create_stream() g.instantiate().launch(stream) @@ -628,12 +656,33 @@ def test_memset_buffer_lifetime(init_cuda): handle_return(driver.cuMemcpyDtoH(out, dptr, 4)) assert list(out) == [0xAB] * 4 - del g - _wait_until(lambda: buf_weak() is None) + +def test_memset_buffer_survives_close(init_cuda): + """Memset retains the allocation when the Buffer wrapper is closed.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + + g = GraphDefinition() + g.memset(buf, 0xAB, 4) + buf.close() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dptr, 4)) + assert list(out) == [0xAB] * 4 def test_memcpy_buffer_lifetime(init_cuda): - """Source and destination Buffers are kept alive for a memcpy node.""" + """Memcpy retains operand allocations after the Buffer wrappers are collected.""" from cuda.core._utils.cuda_utils import driver, 
handle_return _skip_if_no_mempool() @@ -643,8 +692,6 @@ def test_memcpy_buffer_lifetime(init_cuda): dst = mr.allocate(4, stream=dev.default_stream) src.fill(0xCD, stream=dev.default_stream) dev.default_stream.sync() - src_weak = weakref.ref(src) - dst_weak = weakref.ref(dst) dst_dptr = int(dst.handle) g = GraphDefinition() @@ -652,8 +699,6 @@ def test_memcpy_buffer_lifetime(init_cuda): del src, dst gc.collect() - assert src_weak() is not None - assert dst_weak() is not None stream = dev.create_stream() g.instantiate().launch(stream) @@ -663,12 +708,69 @@ def test_memcpy_buffer_lifetime(init_cuda): handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) assert list(out) == [0xCD] * 4 + +def test_memcpy_buffer_survives_close(init_cuda): + """Memcpy retains allocations when Buffer wrappers are closed.""" + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + src.fill(0xCD, stream=dev.default_stream) + dev.default_stream.sync() + dst_dptr = int(dst.handle) + + g = GraphDefinition() + g.memcpy(dst, src, 4) + src.close() + dst.close() + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_uint8 * 4)(0) + handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) + assert list(out) == [0xCD] * 4 + + +def test_memcpy_buffer_allocations_released_after_graph_destroyed(init_cuda): + """Destroying the graph frees both memcpy operand allocations. + + Each operand's device-pointer handle is observed via a weak handle + (see ``cuda.core._utils._weak_handles``), so release is checked at the + reference-count level rather than through a driver side effect. With both + Buffer wrappers closed, the graph's slots are the only remaining owners; + destroying the graph releases them and the weak handles expire. 
+ """ + from cuda.core._utils._weak_handles import weak_handle + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + src = mr.allocate(4, stream=dev.default_stream) + dst = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + + g = GraphDefinition() + g.memcpy(dst, src, 4) + + # Observe the allocations, then drop the wrappers' strong references; the + # graph slots remain the sole owners. + src_weak = weak_handle(src) + dst_weak = weak_handle(dst) + src.close() + dst.close() + assert src_weak and dst_weak # graph slots still retain both allocations + del g - _wait_until(lambda: src_weak() is None and dst_weak() is None) + _wait_until(lambda: not src_weak and not dst_weak) def test_memcpy_buffers_survive_graph_clone(init_cuda): - """Cloned graph keeps memcpy operand Buffers alive via CUDA user objects.""" + """Cloned graph keeps memcpy operand allocations alive via CUDA user objects.""" from cuda.core._utils.cuda_utils import driver, handle_return _skip_if_no_mempool() @@ -703,7 +805,7 @@ def test_memcpy_buffers_survive_graph_clone(init_cuda): def test_memset_raw_ptr_with_dst_owner(init_cuda): - """Raw dst address plus dst_owner: the graph retains the owner until destroyed.""" + """Raw dst plus Buffer dst_owner retains the allocation after close.""" from cuda.core._utils.cuda_utils import driver, handle_return _skip_if_no_mempool() @@ -711,15 +813,11 @@ def test_memset_raw_ptr_with_dst_owner(init_cuda): mr = DeviceMemoryResource(dev) buf = mr.allocate(4, stream=dev.default_stream) dev.default_stream.sync() - buf_weak = weakref.ref(buf) dptr = int(buf.handle) g = GraphDefinition() g.memset(dptr, 0xAB, 4, dst_owner=buf) - - del buf - gc.collect() - assert buf_weak() is not None # graph retains the explicit owner + buf.close() stream = dev.create_stream() g.instantiate().launch(stream) @@ -729,12 +827,44 @@ def test_memset_raw_ptr_with_dst_owner(init_cuda): handle_return(driver.cuMemcpyDtoH(out, dptr, 4)) assert list(out) == 
[0xAB] * 4 + +def test_slot_owners_released_after_graph_destroyed(init_cuda): + """Destroying the graph releases every owner held in its slot table. + + Raw-pointer operands with explicit sentinel owners make release observable + in pure Python: the slot table holds a strong Python reference to each owner + (via ``make_opaque_py``), and graph destruction frees the table -- dropping + those references. This exercises the same teardown that releases a Buffer + operand's device-pointer handle (slot 0 for ``dst``, slot 1 for ``src``). + """ + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(8, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + + dst_owner = _Sentinel() + src_owner = _Sentinel() + dst_weak = weakref.ref(dst_owner) + src_weak = weakref.ref(src_owner) + + g = GraphDefinition() + # Non-overlapping 4-byte copy within an 8-byte allocation. + g.memcpy(dptr, dptr + 4, 4, dst_owner=dst_owner, src_owner=src_owner) + + del dst_owner, src_owner + gc.collect() + assert dst_weak() is not None and src_weak() is not None # graph retains owners + del g - _wait_until(lambda: buf_weak() is None) + _wait_until(lambda: dst_weak() is None and src_weak() is None) + + buf.close() def test_memcpy_raw_ptrs_with_owners(init_cuda): - """Raw src/dst addresses: the graph retains both owners until destroyed.""" + """Raw src/dst plus Buffer owners retain allocations after close.""" from cuda.core._utils.cuda_utils import driver, handle_return _skip_if_no_mempool() @@ -744,17 +874,13 @@ def test_memcpy_raw_ptrs_with_owners(init_cuda): dst = mr.allocate(4, stream=dev.default_stream) src.fill(0xCD, stream=dev.default_stream) dev.default_stream.sync() - src_weak = weakref.ref(src) - dst_weak = weakref.ref(dst) src_dptr = int(src.handle) dst_dptr = int(dst.handle) g = GraphDefinition() g.memcpy(dst_dptr, src_dptr, 4, dst_owner=dst, src_owner=src) - - del src, dst - gc.collect() - assert src_weak() is not None and 
dst_weak() is not None # both owners retained + src.close() + dst.close() stream = dev.create_stream() g.instantiate().launch(stream) @@ -764,12 +890,9 @@ def test_memcpy_raw_ptrs_with_owners(init_cuda): handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) assert list(out) == [0xCD] * 4 - del g - _wait_until(lambda: src_weak() is None and dst_weak() is None) - def test_memcpy_mixed_buffer_and_raw_owner(init_cuda): - """Buffer dst is auto-retained; raw src uses src_owner. Both survive until destroyed.""" + """Buffer dst and raw src with src_owner retain allocations after close.""" from cuda.core._utils.cuda_utils import driver, handle_return _skip_if_no_mempool() @@ -779,17 +902,13 @@ def test_memcpy_mixed_buffer_and_raw_owner(init_cuda): dst = mr.allocate(4, stream=dev.default_stream) src.fill(0xCD, stream=dev.default_stream) dev.default_stream.sync() - src_weak = weakref.ref(src) - dst_weak = weakref.ref(dst) src_dptr = int(src.handle) dst_dptr = int(dst.handle) g = GraphDefinition() g.memcpy(dst, src_dptr, 4, src_owner=src) - - del src, dst - gc.collect() - assert src_weak() is not None and dst_weak() is not None # explicit + auto owner + src.close() + dst.close() stream = dev.create_stream() g.instantiate().launch(stream) @@ -799,8 +918,49 @@ def test_memcpy_mixed_buffer_and_raw_owner(init_cuda): handle_return(driver.cuMemcpyDtoH(out, dst_dptr, 4)) assert list(out) == [0xCD] * 4 - del g - _wait_until(lambda: src_weak() is None and dst_weak() is None) + +def test_memset_closed_buffer_rejected(init_cuda): + """Memset rejects a Buffer with no active allocation.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + buf.close() + + g = GraphDefinition() + with pytest.raises(ValueError, match="dst Buffer has no active allocation"): + g.memset(buf, 0xAB, 4) + + +def test_memset_closed_buffer_dst_owner_rejected(init_cuda): + """Memset rejects a closed Buffer passed as 
dst_owner.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + buf.close() + + g = GraphDefinition() + with pytest.raises(ValueError, match="dst_owner Buffer has no active allocation"): + g.memset(dptr, 0xAB, 4, dst_owner=buf) + + +def test_memcpy_closed_buffer_src_owner_rejected(init_cuda): + """Memcpy rejects a closed Buffer passed as src_owner.""" + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + buf = mr.allocate(4, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + buf.close() + + g = GraphDefinition() + with pytest.raises(ValueError, match="src_owner Buffer has no active allocation"): + g.memcpy(dptr, dptr, 4, src_owner=buf) def test_memcpy_buffer_and_dst_owner_rejected(init_cuda):