From a41bf7c4c8e69959714c2a7c743e2a02972a8d76 Mon Sep 17 00:00:00 2001
From: Michael Droettboom
Date: Tue, 23 Jun 2026 09:35:44 -0400
Subject: [PATCH] Use shared_ptr for event sets

---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 77 +++++++++++++++++++
 cuda_core/cuda/core/_cpp/resource_handles.hpp | 62 ++++++++++++++++
 cuda_core/cuda/core/_resource_handles.pxd     | 17 +++++
 cuda_core/cuda/core/_resource_handles.pyi     |  2 +
 cuda_core/cuda/core/_resource_handles.pyx     |  8 +++
 cuda_core/cuda/core/system/_device.pyi        |  6 +-
 cuda_core/cuda/core/system/_device.pyx        | 25 +++++++
 cuda_core/cuda/core/system/_event.pxi         | 21 +++---
 cuda_core/cuda/core/system/_system_events.pyi |  6 +-
 cuda_core/cuda/core/system/_system_events.pyx | 44 +++++++++---
 10 files changed, 241 insertions(+), 27 deletions(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 6e82d734e35..cf2cfb2c0cb 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -1337,6 +1337,83 @@ FileDescriptorHandle create_fd_handle_ref(int fd) {
 #endif
 }
 
+// ============================================================================
+// NVML event set function pointers and registration
+// ============================================================================
+
+NvmlEventSetFreeFn p_nvmlEventSetFree = nullptr;
+NvmlSysEventSetFreeFn p_nvmlSysEventSetFree = nullptr;
+
+void register_nvml_event_set_fn_pointers(intptr_t event_set_free_fn,
+                                         intptr_t sys_event_set_free_fn) noexcept {
+    // A zero argument means "not available": keep whatever was registered
+    // before, so a later call that failed its lookup cannot clear a valid
+    // pointer that handle deleters still rely on.
+    if (event_set_free_fn) {
+        p_nvmlEventSetFree = reinterpret_cast<NvmlEventSetFreeFn>(event_set_free_fn);
+    }
+    if (sys_event_set_free_fn) {
+        p_nvmlSysEventSetFree = reinterpret_cast<NvmlSysEventSetFreeFn>(sys_event_set_free_fn);
+    }
+}
+
+// ============================================================================
+// NVML Event Set Handles (device-scope)
+// ============================================================================
+
+namespace {
+struct NvmlEventSetBox {
+    NvmlEventSetValue resource;
+};
+}  // namespace
+
+NvmlEventSetHandle create_nvml_event_set_handle(intptr_t handle) {
+    if (!p_nvmlEventSetFree) {
+        return NvmlEventSetHandle{};
+    }
+    auto box = std::shared_ptr<NvmlEventSetBox>(
+        new NvmlEventSetBox{{handle}},
+        [](NvmlEventSetBox* b) {
+            if (p_nvmlEventSetFree && b->resource.raw) {
+                p_nvmlEventSetFree(reinterpret_cast<void*>(b->resource.raw));
+            }
+            delete b;
+        }
+    );
+    return NvmlEventSetHandle(box, &box->resource);
+}
+
+// ============================================================================
+// NVML System Event Set Handles (system-scope)
+// ============================================================================
+
+namespace {
+struct NvmlSysEventSetBox {
+    NvmlSysEventSetValue resource;
+};
+}  // namespace
+
+NvmlSysEventSetHandle create_nvml_sys_event_set_handle(intptr_t handle) {
+    if (!p_nvmlSysEventSetFree) {
+        return NvmlSysEventSetHandle{};
+    }
+    auto box = std::shared_ptr<NvmlSysEventSetBox>(
+        new NvmlSysEventSetBox{{handle}},
+        [](NvmlSysEventSetBox* b) {
+            if (p_nvmlSysEventSetFree && b->resource.raw) {
+                // Matches NVML_STRUCT_VERSION(SystemEventSetFreeRequest, 1):
+                // version = sizeof(struct) | (1 << 24). Both our struct and the
+                // NVML header struct have the same layout ({unsigned int, void*}).
+                NvmlSysEventSetFreeRequest req;
+                req.set = reinterpret_cast<void*>(b->resource.raw);
+                req.version = static_cast<unsigned int>(sizeof(NvmlSysEventSetFreeRequest) | (1u << 24u));
+                p_nvmlSysEventSetFree(&req);
+            }
+            delete b;
+        }
+    );
+    return NvmlSysEventSetHandle(box, &box->resource);
+}
+
 // ============================================================================
 // SM resource split wrapper
 // ============================================================================
diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp
index 32a88f0b3cd..4ee8a08741e 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp
@@ -36,6 +36,13 @@ struct TaggedHandle {
 using NvvmProgramValue = TaggedHandle;
 using NvJitLinkValue = TaggedHandle;
 
+// NVML event set types — forward-declared as void* to avoid nvml.h dependency.
+// nvmlEventSet_t = nvmlEventSet_st* (device-scope event set)
+// nvmlSystemEventSet_t = nvmlSystemEventSet_st* (system-scope event set)
+// TaggedHandle distinguishes the two intptr_t-based handle types for overloading.
+using NvmlEventSetValue = TaggedHandle;
+using NvmlSysEventSetValue = TaggedHandle;
+
 // ============================================================================
 // Thread-local error handling
 // ============================================================================
@@ -152,6 +159,35 @@ extern NvvmDestroyProgramFn p_nvvmDestroyProgram;
 using NvJitLinkDestroyFn = int (*)(nvJitLink_t*);
 extern NvJitLinkDestroyFn p_nvJitLinkDestroy;
 
+// ============================================================================
+// NVML event set function pointers
+//
+// Populated by register_nvml_event_set_fn_pointers(), called from the system
+// event / device modules once the NVML bindings have loaded the library.
+// Both may be null until registration; deleters are no-ops when null.
+// ============================================================================
+
+// nvmlReturn_t nvmlEventSetFree(nvmlEventSet_t set)
+// nvmlEventSet_t is nvmlEventSet_st* (opaque pointer stored as intptr_t here)
+using NvmlEventSetFreeFn = unsigned int (*)(void*);
+extern NvmlEventSetFreeFn p_nvmlEventSetFree;
+
+// Minimal layout-compatible counterpart to nvmlSystemEventSetFreeRequest_v1_t.
+// Both fields match the NVML header: {unsigned int version; void* set;}.
+struct NvmlSysEventSetFreeRequest {
+    unsigned int version;
+    void* set;  // nvmlSystemEventSet_t
+};
+
+// nvmlReturn_t nvmlSystemEventSetFree(nvmlSystemEventSetFreeRequest_t*)
+using NvmlSysEventSetFreeFn = unsigned int (*)(NvmlSysEventSetFreeRequest*);
+extern NvmlSysEventSetFreeFn p_nvmlSysEventSetFree;
+
+// Register both NVML event-set free function pointers.
+// Safe to call repeatedly; re-registering the same pointers changes nothing.
+void register_nvml_event_set_fn_pointers(intptr_t event_set_free_fn,
+                                         intptr_t sys_event_set_free_fn) noexcept;
+
 // ============================================================================
 // Handle type aliases - expose only the raw CUDA resource
 // ============================================================================
@@ -171,6 +207,24 @@ using NvvmProgramHandle = std::shared_ptr<const NvvmProgramValue>;
 using NvJitLinkHandle = std::shared_ptr<const NvJitLinkValue>;
 using CuLinkHandle = std::shared_ptr<const CUlinkState>;
 using FileDescriptorHandle = std::shared_ptr<const int>;
+using NvmlEventSetHandle = std::shared_ptr<const NvmlEventSetValue>;
+using NvmlSysEventSetHandle = std::shared_ptr<const NvmlSysEventSetValue>;
+
+// ============================================================================
+// NVML event set handle functions
+// ============================================================================
+
+// Create an owning device-scope NVML event set handle.
+// handle is the intptr_t value returned by nvml.event_set_create().
+// When the last reference is released, nvmlEventSetFree is called.
+// Returns an empty handle, without taking ownership, if p_nvmlEventSetFree is null.
+NvmlEventSetHandle create_nvml_event_set_handle(intptr_t handle);
+
+// Create an owning system-scope NVML event set handle.
+// handle is the intptr_t value returned by nvml.system_event_set_create().
+// When the last reference is released, nvmlSystemEventSetFree is called via struct.
+// Returns an empty handle, without taking ownership, if p_nvmlSysEventSetFree is null.
+NvmlSysEventSetHandle create_nvml_sys_event_set_handle(intptr_t handle);
 
 
 // ============================================================================
@@ -661,6 +715,14 @@ inline std::intptr_t as_intptr(const FileDescriptorHandle& h) noexcept {
     return h ? static_cast<std::intptr_t>(*h) : -1;
 }
 
+inline std::intptr_t as_intptr(const NvmlEventSetHandle& h) noexcept {
+    return h ? h->raw : 0;
+}
+
+inline std::intptr_t as_intptr(const NvmlSysEventSetHandle& h) noexcept {
+    return h ? h->raw : 0;
+}
+
 // as_py() - convert handle to Python wrapper object (returns new reference)
 #if PY_VERSION_HEX < 0x030D0000
 extern "C" int _Py_IsFinalizing(void);
diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd
index 0ed3d6e5942..f86d04e2cef 100644
--- a/cuda_core/cuda/core/_resource_handles.pxd
+++ b/cuda_core/cuda/core/_resource_handles.pxd
@@ -41,6 +41,15 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     ctypedef shared_ptr[const NvvmProgramValue] NvvmProgramHandle
     ctypedef shared_ptr[const NvJitLinkValue] NvJitLinkHandle
 
+    # NvmlEventSetValue and NvmlSysEventSetValue are TaggedHandle
+ cppclass NvmlEventSetValue "cuda_core::NvmlEventSetValue": + pass + cppclass NvmlSysEventSetValue "cuda_core::NvmlSysEventSetValue": + pass + ctypedef shared_ptr[const NvmlEventSetValue] NvmlEventSetHandle + ctypedef shared_ptr[const NvmlSysEventSetValue] NvmlSysEventSetHandle + ctypedef shared_ptr[const cydriver.CUlinkState] CuLinkHandle ctypedef shared_ptr[const int] FileDescriptorHandle @@ -78,6 +87,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": intptr_t as_intptr(NvJitLinkHandle h) noexcept nogil intptr_t as_intptr(CuLinkHandle h) noexcept nogil intptr_t as_intptr(FileDescriptorHandle h) noexcept nogil + intptr_t as_intptr(NvmlEventSetHandle h) noexcept nogil + intptr_t as_intptr(NvmlSysEventSetHandle h) noexcept nogil # as_py() - convert handle to Python wrapper object (inline C++; requires GIL) object as_py(ContextHandle h) @@ -224,6 +235,12 @@ cdef CuLinkHandle create_culink_handle_ref(cydriver.CUlinkState state) except+ n cdef FileDescriptorHandle create_fd_handle(int fd) except+ nogil cdef FileDescriptorHandle create_fd_handle_ref(int fd) except+ nogil +# NVML event set handles +cdef void register_nvml_event_set_fn_pointers( + intptr_t event_set_free_fn, intptr_t sys_event_set_free_fn) noexcept +cdef NvmlEventSetHandle create_nvml_event_set_handle(intptr_t handle) noexcept nogil +cdef NvmlSysEventSetHandle create_nvml_sys_event_set_handle(intptr_t handle) noexcept nogil + # SM resource split (13.1+ — calls through function pointer, safe on older bindings) # groupParams is void* here to avoid referencing CU_DEV_SM_RESOURCE_GROUP_PARAMS # (which doesn't exist in cuda-bindings 13.0 .pxd). The C++ side casts it. 
diff --git a/cuda_core/cuda/core/_resource_handles.pyi b/cuda_core/cuda/core/_resource_handles.pyi
index 490073c9fd1..5766bd9bd81 100644
--- a/cuda_core/cuda/core/_resource_handles.pyi
+++ b/cuda_core/cuda/core/_resource_handles.pyi
@@ -18,5 +18,7 @@ GraphicsResourceHandle = shared_ptr
 NvrtcProgramHandle = shared_ptr
 NvvmProgramHandle = shared_ptr
 NvJitLinkHandle = shared_ptr
+NvmlEventSetHandle = shared_ptr
+NvmlSysEventSetHandle = shared_ptr
 CuLinkHandle = shared_ptr
 FileDescriptorHandle = shared_ptr
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx
index c87956f0c68..736ae53f97f 100644
--- a/cuda_core/cuda/core/_resource_handles.pyx
+++ b/cuda_core/cuda/core/_resource_handles.pyx
@@ -193,6 +193,14 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     FileDescriptorHandle create_fd_handle_ref "cuda_core::create_fd_handle_ref" (
         int fd) except+ nogil
 
+    # NVML event set handles
+    void register_nvml_event_set_fn_pointers "cuda_core::register_nvml_event_set_fn_pointers" (
+        intptr_t event_set_free_fn, intptr_t sys_event_set_free_fn) noexcept
+    NvmlEventSetHandle create_nvml_event_set_handle "cuda_core::create_nvml_event_set_handle" (
+        intptr_t handle) noexcept nogil
+    NvmlSysEventSetHandle create_nvml_sys_event_set_handle "cuda_core::create_nvml_sys_event_set_handle" (
+        intptr_t handle) noexcept nogil
+
     # SM resource split (13.1+ wrapper — avoids direct cydriver cimport)
     # groupParams is void* to avoid referencing CU_DEV_SM_RESOURCE_GROUP_PARAMS
     # (which doesn't exist in cuda-bindings 13.0 .pxd). The C++ side casts it.
diff --git a/cuda_core/cuda/core/system/_device.pyi b/cuda_core/cuda/core/system/_device.pyi
index 2d35c7f63bc..09b0e636b2d 100644
--- a/cuda_core/cuda/core/system/_device.pyi
+++ b/cuda_core/cuda/core/system/_device.pyi
@@ -269,10 +269,10 @@ class DeviceEvents:
     Represents a set of events that can be waited on for a specific device.
     """
 
-    def __init__(self, device_handle: int, events: EventType | str | list[EventType | str]):
-        ...
+    def close(self):
+        """Destroy the device event set, releasing its NVML resources."""
 
-    def __dealloc__(self) -> None:
+    def __init__(self, device_handle: int, events: EventType | str | list[EventType | str]):
         ...
 
     def wait(self, timeout_ms: int=0) -> EventData:
diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx
index f0126b78a5b..f0006240391 100644
--- a/cuda_core/cuda/core/system/_device.pyx
+++ b/cuda_core/cuda/core/system/_device.pyx
@@ -11,6 +11,13 @@ import warnings
 
 from cuda.bindings import nvml
 
+from cuda.core._resource_handles cimport (
+    NvmlEventSetHandle,
+    as_intptr,
+    create_nvml_event_set_handle,
+    register_nvml_event_set_fn_pointers,
+)
+
 from ._nvml_context cimport initialize
 from cuda.core.system.typing import (
     AddressingMode,
@@ -53,6 +60,24 @@ cdef int _pstate_to_enum(int pstate):
     return int(pstate) + int(nvml.Pstates.PSTATE_0)
 
 
+cdef void _register_nvml_fn_pointers() noexcept:
+    # Register NVML event-set free function pointers so that NvmlEventSetHandle
+    # and NvmlSysEventSetHandle deleters can call them without GIL.
+    # Function pointers come from the NVML internal bindings (loaded via dlsym
+    # at their module import time) and are safe to read immediately.
+    try:
+        from cuda.bindings._internal import nvml as _nvml_internal
+    except ImportError:
+        return
+    fn_ptrs = _nvml_internal._inspect_function_pointers()
+    cdef intptr_t p_event_set_free = fn_ptrs.get("__nvmlEventSetFree", 0)
+    cdef intptr_t p_sys_event_set_free = fn_ptrs.get("__nvmlSystemEventSetFree", 0)
+    register_nvml_event_set_fn_pointers(p_event_set_free, p_sys_event_set_free)
+
+
+_register_nvml_fn_pointers()
+
+
 include "_clock.pxi"
 include "_cooler.pxi"
 include "_device_attributes.pxi"
diff --git a/cuda_core/cuda/core/system/_event.pxi b/cuda_core/cuda/core/system/_event.pxi
index f81e5934aa7..f5f9d3b34e9 100644
--- a/cuda_core/cuda/core/system/_event.pxi
+++ b/cuda_core/cuda/core/system/_event.pxi
@@ -91,12 +91,10 @@ cdef class DeviceEvents:
     """
     Represents a set of events that can be waited on for a specific device.
     """
-    cdef intptr_t _event_set
+    cdef NvmlEventSetHandle _h_event_set
     cdef intptr_t _device_handle
 
     def __init__(self, device_handle: intptr_t, events: EventType | str | list[EventType | str]):
-        self._event_set = 0
-
         cdef unsigned long long event_bitmask
         if isinstance(events, (str, EventType)):
             events = [events]
@@ -116,14 +114,20 @@ cdef class DeviceEvents:
             raise TypeError("events must be an EventType, str, or list of EventType or str")
 
         self._device_handle = device_handle
-        self._event_set = nvml.event_set_create()
-        # If this raises, the event needs to be freed and this is handled by
-        # this class's __dealloc__ method.
-        nvml.device_register_events(self._device_handle, event_bitmask, self._event_set)
+        cdef intptr_t raw_set = nvml.event_set_create()
+        self._h_event_set = create_nvml_event_set_handle(raw_set)
+        if self._h_event_set.get() == NULL:
+            # No free function is registered, so the handle did not take
+            # ownership of raw_set; release it here instead of leaking it.
+            nvml.event_set_free(raw_set)
+            raise RuntimeError("NVML event set free function is not registered")
+        # The handle now owns raw_set: if device_register_events raises, the
+        # shared_ptr deleter frees it.
+        nvml.device_register_events(self._device_handle, event_bitmask, raw_set)
 
-    def __dealloc__(self) -> None:
-        if self._event_set != 0:
-            nvml.event_set_free(self._event_set)
+    cpdef close(self):
+        """Destroy the device event set, releasing its NVML resources."""
+        self._h_event_set.reset()
 
     def wait(self, timeout_ms: int = 0) -> EventData:
         """
@@ -167,4 +171,4 @@ cdef class DeviceEvents:
         :class:`cuda.core.system.GpuIsLostError`
             If the GPU has fallen off the bus or is otherwise inaccessible.
         """
-        return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms))
+        return EventData(nvml.event_set_wait_v2(as_intptr(self._h_event_set), timeout_ms))
diff --git a/cuda_core/cuda/core/system/_system_events.pyi b/cuda_core/cuda/core/system/_system_events.pyi
index 5ae5b86bc57..1c3c3559654 100644
--- a/cuda_core/cuda/core/system/_system_events.pyi
+++ b/cuda_core/cuda/core/system/_system_events.pyi
@@ -55,10 +55,10 @@ class RegisteredSystemEvents:
     Represents a set of events that can be waited on for a specific device.
     """
 
-    def __init__(self, events: SystemEventType | str | list[SystemEventType | str]):
-        ...
+    def close(self):
+        """Destroy the system event set, releasing its NVML resources."""
 
-    def __dealloc__(self) -> None:
+    def __init__(self, events: SystemEventType | str | list[SystemEventType | str]):
         ...
 
     def wait(self, timeout_ms: int=0, buffer_size: int=1) -> SystemEvents:
diff --git a/cuda_core/cuda/core/system/_system_events.pyx b/cuda_core/cuda/core/system/_system_events.pyx
index 87a3dfcf1ef..1c095cc13fa 100644
--- a/cuda_core/cuda/core/system/_system_events.pyx
+++ b/cuda_core/cuda/core/system/_system_events.pyx
@@ -7,6 +7,13 @@ from libc.stdint cimport intptr_t
 
 from cuda.bindings import nvml
 
+from cuda.core._resource_handles cimport (
+    NvmlSysEventSetHandle,
+    as_intptr,
+    create_nvml_sys_event_set_handle,
+    register_nvml_event_set_fn_pointers,
+)
+
 from ._nvml_context cimport initialize
 from . import _device
 
@@ -22,6 +29,23 @@ _SYSTEM_EVENT_TYPE_MAPPING = {
 _SYSTEM_EVENT_TYPE_INV_MAPPING = {v: k for k, v in _SYSTEM_EVENT_TYPE_MAPPING.items()}
 
 
+cdef void _register_nvml_fn_pointers() noexcept:
+    try:
+        from cuda.bindings._internal import nvml_linux as _nvml_internal
+    except ImportError:
+        try:
+            from cuda.bindings._internal import nvml_windows as _nvml_internal
+        except ImportError:
+            return
+    fn_ptrs = _nvml_internal._inspect_function_pointers()
+    cdef intptr_t p_event_set_free = fn_ptrs.get("__nvmlEventSetFree", 0)
+    cdef intptr_t p_sys_event_set_free = fn_ptrs.get("__nvmlSystemEventSetFree", 0)
+    register_nvml_event_set_fn_pointers(p_event_set_free, p_sys_event_set_free)
+
+
+_register_nvml_fn_pointers()
+
+
 cdef class SystemEvent:
     """
     Data about a collection of system events.
@@ -73,7 +97,7 @@ cdef class RegisteredSystemEvents:
     """
     Represents a set of events that can be waited on for a specific device.
     """
-    cdef intptr_t _event_set
+    cdef NvmlSysEventSetHandle _h_event_set
 
     def __init__(self, events: SystemEventType | str | list[SystemEventType | str]):
         cdef unsigned long long event_bitmask
@@ -96,15 +120,20 @@ cdef class RegisteredSystemEvents:
 
         initialize()
 
-        self._event_set = 0
-        self._event_set = nvml.system_event_set_create()
-        # If this raises, the event needs to be freed and this is handled by
-        # this class's __dealloc__ method.
-        nvml.system_register_events(event_bitmask, self._event_set)
+        cdef intptr_t raw_set = nvml.system_event_set_create()
+        self._h_event_set = create_nvml_sys_event_set_handle(raw_set)
+        if self._h_event_set.get() == NULL:
+            # No free function is registered, so the handle did not take
+            # ownership of raw_set; release it here instead of leaking it.
+            nvml.system_event_set_free(raw_set)
+            raise RuntimeError("NVML system event set free function is not registered")
+        # The handle now owns raw_set: if system_register_events raises, the
+        # shared_ptr deleter frees it.
+        nvml.system_register_events(event_bitmask, raw_set)
 
-    def __dealloc__(self) -> None:
-        if self._event_set != 0:
-            nvml.system_event_set_free(self._event_set)
+    cpdef close(self):
+        """Destroy the system event set, releasing its NVML resources."""
+        self._h_event_set.reset()
 
     def wait(self, timeout_ms: int = 0, buffer_size: int = 1) -> SystemEvents:
         """
@@ -140,7 +169,7 @@ cdef class RegisteredSystemEvents:
         :class:`cuda.core.system.GpuIsLostError`
             If the GPU has fallen off the bus or is otherwise inaccessible.
         """
-        return SystemEvents(nvml.system_event_set_wait(self._event_set, timeout_ms, buffer_size))
+        return SystemEvents(nvml.system_event_set_wait(as_intptr(self._h_event_set), timeout_ms, buffer_size))
 
 
 def register_events(events: SystemEventType | str | list[SystemEventType | str]) -> RegisteredSystemEvents: