-
Notifications
You must be signed in to change notification settings - Fork 301
Use shared_ptr for NVML event_set resource management #2240
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1337,6 +1337,77 @@ FileDescriptorHandle create_fd_handle_ref(int fd) { | |
| #endif | ||
| } | ||
|
|
||
| // ============================================================================ | ||
| // NVML event set function pointers and registration | ||
| // ============================================================================ | ||
|
|
||
| NvmlEventSetFreeFn p_nvmlEventSetFree = nullptr; | ||
| NvmlSysEventSetFreeFn p_nvmlSysEventSetFree = nullptr; | ||
|
|
||
| void register_nvml_event_set_fn_pointers(intptr_t event_set_free_fn, | ||
| intptr_t sys_event_set_free_fn) noexcept { | ||
| p_nvmlEventSetFree = reinterpret_cast<NvmlEventSetFreeFn>(event_set_free_fn); | ||
| p_nvmlSysEventSetFree = reinterpret_cast<NvmlSysEventSetFreeFn>(sys_event_set_free_fn); | ||
| } | ||
|
|
||
| // ============================================================================ | ||
| // NVML Event Set Handles (device-scope) | ||
| // ============================================================================ | ||
|
|
||
| namespace { | ||
| struct NvmlEventSetBox { | ||
| NvmlEventSetValue resource; | ||
| }; | ||
| } // namespace | ||
|
|
||
| NvmlEventSetHandle create_nvml_event_set_handle(intptr_t handle) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. An open design question: some I have not been able to settle on one approach, but it seems like something we could standardize. Here, if On the other hand, accepting construction parameters makes a fatter interface, with potentially many functions that just wrap existing CUDA APIs, I don't have a coherent analysis, just a few disconnected thoughts like this. |
||
| if (!p_nvmlEventSetFree) { | ||
| return NvmlEventSetHandle{}; | ||
| } | ||
| auto box = std::shared_ptr<NvmlEventSetBox>( | ||
| new NvmlEventSetBox{{handle}}, | ||
| [](NvmlEventSetBox* b) { | ||
| if (p_nvmlEventSetFree && b->resource.raw) { | ||
| p_nvmlEventSetFree(reinterpret_cast<void*>(b->resource.raw)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing |
||
| } | ||
| delete b; | ||
| } | ||
| ); | ||
| return NvmlEventSetHandle(box, &box->resource); | ||
| } | ||
|
|
||
| // ============================================================================ | ||
| // NVML System Event Set Handles (system-scope) | ||
| // ============================================================================ | ||
|
|
||
| namespace { | ||
| struct NvmlSysEventSetBox { | ||
| NvmlSysEventSetValue resource; | ||
| }; | ||
| } // namespace | ||
|
|
||
| NvmlSysEventSetHandle create_nvml_sys_event_set_handle(intptr_t handle) { | ||
| if (!p_nvmlSysEventSetFree) { | ||
| return NvmlSysEventSetHandle{}; | ||
| } | ||
| auto box = std::shared_ptr<NvmlSysEventSetBox>( | ||
| new NvmlSysEventSetBox{{handle}}, | ||
| [](NvmlSysEventSetBox* b) { | ||
| if (p_nvmlSysEventSetFree && b->resource.raw) { | ||
| // Matches NVML_STRUCT_VERSION(SystemEventSetFreeRequest, 1): | ||
| // version = sizeof(struct) | (1 << 24). Both our struct and the | ||
| // NVML header struct have the same layout ({unsigned int, void*}). | ||
| NvmlSysEventSetFreeRequest req; | ||
| req.set = reinterpret_cast<void*>(b->resource.raw); | ||
| req.version = (unsigned int)(sizeof(NvmlSysEventSetFreeRequest) | (1u << 24u)); | ||
| p_nvmlSysEventSetFree(&req); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto regarding |
||
| } | ||
| delete b; | ||
| } | ||
| ); | ||
| return NvmlSysEventSetHandle(box, &box->resource); | ||
| } | ||
|
|
||
| // ============================================================================ | ||
| // SM resource split wrapper | ||
| // ============================================================================ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,6 +36,13 @@ struct TaggedHandle { | |
| using NvvmProgramValue = TaggedHandle<nvvmProgram, 0>; | ||
| using NvJitLinkValue = TaggedHandle<nvJitLink_t, 1>; | ||
|
|
||
| // NVML event set types — forward-declared as void* to avoid nvml.h dependency. | ||
| // nvmlEventSet_t = nvmlEventSet_st* (device-scope event set) | ||
| // nvmlSystemEventSet_t = nvmlSystemEventSet_st* (system-scope event set) | ||
| // TaggedHandle distinguishes the two intptr_t-based handle types for overloading. | ||
| using NvmlEventSetValue = TaggedHandle<intptr_t, 2>; | ||
| using NvmlSysEventSetValue = TaggedHandle<intptr_t, 3>; | ||
|
|
||
| // ============================================================================ | ||
| // Thread-local error handling | ||
| // ============================================================================ | ||
|
|
@@ -152,6 +159,35 @@ extern NvvmDestroyProgramFn p_nvvmDestroyProgram; | |
| using NvJitLinkDestroyFn = int (*)(nvJitLink_t*); | ||
| extern NvJitLinkDestroyFn p_nvJitLinkDestroy; | ||
|
|
||
| // ============================================================================ | ||
| // NVML event set function pointers | ||
| // | ||
| // Populated by register_nvml_event_set_fn_pointers(), called from the system | ||
| // event / device modules once the NVML bindings have loaded the library. | ||
| // Both may be null until registration; deleters are no-ops when null. | ||
| // ============================================================================ | ||
|
|
||
| // nvmlReturn_t nvmlEventSetFree(nvmlEventSet_t set) | ||
| // nvmlEventSet_t is nvmlEventSet_st* (opaque pointer stored as intptr_t here) | ||
| using NvmlEventSetFreeFn = unsigned int (*)(void*); | ||
| extern NvmlEventSetFreeFn p_nvmlEventSetFree; | ||
|
|
||
| // Minimal layout-compatible counterpart to nvmlSystemEventSetFreeRequest_v1_t. | ||
| // Both fields match the NVML header: {unsigned int version; void* set;}. | ||
| struct NvmlSysEventSetFreeRequest { | ||
| unsigned int version; | ||
| void* set; // nvmlSystemEventSet_t | ||
| }; | ||
|
|
||
| // nvmlReturn_t nvmlSystemEventSetFree(nvmlSystemEventSetFreeRequest_t*) | ||
| using NvmlSysEventSetFreeFn = unsigned int (*)(NvmlSysEventSetFreeRequest*); | ||
| extern NvmlSysEventSetFreeFn p_nvmlSysEventSetFree; | ||
|
|
||
| // Register both NVML event-set free function pointers. | ||
| // safe to call multiple times (idempotent); second call is a no-op. | ||
| void register_nvml_event_set_fn_pointers(intptr_t event_set_free_fn, | ||
| intptr_t sys_event_set_free_fn) noexcept; | ||
|
|
||
| // ============================================================================ | ||
| // Handle type aliases - expose only the raw CUDA resource | ||
| // ============================================================================ | ||
|
|
@@ -171,6 +207,24 @@ using NvvmProgramHandle = std::shared_ptr<const NvvmProgramValue>; | |
| using NvJitLinkHandle = std::shared_ptr<const NvJitLinkValue>; | ||
| using CuLinkHandle = std::shared_ptr<const CUlinkState>; | ||
| using FileDescriptorHandle = std::shared_ptr<const int>; | ||
| using NvmlEventSetHandle = std::shared_ptr<const NvmlEventSetValue>; | ||
| using NvmlSysEventSetHandle = std::shared_ptr<const NvmlSysEventSetValue>; | ||
|
|
||
| // ============================================================================ | ||
| // NVML event set handle functions | ||
| // ============================================================================ | ||
|
|
||
| // Create an owning device-scope NVML event set handle. | ||
| // handle is the intptr_t value returned by nvml.event_set_create(). | ||
| // When the last reference is released, nvmlEventSetFree is called. | ||
| // Returns empty handle if registration has not been done (p_nvmlEventSetFree is null). | ||
| NvmlEventSetHandle create_nvml_event_set_handle(intptr_t handle); | ||
|
|
||
| // Create an owning system-scope NVML event set handle. | ||
| // handle is the intptr_t value returned by nvml.system_event_set_create(). | ||
| // When the last reference is released, nvmlSystemEventSetFree is called via struct. | ||
| // Returns empty handle if registration has not been done. | ||
| NvmlSysEventSetHandle create_nvml_sys_event_set_handle(intptr_t handle); | ||
|
|
||
|
|
||
| // ============================================================================ | ||
|
|
@@ -661,6 +715,14 @@ inline std::intptr_t as_intptr(const FileDescriptorHandle& h) noexcept { | |
| return h ? static_cast<std::intptr_t>(*h) : -1; | ||
| } | ||
|
|
||
| inline std::intptr_t as_intptr(const NvmlEventSetHandle& h) noexcept { | ||
| return h ? h->raw : 0; | ||
| } | ||
|
|
||
| inline std::intptr_t as_intptr(const NvmlSysEventSetHandle& h) noexcept { | ||
| return h ? h->raw : 0; | ||
| } | ||
|
|
||
|
Comment on lines
+718
to
+725
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was expecting |
||
| // as_py() - convert handle to Python wrapper object (returns new reference) | ||
| #if PY_VERSION_HEX < 0x030D0000 | ||
| extern "C" int _Py_IsFinalizing(void); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -91,12 +91,10 @@ cdef class DeviceEvents: | |
| """ | ||
| Represents a set of events that can be waited on for a specific device. | ||
| """ | ||
| cdef intptr_t _event_set | ||
| cdef NvmlEventSetHandle _h_event_set | ||
| cdef intptr_t _device_handle | ||
|
|
||
| def __init__(self, device_handle: intptr_t, events: EventType | str | list[EventType | str]): | ||
| self._event_set = 0 | ||
|
|
||
| cdef unsigned long long event_bitmask | ||
| if isinstance(events, (str, EventType)): | ||
| events = [events] | ||
|
|
@@ -116,14 +114,15 @@ cdef class DeviceEvents: | |
| raise TypeError("events must be an EventType, str, or list of EventType or str") | ||
|
|
||
| self._device_handle = device_handle | ||
| self._event_set = nvml.event_set_create() | ||
| # If this raises, the event needs to be freed and this is handled by | ||
| # this class's __dealloc__ method. | ||
| nvml.device_register_events(self._device_handle, event_bitmask, self._event_set) | ||
| cdef intptr_t raw_set = nvml.event_set_create() | ||
| # If device_register_events raises, create_nvml_event_set_handle already | ||
| # owns the handle and its shared_ptr deleter will free it. | ||
| self._h_event_set = create_nvml_event_set_handle(raw_set) | ||
| nvml.device_register_events(self._device_handle, event_bitmask, raw_set) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is where I would expect to see |
||
|
|
||
| def __dealloc__(self) -> None: | ||
| if self._event_set != 0: | ||
| nvml.event_set_free(self._event_set) | ||
| cpdef close(self): | ||
| """Destroy the device event set, releasing its NVML resources.""" | ||
| self._h_event_set.reset() | ||
|
|
||
| def wait(self, timeout_ms: int = 0) -> EventData: | ||
| """ | ||
|
|
@@ -167,4 +166,4 @@ cdef class DeviceEvents: | |
| :class:`cuda.core.system.GpuIsLostError` | ||
| If the GPU has fallen off the bus or is otherwise inaccessible. | ||
| """ | ||
| return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms)) | ||
| return EventData(nvml.event_set_wait_v2(as_intptr(self._h_event_set), timeout_ms)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to the above comment. My mental model:
Maybe it makes sense to do something different for NVML; or, is it better to have consistency at every call site? |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can/should the registration of these NVML symbols be made to match the driver pattern, where
p_*symbols are resolved viacuda.bindingsin_resource_handles.pyx? The NVML initialization adds a wrinkle so it's not clear to me what's best.