NVIDIA · KRRT7 · May 11, 2026 · May 12, 2026 · May 11, 2026 · May 12, 2026
diff --git a/cuda_core/cuda/core/_launch_config.pyx b/cuda_core/cuda/core/_launch_config.pyx
@@ -6,9 +6,7 @@ from libc.string cimport memset
 
 from typing import Any
 
-from cuda.core._device import Device
 from cuda.core._utils.cuda_utils import (
-    CUDAError,
     cast_to_3_tuple,
     driver,
 )
@@ -78,16 +76,7 @@ cdef class LaunchConfig:
         self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
         self.block = cast_to_3_tuple("LaunchConfig.block", block)
 
-        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
-        # look up the device from stream. We probably need to defer the checks related to
-        # device compute capability or attributes.
-        # thread block clusters are supported starting H100
         if cluster is not None:
-            cc = Device().compute_capability
-            if cc < (9, 0):
-                raise CUDAError(
-                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
-                )
             self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
         else:
             self.cluster = None
@@ -100,9 +89,6 @@ cdef class LaunchConfig:
 
         self.is_cooperative = is_cooperative
 
-        if self.is_cooperative and not Device().properties.cooperative_launch:
-            raise CUDAError("cooperative kernels are not supported on this device")
-
     def _identity(self) -> tuple[Any, ...]:
         return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS)
 

diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx
@@ -17,6 +17,7 @@ from cuda.core._utils.cuda_utils cimport (
 )
 from cuda.core._module import Kernel
 from cuda.core._stream import Stream
+from cuda.core._utils.cuda_utils import CUDAError
 from math import prod
 from typing import TYPE_CHECKING
 
@@ -62,14 +63,26 @@ def launch(
 
     drv_cfg = conf._to_native_launch_config()
     drv_cfg.hStream = as_cu(s._h_stream)
+    if conf.cluster is not None:
+        _check_cluster_launch(conf, s)
     if conf.is_cooperative:
         _check_cooperative_launch(kernel, conf, s)
     with nogil:
         HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL))
 
 
+cdef _check_cluster_launch(config: LaunchConfig, stream: Stream):  # NOTE(review): ``config`` is unused here
+    cc = stream.device.compute_capability  # use the launch stream's device, not the current device
+    if cc < (9, 0):  # thread block clusters are supported starting H100
+        raise CUDAError(
+            f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
+        )
+
+
 cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
     dev = stream.device
+    if not dev.properties.cooperative_launch:
+        raise CUDAError("cooperative kernels are not supported on this device")
     num_sm = dev.properties.multiprocessor_count
     max_grid_size = (
         kernel.occupancy.max_active_blocks_per_multiprocessor(prod(config.block), config.shmem_size) * num_sm

diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
@@ -25,7 +25,6 @@
     launch,
 )
 from cuda.core._memory._legacy import _SynchronousMemoryResource
-from cuda.core._utils.cuda_utils import CUDAError
 from cuda.core.typing import ObjectCodeFormatType, SourceCodeType
 
 
@@ -63,66 +62,58 @@ def test_launch_config_shmem_size():
     assert config.shmem_size == 0
 
 
-def test_launch_config_cluster_grid_conversion(init_cuda):
+def test_launch_config_cluster_grid_conversion():  # no init_cuda: LaunchConfig() no longer calls Device()
     """Test that LaunchConfig preserves original grid values and conversion happens in native config."""
-    try:
-        # Test case 1: 1D - Issue #867 example
-        config = LaunchConfig(grid=4, cluster=2, block=32)
-        assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
-        assert config.cluster == (2, 1, 1), f"Expected (2, 1, 1), got {config.cluster}"
-        assert config.block == (32, 1, 1), f"Expected (32, 1, 1), got {config.block}"
+    # Test case 1: 1D - Issue #867 example
+    config = LaunchConfig(grid=4, cluster=2, block=32)
+    assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
+    assert config.cluster == (2, 1, 1), f"Expected (2, 1, 1), got {config.cluster}"
+    assert config.block == (32, 1, 1), f"Expected (32, 1, 1), got {config.block}"
 
-        # Test case 2: 2D grid and cluster
-        config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
-        assert config.grid == (2, 3, 1), f"Expected (2, 3, 1), got {config.grid}"
-        assert config.cluster == (2, 2, 1), f"Expected (2, 2, 1), got {config.cluster}"
-
-        # Test case 3: 3D full specification
-        config = LaunchConfig(grid=(2, 2, 2), cluster=(3, 3, 3), block=(8, 8, 8))
-        assert config.grid == (2, 2, 2), f"Expected (2, 2, 2), got {config.grid}"
-        assert config.cluster == (3, 3, 3), f"Expected (3, 3, 3), got {config.cluster}"
+    # Test case 2: 2D grid and cluster
+    config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
+    assert config.grid == (2, 3, 1), f"Expected (2, 3, 1), got {config.grid}"
+    assert config.cluster == (2, 2, 1), f"Expected (2, 2, 1), got {config.cluster}"
 
-        # Test case 4: Identity case
-        config = LaunchConfig(grid=1, cluster=1, block=32)
-        assert config.grid == (1, 1, 1), f"Expected (1, 1, 1), got {config.grid}"
+    # Test case 3: 3D full specification
+    config = LaunchConfig(grid=(2, 2, 2), cluster=(3, 3, 3), block=(8, 8, 8))
+    assert config.grid == (2, 2, 2), f"Expected (2, 2, 2), got {config.grid}"
+    assert config.cluster == (3, 3, 3), f"Expected (3, 3, 3), got {config.cluster}"
 
-        # Test case 5: No cluster (should not convert grid)
-        config = LaunchConfig(grid=4, block=32)
-        assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
-        assert config.cluster is None
+    # Test case 4: Identity case
+    config = LaunchConfig(grid=1, cluster=1, block=32)
+    assert config.grid == (1, 1, 1), f"Expected (1, 1, 1), got {config.grid}"
 
-    except CUDAError:
-        pytest.skip("Driver or GPU not new enough for thread block clusters")
+    # Test case 5: No cluster (should not convert grid)
+    config = LaunchConfig(grid=4, block=32)
+    assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
+    assert config.cluster is None
 
 
 def test_launch_config_native_conversion(init_cuda):
     """Test that _to_native_launch_config correctly converts grid from cluster units to block units."""
     from cuda.core._launch_config import _to_native_launch_config
 
-    try:
-        # Test case 1: 1D - Issue #867 example
-        config = LaunchConfig(grid=4, cluster=2, block=32)
-        native_config = _to_native_launch_config(config)
-        assert native_config.gridDimX == 8, f"Expected gridDimX=8, got {native_config.gridDimX}"
-        assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
-        assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
-
-        # Test case 2: 2D grid and cluster
-        config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
-        native_config = _to_native_launch_config(config)
-        assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
-        assert native_config.gridDimY == 6, f"Expected gridDimY=6, got {native_config.gridDimY}"
-        assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
+    # Test case 1: 1D - Issue #867 example (grid 4 * cluster 2 -> 8 blocks in x)
+    config = LaunchConfig(grid=4, cluster=2, block=32)
+    native_config = _to_native_launch_config(config)
+    assert native_config.gridDimX == 8, f"Expected gridDimX=8, got {native_config.gridDimX}"
+    assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
+    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
 
-        # Test case 3: No cluster (should not convert grid)
-        config = LaunchConfig(grid=4, block=32)
-        native_config = _to_native_launch_config(config)
-        assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
-        assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
-        assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
+    # Test case 2: 2D grid and cluster (grid (2, 3) * cluster (2, 2) -> 4 x 6 blocks)
+    config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
+    native_config = _to_native_launch_config(config)
+    assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
+    assert native_config.gridDimY == 6, f"Expected gridDimY=6, got {native_config.gridDimY}"
+    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
 
-    except CUDAError:
-        pytest.skip("Driver or GPU not new enough for thread block clusters")
+    # Test case 3: No cluster (should not convert grid)
+    config = LaunchConfig(grid=4, block=32)
+    native_config = _to_native_launch_config(config)
+    assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
+    assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
+    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
 
 
 def test_to_native_launch_config_no_cluster():
@@ -142,35 +133,18 @@ def test_to_native_launch_config_no_cluster():
     assert list(native.attrs) == [], f"Expected empty attrs, got {list(native.attrs)}"
 
 
-def test_launch_config_cooperative_unsupported(monkeypatch):
-    """LaunchConfig(is_cooperative=True) raises when device does not support it."""
-    from cuda.core import _launch_config as _lc_mod
-
-    class _FakeProps:
-        cooperative_launch = False
-
-    class _FakeDev:
-        properties = _FakeProps()
-
-    monkeypatch.setattr(_lc_mod, "Device", lambda: _FakeDev())
-    with pytest.raises(CUDAError, match="cooperative kernels are not supported"):
-        LaunchConfig(grid=1, block=1, is_cooperative=True)
+@pytest.mark.human_reviewed
+def test_launch_config_cooperative_defers_device_check():
+    """LaunchConfig(is_cooperative=True) is pure config construction; device support is checked at launch."""
+    config = LaunchConfig(grid=1, block=1, is_cooperative=True)
+    assert config.is_cooperative is True
 
 
-def test_to_native_launch_config_cooperative(monkeypatch):
-    """Covers the is_cooperative branch of _to_native_launch_config; Device is mocked so it runs on any GPU."""
+def test_to_native_launch_config_cooperative():
+    """Covers the is_cooperative branch of _to_native_launch_config."""
     from cuda.bindings import driver
-    from cuda.core import _launch_config as _lc_mod
     from cuda.core._launch_config import _to_native_launch_config
 
-    class _FakeProps:
-        cooperative_launch = True
-
-    class _FakeDev:
-        properties = _FakeProps()
-
-    monkeypatch.setattr(_lc_mod, "Device", lambda: _FakeDev())
-
     config = LaunchConfig(grid=2, block=4, is_cooperative=True)
     native = _to_native_launch_config(config)
     assert native.gridDimX == 2
@@ -183,50 +157,21 @@ class _FakeDev:
     assert attr.value.cooperative == 1, f"Expected cooperative=1, got {attr.value.cooperative}"
 
 
-def test_launch_config_cluster_accepts_hopper_cc(monkeypatch):
-    """LaunchConfig accepts ``cluster`` when the device reports compute
-    capability >= 9.0. Device is mocked so the cluster-cast branch runs on any
-    GPU (real cluster support otherwise requires Hopper+)."""
-    from cuda.core import _launch_config as _lc_mod
-
-    class _FakeDev:
-        compute_capability = (9, 0)
-
-    # looked_up confirms the mock took effect.
-    looked_up = []
-    monkeypatch.setattr(_lc_mod, "Device", lambda: looked_up.append(1) or _FakeDev())
-
+@pytest.mark.human_reviewed
+def test_launch_config_cluster_defers_device_check():
+    """LaunchConfig accepts ``cluster`` without consulting the current device; the cc check runs at launch."""
     config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
-    assert looked_up, "Device was not looked up via the module global; mock did not take effect"
     assert config.cluster == (2, 2, 1)
     assert config.grid == (2, 3, 1)
 
 
-def test_launch_config_cluster_rejects_pre_hopper_cc(monkeypatch):
-    """LaunchConfig(cluster=...) raises on a device with compute capability < 9.0."""
-    from cuda.core import _launch_config as _lc_mod
-
-    class _FakeDev:
-        compute_capability = (8, 6)
-
-    # looked_up confirms the mock took effect.
-    looked_up = []
-    monkeypatch.setattr(_lc_mod, "Device", lambda: looked_up.append(1) or _FakeDev())
-
-    with pytest.raises(CUDAError, match="thread block clusters are not supported"):
-        LaunchConfig(grid=2, cluster=2, block=32)
-    assert looked_up, "Device was not looked up via the module global; mock did not take effect"
-
-
 def test_to_native_launch_config_cluster_branch():
     """Covers the cluster branch of ``_to_native_launch_config`` (grid is
     converted from cluster units to block units, plus the cluster-dimension
     attribute) without requiring Hopper.
 
-    The cc gate lives in ``LaunchConfig.__init__``; ``cluster`` itself is a
-    public attribute, so setting it on a cluster-free config yields the exact
-    object ``__init__`` would build on Hopper and lets the conversion run on
-    any GPU.
+    The cc gate lives in launch-time validation, so constructing cluster
+    configs and converting them to native launch configs can run on any GPU.
 
     Note: this exercises the standalone ``cpdef _to_native_launch_config``
     function (a duplicate of the ``LaunchConfig._to_native_launch_config``
@@ -236,8 +181,7 @@ def test_to_native_launch_config_cluster_branch():
     from cuda.bindings import driver
     from cuda.core._launch_config import _to_native_launch_config
 
-    config = LaunchConfig(grid=(2, 3, 4), block=(5, 6, 7))
-    config.cluster = (2, 2, 2)
+    config = LaunchConfig(grid=(2, 3, 4), cluster=(2, 2, 2), block=(5, 6, 7))
     native = _to_native_launch_config(config)
 
     # grid (in cluster units) * cluster -> block units