diff --git a/cuda_bindings/tests/test_interoperability.py b/cuda_bindings/tests/test_interoperability.py
index 3da1877128b..96270e208c3 100644
--- a/cuda_bindings/tests/test_interoperability.py
+++ b/cuda_bindings/tests/test_interoperability.py
@@ -6,6 +6,7 @@
 
 import cuda.bindings.driver as cuda
 import cuda.bindings.runtime as cudart
+from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
 
 
 def supportsMemoryPool():
@@ -87,12 +88,14 @@ def test_interop_graphNode():
 def test_interop_memPool():
     # DRV to RT
     err_dr, pool = cuda.cuDeviceGetDefaultMemPool(0)
+    xfail_if_mempool_oom(err_dr, "cuDeviceGetDefaultMemPool", 0)
     assert err_dr == cuda.CUresult.CUDA_SUCCESS
     (err_rt,) = cudart.cudaDeviceSetMemPool(0, pool)
     assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # RT to DRV
     err_rt, pool = cudart.cudaDeviceGetDefaultMemPool(0)
+    xfail_if_mempool_oom(err_rt, "cudaDeviceGetDefaultMemPool", 0)
     assert err_rt == cudart.cudaError_t.cudaSuccess
     (err_dr,) = cuda.cuDeviceSetMemPool(0, pool)
     assert err_dr == cuda.CUresult.CUDA_SUCCESS
diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
index b7b8b247a92..fb7689c54bb 100644
--- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
@@ -13,6 +13,7 @@ from cuda.core._memory._ipc cimport IPCAllocationHandle
 from cuda.core._resource_handles cimport (
     as_cu,
     get_device_mempool,
+    get_last_error,
 )
 from cuda.core._utils.cuda_utils cimport (
     check_or_create_options,
@@ -262,6 +263,14 @@ cdef inline _DMR_init(DeviceMemoryResource self, device_id, options):
 
     if opts is None:
         self._h_pool = get_device_mempool(dev_id)
+        if not self._h_pool:
+            HANDLE_RETURN(get_last_error())
+            raise RuntimeError(
+                f"Failed to initialize DeviceMemoryResource for device {dev_id}: "
+                "cuda-core returned an empty memory pool handle without recording a CUDA error. "
+                "This is an internal cuda-core error; please report it with your CUDA driver, "
+                "CUDA Toolkit, and cuda-python versions."
+            )
         self._mempool_owned = False
         MP_raise_release_threshold(self)
     else:
diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx
index 8fdc324dc59..60b056d3f28 100644
--- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx
@@ -11,6 +11,7 @@ from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, Mem
 from cuda.core._resource_handles cimport (
     DevicePtrHandle,
     deviceptr_alloc_async,
+    get_last_error,
     as_cu,
 )
 
@@ -194,7 +195,13 @@ cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream
     check_capturing(s)
     h_ptr = deviceptr_alloc_async(size, stream._h_stream)
     if not h_ptr:
-        raise RuntimeError("Failed to allocate memory asynchronously")
+        HANDLE_RETURN(get_last_error())
+        raise RuntimeError(
+            f"Failed to allocate {size} bytes from GraphMemoryResource: "
+            "cuda-core returned an empty allocation handle without recording a CUDA error. "
+            "This is an internal cuda-core error; please report it with your CUDA driver, "
+            "CUDA Toolkit, and cuda-python versions."
+        )
     return Buffer_from_deviceptr_handle(h_ptr, size, self, None)
 
 
diff --git a/cuda_core/cuda/core/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx
index 59414fc1b2e..833b24b0e2a 100644
--- a/cuda_core/cuda/core/_memory/_ipc.pyx
+++ b/cuda_core/cuda/core/_memory/_ipc.pyx
@@ -211,7 +211,13 @@ cdef _MemPool MP_from_allocation_handle(cls, alloc_handle):
     cdef int ipc_fd = int(alloc_handle)
     self._h_pool = create_mempool_handle_ipc(ipc_fd, IPC_HANDLE_TYPE)
     if not self._h_pool:
-        raise RuntimeError("Failed to import memory pool from IPC handle")
+        HANDLE_RETURN(get_last_error())
+        raise RuntimeError(
+            f"Failed to import {cls.__name__} from an allocation handle: "
+            "cuda-core returned an empty memory pool handle without recording a CUDA error. "
+            "This is an internal cuda-core error; please report it with your CUDA driver, "
+            "CUDA Toolkit, and cuda-python versions."
+        )
     self._ipc_data = IPCDataForMR(alloc_handle, True)
 
     # Register it.
diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx
index 4da5e26ea92..02857cbb163 100644
--- a/cuda_core/cuda/core/_memory/_memory_pool.pyx
+++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx
@@ -17,6 +17,7 @@ from cuda.core._resource_handles cimport (
     DevicePtrHandle,
     create_mempool_handle,
     deviceptr_alloc_from_pool,
+    get_last_error,
     as_cu,
     as_py,
 )
@@ -228,6 +229,14 @@ cdef int MP_init_create_pool(
 
     self._mempool_owned = True
     self._h_pool = create_mempool_handle(properties)
+    if not self._h_pool:
+        HANDLE_RETURN(get_last_error())
+        raise RuntimeError(
+            f"Failed to initialize {self.__class__.__name__}: "
+            "cuda-core returned an empty memory pool handle without recording a CUDA error. "
+            "This is an internal cuda-core error; please report it with your CUDA driver, "
+            "CUDA Toolkit, and cuda-python versions."
+ ) if ipc_enabled: alloc_handle = _ipc.MP_export_mempool(self) @@ -307,7 +316,13 @@ cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream): check_not_capturing(s) h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) if not h_ptr: - raise RuntimeError("Failed to allocate memory from pool") + HANDLE_RETURN(get_last_error()) + raise RuntimeError( + f"Failed to allocate {size} bytes from {self.__class__.__name__}: " + "cuda-core returned an empty allocation handle without recording a CUDA error. " + "This is an internal cuda-core error; please report it with your CUDA driver, " + "CUDA Toolkit, and cuda-python versions." + ) return Buffer_from_deviceptr_handle(h_ptr, size, self, None) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 9f48686c30c..86c0c0cd7d4 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -5,6 +5,7 @@ import os import pathlib import sys +from contextlib import contextmanager from importlib.metadata import PackageNotFoundError, distribution import pytest @@ -87,6 +88,8 @@ def create_managed_memory_resource_or_skip(*args, xfail_device=None, **kwargs): return ManagedMemoryResource(*args, **kwargs) except CUDAError as e: xfail_if_mempool_oom(e, _device_id_from_resource_options(xfail_device, args, kwargs)) + if "CUDA_ERROR_NOT_SUPPORTED" in str(e): + pytest.skip("ManagedMemoryResource is not supported on this platform/device") raise except RuntimeError as e: if "requires CUDA 13.0" in str(e): @@ -102,6 +105,15 @@ def create_pinned_memory_resource_or_xfail(*args, xfail_device=None, **kwargs): raise +@contextmanager +def xfail_on_graph_mempool_oom(device=0): + try: + yield + except CUDAError as e: + xfail_if_mempool_oom(e, "cuGraphAddMemAllocNode", device) + raise + + def _device_id_from_resource_options(device, args, kwargs): if device is not None: return device diff --git a/cuda_core/tests/graph/test_graph_definition.py 
b/cuda_core/tests/graph/test_graph_definition.py index f9d10c766eb..da78bea577f 100644 --- a/cuda_core/tests/graph/test_graph_definition.py +++ b/cuda_core/tests/graph/test_graph_definition.py @@ -10,6 +10,7 @@ from helpers.graph_kernels import compile_common_kernels from helpers.misc import try_create_condition +from conftest import xfail_on_graph_mempool_oom from cuda.core import Device, LaunchConfig from cuda.core.graph import ( AllocNode, @@ -201,13 +202,15 @@ def _build_disconnected(): def graph_spec(request, init_cuda): if request.param is not _build_empty: _skip_if_no_mempool() - return request.param() + with xfail_on_graph_mempool_oom(): + return request.param() @pytest.fixture(params=_NONEMPTY_BUILDERS) def nonempty_graph_spec(request, init_cuda): _skip_if_no_mempool() - return request.param() + with xfail_on_graph_mempool_oom(): + return request.param() # ============================================================================= @@ -562,7 +565,8 @@ def node_spec(request, init_cuda): if spec.needs_mempool: _skip_if_no_mempool() g = GraphDefinition() - node, expected_attrs = spec.builder(g) + with xfail_on_graph_mempool_oom(): + node, expected_attrs = spec.builder(g) return spec, g, node, expected_attrs @@ -803,18 +807,20 @@ def test_alloc_zero_size_fails(sample_graphdef): def test_free_creates_dependency(sample_graphdef): """Free node depends on its predecessor.""" _skip_if_no_mempool() - alloc = sample_graphdef.allocate(ALLOC_SIZE) - free = alloc.deallocate(alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ALLOC_SIZE) + free = alloc.deallocate(alloc.dptr) assert alloc in free.pred def test_alloc_free_chain(sample_graphdef): """Alloc and free can be chained.""" _skip_if_no_mempool() - a1 = sample_graphdef.allocate(ALLOC_SIZE) - a2 = a1.allocate(ALLOC_SIZE) - f2 = a2.deallocate(a2.dptr) - f1 = f2.deallocate(a1.dptr) + with xfail_on_graph_mempool_oom(): + a1 = sample_graphdef.allocate(ALLOC_SIZE) + a2 = 
a1.allocate(ALLOC_SIZE) + f2 = a2.deallocate(a2.dptr) + f1 = f2.deallocate(a1.dptr) assert a1 in a2.pred assert a2 in f2.pred assert f2 in f1.pred @@ -842,7 +848,8 @@ def test_alloc_device_option(sample_graphdef, device_spec): """Device can be specified as int or Device object.""" _skip_if_no_mempool() device = Device() - node = sample_graphdef.allocate(ALLOC_SIZE, device=device_spec(device)) + with xfail_on_graph_mempool_oom(device): + node = sample_graphdef.allocate(ALLOC_SIZE, device=device_spec(device)) assert node.dptr != 0 @@ -850,7 +857,8 @@ def test_alloc_peer_access(mempool_device_x2): """AllocNode.peer_access reflects requested peers.""" d0, d1 = mempool_device_x2 g = GraphDefinition() - node = g.allocate(ALLOC_SIZE, device=d0.device_id, peer_access=[d1.device_id]) + with xfail_on_graph_mempool_oom(d0): + node = g.allocate(ALLOC_SIZE, device=d0.device_id, peer_access=[d1.device_id]) assert d1.device_id in node.peer_access @@ -863,8 +871,9 @@ def test_alloc_peer_access(mempool_device_x2): def test_join_merges_branches(sample_graphdef, num_branches): """join() with multiple branches creates correct dependencies.""" _skip_if_no_mempool() - branches = [sample_graphdef.allocate(ALLOC_SIZE) for _ in range(num_branches)] - joined = sample_graphdef.join(*branches) + with xfail_on_graph_mempool_oom(): + branches = [sample_graphdef.allocate(ALLOC_SIZE) for _ in range(num_branches)] + joined = sample_graphdef.join(*branches) assert isinstance(joined, EmptyNode) assert set(joined.pred) == set(branches) @@ -956,8 +965,9 @@ def test_instantiate_empty_graph(sample_graphdef, inst_kwargs): def test_instantiate_with_nodes(sample_graphdef, inst_kwargs): """Graph with nodes can be instantiated.""" _skip_if_no_mempool() - sample_graphdef.allocate(ALLOC_SIZE) - sample_graphdef.allocate(ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + sample_graphdef.allocate(ALLOC_SIZE) + sample_graphdef.allocate(ALLOC_SIZE) graph = _instantiate(sample_graphdef, inst_kwargs) assert graph is 
not None @@ -997,8 +1007,9 @@ def test_instantiate_and_execute_kernel(sample_graphdef, inst_kwargs): def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs): """Graph with alloc/free can be executed.""" _skip_if_no_mempool() - alloc = sample_graphdef.allocate(ALLOC_SIZE) - alloc.deallocate(alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ALLOC_SIZE) + alloc.deallocate(alloc.dptr) stream = Device().create_stream() graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream) @@ -1010,9 +1021,10 @@ def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs): def test_instantiate_and_execute_memset(sample_graphdef, inst_kwargs): """Graph with alloc/memset/free can be executed.""" _skip_if_no_mempool() - alloc = sample_graphdef.allocate(ALLOC_SIZE) - ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE) - ms.deallocate(alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ALLOC_SIZE) + ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE) + ms.deallocate(alloc.dptr) stream = Device().create_stream() graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream) @@ -1026,12 +1038,13 @@ def test_instantiate_and_execute_memcpy(sample_graphdef, inst_kwargs): _skip_if_no_mempool() import ctypes - src_alloc = sample_graphdef.allocate(ALLOC_SIZE) - dst_alloc = sample_graphdef.allocate(ALLOC_SIZE) - dep = sample_graphdef.join(src_alloc, dst_alloc) - ms = dep.memset(src_alloc.dptr, 0xAB, ALLOC_SIZE) - cp = ms.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE) - cp.deallocate(src_alloc.dptr) + with xfail_on_graph_mempool_oom(): + src_alloc = sample_graphdef.allocate(ALLOC_SIZE) + dst_alloc = sample_graphdef.allocate(ALLOC_SIZE) + dep = sample_graphdef.join(src_alloc, dst_alloc) + ms = dep.memset(src_alloc.dptr, 0xAB, ALLOC_SIZE) + cp = ms.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE) + cp.deallocate(src_alloc.dptr) stream = Device().create_stream() graph = 
_instantiate_and_upload(sample_graphdef, inst_kwargs, stream) @@ -1166,11 +1179,12 @@ def test_instantiate_and_execute_if_then(sample_graphdef): set_handle = mod.get_kernel("set_handle") add_one = mod.get_kernel("add_one") - alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) - ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) - setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 1) - if_node = setter.if_then(condition) - if_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) + ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 1) + if_node = setter.if_then(condition) + if_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) graph = sample_graphdef.instantiate() stream = Device().create_stream() @@ -1198,13 +1212,14 @@ def test_instantiate_and_execute_if_else(sample_graphdef): set_handle = mod.get_kernel("set_handle") add_one = mod.get_kernel("add_one") - alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) - ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) - setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 0) - ie_node = setter.if_else(condition) - ie_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) - n1 = ie_node.else_.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) - n1.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) + ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 0) + ie_node = setter.if_else(condition) + ie_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + n1 = ie_node.else_.launch(LaunchConfig(grid=1, 
block=1), add_one, alloc.dptr) + n1.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) graph = sample_graphdef.instantiate() stream = Device().create_stream() @@ -1232,12 +1247,13 @@ def test_instantiate_and_execute_switch(sample_graphdef): set_handle = mod.get_kernel("set_handle") add_one = mod.get_kernel("add_one") - alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) - ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) - setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 2) - sw_node = setter.switch(condition, 4) - for branch in sw_node.branches: - branch.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) + ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 2) + sw_node = setter.switch(condition, 4) + for branch in sw_node.branches: + branch.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) graph = sample_graphdef.instantiate() stream = Device().create_stream() @@ -1272,7 +1288,8 @@ def test_conditional_node_type_preserved_by_nodes(sample_graphdef): def test_debug_dot_print_creates_file(sample_graphdef, dot_file): """debug_dot_print writes a DOT file.""" _skip_if_no_mempool() - sample_graphdef.allocate(ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + sample_graphdef.allocate(ALLOC_SIZE) sample_graphdef.debug_dot_print(str(dot_file)) assert dot_file.exists() content = dot_file.read_text() @@ -1282,7 +1299,8 @@ def test_debug_dot_print_creates_file(sample_graphdef, dot_file): def test_debug_dot_print_with_options(sample_graphdef, dot_file): """debug_dot_print accepts GraphDebugPrintOptions.""" _skip_if_no_mempool() - sample_graphdef.allocate(ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + sample_graphdef.allocate(ALLOC_SIZE) options = GraphDebugPrintOptions(verbose=True, handles=True) 
sample_graphdef.debug_dot_print(str(dot_file), options) assert dot_file.exists() @@ -1291,6 +1309,7 @@ def test_debug_dot_print_with_options(sample_graphdef, dot_file): def test_debug_dot_print_invalid_options(sample_graphdef, dot_file): """debug_dot_print rejects invalid options type.""" _skip_if_no_mempool() - sample_graphdef.allocate(ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + sample_graphdef.allocate(ALLOC_SIZE) with pytest.raises(TypeError, match="options must be a GraphDebugPrintOptions"): sample_graphdef.debug_dot_print(str(dot_file), "invalid") diff --git a/cuda_core/tests/graph/test_graph_definition_errors.py b/cuda_core/tests/graph/test_graph_definition_errors.py index 40f181e5db1..a8a3c9b8f09 100644 --- a/cuda_core/tests/graph/test_graph_definition_errors.py +++ b/cuda_core/tests/graph/test_graph_definition_errors.py @@ -9,6 +9,7 @@ from helpers.graph_kernels import compile_common_kernels from helpers.misc import try_create_condition +from conftest import xfail_on_graph_mempool_oom from cuda.core import Device, LaunchConfig from cuda.core._utils.cuda_utils import CUDAError from cuda.core.graph import ( @@ -69,7 +70,8 @@ def test_memset_invalid_value_size(init_cuda): """memset with 3-byte value (not 1, 2, or 4) raises ValueError.""" _skip_if_no_mempool() g = GraphDefinition() - alloc = g.allocate(1024) + with xfail_on_graph_mempool_oom(): + alloc = g.allocate(1024) with pytest.raises(ValueError): alloc.memset(alloc.dptr, b"\x01\x02\x03", 100) @@ -113,8 +115,9 @@ def test_join_single_predecessor(init_cuda): """node.join() with no extra args creates a single-dep empty node.""" _skip_if_no_mempool() g = GraphDefinition() - a = g.allocate(1024) - joined = a.join() + with xfail_on_graph_mempool_oom(): + a = g.allocate(1024) + joined = a.join() assert isinstance(joined, EmptyNode) assert set(joined.pred) == {a} @@ -136,7 +139,8 @@ def test_unmatched_alloc_succeeds(init_cuda): """Alloc without corresponding free is valid (graph-scoped lifetime).""" 
_skip_if_no_mempool() g = GraphDefinition() - g.allocate(1024) + with xfail_on_graph_mempool_oom(): + g.allocate(1024) graph = g.instantiate() stream = Device().create_stream() graph.launch(stream) @@ -174,10 +178,11 @@ def test_while_loop_zero_iterations(init_cuda): g = GraphDefinition() condition = g.create_condition(default_value=0) - alloc = g.allocate(SIZEOF_INT) - ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) - loop = ms.while_loop(condition) - loop.body.launch(cfg, add_one, alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = g.allocate(SIZEOF_INT) + ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) + loop = ms.while_loop(condition) + loop.body.launch(cfg, add_one, alloc.dptr) graph = g.instantiate() stream = Device().create_stream() @@ -202,10 +207,11 @@ def test_if_then_false_skips_body(init_cuda): g = GraphDefinition() condition = g.create_condition(default_value=0) - alloc = g.allocate(SIZEOF_INT) - ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) - if_node = ms.if_then(condition) - if_node.then.launch(cfg, add_one, alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = g.allocate(SIZEOF_INT) + ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) + if_node = ms.if_then(condition) + if_node.then.launch(cfg, add_one, alloc.dptr) graph = g.instantiate() stream = Device().create_stream() @@ -230,11 +236,12 @@ def test_switch_oob_skips_all_branches(init_cuda): g = GraphDefinition() condition = g.create_condition(default_value=99) - alloc = g.allocate(SIZEOF_INT) - ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) - sw = ms.switch(condition, 3) - for branch in sw.branches: - branch.launch(cfg, add_one, alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = g.allocate(SIZEOF_INT) + ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) + sw = ms.switch(condition, 3) + for branch in sw.branches: + branch.launch(cfg, add_one, alloc.dptr) graph = g.instantiate() stream = Device().create_stream() diff --git a/cuda_core/tests/graph/test_graph_definition_integration.py 
b/cuda_core/tests/graph/test_graph_definition_integration.py index b33b23d8860..12b57bb73a5 100644 --- a/cuda_core/tests/graph/test_graph_definition_integration.py +++ b/cuda_core/tests/graph/test_graph_definition_integration.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from conftest import xfail_on_graph_mempool_oom from cuda.core import Device, EventOptions, LaunchConfig, Program, ProgramOptions from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core.graph import GraphDefinition @@ -204,7 +205,8 @@ def test_heat_diffusion(init_cuda): host_ptr = handle_return(driver.cuMemAllocHost(_HEAT_N * SIZEOF_FLOAT)) try: - _run_heat_graph(dev, k_heat, k_countdown, host_ptr) + with xfail_on_graph_mempool_oom(dev): + _run_heat_graph(dev, k_heat, k_countdown, host_ptr) finally: handle_return(driver.cuMemFreeHost(host_ptr)) @@ -314,7 +316,8 @@ def test_bisection_root(init_cuda): host_ptr = handle_return(driver.cuMemAllocHost(SIZEOF_FLOAT)) try: - _run_bisection_graph(dev, k_eval, k_hi, k_lo, k_cd, k_check, k_newton, host_ptr) + with xfail_on_graph_mempool_oom(dev): + _run_bisection_graph(dev, k_eval, k_hi, k_lo, k_cd, k_check, k_newton, host_ptr) finally: handle_return(driver.cuMemFreeHost(host_ptr)) @@ -416,7 +419,8 @@ def test_switch_dispatch(init_cuda, mode, expected): host_ptr = handle_return(driver.cuMemAllocHost(SIZEOF_INT)) try: - _run_switch_graph(dev, mode, k_negate, k_double, k_square, host_ptr) + with xfail_on_graph_mempool_oom(dev): + _run_switch_graph(dev, mode, k_negate, k_double, k_square, host_ptr) result = ctypes.c_int.from_address(host_ptr).value assert result == expected diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py index c53009a5724..40bc6f3c442 100644 --- a/cuda_core/tests/graph/test_graph_definition_lifetime.py +++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py @@ -12,6 +12,8 @@ from helpers.graph_kernels import compile_common_kernels 
from helpers.misc import try_create_condition +from conftest import xfail_on_graph_mempool_oom + def _wait_until(predicate, timeout=2.0, interval=0.01): """Poll predicate() until True or timeout, driving gc each iteration. @@ -193,7 +195,8 @@ def test_event_record_node_keeps_event_alive(init_cuda): _skip_if_no_mempool() dev = Device() g = GraphDefinition() - alloc = g.allocate(1024) + with xfail_on_graph_mempool_oom(dev): + alloc = g.allocate(1024) event = dev.create_event(EventOptions(timing_enabled=False)) node = alloc.record(event) @@ -210,7 +213,8 @@ def test_event_wait_node_keeps_event_alive(init_cuda): _skip_if_no_mempool() dev = Device() g = GraphDefinition() - alloc = g.allocate(1024) + with xfail_on_graph_mempool_oom(dev): + alloc = g.allocate(1024) event = dev.create_event(EventOptions(timing_enabled=False)) node = alloc.wait(event) diff --git a/cuda_core/tests/graph/test_graph_memory_resource.py b/cuda_core/tests/graph/test_graph_memory_resource.py index cdf694e3230..a231d5d694c 100644 --- a/cuda_core/tests/graph/test_graph_memory_resource.py +++ b/cuda_core/tests/graph/test_graph_memory_resource.py @@ -8,6 +8,7 @@ from helpers import IS_WINDOWS, IS_WSL from helpers.buffers import compare_buffer_to_constant, make_scratch_buffer, set_buffer +from conftest import xfail_on_graph_mempool_oom from cuda.core import ( Device, DeviceMemoryResource, @@ -64,8 +65,9 @@ def reset(self): def alloc(self, num, nbytes): """Allocate num buffers of size nbytes from graph memory.""" gb = self.device.create_graph_builder().begin_building(self.mode) - buffers = [self.gmr.allocate(nbytes, stream=gb) for _ in range(num)] - graph = gb.end_building().complete() + with xfail_on_graph_mempool_oom(self.device): + buffers = [self.gmr.allocate(nbytes, stream=gb) for _ in range(num)] + graph = gb.end_building().complete() graph.upload(self.stream) graph.launch(self.stream) self.stream.sync() @@ -129,8 +131,9 @@ def apply_kernels(mr, stream, out): else: # Capture work, then upload and 
launch. gb = device.create_graph_builder().begin_building(mode) - apply_kernels(mr=gmr, stream=gb, out=out) - graph = gb.end_building().complete() + with xfail_on_graph_mempool_oom(device): + apply_kernels(mr=gmr, stream=gb, out=out) + graph = gb.end_building().complete() # First launch. graph.upload(stream) @@ -166,16 +169,17 @@ def test_graph_alloc_with_output(mempool_device, mode): # buffer allocated within the graph. The auto_free_on_launch option # is required to properly use the output buffer. gb = device.create_graph_builder().begin_building(mode) - out = gmr.allocate(NBYTES, stream=gb) - out.copy_from(in_, stream=gb) - launch(gb, LaunchConfig(grid=1, block=1), add_one, out, NBYTES) - options = GraphCompleteOptions(auto_free_on_launch=True) - try: - graph = gb.end_building().complete(options) - except CUDAError as exc: - if "CUDA_ERROR_INVALID_VALUE" in str(exc): - pytest.skip("auto_free_on_launch not supported on this platform") - raise + with xfail_on_graph_mempool_oom(device): + out = gmr.allocate(NBYTES, stream=gb) + out.copy_from(in_, stream=gb) + launch(gb, LaunchConfig(grid=1, block=1), add_one, out, NBYTES) + options = GraphCompleteOptions(auto_free_on_launch=True) + try: + graph = gb.end_building().complete(options) + except CUDAError as exc: + if "CUDA_ERROR_INVALID_VALUE" in str(exc): + pytest.skip("auto_free_on_launch not supported on this platform") + raise # Launch the graph. The output buffer is allocated and set to one. 
graph.upload(stream) @@ -197,8 +201,9 @@ def test_graph_mem_alloc_zero(mempool_device, mode): gb = device.create_graph_builder().begin_building(mode) stream = device.create_stream() gmr = GraphMemoryResource(device) - buffer = gmr.allocate(0, stream=gb) - graph = gb.end_building().complete() + with xfail_on_graph_mempool_oom(device): + buffer = gmr.allocate(0, stream=gb) + graph = gb.end_building().complete() graph.upload(stream) graph.launch(stream) stream.sync() @@ -280,8 +285,9 @@ def test_gmr_check_capture_state(mempool_device, mode): # Capturing gb = device.create_graph_builder().begin_building(mode=mode) - gmr.allocate(1, stream=gb) # no error - gb.end_building().complete() + with xfail_on_graph_mempool_oom(device): + gmr.allocate(1, stream=gb) # no error + gb.end_building().complete() @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) diff --git a/cuda_core/tests/test_managed_memory_warning.py b/cuda_core/tests/test_managed_memory_warning.py index 5e6032ebe9e..01dd840e2ef 100644 --- a/cuda_core/tests/test_managed_memory_warning.py +++ b/cuda_core/tests/test_managed_memory_warning.py @@ -13,7 +13,7 @@ import pytest import cuda.bindings -from conftest import xfail_if_mempool_oom +from conftest import create_managed_memory_resource_or_skip, xfail_if_mempool_oom from cuda.core import Device, ManagedMemoryResource, ManagedMemoryResourceOptions from cuda.core._memory._managed_memory_resource import reset_concurrent_access_warning from cuda.core._utils.cuda_utils import CUDAError @@ -28,7 +28,10 @@ def _make_managed_mr(device_id): """Create a ManagedMemoryResource with an explicit device preference.""" - return ManagedMemoryResource(options=ManagedMemoryResourceOptions(preferred_location=device_id)) + return create_managed_memory_resource_or_skip( + options=ManagedMemoryResourceOptions(preferred_location=device_id), + xfail_device=device_id, + ) @pytest.fixture diff --git a/cuda_core/tests/test_object_protocols.py 
b/cuda_core/tests/test_object_protocols.py index 72f7891a711..d1085a952bb 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -16,6 +16,7 @@ from helpers.graph_kernels import compile_common_kernels from helpers.misc import try_create_condition +from conftest import xfail_on_graph_mempool_oom from cuda.core import ( Buffer, Device, @@ -278,32 +279,36 @@ def sample_root_node_alt(sample_graphdef_alt): def sample_empty_node(sample_graphdef): """An EmptyNode created by merging two branches.""" _skip_if_no_mempool() - a = sample_graphdef.allocate(ALLOC_SIZE) - b = sample_graphdef.allocate(ALLOC_SIZE) - return sample_graphdef.join(a, b) + with xfail_on_graph_mempool_oom(): + a = sample_graphdef.allocate(ALLOC_SIZE) + b = sample_graphdef.allocate(ALLOC_SIZE) + return sample_graphdef.join(a, b) @pytest.fixture def sample_empty_node_alt(sample_graphdef): """An alternate EmptyNode from same graph.""" _skip_if_no_mempool() - c = sample_graphdef.allocate(ALLOC_SIZE) - d = sample_graphdef.allocate(ALLOC_SIZE) - return sample_graphdef.join(c, d) + with xfail_on_graph_mempool_oom(): + c = sample_graphdef.allocate(ALLOC_SIZE) + d = sample_graphdef.allocate(ALLOC_SIZE) + return sample_graphdef.join(c, d) @pytest.fixture def sample_alloc_node(sample_graphdef): """An AllocNode.""" _skip_if_no_mempool() - return sample_graphdef.allocate(ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + return sample_graphdef.allocate(ALLOC_SIZE) @pytest.fixture def sample_alloc_node_alt(sample_graphdef): """An alternate AllocNode from same graph.""" _skip_if_no_mempool() - return sample_graphdef.allocate(ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + return sample_graphdef.allocate(ALLOC_SIZE) @pytest.fixture @@ -328,52 +333,58 @@ def sample_kernel_node_alt(sample_graphdef, init_cuda): def sample_free_node(sample_graphdef): """A FreeNode.""" _skip_if_no_mempool() - alloc = sample_graphdef.allocate(ALLOC_SIZE) - return alloc.deallocate(alloc.dptr) 
+ with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ALLOC_SIZE) + return alloc.deallocate(alloc.dptr) @pytest.fixture def sample_free_node_alt(sample_graphdef): """An alternate FreeNode from same graph.""" _skip_if_no_mempool() - alloc = sample_graphdef.allocate(ALLOC_SIZE) - return alloc.deallocate(alloc.dptr) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ALLOC_SIZE) + return alloc.deallocate(alloc.dptr) @pytest.fixture def sample_memset_node(sample_graphdef): """A MemsetNode.""" _skip_if_no_mempool() - alloc = sample_graphdef.allocate(ALLOC_SIZE) - return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ALLOC_SIZE) + return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) @pytest.fixture def sample_memset_node_alt(sample_graphdef): """An alternate MemsetNode from same graph.""" _skip_if_no_mempool() - alloc = sample_graphdef.allocate(ALLOC_SIZE) - return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + alloc = sample_graphdef.allocate(ALLOC_SIZE) + return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) @pytest.fixture def sample_memcpy_node(sample_graphdef): """A MemcpyNode.""" _skip_if_no_mempool() - src = sample_graphdef.allocate(ALLOC_SIZE) - dst = sample_graphdef.allocate(ALLOC_SIZE) - dep = sample_graphdef.join(src, dst) - return dep.memcpy(dst.dptr, src.dptr, ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + src = sample_graphdef.allocate(ALLOC_SIZE) + dst = sample_graphdef.allocate(ALLOC_SIZE) + dep = sample_graphdef.join(src, dst) + return dep.memcpy(dst.dptr, src.dptr, ALLOC_SIZE) @pytest.fixture def sample_memcpy_node_alt(sample_graphdef): """An alternate MemcpyNode from same graph.""" _skip_if_no_mempool() - src = sample_graphdef.allocate(ALLOC_SIZE) - dst = sample_graphdef.allocate(ALLOC_SIZE) - dep = sample_graphdef.join(src, dst) - return dep.memcpy(dst.dptr, src.dptr, ALLOC_SIZE) + with xfail_on_graph_mempool_oom(): + src = 
sample_graphdef.allocate(ALLOC_SIZE) + dst = sample_graphdef.allocate(ALLOC_SIZE) + dep = sample_graphdef.join(src, dst) + return dep.memcpy(dst.dptr, src.dptr, ALLOC_SIZE) @pytest.fixture