Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cuda_bindings/tests/test_interoperability.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import cuda.bindings.driver as cuda
import cuda.bindings.runtime as cudart
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom


def supportsMemoryPool():
Expand Down Expand Up @@ -87,12 +88,14 @@ def test_interop_graphNode():
def test_interop_memPool():
# DRV to RT
err_dr, pool = cuda.cuDeviceGetDefaultMemPool(0)
xfail_if_mempool_oom(err_dr, "cuDeviceGetDefaultMemPool", 0)
assert err_dr == cuda.CUresult.CUDA_SUCCESS
(err_rt,) = cudart.cudaDeviceSetMemPool(0, pool)
assert err_rt == cudart.cudaError_t.cudaSuccess

# RT to DRV
err_rt, pool = cudart.cudaDeviceGetDefaultMemPool(0)
xfail_if_mempool_oom(err_rt, "cudaDeviceGetDefaultMemPool", 0)
assert err_rt == cudart.cudaError_t.cudaSuccess
(err_dr,) = cuda.cuDeviceSetMemPool(0, pool)
assert err_dr == cuda.CUresult.CUDA_SUCCESS
Expand Down
9 changes: 9 additions & 0 deletions cuda_core/cuda/core/_memory/_device_memory_resource.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ from cuda.core._memory._ipc cimport IPCAllocationHandle
from cuda.core._resource_handles cimport (
as_cu,
get_device_mempool,
get_last_error,
)
from cuda.core._utils.cuda_utils cimport (
check_or_create_options,
Expand Down Expand Up @@ -262,6 +263,14 @@ cdef inline _DMR_init(DeviceMemoryResource self, device_id, options):

if opts is None:
self._h_pool = get_device_mempool(dev_id)
if not self._h_pool:
HANDLE_RETURN(get_last_error())
raise RuntimeError(
f"Failed to initialize DeviceMemoryResource for device {dev_id}: "
"cuda-core returned an empty memory pool handle without recording a CUDA error. "
"This is an internal cuda-core error; please report it with your CUDA driver, "
"CUDA Toolkit, and cuda-python versions."
)
self._mempool_owned = False
MP_raise_release_threshold(self)
else:
Expand Down
9 changes: 8 additions & 1 deletion cuda_core/cuda/core/_memory/_graph_memory_resource.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, Mem
from cuda.core._resource_handles cimport (
DevicePtrHandle,
deviceptr_alloc_async,
get_last_error,
as_cu,
)

Expand Down Expand Up @@ -194,7 +195,13 @@ cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream
check_capturing(s)
h_ptr = deviceptr_alloc_async(size, stream._h_stream)
if not h_ptr:
raise RuntimeError("Failed to allocate memory asynchronously")
HANDLE_RETURN(get_last_error())
raise RuntimeError(
f"Failed to allocate {size} bytes from GraphMemoryResource: "
"cuda-core returned an empty allocation handle without recording a CUDA error. "
"This is an internal cuda-core error; please report it with your CUDA driver, "
"CUDA Toolkit, and cuda-python versions."
)
return Buffer_from_deviceptr_handle(h_ptr, size, self, None)


Expand Down
8 changes: 7 additions & 1 deletion cuda_core/cuda/core/_memory/_ipc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,13 @@ cdef _MemPool MP_from_allocation_handle(cls, alloc_handle):
cdef int ipc_fd = int(alloc_handle)
self._h_pool = create_mempool_handle_ipc(ipc_fd, IPC_HANDLE_TYPE)
if not self._h_pool:
raise RuntimeError("Failed to import memory pool from IPC handle")
HANDLE_RETURN(get_last_error())
raise RuntimeError(
f"Failed to import {cls.__name__} from an allocation handle: "
"cuda-core returned an empty memory pool handle without recording a CUDA error. "
"This is an internal cuda-core error; please report it with your CUDA driver, "
"CUDA Toolkit, and cuda-python versions."
)
self._ipc_data = IPCDataForMR(alloc_handle, True)

# Register it.
Expand Down
17 changes: 16 additions & 1 deletion cuda_core/cuda/core/_memory/_memory_pool.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from cuda.core._resource_handles cimport (
DevicePtrHandle,
create_mempool_handle,
deviceptr_alloc_from_pool,
get_last_error,
as_cu,
as_py,
)
Expand Down Expand Up @@ -228,6 +229,14 @@ cdef int MP_init_create_pool(

self._mempool_owned = True
self._h_pool = create_mempool_handle(properties)
if not self._h_pool:
HANDLE_RETURN(get_last_error())
raise RuntimeError(
f"Failed to initialize {self.__class__.__name__}: "
"cuda-core returned an empty memory pool handle without recording a CUDA error. "
"This is an internal cuda-core error; please report it with your CUDA driver, "
"CUDA Toolkit, and cuda-python versions."
)

if ipc_enabled:
alloc_handle = _ipc.MP_export_mempool(self)
Expand Down Expand Up @@ -307,7 +316,13 @@ cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream):
check_not_capturing(s)
h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream)
if not h_ptr:
raise RuntimeError("Failed to allocate memory from pool")
HANDLE_RETURN(get_last_error())
raise RuntimeError(
f"Failed to allocate {size} bytes from {self.__class__.__name__}: "
"cuda-core returned an empty allocation handle without recording a CUDA error. "
"This is an internal cuda-core error; please report it with your CUDA driver, "
"CUDA Toolkit, and cuda-python versions."
)
return Buffer_from_deviceptr_handle(h_ptr, size, self, None)


Expand Down
12 changes: 12 additions & 0 deletions cuda_core/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import pathlib
import sys
from contextlib import contextmanager
from importlib.metadata import PackageNotFoundError, distribution

import pytest
Expand Down Expand Up @@ -87,6 +88,8 @@ def create_managed_memory_resource_or_skip(*args, xfail_device=None, **kwargs):
return ManagedMemoryResource(*args, **kwargs)
except CUDAError as e:
xfail_if_mempool_oom(e, _device_id_from_resource_options(xfail_device, args, kwargs))
if "CUDA_ERROR_NOT_SUPPORTED" in str(e):
pytest.skip("ManagedMemoryResource is not supported on this platform/device")
raise
except RuntimeError as e:
if "requires CUDA 13.0" in str(e):
Expand All @@ -102,6 +105,15 @@ def create_pinned_memory_resource_or_xfail(*args, xfail_device=None, **kwargs):
raise


@contextmanager
def xfail_on_graph_mempool_oom(device=0):
    """Run the enclosed block, converting mempool-OOM CUDA errors into xfails.

    The wrapped code executes unchanged. If it raises a ``CUDAError``, the
    error is first handed to ``xfail_if_mempool_oom`` (presumably this xfails
    the test when the error is a memory-pool out-of-memory condition on
    *device* -- confirm against that helper's definition) and then re-raised,
    so any CUDA error that is not treated as an expected OOM still fails the
    test normally.

    Parameters
    ----------
    device : int, default 0
        Device ordinal forwarded to ``xfail_if_mempool_oom``.
    """
    try:
        yield
    except CUDAError as e:
        # NOTE(review): "cuGraphAddMemAllocNode" appears to label the driver
        # API expected to OOM during graph construction; assumed to be used by
        # xfail_if_mempool_oom for its skip/xfail message -- TODO confirm.
        xfail_if_mempool_oom(e, "cuGraphAddMemAllocNode", device)
        raise


def _device_id_from_resource_options(device, args, kwargs):
if device is not None:
return device
Expand Down
113 changes: 66 additions & 47 deletions cuda_core/tests/graph/test_graph_definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from helpers.graph_kernels import compile_common_kernels
from helpers.misc import try_create_condition

from conftest import xfail_on_graph_mempool_oom
from cuda.core import Device, LaunchConfig
from cuda.core.graph import (
AllocNode,
Expand Down Expand Up @@ -201,13 +202,15 @@ def _build_disconnected():
def graph_spec(request, init_cuda):
if request.param is not _build_empty:
_skip_if_no_mempool()
return request.param()
with xfail_on_graph_mempool_oom():
return request.param()


@pytest.fixture(params=_NONEMPTY_BUILDERS)
def nonempty_graph_spec(request, init_cuda):
_skip_if_no_mempool()
return request.param()
with xfail_on_graph_mempool_oom():
return request.param()


# =============================================================================
Expand Down Expand Up @@ -562,7 +565,8 @@ def node_spec(request, init_cuda):
if spec.needs_mempool:
_skip_if_no_mempool()
g = GraphDefinition()
node, expected_attrs = spec.builder(g)
with xfail_on_graph_mempool_oom():
node, expected_attrs = spec.builder(g)
return spec, g, node, expected_attrs


Expand Down Expand Up @@ -803,18 +807,20 @@ def test_alloc_zero_size_fails(sample_graphdef):
def test_free_creates_dependency(sample_graphdef):
"""Free node depends on its predecessor."""
_skip_if_no_mempool()
alloc = sample_graphdef.allocate(ALLOC_SIZE)
free = alloc.deallocate(alloc.dptr)
with xfail_on_graph_mempool_oom():
alloc = sample_graphdef.allocate(ALLOC_SIZE)
free = alloc.deallocate(alloc.dptr)
assert alloc in free.pred


def test_alloc_free_chain(sample_graphdef):
"""Alloc and free can be chained."""
_skip_if_no_mempool()
a1 = sample_graphdef.allocate(ALLOC_SIZE)
a2 = a1.allocate(ALLOC_SIZE)
f2 = a2.deallocate(a2.dptr)
f1 = f2.deallocate(a1.dptr)
with xfail_on_graph_mempool_oom():
a1 = sample_graphdef.allocate(ALLOC_SIZE)
a2 = a1.allocate(ALLOC_SIZE)
f2 = a2.deallocate(a2.dptr)
f1 = f2.deallocate(a1.dptr)
assert a1 in a2.pred
assert a2 in f2.pred
assert f2 in f1.pred
Expand Down Expand Up @@ -842,15 +848,17 @@ def test_alloc_device_option(sample_graphdef, device_spec):
"""Device can be specified as int or Device object."""
_skip_if_no_mempool()
device = Device()
node = sample_graphdef.allocate(ALLOC_SIZE, device=device_spec(device))
with xfail_on_graph_mempool_oom(device):
node = sample_graphdef.allocate(ALLOC_SIZE, device=device_spec(device))
assert node.dptr != 0


def test_alloc_peer_access(mempool_device_x2):
"""AllocNode.peer_access reflects requested peers."""
d0, d1 = mempool_device_x2
g = GraphDefinition()
node = g.allocate(ALLOC_SIZE, device=d0.device_id, peer_access=[d1.device_id])
with xfail_on_graph_mempool_oom(d0):
node = g.allocate(ALLOC_SIZE, device=d0.device_id, peer_access=[d1.device_id])
assert d1.device_id in node.peer_access


Expand All @@ -863,8 +871,9 @@ def test_alloc_peer_access(mempool_device_x2):
def test_join_merges_branches(sample_graphdef, num_branches):
"""join() with multiple branches creates correct dependencies."""
_skip_if_no_mempool()
branches = [sample_graphdef.allocate(ALLOC_SIZE) for _ in range(num_branches)]
joined = sample_graphdef.join(*branches)
with xfail_on_graph_mempool_oom():
branches = [sample_graphdef.allocate(ALLOC_SIZE) for _ in range(num_branches)]
joined = sample_graphdef.join(*branches)
assert isinstance(joined, EmptyNode)
assert set(joined.pred) == set(branches)

Expand Down Expand Up @@ -956,8 +965,9 @@ def test_instantiate_empty_graph(sample_graphdef, inst_kwargs):
def test_instantiate_with_nodes(sample_graphdef, inst_kwargs):
"""Graph with nodes can be instantiated."""
_skip_if_no_mempool()
sample_graphdef.allocate(ALLOC_SIZE)
sample_graphdef.allocate(ALLOC_SIZE)
with xfail_on_graph_mempool_oom():
sample_graphdef.allocate(ALLOC_SIZE)
sample_graphdef.allocate(ALLOC_SIZE)
graph = _instantiate(sample_graphdef, inst_kwargs)
assert graph is not None

Expand Down Expand Up @@ -997,8 +1007,9 @@ def test_instantiate_and_execute_kernel(sample_graphdef, inst_kwargs):
def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs):
"""Graph with alloc/free can be executed."""
_skip_if_no_mempool()
alloc = sample_graphdef.allocate(ALLOC_SIZE)
alloc.deallocate(alloc.dptr)
with xfail_on_graph_mempool_oom():
alloc = sample_graphdef.allocate(ALLOC_SIZE)
alloc.deallocate(alloc.dptr)

stream = Device().create_stream()
graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream)
Expand All @@ -1010,9 +1021,10 @@ def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs):
def test_instantiate_and_execute_memset(sample_graphdef, inst_kwargs):
"""Graph with alloc/memset/free can be executed."""
_skip_if_no_mempool()
alloc = sample_graphdef.allocate(ALLOC_SIZE)
ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE)
ms.deallocate(alloc.dptr)
with xfail_on_graph_mempool_oom():
alloc = sample_graphdef.allocate(ALLOC_SIZE)
ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE)
ms.deallocate(alloc.dptr)

stream = Device().create_stream()
graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream)
Expand All @@ -1026,12 +1038,13 @@ def test_instantiate_and_execute_memcpy(sample_graphdef, inst_kwargs):
_skip_if_no_mempool()
import ctypes

src_alloc = sample_graphdef.allocate(ALLOC_SIZE)
dst_alloc = sample_graphdef.allocate(ALLOC_SIZE)
dep = sample_graphdef.join(src_alloc, dst_alloc)
ms = dep.memset(src_alloc.dptr, 0xAB, ALLOC_SIZE)
cp = ms.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE)
cp.deallocate(src_alloc.dptr)
with xfail_on_graph_mempool_oom():
src_alloc = sample_graphdef.allocate(ALLOC_SIZE)
dst_alloc = sample_graphdef.allocate(ALLOC_SIZE)
dep = sample_graphdef.join(src_alloc, dst_alloc)
ms = dep.memset(src_alloc.dptr, 0xAB, ALLOC_SIZE)
cp = ms.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE)
cp.deallocate(src_alloc.dptr)

stream = Device().create_stream()
graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream)
Expand Down Expand Up @@ -1166,11 +1179,12 @@ def test_instantiate_and_execute_if_then(sample_graphdef):
set_handle = mod.get_kernel("set_handle")
add_one = mod.get_kernel("add_one")

alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 1)
if_node = setter.if_then(condition)
if_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
with xfail_on_graph_mempool_oom():
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 1)
if_node = setter.if_then(condition)
if_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)

graph = sample_graphdef.instantiate()
stream = Device().create_stream()
Expand Down Expand Up @@ -1198,13 +1212,14 @@ def test_instantiate_and_execute_if_else(sample_graphdef):
set_handle = mod.get_kernel("set_handle")
add_one = mod.get_kernel("add_one")

alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 0)
ie_node = setter.if_else(condition)
ie_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
n1 = ie_node.else_.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
n1.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
with xfail_on_graph_mempool_oom():
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 0)
ie_node = setter.if_else(condition)
ie_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
n1 = ie_node.else_.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
n1.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)

graph = sample_graphdef.instantiate()
stream = Device().create_stream()
Expand Down Expand Up @@ -1232,12 +1247,13 @@ def test_instantiate_and_execute_switch(sample_graphdef):
set_handle = mod.get_kernel("set_handle")
add_one = mod.get_kernel("add_one")

alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 2)
sw_node = setter.switch(condition, 4)
for branch in sw_node.branches:
branch.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
with xfail_on_graph_mempool_oom():
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 2)
sw_node = setter.switch(condition, 4)
for branch in sw_node.branches:
branch.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)

graph = sample_graphdef.instantiate()
stream = Device().create_stream()
Expand Down Expand Up @@ -1272,7 +1288,8 @@ def test_conditional_node_type_preserved_by_nodes(sample_graphdef):
def test_debug_dot_print_creates_file(sample_graphdef, dot_file):
"""debug_dot_print writes a DOT file."""
_skip_if_no_mempool()
sample_graphdef.allocate(ALLOC_SIZE)
with xfail_on_graph_mempool_oom():
sample_graphdef.allocate(ALLOC_SIZE)
sample_graphdef.debug_dot_print(str(dot_file))
assert dot_file.exists()
content = dot_file.read_text()
Expand All @@ -1282,7 +1299,8 @@ def test_debug_dot_print_creates_file(sample_graphdef, dot_file):
def test_debug_dot_print_with_options(sample_graphdef, dot_file):
"""debug_dot_print accepts GraphDebugPrintOptions."""
_skip_if_no_mempool()
sample_graphdef.allocate(ALLOC_SIZE)
with xfail_on_graph_mempool_oom():
sample_graphdef.allocate(ALLOC_SIZE)
options = GraphDebugPrintOptions(verbose=True, handles=True)
sample_graphdef.debug_dot_print(str(dot_file), options)
assert dot_file.exists()
Expand All @@ -1291,6 +1309,7 @@ def test_debug_dot_print_with_options(sample_graphdef, dot_file):
def test_debug_dot_print_invalid_options(sample_graphdef, dot_file):
"""debug_dot_print rejects invalid options type."""
_skip_if_no_mempool()
sample_graphdef.allocate(ALLOC_SIZE)
with xfail_on_graph_mempool_oom():
sample_graphdef.allocate(ALLOC_SIZE)
with pytest.raises(TypeError, match="options must be a GraphDebugPrintOptions"):
sample_graphdef.debug_dot_print(str(dot_file), "invalid")
Loading
Loading