Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,12 @@
Stream,
StreamOptions,
)
from cuda.core._tensor_map import ( # noqa: E402
TensorMapDataType,
TensorMapDescriptor,
TensorMapIm2ColWideMode,
TensorMapInterleave,
TensorMapL2Promotion,
TensorMapOOBFill,
TensorMapSwizzle,
)
31 changes: 31 additions & 0 deletions cuda_core/cuda/core/_kernel_arg_handler.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.stdint cimport (intptr_t,
int8_t, int16_t, int32_t, int64_t,
uint8_t, uint16_t, uint32_t, uint64_t,)
from libc.string cimport memcpy
from libcpp cimport bool as cpp_bool
from libcpp.complex cimport complex as cpp_complex
from libcpp cimport nullptr
Expand All @@ -16,6 +17,8 @@ import ctypes
import numpy

from cuda.core._memory import Buffer
from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py
from cuda.core._tensor_map cimport TensorMapDescriptor
from cuda.core._utils.cuda_utils import driver
from cuda.bindings cimport cydriver

Expand Down Expand Up @@ -97,6 +100,9 @@ cdef object numpy_complex64 = numpy.complex64
cdef object numpy_complex128 = numpy.complex128


cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py


# limitation due to cython/cython#534
ctypedef void* voidptr

Expand Down Expand Up @@ -124,6 +130,25 @@ cdef inline int prepare_arg(
return 0


cdef inline int prepare_tensor_map_arg(
        vector.vector[void*]& data,
        vector.vector[void*]& data_addresses,
        TensorMapDescriptor arg,
        const size_t idx) except -1:
    # Stage a TensorMapDescriptor kernel argument at slot `idx`.
    #
    # data:           ParamHolder-owned vector tracking heap allocations to free.
    # data_addresses: vector of per-argument pointers handed to the launch API.
    # arg:            the descriptor whose CUtensorMap bytes are copied out.
    # idx:            argument slot to fill in both vectors.
    #
    # Returns 0 on success; raises MemoryError (propagated via `except -1`)
    # if the allocation fails.
    #
    # Allocate a temporary buffer for the 128-byte CUtensorMap struct.
    # We copy rather than pointing directly at arg._tensor_map for lifetime
    # safety: ParamHolder owns and frees its argument buffers independently.
    cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap))
    if ptr is NULL:
        raise MemoryError("Failed to allocate memory for CUtensorMap")
    # Byte-wise copy of the descriptor's struct into the freshly owned buffer.
    memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap))
    # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__,
    # data_addresses[idx] is the pointer passed to cuLaunchKernel.
    data_addresses[idx] = ptr
    data[idx] = ptr
    return 0


cdef inline int prepare_ctypes_arg(
vector.vector[void*]& data,
vector.vector[void*]& data_addresses,
Expand Down Expand Up @@ -273,6 +298,9 @@ cdef class ParamHolder:
# it's a CUdeviceptr:
self.data_addresses[i] = <void*><intptr_t>(arg.handle.getPtr())
continue
elif arg_type is tensor_map_descriptor_type:
prepare_tensor_map_arg(self.data, self.data_addresses, <TensorMapDescriptor>arg, i)
continue
elif arg_type is bool:
prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i)
continue
Expand Down Expand Up @@ -322,6 +350,9 @@ cdef class ParamHolder:
elif isinstance(arg, driver.CUgraphConditionalHandle):
prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i)
continue
elif isinstance(arg, tensor_map_descriptor_type):
prepare_tensor_map_arg(self.data, self.data_addresses, <TensorMapDescriptor>arg, i)
continue
# TODO: support ctypes/numpy struct
raise TypeError("the argument is of unsupported type: " + str(type(arg)))

Expand Down
13 changes: 13 additions & 0 deletions cuda_core/cuda/core/_tensor_map.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.bindings cimport cydriver


cdef class TensorMapDescriptor:
    # C-level declarations for the TensorMapDescriptor extension type.
    # Wraps a driver-level CUtensorMap struct stored by value on the instance.
    cdef cydriver.CUtensorMap _tensor_map
    # NOTE(review): presumably a reference that keeps the source tensor/buffer
    # alive while _tensor_map points at its memory — confirm in _tensor_map.pyx.
    cdef object _source_ref
    # NOTE(review): presumably cached data used to build the object's repr —
    # confirm in _tensor_map.pyx.
    cdef object _repr_info

    # Return a raw pointer to the embedded CUtensorMap so callers can
    # memcpy the struct out (used when marshalling kernel arguments).
    cdef void* _get_data_ptr(self)
Loading
Loading