From 1f5c57331ce17968e581adfb003aa266a914f763 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 23 Feb 2026 14:02:32 -0800 Subject: [PATCH 1/9] initial commit --- cuda_core/cuda/core/__init__.py | 8 + cuda_core/cuda/core/_kernel_arg_handler.pyx | 24 + cuda_core/cuda/core/_tensor_map.pxd | 12 + cuda_core/cuda/core/_tensor_map.pyx | 524 ++++++++++++++++++++ cuda_core/examples/tma_tensor_map.py | 195 ++++++++ cuda_core/tests/test_tensor_map.py | 279 +++++++++++ 6 files changed, 1042 insertions(+) create mode 100644 cuda_core/cuda/core/_tensor_map.pxd create mode 100644 cuda_core/cuda/core/_tensor_map.pyx create mode 100644 cuda_core/examples/tma_tensor_map.py create mode 100644 cuda_core/tests/test_tensor_map.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index f22bbc7f16..15eead893a 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -61,6 +61,14 @@ ) from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 +from cuda.core._tensor_map import ( # noqa: E402 + TensorMapDescriptor, + TensorMapDataType, + TensorMapInterleave, + TensorMapL2Promotion, + TensorMapOOBFill, + TensorMapSwizzle, +) from cuda.core._stream import ( # noqa: E402 LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 882ca5eaab..e88def13cd 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -6,6 +6,7 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.stdint cimport (intptr_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t,) +from libc.string cimport memcpy from libcpp cimport bool as cpp_bool from libcpp.complex cimport complex as cpp_complex from libcpp cimport nullptr @@ -16,6 +17,8 @@ import ctypes import numpy from cuda.core._memory import Buffer +from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py +from cuda.core._tensor_map cimport TensorMapDescriptor from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver @@ -97,6 +100,9 @@ cdef object numpy_complex64 = numpy.complex64 cdef object numpy_complex128 = numpy.complex128 +cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py + + # limitation due to cython/cython#534 ctypedef void* voidptr @@ -124,6 +130,18 @@ cdef inline int prepare_arg( return 0 +cdef inline int prepare_tensor_map_arg( + vector.vector[void*]& data, + vector.vector[void*]& data_addresses, + TensorMapDescriptor arg, + const size_t idx) except -1: + cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + data_addresses[idx] = ptr + data[idx] = ptr + return 0 + + cdef inline int prepare_ctypes_arg( vector.vector[void*]& data, vector.vector[void*]& data_addresses, @@ -273,6 +291,9 @@ cdef class ParamHolder: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) continue + elif arg_type is tensor_map_descriptor_type: + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue elif arg_type is bool: prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i) continue @@ -322,6 +343,9 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue + elif isinstance(arg, tensor_map_descriptor_type): + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd new file mode 100644 index 0000000000..3cfd571d89 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class TensorMapDescriptor: + cdef cydriver.CUtensorMap _tensor_map + cdef object _source_ref + + cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx new file mode 100644 index 0000000000..9375acf944 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t, uint32_t, uint64_t +from cuda.bindings cimport cydriver +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +import enum + +import numpy + +from cuda.core._memoryview import StridedMemoryView + + +try: + from ml_dtypes import bfloat16 as ml_bfloat16 +except ImportError: + ml_bfloat16 = None + + +class TensorMapDataType(enum.IntEnum): + """Data types for tensor map descriptors. + + These correspond to the ``CUtensorMapDataType`` driver enum values. + """ + UINT8 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8 + UINT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16 + UINT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32 + INT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32 + UINT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64 + INT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64 + FLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16 + FLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32 + FLOAT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64 + BFLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 + FLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ + TFLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32 + TFLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ + + +class TensorMapInterleave(enum.IntEnum): + """Interleave layout for tensor map descriptors. + + These correspond to the ``CUtensorMapInterleave`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_INTERLEAVE_NONE + INTERLEAVE_16B = cydriver.CU_TENSOR_MAP_INTERLEAVE_16B + INTERLEAVE_32B = cydriver.CU_TENSOR_MAP_INTERLEAVE_32B + + +class TensorMapSwizzle(enum.IntEnum): + """Swizzle mode for tensor map descriptors. + + These correspond to the ``CUtensorMapSwizzle`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_SWIZZLE_NONE + SWIZZLE_32B = cydriver.CU_TENSOR_MAP_SWIZZLE_32B + SWIZZLE_64B = cydriver.CU_TENSOR_MAP_SWIZZLE_64B + SWIZZLE_128B = cydriver.CU_TENSOR_MAP_SWIZZLE_128B + + +class TensorMapL2Promotion(enum.IntEnum): + """L2 promotion mode for tensor map descriptors. + + These correspond to the ``CUtensorMapL2promotion`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_L2_PROMOTION_NONE + L2_64B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_64B + L2_128B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_128B + L2_256B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_256B + + +class TensorMapOOBFill(enum.IntEnum): + """Out-of-bounds fill mode for tensor map descriptors. + + These correspond to the ``CUtensorMapFloatOOBfill`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE + NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + + +# Mapping from numpy dtype to TMA data type +_NUMPY_DTYPE_TO_TMA = { + numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, + numpy.dtype(numpy.uint16): TensorMapDataType.UINT16, + numpy.dtype(numpy.uint32): TensorMapDataType.UINT32, + numpy.dtype(numpy.int32): TensorMapDataType.INT32, + numpy.dtype(numpy.uint64): TensorMapDataType.UINT64, + numpy.dtype(numpy.int64): TensorMapDataType.INT64, + numpy.dtype(numpy.float16): TensorMapDataType.FLOAT16, + numpy.dtype(numpy.float32): TensorMapDataType.FLOAT32, + numpy.dtype(numpy.float64): TensorMapDataType.FLOAT64, +} + +if ml_bfloat16 is not None: + _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = TensorMapDataType.BFLOAT16 + + +# Mapping from TMA data type to element size in bytes +_TMA_DATA_TYPE_SIZE = { + TensorMapDataType.UINT8: 1, + TensorMapDataType.UINT16: 2, + TensorMapDataType.UINT32: 4, + TensorMapDataType.INT32: 4, + TensorMapDataType.UINT64: 8, + TensorMapDataType.INT64: 8, + TensorMapDataType.FLOAT16: 2, + TensorMapDataType.FLOAT32: 4, + TensorMapDataType.FLOAT64: 8, + TensorMapDataType.BFLOAT16: 2, + TensorMapDataType.FLOAT32_FTZ: 4, + TensorMapDataType.TFLOAT32: 4, + TensorMapDataType.TFLOAT32_FTZ: 4, +} + + +def _resolve_data_type(view, data_type): + """Resolve the TMA data type from an explicit value or the view's dtype.""" + + if data_type is not None: + if not isinstance(data_type, TensorMapDataType): + raise TypeError( + f"data_type must be a TensorMapDataType, got {type(data_type)}") + return data_type + + dt = view.dtype + if dt is None: + raise ValueError( + "Cannot infer TMA data type from the tensor; " + "please specify data_type explicitly") + + tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) + if tma_dt is None: + raise ValueError( + f"Unsupported dtype {dt} for TMA; " + f"supported dtypes: {list(_NUMPY_DTYPE_TO_TMA.keys())}. " + "You may also specify data_type explicitly.") + + return tma_dt + + +cdef class TensorMapDescriptor: + """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. + + A ``TensorMapDescriptor`` wraps the opaque 128-byte ``CUtensorMap`` struct + used by the hardware TMA unit for efficient bulk data movement between + global and shared memory. + + Instances are created via the class methods :meth:`from_tiled` and + :meth:`from_im2col`, and can be passed directly to + :func:`~cuda.core.launch` as a kernel argument. + """ + + def __init__(self): + raise RuntimeError( + "TensorMapDescriptor cannot be instantiated directly. " + "Use TensorMapDescriptor.from_tiled() or " + "TensorMapDescriptor.from_im2col().") + + cdef void* _get_data_ptr(self): + return &self._tensor_map + + @staticmethod + def from_tiled(tensor, box_dim, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create a tiled TMA descriptor from a tensor object. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + box_dim : tuple of int + The size of each tile dimension (in elements). Must have the + same rank as the tensor and each value must be in [1, 256]. + Specified in the same (row-major) order as the tensor shape. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + Specified in the same (row-major) order as the tensor shape. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [1, 5], the pointer is not + 16-byte aligned, or dimension/stride constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + # Keep a strong reference to prevent GC + desc._source_ref = tensor + + # Resolve data type + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + # Get tensor metadata + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides # in elements, can be None for C-contiguous + + cdef int rank = len(shape) + if rank < 1 or rank > 5: + raise ValueError( + f"Tensor rank must be between 1 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") + + if len(box_dim) != rank: + raise ValueError( + f"box_dim must have {rank} elements (same as tensor rank), " + f"got {len(box_dim)}") + + for i, bd in enumerate(box_dim): + if bd < 1 or bd > 256: + raise ValueError( + f"box_dim[{i}] must be in [1, 256], got {bd}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + # Compute byte strides from element strides + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + # C-contiguous: strides in bytes, row-major + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse dimensions for column-major cuTensorMap convention + # Python/DLPack: row-major (dim 0 = outermost) + # cuTensorMap: column-major (dim 0 = innermost) + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides # rank - 1 elements + cdef uint32_t[5] c_box_dim + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + # Reverse: Python dim i -> cuTensorMap dim (rank - 1 - i) + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_box_dim[i_c] = box_dim[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + # globalStrides: rank-1 elements (byte strides for dims 1..N-1 in col-major order) + # The innermost stride (dim 0) is implicit = element size + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_interleave_int = int(interleave) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeTiled( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_box_dim, + c_element_strides, + c_interleave, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + return desc + + @staticmethod + def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner, + channels_per_pixel, pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col TMA descriptor from a tensor object. + + Im2col layout is used for convolution-style data access patterns. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner : tuple of int + Lower corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + pixel_box_upper_corner : tuple of int + Upper corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + cdef int n_spatial = rank - 2 + if len(pixel_box_lower_corner) != n_spatial: + raise ValueError( + f"pixel_box_lower_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_lower_corner)}") + if len(pixel_box_upper_corner) != n_spatial: + raise ValueError( + f"pixel_box_upper_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_upper_corner)}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int[3] c_pixel_box_lower # max 3 spatial dims (rank 5 - 2) + cdef int[3] c_pixel_box_upper + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + # Reverse spatial dimensions for lower/upper corners + for i_c in range(n_spatial): + c_pixel_box_lower[i_c] = pixel_box_lower_corner[n_spatial - 1 - i_c] + c_pixel_box_upper[i_c] = pixel_box_upper_corner[n_spatial - 1 - i_c] + + cdef uint32_t c_rank = rank + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2col( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_pixel_box_lower, + c_pixel_box_upper, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + return desc + + def replace_address(self, tensor): + """Replace the global memory address in this tensor map descriptor. + + This is useful when the tensor data has been reallocated but the + shape, strides, and other parameters remain the same. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + """ + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + cdef intptr_t global_address = view.ptr + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + self._source_ref = tensor + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( + &self._tensor_map, + global_address, + )) + + def __repr__(self): + return f"TensorMapDescriptor()" diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py new file mode 100644 index 0000000000..de51570ec4 --- /dev/null +++ b/cuda_core/examples/tma_tensor_map.py @@ -0,0 +1,195 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates how to use TMA (Tensor Memory Accelerator) descriptors +# with cuda.core on Hopper+ GPUs (compute capability >= 9.0). +# +# TMA enables efficient bulk data movement between global and shared memory using +# hardware-managed tensor map descriptors. This example shows: +# +# 1. Creating a TMA tiled descriptor from a CuPy device array +# 2. Passing the descriptor to a kernel via launch() +# 3. Using TMA to load tiles into shared memory (via inline PTX) +# 4. Updating the descriptor's source address with replace_address() +# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. +struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. + asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data on the device +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +# --------------------------------------------------------------------------- +# 2) Create a TMA tiled descriptor +# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# The dtype (float32) is inferred automatically from the CuPy array. +# --------------------------------------------------------------------------- +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +# --------------------------------------------------------------------------- +# 3) Launch the kernel +# The TensorMapDescriptor is passed directly as a kernel argument — the +# 128-byte struct is copied into kernel parameter space automatically. +# --------------------------------------------------------------------------- +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 4) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py new file mode 100644 index 0000000000..0aa5968a85 --- /dev/null +++ b/cuda_core/tests/test_tensor_map.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import numpy as np + +from cuda.core import ( + Device, + TensorMapDescriptor, + TensorMapDataType, + TensorMapInterleave, + TensorMapL2Promotion, + TensorMapOOBFill, + TensorMapSwizzle, +) + + +@pytest.fixture +def dev(init_cuda): + return Device() + + +@pytest.fixture +def skip_if_no_tma(dev): + if not dev.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + + +def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): + """Allocate a device buffer and return it with proper alignment.""" + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) + return buf + + +class TestTensorMapEnums: + """Test that enum wrappers expose the expected values.""" + + def test_data_type_values(self): + assert TensorMapDataType.UINT8 == 0 + assert TensorMapDataType.FLOAT32 == 7 + assert TensorMapDataType.FLOAT64 == 8 + assert TensorMapDataType.BFLOAT16 == 9 + + def test_interleave_values(self): + assert TensorMapInterleave.NONE == 0 + assert TensorMapInterleave.INTERLEAVE_16B == 1 + assert TensorMapInterleave.INTERLEAVE_32B == 2 + + def test_swizzle_values(self): + assert TensorMapSwizzle.NONE == 0 + assert TensorMapSwizzle.SWIZZLE_32B == 1 + assert TensorMapSwizzle.SWIZZLE_64B == 2 + assert TensorMapSwizzle.SWIZZLE_128B == 3 + + def test_l2_promotion_values(self): + assert TensorMapL2Promotion.NONE == 0 + assert TensorMapL2Promotion.L2_64B == 1 + assert TensorMapL2Promotion.L2_128B == 2 + assert TensorMapL2Promotion.L2_256B == 3 + + def test_oob_fill_values(self): + assert TensorMapOOBFill.NONE == 0 + assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + + +class TestTensorMapDescriptorCreation: + """Test TensorMapDescriptor factory methods.""" + + def test_cannot_instantiate_directly(self): + with pytest.raises(RuntimeError, match="cannot be instantiated directly"): + TensorMapDescriptor() + + def test_from_tiled_1d(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) # 1024 float32 elements + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + assert repr(desc) == "TensorMapDescriptor()" + + def test_from_tiled_2d(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_3d(self, dev, skip_if_no_tma): + buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(8, 8, 8), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + ) + assert desc is not None + + def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + l2_promotion=TensorMapL2Promotion.L2_128B, + ) + assert desc is not None + + def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, + ) + assert desc is not None + + +class TestTensorMapDescriptorValidation: + """Test validation in TensorMapDescriptor factory methods.""" + + def test_invalid_rank_zero(self, dev, skip_if_no_tma): + buf = dev.allocate(64) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="box_dim must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_out_of_range(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match=r"box_dim\[0\] must be in \[1, 256\]"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(512,), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="element_strides must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(1, 1), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_invalid_data_type(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(TypeError, match="data_type must be a TensorMapDataType"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=42, + ) + + +class TestTensorMapDtypeMapping: + """Test automatic dtype inference from numpy dtypes.""" + + @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + (np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ]) + def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt + + def test_bfloat16_mapping(self): + try: + from ml_dtypes import bfloat16 + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 + except ImportError: + pytest.skip("ml_dtypes not installed") + + +class TestTensorMapReplaceAddress: + """Test replace_address functionality.""" + + def test_replace_address(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + buf2 = dev.allocate(1024 * 4) + desc.replace_address(buf2) + # No exception means success + + def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + # Create a host-only array (not device-accessible) + host_arr = np.zeros(1024, dtype=np.float32) + with pytest.raises(ValueError, match="device-accessible"): + desc.replace_address(host_arr) + + +class TestTensorMapIm2col: + """Test im2col TMA descriptor creation.""" + + def test_from_im2col_3d(self, dev, skip_if_no_tma): + # 3D tensor: batch=1, height=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0,), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(), + pixel_box_upper_corner=(), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + + def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1 * 32 * 64 * 4) + with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): + TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 725121455365dd9366fda97e6ae92594fe898259 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 13:53:49 -0800 Subject: [PATCH 2/9] tma wide --- cuda_core/cuda/core/__init__.py | 1 + cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 208 +++++++++++++++++++++++++++- cuda_core/pixi.toml | 14 +- cuda_core/tests/test_tensor_map.py | 133 +++++++++++++++++- 5 files changed, 342 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 15eead893a..ce1e1a132a 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -64,6 +64,7 @@ from cuda.core._tensor_map import ( # noqa: E402 TensorMapDescriptor, TensorMapDataType, + TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, TensorMapOOBFill, diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 3cfd571d89..fa2254ba70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,5 +8,6 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9375acf944..6b072f7404 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -80,6 +80,16 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA +class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. + Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + + # Mapping from numpy dtype to TMA data type _NUMPY_DTYPE_TO_TMA = { numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, @@ -316,6 +326,13 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc @staticmethod @@ -483,6 +500,176 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "im2col", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc + + @staticmethod + def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + channels_per_pixel, pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + mode=TensorMapIm2ColWideMode.W, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col-wide TMA descriptor from a tensor object. + + Im2col-wide layout loads elements exclusively along the W (width) + dimension. This variant is supported on compute capability 10.0+ + (Blackwell and later). + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner_width : int + Lower corner of the pixel bounding box along the W dimension. + pixel_box_upper_corner_width : int + Upper corner of the pixel bounding box along the W dimension. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + mode : TensorMapIm2ColWideMode + Im2col wide mode. Default ``W``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``SWIZZLE_128B``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc def replace_address(self, tensor): @@ -512,13 +699,28 @@ cdef class TensorMapDescriptor: f"Global memory address must be 16-byte aligned, " f"got address 0x{global_address:x}") - self._source_ref = tensor - with nogil: HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( &self._tensor_map, global_address, )) + # Update the source reference only after the driver call succeeds, + # so we don't drop the old tensor (risking a dangling pointer in the + # CUtensorMap struct) if the call fails. + self._source_ref = tensor + def __repr__(self): - return f"TensorMapDescriptor()" + info = self._repr_info + if info is None: + return "TensorMapDescriptor()" + parts = [] + if "method" in info: + parts.append(info["method"]) + if "rank" in info: + parts.append(f"rank={info['rank']}") + if "data_type" in info: + parts.append(f"dtype={info['data_type'].name}") + if "swizzle" in info: + parts.append(f"swizzle={info['swizzle'].name}") + return f"TensorMapDescriptor({', '.join(parts)})" diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index 42fa19f59e..a8d3da7644 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -54,16 +54,9 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -[feature.cu12.system-requirements] -cuda = "12" - -[feature.cu12.dependencies] -cuda-version = "12.*" - -# We keep both cu12 and cu13 because cuda.core works with either major version -# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) -# For cu12 testing, use conda-forge packages: temporarily change path to wildcard -# or skip cu12 locally: pixi run -e cu13 test +# NOTE: cu12 environment is intentionally omitted because the path dependency +# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, +# use conda-forge packages or CI workflows. [environments] default = { features = [ "cu13", @@ -71,7 +64,6 @@ default = { features = [ "cython-tests", ], solve-group = "default" } cu13 = { features = ["cu13", "test", "cython-tests"], solve-group = "default" } -cu12 = { features = ["cu12", "test", "cython-tests"], solve-group = "cu12" } # TODO: check if these can be extracted from pyproject.toml [package] diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 0aa5968a85..f57e73ded8 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -9,6 +9,7 @@ Device, TensorMapDescriptor, TensorMapDataType, + TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, TensorMapOOBFill, @@ -36,6 +37,23 @@ def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): return buf +class _DeviceArray: + """Wrap a Buffer with explicit shape via __cuda_array_interface__. + + dev.allocate() returns a 1D byte buffer. For multi-dimensional TMA tests + we need the tensor to report a proper shape/dtype so the TMA encoder sees + the correct rank, dimensions, and strides. + """ + def __init__(self, buf, shape, dtype=np.float32): + self._buf = buf # prevent GC + self.__cuda_array_interface__ = { + "shape": tuple(shape), + "typestr": np.dtype(dtype).str, + "data": (int(buf.handle), False), + "version": 3, + } + + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -66,6 +84,10 @@ def test_oob_fill_values(self): assert TensorMapOOBFill.NONE == 0 assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + def test_im2col_wide_mode_values(self): + assert TensorMapIm2ColWideMode.W == 0 + assert TensorMapIm2ColWideMode.W128 == 1 + class TestTensorMapDescriptorCreation: """Test TensorMapDescriptor factory methods.""" @@ -82,7 +104,7 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) assert desc is not None - assert repr(desc) == "TensorMapDescriptor()" + assert repr(desc) == "TensorMapDescriptor(tiled, rank=1, dtype=FLOAT32, swizzle=NONE)" def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 @@ -102,6 +124,18 @@ def test_from_tiled_3d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_5d(self, dev, skip_if_no_tma): + # 5D: exercises all 5 c_global_dim / 4 c_global_strides slots + shape = (2, 4, 4, 4, 8) + n_bytes = 2 * 4 * 4 * 4 * 8 * 4 # float32 + buf = dev.allocate(n_bytes) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(1, 2, 2, 2, 8), + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) desc = TensorMapDescriptor.from_tiled( @@ -277,3 +311,100 @@ def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): pixels_per_column=4, data_type=TensorMapDataType.FLOAT32, ) + + def test_from_im2col_4d(self, dev, skip_if_no_tma): + # NHWC layout: N=1, H=8, W=8, C=64 — 2 spatial dims + # Exercises spatial corner reversal with n_spatial=2: + # Python [H_lower, W_lower] -> driver [W_lower, H_lower] + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4, 4), + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_5d(self, dev, skip_if_no_tma): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 — 3 spatial dims + # Exercises the full spatial corner reversal: + # Python [D, H, W] -> driver [W, H, D] + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0, 0), + pixel_box_upper_corner=(2, 4, 4), + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + +class TestTensorMapIm2colWide: + """Test im2col-wide TMA descriptor creation (compute capability 10.0+).""" + + @pytest.fixture + def skip_if_no_im2col_wide(self, dev): + cc = dev.compute_capability + if cc.major < 10: + pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + + def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): + # 3D tensor: batch=1, width=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_wide_4d(self, dev, skip_if_no_im2col_wide): + # NHWC layout: N=1, H=8, W=8, C=64 + # Wide mode only uses scalar W corners, even with higher rank + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 + # Max rank boundary — verifies all 5 dim/stride slots are filled + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + def test_from_im2col_wide_rank_validation(self, dev, skip_if_no_im2col_wide): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col-wide tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From beb1c124648201bc5c309d2d00ec5001ebbdcf0d Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:15:56 -0800 Subject: [PATCH 3/9] clean up --- cuda_core/cuda/core/_tensor_map.pxd | 2 +- cuda_core/cuda/core/_tensor_map.pyx | 185 ++++++++++----------------- cuda_core/examples/tma_tensor_map.py | 2 +- cuda_core/tests/test_tensor_map.py | 2 +- 4 files changed, 72 insertions(+), 119 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index fa2254ba70..b2b165b5aa 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6b072f7404..9edcccb928 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -150,6 +150,55 @@ def _resolve_data_type(view, data_type): return tma_dt +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + # stream_ptr=-1: no stream synchronization needed because descriptor + # creation only reads tensor metadata, it does not move data. + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + if view.ptr % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{view.ptr:x}") + + return view + + +def _compute_byte_strides(shape, strides, elem_size): + """Compute byte strides from element strides or C-contiguous fallback. + + Returns a tuple of byte strides in row-major order. + """ + if strides is not None: + return tuple(s * elem_size for s in strides) + + # C-contiguous: compute byte strides from shape, innermost first + rank = len(shape) + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + return tuple(byte_strides) + + +def _validate_element_strides(element_strides, rank): + """Validate or default element_strides to all-ones.""" + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + return element_strides + return (1,) * rank + + cdef class TensorMapDescriptor: """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. @@ -171,8 +220,8 @@ cdef class TensorMapDescriptor: cdef void* _get_data_ptr(self): return &self._tensor_map - @staticmethod - def from_tiled(tensor, box_dim, *, + @classmethod + def from_tiled(cls, tensor, box_dim, *, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, @@ -216,40 +265,23 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [1, 5], the pointer is not 16-byte aligned, or dimension/stride constraints are violated. """ - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + cdef TensorMapDescriptor desc = cls.__new__(cls) - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") - - # Keep a strong reference to prevent GC + view = _get_validated_view(tensor) desc._source_ref = tensor - # Resolve data type tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - # Get tensor metadata cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides # in elements, can be None for C-contiguous cdef int rank = len(shape) if rank < 1 or rank > 5: raise ValueError( f"Tensor rank must be between 1 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") - if len(box_dim) != rank: raise ValueError( f"box_dim must have {rank} elements (same as tensor rank), " @@ -260,25 +292,10 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) - # Compute byte strides from element strides cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - # C-contiguous: strides in bytes, row-major - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse dimensions for column-major cuTensorMap convention # Python/DLPack: row-major (dim 0 = outermost) @@ -335,8 +352,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner, + @classmethod + def from_im2col(cls, tensor, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -390,17 +407,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. """ - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -409,18 +418,12 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - cdef int n_spatial = rank - 2 if len(pixel_box_lower_corner) != n_spatial: raise ValueError( @@ -431,23 +434,10 @@ cdef class TensorMapDescriptor: f"pixel_box_upper_corner must have {n_spatial} elements " f"(rank - 2), got {len(pixel_box_upper_corner)}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -509,8 +499,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + @classmethod + def from_im2col_wide(cls, tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -565,17 +555,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. """ - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -584,35 +566,16 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -685,19 +648,9 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. """ - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + view = _get_validated_view(tensor) cdef intptr_t global_address = view.ptr - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") with nogil: HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index de51570ec4..e93c02dc08 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index f57e73ded8..790307857b 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 import pytest From 6f689175e55399f87def6ab3ca374a844bdbd01b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:55:00 -0800 Subject: [PATCH 4/9] Add comments to prepare_tensor_map_arg explaining allocation and lifetime Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index e88def13cd..b55a4abc7a 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,8 +135,13 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + # Allocate a temporary buffer for the 128-byte CUtensorMap struct. + # We copy rather than pointing directly at arg._tensor_map for lifetime + # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, + # data_addresses[idx] is the pointer passed to cuLaunchKernel. data_addresses[idx] = ptr data[idx] = ptr return 0 From 53c4a3db2c44dfb5b39c7e45f12990a9437098b2 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:04:26 -0800 Subject: [PATCH 5/9] Address Copilot review feedback - Remove unused _alloc_device_tensor helper from tests - Add test for rank > 5 (6D tensor) to verify upper bound validation - Add NULL check for PyMem_Malloc in prepare_tensor_map_arg Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 2 ++ cuda_core/tests/test_tensor_map.py | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index b55a4abc7a..5619ba7492 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -139,6 +139,8 @@ cdef inline int prepare_tensor_map_arg( # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + if ptr is NULL: + raise MemoryError("Failed to allocate memory for CUtensorMap") memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, # data_addresses[idx] is the pointer passed to cuLaunchKernel. diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 790307857b..96dd14a419 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -28,14 +28,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") -def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): - """Allocate a device buffer and return it with proper alignment.""" - n_elements = 1 - for s in shape: - n_elements *= s - buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) - return buf - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -179,6 +171,19 @@ def test_invalid_rank_zero(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) + def test_invalid_rank_six(self, dev, skip_if_no_tma): + shape = (2, 2, 2, 2, 2, 2) + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * 4) + arr = _DeviceArray(buf, shape) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + arr, + box_dim=(2,) * 6, + ) + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="box_dim must have 1 elements"): From b1b2d3fa9c1ce7d0d38d4735be2aa38de800e4b2 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:20:05 -0800 Subject: [PATCH 6/9] Split TMA example into two focused files Move the replace_address() demonstration into its own self-contained example (tma_replace_address.py) so each file covers a single concept. Co-Authored-By: Claude Opus 4.6 --- cuda_core/examples/tma_replace_address.py | 189 ++++++++++++++++++++++ cuda_core/examples/tma_tensor_map.py | 20 --- 2 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 cuda_core/examples/tma_replace_address.py diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py new file mode 100644 index 0000000000..7f11cf551c --- /dev/null +++ b/cuda_core/examples/tma_replace_address.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates how to use replace_address() to repoint a TMA +# (Tensor Memory Accelerator) descriptor at a different tensor without +# rebuilding the descriptor from scratch. +# +# The workflow is: +# +# 1. Create a TMA tiled descriptor and launch a kernel to verify it works +# 2. Allocate a second tensor with different content +# 3. Call replace_address() to repoint the same descriptor at the new tensor +# 4. Re-launch the kernel and verify it reads from the new tensor +# +# This is useful when the tensor layout (shape, dtype, tile size) stays the +# same but the underlying data buffer changes, e.g. double-buffering or +# iterating over a sequence of same-shaped tensors. +# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. +struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. + asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data and verify the initial TMA copy +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 2) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index e93c02dc08..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -13,7 +13,6 @@ # 1. Creating a TMA tiled descriptor from a CuPy device array # 2. Passing the descriptor to a kernel via launch() # 3. Using TMA to load tiles into shared memory (via inline PTX) -# 4. Updating the descriptor's source address with replace_address() # # Requirements: # - Hopper or later GPU (compute capability >= 9.0) @@ -174,22 +173,3 @@ assert cp.array_equal(output, a), "TMA copy produced incorrect results" print(f"TMA copy verified: {N} elements across {n_tiles} tiles") - -# --------------------------------------------------------------------------- -# 4) Demonstrate replace_address() -# Create a second tensor with different content, point the *same* -# descriptor at it, and re-launch without rebuilding the descriptor. -# --------------------------------------------------------------------------- -b = cp.full(N, fill_value=42.0, dtype=cp.float32) -dev.sync() - -tensor_map.replace_address(b) - -output2 = cp.zeros(N, dtype=cp.float32) -dev.sync() - -launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output2, b), "replace_address produced incorrect results" -print("replace_address verified: descriptor reused with new source tensor") From 0d884c294f1086d7d20bee077bb4f006c06868ad Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 14:14:38 -0800 Subject: [PATCH 7/9] pre-commit --- cuda_core/cuda/core/__init__.py | 14 ++++----- cuda_core/examples/tma_replace_address.py | 1 - cuda_core/examples/tma_tensor_map.py | 1 - cuda_core/tests/test_tensor_map.py | 37 ++++++++++++----------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index feeb23797c..2a0af71b36 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -62,18 +62,18 @@ ) from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 +from cuda.core._stream import ( # noqa: E402 + LEGACY_DEFAULT_STREAM, + PER_THREAD_DEFAULT_STREAM, + Stream, + StreamOptions, +) from cuda.core._tensor_map import ( # noqa: E402 - TensorMapDescriptor, TensorMapDataType, + TensorMapDescriptor, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, TensorMapOOBFill, TensorMapSwizzle, ) -from cuda.core._stream import ( # noqa: E402 - LEGACY_DEFAULT_STREAM, - PER_THREAD_DEFAULT_STREAM, - Stream, - StreamOptions, -) diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..21fb91c254 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,7 +30,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..c85c5d96bd 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,7 +25,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 96dd14a419..1d6e8195b6 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,14 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import pytest - import numpy as np - +import pytest from cuda.core import ( Device, - TensorMapDescriptor, TensorMapDataType, + TensorMapDescriptor, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -28,7 +26,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -36,6 +33,7 @@ class _DeviceArray: we need the tensor to report a proper shape/dtype so the TMA encoder sees the correct rank, dimensions, and strides. """ + def __init__(self, buf, shape, dtype=np.float32): self._buf = buf # prevent GC self.__cuda_array_interface__ = { @@ -225,25 +223,30 @@ def test_invalid_data_type(self, dev, skip_if_no_tma): class TestTensorMapDtypeMapping: """Test automatic dtype inference from numpy dtypes.""" - @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ - (np.uint8, TensorMapDataType.UINT8), - (np.uint16, TensorMapDataType.UINT16), - (np.uint32, TensorMapDataType.UINT32), - (np.int32, TensorMapDataType.INT32), - (np.uint64, TensorMapDataType.UINT64), - (np.int64, TensorMapDataType.INT64), - (np.float16, TensorMapDataType.FLOAT16), - (np.float32, TensorMapDataType.FLOAT32), - (np.float64, TensorMapDataType.FLOAT64), - ]) + @pytest.mark.parametrize( + "np_dtype,expected_tma_dt", + [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + (np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ], + ) def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt def test_bfloat16_mapping(self): try: - from ml_dtypes import bfloat16 from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + from ml_dtypes import bfloat16 + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 63177590ca66a2efd45618b074bdccf4ed81dfe8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:07:54 -0800 Subject: [PATCH 8/9] adding stride meta data to gpu allocated memory --- cuda_core/tests/test_tensor_map.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 1d6e8195b6..3e030dd29d 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -98,8 +98,9 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -107,8 +108,9 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + tensor = _DeviceArray(buf, (16, 16, 16)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(8, 8, 8), data_type=TensorMapDataType.FLOAT32, ) @@ -128,8 +130,9 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, swizzle=TensorMapSwizzle.SWIZZLE_128B, @@ -138,8 +141,9 @@ def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, l2_promotion=TensorMapL2Promotion.L2_128B, @@ -148,8 +152,9 @@ def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, @@ -162,9 +167,10 @@ class TestTensorMapDescriptorValidation: def test_invalid_rank_zero(self, dev, skip_if_no_tma): buf = dev.allocate(64) + tensor = _DeviceArray(buf, ()) # 0-dim tensor with pytest.raises(ValueError, match="rank must be between 1 and 5"): TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(), data_type=TensorMapDataType.FLOAT32, ) @@ -286,8 +292,9 @@ class TestTensorMapIm2col: def test_from_im2col_3d(self, dev, skip_if_no_tma): # 3D tensor: batch=1, height=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col( - buf, + tensor, pixel_box_lower_corner=(0,), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -310,9 +317,10 @@ def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) # 3D: n_spatial = 1 with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): TensorMapDescriptor.from_im2col( - buf, + tensor, pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4,), channels_per_pixel=64, From 4c866c53ecf08916f1749f7dc93a257aa702e399 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:59:31 -0800 Subject: [PATCH 9/9] im2col fixes --- cuda_core/tests/test_tensor_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 3e030dd29d..8b5e27ce6f 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -373,8 +373,9 @@ def skip_if_no_im2col_wide(self, dev): def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col_wide( - buf, + tensor, pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64,