From 1f5c57331ce17968e581adfb003aa266a914f763 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 23 Feb 2026 14:02:32 -0800
Subject: [PATCH 1/9] initial commit

---
 cuda_core/cuda/core/__init__.py             |   8 +
 cuda_core/cuda/core/_kernel_arg_handler.pyx |  24 +
 cuda_core/cuda/core/_tensor_map.pxd         |  12 +
 cuda_core/cuda/core/_tensor_map.pyx         | 524 ++++++++++++++++++++
 cuda_core/examples/tma_tensor_map.py        | 195 ++++++++
 cuda_core/tests/test_tensor_map.py          | 279 +++++++++++
 6 files changed, 1042 insertions(+)
 create mode 100644 cuda_core/cuda/core/_tensor_map.pxd
 create mode 100644 cuda_core/cuda/core/_tensor_map.pyx
 create mode 100644 cuda_core/examples/tma_tensor_map.py
 create mode 100644 cuda_core/tests/test_tensor_map.py

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index f22bbc7f16..15eead893a 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -61,6 +61,14 @@
 )
 from cuda.core._module import Kernel, ObjectCode  # noqa: E402
 from cuda.core._program import Program, ProgramOptions  # noqa: E402
+from cuda.core._tensor_map import (  # noqa: E402
+    TensorMapDescriptor,
+    TensorMapDataType,
+    TensorMapInterleave,
+    TensorMapL2Promotion,
+    TensorMapOOBFill,
+    TensorMapSwizzle,
+)
 from cuda.core._stream import (  # noqa: E402
     LEGACY_DEFAULT_STREAM,
     PER_THREAD_DEFAULT_STREAM,
diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx
index 882ca5eaab..e88def13cd 100644
--- a/cuda_core/cuda/core/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx
@@ -6,6 +6,7 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from libc.stdint cimport (intptr_t,
                           int8_t, int16_t, int32_t, int64_t,
                           uint8_t, uint16_t, uint32_t, uint64_t,)
+from libc.string cimport memcpy
 from libcpp cimport bool as cpp_bool
 from libcpp.complex cimport complex as cpp_complex
 from libcpp cimport nullptr
@@ -16,6 +17,8 @@ import ctypes
 import numpy
 
 from cuda.core._memory import Buffer
+from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py
+from cuda.core._tensor_map cimport TensorMapDescriptor
 from cuda.core._utils.cuda_utils import driver
 from cuda.bindings cimport cydriver
 
@@ -97,6 +100,9 @@ cdef object numpy_complex64 = numpy.complex64
 cdef object numpy_complex128 = numpy.complex128
 
 
+cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py
+
+
 # limitation due to cython/cython#534
 ctypedef void* voidptr
 
@@ -124,6 +130,18 @@ cdef inline int prepare_arg(
     return 0
 
 
+cdef inline int prepare_tensor_map_arg(
+        vector.vector[void*]& data,
+        vector.vector[void*]& data_addresses,
+        TensorMapDescriptor arg,
+        const size_t idx) except -1:
+    cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap))
+    memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap))
+    data_addresses[idx] = ptr
+    data[idx] = ptr
+    return 0
+
+
 cdef inline int prepare_ctypes_arg(
         vector.vector[void*]& data,
         vector.vector[void*]& data_addresses,
@@ -273,6 +291,9 @@ cdef class ParamHolder:
                     # it's a CUdeviceptr:
                     self.data_addresses[i] = <void*><intptr_t>(arg.handle.getPtr())
                 continue
+            elif arg_type is tensor_map_descriptor_type:
+                prepare_tensor_map_arg(self.data, self.data_addresses, <TensorMapDescriptor>arg, i)
+                continue
             elif arg_type is bool:
                 prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i)
                 continue
@@ -322,6 +343,9 @@ cdef class ParamHolder:
                 elif isinstance(arg, driver.CUgraphConditionalHandle):
                     prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i)
                     continue
+                elif isinstance(arg, tensor_map_descriptor_type):
+                    prepare_tensor_map_arg(self.data, self.data_addresses, <TensorMapDescriptor>arg, i)
+                    continue
                 # TODO: support ctypes/numpy struct
                 raise TypeError("the argument is of unsupported type: " + str(type(arg)))
 
diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd
new file mode 100644
index 0000000000..3cfd571d89
--- /dev/null
+++ b/cuda_core/cuda/core/_tensor_map.pxd
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+
+
+cdef class TensorMapDescriptor:
+    cdef cydriver.CUtensorMap _tensor_map
+    cdef object _source_ref
+
+    cdef void* _get_data_ptr(self)
diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx
new file mode 100644
index 0000000000..9375acf944
--- /dev/null
+++ b/cuda_core/cuda/core/_tensor_map.pyx
@@ -0,0 +1,524 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t, uint32_t, uint64_t
+from cuda.bindings cimport cydriver
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+import enum
+
+import numpy
+
+from cuda.core._memoryview import StridedMemoryView
+
+
+try:
+    from ml_dtypes import bfloat16 as ml_bfloat16
+except ImportError:
+    ml_bfloat16 = None
+
+
+class TensorMapDataType(enum.IntEnum):
+    """Data types for tensor map descriptors.
+
+    These correspond to the ``CUtensorMapDataType`` driver enum values.
+    """
+    UINT8 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8
+    UINT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16
+    UINT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32
+    INT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32
+    UINT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64
+    INT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64
+    FLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16
+    FLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32
+    FLOAT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64
+    BFLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16
+    FLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ
+    TFLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32
+    TFLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
+
+
+class TensorMapInterleave(enum.IntEnum):
+    """Interleave layout for tensor map descriptors.
+
+    These correspond to the ``CUtensorMapInterleave`` driver enum values.
+    """
+    NONE = cydriver.CU_TENSOR_MAP_INTERLEAVE_NONE
+    INTERLEAVE_16B = cydriver.CU_TENSOR_MAP_INTERLEAVE_16B
+    INTERLEAVE_32B = cydriver.CU_TENSOR_MAP_INTERLEAVE_32B
+
+
+class TensorMapSwizzle(enum.IntEnum):
+    """Swizzle mode for tensor map descriptors.
+
+    These correspond to the ``CUtensorMapSwizzle`` driver enum values.
+    """
+    NONE = cydriver.CU_TENSOR_MAP_SWIZZLE_NONE
+    SWIZZLE_32B = cydriver.CU_TENSOR_MAP_SWIZZLE_32B
+    SWIZZLE_64B = cydriver.CU_TENSOR_MAP_SWIZZLE_64B
+    SWIZZLE_128B = cydriver.CU_TENSOR_MAP_SWIZZLE_128B
+
+
+class TensorMapL2Promotion(enum.IntEnum):
+    """L2 promotion mode for tensor map descriptors.
+
+    These correspond to the ``CUtensorMapL2promotion`` driver enum values.
+    """
+    NONE = cydriver.CU_TENSOR_MAP_L2_PROMOTION_NONE
+    L2_64B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_64B
+    L2_128B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_128B
+    L2_256B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+
+
+class TensorMapOOBFill(enum.IntEnum):
+    """Out-of-bounds fill mode for tensor map descriptors.
+
+    These correspond to the ``CUtensorMapFloatOOBfill`` driver enum values.
+    """
+    NONE = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
+    NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+
+
+# Mapping from numpy dtype to TMA data type
+_NUMPY_DTYPE_TO_TMA = {
+    numpy.dtype(numpy.uint8): TensorMapDataType.UINT8,
+    numpy.dtype(numpy.uint16): TensorMapDataType.UINT16,
+    numpy.dtype(numpy.uint32): TensorMapDataType.UINT32,
+    numpy.dtype(numpy.int32): TensorMapDataType.INT32,
+    numpy.dtype(numpy.uint64): TensorMapDataType.UINT64,
+    numpy.dtype(numpy.int64): TensorMapDataType.INT64,
+    numpy.dtype(numpy.float16): TensorMapDataType.FLOAT16,
+    numpy.dtype(numpy.float32): TensorMapDataType.FLOAT32,
+    numpy.dtype(numpy.float64): TensorMapDataType.FLOAT64,
+}
+
+if ml_bfloat16 is not None:
+    _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = TensorMapDataType.BFLOAT16
+
+
+# Mapping from TMA data type to element size in bytes
+_TMA_DATA_TYPE_SIZE = {
+    TensorMapDataType.UINT8: 1,
+    TensorMapDataType.UINT16: 2,
+    TensorMapDataType.UINT32: 4,
+    TensorMapDataType.INT32: 4,
+    TensorMapDataType.UINT64: 8,
+    TensorMapDataType.INT64: 8,
+    TensorMapDataType.FLOAT16: 2,
+    TensorMapDataType.FLOAT32: 4,
+    TensorMapDataType.FLOAT64: 8,
+    TensorMapDataType.BFLOAT16: 2,
+    TensorMapDataType.FLOAT32_FTZ: 4,
+    TensorMapDataType.TFLOAT32: 4,
+    TensorMapDataType.TFLOAT32_FTZ: 4,
+}
+
+
+def _resolve_data_type(view, data_type):
+    """Resolve the TMA data type from an explicit value or the view's dtype."""
+
+    if data_type is not None:
+        if not isinstance(data_type, TensorMapDataType):
+            raise TypeError(
+                f"data_type must be a TensorMapDataType, got {type(data_type)}")
+        return data_type
+
+    dt = view.dtype
+    if dt is None:
+        raise ValueError(
+            "Cannot infer TMA data type from the tensor; "
+            "please specify data_type explicitly")
+
+    tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt)
+    if tma_dt is None:
+        raise ValueError(
+            f"Unsupported dtype {dt} for TMA; "
+            f"supported dtypes: {list(_NUMPY_DTYPE_TO_TMA.keys())}. "
+            "You may also specify data_type explicitly.")
+
+    return tma_dt
+
+
+cdef class TensorMapDescriptor:
+    """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs.
+
+    A ``TensorMapDescriptor`` wraps the opaque 128-byte ``CUtensorMap`` struct
+    used by the hardware TMA unit for efficient bulk data movement between
+    global and shared memory.
+
+    Instances are created via the class methods :meth:`from_tiled` and
+    :meth:`from_im2col`, and can be passed directly to
+    :func:`~cuda.core.launch` as a kernel argument.
+    """
+
+    def __init__(self):
+        raise RuntimeError(
+            "TensorMapDescriptor cannot be instantiated directly. "
+            "Use TensorMapDescriptor.from_tiled() or "
+            "TensorMapDescriptor.from_im2col().")
+
+    cdef void* _get_data_ptr(self):
+        return <void*>&self._tensor_map
+
+    @staticmethod
+    def from_tiled(tensor, box_dim, *,
+                   element_strides=None,
+                   data_type=None,
+                   interleave=TensorMapInterleave.NONE,
+                   swizzle=TensorMapSwizzle.NONE,
+                   l2_promotion=TensorMapL2Promotion.NONE,
+                   oob_fill=TensorMapOOBFill.NONE):
+        """Create a tiled TMA descriptor from a tensor object.
+
+        Parameters
+        ----------
+        tensor : object
+            Any object supporting DLPack or ``__cuda_array_interface__``,
+            or a :obj:`~cuda.core.StridedMemoryView`. Must refer to
+            device-accessible memory with a 16-byte-aligned pointer.
+        box_dim : tuple of int
+            The size of each tile dimension (in elements). Must have the
+            same rank as the tensor and each value must be in [1, 256].
+            Specified in the same (row-major) order as the tensor shape.
+        element_strides : tuple of int, optional
+            Per-dimension element traversal strides. Default is all 1s.
+            Specified in the same (row-major) order as the tensor shape.
+        data_type : TensorMapDataType, optional
+            Explicit data type override. If ``None``, inferred from the
+            tensor's dtype.
+        interleave : TensorMapInterleave
+            Interleave layout. Default ``NONE``.
+        swizzle : TensorMapSwizzle
+            Swizzle mode. Default ``NONE``.
+        l2_promotion : TensorMapL2Promotion
+            L2 promotion mode. Default ``NONE``.
+        oob_fill : TensorMapOOBFill
+            Out-of-bounds fill mode. Default ``NONE``.
+
+        Returns
+        -------
+        TensorMapDescriptor
+
+        Raises
+        ------
+        ValueError
+            If the tensor rank is outside [1, 5], the pointer is not
+            16-byte aligned, or dimension/stride constraints are violated.
+        """
+        cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor)
+
+        # Obtain a StridedMemoryView from the tensor
+        if isinstance(tensor, StridedMemoryView):
+            view = tensor
+        else:
+            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
+
+        if not view.is_device_accessible:
+            raise ValueError("The tensor must be device-accessible")
+
+        # Keep a strong reference to prevent GC
+        desc._source_ref = tensor
+
+        # Resolve data type
+        tma_dt = _resolve_data_type(view, data_type)
+        cdef int c_data_type_int = int(tma_dt)
+        cdef cydriver.CUtensorMapDataType c_data_type = <cydriver.CUtensorMapDataType>c_data_type_int
+
+        # Get tensor metadata
+        cdef intptr_t global_address = view.ptr
+        shape = view.shape
+        strides = view.strides  # in elements, can be None for C-contiguous
+
+        cdef int rank = len(shape)
+        if rank < 1 or rank > 5:
+            raise ValueError(
+                f"Tensor rank must be between 1 and 5, got {rank}")
+
+        if global_address % 16 != 0:
+            raise ValueError(
+                f"Global memory address must be 16-byte aligned, "
+                f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)")
+
+        if len(box_dim) != rank:
+            raise ValueError(
+                f"box_dim must have {rank} elements (same as tensor rank), "
+                f"got {len(box_dim)}")
+
+        for i, bd in enumerate(box_dim):
+            if bd < 1 or bd > 256:
+                raise ValueError(
+                    f"box_dim[{i}] must be in [1, 256], got {bd}")
+
+        if element_strides is not None:
+            if len(element_strides) != rank:
+                raise ValueError(
+                    f"element_strides must have {rank} elements, got {len(element_strides)}")
+        else:
+            element_strides = (1,) * rank
+
+        # Compute byte strides from element strides
+        cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt]
+        if strides is not None:
+            byte_strides = tuple(s * elem_size for s in strides)
+        else:
+            # C-contiguous: strides in bytes, row-major
+            byte_strides = []
+            stride = elem_size
+            for i in range(rank - 1, -1, -1):
+                byte_strides.append(stride)
+                stride *= shape[i]
+            byte_strides.reverse()
+
+        # Reverse dimensions for column-major cuTensorMap convention
+        # Python/DLPack: row-major (dim 0 = outermost)
+        # cuTensorMap: column-major (dim 0 = innermost)
+        cdef uint64_t[5] c_global_dim
+        cdef uint64_t[4] c_global_strides  # rank - 1 elements
+        cdef uint32_t[5] c_box_dim
+        cdef uint32_t[5] c_element_strides
+        cdef int i_c
+
+        for i_c in range(rank):
+            # Reverse: Python dim i -> cuTensorMap dim (rank - 1 - i)
+            c_global_dim[i_c] = <uint64_t>shape[rank - 1 - i_c]
+            c_box_dim[i_c] = <uint32_t>box_dim[rank - 1 - i_c]
+            c_element_strides[i_c] = <uint32_t>element_strides[rank - 1 - i_c]
+
+        # globalStrides: rank-1 elements (byte strides for dims 1..N-1 in col-major order)
+        # The innermost stride (dim 0) is implicit = element size
+        for i_c in range(rank - 1):
+            c_global_strides[i_c] = <uint64_t>byte_strides[rank - 2 - i_c]
+
+        cdef uint32_t c_rank = <uint32_t>rank
+        cdef int c_interleave_int = int(interleave)
+        cdef int c_swizzle_int = int(swizzle)
+        cdef int c_l2_promotion_int = int(l2_promotion)
+        cdef int c_oob_fill_int = int(oob_fill)
+        cdef cydriver.CUtensorMapInterleave c_interleave = <cydriver.CUtensorMapInterleave>c_interleave_int
+        cdef cydriver.CUtensorMapSwizzle c_swizzle = <cydriver.CUtensorMapSwizzle>c_swizzle_int
+        cdef cydriver.CUtensorMapL2promotion c_l2_promotion = <cydriver.CUtensorMapL2promotion>c_l2_promotion_int
+        cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = <cydriver.CUtensorMapFloatOOBfill>c_oob_fill_int
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapEncodeTiled(
+                &desc._tensor_map,
+                c_data_type,
+                c_rank,
+                <void*>global_address,
+                c_global_dim,
+                c_global_strides,
+                c_box_dim,
+                c_element_strides,
+                c_interleave,
+                c_swizzle,
+                c_l2_promotion,
+                c_oob_fill,
+            ))
+
+        return desc
+
+    @staticmethod
+    def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner,
+                    channels_per_pixel, pixels_per_column, *,
+                    element_strides=None,
+                    data_type=None,
+                    interleave=TensorMapInterleave.NONE,
+                    swizzle=TensorMapSwizzle.NONE,
+                    l2_promotion=TensorMapL2Promotion.NONE,
+                    oob_fill=TensorMapOOBFill.NONE):
+        """Create an im2col TMA descriptor from a tensor object.
+
+        Im2col layout is used for convolution-style data access patterns.
+
+        Parameters
+        ----------
+        tensor : object
+            Any object supporting DLPack or ``__cuda_array_interface__``,
+            or a :obj:`~cuda.core.StridedMemoryView`. Must refer to
+            device-accessible memory with a 16-byte-aligned pointer.
+        pixel_box_lower_corner : tuple of int
+            Lower corner of the pixel bounding box for each spatial
+            dimension (rank - 2 elements). Specified in row-major order
+            matching the tensor's spatial dimensions.
+        pixel_box_upper_corner : tuple of int
+            Upper corner of the pixel bounding box for each spatial
+            dimension (rank - 2 elements). Specified in row-major order
+            matching the tensor's spatial dimensions.
+        channels_per_pixel : int
+            Number of channels per pixel.
+        pixels_per_column : int
+            Number of pixels per column.
+        element_strides : tuple of int, optional
+            Per-dimension element traversal strides. Default is all 1s.
+        data_type : TensorMapDataType, optional
+            Explicit data type override. If ``None``, inferred from the
+            tensor's dtype.
+        interleave : TensorMapInterleave
+            Interleave layout. Default ``NONE``.
+        swizzle : TensorMapSwizzle
+            Swizzle mode. Default ``NONE``.
+        l2_promotion : TensorMapL2Promotion
+            L2 promotion mode. Default ``NONE``.
+        oob_fill : TensorMapOOBFill
+            Out-of-bounds fill mode. Default ``NONE``.
+
+        Returns
+        -------
+        TensorMapDescriptor
+
+        Raises
+        ------
+        ValueError
+            If the tensor rank is outside [3, 5], the pointer is not
+            16-byte aligned, or other constraints are violated.
+        """
+        cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor)
+
+        # Obtain a StridedMemoryView from the tensor
+        if isinstance(tensor, StridedMemoryView):
+            view = tensor
+        else:
+            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
+
+        if not view.is_device_accessible:
+            raise ValueError("The tensor must be device-accessible")
+
+        desc._source_ref = tensor
+
+        tma_dt = _resolve_data_type(view, data_type)
+        cdef int c_data_type_int = int(tma_dt)
+        cdef cydriver.CUtensorMapDataType c_data_type = <cydriver.CUtensorMapDataType>c_data_type_int
+
+        cdef intptr_t global_address = view.ptr
+        shape = view.shape
+        strides = view.strides
+
+        cdef int rank = len(shape)
+        if rank < 3 or rank > 5:
+            raise ValueError(
+                f"Im2col tensor rank must be between 3 and 5, got {rank}")
+
+        if global_address % 16 != 0:
+            raise ValueError(
+                f"Global memory address must be 16-byte aligned, "
+                f"got address 0x{global_address:x}")
+
+        cdef int n_spatial = rank - 2
+        if len(pixel_box_lower_corner) != n_spatial:
+            raise ValueError(
+                f"pixel_box_lower_corner must have {n_spatial} elements "
+                f"(rank - 2), got {len(pixel_box_lower_corner)}")
+        if len(pixel_box_upper_corner) != n_spatial:
+            raise ValueError(
+                f"pixel_box_upper_corner must have {n_spatial} elements "
+                f"(rank - 2), got {len(pixel_box_upper_corner)}")
+
+        if element_strides is not None:
+            if len(element_strides) != rank:
+                raise ValueError(
+                    f"element_strides must have {rank} elements, got {len(element_strides)}")
+        else:
+            element_strides = (1,) * rank
+
+        cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt]
+        if strides is not None:
+            byte_strides = tuple(s * elem_size for s in strides)
+        else:
+            byte_strides = []
+            stride = elem_size
+            for i in range(rank - 1, -1, -1):
+                byte_strides.append(stride)
+                stride *= shape[i]
+            byte_strides.reverse()
+
+        # Reverse all dimension arrays for column-major convention
+        cdef uint64_t[5] c_global_dim
+        cdef uint64_t[4] c_global_strides
+        cdef uint32_t[5] c_element_strides
+        cdef int[3] c_pixel_box_lower  # max 3 spatial dims (rank 5 - 2)
+        cdef int[3] c_pixel_box_upper
+        cdef int i_c
+
+        for i_c in range(rank):
+            c_global_dim[i_c] = <uint64_t>shape[rank - 1 - i_c]
+            c_element_strides[i_c] = <uint32_t>element_strides[rank - 1 - i_c]
+
+        for i_c in range(rank - 1):
+            c_global_strides[i_c] = <uint64_t>byte_strides[rank - 2 - i_c]
+
+        # Reverse spatial dimensions for lower/upper corners
+        for i_c in range(n_spatial):
+            c_pixel_box_lower[i_c] = <int>pixel_box_lower_corner[n_spatial - 1 - i_c]
+            c_pixel_box_upper[i_c] = <int>pixel_box_upper_corner[n_spatial - 1 - i_c]
+
+        cdef uint32_t c_rank = <uint32_t>rank
+        cdef uint32_t c_channels = <uint32_t>channels_per_pixel
+        cdef uint32_t c_pixels = <uint32_t>pixels_per_column
+        cdef int c_interleave_int = int(interleave)
+        cdef int c_swizzle_int = int(swizzle)
+        cdef int c_l2_promotion_int = int(l2_promotion)
+        cdef int c_oob_fill_int = int(oob_fill)
+        cdef cydriver.CUtensorMapInterleave c_interleave = <cydriver.CUtensorMapInterleave>c_interleave_int
+        cdef cydriver.CUtensorMapSwizzle c_swizzle = <cydriver.CUtensorMapSwizzle>c_swizzle_int
+        cdef cydriver.CUtensorMapL2promotion c_l2_promotion = <cydriver.CUtensorMapL2promotion>c_l2_promotion_int
+        cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = <cydriver.CUtensorMapFloatOOBfill>c_oob_fill_int
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2col(
+                &desc._tensor_map,
+                c_data_type,
+                c_rank,
+                <void*>global_address,
+                c_global_dim,
+                c_global_strides,
+                c_pixel_box_lower,
+                c_pixel_box_upper,
+                c_channels,
+                c_pixels,
+                c_element_strides,
+                c_interleave,
+                c_swizzle,
+                c_l2_promotion,
+                c_oob_fill,
+            ))
+
+        return desc
+
+    def replace_address(self, tensor):
+        """Replace the global memory address in this tensor map descriptor.
+
+        This is useful when the tensor data has been reallocated but the
+        shape, strides, and other parameters remain the same.
+
+        Parameters
+        ----------
+        tensor : object
+            Any object supporting DLPack or ``__cuda_array_interface__``,
+            or a :obj:`~cuda.core.StridedMemoryView`. Must refer to
+            device-accessible memory with a 16-byte-aligned pointer.
+        """
+        if isinstance(tensor, StridedMemoryView):
+            view = tensor
+        else:
+            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
+
+        if not view.is_device_accessible:
+            raise ValueError("The tensor must be device-accessible")
+
+        cdef intptr_t global_address = view.ptr
+        if global_address % 16 != 0:
+            raise ValueError(
+                f"Global memory address must be 16-byte aligned, "
+                f"got address 0x{global_address:x}")
+
+        self._source_ref = tensor
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress(
+                &self._tensor_map,
+                <void*>global_address,
+            ))
+
+    def __repr__(self):
+        return f"TensorMapDescriptor()"
diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py
new file mode 100644
index 0000000000..de51570ec4
--- /dev/null
+++ b/cuda_core/examples/tma_tensor_map.py
@@ -0,0 +1,195 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates how to use TMA (Tensor Memory Accelerator) descriptors
+# with cuda.core on Hopper+ GPUs (compute capability >= 9.0).
+#
+# TMA enables efficient bulk data movement between global and shared memory using
+# hardware-managed tensor map descriptors. This example shows:
+#
+#   1. Creating a TMA tiled descriptor from a CuPy device array
+#   2. Passing the descriptor to a kernel via launch()
+#   3. Using TMA to load tiles into shared memory (via inline PTX)
+#   4. Updating the descriptor's source address with replace_address()
+#
+# Requirements:
+#   - Hopper or later GPU (compute capability >= 9.0)
+#   - CuPy
+#   - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set)
+#
+# ################################################################################
+
+import sys
+
+import cupy as cp
+import numpy as np
+
+from cuda.core import (
+    Device,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    TensorMapDescriptor,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Check for Hopper+ GPU
+# ---------------------------------------------------------------------------
+dev = Device()
+arch = dev.compute_capability
+if arch < (9, 0):
+    print(
+        "TMA requires compute capability >= 9.0 (Hopper or later)",
+        file=sys.stderr,
+    )
+    sys.exit(0)
+dev.set_current()
+
+arch_str = "".join(f"{i}" for i in arch)
+
+# ---------------------------------------------------------------------------
+# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then
+# copies the tile to an output buffer so we can verify correctness.
+#
+# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be
+# compiled with NVRTC without pulling in the full driver-API header.
+#
+# Key points:
+#   - The tensor map is passed by value with __grid_constant__ so the TMA
+#     hardware can read it from grid-constant memory.
+#   - Thread 0 in each block issues the TMA load and manages the mbarrier.
+#   - All threads wait on the mbarrier, then copy from shared to global.
+# ---------------------------------------------------------------------------
+TILE_SIZE = 128  # elements per tile (must match the kernel constant)
+
+code = r"""
+// Minimal definition of the 128-byte opaque tensor map struct.
+struct __align__(64) TensorMap { unsigned long long opaque[16]; };
+
+static constexpr int TILE_SIZE = 128;
+
+extern "C"
+__global__ void tma_copy(
+    const __grid_constant__ TensorMap tensor_map,
+    float* output,
+    int N)
+{
+    __shared__ __align__(128) float smem[TILE_SIZE];
+    __shared__ __align__(8) unsigned long long mbar;
+
+    const int tid        = threadIdx.x;
+    const int tile_start = blockIdx.x * TILE_SIZE;
+
+    // ---- Thread 0: set up mbarrier and issue the TMA load ----
+    if (tid == 0)
+    {
+        // Initialise a single-phase mbarrier (1 arriving thread).
+        asm volatile(
+            "mbarrier.init.shared.b64 [%0], 1;"
+            :: "r"((unsigned)__cvta_generic_to_shared(&mbar)));
+
+        // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start'
+        // from the tensor described by 'tensor_map' into shared memory.
+        asm volatile(
+            "cp.async.bulk.tensor.1d.shared::cluster.global.tile"
+            ".mbarrier::complete_tx::bytes"
+            " [%0], [%1, {%2}], [%3];"
+            :: "r"((unsigned)__cvta_generic_to_shared(smem)),
+               "l"(&tensor_map),
+               "r"(tile_start),
+               "r"((unsigned)__cvta_generic_to_shared(&mbar)));
+
+        // Tell the mbarrier how many bytes the TMA will deliver.
+        asm volatile(
+            "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"
+            :: "r"((unsigned)__cvta_generic_to_shared(&mbar)),
+               "r"((unsigned)(TILE_SIZE * sizeof(float))));
+    }
+
+    __syncthreads();
+
+    // ---- Wait for the TMA load to complete ----
+    if (tid == 0)
+    {
+        asm volatile(
+            "{ .reg .pred P;                                           \n"
+            "WAIT:                                                     \n"
+            "  mbarrier.try_wait.parity.shared.b64 P, [%0], 0;         \n"
+            "  @!P bra WAIT;                                           \n"
+            "}                                                         \n"
+            :: "r"((unsigned)__cvta_generic_to_shared(&mbar)));
+    }
+
+    __syncthreads();
+
+    // ---- Copy the tile from shared memory to the output buffer ----
+    if (tid < TILE_SIZE)
+    {
+        const int idx = tile_start + tid;
+        if (idx < N)
+            output[idx] = smem[tid];
+    }
+}
+"""
+
+# ---------------------------------------------------------------------------
+# Compile the kernel
+# ---------------------------------------------------------------------------
+prog = Program(
+    code,
+    code_type="c++",
+    options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"),
+)
+mod = prog.compile("cubin")
+ker = mod.get_kernel("tma_copy")
+
+# ---------------------------------------------------------------------------
+# 1) Prepare input data on the device
+# ---------------------------------------------------------------------------
+N = 1024
+a = cp.arange(N, dtype=cp.float32)  # [0, 1, 2, ..., N-1]
+output = cp.zeros(N, dtype=cp.float32)
+dev.sync()  # cupy uses its own stream
+
+# ---------------------------------------------------------------------------
+# 2) Create a TMA tiled descriptor
+#    from_tiled() accepts any DLPack / __cuda_array_interface__ object.
+#    The dtype (float32) is inferred automatically from the CuPy array.
+# ---------------------------------------------------------------------------
+tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,))
+
+# ---------------------------------------------------------------------------
+# 3) Launch the kernel
+#    The TensorMapDescriptor is passed directly as a kernel argument — the
+#    128-byte struct is copied into kernel parameter space automatically.
+# ---------------------------------------------------------------------------
+n_tiles = N // TILE_SIZE
+config = LaunchConfig(grid=n_tiles, block=TILE_SIZE)
+launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N))
+dev.sync()
+
+assert cp.array_equal(output, a), "TMA copy produced incorrect results"
+print(f"TMA copy verified: {N} elements across {n_tiles} tiles")
+
+# ---------------------------------------------------------------------------
+# 4) Demonstrate replace_address()
+#    Create a second tensor with different content, point the *same*
+#    descriptor at it, and re-launch without rebuilding the descriptor.
+# ---------------------------------------------------------------------------
+b = cp.full(N, fill_value=42.0, dtype=cp.float32)
+dev.sync()
+
+tensor_map.replace_address(b)
+
+output2 = cp.zeros(N, dtype=cp.float32)
+dev.sync()
+
+launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N))
+dev.sync()
+
+assert cp.array_equal(output2, b), "replace_address produced incorrect results"
+print("replace_address verified: descriptor reused with new source tensor")
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
new file mode 100644
index 0000000000..0aa5968a85
--- /dev/null
+++ b/cuda_core/tests/test_tensor_map.py
@@ -0,0 +1,279 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+import numpy as np
+
+from cuda.core import (
+    Device,
+    TensorMapDescriptor,
+    TensorMapDataType,
+    TensorMapInterleave,
+    TensorMapL2Promotion,
+    TensorMapOOBFill,
+    TensorMapSwizzle,
+)
+
+
+@pytest.fixture
+def dev(init_cuda):
+    return Device()
+
+
+@pytest.fixture
+def skip_if_no_tma(dev):
+    if not dev.properties.tensor_map_access_supported:
+        pytest.skip("Device does not support TMA (requires compute capability 9.0+)")
+
+
+def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256):
+    """Allocate a device buffer and return it with proper alignment."""
+    n_elements = 1
+    for s in shape:
+        n_elements *= s
+    buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment)
+    return buf
+
+
+class TestTensorMapEnums:
+    """Test that enum wrappers expose the expected values."""
+
+    def test_data_type_values(self):
+        assert TensorMapDataType.UINT8 == 0
+        assert TensorMapDataType.FLOAT32 == 7
+        assert TensorMapDataType.FLOAT64 == 8
+        assert TensorMapDataType.BFLOAT16 == 9
+
+    def test_interleave_values(self):
+        assert TensorMapInterleave.NONE == 0
+        assert TensorMapInterleave.INTERLEAVE_16B == 1
+        assert TensorMapInterleave.INTERLEAVE_32B == 2
+
+    def test_swizzle_values(self):
+        assert TensorMapSwizzle.NONE == 0
+        assert TensorMapSwizzle.SWIZZLE_32B == 1
+        assert TensorMapSwizzle.SWIZZLE_64B == 2
+        assert TensorMapSwizzle.SWIZZLE_128B == 3
+
+    def test_l2_promotion_values(self):
+        assert TensorMapL2Promotion.NONE == 0
+        assert TensorMapL2Promotion.L2_64B == 1
+        assert TensorMapL2Promotion.L2_128B == 2
+        assert TensorMapL2Promotion.L2_256B == 3
+
+    def test_oob_fill_values(self):
+        assert TensorMapOOBFill.NONE == 0
+        assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1
+
+
+class TestTensorMapDescriptorCreation:
+    """Test TensorMapDescriptor factory methods."""
+
+    def test_cannot_instantiate_directly(self):
+        with pytest.raises(RuntimeError, match="cannot be instantiated directly"):
+            TensorMapDescriptor()
+
+    def test_from_tiled_1d(self, dev, skip_if_no_tma):
+        buf = dev.allocate(1024 * 4)  # 1024 float32 elements
+        desc = TensorMapDescriptor.from_tiled(
+            buf,
+            box_dim=(64,),
+            data_type=TensorMapDataType.FLOAT32,
+        )
+        assert desc is not None
+        assert repr(desc) == "TensorMapDescriptor()"
+
+    def test_from_tiled_2d(self, dev, skip_if_no_tma):
+        buf = dev.allocate(64 * 64 * 4)  # 64x64 float32
+        desc = TensorMapDescriptor.from_tiled(
+            buf,
+            box_dim=(32, 32),
+            data_type=TensorMapDataType.FLOAT32,
+        )
+        assert desc is not None
+
+    def test_from_tiled_3d(self, dev, skip_if_no_tma):
+        buf = dev.allocate(16 * 16 * 16 * 4)  # 16x16x16 float32
+        desc = TensorMapDescriptor.from_tiled(
+            buf,
+            box_dim=(8, 8, 8),
+            data_type=TensorMapDataType.FLOAT32,
+        )
+        assert desc is not None
+
+    def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma):
+        buf = dev.allocate(64 * 64 * 4)
+        desc = TensorMapDescriptor.from_tiled(
+            buf,
+            box_dim=(32, 32),
+            data_type=TensorMapDataType.FLOAT32,
+            swizzle=TensorMapSwizzle.SWIZZLE_128B,
+        )
+        assert desc is not None
+
+    def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma):
+        buf = dev.allocate(64 * 64 * 4)
+        desc = TensorMapDescriptor.from_tiled(
+            buf,
+            box_dim=(32, 32),
+            data_type=TensorMapDataType.FLOAT32,
+            l2_promotion=TensorMapL2Promotion.L2_128B,
+        )
+        assert desc is not None
+
+    def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma):
+        buf = dev.allocate(64 * 64 * 4)
+        desc = TensorMapDescriptor.from_tiled(
+            buf,
+            box_dim=(32, 32),
+            data_type=TensorMapDataType.FLOAT32,
+            oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA,
+        )
+        assert desc is not None
+
+
+class TestTensorMapDescriptorValidation:
+    """Test validation in TensorMapDescriptor factory methods."""
+
+    def test_invalid_rank_zero(self, dev, skip_if_no_tma):
+        buf = dev.allocate(64)
+        with pytest.raises(ValueError, match="rank must be between 1 and 5"):
+            TensorMapDescriptor.from_tiled(
+                buf,
+                box_dim=(),
+                data_type=TensorMapDataType.FLOAT32,
+            )
+
+    def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma):
+        buf = dev.allocate(1024 * 4)
+        with pytest.raises(ValueError, match="box_dim must have 1 elements"):
+            TensorMapDescriptor.from_tiled(
+                buf,
+                box_dim=(32, 32),
+                data_type=TensorMapDataType.FLOAT32,
+            )
+
+    def test_box_dim_out_of_range(self, dev, skip_if_no_tma):
+        buf = dev.allocate(1024 * 4)
+        with pytest.raises(ValueError, match=r"box_dim\[0\] must be in \[1, 256\]"):
+            TensorMapDescriptor.from_tiled(
+                buf,
+                box_dim=(512,),
+                data_type=TensorMapDataType.FLOAT32,
+            )
+
+    def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma):
+        buf = dev.allocate(1024 * 4)
+        with pytest.raises(ValueError, match="element_strides must have 1 elements"):
+            TensorMapDescriptor.from_tiled(
+                buf,
+                box_dim=(64,),
+                element_strides=(1, 1),
+                data_type=TensorMapDataType.FLOAT32,
+            )
+
+    def test_invalid_data_type(self, dev, skip_if_no_tma):
+        buf = dev.allocate(1024 * 4)
+        with pytest.raises(TypeError, match="data_type must be a TensorMapDataType"):
+            TensorMapDescriptor.from_tiled(
+                buf,
+                box_dim=(64,),
+                data_type=42,
+            )
+
+
+class TestTensorMapDtypeMapping:
+    """Test automatic dtype inference from numpy dtypes."""
+
+    @pytest.mark.parametrize("np_dtype,expected_tma_dt", [
+        (np.uint8, TensorMapDataType.UINT8),
+        (np.uint16, TensorMapDataType.UINT16),
+        (np.uint32, TensorMapDataType.UINT32),
+        (np.int32, TensorMapDataType.INT32),
+        (np.uint64, TensorMapDataType.UINT64),
+        (np.int64, TensorMapDataType.INT64),
+        (np.float16, TensorMapDataType.FLOAT16),
+        (np.float32, TensorMapDataType.FLOAT32),
+        (np.float64, TensorMapDataType.FLOAT64),
+    ])
+    def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma):
+        from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA
+        assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt
+
+    def test_bfloat16_mapping(self):
+        try:
+            from ml_dtypes import bfloat16
+            from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA
+            assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16
+        except ImportError:
+            pytest.skip("ml_dtypes not installed")
+
+
+class TestTensorMapReplaceAddress:
+    """Test replace_address functionality."""
+
+    def test_replace_address(self, dev, skip_if_no_tma):
+        buf1 = dev.allocate(1024 * 4)
+        desc = TensorMapDescriptor.from_tiled(
+            buf1,
+            box_dim=(64,),
+            data_type=TensorMapDataType.FLOAT32,
+        )
+
+        buf2 = dev.allocate(1024 * 4)
+        desc.replace_address(buf2)
+        # No exception means success
+
+    def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma):
+        buf1 = dev.allocate(1024 * 4)
+        desc = TensorMapDescriptor.from_tiled(
+            buf1,
+            box_dim=(64,),
+            data_type=TensorMapDataType.FLOAT32,
+        )
+        # Create a host-only array (not device-accessible)
+        host_arr = np.zeros(1024, dtype=np.float32)
+        with pytest.raises(ValueError, match="device-accessible"):
+            desc.replace_address(host_arr)
+
+
+class TestTensorMapIm2col:
+    """Test im2col TMA descriptor creation."""
+
+    def test_from_im2col_3d(self, dev, skip_if_no_tma):
+        # 3D tensor: batch=1, height=32, channels=64
+        buf = dev.allocate(1 * 32 * 64 * 4)
+        desc = TensorMapDescriptor.from_im2col(
+            buf,
+            pixel_box_lower_corner=(0,),
+            pixel_box_upper_corner=(4,),
+            channels_per_pixel=64,
+            pixels_per_column=4,
+            data_type=TensorMapDataType.FLOAT32,
+        )
+        assert desc is not None
+
+    def test_from_im2col_rank_validation(self, dev, skip_if_no_tma):
+        buf = dev.allocate(1024 * 4)
+        with pytest.raises(ValueError, match="Im2col tensor rank must be between 3 and 5"):
+            TensorMapDescriptor.from_im2col(
+                buf,
+                pixel_box_lower_corner=(),
+                pixel_box_upper_corner=(),
+                channels_per_pixel=64,
+                pixels_per_column=4,
+                data_type=TensorMapDataType.FLOAT32,
+            )
+
+    def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma):
+        buf = dev.allocate(1 * 32 * 64 * 4)
+        with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"):
+            TensorMapDescriptor.from_im2col(
+                buf,
+                pixel_box_lower_corner=(0, 0),
+                pixel_box_upper_corner=(4,),
+                channels_per_pixel=64,
+                pixels_per_column=4,
+                data_type=TensorMapDataType.FLOAT32,
+            )

From 725121455365dd9366fda97e6ae92594fe898259 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 24 Feb 2026 13:53:49 -0800
Subject: [PATCH 2/9] tma wide

---
 cuda_core/cuda/core/__init__.py     |   1 +
 cuda_core/cuda/core/_tensor_map.pxd |   1 +
 cuda_core/cuda/core/_tensor_map.pyx | 208 +++++++++++++++++++++++++++-
 cuda_core/pixi.toml                 |  14 +-
 cuda_core/tests/test_tensor_map.py  | 133 +++++++++++++++++-
 5 files changed, 342 insertions(+), 15 deletions(-)

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 15eead893a..ce1e1a132a 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -64,6 +64,7 @@
 from cuda.core._tensor_map import (  # noqa: E402
     TensorMapDescriptor,
     TensorMapDataType,
+    TensorMapIm2ColWideMode,
     TensorMapInterleave,
     TensorMapL2Promotion,
     TensorMapOOBFill,
diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd
index 3cfd571d89..fa2254ba70 100644
--- a/cuda_core/cuda/core/_tensor_map.pxd
+++ b/cuda_core/cuda/core/_tensor_map.pxd
@@ -8,5 +8,6 @@ from cuda.bindings cimport cydriver
 cdef class TensorMapDescriptor:
     cdef cydriver.CUtensorMap _tensor_map
     cdef object _source_ref
+    cdef object _repr_info
 
     cdef void* _get_data_ptr(self)
diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx
index 9375acf944..6b072f7404 100644
--- a/cuda_core/cuda/core/_tensor_map.pyx
+++ b/cuda_core/cuda/core/_tensor_map.pyx
@@ -80,6 +80,16 @@ class TensorMapOOBFill(enum.IntEnum):
     NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
 
 
+class TensorMapIm2ColWideMode(enum.IntEnum):
+    """Im2col wide mode for tensor map descriptors.
+
+    These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values.
+    Supported on compute capability 10.0+.
+    """
+    W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
+    W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
+
+
 # Mapping from numpy dtype to TMA data type
 _NUMPY_DTYPE_TO_TMA = {
     numpy.dtype(numpy.uint8): TensorMapDataType.UINT8,
@@ -316,6 +326,13 @@ cdef class TensorMapDescriptor:
                 c_oob_fill,
             ))
 
+        desc._repr_info = {
+            "method": "tiled",
+            "rank": rank,
+            "data_type": tma_dt,
+            "swizzle": swizzle,
+        }
+
         return desc
 
     @staticmethod
@@ -483,6 +500,176 @@ cdef class TensorMapDescriptor:
                 c_oob_fill,
             ))
 
+        desc._repr_info = {
+            "method": "im2col",
+            "rank": rank,
+            "data_type": tma_dt,
+            "swizzle": swizzle,
+        }
+
+        return desc
+
+    @staticmethod
+    def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width,
+                         channels_per_pixel, pixels_per_column, *,
+                         element_strides=None,
+                         data_type=None,
+                         interleave=TensorMapInterleave.NONE,
+                         mode=TensorMapIm2ColWideMode.W,
+                         swizzle=TensorMapSwizzle.SWIZZLE_128B,
+                         l2_promotion=TensorMapL2Promotion.NONE,
+                         oob_fill=TensorMapOOBFill.NONE):
+        """Create an im2col-wide TMA descriptor from a tensor object.
+
+        Im2col-wide layout loads elements exclusively along the W (width)
+        dimension. This variant is supported on compute capability 10.0+
+        (Blackwell and later).
+
+        Parameters
+        ----------
+        tensor : object
+            Any object supporting DLPack or ``__cuda_array_interface__``,
+            or a :obj:`~cuda.core.StridedMemoryView`. Must refer to
+            device-accessible memory with a 16-byte-aligned pointer.
+        pixel_box_lower_corner_width : int
+            Lower corner of the pixel bounding box along the W dimension.
+        pixel_box_upper_corner_width : int
+            Upper corner of the pixel bounding box along the W dimension.
+        channels_per_pixel : int
+            Number of channels per pixel.
+        pixels_per_column : int
+            Number of pixels per column.
+        element_strides : tuple of int, optional
+            Per-dimension element traversal strides. Default is all 1s.
+        data_type : TensorMapDataType, optional
+            Explicit data type override. If ``None``, inferred from the
+            tensor's dtype.
+        interleave : TensorMapInterleave
+            Interleave layout. Default ``NONE``.
+        mode : TensorMapIm2ColWideMode
+            Im2col wide mode. Default ``W``.
+        swizzle : TensorMapSwizzle
+            Swizzle mode. Default ``SWIZZLE_128B``.
+        l2_promotion : TensorMapL2Promotion
+            L2 promotion mode. Default ``NONE``.
+        oob_fill : TensorMapOOBFill
+            Out-of-bounds fill mode. Default ``NONE``.
+
+        Returns
+        -------
+        TensorMapDescriptor
+
+        Raises
+        ------
+        ValueError
+            If the tensor rank is outside [3, 5], the pointer is not
+            16-byte aligned, or other constraints are violated.
+        """
+        cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor)
+
+        # Obtain a StridedMemoryView from the tensor
+        if isinstance(tensor, StridedMemoryView):
+            view = tensor
+        else:
+            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
+
+        if not view.is_device_accessible:
+            raise ValueError("The tensor must be device-accessible")
+
+        desc._source_ref = tensor
+
+        tma_dt = _resolve_data_type(view, data_type)
+        cdef int c_data_type_int = int(tma_dt)
+        cdef cydriver.CUtensorMapDataType c_data_type = <cydriver.CUtensorMapDataType>c_data_type_int
+
+        cdef intptr_t global_address = view.ptr
+        shape = view.shape
+        strides = view.strides
+
+        cdef int rank = len(shape)
+        if rank < 3 or rank > 5:
+            raise ValueError(
+                f"Im2col-wide tensor rank must be between 3 and 5, got {rank}")
+
+        if global_address % 16 != 0:
+            raise ValueError(
+                f"Global memory address must be 16-byte aligned, "
+                f"got address 0x{global_address:x}")
+
+        if element_strides is not None:
+            if len(element_strides) != rank:
+                raise ValueError(
+                    f"element_strides must have {rank} elements, got {len(element_strides)}")
+        else:
+            element_strides = (1,) * rank
+
+        cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt]
+        if strides is not None:
+            byte_strides = tuple(s * elem_size for s in strides)
+        else:
+            byte_strides = []
+            stride = elem_size
+            for i in range(rank - 1, -1, -1):
+                byte_strides.append(stride)
+                stride *= shape[i]
+            byte_strides.reverse()
+
+        # Reverse all dimension arrays for column-major convention
+        cdef uint64_t[5] c_global_dim
+        cdef uint64_t[4] c_global_strides
+        cdef uint32_t[5] c_element_strides
+        cdef int i_c
+
+        for i_c in range(rank):
+            c_global_dim[i_c] = <uint64_t>shape[rank - 1 - i_c]
+            c_element_strides[i_c] = <uint32_t>element_strides[rank - 1 - i_c]
+
+        for i_c in range(rank - 1):
+            c_global_strides[i_c] = <uint64_t>byte_strides[rank - 2 - i_c]
+
+        cdef uint32_t c_rank = <uint32_t>rank
+        cdef int c_lower_w = <int>pixel_box_lower_corner_width
+        cdef int c_upper_w = <int>pixel_box_upper_corner_width
+        cdef uint32_t c_channels = <uint32_t>channels_per_pixel
+        cdef uint32_t c_pixels = <uint32_t>pixels_per_column
+        cdef int c_interleave_int = int(interleave)
+        cdef int c_mode_int = int(mode)
+        cdef int c_swizzle_int = int(swizzle)
+        cdef int c_l2_promotion_int = int(l2_promotion)
+        cdef int c_oob_fill_int = int(oob_fill)
+        cdef cydriver.CUtensorMapInterleave c_interleave = <cydriver.CUtensorMapInterleave>c_interleave_int
+        cdef cydriver.CUtensorMapIm2ColWideMode c_mode = <cydriver.CUtensorMapIm2ColWideMode>c_mode_int
+        cdef cydriver.CUtensorMapSwizzle c_swizzle = <cydriver.CUtensorMapSwizzle>c_swizzle_int
+        cdef cydriver.CUtensorMapL2promotion c_l2_promotion = <cydriver.CUtensorMapL2promotion>c_l2_promotion_int
+        cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = <cydriver.CUtensorMapFloatOOBfill>c_oob_fill_int
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide(
+                &desc._tensor_map,
+                c_data_type,
+                c_rank,
+                <void*>global_address,
+                c_global_dim,
+                c_global_strides,
+                c_lower_w,
+                c_upper_w,
+                c_channels,
+                c_pixels,
+                c_element_strides,
+                c_interleave,
+                c_mode,
+                c_swizzle,
+                c_l2_promotion,
+                c_oob_fill,
+            ))
+
+        desc._repr_info = {
+            "method": "im2col_wide",
+            "rank": rank,
+            "data_type": tma_dt,
+            "swizzle": swizzle,
+        }
+
         return desc
 
     def replace_address(self, tensor):
@@ -512,13 +699,28 @@ cdef class TensorMapDescriptor:
                 f"Global memory address must be 16-byte aligned, "
                 f"got address 0x{global_address:x}")
 
-        self._source_ref = tensor
-
         with nogil:
             HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress(
                 &self._tensor_map,
                 <void*>global_address,
             ))
 
+        # Update the source reference only after the driver call succeeds,
+        # so we don't drop the old tensor (risking a dangling pointer in the
+        # CUtensorMap struct) if the call fails.
+        self._source_ref = tensor
+
     def __repr__(self):
-        return f"TensorMapDescriptor()"
+        info = self._repr_info
+        if info is None:
+            return "TensorMapDescriptor()"
+        parts = []
+        if "method" in info:
+            parts.append(info["method"])
+        if "rank" in info:
+            parts.append(f"rank={info['rank']}")
+        if "data_type" in info:
+            parts.append(f"dtype={info['data_type'].name}")
+        if "swizzle" in info:
+            parts.append(f"swizzle={info['swizzle'].name}")
+        return f"TensorMapDescriptor({', '.join(parts)})"
diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml
index 42fa19f59e..a8d3da7644 100644
--- a/cuda_core/pixi.toml
+++ b/cuda_core/pixi.toml
@@ -54,16 +54,9 @@ cuda = "13"
 [feature.cu13.dependencies]
 cuda-version = "13.1.*"
 
-[feature.cu12.system-requirements]
-cuda = "12"
-
-[feature.cu12.dependencies]
-cuda-version = "12.*"
-
-# We keep both cu12 and cu13 because cuda.core works with either major version
-# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1)
-#       For cu12 testing, use conda-forge packages: temporarily change path to wildcard
-#       or skip cu12 locally: pixi run -e cu13 test
+# NOTE: cu12 environment is intentionally omitted because the path dependency
+# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing,
+# use conda-forge packages or CI workflows.
 [environments]
 default = { features = [
     "cu13",
@@ -71,7 +64,6 @@ default = { features = [
     "cython-tests",
 ], solve-group = "default" }
 cu13 = { features = ["cu13", "test", "cython-tests"], solve-group = "default" }
-cu12 = { features = ["cu12", "test", "cython-tests"], solve-group = "cu12" }
 
 # TODO: check if these can be extracted from pyproject.toml
 [package]
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
index 0aa5968a85..f57e73ded8 100644
--- a/cuda_core/tests/test_tensor_map.py
+++ b/cuda_core/tests/test_tensor_map.py
@@ -9,6 +9,7 @@
     Device,
     TensorMapDescriptor,
     TensorMapDataType,
+    TensorMapIm2ColWideMode,
     TensorMapInterleave,
     TensorMapL2Promotion,
     TensorMapOOBFill,
@@ -36,6 +37,23 @@ def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256):
     return buf
 
 
+class _DeviceArray:
+    """Wrap a Buffer with explicit shape via __cuda_array_interface__.
+
+    dev.allocate() returns a 1D byte buffer. For multi-dimensional TMA tests
+    we need the tensor to report a proper shape/dtype so the TMA encoder sees
+    the correct rank, dimensions, and strides.
+    """
+    def __init__(self, buf, shape, dtype=np.float32):
+        self._buf = buf  # prevent GC
+        self.__cuda_array_interface__ = {
+            "shape": tuple(shape),
+            "typestr": np.dtype(dtype).str,
+            "data": (int(buf.handle), False),
+            "version": 3,
+        }
+
+
 class TestTensorMapEnums:
     """Test that enum wrappers expose the expected values."""
 
@@ -66,6 +84,10 @@ def test_oob_fill_values(self):
         assert TensorMapOOBFill.NONE == 0
         assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1
 
+    def test_im2col_wide_mode_values(self):
+        assert TensorMapIm2ColWideMode.W == 0
+        assert TensorMapIm2ColWideMode.W128 == 1
+
 
 class TestTensorMapDescriptorCreation:
     """Test TensorMapDescriptor factory methods."""
@@ -82,7 +104,7 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma):
             data_type=TensorMapDataType.FLOAT32,
         )
         assert desc is not None
-        assert repr(desc) == "TensorMapDescriptor()"
+        assert repr(desc) == "TensorMapDescriptor(tiled, rank=1, dtype=FLOAT32, swizzle=NONE)"
 
     def test_from_tiled_2d(self, dev, skip_if_no_tma):
         buf = dev.allocate(64 * 64 * 4)  # 64x64 float32
@@ -102,6 +124,18 @@ def test_from_tiled_3d(self, dev, skip_if_no_tma):
         )
         assert desc is not None
 
+    def test_from_tiled_5d(self, dev, skip_if_no_tma):
+        # 5D: exercises all 5 c_global_dim / 4 c_global_strides slots
+        shape = (2, 4, 4, 4, 8)
+        n_bytes = 2 * 4 * 4 * 4 * 8 * 4  # float32
+        buf = dev.allocate(n_bytes)
+        tensor = _DeviceArray(buf, shape)
+        desc = TensorMapDescriptor.from_tiled(
+            tensor,
+            box_dim=(1, 2, 2, 2, 8),
+        )
+        assert desc is not None
+
     def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma):
         buf = dev.allocate(64 * 64 * 4)
         desc = TensorMapDescriptor.from_tiled(
@@ -277,3 +311,100 @@ def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma):
                 pixels_per_column=4,
                 data_type=TensorMapDataType.FLOAT32,
             )
+
+    def test_from_im2col_4d(self, dev, skip_if_no_tma):
+        # NHWC layout: N=1, H=8, W=8, C=64 — 2 spatial dims
+        # Exercises spatial corner reversal with n_spatial=2:
+        #   Python [H_lower, W_lower] -> driver [W_lower, H_lower]
+        shape = (1, 8, 8, 64)
+        buf = dev.allocate(1 * 8 * 8 * 64 * 4)
+        tensor = _DeviceArray(buf, shape)
+        desc = TensorMapDescriptor.from_im2col(
+            tensor,
+            pixel_box_lower_corner=(0, 0),
+            pixel_box_upper_corner=(4, 4),
+            channels_per_pixel=64,
+            pixels_per_column=16,
+        )
+        assert desc is not None
+
+    def test_from_im2col_5d(self, dev, skip_if_no_tma):
+        # NDHWC layout: N=1, D=4, H=8, W=8, C=64 — 3 spatial dims
+        # Exercises the full spatial corner reversal:
+        #   Python [D, H, W] -> driver [W, H, D]
+        shape = (1, 4, 8, 8, 64)
+        buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4)
+        tensor = _DeviceArray(buf, shape)
+        desc = TensorMapDescriptor.from_im2col(
+            tensor,
+            pixel_box_lower_corner=(0, 0, 0),
+            pixel_box_upper_corner=(2, 4, 4),
+            channels_per_pixel=64,
+            pixels_per_column=32,
+        )
+        assert desc is not None
+
+
+class TestTensorMapIm2colWide:
+    """Test im2col-wide TMA descriptor creation (compute capability 10.0+)."""
+
+    @pytest.fixture
+    def skip_if_no_im2col_wide(self, dev):
+        cc = dev.compute_capability
+        if cc.major < 10:
+            pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)")
+
+    def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide):
+        # 3D tensor: batch=1, width=32, channels=64
+        buf = dev.allocate(1 * 32 * 64 * 4)
+        desc = TensorMapDescriptor.from_im2col_wide(
+            buf,
+            pixel_box_lower_corner_width=0,
+            pixel_box_upper_corner_width=4,
+            channels_per_pixel=64,
+            pixels_per_column=4,
+            data_type=TensorMapDataType.FLOAT32,
+        )
+        assert desc is not None
+
+    def test_from_im2col_wide_4d(self, dev, skip_if_no_im2col_wide):
+        # NHWC layout: N=1, H=8, W=8, C=64
+        # Wide mode only uses scalar W corners, even with higher rank
+        shape = (1, 8, 8, 64)
+        buf = dev.allocate(1 * 8 * 8 * 64 * 4)
+        tensor = _DeviceArray(buf, shape)
+        desc = TensorMapDescriptor.from_im2col_wide(
+            tensor,
+            pixel_box_lower_corner_width=0,
+            pixel_box_upper_corner_width=4,
+            channels_per_pixel=64,
+            pixels_per_column=16,
+        )
+        assert desc is not None
+
+    def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide):
+        # NDHWC layout: N=1, D=4, H=8, W=8, C=64
+        # Max rank boundary — verifies all 5 dim/stride slots are filled
+        shape = (1, 4, 8, 8, 64)
+        buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4)
+        tensor = _DeviceArray(buf, shape)
+        desc = TensorMapDescriptor.from_im2col_wide(
+            tensor,
+            pixel_box_lower_corner_width=0,
+            pixel_box_upper_corner_width=4,
+            channels_per_pixel=64,
+            pixels_per_column=32,
+        )
+        assert desc is not None
+
+    def test_from_im2col_wide_rank_validation(self, dev, skip_if_no_im2col_wide):
+        buf = dev.allocate(1024 * 4)
+        with pytest.raises(ValueError, match="Im2col-wide tensor rank must be between 3 and 5"):
+            TensorMapDescriptor.from_im2col_wide(
+                buf,
+                pixel_box_lower_corner_width=0,
+                pixel_box_upper_corner_width=4,
+                channels_per_pixel=64,
+                pixels_per_column=4,
+                data_type=TensorMapDataType.FLOAT32,
+            )

From beb1c124648201bc5c309d2d00ec5001ebbdcf0d Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 24 Feb 2026 15:15:56 -0800
Subject: [PATCH 3/9] clean up

---
 cuda_core/cuda/core/_tensor_map.pxd  |   2 +-
 cuda_core/cuda/core/_tensor_map.pyx  | 185 ++++++++++-----------------
 cuda_core/examples/tma_tensor_map.py |   2 +-
 cuda_core/tests/test_tensor_map.py   |   2 +-
 4 files changed, 72 insertions(+), 119 deletions(-)

diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd
index fa2254ba70..b2b165b5aa 100644
--- a/cuda_core/cuda/core/_tensor_map.pxd
+++ b/cuda_core/cuda/core/_tensor_map.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx
index 6b072f7404..9edcccb928 100644
--- a/cuda_core/cuda/core/_tensor_map.pyx
+++ b/cuda_core/cuda/core/_tensor_map.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -150,6 +150,55 @@ def _resolve_data_type(view, data_type):
     return tma_dt
 
 
+def _get_validated_view(tensor):
+    """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer."""
+    if isinstance(tensor, StridedMemoryView):
+        view = tensor
+    else:
+        # stream_ptr=-1: no stream synchronization needed because descriptor
+        # creation only reads tensor metadata, it does not move data.
+        view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
+
+    if not view.is_device_accessible:
+        raise ValueError("The tensor must be device-accessible")
+
+    if view.ptr % 16 != 0:
+        raise ValueError(
+            f"Global memory address must be 16-byte aligned, "
+            f"got address 0x{view.ptr:x}")
+
+    return view
+
+
+def _compute_byte_strides(shape, strides, elem_size):
+    """Compute byte strides from element strides or C-contiguous fallback.
+
+    Returns a tuple of byte strides in row-major order.
+    """
+    if strides is not None:
+        return tuple(s * elem_size for s in strides)
+
+    # C-contiguous: compute byte strides from shape, innermost first
+    rank = len(shape)
+    byte_strides = []
+    stride = elem_size
+    for i in range(rank - 1, -1, -1):
+        byte_strides.append(stride)
+        stride *= shape[i]
+    byte_strides.reverse()
+    return tuple(byte_strides)
+
+
+def _validate_element_strides(element_strides, rank):
+    """Validate or default element_strides to all-ones."""
+    if element_strides is not None:
+        if len(element_strides) != rank:
+            raise ValueError(
+                f"element_strides must have {rank} elements, got {len(element_strides)}")
+        return element_strides
+    return (1,) * rank
+
+
 cdef class TensorMapDescriptor:
     """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs.
 
@@ -171,8 +220,8 @@ cdef class TensorMapDescriptor:
     cdef void* _get_data_ptr(self):
         return <void*>&self._tensor_map
 
-    @staticmethod
-    def from_tiled(tensor, box_dim, *,
+    @classmethod
+    def from_tiled(cls, tensor, box_dim, *,
                    element_strides=None,
                    data_type=None,
                    interleave=TensorMapInterleave.NONE,
@@ -216,40 +265,23 @@ cdef class TensorMapDescriptor:
             If the tensor rank is outside [1, 5], the pointer is not
             16-byte aligned, or dimension/stride constraints are violated.
         """
-        cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor)
+        cdef TensorMapDescriptor desc = cls.__new__(cls)
 
-        # Obtain a StridedMemoryView from the tensor
-        if isinstance(tensor, StridedMemoryView):
-            view = tensor
-        else:
-            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
-
-        if not view.is_device_accessible:
-            raise ValueError("The tensor must be device-accessible")
-
-        # Keep a strong reference to prevent GC
+        view = _get_validated_view(tensor)
         desc._source_ref = tensor
 
-        # Resolve data type
         tma_dt = _resolve_data_type(view, data_type)
         cdef int c_data_type_int = int(tma_dt)
         cdef cydriver.CUtensorMapDataType c_data_type = <cydriver.CUtensorMapDataType>c_data_type_int
 
-        # Get tensor metadata
         cdef intptr_t global_address = view.ptr
         shape = view.shape
-        strides = view.strides  # in elements, can be None for C-contiguous
 
         cdef int rank = len(shape)
         if rank < 1 or rank > 5:
             raise ValueError(
                 f"Tensor rank must be between 1 and 5, got {rank}")
 
-        if global_address % 16 != 0:
-            raise ValueError(
-                f"Global memory address must be 16-byte aligned, "
-                f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)")
-
         if len(box_dim) != rank:
             raise ValueError(
                 f"box_dim must have {rank} elements (same as tensor rank), "
@@ -260,25 +292,10 @@ cdef class TensorMapDescriptor:
                 raise ValueError(
                     f"box_dim[{i}] must be in [1, 256], got {bd}")
 
-        if element_strides is not None:
-            if len(element_strides) != rank:
-                raise ValueError(
-                    f"element_strides must have {rank} elements, got {len(element_strides)}")
-        else:
-            element_strides = (1,) * rank
+        element_strides = _validate_element_strides(element_strides, rank)
 
-        # Compute byte strides from element strides
         cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt]
-        if strides is not None:
-            byte_strides = tuple(s * elem_size for s in strides)
-        else:
-            # C-contiguous: strides in bytes, row-major
-            byte_strides = []
-            stride = elem_size
-            for i in range(rank - 1, -1, -1):
-                byte_strides.append(stride)
-                stride *= shape[i]
-            byte_strides.reverse()
+        byte_strides = _compute_byte_strides(shape, view.strides, elem_size)
 
         # Reverse dimensions for column-major cuTensorMap convention
         # Python/DLPack: row-major (dim 0 = outermost)
@@ -335,8 +352,8 @@ cdef class TensorMapDescriptor:
 
         return desc
 
-    @staticmethod
-    def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner,
+    @classmethod
+    def from_im2col(cls, tensor, pixel_box_lower_corner, pixel_box_upper_corner,
                     channels_per_pixel, pixels_per_column, *,
                     element_strides=None,
                     data_type=None,
@@ -390,17 +407,9 @@ cdef class TensorMapDescriptor:
             If the tensor rank is outside [3, 5], the pointer is not
             16-byte aligned, or other constraints are violated.
         """
-        cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor)
-
-        # Obtain a StridedMemoryView from the tensor
-        if isinstance(tensor, StridedMemoryView):
-            view = tensor
-        else:
-            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
-
-        if not view.is_device_accessible:
-            raise ValueError("The tensor must be device-accessible")
+        cdef TensorMapDescriptor desc = cls.__new__(cls)
 
+        view = _get_validated_view(tensor)
         desc._source_ref = tensor
 
         tma_dt = _resolve_data_type(view, data_type)
@@ -409,18 +418,12 @@ cdef class TensorMapDescriptor:
 
         cdef intptr_t global_address = view.ptr
         shape = view.shape
-        strides = view.strides
 
         cdef int rank = len(shape)
         if rank < 3 or rank > 5:
             raise ValueError(
                 f"Im2col tensor rank must be between 3 and 5, got {rank}")
 
-        if global_address % 16 != 0:
-            raise ValueError(
-                f"Global memory address must be 16-byte aligned, "
-                f"got address 0x{global_address:x}")
-
         cdef int n_spatial = rank - 2
         if len(pixel_box_lower_corner) != n_spatial:
             raise ValueError(
@@ -431,23 +434,10 @@ cdef class TensorMapDescriptor:
                 f"pixel_box_upper_corner must have {n_spatial} elements "
                 f"(rank - 2), got {len(pixel_box_upper_corner)}")
 
-        if element_strides is not None:
-            if len(element_strides) != rank:
-                raise ValueError(
-                    f"element_strides must have {rank} elements, got {len(element_strides)}")
-        else:
-            element_strides = (1,) * rank
+        element_strides = _validate_element_strides(element_strides, rank)
 
         cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt]
-        if strides is not None:
-            byte_strides = tuple(s * elem_size for s in strides)
-        else:
-            byte_strides = []
-            stride = elem_size
-            for i in range(rank - 1, -1, -1):
-                byte_strides.append(stride)
-                stride *= shape[i]
-            byte_strides.reverse()
+        byte_strides = _compute_byte_strides(shape, view.strides, elem_size)
 
         # Reverse all dimension arrays for column-major convention
         cdef uint64_t[5] c_global_dim
@@ -509,8 +499,8 @@ cdef class TensorMapDescriptor:
 
         return desc
 
-    @staticmethod
-    def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width,
+    @classmethod
+    def from_im2col_wide(cls, tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width,
                          channels_per_pixel, pixels_per_column, *,
                          element_strides=None,
                          data_type=None,
@@ -565,17 +555,9 @@ cdef class TensorMapDescriptor:
             If the tensor rank is outside [3, 5], the pointer is not
             16-byte aligned, or other constraints are violated.
         """
-        cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor)
-
-        # Obtain a StridedMemoryView from the tensor
-        if isinstance(tensor, StridedMemoryView):
-            view = tensor
-        else:
-            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
-
-        if not view.is_device_accessible:
-            raise ValueError("The tensor must be device-accessible")
+        cdef TensorMapDescriptor desc = cls.__new__(cls)
 
+        view = _get_validated_view(tensor)
         desc._source_ref = tensor
 
         tma_dt = _resolve_data_type(view, data_type)
@@ -584,35 +566,16 @@ cdef class TensorMapDescriptor:
 
         cdef intptr_t global_address = view.ptr
         shape = view.shape
-        strides = view.strides
 
         cdef int rank = len(shape)
         if rank < 3 or rank > 5:
             raise ValueError(
                 f"Im2col-wide tensor rank must be between 3 and 5, got {rank}")
 
-        if global_address % 16 != 0:
-            raise ValueError(
-                f"Global memory address must be 16-byte aligned, "
-                f"got address 0x{global_address:x}")
-
-        if element_strides is not None:
-            if len(element_strides) != rank:
-                raise ValueError(
-                    f"element_strides must have {rank} elements, got {len(element_strides)}")
-        else:
-            element_strides = (1,) * rank
+        element_strides = _validate_element_strides(element_strides, rank)
 
         cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt]
-        if strides is not None:
-            byte_strides = tuple(s * elem_size for s in strides)
-        else:
-            byte_strides = []
-            stride = elem_size
-            for i in range(rank - 1, -1, -1):
-                byte_strides.append(stride)
-                stride *= shape[i]
-            byte_strides.reverse()
+        byte_strides = _compute_byte_strides(shape, view.strides, elem_size)
 
         # Reverse all dimension arrays for column-major convention
         cdef uint64_t[5] c_global_dim
@@ -685,19 +648,9 @@ cdef class TensorMapDescriptor:
             or a :obj:`~cuda.core.StridedMemoryView`. Must refer to
             device-accessible memory with a 16-byte-aligned pointer.
         """
-        if isinstance(tensor, StridedMemoryView):
-            view = tensor
-        else:
-            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
-
-        if not view.is_device_accessible:
-            raise ValueError("The tensor must be device-accessible")
+        view = _get_validated_view(tensor)
 
         cdef intptr_t global_address = view.ptr
-        if global_address % 16 != 0:
-            raise ValueError(
-                f"Global memory address must be 16-byte aligned, "
-                f"got address 0x{global_address:x}")
 
         with nogil:
             HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress(
diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py
index de51570ec4..e93c02dc08 100644
--- a/cuda_core/examples/tma_tensor_map.py
+++ b/cuda_core/examples/tma_tensor_map.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
index f57e73ded8..790307857b 100644
--- a/cuda_core/tests/test_tensor_map.py
+++ b/cuda_core/tests/test_tensor_map.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import pytest

From 6f689175e55399f87def6ab3ca374a844bdbd01b Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 24 Feb 2026 15:55:00 -0800
Subject: [PATCH 4/9] Add comments to prepare_tensor_map_arg explaining
 allocation and lifetime

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 cuda_core/cuda/core/_kernel_arg_handler.pyx | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx
index e88def13cd..b55a4abc7a 100644
--- a/cuda_core/cuda/core/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx
@@ -135,8 +135,13 @@ cdef inline int prepare_tensor_map_arg(
         vector.vector[void*]& data_addresses,
         TensorMapDescriptor arg,
         const size_t idx) except -1:
+    # Allocate a temporary buffer for the 128-byte CUtensorMap struct.
+    # We copy rather than pointing directly at arg._tensor_map for lifetime
+    # safety: ParamHolder owns and frees its argument buffers independently.
     cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap))
     memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap))
+    # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__,
+    # data_addresses[idx] is the pointer passed to cuLaunchKernel.
     data_addresses[idx] = ptr
     data[idx] = ptr
     return 0

From 53c4a3db2c44dfb5b39c7e45f12990a9437098b2 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 24 Feb 2026 17:04:26 -0800
Subject: [PATCH 5/9] Address Copilot review feedback

- Remove unused _alloc_device_tensor helper from tests
- Add test for rank > 5 (6D tensor) to verify upper bound validation
- Add NULL check for PyMem_Malloc in prepare_tensor_map_arg

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 cuda_core/cuda/core/_kernel_arg_handler.pyx |  2 ++
 cuda_core/tests/test_tensor_map.py          | 21 +++++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx
index b55a4abc7a..5619ba7492 100644
--- a/cuda_core/cuda/core/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx
@@ -139,6 +139,8 @@ cdef inline int prepare_tensor_map_arg(
     # We copy rather than pointing directly at arg._tensor_map for lifetime
     # safety: ParamHolder owns and frees its argument buffers independently.
     cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap))
+    if ptr is NULL:
+        raise MemoryError("Failed to allocate memory for CUtensorMap")
     memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap))
     # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__,
     # data_addresses[idx] is the pointer passed to cuLaunchKernel.
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
index 790307857b..96dd14a419 100644
--- a/cuda_core/tests/test_tensor_map.py
+++ b/cuda_core/tests/test_tensor_map.py
@@ -28,14 +28,6 @@ def skip_if_no_tma(dev):
         pytest.skip("Device does not support TMA (requires compute capability 9.0+)")
 
 
-def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256):
-    """Allocate a device buffer and return it with proper alignment."""
-    n_elements = 1
-    for s in shape:
-        n_elements *= s
-    buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment)
-    return buf
-
 
 class _DeviceArray:
     """Wrap a Buffer with explicit shape via __cuda_array_interface__.
@@ -179,6 +171,19 @@ def test_invalid_rank_zero(self, dev, skip_if_no_tma):
                 data_type=TensorMapDataType.FLOAT32,
             )
 
+    def test_invalid_rank_six(self, dev, skip_if_no_tma):
+        shape = (2, 2, 2, 2, 2, 2)
+        n_elements = 1
+        for s in shape:
+            n_elements *= s
+        buf = dev.allocate(n_elements * 4)
+        arr = _DeviceArray(buf, shape)
+        with pytest.raises(ValueError, match="rank must be between 1 and 5"):
+            TensorMapDescriptor.from_tiled(
+                arr,
+                box_dim=(2,) * 6,
+            )
+
     def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma):
         buf = dev.allocate(1024 * 4)
         with pytest.raises(ValueError, match="box_dim must have 1 elements"):

From b1b2d3fa9c1ce7d0d38d4735be2aa38de800e4b2 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 24 Feb 2026 17:20:05 -0800
Subject: [PATCH 6/9] Split TMA example into two focused files

Move the replace_address() demonstration into its own self-contained
example (tma_replace_address.py) so each file covers a single concept.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 cuda_core/examples/tma_replace_address.py | 189 ++++++++++++++++++++++
 cuda_core/examples/tma_tensor_map.py      |  20 ---
 2 files changed, 189 insertions(+), 20 deletions(-)
 create mode 100644 cuda_core/examples/tma_replace_address.py

diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py
new file mode 100644
index 0000000000..7f11cf551c
--- /dev/null
+++ b/cuda_core/examples/tma_replace_address.py
@@ -0,0 +1,189 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates how to use replace_address() to repoint a TMA
+# (Tensor Memory Accelerator) descriptor at a different tensor without
+# rebuilding the descriptor from scratch.
+#
+# The workflow is:
+#
+#   1. Create a TMA tiled descriptor and launch a kernel to verify it works
+#   2. Allocate a second tensor with different content
+#   3. Call replace_address() to repoint the same descriptor at the new tensor
+#   4. Re-launch the kernel and verify it reads from the new tensor
+#
+# This is useful when the tensor layout (shape, dtype, tile size) stays the
+# same but the underlying data buffer changes, e.g. double-buffering or
+# iterating over a sequence of same-shaped tensors.
+#
+# Requirements:
+#   - Hopper or later GPU (compute capability >= 9.0)
+#   - CuPy
+#   - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set)
+#
+# ################################################################################
+
+import sys
+
+import cupy as cp
+import numpy as np
+
+from cuda.core import (
+    Device,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    TensorMapDescriptor,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Check for Hopper+ GPU
+# ---------------------------------------------------------------------------
+dev = Device()
+arch = dev.compute_capability
+if arch < (9, 0):
+    print(
+        "TMA requires compute capability >= 9.0 (Hopper or later)",
+        file=sys.stderr,
+    )
+    sys.exit(0)
+dev.set_current()
+
+arch_str = "".join(f"{i}" for i in arch)
+
+# ---------------------------------------------------------------------------
+# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then
+# copies the tile to an output buffer so we can verify correctness.
+#
+# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be
+# compiled with NVRTC without pulling in the full driver-API header.
+#
+# Key points:
+#   - The tensor map is passed by value with __grid_constant__ so the TMA
+#     hardware can read it from grid-constant memory.
+#   - Thread 0 in each block issues the TMA load and manages the mbarrier.
+#   - All threads wait on the mbarrier, then copy from shared to global.
+# ---------------------------------------------------------------------------
+TILE_SIZE = 128  # elements per tile (must match the kernel constant)
+
+code = r"""
+// Minimal definition of the 128-byte opaque tensor map struct.
+struct __align__(64) TensorMap { unsigned long long opaque[16]; };
+
+static constexpr int TILE_SIZE = 128;
+
+extern "C"
+__global__ void tma_copy(
+    const __grid_constant__ TensorMap tensor_map,
+    float* output,
+    int N)
+{
+    __shared__ __align__(128) float smem[TILE_SIZE];
+    __shared__ __align__(8) unsigned long long mbar;
+
+    const int tid        = threadIdx.x;
+    const int tile_start = blockIdx.x * TILE_SIZE;
+
+    // ---- Thread 0: set up mbarrier and issue the TMA load ----
+    if (tid == 0)
+    {
+        // Initialise a single-phase mbarrier (1 arriving thread).
+        asm volatile(
+            "mbarrier.init.shared.b64 [%0], 1;"
+            :: "r"((unsigned)__cvta_generic_to_shared(&mbar)));
+
+        // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start'
+        // from the tensor described by 'tensor_map' into shared memory.
+        asm volatile(
+            "cp.async.bulk.tensor.1d.shared::cluster.global.tile"
+            ".mbarrier::complete_tx::bytes"
+            " [%0], [%1, {%2}], [%3];"
+            :: "r"((unsigned)__cvta_generic_to_shared(smem)),
+               "l"(&tensor_map),
+               "r"(tile_start),
+               "r"((unsigned)__cvta_generic_to_shared(&mbar)));
+
+        // Tell the mbarrier how many bytes the TMA will deliver.
+        asm volatile(
+            "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"
+            :: "r"((unsigned)__cvta_generic_to_shared(&mbar)),
+               "r"((unsigned)(TILE_SIZE * sizeof(float))));
+    }
+
+    __syncthreads();
+
+    // ---- Wait for the TMA load to complete ----
+    if (tid == 0)
+    {
+        asm volatile(
+            "{ .reg .pred P;                                           \n"
+            "WAIT:                                                     \n"
+            "  mbarrier.try_wait.parity.shared.b64 P, [%0], 0;         \n"
+            "  @!P bra WAIT;                                           \n"
+            "}                                                         \n"
+            :: "r"((unsigned)__cvta_generic_to_shared(&mbar)));
+    }
+
+    __syncthreads();
+
+    // ---- Copy the tile from shared memory to the output buffer ----
+    if (tid < TILE_SIZE)
+    {
+        const int idx = tile_start + tid;
+        if (idx < N)
+            output[idx] = smem[tid];
+    }
+}
+"""
+
+# ---------------------------------------------------------------------------
+# Compile the kernel
+# ---------------------------------------------------------------------------
+prog = Program(
+    code,
+    code_type="c++",
+    options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"),
+)
+mod = prog.compile("cubin")
+ker = mod.get_kernel("tma_copy")
+
+# ---------------------------------------------------------------------------
+# 1) Prepare input data and verify the initial TMA copy
+# ---------------------------------------------------------------------------
+N = 1024
+a = cp.arange(N, dtype=cp.float32)  # [0, 1, 2, ..., N-1]
+output = cp.zeros(N, dtype=cp.float32)
+dev.sync()  # cupy uses its own stream
+
+tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,))
+
+n_tiles = N // TILE_SIZE
+config = LaunchConfig(grid=n_tiles, block=TILE_SIZE)
+launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N))
+dev.sync()
+
+assert cp.array_equal(output, a), "TMA copy produced incorrect results"
+print(f"TMA copy verified: {N} elements across {n_tiles} tiles")
+
+# ---------------------------------------------------------------------------
+# 2) Demonstrate replace_address()
+#    Create a second tensor with different content, point the *same*
+#    descriptor at it, and re-launch without rebuilding the descriptor.
+# ---------------------------------------------------------------------------
+b = cp.full(N, fill_value=42.0, dtype=cp.float32)
+dev.sync()
+
+tensor_map.replace_address(b)
+
+output2 = cp.zeros(N, dtype=cp.float32)
+dev.sync()
+
+launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N))
+dev.sync()
+
+assert cp.array_equal(output2, b), "replace_address produced incorrect results"
+print("replace_address verified: descriptor reused with new source tensor")
diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py
index e93c02dc08..63c0ac7b5c 100644
--- a/cuda_core/examples/tma_tensor_map.py
+++ b/cuda_core/examples/tma_tensor_map.py
@@ -13,7 +13,6 @@
 #   1. Creating a TMA tiled descriptor from a CuPy device array
 #   2. Passing the descriptor to a kernel via launch()
 #   3. Using TMA to load tiles into shared memory (via inline PTX)
-#   4. Updating the descriptor's source address with replace_address()
 #
 # Requirements:
 #   - Hopper or later GPU (compute capability >= 9.0)
@@ -174,22 +173,3 @@
 
 assert cp.array_equal(output, a), "TMA copy produced incorrect results"
 print(f"TMA copy verified: {N} elements across {n_tiles} tiles")
-
-# ---------------------------------------------------------------------------
-# 4) Demonstrate replace_address()
-#    Create a second tensor with different content, point the *same*
-#    descriptor at it, and re-launch without rebuilding the descriptor.
-# ---------------------------------------------------------------------------
-b = cp.full(N, fill_value=42.0, dtype=cp.float32)
-dev.sync()
-
-tensor_map.replace_address(b)
-
-output2 = cp.zeros(N, dtype=cp.float32)
-dev.sync()
-
-launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N))
-dev.sync()
-
-assert cp.array_equal(output2, b), "replace_address produced incorrect results"
-print("replace_address verified: descriptor reused with new source tensor")

From 0d884c294f1086d7d20bee077bb4f006c06868ad Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 25 Feb 2026 14:14:38 -0800
Subject: [PATCH 7/9] pre-commit

---
 cuda_core/cuda/core/__init__.py           | 14 ++++-----
 cuda_core/examples/tma_replace_address.py |  1 -
 cuda_core/examples/tma_tensor_map.py      |  1 -
 cuda_core/tests/test_tensor_map.py        | 37 ++++++++++++-----------
 4 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index feeb23797c..2a0af71b36 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -62,18 +62,18 @@
 )
 from cuda.core._module import Kernel, ObjectCode  # noqa: E402
 from cuda.core._program import Program, ProgramOptions  # noqa: E402
+from cuda.core._stream import (  # noqa: E402
+    LEGACY_DEFAULT_STREAM,
+    PER_THREAD_DEFAULT_STREAM,
+    Stream,
+    StreamOptions,
+)
 from cuda.core._tensor_map import (  # noqa: E402
-    TensorMapDescriptor,
     TensorMapDataType,
+    TensorMapDescriptor,
     TensorMapIm2ColWideMode,
     TensorMapInterleave,
     TensorMapL2Promotion,
     TensorMapOOBFill,
     TensorMapSwizzle,
 )
-from cuda.core._stream import (  # noqa: E402
-    LEGACY_DEFAULT_STREAM,
-    PER_THREAD_DEFAULT_STREAM,
-    Stream,
-    StreamOptions,
-)
diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py
index 7f11cf551c..21fb91c254 100644
--- a/cuda_core/examples/tma_replace_address.py
+++ b/cuda_core/examples/tma_replace_address.py
@@ -30,7 +30,6 @@
 
 import cupy as cp
 import numpy as np
-
 from cuda.core import (
     Device,
     LaunchConfig,
diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py
index 63c0ac7b5c..c85c5d96bd 100644
--- a/cuda_core/examples/tma_tensor_map.py
+++ b/cuda_core/examples/tma_tensor_map.py
@@ -25,7 +25,6 @@
 
 import cupy as cp
 import numpy as np
-
 from cuda.core import (
     Device,
     LaunchConfig,
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
index 96dd14a419..1d6e8195b6 100644
--- a/cuda_core/tests/test_tensor_map.py
+++ b/cuda_core/tests/test_tensor_map.py
@@ -1,14 +1,12 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import pytest
-
 import numpy as np
-
+import pytest
 from cuda.core import (
     Device,
-    TensorMapDescriptor,
     TensorMapDataType,
+    TensorMapDescriptor,
     TensorMapIm2ColWideMode,
     TensorMapInterleave,
     TensorMapL2Promotion,
@@ -28,7 +26,6 @@ def skip_if_no_tma(dev):
         pytest.skip("Device does not support TMA (requires compute capability 9.0+)")
 
 
-
 class _DeviceArray:
     """Wrap a Buffer with explicit shape via __cuda_array_interface__.
 
@@ -36,6 +33,7 @@ class _DeviceArray:
     we need the tensor to report a proper shape/dtype so the TMA encoder sees
     the correct rank, dimensions, and strides.
     """
+
     def __init__(self, buf, shape, dtype=np.float32):
         self._buf = buf  # prevent GC
         self.__cuda_array_interface__ = {
@@ -225,25 +223,30 @@ def test_invalid_data_type(self, dev, skip_if_no_tma):
 class TestTensorMapDtypeMapping:
     """Test automatic dtype inference from numpy dtypes."""
 
-    @pytest.mark.parametrize("np_dtype,expected_tma_dt", [
-        (np.uint8, TensorMapDataType.UINT8),
-        (np.uint16, TensorMapDataType.UINT16),
-        (np.uint32, TensorMapDataType.UINT32),
-        (np.int32, TensorMapDataType.INT32),
-        (np.uint64, TensorMapDataType.UINT64),
-        (np.int64, TensorMapDataType.INT64),
-        (np.float16, TensorMapDataType.FLOAT16),
-        (np.float32, TensorMapDataType.FLOAT32),
-        (np.float64, TensorMapDataType.FLOAT64),
-    ])
+    @pytest.mark.parametrize(
+        "np_dtype,expected_tma_dt",
+        [
+            (np.uint8, TensorMapDataType.UINT8),
+            (np.uint16, TensorMapDataType.UINT16),
+            (np.uint32, TensorMapDataType.UINT32),
+            (np.int32, TensorMapDataType.INT32),
+            (np.uint64, TensorMapDataType.UINT64),
+            (np.int64, TensorMapDataType.INT64),
+            (np.float16, TensorMapDataType.FLOAT16),
+            (np.float32, TensorMapDataType.FLOAT32),
+            (np.float64, TensorMapDataType.FLOAT64),
+        ],
+    )
     def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma):
         from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA
+
         assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt
 
     def test_bfloat16_mapping(self):
         try:
-            from ml_dtypes import bfloat16
             from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA
+            from ml_dtypes import bfloat16
+
             assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16
         except ImportError:
             pytest.skip("ml_dtypes not installed")

From 63177590ca66a2efd45618b074bdccf4ed81dfe8 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 25 Feb 2026 15:07:54 -0800
Subject: [PATCH 8/9] adding stride meta data to gpu allocated memory

---
 cuda_core/tests/test_tensor_map.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
index 1d6e8195b6..3e030dd29d 100644
--- a/cuda_core/tests/test_tensor_map.py
+++ b/cuda_core/tests/test_tensor_map.py
@@ -98,8 +98,9 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma):
 
     def test_from_tiled_2d(self, dev, skip_if_no_tma):
         buf = dev.allocate(64 * 64 * 4)  # 64x64 float32
+        tensor = _DeviceArray(buf, (64, 64))
         desc = TensorMapDescriptor.from_tiled(
-            buf,
+            tensor,
             box_dim=(32, 32),
             data_type=TensorMapDataType.FLOAT32,
         )
@@ -107,8 +108,9 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma):
 
     def test_from_tiled_3d(self, dev, skip_if_no_tma):
         buf = dev.allocate(16 * 16 * 16 * 4)  # 16x16x16 float32
+        tensor = _DeviceArray(buf, (16, 16, 16))
         desc = TensorMapDescriptor.from_tiled(
-            buf,
+            tensor,
             box_dim=(8, 8, 8),
             data_type=TensorMapDataType.FLOAT32,
         )
@@ -128,8 +130,9 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma):
 
     def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma):
         buf = dev.allocate(64 * 64 * 4)
+        tensor = _DeviceArray(buf, (64, 64))
         desc = TensorMapDescriptor.from_tiled(
-            buf,
+            tensor,
             box_dim=(32, 32),
             data_type=TensorMapDataType.FLOAT32,
             swizzle=TensorMapSwizzle.SWIZZLE_128B,
@@ -138,8 +141,9 @@ def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma):
 
     def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma):
         buf = dev.allocate(64 * 64 * 4)
+        tensor = _DeviceArray(buf, (64, 64))
         desc = TensorMapDescriptor.from_tiled(
-            buf,
+            tensor,
             box_dim=(32, 32),
             data_type=TensorMapDataType.FLOAT32,
             l2_promotion=TensorMapL2Promotion.L2_128B,
@@ -148,8 +152,9 @@ def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma):
 
     def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma):
         buf = dev.allocate(64 * 64 * 4)
+        tensor = _DeviceArray(buf, (64, 64))
         desc = TensorMapDescriptor.from_tiled(
-            buf,
+            tensor,
             box_dim=(32, 32),
             data_type=TensorMapDataType.FLOAT32,
             oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA,
@@ -162,9 +167,10 @@ class TestTensorMapDescriptorValidation:
 
     def test_invalid_rank_zero(self, dev, skip_if_no_tma):
         buf = dev.allocate(64)
+        tensor = _DeviceArray(buf, ())  # 0-dim tensor
         with pytest.raises(ValueError, match="rank must be between 1 and 5"):
             TensorMapDescriptor.from_tiled(
-                buf,
+                tensor,
                 box_dim=(),
                 data_type=TensorMapDataType.FLOAT32,
             )
@@ -286,8 +292,9 @@ class TestTensorMapIm2col:
     def test_from_im2col_3d(self, dev, skip_if_no_tma):
         # 3D tensor: batch=1, height=32, channels=64
         buf = dev.allocate(1 * 32 * 64 * 4)
+        tensor = _DeviceArray(buf, (1, 32, 64))
         desc = TensorMapDescriptor.from_im2col(
-            buf,
+            tensor,
             pixel_box_lower_corner=(0,),
             pixel_box_upper_corner=(4,),
             channels_per_pixel=64,
@@ -310,9 +317,10 @@ def test_from_im2col_rank_validation(self, dev, skip_if_no_tma):
 
     def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma):
         buf = dev.allocate(1 * 32 * 64 * 4)
+        tensor = _DeviceArray(buf, (1, 32, 64))  # 3D: n_spatial = 1
         with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"):
             TensorMapDescriptor.from_im2col(
-                buf,
+                tensor,
                 pixel_box_lower_corner=(0, 0),
                 pixel_box_upper_corner=(4,),
                 channels_per_pixel=64,

From 4c866c53ecf08916f1749f7dc93a257aa702e399 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 25 Feb 2026 15:59:31 -0800
Subject: [PATCH 9/9] im2col fixes

---
 cuda_core/tests/test_tensor_map.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
index 3e030dd29d..8b5e27ce6f 100644
--- a/cuda_core/tests/test_tensor_map.py
+++ b/cuda_core/tests/test_tensor_map.py
@@ -373,8 +373,9 @@ def skip_if_no_im2col_wide(self, dev):
     def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide):
         # 3D tensor: batch=1, width=32, channels=64
         buf = dev.allocate(1 * 32 * 64 * 4)
+        tensor = _DeviceArray(buf, (1, 32, 64))
         desc = TensorMapDescriptor.from_im2col_wide(
-            buf,
+            tensor,
             pixel_box_lower_corner_width=0,
             pixel_box_upper_corner_width=4,
             channels_per_pixel=64,