NVIDIA · rwgk · Feb 24, 2026 · Feb 24, 2026
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
@@ -55,7 +55,7 @@ cu12 = ["cuda-bindings[all]==12.*"]
 cu13 = ["cuda-bindings[all]==13.*"]
 
 [dependency-groups]
-test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-randomly", "pytest-repeat"]
+test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures"]
 ml-dtypes = ["ml-dtypes>=0.5.4,<0.6.0"]
 test-cu12 = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"]  # runtime headers needed by CuPy
 test-cu13 = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"]  # runtime headers needed by CuPy

diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py
@@ -1,14 +1,15 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing
 import pickle
 import re
 
+import pytest
 from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions
 from cuda.core._utils.cuda_utils import CUDAError
 
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 POOL_SIZE = 2097152
 
@@ -17,6 +18,7 @@ class ChildErrorHarness:
     """Test harness for checking errors in child processes. Subclasses override
     PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples)."""
 
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         """Parent process that checks child errors."""
         # Attach fixtures to this object for convenience. These can be accessed

diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing as mp
@@ -10,13 +10,14 @@
 from helpers.logging import TimestampedLogger
 
 ENABLE_LOGGING = False  # Set True for test debugging and development
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 
 
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
 
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
         device = ipc_device
@@ -93,6 +94,7 @@ def child_main(self, log, q_in, q_out):
         log("done")
 
 
+@pytest.mark.flaky(reruns=2)
 def test_event_is_monadic(ipc_device):
     """Check that IPC-enabled events are always bound and cannot be reset."""
     device = ipc_device
@@ -108,6 +110,7 @@ def test_event_is_monadic(ipc_device):
         stream.record(e)
 
 
+@pytest.mark.flaky(reruns=2)
 @pytest.mark.parametrize(
     "options", [{"ipc_enabled": True, "enable_timing": True}, EventOptions(ipc_enabled=True, enable_timing=True)]
 )
@@ -125,6 +128,7 @@ class TestIpcEventProperties:
     process.
     """
 
+    @pytest.mark.flaky(reruns=2)
     @pytest.mark.parametrize("busy_waited_sync", [True, False])
     @pytest.mark.parametrize("use_options_cls", [True, False])
     @pytest.mark.parametrize("use_option_kw", [True, False])

diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 """Test for duplicate IPC buffer imports.
@@ -16,7 +16,7 @@
 from cuda.core import Buffer, Device
 from helpers.logging import TimestampedLogger
 
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 POOL_SIZE = 2097152
 
@@ -60,6 +60,7 @@ def _set_start_method(self):
         with contextlib.suppress(RuntimeError):
             mp.set_start_method("spawn", force=True)
 
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
         ipc_device.set_current()

diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import contextlib
@@ -14,7 +14,7 @@
     HAVE_PSUTIL = True
 import pytest
 
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 
 USING_FDS = platform.system() == "Linux"
@@ -23,6 +23,7 @@
 )
 
 
+@pytest.mark.flaky(reruns=2)
 @skip_if_unrunnable
 def test_alloc_handle(ipc_memory_resource):
     """Check for fd leaks in get_allocation_handle."""
@@ -79,6 +80,7 @@ def __reduce__(self):
         raise RuntimeError("Irreducible")
 
 
+@pytest.mark.flaky(reruns=2)
 @skip_if_unrunnable
 @pytest.mark.parametrize(
     "getobject",

diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -1,18 +1,20 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing as mp
 
+import pytest
 from cuda.core import Buffer, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 NWORKERS = 2
 NTASKS = 2
 
 
 class TestIpcMempool:
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         """Test IPC with memory pools."""
         # Set up the IPC-enabled memory pool and share it.
@@ -54,6 +56,7 @@ def child_main(self, device, mr, queue):
 
 
 class TestIPCMempoolMultiple:
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         """Test IPC with memory pools using multiple processes."""
         # Construct an IPC-enabled memory resource and share it with two children.
@@ -104,6 +107,7 @@ def child_main(self, device, mr, seed, queue):
 
 
 class TestIPCSharedAllocationHandleAndBufferDescriptors:
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         """
         Demonstrate that a memory pool allocation handle can be reused for IPC
@@ -154,6 +158,7 @@ def child_main(self, device, alloc_handle, seed, queue):
 
 
 class TestIPCSharedAllocationHandleAndBufferObjects:
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         """
         Demonstrate that a memory pool allocation handle can be reused for IPC

diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing as mp
@@ -8,7 +8,7 @@
 from cuda.core._utils.cuda_utils import CUDAError
 from helpers.buffers import PatternGen
 
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 POOL_SIZE = 2097152
 
@@ -19,6 +19,7 @@ class TestPeerAccessNotPreservedOnImport:
     is sent to another process via IPC, and that peer access can be set after import.
     """
 
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, mempool_device_x2):
         dev0, dev1 = mempool_device_x2
 
@@ -57,6 +58,7 @@ class TestBufferPeerAccessAfterImport:
     setting peer access on the imported memory resource, and that access can be revoked.
     """
 
+    @pytest.mark.flaky(reruns=2)
     @pytest.mark.parametrize("grant_access_in_parent", [True, False])
     def test_main(self, mempool_device_x2, grant_access_in_parent):
         dev0, dev1 = mempool_device_x2

diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing as mp
@@ -8,14 +8,15 @@
 from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions
 from helpers.buffers import PatternGen
 
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 NMRS = 3
 NTASKS = 7
 POOL_SIZE = 2097152
 
 
 class TestIpcSendBuffers:
+    @pytest.mark.flaky(reruns=2)
     @pytest.mark.parametrize("nmrs", (1, NMRS))
     def test_main(self, ipc_device, nmrs):
         """Test passing buffers sourced from multiple memory resources."""
@@ -67,6 +68,7 @@ class TestIpcReexport:
     re-exported from B to C.
     """
 
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         # Set up the device.
         device = ipc_device

diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -1,14 +1,15 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing as mp
 import multiprocessing.reduction
 import os
 
+import pytest
 from cuda.core import Buffer, Device, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 POOL_SIZE = 2097152
 
@@ -21,6 +22,7 @@ class TestObjectSerializationDirect:
     it on the other end and demonstrate buffer sharing.
     """
 
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         device = ipc_device
         mr = ipc_memory_resource
@@ -76,6 +78,7 @@ def child_main(self, conn):
 
 
 class TestObjectSerializationWithMR:
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         """Test sending IPC memory objects to a child through a queue."""
         device = ipc_device
@@ -131,6 +134,7 @@ class TestObjectPassing:
     in multiprocessing (e.g., Queue) work.
     """
 
+    @pytest.mark.flaky(reruns=2)
     def test_main(self, ipc_device, ipc_memory_resource):
         # Define the objects.
         device = ipc_device

diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing as mp
@@ -9,7 +9,7 @@
 from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions
 from helpers.buffers import PatternGen
 
-CHILD_TIMEOUT_SEC = 20
+CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
 NWORKERS = 2
 NMRS = 3
@@ -26,6 +26,7 @@ class TestIpcWorkerPool:
     resource (duplicates are ignored on the receiving end).
     """
 
+    @pytest.mark.flaky(reruns=2)
     @pytest.mark.parametrize("nmrs", (1, NMRS))
     def test_main(self, ipc_device, nmrs):
         device = ipc_device
@@ -62,6 +63,7 @@ def init_worker(mrs):
         """Called during child process initialization to store received memory resources."""
         TestIpcWorkerPoolUsingIPCDescriptors.mrs = mrs
 
+    @pytest.mark.flaky(reruns=2)
     @pytest.mark.parametrize("nmrs", (1, NMRS))
     def test_main(self, ipc_device, nmrs):
         device = ipc_device
@@ -106,6 +108,7 @@ def init_worker(mrs):
         # Passing mrs implicitly registers them.
         pass
 
+    @pytest.mark.flaky(reruns=2)
     @pytest.mark.parametrize("nmrs", (1, NMRS))
     def test_main(self, ipc_device, nmrs):
         device = ipc_device