From 362387a67cb4f87f7aec1451cb9431296fe85efb Mon Sep 17 00:00:00 2001
From: David Bayer <bayer.david99@gmail.com>
Date: Wed, 14 May 2025 19:56:31 +0200
Subject: [PATCH 1/5] Check `cuda::memcpy_async` preconditions

---
 .../include/cuda/__barrier/aligned_size.h     | 12 +++++++---
 .../cuda/__memcpy_async/memcpy_async.h        |  9 +++++++
 .../__memcpy_async/memcpy_async_barrier.h     | 24 +++++++------------
 .../cuda/__memcpy_async/memcpy_async_tx.h     |  5 ++++
 libcudacxx/include/cuda/pipeline              | 17 ++++++++-----
 .../cuda/barrier/aligned_size_t.pass.cpp      |  8 +++----
 6 files changed, 47 insertions(+), 28 deletions(-)
diff --git a/libcudacxx/include/cuda/__barrier/aligned_size.h b/libcudacxx/include/cuda/__barrier/aligned_size.h
index 9926023f05f..59ea50ff161 100644
--- a/libcudacxx/include/cuda/__barrier/aligned_size.h
+++ b/libcudacxx/include/cuda/__barrier/aligned_size.h
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__cmath/pow2.h>
 #include <cuda/std/cstddef>
 
 #include <cuda/std/__cccl/prologue.h>
@@ -30,13 +31,18 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 template <_CUDA_VSTD::size_t _Alignment>
 struct aligned_size_t
 {
+  static_assert(::cuda::is_power_of_two(_Alignment), "alignment must be a power of two");
+
   static constexpr _CUDA_VSTD::size_t align = _Alignment;
   _CUDA_VSTD::size_t value;
 
-  _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s)
+  _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(_CUDA_VSTD::size_t __s)
       : value(__s)
-  {}
-  _LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const
+  {
+    _CCCL_ASSERT(value % align == 0,
+                 "aligned_size_t must be constructed with a size that is a multiple of the alignment");
+  }
+  _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _CUDA_VSTD::size_t() const
   {
     return value;
   }
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
index ca58faefd88..a40d4abf77e 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
@@ -33,6 +33,7 @@
 #  include <cuda/std/__atomic/scopes.h>
 #  include <cuda/std/__type_traits/void_t.h>
 #  include <cuda/std/cstddef>
+#  include <cuda/std/cstdint>
 
 #  include <cuda/std/__cccl/prologue.h>
 
@@ -104,6 +105,10 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   barrier<_Sco, _CompF>& __barrier)
 {
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__destination) % _Alignment == 0,
+               "destination pointer must be aligned to the specified alignment");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__source) % _Alignment == 0,
+               "source pointer must be aligned to the specified alignment");
   return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
 }
 
@@ -145,6 +150,10 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   barrier<_Sco, _CompF>& __barrier)
 {
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__destination) % _Alignment == 0,
+               "destination pointer must be aligned to the specified alignment");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__source) % _Alignment == 0,
+               "source pointer must be aligned to the specified alignment");
   return __memcpy_async_barrier(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
 }
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
index 67acc3fd87f..9e04b577f18 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
@@ -42,30 +42,24 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 /***********************************************************************
  * cuda::memcpy_async dispatch helper functions
  *
- * - __get_size_align struct to determine the alignment from a size type.
+ * - __get_size_align_v get the alignment from a size type.
  ***********************************************************************/
 
-// The __get_size_align struct provides a way to query the guaranteed
+// The __get_size_align provides a way to query the guaranteed
 // "alignment" of a provided size. In this case, an n-byte aligned size means
 // that the size is a multiple of n.
 //
 // Use as follows:
-// static_assert(__get_size_align<size_t>::align == 1)
-// static_assert(__get_size_align<aligned_size_t<n>>::align == n)
+// static_assert(__get_size_align_v<size_t> == 1)
+// static_assert(__get_size_align_v<aligned_size_t<n>> == n)
 
 // Default impl: always returns 1.
-template <typename, typename = void>
-struct __get_size_align
-{
-  static constexpr int align = 1;
-};
+template <class, class = void>
+inline constexpr _CUDA_VSTD::size_t __get_size_align_v = 1;
 
 // aligned_size_t<n> overload: return n.
-template <typename T>
-struct __get_size_align<T, _CUDA_VSTD::void_t<decltype(T::align)>>
-{
-  static constexpr int align = T::align;
-};
+template <class _Tp>
+inline constexpr _CUDA_VSTD::size_t __get_size_align_v<_Tp, _CUDA_VSTD::void_t<decltype(_Tp::align)>> = _Tp::align;
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -99,7 +93,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier(
       : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group);
 
   // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t.
-  constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align;
+  constexpr _CUDA_VSTD::size_t __size_align = __get_size_align_v<_Size>;
   constexpr _CUDA_VSTD::size_t __align      = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp);
   // Cast to char pointers. We don't need the type for alignment anymore and
   // erasing the types reduces the number of instantiations of down-stream
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
index f3506dc6a37..f012156d211 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
@@ -57,6 +57,11 @@ _CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx(
 #    endif
   static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
 
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__dest) % _Alignment == 0,
+               "destination pointer must be aligned to the specified alignment");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__src) % _Alignment == 0,
+               "source pointer must be aligned to the specified alignment");
+
   _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
   _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory.");
   _CCCL_ASSERT(__isGlobal(__src), "src must point to global memory.");
diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline
index ed2abe9f19f..f3966963427 100644
--- a/libcudacxx/include/cuda/pipeline
+++ b/libcudacxx/include/cuda/pipeline
@@ -26,6 +26,7 @@
 #include <cuda/atomic>
 #include <cuda/barrier>
 #include <cuda/std/chrono>
+#include <cuda/std/cstdint>
 
 #include <cuda/std/__cccl/prologue.h>
 
@@ -499,7 +500,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_pipeline(
   _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group);
 
   // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t.
-  constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align;
+  constexpr _CUDA_VSTD::size_t __size_align = __get_size_align_v<_Size>;
   constexpr _CUDA_VSTD::size_t __align      = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp);
   // Cast to char pointers. We don't need the type for alignment anymore and
   // erasing the types reduces the number of instantiations of down-stream
@@ -521,11 +522,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }
 
-template <typename _Group,
-          class _Type,
-          std::size_t _Alignment,
-          thread_scope _Scope,
-          std::size_t _Larger_alignment = (alignof(_Type) > _Alignment) ? alignof(_Type) : _Alignment>
+template <typename _Group, class _Type, std::size_t _Alignment, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _Group const& __group,
   _Type* __destination,
@@ -533,6 +530,10 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   pipeline<_Scope>& __pipeline)
 {
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__destination) % _Alignment == 0,
+               "destination pointer must be aligned to the specified alignment");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__source) % _Alignment == 0,
+               "source pointer must be aligned to the specified alignment");
   return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }
 
@@ -559,6 +560,10 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   pipeline<_Scope>& __pipeline)
 {
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__destination) % _Alignment == 0,
+               "destination pointer must be aligned to the specified alignment");
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__source) % _Alignment == 0,
+               "source pointer must be aligned to the specified alignment");
   return __memcpy_async_pipeline(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/aligned_size_t.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/aligned_size_t.pass.cpp
index 24fb11bd7b6..25cae5e59b5 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/aligned_size_t.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/aligned_size_t.pass.cpp
@@ -22,8 +22,8 @@
 __host__ __device__ constexpr bool test()
 {
   using aligned_t = cuda::aligned_size_t<1>;
-  static_assert(!cuda::std::is_default_constructible<aligned_t>::value, "");
-  static_assert(aligned_t::align == 1, "");
+  static_assert(!cuda::std::is_default_constructible<aligned_t>::value);
+  static_assert(aligned_t::align == 1);
   {
     const aligned_t aligned{42};
     assert(aligned.value == 42);
@@ -33,11 +33,11 @@ __host__ __device__ constexpr bool test()
 }
 
 // test C++11 differently
-static_assert(cuda::aligned_size_t<42>{1337}.value == 1337, "");
+static_assert(cuda::aligned_size_t<32>{1024}.value == 1024);
 
 int main(int, char**)
 {
   test();
-  static_assert(test(), "");
+  static_assert(test());
   return 0;
 }

From 123f07d95660274f33bc01b5f55c0d925a925c09 Mon Sep 17 00:00:00 2001
From: David Bayer <bayer.david99@gmail.com>
Date: Thu, 15 May 2025 11:25:28 +0200
Subject: [PATCH 2/5] Improve pre condition checking

---
 .../include/cuda/__barrier/aligned_size.h     |   6 +
 .../cuda/__memcpy_async/check_preconditions.h |  79 ++++++++++
 .../cuda/__memcpy_async/memcpy_async.h        |  16 +-
 .../__memcpy_async/memcpy_async_barrier.h     |  29 +---
 .../cuda/__memcpy_async/memcpy_async_tx.h     |   7 +-
 libcudacxx/include/cuda/pipeline              |  21 +--
 .../cuda/memcpy_async/preconditions.pass.cpp  | 148 ++++++++++++++++++
 7 files changed, 258 insertions(+), 48 deletions(-)
 create mode 100644 libcudacxx/include/cuda/__memcpy_async/check_preconditions.h
 create mode 100644 libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp

diff --git a/libcudacxx/include/cuda/__barrier/aligned_size.h b/libcudacxx/include/cuda/__barrier/aligned_size.h
index 59ea50ff161..6aa5b4fa37c 100644
--- a/libcudacxx/include/cuda/__barrier/aligned_size.h
+++ b/libcudacxx/include/cuda/__barrier/aligned_size.h
@@ -48,6 +48,12 @@ struct aligned_size_t
   }
 };
 
+template <class, class = void>
+inline constexpr _CUDA_VSTD::size_t __get_size_align_v = 1;
+
+template <class _Tp>
+inline constexpr _CUDA_VSTD::size_t __get_size_align_v<_Tp, _CUDA_VSTD::void_t<decltype(_Tp::align)>> = _Tp::align;
+
 _LIBCUDACXX_END_NAMESPACE_CUDA
 
 #include <cuda/std/__cccl/epilogue.h>
diff --git a/libcudacxx/include/cuda/__memcpy_async/check_preconditions.h b/libcudacxx/include/cuda/__memcpy_async/check_preconditions.h
new file mode 100644
index 00000000000..d415e8091ab
--- /dev/null
+++ b/libcudacxx/include/cuda/__memcpy_async/check_preconditions.h
@@ -0,0 +1,79 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H
+#define _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__barrier/aligned_size.h>
+#include <cuda/std/__algorithm/max.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/cstdint>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+#ifndef _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING
+#  define _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(_Cond, _Msg) _CCCL_ASSERT(_Cond, _Msg)
+#else // ^^^ _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING ^^^ / vvv !_LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING vvv
+#  define _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(_Cond, _Msg) \
+    do                                                     \
+    {                                                      \
+      if (!(_Cond))                                        \
+      {                                                    \
+        return false;                                      \
+      }                                                    \
+    } while (false)
+#endif // _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING
+
+// Check the memcpy_async preconditions, return value is intended for testing purposes exclusively
+template <class _Tp, class _Size>
+_LIBCUDACXX_HIDE_FROM_ABI bool __memcpy_async_check_pre(_Tp* __dst, const _Tp* __src, _Size __size)
+{
+  constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);
+
+  const auto __dst_val = reinterpret_cast<uintptr_t>(__dst);
+  const auto __src_val = reinterpret_cast<uintptr_t>(__src);
+
+  // check src and dst alignment
+  _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
+    __dst_val % __align == 0, "destination pointer must be aligned to the specified alignment");
+  _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
+    __src_val % __align == 0, "source pointer must be aligned to the specified alignment");
+
+  // check src and dst overlap
+  _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
+    !((__dst_val <= __src_val && __src_val < __dst_val + __size)
+      || (__src_val <= __dst_val && __dst_val < __src_val + __size)),
+    "destination and source buffers must not overlap");
+  return true;
+}
+
+template <class _Size>
+_LIBCUDACXX_HIDE_FROM_ABI bool __memcpy_async_check_pre(void* __dst, const void* __src, _Size __size)
+{
+  return ::cuda::__memcpy_async_check_pre(reinterpret_cast<char*>(__dst), reinterpret_cast<const char*>(__src), __size);
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
index a40d4abf77e..d32a315f80a 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
@@ -29,6 +29,7 @@
 #  include <cuda/__barrier/barrier.h>
 #  include <cuda/__barrier/barrier_block_scope.h>
 #  include <cuda/__barrier/barrier_thread_scope.h>
+#  include <cuda/__memcpy_async/check_preconditions.h>
 #  include <cuda/__memcpy_async/memcpy_async_barrier.h>
 #  include <cuda/std/__atomic/scopes.h>
 #  include <cuda/std/__type_traits/void_t.h>
@@ -105,10 +106,8 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   barrier<_Sco, _CompF>& __barrier)
 {
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__destination) % _Alignment == 0,
-               "destination pointer must be aligned to the specified alignment");
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__source) % _Alignment == 0,
-               "source pointer must be aligned to the specified alignment");
+  static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
 }
 
@@ -116,6 +115,7 @@ template <class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
 {
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier);
 }
 
@@ -127,6 +127,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _CUDA_VSTD::size_t __size,
   barrier<_Sco, _CompF>& __barrier)
 {
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
 }
 
@@ -138,6 +139,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _CUDA_VSTD::size_t __size,
   barrier<_Sco, _CompF>& __barrier)
 {
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_barrier(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
 }
@@ -150,10 +152,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   barrier<_Sco, _CompF>& __barrier)
 {
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__destination) % _Alignment == 0,
-               "destination pointer must be aligned to the specified alignment");
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__source) % _Alignment == 0,
-               "source pointer must be aligned to the specified alignment");
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_barrier(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
 }
@@ -162,6 +161,7 @@ template <typename _Size, thread_scope _Sco, typename _CompF>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
 {
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_barrier(
     __single_thread_group{},
     reinterpret_cast<char*>(__destination),
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
index 9e04b577f18..5cc7ac93761 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
@@ -22,6 +22,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__barrier/aligned_size.h>
 #include <cuda/__barrier/barrier.h>
 #include <cuda/__barrier/barrier_block_scope.h>
 #include <cuda/__barrier/barrier_thread_scope.h>
@@ -30,6 +31,7 @@
 #include <cuda/__memcpy_async/is_local_smem_barrier.h>
 #include <cuda/__memcpy_async/memcpy_completion.h>
 #include <cuda/__memcpy_async/try_get_barrier_handle.h>
+#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__atomic/scopes.h>
 #include <cuda/std/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/cstddef>
@@ -39,30 +41,6 @@
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
-/***********************************************************************
- * cuda::memcpy_async dispatch helper functions
- *
- * - __get_size_align_v get the alignment from a size type.
- ***********************************************************************/
-
-// The __get_size_align provides a way to query the guaranteed
-// "alignment" of a provided size. In this case, an n-byte aligned size means
-// that the size is a multiple of n.
-//
-// Use as follows:
-// static_assert(__get_size_align_v<size_t> == 1)
-// static_assert(__get_size_align_v<aligned_size_t<n>> == n)
-
-// Default impl: always returns 1.
-template <class, class = void>
-inline constexpr _CUDA_VSTD::size_t __get_size_align_v = 1;
-
-// aligned_size_t<n> overload: return n.
-template <class _Tp>
-inline constexpr _CUDA_VSTD::size_t __get_size_align_v<_Tp, _CUDA_VSTD::void_t<decltype(_Tp::align)>> = _Tp::align;
-
-////////////////////////////////////////////////////////////////////////////////
-
 struct __single_thread_group
 {
   _LIBCUDACXX_HIDE_FROM_ABI void sync() const {}
@@ -93,8 +71,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier(
       : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group);
 
   // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t.
-  constexpr _CUDA_VSTD::size_t __size_align = __get_size_align_v<_Size>;
-  constexpr _CUDA_VSTD::size_t __align      = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp);
+  constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);
   // Cast to char pointers. We don't need the type for alignment anymore and
   // erasing the types reduces the number of instantiations of down-stream
   // functions.
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
index f012156d211..f5e49be9fba 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
@@ -29,6 +29,7 @@
 #    include <cuda/__barrier/async_contract_fulfillment.h>
 #    include <cuda/__barrier/barrier_block_scope.h>
 #    include <cuda/__barrier/barrier_native_handle.h>
+#    include <cuda/__memcpy_async/check_preconditions.h>
 #    include <cuda/__ptx/instructions/cp_async_bulk.h>
 #    include <cuda/__ptx/ptx_dot_variants.h>
 #    include <cuda/__ptx/ptx_helper_functions.h>
@@ -56,11 +57,9 @@ _CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx(
   static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type");
 #    endif
   static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
+  static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
 
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__dest) % _Alignment == 0,
-               "destination pointer must be aligned to the specified alignment");
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__src) % _Alignment == 0,
-               "source pointer must be aligned to the specified alignment");
+  ::cuda::__memcpy_async_check_pre(__dest, __src, __size);
 
   _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
   _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory.");
diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline
index f3966963427..607ac4cfa84 100644
--- a/libcudacxx/include/cuda/pipeline
+++ b/libcudacxx/include/cuda/pipeline
@@ -21,10 +21,13 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__barrier/aligned_size.h>
+#include <cuda/__memcpy_async/check_preconditions.h>
 #include <cuda/__memcpy_async/completion_mechanism.h>
 #include <cuda/__memcpy_async/memcpy_async_barrier.h>
 #include <cuda/atomic>
 #include <cuda/barrier>
+#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/chrono>
 #include <cuda/std/cstdint>
 
@@ -500,8 +503,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_pipeline(
   _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group);
 
   // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t.
-  constexpr _CUDA_VSTD::size_t __size_align = __get_size_align_v<_Size>;
-  constexpr _CUDA_VSTD::size_t __align      = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp);
+  constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);
   // Cast to char pointers. We don't need the type for alignment anymore and
   // erasing the types reduces the number of instantiations of down-stream
   // functions.
@@ -519,6 +521,7 @@ template <typename _Group, class _Type, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _Group const& __group, _Type* __destination, _Type const* __source, std::size_t __size, pipeline<_Scope>& __pipeline)
 {
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }
 
@@ -530,10 +533,8 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   pipeline<_Scope>& __pipeline)
 {
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__destination) % _Alignment == 0,
-               "destination pointer must be aligned to the specified alignment");
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__source) % _Alignment == 0,
-               "source pointer must be aligned to the specified alignment");
+  static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }
 
@@ -541,6 +542,7 @@ template <class _Type, typename _Size, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(_Type* __destination, _Type const* __source, _Size __size, pipeline<_Scope>& __pipeline)
 {
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline);
 }
 
@@ -548,6 +550,7 @@ template <typename _Group, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _Group const& __group, void* __destination, void const* __source, std::size_t __size, pipeline<_Scope>& __pipeline)
 {
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_pipeline(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
 }
@@ -560,10 +563,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   pipeline<_Scope>& __pipeline)
 {
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__destination) % _Alignment == 0,
-               "destination pointer must be aligned to the specified alignment");
-  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__source) % _Alignment == 0,
-               "source pointer must be aligned to the specified alignment");
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_pipeline(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
 }
@@ -572,6 +572,7 @@ template <typename _Size, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(void* __destination, void const* __source, _Size __size, pipeline<_Scope>& __pipeline)
 {
+  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_pipeline(
     __single_thread_group{},
     reinterpret_cast<char*>(__destination),
diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp
new file mode 100644
index 00000000000..7d4ead758dd
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp
@@ -0,0 +1,148 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#define _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING
+
+#include <cuda/__memcpy_async/check_preconditions.h>
+#include <cuda/std/cstddef>
+
+__host__ __device__ void test_typed()
+{
+  using T = int;
+
+  constexpr cuda::std::size_t align_scale = 2;
+  constexpr cuda::std::size_t align       = align_scale * alignof(T);
+  constexpr cuda::std::size_t n           = 16;
+  constexpr cuda::std::size_t size        = n * sizeof(T);
+
+  alignas(align) T a[n * 2]{};
+  alignas(align) const T b[n * 2]{};
+
+  const auto a_missaligned = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(a) + alignof(T) / 2);
+  const auto b_missaligned = reinterpret_cast<const T*>(reinterpret_cast<uintptr_t>(b) + alignof(T) / 2);
+
+  // 1. test ordinary size type
+  {
+    assert(cuda::__memcpy_async_check_pre(a, b, size));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size));
+    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size));
+  }
+
+  // 2. test overaligned cuda::aligned_size_t
+  {
+    cuda::aligned_size_t<align> size_aligned(size);
+    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+  }
+
+  // 3. test cuda::aligned_size_t aligned to alignof(T)
+  {
+    cuda::aligned_size_t<align / align_scale> size_aligned(size);
+    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+  }
+
+  // 4. test underaligned cuda::aligned_size_t
+  {
+    cuda::aligned_size_t<align / (2 * align_scale)> size_aligned(size);
+    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+  }
+
+  // 5. test overlap
+  {
+    assert(!cuda::__memcpy_async_check_pre(a, a, size));
+    assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size));
+    assert(cuda::__memcpy_async_check_pre(a, a + n, size));
+    assert(cuda::__memcpy_async_check_pre(a + n, a, size));
+    assert(!cuda::__memcpy_async_check_pre(a, a + n - 1, size));
+    assert(!cuda::__memcpy_async_check_pre(a + n - 1, a, size));
+  }
+}
+
+__host__ __device__ void test_void()
+{
+  using T = int;
+
+  constexpr cuda::std::size_t align_scale = 2;
+  constexpr cuda::std::size_t align       = align_scale * alignof(T);
+  constexpr cuda::std::size_t n           = 16;
+  constexpr cuda::std::size_t size        = n * sizeof(T);
+
+  alignas(align) T a_buff[n * 2]{};
+  alignas(align) const T b_buff[n * 2]{};
+
+  void* a       = a_buff;
+  const void* b = b_buff;
+
+  const auto a_missaligned = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(a) + alignof(T) / 2);
+  const auto b_missaligned = reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(b) + alignof(T) / 2);
+
+  // 1. test ordinary size type
+  {
+    assert(cuda::__memcpy_async_check_pre(a, b, size));
+    assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size));
+    assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size));
+    assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size));
+  }
+
+  // 2. test overaligned cuda::aligned_size_t
+  {
+    cuda::aligned_size_t<align> size_aligned(size);
+    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+  }
+
+  // 3. test cuda::aligned_size_t aligned to alignof(T)
+  {
+    cuda::aligned_size_t<align / align_scale> size_aligned(size);
+    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+  }
+
+  // 4. test underaligned cuda::aligned_size_t
+  {
+    cuda::aligned_size_t<align / (2 * align_scale)> size_aligned(size);
+    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+    assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+    assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+    assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+  }
+
+  // 5. test overlap
+  {
+    assert(!cuda::__memcpy_async_check_pre(a, a, size));
+    assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size));
+    assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size));
+    assert(cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n), size));
+    assert(cuda::__memcpy_async_check_pre((void*) (a_buff + n), a, size));
+    assert(!cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n - 1), size));
+    assert(!cuda::__memcpy_async_check_pre((void*) (a_buff + n - 1), a, size));
+  }
+}
+
+int main(int, char**)
+{
+  test_typed();
+  test_void();
+  return 0;
+}

From bfa7354d3e6efd468121dcbdc63fdac27331b8f5 Mon Sep 17 00:00:00 2001
From: David Bayer <bayer.david99@gmail.com>
Date: Thu, 15 May 2025 12:46:55 +0200
Subject: [PATCH 3/5] fix invalid type name

---
 libcudacxx/include/cuda/pipeline | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline
index 607ac4cfa84..3089b6874bb 100644
--- a/libcudacxx/include/cuda/pipeline
+++ b/libcudacxx/include/cuda/pipeline
@@ -533,7 +533,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   pipeline<_Scope>& __pipeline)
 {
-  static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
+  static_assert(_Alignment >= alignof(_Type), "alignment must be at least the alignof(T)");
   ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
   return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }

From d7bbc1067002efd9b92cbf5485f220cac887e074 Mon Sep 17 00:00:00 2001
From: David Bayer <bayer.david99@gmail.com>
Date: Tue, 20 May 2025 10:18:02 +0200
Subject: [PATCH 4/5] wrap checks with `_CCCL_ASSERT`

---
 .../include/cuda/__memcpy_async/memcpy_async.h       | 12 ++++++------
 .../include/cuda/__memcpy_async/memcpy_async_tx.h    |  2 +-
 libcudacxx/include/cuda/pipeline                     | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
index d32a315f80a..1b9be7568cd 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
@@ -107,7 +107,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   barrier<_Sco, _CompF>& __barrier)
 {
   static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
 }
 
@@ -115,7 +115,7 @@ template <class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier);
 }
 
@@ -127,7 +127,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _CUDA_VSTD::size_t __size,
   barrier<_Sco, _CompF>& __barrier)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
 }
 
@@ -139,7 +139,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _CUDA_VSTD::size_t __size,
   barrier<_Sco, _CompF>& __barrier)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_barrier(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
 }
@@ -152,7 +152,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   barrier<_Sco, _CompF>& __barrier)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_barrier(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
 }
@@ -161,7 +161,7 @@ template <typename _Size, thread_scope _Sco, typename _CompF>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_barrier(
     __single_thread_group{},
     reinterpret_cast<char*>(__destination),
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
index f5e49be9fba..cde45efd20c 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
@@ -59,7 +59,7 @@ _CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx(
   static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
   static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
 
-  ::cuda::__memcpy_async_check_pre(__dest, __src, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__dest, __src, __size), "memcpy_async_tx preconditions unmet");
 
   _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
   _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory.");
diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline
index 3089b6874bb..c78b93bd0d6 100644
--- a/libcudacxx/include/cuda/pipeline
+++ b/libcudacxx/include/cuda/pipeline
@@ -521,7 +521,7 @@ template <typename _Group, class _Type, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _Group const& __group, _Type* __destination, _Type const* __source, std::size_t __size, pipeline<_Scope>& __pipeline)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }
 
@@ -534,7 +534,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   pipeline<_Scope>& __pipeline)
 {
   static_assert(_Alignment >= alignof(_Type), "alignment must be at least the alignof(T)");
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }
 
@@ -542,7 +542,7 @@ template <class _Type, typename _Size, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(_Type* __destination, _Type const* __source, _Size __size, pipeline<_Scope>& __pipeline)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline);
 }
 
@@ -550,7 +550,7 @@ template <typename _Group, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _Group const& __group, void* __destination, void const* __source, std::size_t __size, pipeline<_Scope>& __pipeline)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_pipeline(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
 }
@@ -563,7 +563,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   pipeline<_Scope>& __pipeline)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_pipeline(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
 }
@@ -572,7 +572,7 @@ template <typename _Size, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(void* __destination, void const* __source, _Size __size, pipeline<_Scope>& __pipeline)
 {
-  ::cuda::__memcpy_async_check_pre(__destination, __source, __size);
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return __memcpy_async_pipeline(
     __single_thread_group{},
     reinterpret_cast<char*>(__destination),

From 284945f581c3e06cb07b9b5b12136a08671bd62e Mon Sep 17 00:00:00 2001
From: David Bayer <bayer.david99@gmail.com>
Date: Fri, 23 May 2025 09:21:30 +0200
Subject: [PATCH 5/5] avoid compiler bugs in test file

---
 .../cuda/memcpy_async/preconditions.pass.cpp  | 223 +++++++++---------
 1 file changed, 109 insertions(+), 114 deletions(-)

diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp
index 7d4ead758dd..0f465f84514 100644
--- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp
@@ -13,7 +13,7 @@
 #include <cuda/__memcpy_async/check_preconditions.h>
 #include <cuda/std/cstddef>
 
-__host__ __device__ void test_typed()
+__host__ __device__ void test()
 {
   using T = int;
 
@@ -22,127 +22,122 @@ __host__ __device__ void test_typed()
   constexpr cuda::std::size_t n           = 16;
   constexpr cuda::std::size_t size        = n * sizeof(T);
 
-  alignas(align) T a[n * 2]{};
-  alignas(align) const T b[n * 2]{};
-
-  const auto a_missaligned = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(a) + alignof(T) / 2);
-  const auto b_missaligned = reinterpret_cast<const T*>(reinterpret_cast<uintptr_t>(b) + alignof(T) / 2);
-
-  // 1. test ordinary size type
-  {
-    assert(cuda::__memcpy_async_check_pre(a, b, size));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size));
-    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size));
-  }
-
-  // 2. test overaligned cuda::aligned_size_t
-  {
-    cuda::aligned_size_t<align> size_aligned(size);
-    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
-  }
-
-  // 3. test cuda::aligned_size_t aligned to alignof(T)
-  {
-    cuda::aligned_size_t<align / align_scale> size_aligned(size);
-    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
-  }
-
-  // 4. test underaligned cuda::aligned_size_t
-  {
-    cuda::aligned_size_t<align / (2 * align_scale)> size_aligned(size);
-    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
-  }
-
-  // 5. test overlap
-  {
-    assert(!cuda::__memcpy_async_check_pre(a, a, size));
-    assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size));
-    assert(cuda::__memcpy_async_check_pre(a, a + n, size));
-    assert(cuda::__memcpy_async_check_pre(a + n, a, size));
-    assert(!cuda::__memcpy_async_check_pre(a, a + n - 1, size));
-    assert(!cuda::__memcpy_async_check_pre(a + n - 1, a, size));
-  }
-}
-
-__host__ __device__ void test_void()
-{
-  using T = int;
-
-  constexpr cuda::std::size_t align_scale = 2;
-  constexpr cuda::std::size_t align       = align_scale * alignof(T);
-  constexpr cuda::std::size_t n           = 16;
-  constexpr cuda::std::size_t size        = n * sizeof(T);
-
-  alignas(align) T a_buff[n * 2]{};
-  alignas(align) const T b_buff[n * 2]{};
-
-  void* a       = a_buff;
-  const void* b = b_buff;
-
-  const auto a_missaligned = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(a) + alignof(T) / 2);
-  const auto b_missaligned = reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(b) + alignof(T) / 2);
-
-  // 1. test ordinary size type
-  {
-    assert(cuda::__memcpy_async_check_pre(a, b, size));
-    assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size));
-    assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size));
-    assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size));
-  }
-
-  // 2. test overaligned cuda::aligned_size_t
-  {
-    cuda::aligned_size_t<align> size_aligned(size);
-    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
-  }
-
-  // 3. test cuda::aligned_size_t aligned to alignof(T)
-  {
-    cuda::aligned_size_t<align / align_scale> size_aligned(size);
-    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
-  }
-
-  // 4. test underaligned cuda::aligned_size_t
+  // test typed overloads
   {
-    cuda::aligned_size_t<align / (2 * align_scale)> size_aligned(size);
-    assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
-    assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
-    assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
-    assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+    alignas(align) T a[n * 2]{};
+    alignas(align) const T b[n * 2]{};
+
+    const auto a_missaligned = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(a) + alignof(T) / 2);
+    const auto b_missaligned = reinterpret_cast<const T*>(reinterpret_cast<uintptr_t>(b) + alignof(T) / 2);
+
+    // 1. test ordinary size type
+    {
+      assert(cuda::__memcpy_async_check_pre(a, b, size));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size));
+      assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size));
+    }
+
+    // 2. test overaligned cuda::aligned_size_t
+    {
+      cuda::aligned_size_t<align> size_aligned(size);
+      assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+    }
+
+    // 3. test cuda::aligned_size_t aligned to alignof(T)
+    {
+      cuda::aligned_size_t<align / align_scale> size_aligned(size);
+      assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+    }
+
+    // 4. test underaligned cuda::aligned_size_t
+    {
+      cuda::aligned_size_t<align / (2 * align_scale)> size_aligned(size);
+      assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+    }
+
+    // 5. test overlap
+    {
+      assert(!cuda::__memcpy_async_check_pre(a, a, size));
+      assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size));
+      assert(cuda::__memcpy_async_check_pre(a, a + n, size));
+      assert(cuda::__memcpy_async_check_pre(a + n, a, size));
+      assert(!cuda::__memcpy_async_check_pre(a, a + n - 1, size));
+      assert(!cuda::__memcpy_async_check_pre(a + n - 1, a, size));
+    }
   }
 
-  // 5. test overlap
+  // test void overloads
   {
-    assert(!cuda::__memcpy_async_check_pre(a, a, size));
-    assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size));
-    assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size));
-    assert(cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n), size));
-    assert(cuda::__memcpy_async_check_pre((void*) (a_buff + n), a, size));
-    assert(!cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n - 1), size));
-    assert(!cuda::__memcpy_async_check_pre((void*) (a_buff + n - 1), a, size));
+    alignas(align) T a_buff[n * 2]{};
+    alignas(align) const T b_buff[n * 2]{};
+
+    void* a       = a_buff;
+    const void* b = b_buff;
+
+    const auto a_missaligned = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(a) + alignof(T) / 2);
+    const auto b_missaligned = reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(b) + alignof(T) / 2);
+
+    // 1. test ordinary size type
+    {
+      assert(cuda::__memcpy_async_check_pre(a, b, size));
+      assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size));
+      assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size));
+      assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size));
+    }
+
+    // 2. test overaligned cuda::aligned_size_t
+    {
+      cuda::aligned_size_t<align> size_aligned(size);
+      assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+    }
+
+    // 3. test cuda::aligned_size_t aligned to alignof(T)
+    {
+      cuda::aligned_size_t<align / align_scale> size_aligned(size);
+      assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+    }
+
+    // 4. test underaligned cuda::aligned_size_t
+    {
+      cuda::aligned_size_t<align / (2 * align_scale)> size_aligned(size);
+      assert(cuda::__memcpy_async_check_pre(a, b, size_aligned));
+      assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned));
+      assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned));
+      assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned));
+    }
+
+    // 5. test overlap
+    {
+      assert(!cuda::__memcpy_async_check_pre(a, a, size));
+      assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size));
+      assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size));
+      assert(cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n), size));
+      assert(cuda::__memcpy_async_check_pre((void*) (a_buff + n), a, size));
+      assert(!cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n - 1), size));
+      assert(!cuda::__memcpy_async_check_pre((void*) (a_buff + n - 1), a, size));
+    }
   }
 }
 
 int main(int, char**)
 {
-  test_typed();
-  test_void();
+  test();
   return 0;
 }